diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,224033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2179989799258544, + "eval_steps": 5000000.0, + "global_step": 320000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.806246812268295e-05, + "grad_norm": 0.08437860012054443, + "learning_rate": 5e-06, + "loss": 2.1394, + "step": 10 + }, + { + "epoch": 7.61249362453659e-05, + "grad_norm": 0.08035453408956528, + "learning_rate": 1e-05, + "loss": 2.1428, + "step": 20 + }, + { + "epoch": 0.00011418740436804885, + "grad_norm": 0.07435434311628342, + "learning_rate": 1.5e-05, + "loss": 2.1325, + "step": 30 + }, + { + "epoch": 0.0001522498724907318, + "grad_norm": 0.07114154100418091, + "learning_rate": 2e-05, + "loss": 2.1374, + "step": 40 + }, + { + "epoch": 0.00019031234061341474, + "grad_norm": 0.07513192296028137, + "learning_rate": 2.5e-05, + "loss": 2.1407, + "step": 50 + }, + { + "epoch": 0.0002283748087360977, + "grad_norm": 0.07081104069948196, + "learning_rate": 3e-05, + "loss": 2.1129, + "step": 60 + }, + { + "epoch": 0.00026643727685878065, + "grad_norm": 0.0714515671133995, + "learning_rate": 3.5000000000000004e-05, + "loss": 2.1324, + "step": 70 + }, + { + "epoch": 0.0003044997449814636, + "grad_norm": 0.07307267189025879, + "learning_rate": 4e-05, + "loss": 2.1071, + "step": 80 + }, + { + "epoch": 0.0003425622131041465, + "grad_norm": 0.07330012321472168, + "learning_rate": 4.4999999999999996e-05, + "loss": 2.1344, + "step": 90 + }, + { + "epoch": 0.0003806246812268295, + "grad_norm": 0.07287479937076569, + "learning_rate": 5e-05, + "loss": 2.1276, + "step": 100 + }, + { + "epoch": 0.0004186871493495124, + "grad_norm": 0.07311001420021057, + "learning_rate": 5.5e-05, + "loss": 2.13, + "step": 110 + }, + { + "epoch": 0.0004567496174721954, + "grad_norm": 0.07323328405618668, + "learning_rate": 6e-05, + "loss": 2.0992, + "step": 120 + }, + { + "epoch": 0.0004948120855948783, + "grad_norm": 0.07578736543655396, + "learning_rate": 6.500000000000001e-05, + "loss": 2.1201, + "step": 130 + }, + { + "epoch": 0.0005328745537175613, + "grad_norm": 0.073748379945755, + "learning_rate": 7.000000000000001e-05, + "loss": 2.1309, + "step": 140 + }, + { + "epoch": 0.0005709370218402442, + "grad_norm": 0.07653962820768356, + "learning_rate": 7.5e-05, + "loss": 2.115, + "step": 150 + }, + { + "epoch": 0.0006089994899629272, + "grad_norm": 0.07610829174518585, + "learning_rate": 8e-05, + "loss": 2.1164, + "step": 160 + }, + { + "epoch": 0.0006470619580856101, + "grad_norm": 0.07669228315353394, + "learning_rate": 8.5e-05, + "loss": 2.1047, + "step": 170 + }, + { + "epoch": 0.000685124426208293, + "grad_norm": 0.07672448456287384, + "learning_rate": 8.999999999999999e-05, + "loss": 2.1072, + "step": 180 + }, + { + "epoch": 0.000723186894330976, + "grad_norm": 0.07850458472967148, + "learning_rate": 9.5e-05, + "loss": 2.1163, + "step": 190 + }, + { + "epoch": 0.000761249362453659, + "grad_norm": 0.07940211147069931, + "learning_rate": 0.0001, + "loss": 2.1152, + "step": 200 + }, + { + "epoch": 0.0007993118305763419, + "grad_norm": 0.07731697708368301, + "learning_rate": 0.000105, + "loss": 2.1049, + "step": 210 + }, + { + "epoch": 0.0008373742986990248, + "grad_norm": 0.07781223952770233, + "learning_rate": 0.00011, + "loss": 2.1002, + "step": 220 + }, + { + "epoch": 0.0008754367668217077, + "grad_norm": 0.07948898524045944, + "learning_rate": 0.000115, + "loss": 2.0993, + "step": 230 + }, + { + "epoch": 0.0009134992349443908, + "grad_norm": 0.07564356923103333, + "learning_rate": 0.00012, + "loss": 2.1122, + "step": 240 + }, + { + "epoch": 0.0009515617030670737, + "grad_norm": 0.0799378901720047, + "learning_rate": 0.000125, + "loss": 2.1155, + "step": 250 + }, + { + "epoch": 0.0009896241711897565, + "grad_norm": 0.07956661283969879, + "learning_rate": 0.00013000000000000002, + "loss": 2.1079, + "step": 260 + }, + { + "epoch": 0.0010276866393124397, + "grad_norm": 0.08274099230766296, + "learning_rate": 0.000135, + "loss": 2.0886, + "step": 270 + }, + { + "epoch": 0.0010657491074351226, + "grad_norm": 0.08240476250648499, + "learning_rate": 0.00014000000000000001, + "loss": 2.1123, + "step": 280 + }, + { + "epoch": 0.0011038115755578055, + "grad_norm": 0.08515627682209015, + "learning_rate": 0.000145, + "loss": 2.1003, + "step": 290 + }, + { + "epoch": 0.0011418740436804885, + "grad_norm": 0.08448918908834457, + "learning_rate": 0.00015, + "loss": 2.1146, + "step": 300 + }, + { + "epoch": 0.0011799365118031714, + "grad_norm": 0.08467483520507812, + "learning_rate": 0.000155, + "loss": 2.1044, + "step": 310 + }, + { + "epoch": 0.0012179989799258543, + "grad_norm": 0.08878104388713837, + "learning_rate": 0.00016, + "loss": 2.1088, + "step": 320 + }, + { + "epoch": 0.0012560614480485372, + "grad_norm": 0.08743339031934738, + "learning_rate": 0.000165, + "loss": 2.1146, + "step": 330 + }, + { + "epoch": 0.0012941239161712202, + "grad_norm": 0.08558017015457153, + "learning_rate": 0.00017, + "loss": 2.1011, + "step": 340 + }, + { + "epoch": 0.001332186384293903, + "grad_norm": 0.08609547466039658, + "learning_rate": 0.000175, + "loss": 2.1014, + "step": 350 + }, + { + "epoch": 0.001370248852416586, + "grad_norm": 0.0850830003619194, + "learning_rate": 0.00017999999999999998, + "loss": 2.0965, + "step": 360 + }, + { + "epoch": 0.0014083113205392692, + "grad_norm": 0.08757133781909943, + "learning_rate": 0.000185, + "loss": 2.0986, + "step": 370 + }, + { + "epoch": 0.001446373788661952, + "grad_norm": 0.08911364525556564, + "learning_rate": 0.00019, + "loss": 2.1004, + "step": 380 + }, + { + "epoch": 0.001484436256784635, + "grad_norm": 0.09093070030212402, + "learning_rate": 0.00019500000000000002, + "loss": 2.1119, + "step": 390 + }, + { + "epoch": 0.001522498724907318, + "grad_norm": 0.10527298599481583, + "learning_rate": 0.0002, + "loss": 2.1042, + "step": 400 + }, + { + "epoch": 0.0015605611930300009, + "grad_norm": 0.0866197869181633, + "learning_rate": 0.000205, + "loss": 2.1035, + "step": 410 + }, + { + "epoch": 0.0015986236611526838, + "grad_norm": 0.09171626716852188, + "learning_rate": 0.00021, + "loss": 2.0852, + "step": 420 + }, + { + "epoch": 0.0016366861292753667, + "grad_norm": 0.0898546651005745, + "learning_rate": 0.000215, + "loss": 2.0928, + "step": 430 + }, + { + "epoch": 0.0016747485973980496, + "grad_norm": 0.09225795418024063, + "learning_rate": 0.00022, + "loss": 2.1046, + "step": 440 + }, + { + "epoch": 0.0017128110655207326, + "grad_norm": 0.09323623031377792, + "learning_rate": 0.00022500000000000002, + "loss": 2.1001, + "step": 450 + }, + { + "epoch": 0.0017508735336434155, + "grad_norm": 0.08722110092639923, + "learning_rate": 0.00023, + "loss": 2.1023, + "step": 460 + }, + { + "epoch": 0.0017889360017660984, + "grad_norm": 0.09408703446388245, + "learning_rate": 0.000235, + "loss": 2.0973, + "step": 470 + }, + { + "epoch": 0.0018269984698887816, + "grad_norm": 0.0959773138165474, + "learning_rate": 0.00024, + "loss": 2.1032, + "step": 480 + }, + { + "epoch": 0.0018650609380114645, + "grad_norm": 0.0968349426984787, + "learning_rate": 0.000245, + "loss": 2.093, + "step": 490 + }, + { + "epoch": 0.0019031234061341474, + "grad_norm": 0.0896330177783966, + "learning_rate": 0.00025, + "loss": 2.1037, + "step": 500 + }, + { + "epoch": 0.0019411858742568303, + "grad_norm": 0.09475599974393845, + "learning_rate": 0.000255, + "loss": 2.1015, + "step": 510 + }, + { + "epoch": 0.001979248342379513, + "grad_norm": 0.09628993272781372, + "learning_rate": 0.00026000000000000003, + "loss": 2.1125, + "step": 520 + }, + { + "epoch": 0.002017310810502196, + "grad_norm": 0.10538001358509064, + "learning_rate": 0.00026500000000000004, + "loss": 2.1087, + "step": 530 + }, + { + "epoch": 0.0020553732786248793, + "grad_norm": 0.09868626296520233, + "learning_rate": 0.00027, + "loss": 2.0996, + "step": 540 + }, + { + "epoch": 0.002093435746747562, + "grad_norm": 0.09002415835857391, + "learning_rate": 0.000275, + "loss": 2.1165, + "step": 550 + }, + { + "epoch": 0.002131498214870245, + "grad_norm": 0.09453532099723816, + "learning_rate": 0.00028000000000000003, + "loss": 2.0897, + "step": 560 + }, + { + "epoch": 0.002169560682992928, + "grad_norm": 0.09969169646501541, + "learning_rate": 0.000285, + "loss": 2.0933, + "step": 570 + }, + { + "epoch": 0.002207623151115611, + "grad_norm": 0.09832243621349335, + "learning_rate": 0.00029, + "loss": 2.0854, + "step": 580 + }, + { + "epoch": 0.0022456856192382938, + "grad_norm": 0.10161517560482025, + "learning_rate": 0.000295, + "loss": 2.0979, + "step": 590 + }, + { + "epoch": 0.002283748087360977, + "grad_norm": 0.10710626095533371, + "learning_rate": 0.0003, + "loss": 2.1076, + "step": 600 + }, + { + "epoch": 0.0023218105554836596, + "grad_norm": 0.09586170315742493, + "learning_rate": 0.000305, + "loss": 2.1116, + "step": 610 + }, + { + "epoch": 0.0023598730236063428, + "grad_norm": 0.09288761764764786, + "learning_rate": 0.00031, + "loss": 2.1174, + "step": 620 + }, + { + "epoch": 0.0023979354917290255, + "grad_norm": 0.10027331113815308, + "learning_rate": 0.000315, + "loss": 2.1152, + "step": 630 + }, + { + "epoch": 0.0024359979598517086, + "grad_norm": 0.0982818529009819, + "learning_rate": 0.00032, + "loss": 2.1039, + "step": 640 + }, + { + "epoch": 0.0024740604279743918, + "grad_norm": 0.09577590972185135, + "learning_rate": 0.00032500000000000004, + "loss": 2.1071, + "step": 650 + }, + { + "epoch": 0.0025121228960970745, + "grad_norm": 0.09431636333465576, + "learning_rate": 0.00033, + "loss": 2.1073, + "step": 660 + }, + { + "epoch": 0.0025501853642197576, + "grad_norm": 0.10308602452278137, + "learning_rate": 0.000335, + "loss": 2.1145, + "step": 670 + }, + { + "epoch": 0.0025882478323424403, + "grad_norm": 0.0990699827671051, + "learning_rate": 0.00034, + "loss": 2.1081, + "step": 680 + }, + { + "epoch": 0.0026263103004651235, + "grad_norm": 0.10600564628839493, + "learning_rate": 0.000345, + "loss": 2.11, + "step": 690 + }, + { + "epoch": 0.002664372768587806, + "grad_norm": 0.10044153034687042, + "learning_rate": 0.00035, + "loss": 2.1018, + "step": 700 + }, + { + "epoch": 0.0027024352367104893, + "grad_norm": 0.09696255624294281, + "learning_rate": 0.000355, + "loss": 2.1055, + "step": 710 + }, + { + "epoch": 0.002740497704833172, + "grad_norm": 0.10313771665096283, + "learning_rate": 0.00035999999999999997, + "loss": 2.1031, + "step": 720 + }, + { + "epoch": 0.002778560172955855, + "grad_norm": 0.10236144065856934, + "learning_rate": 0.000365, + "loss": 2.1022, + "step": 730 + }, + { + "epoch": 0.0028166226410785383, + "grad_norm": 0.10610686242580414, + "learning_rate": 0.00037, + "loss": 2.1216, + "step": 740 + }, + { + "epoch": 0.002854685109201221, + "grad_norm": 0.10392218083143234, + "learning_rate": 0.000375, + "loss": 2.1294, + "step": 750 + }, + { + "epoch": 0.002892747577323904, + "grad_norm": 0.11248600482940674, + "learning_rate": 0.00038, + "loss": 2.1082, + "step": 760 + }, + { + "epoch": 0.002930810045446587, + "grad_norm": 0.10452433675527573, + "learning_rate": 0.00038500000000000003, + "loss": 2.1289, + "step": 770 + }, + { + "epoch": 0.00296887251356927, + "grad_norm": 0.1146014854311943, + "learning_rate": 0.00039000000000000005, + "loss": 2.1147, + "step": 780 + }, + { + "epoch": 0.0030069349816919527, + "grad_norm": 0.10917292535305023, + "learning_rate": 0.000395, + "loss": 2.1198, + "step": 790 + }, + { + "epoch": 0.003044997449814636, + "grad_norm": 0.10151583701372147, + "learning_rate": 0.0004, + "loss": 2.1119, + "step": 800 + }, + { + "epoch": 0.0030830599179373186, + "grad_norm": 0.10754556208848953, + "learning_rate": 0.00040500000000000003, + "loss": 2.1106, + "step": 810 + }, + { + "epoch": 0.0031211223860600017, + "grad_norm": 0.11548687517642975, + "learning_rate": 0.00041, + "loss": 2.1151, + "step": 820 + }, + { + "epoch": 0.0031591848541826844, + "grad_norm": 0.1291622668504715, + "learning_rate": 0.000415, + "loss": 2.1142, + "step": 830 + }, + { + "epoch": 0.0031972473223053676, + "grad_norm": 0.10699468106031418, + "learning_rate": 0.00042, + "loss": 2.1266, + "step": 840 + }, + { + "epoch": 0.0032353097904280507, + "grad_norm": 0.11056250333786011, + "learning_rate": 0.000425, + "loss": 2.1279, + "step": 850 + }, + { + "epoch": 0.0032733722585507334, + "grad_norm": 0.10794071853160858, + "learning_rate": 0.00043, + "loss": 2.1094, + "step": 860 + }, + { + "epoch": 0.0033114347266734166, + "grad_norm": 0.10671833902597427, + "learning_rate": 0.000435, + "loss": 2.1074, + "step": 870 + }, + { + "epoch": 0.0033494971947960993, + "grad_norm": 0.12604257464408875, + "learning_rate": 0.00044, + "loss": 2.1375, + "step": 880 + }, + { + "epoch": 0.0033875596629187824, + "grad_norm": 0.12434367090463638, + "learning_rate": 0.00044500000000000003, + "loss": 2.1147, + "step": 890 + }, + { + "epoch": 0.003425622131041465, + "grad_norm": 0.11782653629779816, + "learning_rate": 0.00045000000000000004, + "loss": 2.1343, + "step": 900 + }, + { + "epoch": 0.0034636845991641483, + "grad_norm": 0.11729437857866287, + "learning_rate": 0.000455, + "loss": 2.1274, + "step": 910 + }, + { + "epoch": 0.003501747067286831, + "grad_norm": 0.10995844751596451, + "learning_rate": 0.00046, + "loss": 2.1398, + "step": 920 + }, + { + "epoch": 0.003539809535409514, + "grad_norm": 0.09970034658908844, + "learning_rate": 0.000465, + "loss": 2.1258, + "step": 930 + }, + { + "epoch": 0.003577872003532197, + "grad_norm": 0.11986082047224045, + "learning_rate": 0.00047, + "loss": 2.1359, + "step": 940 + }, + { + "epoch": 0.00361593447165488, + "grad_norm": 0.12323372066020966, + "learning_rate": 0.000475, + "loss": 2.1323, + "step": 950 + }, + { + "epoch": 0.003653996939777563, + "grad_norm": 0.10662589967250824, + "learning_rate": 0.00048, + "loss": 2.1449, + "step": 960 + }, + { + "epoch": 0.003692059407900246, + "grad_norm": 0.10080249607563019, + "learning_rate": 0.00048499999999999997, + "loss": 2.1253, + "step": 970 + }, + { + "epoch": 0.003730121876022929, + "grad_norm": 0.11241836100816727, + "learning_rate": 0.00049, + "loss": 2.1465, + "step": 980 + }, + { + "epoch": 0.0037681843441456117, + "grad_norm": 0.11886326968669891, + "learning_rate": 0.000495, + "loss": 2.141, + "step": 990 + }, + { + "epoch": 0.003806246812268295, + "grad_norm": 0.12044759094715118, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 1000 + }, + { + "epoch": 0.0038443092803909775, + "grad_norm": 0.11921434849500656, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 1010 + }, + { + "epoch": 0.0038823717485136607, + "grad_norm": 0.10983549058437347, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 1020 + }, + { + "epoch": 0.003920434216636344, + "grad_norm": 0.11801790446043015, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 1030 + }, + { + "epoch": 0.003958496684759026, + "grad_norm": 0.12907488644123077, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 1040 + }, + { + "epoch": 0.003996559152881709, + "grad_norm": 0.11516977846622467, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 1050 + }, + { + "epoch": 0.004034621621004392, + "grad_norm": 0.1066596731543541, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 1060 + }, + { + "epoch": 0.0040726840891270755, + "grad_norm": 0.12312052398920059, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 1070 + }, + { + "epoch": 0.004110746557249759, + "grad_norm": 0.11293191462755203, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 1080 + }, + { + "epoch": 0.004148809025372441, + "grad_norm": 0.1253630816936493, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 1090 + }, + { + "epoch": 0.004186871493495124, + "grad_norm": 0.10422249883413315, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 1100 + }, + { + "epoch": 0.004224933961617807, + "grad_norm": 0.1098194494843483, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 1110 + }, + { + "epoch": 0.00426299642974049, + "grad_norm": 0.10751495510339737, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 1120 + }, + { + "epoch": 0.004301058897863173, + "grad_norm": 0.12427380681037903, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 1130 + }, + { + "epoch": 0.004339121365985856, + "grad_norm": 0.13727758824825287, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 1140 + }, + { + "epoch": 0.004377183834108539, + "grad_norm": 0.10616891831159592, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 1150 + }, + { + "epoch": 0.004415246302231222, + "grad_norm": 0.10865821689367294, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 1160 + }, + { + "epoch": 0.004453308770353905, + "grad_norm": 0.11342296004295349, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 1170 + }, + { + "epoch": 0.0044913712384765875, + "grad_norm": 0.10417460650205612, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 1180 + }, + { + "epoch": 0.004529433706599271, + "grad_norm": 0.11205831915140152, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 1190 + }, + { + "epoch": 0.004567496174721954, + "grad_norm": 0.10783912241458893, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 1200 + }, + { + "epoch": 0.004605558642844637, + "grad_norm": 0.10432789474725723, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 1210 + }, + { + "epoch": 0.004643621110967319, + "grad_norm": 0.10828524082899094, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 1220 + }, + { + "epoch": 0.004681683579090002, + "grad_norm": 0.10294743627309799, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 1230 + }, + { + "epoch": 0.0047197460472126855, + "grad_norm": 0.10957608371973038, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 1240 + }, + { + "epoch": 0.004757808515335369, + "grad_norm": 0.11409583687782288, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 1250 + }, + { + "epoch": 0.004795870983458051, + "grad_norm": 0.10860385745763779, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 1260 + }, + { + "epoch": 0.004833933451580734, + "grad_norm": 0.10973881185054779, + "learning_rate": 0.0005, + "loss": 2.1568, + "step": 1270 + }, + { + "epoch": 0.004871995919703417, + "grad_norm": 0.10787376016378403, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 1280 + }, + { + "epoch": 0.0049100583878261, + "grad_norm": 0.11786238849163055, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 1290 + }, + { + "epoch": 0.0049481208559487835, + "grad_norm": 0.11037887632846832, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 1300 + }, + { + "epoch": 0.004986183324071466, + "grad_norm": 0.11480342596769333, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 1310 + }, + { + "epoch": 0.005024245792194149, + "grad_norm": 0.10816387087106705, + "learning_rate": 0.0005, + "loss": 2.1485, + "step": 1320 + }, + { + "epoch": 0.005062308260316832, + "grad_norm": 0.11930803954601288, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 1330 + }, + { + "epoch": 0.005100370728439515, + "grad_norm": 0.12375199794769287, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 1340 + }, + { + "epoch": 0.0051384331965621975, + "grad_norm": 0.10290851444005966, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 1350 + }, + { + "epoch": 0.005176495664684881, + "grad_norm": 0.11710334569215775, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 1360 + }, + { + "epoch": 0.005214558132807564, + "grad_norm": 0.10058943927288055, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 1370 + }, + { + "epoch": 0.005252620600930247, + "grad_norm": 0.11752262711524963, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 1380 + }, + { + "epoch": 0.00529068306905293, + "grad_norm": 0.12212508171796799, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 1390 + }, + { + "epoch": 0.005328745537175612, + "grad_norm": 0.11722426861524582, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 1400 + }, + { + "epoch": 0.0053668080052982955, + "grad_norm": 0.10403724014759064, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 1410 + }, + { + "epoch": 0.005404870473420979, + "grad_norm": 0.11112242937088013, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 1420 + }, + { + "epoch": 0.005442932941543662, + "grad_norm": 0.1107005923986435, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 1430 + }, + { + "epoch": 0.005480995409666344, + "grad_norm": 0.11069675534963608, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 1440 + }, + { + "epoch": 0.005519057877789027, + "grad_norm": 0.1197921559214592, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 1450 + }, + { + "epoch": 0.00555712034591171, + "grad_norm": 0.1074242815375328, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 1460 + }, + { + "epoch": 0.0055951828140343935, + "grad_norm": 0.11167777329683304, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 1470 + }, + { + "epoch": 0.005633245282157077, + "grad_norm": 0.11702080816030502, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 1480 + }, + { + "epoch": 0.005671307750279759, + "grad_norm": 0.13201381266117096, + "learning_rate": 0.0005, + "loss": 2.1592, + "step": 1490 + }, + { + "epoch": 0.005709370218402442, + "grad_norm": 0.1159653514623642, + "learning_rate": 0.0005, + "loss": 2.1546, + "step": 1500 + }, + { + "epoch": 0.005747432686525125, + "grad_norm": 0.09878107905387878, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 1510 + }, + { + "epoch": 0.005785495154647808, + "grad_norm": 0.10091791301965714, + "learning_rate": 0.0005, + "loss": 2.1668, + "step": 1520 + }, + { + "epoch": 0.005823557622770491, + "grad_norm": 0.11111395061016083, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 1530 + }, + { + "epoch": 0.005861620090893174, + "grad_norm": 0.13084878027439117, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 1540 + }, + { + "epoch": 0.005899682559015857, + "grad_norm": 0.11034942418336868, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 1550 + }, + { + "epoch": 0.00593774502713854, + "grad_norm": 0.11699171364307404, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 1560 + }, + { + "epoch": 0.005975807495261222, + "grad_norm": 0.11403294652700424, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 1570 + }, + { + "epoch": 0.0060138699633839054, + "grad_norm": 0.13072063028812408, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 1580 + }, + { + "epoch": 0.006051932431506589, + "grad_norm": 0.11782855540513992, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 1590 + }, + { + "epoch": 0.006089994899629272, + "grad_norm": 0.10452011972665787, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 1600 + }, + { + "epoch": 0.006128057367751955, + "grad_norm": 0.11926303058862686, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 1610 + }, + { + "epoch": 0.006166119835874637, + "grad_norm": 0.11260377615690231, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 1620 + }, + { + "epoch": 0.00620418230399732, + "grad_norm": 0.10686420649290085, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 1630 + }, + { + "epoch": 0.0062422447721200034, + "grad_norm": 0.10754341632127762, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 1640 + }, + { + "epoch": 0.006280307240242687, + "grad_norm": 0.1057191789150238, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 1650 + }, + { + "epoch": 0.006318369708365369, + "grad_norm": 0.1099468246102333, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 1660 + }, + { + "epoch": 0.006356432176488052, + "grad_norm": 0.11434461176395416, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 1670 + }, + { + "epoch": 0.006394494644610735, + "grad_norm": 0.11896966397762299, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 1680 + }, + { + "epoch": 0.006432557112733418, + "grad_norm": 0.11888067424297333, + "learning_rate": 0.0005, + "loss": 2.1559, + "step": 1690 + }, + { + "epoch": 0.0064706195808561014, + "grad_norm": 0.1014968678355217, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 1700 + }, + { + "epoch": 0.006508682048978784, + "grad_norm": 0.1254124641418457, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 1710 + }, + { + "epoch": 0.006546744517101467, + "grad_norm": 0.10429880768060684, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 1720 + }, + { + "epoch": 0.00658480698522415, + "grad_norm": 0.10479571670293808, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 1730 + }, + { + "epoch": 0.006622869453346833, + "grad_norm": 0.1161293312907219, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 1740 + }, + { + "epoch": 0.006660931921469515, + "grad_norm": 0.11300167441368103, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 1750 + }, + { + "epoch": 0.0066989943895921986, + "grad_norm": 0.11762259900569916, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 1760 + }, + { + "epoch": 0.006737056857714882, + "grad_norm": 0.11976729333400726, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 1770 + }, + { + "epoch": 0.006775119325837565, + "grad_norm": 0.13087745010852814, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 1780 + }, + { + "epoch": 0.006813181793960247, + "grad_norm": 0.11606195569038391, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 1790 + }, + { + "epoch": 0.00685124426208293, + "grad_norm": 0.12213372439146042, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 1800 + }, + { + "epoch": 0.006889306730205613, + "grad_norm": 0.11877444386482239, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 1810 + }, + { + "epoch": 0.0069273691983282966, + "grad_norm": 0.11366769671440125, + "learning_rate": 0.0005, + "loss": 2.1611, + "step": 1820 + }, + { + "epoch": 0.00696543166645098, + "grad_norm": 0.11194705218076706, + "learning_rate": 0.0005, + "loss": 2.1607, + "step": 1830 + }, + { + "epoch": 0.007003494134573662, + "grad_norm": 0.12459465116262436, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 1840 + }, + { + "epoch": 0.007041556602696345, + "grad_norm": 0.10596070438623428, + "learning_rate": 0.0005, + "loss": 2.154, + "step": 1850 + }, + { + "epoch": 0.007079619070819028, + "grad_norm": 0.11263356357812881, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 1860 + }, + { + "epoch": 0.007117681538941711, + "grad_norm": 0.11101698130369186, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 1870 + }, + { + "epoch": 0.007155744007064394, + "grad_norm": 0.10336757451295853, + "learning_rate": 0.0005, + "loss": 2.1492, + "step": 1880 + }, + { + "epoch": 0.007193806475187077, + "grad_norm": 0.10375799983739853, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 1890 + }, + { + "epoch": 0.00723186894330976, + "grad_norm": 0.11153291165828705, + "learning_rate": 0.0005, + "loss": 2.1587, + "step": 1900 + }, + { + "epoch": 0.007269931411432443, + "grad_norm": 0.1019439771771431, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 1910 + }, + { + "epoch": 0.007307993879555126, + "grad_norm": 0.1037280336022377, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 1920 + }, + { + "epoch": 0.0073460563476778085, + "grad_norm": 0.12573005259037018, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 1930 + }, + { + "epoch": 0.007384118815800492, + "grad_norm": 0.11176995187997818, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 1940 + }, + { + "epoch": 0.007422181283923175, + "grad_norm": 0.10850751399993896, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 1950 + }, + { + "epoch": 0.007460243752045858, + "grad_norm": 0.10029775649309158, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 1960 + }, + { + "epoch": 0.00749830622016854, + "grad_norm": 0.10943976044654846, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 1970 + }, + { + "epoch": 0.007536368688291223, + "grad_norm": 0.11815327405929565, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 1980 + }, + { + "epoch": 0.0075744311564139065, + "grad_norm": 0.10207410156726837, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 1990 + }, + { + "epoch": 0.00761249362453659, + "grad_norm": 0.12041871249675751, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 2000 + }, + { + "epoch": 0.007650556092659273, + "grad_norm": 0.11289781332015991, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 2010 + }, + { + "epoch": 0.007688618560781955, + "grad_norm": 0.11642885953187943, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 2020 + }, + { + "epoch": 0.007726681028904638, + "grad_norm": 0.1056954562664032, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 2030 + }, + { + "epoch": 0.007764743497027321, + "grad_norm": 0.11253539472818375, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 2040 + }, + { + "epoch": 0.0078028059651500045, + "grad_norm": 0.11248568445444107, + "learning_rate": 0.0005, + "loss": 2.1635, + "step": 2050 + }, + { + "epoch": 0.007840868433272688, + "grad_norm": 0.1113501563668251, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 2060 + }, + { + "epoch": 0.00787893090139537, + "grad_norm": 0.12106958031654358, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 2070 + }, + { + "epoch": 0.007916993369518052, + "grad_norm": 0.11136061698198318, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 2080 + }, + { + "epoch": 0.007955055837640735, + "grad_norm": 0.11817717552185059, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 2090 + }, + { + "epoch": 0.007993118305763419, + "grad_norm": 0.11543906480073929, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 2100 + }, + { + "epoch": 0.008031180773886102, + "grad_norm": 0.11145801842212677, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 2110 + }, + { + "epoch": 0.008069243242008785, + "grad_norm": 0.10091929137706757, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 2120 + }, + { + "epoch": 0.008107305710131468, + "grad_norm": 0.10481547564268112, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 2130 + }, + { + "epoch": 0.008145368178254151, + "grad_norm": 0.10759226232767105, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 2140 + }, + { + "epoch": 0.008183430646376834, + "grad_norm": 0.12797632813453674, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 2150 + }, + { + "epoch": 0.008221493114499517, + "grad_norm": 0.11188928782939911, + "learning_rate": 0.0005, + "loss": 2.1589, + "step": 2160 + }, + { + "epoch": 0.008259555582622199, + "grad_norm": 0.11084867268800735, + "learning_rate": 0.0005, + "loss": 2.1537, + "step": 2170 + }, + { + "epoch": 0.008297618050744882, + "grad_norm": 0.11851538717746735, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 2180 + }, + { + "epoch": 0.008335680518867565, + "grad_norm": 0.11423757672309875, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 2190 + }, + { + "epoch": 0.008373742986990248, + "grad_norm": 0.12670071423053741, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 2200 + }, + { + "epoch": 0.008411805455112931, + "grad_norm": 0.11328286677598953, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 2210 + }, + { + "epoch": 0.008449867923235614, + "grad_norm": 0.10992307960987091, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 2220 + }, + { + "epoch": 0.008487930391358298, + "grad_norm": 0.11549299955368042, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 2230 + }, + { + "epoch": 0.00852599285948098, + "grad_norm": 0.1016901507973671, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 2240 + }, + { + "epoch": 0.008564055327603664, + "grad_norm": 0.11092566698789597, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 2250 + }, + { + "epoch": 0.008602117795726345, + "grad_norm": 0.10870914906263351, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 2260 + }, + { + "epoch": 0.008640180263849028, + "grad_norm": 0.10901869088411331, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 2270 + }, + { + "epoch": 0.008678242731971712, + "grad_norm": 0.11331748217344284, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 2280 + }, + { + "epoch": 0.008716305200094395, + "grad_norm": 0.10923583805561066, + "learning_rate": 0.0005, + "loss": 2.1541, + "step": 2290 + }, + { + "epoch": 0.008754367668217078, + "grad_norm": 0.10727989673614502, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 2300 + }, + { + "epoch": 0.008792430136339761, + "grad_norm": 0.10583169013261795, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 2310 + }, + { + "epoch": 0.008830492604462444, + "grad_norm": 0.10441339015960693, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 2320 + }, + { + "epoch": 0.008868555072585127, + "grad_norm": 0.1082761138677597, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 2330 + }, + { + "epoch": 0.00890661754070781, + "grad_norm": 0.12606731057167053, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 2340 + }, + { + "epoch": 0.008944680008830492, + "grad_norm": 0.12127711623907089, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 2350 + }, + { + "epoch": 0.008982742476953175, + "grad_norm": 0.13403546810150146, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 2360 + }, + { + "epoch": 0.009020804945075858, + "grad_norm": 0.12472402304410934, + "learning_rate": 0.0005, + "loss": 2.1535, + "step": 2370 + }, + { + "epoch": 0.009058867413198541, + "grad_norm": 0.10507071018218994, + "learning_rate": 0.0005, + "loss": 2.1633, + "step": 2380 + }, + { + "epoch": 0.009096929881321224, + "grad_norm": 0.12194889783859253, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 2390 + }, + { + "epoch": 0.009134992349443908, + "grad_norm": 0.11172129213809967, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 2400 + }, + { + "epoch": 0.00917305481756659, + "grad_norm": 0.10743540525436401, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 2410 + }, + { + "epoch": 0.009211117285689274, + "grad_norm": 0.1062987893819809, + "learning_rate": 0.0005, + "loss": 2.1579, + "step": 2420 + }, + { + "epoch": 0.009249179753811957, + "grad_norm": 0.12290962040424347, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 2430 + }, + { + "epoch": 0.009287242221934638, + "grad_norm": 0.10917174816131592, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 2440 + }, + { + "epoch": 0.009325304690057322, + "grad_norm": 0.11427836120128632, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 2450 + }, + { + "epoch": 0.009363367158180005, + "grad_norm": 0.11217708885669708, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 2460 + }, + { + "epoch": 0.009401429626302688, + "grad_norm": 0.10235556215047836, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 2470 + }, + { + "epoch": 0.009439492094425371, + "grad_norm": 0.10068736970424652, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 2480 + }, + { + "epoch": 0.009477554562548054, + "grad_norm": 0.10361825674772263, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 2490 + }, + { + "epoch": 0.009515617030670737, + "grad_norm": 0.13224199414253235, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 2500 + }, + { + "epoch": 0.00955367949879342, + "grad_norm": 0.11844684928655624, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 2510 + }, + { + "epoch": 0.009591741966916102, + "grad_norm": 0.11305680871009827, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 2520 + }, + { + "epoch": 0.009629804435038785, + "grad_norm": 0.11989112198352814, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 2530 + }, + { + "epoch": 0.009667866903161468, + "grad_norm": 0.09989438951015472, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 2540 + }, + { + "epoch": 0.009705929371284151, + "grad_norm": 0.10064006596803665, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 2550 + }, + { + "epoch": 0.009743991839406834, + "grad_norm": 0.11770729720592499, + "learning_rate": 0.0005, + "loss": 2.1576, + "step": 2560 + }, + { + "epoch": 0.009782054307529518, + "grad_norm": 0.10948820412158966, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 2570 + }, + { + "epoch": 0.0098201167756522, + "grad_norm": 0.11830486357212067, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 2580 + }, + { + "epoch": 0.009858179243774884, + "grad_norm": 0.1152670606970787, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 2590 + }, + { + "epoch": 0.009896241711897567, + "grad_norm": 0.10491305589675903, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 2600 + }, + { + "epoch": 0.009934304180020248, + "grad_norm": 0.12046907842159271, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 2610 + }, + { + "epoch": 0.009972366648142932, + "grad_norm": 0.11437215656042099, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 2620 + }, + { + "epoch": 0.010010429116265615, + "grad_norm": 0.10899262130260468, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 2630 + }, + { + "epoch": 0.010048491584388298, + "grad_norm": 0.11079401522874832, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 2640 + }, + { + "epoch": 0.010086554052510981, + "grad_norm": 0.10931838303804398, + "learning_rate": 0.0005, + "loss": 2.1567, + "step": 2650 + }, + { + "epoch": 0.010124616520633664, + "grad_norm": 0.10471278429031372, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 2660 + }, + { + "epoch": 0.010162678988756347, + "grad_norm": 0.1277533322572708, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 2670 + }, + { + "epoch": 0.01020074145687903, + "grad_norm": 0.11684451997280121, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 2680 + }, + { + "epoch": 0.010238803925001714, + "grad_norm": 0.11240722239017487, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 2690 + }, + { + "epoch": 0.010276866393124395, + "grad_norm": 0.11600897461175919, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 2700 + }, + { + "epoch": 0.010314928861247078, + "grad_norm": 0.11586230248212814, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 2710 + }, + { + "epoch": 0.010352991329369761, + "grad_norm": 0.1122283861041069, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 2720 + }, + { + "epoch": 0.010391053797492444, + "grad_norm": 0.11556751281023026, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 2730 + }, + { + "epoch": 0.010429116265615128, + "grad_norm": 0.10538724064826965, + "learning_rate": 0.0005, + "loss": 2.1592, + "step": 2740 + }, + { + "epoch": 0.01046717873373781, + "grad_norm": 0.11462972313165665, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 2750 + }, + { + "epoch": 0.010505241201860494, + "grad_norm": 0.1125258207321167, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 2760 + }, + { + "epoch": 0.010543303669983177, + "grad_norm": 0.11863648891448975, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 2770 + }, + { + "epoch": 0.01058136613810586, + "grad_norm": 0.1135948970913887, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 2780 + }, + { + "epoch": 0.010619428606228542, + "grad_norm": 0.10585552453994751, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 2790 + }, + { + "epoch": 0.010657491074351225, + "grad_norm": 0.10917206853628159, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 2800 + }, + { + "epoch": 0.010695553542473908, + "grad_norm": 0.11167892813682556, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 2810 + }, + { + "epoch": 0.010733616010596591, + "grad_norm": 0.12436781078577042, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 2820 + }, + { + "epoch": 0.010771678478719274, + "grad_norm": 0.11940976977348328, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 2830 + }, + { + "epoch": 0.010809740946841957, + "grad_norm": 0.10679332166910172, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 2840 + }, + { + "epoch": 0.01084780341496464, + "grad_norm": 0.10838009417057037, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 2850 + }, + { + "epoch": 0.010885865883087324, + "grad_norm": 0.11248808354139328, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 2860 + }, + { + "epoch": 0.010923928351210007, + "grad_norm": 0.12218758463859558, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 2870 + }, + { + "epoch": 0.010961990819332688, + "grad_norm": 0.12311123311519623, + "learning_rate": 0.0005, + "loss": 2.1612, + "step": 2880 + }, + { + "epoch": 0.011000053287455371, + "grad_norm": 0.12107503414154053, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 2890 + }, + { + "epoch": 0.011038115755578054, + "grad_norm": 0.1265459507703781, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 2900 + }, + { + "epoch": 0.011076178223700738, + "grad_norm": 0.10080008208751678, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 2910 + }, + { + "epoch": 0.01111424069182342, + "grad_norm": 0.1411270946264267, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 2920 + }, + { + "epoch": 0.011152303159946104, + "grad_norm": 0.10671328008174896, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 2930 + }, + { + "epoch": 0.011190365628068787, + "grad_norm": 0.1083766371011734, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 2940 + }, + { + "epoch": 0.01122842809619147, + "grad_norm": 0.10850624740123749, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 2950 + }, + { + "epoch": 0.011266490564314153, + "grad_norm": 0.11468614637851715, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 2960 + }, + { + "epoch": 0.011304553032436835, + "grad_norm": 0.11435503512620926, + "learning_rate": 0.0005, + "loss": 2.154, + "step": 2970 + }, + { + "epoch": 0.011342615500559518, + "grad_norm": 0.12531743943691254, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 2980 + }, + { + "epoch": 0.011380677968682201, + "grad_norm": 0.12513144314289093, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 2990 + }, + { + "epoch": 0.011418740436804884, + "grad_norm": 0.10959810763597488, + "learning_rate": 0.0005, + "loss": 2.1571, + "step": 3000 + }, + { + "epoch": 0.011456802904927567, + "grad_norm": 0.11499848961830139, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 3010 + }, + { + "epoch": 0.01149486537305025, + "grad_norm": 0.11116579920053482, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 3020 + }, + { + "epoch": 0.011532927841172934, + "grad_norm": 0.11482273787260056, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 3030 + }, + { + "epoch": 0.011570990309295617, + "grad_norm": 0.11653285473585129, + "learning_rate": 0.0005, + "loss": 2.1624, + "step": 3040 + }, + { + "epoch": 0.011609052777418298, + "grad_norm": 0.10759188234806061, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 3050 + }, + { + "epoch": 0.011647115245540981, + "grad_norm": 0.10869356244802475, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 3060 + }, + { + "epoch": 0.011685177713663664, + "grad_norm": 0.11057727783918381, + "learning_rate": 0.0005, + "loss": 2.1549, + "step": 3070 + }, + { + "epoch": 0.011723240181786347, + "grad_norm": 0.12141691893339157, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 3080 + }, + { + "epoch": 0.01176130264990903, + "grad_norm": 0.11846601963043213, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 3090 + }, + { + "epoch": 0.011799365118031714, + "grad_norm": 0.10390357673168182, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 3100 + }, + { + "epoch": 0.011837427586154397, + "grad_norm": 0.10856925696134567, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 3110 + }, + { + "epoch": 0.01187549005427708, + "grad_norm": 0.11197613179683685, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 3120 + }, + { + "epoch": 0.011913552522399763, + "grad_norm": 0.10197338461875916, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 3130 + }, + { + "epoch": 0.011951614990522445, + "grad_norm": 0.10315605998039246, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 3140 + }, + { + "epoch": 0.011989677458645128, + "grad_norm": 0.11616446822881699, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 3150 + }, + { + "epoch": 0.012027739926767811, + "grad_norm": 0.1289011538028717, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 3160 + }, + { + "epoch": 0.012065802394890494, + "grad_norm": 0.11502829194068909, + "learning_rate": 0.0005, + "loss": 2.1532, + "step": 3170 + }, + { + "epoch": 0.012103864863013177, + "grad_norm": 0.11335300654172897, + "learning_rate": 0.0005, + "loss": 2.1519, + "step": 3180 + }, + { + "epoch": 0.01214192733113586, + "grad_norm": 0.1180783212184906, + "learning_rate": 0.0005, + "loss": 2.1551, + "step": 3190 + }, + { + "epoch": 0.012179989799258543, + "grad_norm": 0.10899759829044342, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 3200 + }, + { + "epoch": 0.012218052267381227, + "grad_norm": 0.1238299235701561, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 3210 + }, + { + "epoch": 0.01225611473550391, + "grad_norm": 0.1062338650226593, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 3220 + }, + { + "epoch": 0.012294177203626591, + "grad_norm": 0.10229603201150894, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 3230 + }, + { + "epoch": 0.012332239671749274, + "grad_norm": 0.109988272190094, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 3240 + }, + { + "epoch": 0.012370302139871957, + "grad_norm": 0.11193165183067322, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 3250 + }, + { + "epoch": 0.01240836460799464, + "grad_norm": 0.10783321410417557, + "learning_rate": 0.0005, + "loss": 2.1551, + "step": 3260 + }, + { + "epoch": 0.012446427076117324, + "grad_norm": 0.1088133156299591, + "learning_rate": 0.0005, + "loss": 2.1659, + "step": 3270 + }, + { + "epoch": 0.012484489544240007, + "grad_norm": 0.11209650337696075, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 3280 + }, + { + "epoch": 0.01252255201236269, + "grad_norm": 0.11094122380018234, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 3290 + }, + { + "epoch": 0.012560614480485373, + "grad_norm": 0.11281769722700119, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 3300 + }, + { + "epoch": 0.012598676948608056, + "grad_norm": 0.11721902340650558, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 3310 + }, + { + "epoch": 0.012636739416730738, + "grad_norm": 0.1123766303062439, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 3320 + }, + { + "epoch": 0.01267480188485342, + "grad_norm": 0.11648537218570709, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 3330 + }, + { + "epoch": 0.012712864352976104, + "grad_norm": 0.11745714396238327, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 3340 + }, + { + "epoch": 0.012750926821098787, + "grad_norm": 0.10987821221351624, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 3350 + }, + { + "epoch": 0.01278898928922147, + "grad_norm": 0.11140109598636627, + "learning_rate": 0.0005, + "loss": 2.1564, + "step": 3360 + }, + { + "epoch": 0.012827051757344153, + "grad_norm": 0.1149124875664711, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 3370 + }, + { + "epoch": 0.012865114225466837, + "grad_norm": 0.10883333534002304, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 3380 + }, + { + "epoch": 0.01290317669358952, + "grad_norm": 0.11000484973192215, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 3390 + }, + { + "epoch": 0.012941239161712203, + "grad_norm": 0.11543362587690353, + "learning_rate": 0.0005, + "loss": 2.1549, + "step": 3400 + }, + { + "epoch": 0.012979301629834884, + "grad_norm": 0.1056831106543541, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 3410 + }, + { + "epoch": 0.013017364097957567, + "grad_norm": 0.11518535017967224, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 3420 + }, + { + "epoch": 0.01305542656608025, + "grad_norm": 0.09811803698539734, + "learning_rate": 0.0005, + "loss": 2.1557, + "step": 3430 + }, + { + "epoch": 0.013093489034202934, + "grad_norm": 0.11671673506498337, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 3440 + }, + { + "epoch": 0.013131551502325617, + "grad_norm": 0.12312810868024826, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 3450 + }, + { + "epoch": 0.0131696139704483, + "grad_norm": 0.10879986733198166, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 3460 + }, + { + "epoch": 0.013207676438570983, + "grad_norm": 0.11917275190353394, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 3470 + }, + { + "epoch": 0.013245738906693666, + "grad_norm": 0.10777828842401505, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 3480 + }, + { + "epoch": 0.01328380137481635, + "grad_norm": 0.10327920317649841, + "learning_rate": 0.0005, + "loss": 2.1602, + "step": 3490 + }, + { + "epoch": 0.01332186384293903, + "grad_norm": 0.10455843061208725, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 3500 + }, + { + "epoch": 0.013359926311061714, + "grad_norm": 0.11596754938364029, + "learning_rate": 0.0005, + "loss": 2.1587, + "step": 3510 + }, + { + "epoch": 0.013397988779184397, + "grad_norm": 0.12860432267189026, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 3520 + }, + { + "epoch": 0.01343605124730708, + "grad_norm": 0.12856276333332062, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 3530 + }, + { + "epoch": 0.013474113715429763, + "grad_norm": 0.10185491293668747, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 3540 + }, + { + "epoch": 0.013512176183552447, + "grad_norm": 0.10582708567380905, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 3550 + }, + { + "epoch": 0.01355023865167513, + "grad_norm": 0.10672181844711304, + "learning_rate": 0.0005, + "loss": 2.1632, + "step": 3560 + }, + { + "epoch": 0.013588301119797813, + "grad_norm": 0.0988573208451271, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 3570 + }, + { + "epoch": 0.013626363587920494, + "grad_norm": 0.11891688406467438, + "learning_rate": 0.0005, + "loss": 2.154, + "step": 3580 + }, + { + "epoch": 0.013664426056043177, + "grad_norm": 0.12185022234916687, + "learning_rate": 0.0005, + "loss": 2.1539, + "step": 3590 + }, + { + "epoch": 0.01370248852416586, + "grad_norm": 0.12364726513624191, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 3600 + }, + { + "epoch": 0.013740550992288544, + "grad_norm": 0.10995651036500931, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 3610 + }, + { + "epoch": 0.013778613460411227, + "grad_norm": 0.10640208423137665, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 3620 + }, + { + "epoch": 0.01381667592853391, + "grad_norm": 0.10756561160087585, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 3630 + }, + { + "epoch": 0.013854738396656593, + "grad_norm": 0.11378846317529678, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 3640 + }, + { + "epoch": 0.013892800864779276, + "grad_norm": 0.11161532998085022, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 3650 + }, + { + "epoch": 0.01393086333290196, + "grad_norm": 0.1208384558558464, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 3660 + }, + { + "epoch": 0.01396892580102464, + "grad_norm": 0.1076679527759552, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 3670 + }, + { + "epoch": 0.014006988269147324, + "grad_norm": 0.1010458767414093, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 3680 + }, + { + "epoch": 0.014045050737270007, + "grad_norm": 0.10920700430870056, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 3690 + }, + { + "epoch": 0.01408311320539269, + "grad_norm": 0.1006520465016365, + "learning_rate": 0.0005, + "loss": 2.1554, + "step": 3700 + }, + { + "epoch": 0.014121175673515373, + "grad_norm": 0.11252916604280472, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 3710 + }, + { + "epoch": 0.014159238141638057, + "grad_norm": 0.1261662244796753, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 3720 + }, + { + "epoch": 0.01419730060976074, + "grad_norm": 0.11702711135149002, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 3730 + }, + { + "epoch": 0.014235363077883423, + "grad_norm": 0.1109805703163147, + "learning_rate": 0.0005, + "loss": 2.1557, + "step": 3740 + }, + { + "epoch": 0.014273425546006106, + "grad_norm": 0.11875700950622559, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 3750 + }, + { + "epoch": 0.014311488014128787, + "grad_norm": 0.11200769245624542, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 3760 + }, + { + "epoch": 0.01434955048225147, + "grad_norm": 0.11239437758922577, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 3770 + }, + { + "epoch": 0.014387612950374154, + "grad_norm": 0.10733044147491455, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 3780 + }, + { + "epoch": 0.014425675418496837, + "grad_norm": 0.1107863038778305, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 3790 + }, + { + "epoch": 0.01446373788661952, + "grad_norm": 0.11729779094457626, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 3800 + }, + { + "epoch": 0.014501800354742203, + "grad_norm": 0.13996534049510956, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 3810 + }, + { + "epoch": 0.014539862822864886, + "grad_norm": 0.1317211240530014, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 3820 + }, + { + "epoch": 0.01457792529098757, + "grad_norm": 0.11421209573745728, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 3830 + }, + { + "epoch": 0.014615987759110253, + "grad_norm": 0.1259276568889618, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 3840 + }, + { + "epoch": 0.014654050227232934, + "grad_norm": 0.10297126322984695, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 3850 + }, + { + "epoch": 0.014692112695355617, + "grad_norm": 0.11537662148475647, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 3860 + }, + { + "epoch": 0.0147301751634783, + "grad_norm": 0.10959656536579132, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 3870 + }, + { + "epoch": 0.014768237631600983, + "grad_norm": 0.13356612622737885, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 3880 + }, + { + "epoch": 0.014806300099723666, + "grad_norm": 0.11244034022092819, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 3890 + }, + { + "epoch": 0.01484436256784635, + "grad_norm": 0.10692012310028076, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 3900 + }, + { + "epoch": 0.014882425035969033, + "grad_norm": 0.12048804759979248, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 3910 + }, + { + "epoch": 0.014920487504091716, + "grad_norm": 0.11501602083444595, + "learning_rate": 0.0005, + "loss": 2.1519, + "step": 3920 + }, + { + "epoch": 0.014958549972214399, + "grad_norm": 0.13394662737846375, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 3930 + }, + { + "epoch": 0.01499661244033708, + "grad_norm": 0.1142013743519783, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 3940 + }, + { + "epoch": 0.015034674908459764, + "grad_norm": 0.12087413668632507, + "learning_rate": 0.0005, + "loss": 2.1635, + "step": 3950 + }, + { + "epoch": 0.015072737376582447, + "grad_norm": 0.12638744711875916, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 3960 + }, + { + "epoch": 0.01511079984470513, + "grad_norm": 0.11318952590227127, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 3970 + }, + { + "epoch": 0.015148862312827813, + "grad_norm": 0.1457756757736206, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 3980 + }, + { + "epoch": 0.015186924780950496, + "grad_norm": 0.11943541467189789, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 3990 + }, + { + "epoch": 0.01522498724907318, + "grad_norm": 0.12239276617765427, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 4000 + }, + { + "epoch": 0.015263049717195862, + "grad_norm": 0.11262499541044235, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 4010 + }, + { + "epoch": 0.015301112185318546, + "grad_norm": 0.10902900248765945, + "learning_rate": 0.0005, + "loss": 2.155, + "step": 4020 + }, + { + "epoch": 0.015339174653441227, + "grad_norm": 0.13082574307918549, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 4030 + }, + { + "epoch": 0.01537723712156391, + "grad_norm": 0.11959017068147659, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 4040 + }, + { + "epoch": 0.015415299589686593, + "grad_norm": 0.1089482307434082, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 4050 + }, + { + "epoch": 0.015453362057809276, + "grad_norm": 0.11102756857872009, + "learning_rate": 0.0005, + "loss": 2.1616, + "step": 4060 + }, + { + "epoch": 0.01549142452593196, + "grad_norm": 0.11872225999832153, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 4070 + }, + { + "epoch": 0.015529486994054643, + "grad_norm": 0.11290697008371353, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 4080 + }, + { + "epoch": 0.015567549462177326, + "grad_norm": 0.12047409266233444, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 4090 + }, + { + "epoch": 0.015605611930300009, + "grad_norm": 0.1093122810125351, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 4100 + }, + { + "epoch": 0.01564367439842269, + "grad_norm": 0.11345285177230835, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 4110 + }, + { + "epoch": 0.015681736866545375, + "grad_norm": 0.1162080317735672, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 4120 + }, + { + "epoch": 0.015719799334668057, + "grad_norm": 0.11164089292287827, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 4130 + }, + { + "epoch": 0.01575786180279074, + "grad_norm": 0.1144440621137619, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 4140 + }, + { + "epoch": 0.015795924270913423, + "grad_norm": 0.11888343840837479, + "learning_rate": 0.0005, + "loss": 2.1581, + "step": 4150 + }, + { + "epoch": 0.015833986739036104, + "grad_norm": 0.12672144174575806, + "learning_rate": 0.0005, + "loss": 2.1607, + "step": 4160 + }, + { + "epoch": 0.01587204920715879, + "grad_norm": 0.11533960700035095, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 4170 + }, + { + "epoch": 0.01591011167528147, + "grad_norm": 0.11102886497974396, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 4180 + }, + { + "epoch": 0.015948174143404156, + "grad_norm": 0.11495087295770645, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 4190 + }, + { + "epoch": 0.015986236611526837, + "grad_norm": 0.11841031163930893, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 4200 + }, + { + "epoch": 0.016024299079649522, + "grad_norm": 0.0984388217329979, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 4210 + }, + { + "epoch": 0.016062361547772203, + "grad_norm": 0.10668789595365524, + "learning_rate": 0.0005, + "loss": 2.1667, + "step": 4220 + }, + { + "epoch": 0.016100424015894888, + "grad_norm": 0.10329697281122208, + "learning_rate": 0.0005, + "loss": 2.1572, + "step": 4230 + }, + { + "epoch": 0.01613848648401757, + "grad_norm": 0.12043692916631699, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 4240 + }, + { + "epoch": 0.01617654895214025, + "grad_norm": 0.09935463964939117, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 4250 + }, + { + "epoch": 0.016214611420262936, + "grad_norm": 0.10463161766529083, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 4260 + }, + { + "epoch": 0.016252673888385617, + "grad_norm": 0.10367552191019058, + "learning_rate": 0.0005, + "loss": 2.1518, + "step": 4270 + }, + { + "epoch": 0.016290736356508302, + "grad_norm": 0.1247159019112587, + "learning_rate": 0.0005, + "loss": 2.1563, + "step": 4280 + }, + { + "epoch": 0.016328798824630984, + "grad_norm": 0.1147475615143776, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 4290 + }, + { + "epoch": 0.01636686129275367, + "grad_norm": 0.12010012567043304, + "learning_rate": 0.0005, + "loss": 2.1578, + "step": 4300 + }, + { + "epoch": 0.01640492376087635, + "grad_norm": 0.10670970380306244, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 4310 + }, + { + "epoch": 0.016442986228999035, + "grad_norm": 0.11666595935821533, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 4320 + }, + { + "epoch": 0.016481048697121716, + "grad_norm": 0.10902661085128784, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 4330 + }, + { + "epoch": 0.016519111165244398, + "grad_norm": 0.10571938008069992, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 4340 + }, + { + "epoch": 0.016557173633367082, + "grad_norm": 0.10284340381622314, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 4350 + }, + { + "epoch": 0.016595236101489764, + "grad_norm": 0.10644084960222244, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 4360 + }, + { + "epoch": 0.01663329856961245, + "grad_norm": 0.12925571203231812, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 4370 + }, + { + "epoch": 0.01667136103773513, + "grad_norm": 0.11615116149187088, + "learning_rate": 0.0005, + "loss": 2.1632, + "step": 4380 + }, + { + "epoch": 0.016709423505857815, + "grad_norm": 0.1273653209209442, + "learning_rate": 0.0005, + "loss": 2.1595, + "step": 4390 + }, + { + "epoch": 0.016747485973980496, + "grad_norm": 0.11785674840211868, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 4400 + }, + { + "epoch": 0.01678554844210318, + "grad_norm": 0.10750816017389297, + "learning_rate": 0.0005, + "loss": 2.1615, + "step": 4410 + }, + { + "epoch": 0.016823610910225863, + "grad_norm": 0.11100894957780838, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 4420 + }, + { + "epoch": 0.016861673378348544, + "grad_norm": 0.11811844259500504, + "learning_rate": 0.0005, + "loss": 2.1534, + "step": 4430 + }, + { + "epoch": 0.01689973584647123, + "grad_norm": 0.12537986040115356, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 4440 + }, + { + "epoch": 0.01693779831459391, + "grad_norm": 0.12277396768331528, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 4450 + }, + { + "epoch": 0.016975860782716595, + "grad_norm": 0.10384443402290344, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 4460 + }, + { + "epoch": 0.017013923250839277, + "grad_norm": 0.10271551460027695, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 4470 + }, + { + "epoch": 0.01705198571896196, + "grad_norm": 0.10529985278844833, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 4480 + }, + { + "epoch": 0.017090048187084643, + "grad_norm": 0.1150999516248703, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 4490 + }, + { + "epoch": 0.017128110655207328, + "grad_norm": 0.1289505660533905, + "learning_rate": 0.0005, + "loss": 2.159, + "step": 4500 + }, + { + "epoch": 0.01716617312333001, + "grad_norm": 0.12131360918283463, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 4510 + }, + { + "epoch": 0.01720423559145269, + "grad_norm": 0.12527745962142944, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 4520 + }, + { + "epoch": 0.017242298059575376, + "grad_norm": 0.11226322501897812, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 4530 + }, + { + "epoch": 0.017280360527698057, + "grad_norm": 0.11783391982316971, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 4540 + }, + { + "epoch": 0.017318422995820742, + "grad_norm": 0.09702415764331818, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 4550 + }, + { + "epoch": 0.017356485463943423, + "grad_norm": 0.1273190677165985, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 4560 + }, + { + "epoch": 0.017394547932066108, + "grad_norm": 0.10814052075147629, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 4570 + }, + { + "epoch": 0.01743261040018879, + "grad_norm": 0.11285334080457687, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 4580 + }, + { + "epoch": 0.017470672868311474, + "grad_norm": 0.1097639873623848, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 4590 + }, + { + "epoch": 0.017508735336434156, + "grad_norm": 0.10255081206560135, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 4600 + }, + { + "epoch": 0.017546797804556837, + "grad_norm": 0.12281888723373413, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 4610 + }, + { + "epoch": 0.017584860272679522, + "grad_norm": 0.10445702821016312, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 4620 + }, + { + "epoch": 0.017622922740802204, + "grad_norm": 0.1302148401737213, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 4630 + }, + { + "epoch": 0.01766098520892489, + "grad_norm": 0.11009400337934494, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 4640 + }, + { + "epoch": 0.01769904767704757, + "grad_norm": 0.11754176765680313, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 4650 + }, + { + "epoch": 0.017737110145170255, + "grad_norm": 0.12567470967769623, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 4660 + }, + { + "epoch": 0.017775172613292936, + "grad_norm": 0.11794472485780716, + "learning_rate": 0.0005, + "loss": 2.1595, + "step": 4670 + }, + { + "epoch": 0.01781323508141562, + "grad_norm": 0.11542123556137085, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 4680 + }, + { + "epoch": 0.017851297549538302, + "grad_norm": 0.10805080085992813, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 4690 + }, + { + "epoch": 0.017889360017660984, + "grad_norm": 0.10144393891096115, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 4700 + }, + { + "epoch": 0.01792742248578367, + "grad_norm": 0.10997345298528671, + "learning_rate": 0.0005, + "loss": 2.1591, + "step": 4710 + }, + { + "epoch": 0.01796548495390635, + "grad_norm": 0.12483686953783035, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 4720 + }, + { + "epoch": 0.018003547422029035, + "grad_norm": 0.12187418341636658, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 4730 + }, + { + "epoch": 0.018041609890151716, + "grad_norm": 0.10523146390914917, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 4740 + }, + { + "epoch": 0.0180796723582744, + "grad_norm": 0.11619247496128082, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 4750 + }, + { + "epoch": 0.018117734826397083, + "grad_norm": 0.1118154227733612, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 4760 + }, + { + "epoch": 0.018155797294519768, + "grad_norm": 0.12364581227302551, + "learning_rate": 0.0005, + "loss": 2.1519, + "step": 4770 + }, + { + "epoch": 0.01819385976264245, + "grad_norm": 0.10529709607362747, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 4780 + }, + { + "epoch": 0.01823192223076513, + "grad_norm": 0.11116600036621094, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 4790 + }, + { + "epoch": 0.018269984698887815, + "grad_norm": 0.10533681511878967, + "learning_rate": 0.0005, + "loss": 2.1629, + "step": 4800 + }, + { + "epoch": 0.018308047167010497, + "grad_norm": 0.10860110819339752, + "learning_rate": 0.0005, + "loss": 2.1602, + "step": 4810 + }, + { + "epoch": 0.01834610963513318, + "grad_norm": 0.1140654981136322, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 4820 + }, + { + "epoch": 0.018384172103255863, + "grad_norm": 0.1118798777461052, + "learning_rate": 0.0005, + "loss": 2.1607, + "step": 4830 + }, + { + "epoch": 0.018422234571378548, + "grad_norm": 0.10930532217025757, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 4840 + }, + { + "epoch": 0.01846029703950123, + "grad_norm": 0.11511654406785965, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 4850 + }, + { + "epoch": 0.018498359507623914, + "grad_norm": 0.12256434559822083, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 4860 + }, + { + "epoch": 0.018536421975746595, + "grad_norm": 0.10739784687757492, + "learning_rate": 0.0005, + "loss": 2.1544, + "step": 4870 + }, + { + "epoch": 0.018574484443869277, + "grad_norm": 0.1213674396276474, + "learning_rate": 0.0005, + "loss": 2.159, + "step": 4880 + }, + { + "epoch": 0.018612546911991962, + "grad_norm": 0.10939610004425049, + "learning_rate": 0.0005, + "loss": 2.16, + "step": 4890 + }, + { + "epoch": 0.018650609380114643, + "grad_norm": 0.12793177366256714, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 4900 + }, + { + "epoch": 0.018688671848237328, + "grad_norm": 0.10633699595928192, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 4910 + }, + { + "epoch": 0.01872673431636001, + "grad_norm": 0.10457268357276917, + "learning_rate": 0.0005, + "loss": 2.164, + "step": 4920 + }, + { + "epoch": 0.018764796784482694, + "grad_norm": 0.10391423106193542, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 4930 + }, + { + "epoch": 0.018802859252605376, + "grad_norm": 0.11842848360538483, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 4940 + }, + { + "epoch": 0.01884092172072806, + "grad_norm": 0.10565365105867386, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 4950 + }, + { + "epoch": 0.018878984188850742, + "grad_norm": 0.11975981295108795, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 4960 + }, + { + "epoch": 0.018917046656973423, + "grad_norm": 0.11828301846981049, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 4970 + }, + { + "epoch": 0.01895510912509611, + "grad_norm": 0.11009760200977325, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 4980 + }, + { + "epoch": 0.01899317159321879, + "grad_norm": 0.10771029442548752, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 4990 + }, + { + "epoch": 0.019031234061341475, + "grad_norm": 0.1059429720044136, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 5000 + }, + { + "epoch": 0.019069296529464156, + "grad_norm": 0.12671475112438202, + "learning_rate": 0.0005, + "loss": 2.1593, + "step": 5010 + }, + { + "epoch": 0.01910735899758684, + "grad_norm": 0.1201770231127739, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 5020 + }, + { + "epoch": 0.019145421465709522, + "grad_norm": 0.1037500873208046, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 5030 + }, + { + "epoch": 0.019183483933832204, + "grad_norm": 0.11439839750528336, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 5040 + }, + { + "epoch": 0.01922154640195489, + "grad_norm": 0.1112758219242096, + "learning_rate": 0.0005, + "loss": 2.1582, + "step": 5050 + }, + { + "epoch": 0.01925960887007757, + "grad_norm": 0.10737047344446182, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 5060 + }, + { + "epoch": 0.019297671338200255, + "grad_norm": 0.10944429039955139, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 5070 + }, + { + "epoch": 0.019335733806322936, + "grad_norm": 0.09820980578660965, + "learning_rate": 0.0005, + "loss": 2.1531, + "step": 5080 + }, + { + "epoch": 0.01937379627444562, + "grad_norm": 0.12407433986663818, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 5090 + }, + { + "epoch": 0.019411858742568303, + "grad_norm": 0.10912308096885681, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 5100 + }, + { + "epoch": 0.019449921210690987, + "grad_norm": 0.11376690119504929, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 5110 + }, + { + "epoch": 0.01948798367881367, + "grad_norm": 0.10237011313438416, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 5120 + }, + { + "epoch": 0.01952604614693635, + "grad_norm": 0.12656770646572113, + "learning_rate": 0.0005, + "loss": 2.1534, + "step": 5130 + }, + { + "epoch": 0.019564108615059035, + "grad_norm": 0.11813058704137802, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 5140 + }, + { + "epoch": 0.019602171083181717, + "grad_norm": 0.11121872812509537, + "learning_rate": 0.0005, + "loss": 2.1561, + "step": 5150 + }, + { + "epoch": 0.0196402335513044, + "grad_norm": 0.10498569905757904, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 5160 + }, + { + "epoch": 0.019678296019427083, + "grad_norm": 0.12507279217243195, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 5170 + }, + { + "epoch": 0.019716358487549768, + "grad_norm": 0.11539608985185623, + "learning_rate": 0.0005, + "loss": 2.1582, + "step": 5180 + }, + { + "epoch": 0.01975442095567245, + "grad_norm": 0.11775651574134827, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 5190 + }, + { + "epoch": 0.019792483423795134, + "grad_norm": 0.1406317800283432, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 5200 + }, + { + "epoch": 0.019830545891917815, + "grad_norm": 0.1074119284749031, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 5210 + }, + { + "epoch": 0.019868608360040497, + "grad_norm": 0.1114739403128624, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 5220 + }, + { + "epoch": 0.01990667082816318, + "grad_norm": 0.107333704829216, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 5230 + }, + { + "epoch": 0.019944733296285863, + "grad_norm": 0.10788275301456451, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 5240 + }, + { + "epoch": 0.019982795764408548, + "grad_norm": 0.11002978682518005, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 5250 + }, + { + "epoch": 0.02002085823253123, + "grad_norm": 0.10800745338201523, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 5260 + }, + { + "epoch": 0.020058920700653914, + "grad_norm": 0.09979681670665741, + "learning_rate": 0.0005, + "loss": 2.1558, + "step": 5270 + }, + { + "epoch": 0.020096983168776596, + "grad_norm": 0.10672004520893097, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 5280 + }, + { + "epoch": 0.02013504563689928, + "grad_norm": 0.10285453498363495, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 5290 + }, + { + "epoch": 0.020173108105021962, + "grad_norm": 0.11391220986843109, + "learning_rate": 0.0005, + "loss": 2.1541, + "step": 5300 + }, + { + "epoch": 0.020211170573144643, + "grad_norm": 0.1035693883895874, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 5310 + }, + { + "epoch": 0.020249233041267328, + "grad_norm": 0.0980655699968338, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 5320 + }, + { + "epoch": 0.02028729550939001, + "grad_norm": 0.10977238416671753, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 5330 + }, + { + "epoch": 0.020325357977512695, + "grad_norm": 0.10729347169399261, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 5340 + }, + { + "epoch": 0.020363420445635376, + "grad_norm": 0.11655550450086594, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 5350 + }, + { + "epoch": 0.02040148291375806, + "grad_norm": 0.10931521654129028, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 5360 + }, + { + "epoch": 0.020439545381880742, + "grad_norm": 0.119588702917099, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 5370 + }, + { + "epoch": 0.020477607850003427, + "grad_norm": 0.12008684128522873, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 5380 + }, + { + "epoch": 0.02051567031812611, + "grad_norm": 0.10880262404680252, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 5390 + }, + { + "epoch": 0.02055373278624879, + "grad_norm": 0.12208819389343262, + "learning_rate": 0.0005, + "loss": 2.155, + "step": 5400 + }, + { + "epoch": 0.020591795254371475, + "grad_norm": 0.11892218887805939, + "learning_rate": 0.0005, + "loss": 2.1546, + "step": 5410 + }, + { + "epoch": 0.020629857722494156, + "grad_norm": 0.10370012372732162, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 5420 + }, + { + "epoch": 0.02066792019061684, + "grad_norm": 0.11022347956895828, + "learning_rate": 0.0005, + "loss": 2.1576, + "step": 5430 + }, + { + "epoch": 0.020705982658739523, + "grad_norm": 0.11062967777252197, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 5440 + }, + { + "epoch": 0.020744045126862207, + "grad_norm": 0.09825879335403442, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 5450 + }, + { + "epoch": 0.02078210759498489, + "grad_norm": 0.10920194536447525, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 5460 + }, + { + "epoch": 0.020820170063107574, + "grad_norm": 0.10575275868177414, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 5470 + }, + { + "epoch": 0.020858232531230255, + "grad_norm": 0.11406227946281433, + "learning_rate": 0.0005, + "loss": 2.16, + "step": 5480 + }, + { + "epoch": 0.020896294999352936, + "grad_norm": 0.11863389611244202, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 5490 + }, + { + "epoch": 0.02093435746747562, + "grad_norm": 0.10407883673906326, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 5500 + }, + { + "epoch": 0.020972419935598303, + "grad_norm": 0.1140441820025444, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 5510 + }, + { + "epoch": 0.021010482403720988, + "grad_norm": 0.11707664281129837, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 5520 + }, + { + "epoch": 0.02104854487184367, + "grad_norm": 0.11001245677471161, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 5530 + }, + { + "epoch": 0.021086607339966354, + "grad_norm": 0.12250164896249771, + "learning_rate": 0.0005, + "loss": 2.1532, + "step": 5540 + }, + { + "epoch": 0.021124669808089035, + "grad_norm": 0.11971727758646011, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 5550 + }, + { + "epoch": 0.02116273227621172, + "grad_norm": 0.10103358328342438, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 5560 + }, + { + "epoch": 0.0212007947443344, + "grad_norm": 0.10445115715265274, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 5570 + }, + { + "epoch": 0.021238857212457083, + "grad_norm": 0.10383953154087067, + "learning_rate": 0.0005, + "loss": 2.1568, + "step": 5580 + }, + { + "epoch": 0.021276919680579768, + "grad_norm": 0.12046512961387634, + "learning_rate": 0.0005, + "loss": 2.1551, + "step": 5590 + }, + { + "epoch": 0.02131498214870245, + "grad_norm": 0.11003930121660233, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 5600 + }, + { + "epoch": 0.021353044616825134, + "grad_norm": 0.1069195419549942, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 5610 + }, + { + "epoch": 0.021391107084947816, + "grad_norm": 0.12431596964597702, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 5620 + }, + { + "epoch": 0.0214291695530705, + "grad_norm": 0.1099320501089096, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 5630 + }, + { + "epoch": 0.021467232021193182, + "grad_norm": 0.12026389688253403, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 5640 + }, + { + "epoch": 0.021505294489315867, + "grad_norm": 0.10532669723033905, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 5650 + }, + { + "epoch": 0.021543356957438548, + "grad_norm": 0.10554736107587814, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 5660 + }, + { + "epoch": 0.02158141942556123, + "grad_norm": 0.11826571822166443, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 5670 + }, + { + "epoch": 0.021619481893683914, + "grad_norm": 0.10795663297176361, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 5680 + }, + { + "epoch": 0.021657544361806596, + "grad_norm": 0.11151348054409027, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 5690 + }, + { + "epoch": 0.02169560682992928, + "grad_norm": 0.10803024470806122, + "learning_rate": 0.0005, + "loss": 2.1503, + "step": 5700 + }, + { + "epoch": 0.021733669298051962, + "grad_norm": 0.13073943555355072, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 5710 + }, + { + "epoch": 0.021771731766174647, + "grad_norm": 0.11931940913200378, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 5720 + }, + { + "epoch": 0.02180979423429733, + "grad_norm": 0.1153026893734932, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 5730 + }, + { + "epoch": 0.021847856702420013, + "grad_norm": 0.11887199431657791, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 5740 + }, + { + "epoch": 0.021885919170542695, + "grad_norm": 0.1110357940196991, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 5750 + }, + { + "epoch": 0.021923981638665376, + "grad_norm": 0.13553187251091003, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 5760 + }, + { + "epoch": 0.02196204410678806, + "grad_norm": 0.10310986638069153, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 5770 + }, + { + "epoch": 0.022000106574910742, + "grad_norm": 0.11090600490570068, + "learning_rate": 0.0005, + "loss": 2.1591, + "step": 5780 + }, + { + "epoch": 0.022038169043033427, + "grad_norm": 0.1099453866481781, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 5790 + }, + { + "epoch": 0.02207623151115611, + "grad_norm": 0.10403940826654434, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 5800 + }, + { + "epoch": 0.022114293979278794, + "grad_norm": 0.113205686211586, + "learning_rate": 0.0005, + "loss": 2.1586, + "step": 5810 + }, + { + "epoch": 0.022152356447401475, + "grad_norm": 0.12265747785568237, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 5820 + }, + { + "epoch": 0.02219041891552416, + "grad_norm": 0.1367434561252594, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 5830 + }, + { + "epoch": 0.02222848138364684, + "grad_norm": 0.11390835791826248, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 5840 + }, + { + "epoch": 0.022266543851769523, + "grad_norm": 0.11761996150016785, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 5850 + }, + { + "epoch": 0.022304606319892208, + "grad_norm": 0.10379810631275177, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 5860 + }, + { + "epoch": 0.02234266878801489, + "grad_norm": 0.12037274986505508, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 5870 + }, + { + "epoch": 0.022380731256137574, + "grad_norm": 0.11196594685316086, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 5880 + }, + { + "epoch": 0.022418793724260255, + "grad_norm": 0.1106754019856453, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 5890 + }, + { + "epoch": 0.02245685619238294, + "grad_norm": 0.11773858219385147, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 5900 + }, + { + "epoch": 0.02249491866050562, + "grad_norm": 0.11126694828271866, + "learning_rate": 0.0005, + "loss": 2.1563, + "step": 5910 + }, + { + "epoch": 0.022532981128628306, + "grad_norm": 0.10651341825723648, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 5920 + }, + { + "epoch": 0.022571043596750988, + "grad_norm": 0.12159726768732071, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 5930 + }, + { + "epoch": 0.02260910606487367, + "grad_norm": 0.10570742934942245, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 5940 + }, + { + "epoch": 0.022647168532996354, + "grad_norm": 0.1210443377494812, + "learning_rate": 0.0005, + "loss": 2.1559, + "step": 5950 + }, + { + "epoch": 0.022685231001119036, + "grad_norm": 0.12900283932685852, + "learning_rate": 0.0005, + "loss": 2.1613, + "step": 5960 + }, + { + "epoch": 0.02272329346924172, + "grad_norm": 0.10966738313436508, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 5970 + }, + { + "epoch": 0.022761355937364402, + "grad_norm": 0.10426750034093857, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 5980 + }, + { + "epoch": 0.022799418405487087, + "grad_norm": 0.12919247150421143, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 5990 + }, + { + "epoch": 0.022837480873609768, + "grad_norm": 0.13425137102603912, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 6000 + }, + { + "epoch": 0.022875543341732453, + "grad_norm": 0.09929858148097992, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 6010 + }, + { + "epoch": 0.022913605809855134, + "grad_norm": 0.10623922199010849, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 6020 + }, + { + "epoch": 0.022951668277977816, + "grad_norm": 0.10986531525850296, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 6030 + }, + { + "epoch": 0.0229897307461005, + "grad_norm": 0.11614275723695755, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 6040 + }, + { + "epoch": 0.023027793214223182, + "grad_norm": 0.12838366627693176, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 6050 + }, + { + "epoch": 0.023065855682345867, + "grad_norm": 0.10890116542577744, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 6060 + }, + { + "epoch": 0.02310391815046855, + "grad_norm": 0.10804659128189087, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 6070 + }, + { + "epoch": 0.023141980618591233, + "grad_norm": 0.1182192713022232, + "learning_rate": 0.0005, + "loss": 2.1557, + "step": 6080 + }, + { + "epoch": 0.023180043086713915, + "grad_norm": 0.11064272373914719, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 6090 + }, + { + "epoch": 0.023218105554836596, + "grad_norm": 0.10852929204702377, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 6100 + }, + { + "epoch": 0.02325616802295928, + "grad_norm": 0.11034633219242096, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 6110 + }, + { + "epoch": 0.023294230491081962, + "grad_norm": 0.1135808527469635, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 6120 + }, + { + "epoch": 0.023332292959204647, + "grad_norm": 0.11563987284898758, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 6130 + }, + { + "epoch": 0.02337035542732733, + "grad_norm": 0.10818443447351456, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 6140 + }, + { + "epoch": 0.023408417895450014, + "grad_norm": 0.11164136230945587, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 6150 + }, + { + "epoch": 0.023446480363572695, + "grad_norm": 0.12221473455429077, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 6160 + }, + { + "epoch": 0.02348454283169538, + "grad_norm": 0.10707166790962219, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 6170 + }, + { + "epoch": 0.02352260529981806, + "grad_norm": 0.10560087114572525, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 6180 + }, + { + "epoch": 0.023560667767940743, + "grad_norm": 0.12548519670963287, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 6190 + }, + { + "epoch": 0.023598730236063428, + "grad_norm": 0.1081615537405014, + "learning_rate": 0.0005, + "loss": 2.1615, + "step": 6200 + }, + { + "epoch": 0.02363679270418611, + "grad_norm": 0.11263230443000793, + "learning_rate": 0.0005, + "loss": 2.1631, + "step": 6210 + }, + { + "epoch": 0.023674855172308794, + "grad_norm": 0.12632977962493896, + "learning_rate": 0.0005, + "loss": 2.1531, + "step": 6220 + }, + { + "epoch": 0.023712917640431475, + "grad_norm": 0.12329486012458801, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 6230 + }, + { + "epoch": 0.02375098010855416, + "grad_norm": 0.11623676121234894, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 6240 + }, + { + "epoch": 0.02378904257667684, + "grad_norm": 0.1167716458439827, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 6250 + }, + { + "epoch": 0.023827105044799526, + "grad_norm": 0.12007655948400497, + "learning_rate": 0.0005, + "loss": 2.1567, + "step": 6260 + }, + { + "epoch": 0.023865167512922208, + "grad_norm": 0.11647479981184006, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 6270 + }, + { + "epoch": 0.02390322998104489, + "grad_norm": 0.12291767448186874, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 6280 + }, + { + "epoch": 0.023941292449167574, + "grad_norm": 0.11392831802368164, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 6290 + }, + { + "epoch": 0.023979354917290256, + "grad_norm": 0.13158921897411346, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 6300 + }, + { + "epoch": 0.02401741738541294, + "grad_norm": 0.11344356834888458, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 6310 + }, + { + "epoch": 0.024055479853535622, + "grad_norm": 0.1382695436477661, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 6320 + }, + { + "epoch": 0.024093542321658307, + "grad_norm": 0.11335012316703796, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 6330 + }, + { + "epoch": 0.024131604789780988, + "grad_norm": 0.11333515495061874, + "learning_rate": 0.0005, + "loss": 2.1595, + "step": 6340 + }, + { + "epoch": 0.024169667257903673, + "grad_norm": 0.12147689610719681, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 6350 + }, + { + "epoch": 0.024207729726026354, + "grad_norm": 0.1204022616147995, + "learning_rate": 0.0005, + "loss": 2.154, + "step": 6360 + }, + { + "epoch": 0.024245792194149036, + "grad_norm": 0.11988788098096848, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 6370 + }, + { + "epoch": 0.02428385466227172, + "grad_norm": 0.11760450154542923, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 6380 + }, + { + "epoch": 0.024321917130394402, + "grad_norm": 0.11097273975610733, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 6390 + }, + { + "epoch": 0.024359979598517087, + "grad_norm": 0.11635033041238785, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 6400 + }, + { + "epoch": 0.02439804206663977, + "grad_norm": 0.10954947024583817, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 6410 + }, + { + "epoch": 0.024436104534762453, + "grad_norm": 0.12753041088581085, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 6420 + }, + { + "epoch": 0.024474167002885135, + "grad_norm": 0.11430566757917404, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 6430 + }, + { + "epoch": 0.02451222947100782, + "grad_norm": 0.10539361089468002, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 6440 + }, + { + "epoch": 0.0245502919391305, + "grad_norm": 0.10601361095905304, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 6450 + }, + { + "epoch": 0.024588354407253182, + "grad_norm": 0.12343835085630417, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 6460 + }, + { + "epoch": 0.024626416875375867, + "grad_norm": 0.10459932684898376, + "learning_rate": 0.0005, + "loss": 2.1549, + "step": 6470 + }, + { + "epoch": 0.02466447934349855, + "grad_norm": 0.11193633824586868, + "learning_rate": 0.0005, + "loss": 2.1554, + "step": 6480 + }, + { + "epoch": 0.024702541811621233, + "grad_norm": 0.11788009852170944, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 6490 + }, + { + "epoch": 0.024740604279743915, + "grad_norm": 0.11227643489837646, + "learning_rate": 0.0005, + "loss": 2.1634, + "step": 6500 + }, + { + "epoch": 0.0247786667478666, + "grad_norm": 0.1095656156539917, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 6510 + }, + { + "epoch": 0.02481672921598928, + "grad_norm": 0.10564127564430237, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 6520 + }, + { + "epoch": 0.024854791684111966, + "grad_norm": 0.11981156468391418, + "learning_rate": 0.0005, + "loss": 2.1556, + "step": 6530 + }, + { + "epoch": 0.024892854152234647, + "grad_norm": 0.11325754225254059, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 6540 + }, + { + "epoch": 0.02493091662035733, + "grad_norm": 0.13337786495685577, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 6550 + }, + { + "epoch": 0.024968979088480014, + "grad_norm": 0.13114260137081146, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 6560 + }, + { + "epoch": 0.025007041556602695, + "grad_norm": 0.1203683391213417, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 6570 + }, + { + "epoch": 0.02504510402472538, + "grad_norm": 0.11538281291723251, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 6580 + }, + { + "epoch": 0.02508316649284806, + "grad_norm": 0.10961015522480011, + "learning_rate": 0.0005, + "loss": 2.1586, + "step": 6590 + }, + { + "epoch": 0.025121228960970746, + "grad_norm": 0.11965086311101913, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 6600 + }, + { + "epoch": 0.025159291429093428, + "grad_norm": 0.11759531497955322, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 6610 + }, + { + "epoch": 0.025197353897216113, + "grad_norm": 0.1089395061135292, + "learning_rate": 0.0005, + "loss": 2.1533, + "step": 6620 + }, + { + "epoch": 0.025235416365338794, + "grad_norm": 0.11554133892059326, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 6630 + }, + { + "epoch": 0.025273478833461475, + "grad_norm": 0.10535012185573578, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 6640 + }, + { + "epoch": 0.02531154130158416, + "grad_norm": 0.12574630975723267, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 6650 + }, + { + "epoch": 0.02534960376970684, + "grad_norm": 0.12262838333845139, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 6660 + }, + { + "epoch": 0.025387666237829527, + "grad_norm": 0.1025639995932579, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 6670 + }, + { + "epoch": 0.025425728705952208, + "grad_norm": 0.11699458956718445, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 6680 + }, + { + "epoch": 0.025463791174074893, + "grad_norm": 0.10782221704721451, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 6690 + }, + { + "epoch": 0.025501853642197574, + "grad_norm": 0.11212996393442154, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 6700 + }, + { + "epoch": 0.02553991611032026, + "grad_norm": 0.10972334444522858, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 6710 + }, + { + "epoch": 0.02557797857844294, + "grad_norm": 0.11672057956457138, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 6720 + }, + { + "epoch": 0.025616041046565622, + "grad_norm": 0.1198444738984108, + "learning_rate": 0.0005, + "loss": 2.1546, + "step": 6730 + }, + { + "epoch": 0.025654103514688307, + "grad_norm": 0.10473254323005676, + "learning_rate": 0.0005, + "loss": 2.1549, + "step": 6740 + }, + { + "epoch": 0.02569216598281099, + "grad_norm": 0.11385004222393036, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 6750 + }, + { + "epoch": 0.025730228450933673, + "grad_norm": 0.11270426958799362, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 6760 + }, + { + "epoch": 0.025768290919056355, + "grad_norm": 0.11007201671600342, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 6770 + }, + { + "epoch": 0.02580635338717904, + "grad_norm": 0.11610470712184906, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 6780 + }, + { + "epoch": 0.02584441585530172, + "grad_norm": 0.12364150583744049, + "learning_rate": 0.0005, + "loss": 2.1564, + "step": 6790 + }, + { + "epoch": 0.025882478323424406, + "grad_norm": 0.1113385483622551, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 6800 + }, + { + "epoch": 0.025920540791547087, + "grad_norm": 0.1211201623082161, + "learning_rate": 0.0005, + "loss": 2.1555, + "step": 6810 + }, + { + "epoch": 0.02595860325966977, + "grad_norm": 0.12166136503219604, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 6820 + }, + { + "epoch": 0.025996665727792453, + "grad_norm": 0.10655366629362106, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 6830 + }, + { + "epoch": 0.026034728195915135, + "grad_norm": 0.10823944956064224, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 6840 + }, + { + "epoch": 0.02607279066403782, + "grad_norm": 0.12336534261703491, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 6850 + }, + { + "epoch": 0.0261108531321605, + "grad_norm": 0.11191314458847046, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 6860 + }, + { + "epoch": 0.026148915600283186, + "grad_norm": 0.10364817082881927, + "learning_rate": 0.0005, + "loss": 2.1567, + "step": 6870 + }, + { + "epoch": 0.026186978068405867, + "grad_norm": 0.12202759087085724, + "learning_rate": 0.0005, + "loss": 2.1601, + "step": 6880 + }, + { + "epoch": 0.026225040536528552, + "grad_norm": 0.11444476246833801, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 6890 + }, + { + "epoch": 0.026263103004651234, + "grad_norm": 0.107620969414711, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 6900 + }, + { + "epoch": 0.026301165472773915, + "grad_norm": 0.1263957917690277, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 6910 + }, + { + "epoch": 0.0263392279408966, + "grad_norm": 0.11235121637582779, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 6920 + }, + { + "epoch": 0.02637729040901928, + "grad_norm": 0.1108112782239914, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 6930 + }, + { + "epoch": 0.026415352877141966, + "grad_norm": 0.11218473315238953, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 6940 + }, + { + "epoch": 0.026453415345264648, + "grad_norm": 0.10671050101518631, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 6950 + }, + { + "epoch": 0.026491477813387333, + "grad_norm": 0.11324463784694672, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 6960 + }, + { + "epoch": 0.026529540281510014, + "grad_norm": 0.1347011923789978, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 6970 + }, + { + "epoch": 0.0265676027496327, + "grad_norm": 0.13093167543411255, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 6980 + }, + { + "epoch": 0.02660566521775538, + "grad_norm": 0.11753138154745102, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 6990 + }, + { + "epoch": 0.02664372768587806, + "grad_norm": 0.10647093504667282, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 7000 + }, + { + "epoch": 0.026681790154000747, + "grad_norm": 0.11827953904867172, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 7010 + }, + { + "epoch": 0.026719852622123428, + "grad_norm": 0.12453864514827728, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 7020 + }, + { + "epoch": 0.026757915090246113, + "grad_norm": 0.12062682956457138, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 7030 + }, + { + "epoch": 0.026795977558368794, + "grad_norm": 0.11530686914920807, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 7040 + }, + { + "epoch": 0.02683404002649148, + "grad_norm": 0.1173919066786766, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 7050 + }, + { + "epoch": 0.02687210249461416, + "grad_norm": 0.11233177781105042, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 7060 + }, + { + "epoch": 0.026910164962736845, + "grad_norm": 0.12039361894130707, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 7070 + }, + { + "epoch": 0.026948227430859527, + "grad_norm": 0.11644244939088821, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 7080 + }, + { + "epoch": 0.026986289898982208, + "grad_norm": 0.12728439271450043, + "learning_rate": 0.0005, + "loss": 2.1544, + "step": 7090 + }, + { + "epoch": 0.027024352367104893, + "grad_norm": 0.11197049170732498, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 7100 + }, + { + "epoch": 0.027062414835227575, + "grad_norm": 0.1252804547548294, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 7110 + }, + { + "epoch": 0.02710047730335026, + "grad_norm": 0.11560700088739395, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 7120 + }, + { + "epoch": 0.02713853977147294, + "grad_norm": 0.1106269583106041, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 7130 + }, + { + "epoch": 0.027176602239595626, + "grad_norm": 0.10807859152555466, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 7140 + }, + { + "epoch": 0.027214664707718307, + "grad_norm": 0.13045553863048553, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 7150 + }, + { + "epoch": 0.02725272717584099, + "grad_norm": 0.1218976378440857, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 7160 + }, + { + "epoch": 0.027290789643963673, + "grad_norm": 0.11346330493688583, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 7170 + }, + { + "epoch": 0.027328852112086355, + "grad_norm": 0.1056937724351883, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 7180 + }, + { + "epoch": 0.02736691458020904, + "grad_norm": 0.1118604838848114, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 7190 + }, + { + "epoch": 0.02740497704833172, + "grad_norm": 0.10969972610473633, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 7200 + }, + { + "epoch": 0.027443039516454406, + "grad_norm": 0.11284728348255157, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 7210 + }, + { + "epoch": 0.027481101984577087, + "grad_norm": 0.11114726215600967, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 7220 + }, + { + "epoch": 0.027519164452699772, + "grad_norm": 0.116744764149189, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 7230 + }, + { + "epoch": 0.027557226920822454, + "grad_norm": 0.1106627881526947, + "learning_rate": 0.0005, + "loss": 2.1535, + "step": 7240 + }, + { + "epoch": 0.027595289388945135, + "grad_norm": 0.10879063606262207, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 7250 + }, + { + "epoch": 0.02763335185706782, + "grad_norm": 0.11433703452348709, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 7260 + }, + { + "epoch": 0.0276714143251905, + "grad_norm": 0.11956377327442169, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 7270 + }, + { + "epoch": 0.027709476793313186, + "grad_norm": 0.10950538516044617, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 7280 + }, + { + "epoch": 0.027747539261435868, + "grad_norm": 0.10610220581293106, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 7290 + }, + { + "epoch": 0.027785601729558553, + "grad_norm": 0.10162237286567688, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 7300 + }, + { + "epoch": 0.027823664197681234, + "grad_norm": 0.11824820935726166, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 7310 + }, + { + "epoch": 0.02786172666580392, + "grad_norm": 0.11466916650533676, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 7320 + }, + { + "epoch": 0.0278997891339266, + "grad_norm": 0.11753802001476288, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 7330 + }, + { + "epoch": 0.02793785160204928, + "grad_norm": 0.10082371532917023, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 7340 + }, + { + "epoch": 0.027975914070171966, + "grad_norm": 0.1257477104663849, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 7350 + }, + { + "epoch": 0.028013976538294648, + "grad_norm": 0.1157645434141159, + "learning_rate": 0.0005, + "loss": 2.1589, + "step": 7360 + }, + { + "epoch": 0.028052039006417333, + "grad_norm": 0.12373010814189911, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 7370 + }, + { + "epoch": 0.028090101474540014, + "grad_norm": 0.12133149057626724, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 7380 + }, + { + "epoch": 0.0281281639426627, + "grad_norm": 0.108877994120121, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 7390 + }, + { + "epoch": 0.02816622641078538, + "grad_norm": 0.1152234897017479, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 7400 + }, + { + "epoch": 0.028204288878908065, + "grad_norm": 0.10108889639377594, + "learning_rate": 0.0005, + "loss": 2.1688, + "step": 7410 + }, + { + "epoch": 0.028242351347030747, + "grad_norm": 0.11246947199106216, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 7420 + }, + { + "epoch": 0.028280413815153428, + "grad_norm": 0.11029767245054245, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 7430 + }, + { + "epoch": 0.028318476283276113, + "grad_norm": 0.10918702930212021, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 7440 + }, + { + "epoch": 0.028356538751398794, + "grad_norm": 0.12176518887281418, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 7450 + }, + { + "epoch": 0.02839460121952148, + "grad_norm": 0.12156303226947784, + "learning_rate": 0.0005, + "loss": 2.1603, + "step": 7460 + }, + { + "epoch": 0.02843266368764416, + "grad_norm": 0.11835929751396179, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 7470 + }, + { + "epoch": 0.028470726155766846, + "grad_norm": 0.10707549750804901, + "learning_rate": 0.0005, + "loss": 2.157, + "step": 7480 + }, + { + "epoch": 0.028508788623889527, + "grad_norm": 0.11295874416828156, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 7490 + }, + { + "epoch": 0.028546851092012212, + "grad_norm": 0.11424469202756882, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 7500 + }, + { + "epoch": 0.028584913560134893, + "grad_norm": 0.12551772594451904, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 7510 + }, + { + "epoch": 0.028622976028257575, + "grad_norm": 0.11593035608530045, + "learning_rate": 0.0005, + "loss": 2.1547, + "step": 7520 + }, + { + "epoch": 0.02866103849638026, + "grad_norm": 0.1208166629076004, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 7530 + }, + { + "epoch": 0.02869910096450294, + "grad_norm": 0.11421196162700653, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 7540 + }, + { + "epoch": 0.028737163432625626, + "grad_norm": 0.1322488635778427, + "learning_rate": 0.0005, + "loss": 2.1568, + "step": 7550 + }, + { + "epoch": 0.028775225900748307, + "grad_norm": 0.13235977292060852, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 7560 + }, + { + "epoch": 0.028813288368870992, + "grad_norm": 0.10404398292303085, + "learning_rate": 0.0005, + "loss": 2.1533, + "step": 7570 + }, + { + "epoch": 0.028851350836993674, + "grad_norm": 0.1253490298986435, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 7580 + }, + { + "epoch": 0.02888941330511636, + "grad_norm": 0.11406102031469345, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 7590 + }, + { + "epoch": 0.02892747577323904, + "grad_norm": 0.11629606783390045, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 7600 + }, + { + "epoch": 0.02896553824136172, + "grad_norm": 0.10625175386667252, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 7610 + }, + { + "epoch": 0.029003600709484406, + "grad_norm": 0.11212033778429031, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 7620 + }, + { + "epoch": 0.029041663177607088, + "grad_norm": 0.10987796634435654, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 7630 + }, + { + "epoch": 0.029079725645729772, + "grad_norm": 0.10258632153272629, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 7640 + }, + { + "epoch": 0.029117788113852454, + "grad_norm": 0.10536502301692963, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 7650 + }, + { + "epoch": 0.02915585058197514, + "grad_norm": 0.10387527942657471, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 7660 + }, + { + "epoch": 0.02919391305009782, + "grad_norm": 0.10539811849594116, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 7670 + }, + { + "epoch": 0.029231975518220505, + "grad_norm": 0.1249329149723053, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 7680 + }, + { + "epoch": 0.029270037986343186, + "grad_norm": 0.12725548446178436, + "learning_rate": 0.0005, + "loss": 2.1526, + "step": 7690 + }, + { + "epoch": 0.029308100454465868, + "grad_norm": 0.12263181805610657, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 7700 + }, + { + "epoch": 0.029346162922588553, + "grad_norm": 0.1168946847319603, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 7710 + }, + { + "epoch": 0.029384225390711234, + "grad_norm": 0.12111609429121017, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 7720 + }, + { + "epoch": 0.02942228785883392, + "grad_norm": 0.10218352824449539, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 7730 + }, + { + "epoch": 0.0294603503269566, + "grad_norm": 0.10925023257732391, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 7740 + }, + { + "epoch": 0.029498412795079285, + "grad_norm": 0.12212564051151276, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 7750 + }, + { + "epoch": 0.029536475263201967, + "grad_norm": 0.10729344934225082, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 7760 + }, + { + "epoch": 0.02957453773132465, + "grad_norm": 0.10953337699174881, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 7770 + }, + { + "epoch": 0.029612600199447333, + "grad_norm": 0.11217037588357925, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 7780 + }, + { + "epoch": 0.029650662667570014, + "grad_norm": 0.11673374474048615, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 7790 + }, + { + "epoch": 0.0296887251356927, + "grad_norm": 0.11757118254899979, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 7800 + }, + { + "epoch": 0.02972678760381538, + "grad_norm": 0.10758315771818161, + "learning_rate": 0.0005, + "loss": 2.1535, + "step": 7810 + }, + { + "epoch": 0.029764850071938066, + "grad_norm": 0.11497870832681656, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 7820 + }, + { + "epoch": 0.029802912540060747, + "grad_norm": 0.12000583857297897, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 7830 + }, + { + "epoch": 0.029840975008183432, + "grad_norm": 0.12263140082359314, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 7840 + }, + { + "epoch": 0.029879037476306113, + "grad_norm": 0.11573194712400436, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 7850 + }, + { + "epoch": 0.029917099944428798, + "grad_norm": 0.111940398812294, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 7860 + }, + { + "epoch": 0.02995516241255148, + "grad_norm": 0.11048493534326553, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 7870 + }, + { + "epoch": 0.02999322488067416, + "grad_norm": 0.11425173282623291, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 7880 + }, + { + "epoch": 0.030031287348796846, + "grad_norm": 0.14338618516921997, + "learning_rate": 0.0005, + "loss": 2.1555, + "step": 7890 + }, + { + "epoch": 0.030069349816919527, + "grad_norm": 0.12020622938871384, + "learning_rate": 0.0005, + "loss": 2.1625, + "step": 7900 + }, + { + "epoch": 0.030107412285042212, + "grad_norm": 0.11354848742485046, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 7910 + }, + { + "epoch": 0.030145474753164894, + "grad_norm": 0.11214245855808258, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 7920 + }, + { + "epoch": 0.03018353722128758, + "grad_norm": 0.1082155704498291, + "learning_rate": 0.0005, + "loss": 2.156, + "step": 7930 + }, + { + "epoch": 0.03022159968941026, + "grad_norm": 0.11786253750324249, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 7940 + }, + { + "epoch": 0.030259662157532945, + "grad_norm": 0.11564178764820099, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 7950 + }, + { + "epoch": 0.030297724625655626, + "grad_norm": 0.11741790175437927, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 7960 + }, + { + "epoch": 0.030335787093778308, + "grad_norm": 0.13863076269626617, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 7970 + }, + { + "epoch": 0.030373849561900992, + "grad_norm": 0.11684587597846985, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 7980 + }, + { + "epoch": 0.030411912030023674, + "grad_norm": 0.11619321256875992, + "learning_rate": 0.0005, + "loss": 2.1623, + "step": 7990 + }, + { + "epoch": 0.03044997449814636, + "grad_norm": 0.11050112545490265, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 8000 + }, + { + "epoch": 0.03048803696626904, + "grad_norm": 0.11222852766513824, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 8010 + }, + { + "epoch": 0.030526099434391725, + "grad_norm": 0.10987085849046707, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 8020 + }, + { + "epoch": 0.030564161902514406, + "grad_norm": 0.11533662676811218, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 8030 + }, + { + "epoch": 0.03060222437063709, + "grad_norm": 0.11021203547716141, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 8040 + }, + { + "epoch": 0.030640286838759773, + "grad_norm": 0.10626066476106644, + "learning_rate": 0.0005, + "loss": 2.157, + "step": 8050 + }, + { + "epoch": 0.030678349306882454, + "grad_norm": 0.10716816037893295, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 8060 + }, + { + "epoch": 0.03071641177500514, + "grad_norm": 0.14796046912670135, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 8070 + }, + { + "epoch": 0.03075447424312782, + "grad_norm": 0.11453462392091751, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 8080 + }, + { + "epoch": 0.030792536711250505, + "grad_norm": 0.10463167726993561, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 8090 + }, + { + "epoch": 0.030830599179373187, + "grad_norm": 0.10290495306253433, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 8100 + }, + { + "epoch": 0.03086866164749587, + "grad_norm": 0.10581576824188232, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 8110 + }, + { + "epoch": 0.030906724115618553, + "grad_norm": 0.12016568332910538, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 8120 + }, + { + "epoch": 0.030944786583741238, + "grad_norm": 0.11537013202905655, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 8130 + }, + { + "epoch": 0.03098284905186392, + "grad_norm": 0.12919427454471588, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 8140 + }, + { + "epoch": 0.0310209115199866, + "grad_norm": 0.11007408797740936, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 8150 + }, + { + "epoch": 0.031058973988109286, + "grad_norm": 0.11892461031675339, + "learning_rate": 0.0005, + "loss": 2.1517, + "step": 8160 + }, + { + "epoch": 0.031097036456231967, + "grad_norm": 0.17755889892578125, + "learning_rate": 0.0005, + "loss": 2.1713, + "step": 8170 + }, + { + "epoch": 0.031135098924354652, + "grad_norm": 0.11239070445299149, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 8180 + }, + { + "epoch": 0.031173161392477333, + "grad_norm": 0.1028556078672409, + "learning_rate": 0.0005, + "loss": 2.1573, + "step": 8190 + }, + { + "epoch": 0.031211223860600018, + "grad_norm": 0.1157098338007927, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 8200 + }, + { + "epoch": 0.0312492863287227, + "grad_norm": 0.10809467732906342, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 8210 + }, + { + "epoch": 0.03128734879684538, + "grad_norm": 0.11528951674699783, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 8220 + }, + { + "epoch": 0.031325411264968066, + "grad_norm": 0.12987327575683594, + "learning_rate": 0.0005, + "loss": 2.1535, + "step": 8230 + }, + { + "epoch": 0.03136347373309075, + "grad_norm": 0.11531908810138702, + "learning_rate": 0.0005, + "loss": 2.1537, + "step": 8240 + }, + { + "epoch": 0.03140153620121343, + "grad_norm": 0.10050345212221146, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 8250 + }, + { + "epoch": 0.03143959866933611, + "grad_norm": 0.12242735922336578, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 8260 + }, + { + "epoch": 0.0314776611374588, + "grad_norm": 0.1069076880812645, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 8270 + }, + { + "epoch": 0.03151572360558148, + "grad_norm": 0.1232587918639183, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 8280 + }, + { + "epoch": 0.03155378607370416, + "grad_norm": 0.10760942846536636, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 8290 + }, + { + "epoch": 0.031591848541826846, + "grad_norm": 0.1072341799736023, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 8300 + }, + { + "epoch": 0.03162991100994953, + "grad_norm": 0.11002219468355179, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 8310 + }, + { + "epoch": 0.03166797347807221, + "grad_norm": 0.11687692254781723, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 8320 + }, + { + "epoch": 0.031706035946194894, + "grad_norm": 0.11089745163917542, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 8330 + }, + { + "epoch": 0.03174409841431758, + "grad_norm": 0.11608854681253433, + "learning_rate": 0.0005, + "loss": 2.1526, + "step": 8340 + }, + { + "epoch": 0.031782160882440263, + "grad_norm": 0.10124436765909195, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 8350 + }, + { + "epoch": 0.03182022335056294, + "grad_norm": 0.10724233835935593, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 8360 + }, + { + "epoch": 0.031858285818685626, + "grad_norm": 0.11886536329984665, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 8370 + }, + { + "epoch": 0.03189634828680831, + "grad_norm": 0.11546676605939865, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 8380 + }, + { + "epoch": 0.031934410754930996, + "grad_norm": 0.12106689810752869, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 8390 + }, + { + "epoch": 0.031972473223053674, + "grad_norm": 0.11836668848991394, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 8400 + }, + { + "epoch": 0.03201053569117636, + "grad_norm": 0.10874088108539581, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 8410 + }, + { + "epoch": 0.032048598159299044, + "grad_norm": 0.1145242378115654, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 8420 + }, + { + "epoch": 0.03208666062742172, + "grad_norm": 0.11752122640609741, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 8430 + }, + { + "epoch": 0.03212472309554441, + "grad_norm": 0.12958693504333496, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 8440 + }, + { + "epoch": 0.03216278556366709, + "grad_norm": 0.11110221594572067, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 8450 + }, + { + "epoch": 0.032200848031789776, + "grad_norm": 0.11337503045797348, + "learning_rate": 0.0005, + "loss": 2.1556, + "step": 8460 + }, + { + "epoch": 0.032238910499912454, + "grad_norm": 0.11539135873317719, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 8470 + }, + { + "epoch": 0.03227697296803514, + "grad_norm": 0.10884083807468414, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 8480 + }, + { + "epoch": 0.032315035436157824, + "grad_norm": 0.12452396005392075, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 8490 + }, + { + "epoch": 0.0323530979042805, + "grad_norm": 0.1131415143609047, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 8500 + }, + { + "epoch": 0.03239116037240319, + "grad_norm": 0.11396261304616928, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 8510 + }, + { + "epoch": 0.03242922284052587, + "grad_norm": 0.12674427032470703, + "learning_rate": 0.0005, + "loss": 2.1559, + "step": 8520 + }, + { + "epoch": 0.03246728530864856, + "grad_norm": 0.11402904242277145, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 8530 + }, + { + "epoch": 0.032505347776771235, + "grad_norm": 0.10981621593236923, + "learning_rate": 0.0005, + "loss": 2.1492, + "step": 8540 + }, + { + "epoch": 0.03254341024489392, + "grad_norm": 0.10459432005882263, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 8550 + }, + { + "epoch": 0.032581472713016604, + "grad_norm": 0.10308747738599777, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 8560 + }, + { + "epoch": 0.03261953518113929, + "grad_norm": 0.11720939725637436, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 8570 + }, + { + "epoch": 0.03265759764926197, + "grad_norm": 0.11396331340074539, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 8580 + }, + { + "epoch": 0.03269566011738465, + "grad_norm": 0.11162128299474716, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 8590 + }, + { + "epoch": 0.03273372258550734, + "grad_norm": 0.12489422410726547, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 8600 + }, + { + "epoch": 0.032771785053630015, + "grad_norm": 0.10979968309402466, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 8610 + }, + { + "epoch": 0.0328098475217527, + "grad_norm": 0.11844202876091003, + "learning_rate": 0.0005, + "loss": 2.17, + "step": 8620 + }, + { + "epoch": 0.032847909989875385, + "grad_norm": 0.11446519196033478, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 8630 + }, + { + "epoch": 0.03288597245799807, + "grad_norm": 0.12458764016628265, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 8640 + }, + { + "epoch": 0.03292403492612075, + "grad_norm": 0.2882729470729828, + "learning_rate": 0.0005, + "loss": 2.1591, + "step": 8650 + }, + { + "epoch": 0.03296209739424343, + "grad_norm": 0.11263883113861084, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 8660 + }, + { + "epoch": 0.03300015986236612, + "grad_norm": 0.13356174528598785, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 8670 + }, + { + "epoch": 0.033038222330488795, + "grad_norm": 0.10944783687591553, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 8680 + }, + { + "epoch": 0.03307628479861148, + "grad_norm": 0.12128116935491562, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 8690 + }, + { + "epoch": 0.033114347266734165, + "grad_norm": 0.11913850903511047, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 8700 + }, + { + "epoch": 0.03315240973485685, + "grad_norm": 0.11542215943336487, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 8710 + }, + { + "epoch": 0.03319047220297953, + "grad_norm": 0.11794381588697433, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 8720 + }, + { + "epoch": 0.03322853467110221, + "grad_norm": 0.1318274885416031, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 8730 + }, + { + "epoch": 0.0332665971392249, + "grad_norm": 0.11836835741996765, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 8740 + }, + { + "epoch": 0.03330465960734758, + "grad_norm": 0.12125727534294128, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 8750 + }, + { + "epoch": 0.03334272207547026, + "grad_norm": 0.1427885890007019, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 8760 + }, + { + "epoch": 0.033380784543592945, + "grad_norm": 0.1247292086482048, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 8770 + }, + { + "epoch": 0.03341884701171563, + "grad_norm": 0.10335478186607361, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 8780 + }, + { + "epoch": 0.03345690947983831, + "grad_norm": 0.11352815479040146, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 8790 + }, + { + "epoch": 0.03349497194796099, + "grad_norm": 0.12018328905105591, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 8800 + }, + { + "epoch": 0.03353303441608368, + "grad_norm": 0.11159548163414001, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 8810 + }, + { + "epoch": 0.03357109688420636, + "grad_norm": 0.11696930229663849, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 8820 + }, + { + "epoch": 0.03360915935232904, + "grad_norm": 0.1082887277007103, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 8830 + }, + { + "epoch": 0.033647221820451725, + "grad_norm": 0.12014048546552658, + "learning_rate": 0.0005, + "loss": 2.1568, + "step": 8840 + }, + { + "epoch": 0.03368528428857441, + "grad_norm": 0.1180727481842041, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 8850 + }, + { + "epoch": 0.03372334675669709, + "grad_norm": 0.10631722211837769, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 8860 + }, + { + "epoch": 0.03376140922481977, + "grad_norm": 0.1066332533955574, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 8870 + }, + { + "epoch": 0.03379947169294246, + "grad_norm": 0.10460060834884644, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 8880 + }, + { + "epoch": 0.03383753416106514, + "grad_norm": 0.11816377192735672, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 8890 + }, + { + "epoch": 0.03387559662918782, + "grad_norm": 0.12219863384962082, + "learning_rate": 0.0005, + "loss": 2.1514, + "step": 8900 + }, + { + "epoch": 0.033913659097310506, + "grad_norm": 0.12654411792755127, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 8910 + }, + { + "epoch": 0.03395172156543319, + "grad_norm": 0.10688552260398865, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 8920 + }, + { + "epoch": 0.03398978403355587, + "grad_norm": 0.13524161279201508, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 8930 + }, + { + "epoch": 0.03402784650167855, + "grad_norm": 0.1091911792755127, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 8940 + }, + { + "epoch": 0.03406590896980124, + "grad_norm": 0.12308672070503235, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 8950 + }, + { + "epoch": 0.03410397143792392, + "grad_norm": 0.11753041297197342, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 8960 + }, + { + "epoch": 0.0341420339060466, + "grad_norm": 0.11694305390119553, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 8970 + }, + { + "epoch": 0.034180096374169286, + "grad_norm": 0.12045048922300339, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 8980 + }, + { + "epoch": 0.03421815884229197, + "grad_norm": 0.11517022550106049, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 8990 + }, + { + "epoch": 0.034256221310414656, + "grad_norm": 0.13119100034236908, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 9000 + }, + { + "epoch": 0.034294283778537334, + "grad_norm": 0.2477245330810547, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 9010 + }, + { + "epoch": 0.03433234624666002, + "grad_norm": 0.11066543310880661, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 9020 + }, + { + "epoch": 0.0343704087147827, + "grad_norm": 0.1192685142159462, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 9030 + }, + { + "epoch": 0.03440847118290538, + "grad_norm": 0.11177363246679306, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 9040 + }, + { + "epoch": 0.034446533651028066, + "grad_norm": 0.1382625252008438, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 9050 + }, + { + "epoch": 0.03448459611915075, + "grad_norm": 0.11589354276657104, + "learning_rate": 0.0005, + "loss": 2.1601, + "step": 9060 + }, + { + "epoch": 0.034522658587273436, + "grad_norm": 0.13153360784053802, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 9070 + }, + { + "epoch": 0.034560721055396114, + "grad_norm": 0.11433520168066025, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 9080 + }, + { + "epoch": 0.0345987835235188, + "grad_norm": 0.13263995945453644, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 9090 + }, + { + "epoch": 0.034636845991641484, + "grad_norm": 0.11930017918348312, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 9100 + }, + { + "epoch": 0.03467490845976416, + "grad_norm": 0.10451359301805496, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 9110 + }, + { + "epoch": 0.034712970927886846, + "grad_norm": 0.12230315804481506, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 9120 + }, + { + "epoch": 0.03475103339600953, + "grad_norm": 0.10477308928966522, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 9130 + }, + { + "epoch": 0.034789095864132216, + "grad_norm": 0.1123172715306282, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 9140 + }, + { + "epoch": 0.034827158332254894, + "grad_norm": 0.12633569538593292, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 9150 + }, + { + "epoch": 0.03486522080037758, + "grad_norm": 0.11606360226869583, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 9160 + }, + { + "epoch": 0.034903283268500264, + "grad_norm": 0.1049937903881073, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 9170 + }, + { + "epoch": 0.03494134573662295, + "grad_norm": 0.12141606211662292, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 9180 + }, + { + "epoch": 0.03497940820474563, + "grad_norm": 0.10848421603441238, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 9190 + }, + { + "epoch": 0.03501747067286831, + "grad_norm": 0.12193699181079865, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 9200 + }, + { + "epoch": 0.035055533140990996, + "grad_norm": 0.10787303745746613, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 9210 + }, + { + "epoch": 0.035093595609113674, + "grad_norm": 0.10789080709218979, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 9220 + }, + { + "epoch": 0.03513165807723636, + "grad_norm": 0.1300131380558014, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 9230 + }, + { + "epoch": 0.035169720545359044, + "grad_norm": 0.11685086041688919, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 9240 + }, + { + "epoch": 0.03520778301348173, + "grad_norm": 0.10747739672660828, + "learning_rate": 0.0005, + "loss": 2.1638, + "step": 9250 + }, + { + "epoch": 0.03524584548160441, + "grad_norm": 0.12577669322490692, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 9260 + }, + { + "epoch": 0.03528390794972709, + "grad_norm": 0.12838762998580933, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 9270 + }, + { + "epoch": 0.03532197041784978, + "grad_norm": 0.10830754786729813, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 9280 + }, + { + "epoch": 0.035360032885972455, + "grad_norm": 0.11010642349720001, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 9290 + }, + { + "epoch": 0.03539809535409514, + "grad_norm": 0.11120496690273285, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 9300 + }, + { + "epoch": 0.035436157822217824, + "grad_norm": 0.10505425930023193, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 9310 + }, + { + "epoch": 0.03547422029034051, + "grad_norm": 0.11274050921201706, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 9320 + }, + { + "epoch": 0.03551228275846319, + "grad_norm": 0.1192653551697731, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 9330 + }, + { + "epoch": 0.03555034522658587, + "grad_norm": 0.10686255246400833, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 9340 + }, + { + "epoch": 0.03558840769470856, + "grad_norm": 0.1253276765346527, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 9350 + }, + { + "epoch": 0.03562647016283124, + "grad_norm": 0.11075481027364731, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 9360 + }, + { + "epoch": 0.03566453263095392, + "grad_norm": 0.11865369975566864, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 9370 + }, + { + "epoch": 0.035702595099076605, + "grad_norm": 0.10745938122272491, + "learning_rate": 0.0005, + "loss": 2.1584, + "step": 9380 + }, + { + "epoch": 0.03574065756719929, + "grad_norm": 0.11748365312814713, + "learning_rate": 0.0005, + "loss": 2.1552, + "step": 9390 + }, + { + "epoch": 0.03577872003532197, + "grad_norm": 0.11585269123315811, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 9400 + }, + { + "epoch": 0.03581678250344465, + "grad_norm": 0.10678518563508987, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 9410 + }, + { + "epoch": 0.03585484497156734, + "grad_norm": 0.11283028870820999, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 9420 + }, + { + "epoch": 0.03589290743969002, + "grad_norm": 0.11182880401611328, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 9430 + }, + { + "epoch": 0.0359309699078127, + "grad_norm": 0.11779739707708359, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 9440 + }, + { + "epoch": 0.035969032375935385, + "grad_norm": 0.11326213926076889, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 9450 + }, + { + "epoch": 0.03600709484405807, + "grad_norm": 0.11592642962932587, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 9460 + }, + { + "epoch": 0.03604515731218075, + "grad_norm": 0.11186369508504868, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 9470 + }, + { + "epoch": 0.03608321978030343, + "grad_norm": 0.11607187241315842, + "learning_rate": 0.0005, + "loss": 2.1537, + "step": 9480 + }, + { + "epoch": 0.03612128224842612, + "grad_norm": 0.1025259718298912, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 9490 + }, + { + "epoch": 0.0361593447165488, + "grad_norm": 0.10464241355657578, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 9500 + }, + { + "epoch": 0.03619740718467148, + "grad_norm": 0.1146213710308075, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 9510 + }, + { + "epoch": 0.036235469652794165, + "grad_norm": 0.11801069229841232, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 9520 + }, + { + "epoch": 0.03627353212091685, + "grad_norm": 0.10072599351406097, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 9530 + }, + { + "epoch": 0.036311594589039535, + "grad_norm": 0.11770542711019516, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 9540 + }, + { + "epoch": 0.03634965705716221, + "grad_norm": 0.1119447574019432, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 9550 + }, + { + "epoch": 0.0363877195252849, + "grad_norm": 0.12693637609481812, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 9560 + }, + { + "epoch": 0.03642578199340758, + "grad_norm": 0.12111397087574005, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 9570 + }, + { + "epoch": 0.03646384446153026, + "grad_norm": 0.11317116022109985, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 9580 + }, + { + "epoch": 0.036501906929652946, + "grad_norm": 0.12426373362541199, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 9590 + }, + { + "epoch": 0.03653996939777563, + "grad_norm": 0.11448558419942856, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 9600 + }, + { + "epoch": 0.036578031865898315, + "grad_norm": 0.1049763485789299, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 9610 + }, + { + "epoch": 0.03661609433402099, + "grad_norm": 0.12384191900491714, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 9620 + }, + { + "epoch": 0.03665415680214368, + "grad_norm": 0.12874995172023773, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 9630 + }, + { + "epoch": 0.03669221927026636, + "grad_norm": 0.11989755183458328, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 9640 + }, + { + "epoch": 0.03673028173838904, + "grad_norm": 0.11521641165018082, + "learning_rate": 0.0005, + "loss": 2.1534, + "step": 9650 + }, + { + "epoch": 0.036768344206511726, + "grad_norm": 0.10849806666374207, + "learning_rate": 0.0005, + "loss": 2.1531, + "step": 9660 + }, + { + "epoch": 0.03680640667463441, + "grad_norm": 0.10426240414381027, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 9670 + }, + { + "epoch": 0.036844469142757096, + "grad_norm": 0.11630252748727798, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 9680 + }, + { + "epoch": 0.036882531610879773, + "grad_norm": 0.1050904244184494, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 9690 + }, + { + "epoch": 0.03692059407900246, + "grad_norm": 0.14695580303668976, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 9700 + }, + { + "epoch": 0.03695865654712514, + "grad_norm": 0.11704635620117188, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 9710 + }, + { + "epoch": 0.03699671901524783, + "grad_norm": 0.11972527205944061, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 9720 + }, + { + "epoch": 0.037034781483370506, + "grad_norm": 0.11107856035232544, + "learning_rate": 0.0005, + "loss": 2.1533, + "step": 9730 + }, + { + "epoch": 0.03707284395149319, + "grad_norm": 0.11058147251605988, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 9740 + }, + { + "epoch": 0.037110906419615876, + "grad_norm": 0.12540003657341003, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 9750 + }, + { + "epoch": 0.037148968887738554, + "grad_norm": 0.10699540376663208, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 9760 + }, + { + "epoch": 0.03718703135586124, + "grad_norm": 0.11579212546348572, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 9770 + }, + { + "epoch": 0.037225093823983924, + "grad_norm": 0.10480821877717972, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 9780 + }, + { + "epoch": 0.03726315629210661, + "grad_norm": 0.11196145415306091, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 9790 + }, + { + "epoch": 0.037301218760229286, + "grad_norm": 0.12401875853538513, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 9800 + }, + { + "epoch": 0.03733928122835197, + "grad_norm": 0.11669589579105377, + "learning_rate": 0.0005, + "loss": 2.1541, + "step": 9810 + }, + { + "epoch": 0.037377343696474656, + "grad_norm": 0.10399942845106125, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 9820 + }, + { + "epoch": 0.037415406164597334, + "grad_norm": 0.12019993364810944, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 9830 + }, + { + "epoch": 0.03745346863272002, + "grad_norm": 0.11188608407974243, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 9840 + }, + { + "epoch": 0.037491531100842704, + "grad_norm": 0.11040358990430832, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 9850 + }, + { + "epoch": 0.03752959356896539, + "grad_norm": 0.1256285011768341, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 9860 + }, + { + "epoch": 0.03756765603708807, + "grad_norm": 0.1065913662314415, + "learning_rate": 0.0005, + "loss": 2.1588, + "step": 9870 + }, + { + "epoch": 0.03760571850521075, + "grad_norm": 0.11596047133207321, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 9880 + }, + { + "epoch": 0.037643780973333436, + "grad_norm": 0.108096644282341, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 9890 + }, + { + "epoch": 0.03768184344145612, + "grad_norm": 0.11330022662878036, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 9900 + }, + { + "epoch": 0.0377199059095788, + "grad_norm": 0.5385860800743103, + "learning_rate": 0.0005, + "loss": 2.1555, + "step": 9910 + }, + { + "epoch": 0.037757968377701484, + "grad_norm": 0.1062675416469574, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 9920 + }, + { + "epoch": 0.03779603084582417, + "grad_norm": 0.11074505001306534, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 9930 + }, + { + "epoch": 0.03783409331394685, + "grad_norm": 0.12226357311010361, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 9940 + }, + { + "epoch": 0.03787215578206953, + "grad_norm": 0.11123912781476974, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 9950 + }, + { + "epoch": 0.03791021825019222, + "grad_norm": 0.14550866186618805, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 9960 + }, + { + "epoch": 0.0379482807183149, + "grad_norm": 0.12066611647605896, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 9970 + }, + { + "epoch": 0.03798634318643758, + "grad_norm": 0.12039341777563095, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 9980 + }, + { + "epoch": 0.038024405654560264, + "grad_norm": 0.11662400513887405, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 9990 + }, + { + "epoch": 0.03806246812268295, + "grad_norm": 0.12345922738313675, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 10000 + }, + { + "epoch": 0.03810053059080563, + "grad_norm": 0.11478866636753082, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 10010 + }, + { + "epoch": 0.03813859305892831, + "grad_norm": 0.13019050657749176, + "learning_rate": 0.0005, + "loss": 2.1582, + "step": 10020 + }, + { + "epoch": 0.038176655527051, + "grad_norm": 0.12493010610342026, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 10030 + }, + { + "epoch": 0.03821471799517368, + "grad_norm": 0.12378795444965363, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 10040 + }, + { + "epoch": 0.03825278046329636, + "grad_norm": 0.1202225461602211, + "learning_rate": 0.0005, + "loss": 2.1565, + "step": 10050 + }, + { + "epoch": 0.038290842931419045, + "grad_norm": 0.1146734431385994, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 10060 + }, + { + "epoch": 0.03832890539954173, + "grad_norm": 0.1264592409133911, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 10070 + }, + { + "epoch": 0.03836696786766441, + "grad_norm": 0.11752371490001678, + "learning_rate": 0.0005, + "loss": 2.1608, + "step": 10080 + }, + { + "epoch": 0.03840503033578709, + "grad_norm": 0.1169474869966507, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 10090 + }, + { + "epoch": 0.03844309280390978, + "grad_norm": 0.1330779492855072, + "learning_rate": 0.0005, + "loss": 2.163, + "step": 10100 + }, + { + "epoch": 0.03848115527203246, + "grad_norm": 0.1204131469130516, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 10110 + }, + { + "epoch": 0.03851921774015514, + "grad_norm": 0.11632289737462997, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 10120 + }, + { + "epoch": 0.038557280208277825, + "grad_norm": 0.10953623056411743, + "learning_rate": 0.0005, + "loss": 2.1552, + "step": 10130 + }, + { + "epoch": 0.03859534267640051, + "grad_norm": 0.12678860127925873, + "learning_rate": 0.0005, + "loss": 2.1532, + "step": 10140 + }, + { + "epoch": 0.038633405144523195, + "grad_norm": 0.101773202419281, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 10150 + }, + { + "epoch": 0.03867146761264587, + "grad_norm": 0.10864038020372391, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 10160 + }, + { + "epoch": 0.03870953008076856, + "grad_norm": 0.12198641151189804, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 10170 + }, + { + "epoch": 0.03874759254889124, + "grad_norm": 0.12078975886106491, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 10180 + }, + { + "epoch": 0.03878565501701392, + "grad_norm": 0.1104569360613823, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 10190 + }, + { + "epoch": 0.038823717485136605, + "grad_norm": 0.10495690256357193, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 10200 + }, + { + "epoch": 0.03886177995325929, + "grad_norm": 0.11198532581329346, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 10210 + }, + { + "epoch": 0.038899842421381975, + "grad_norm": 0.10918529331684113, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 10220 + }, + { + "epoch": 0.03893790488950465, + "grad_norm": 0.13063205778598785, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 10230 + }, + { + "epoch": 0.03897596735762734, + "grad_norm": 0.11537187546491623, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 10240 + }, + { + "epoch": 0.03901402982575002, + "grad_norm": 0.11714793741703033, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 10250 + }, + { + "epoch": 0.0390520922938727, + "grad_norm": 0.10736022889614105, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 10260 + }, + { + "epoch": 0.039090154761995385, + "grad_norm": 0.12772643566131592, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 10270 + }, + { + "epoch": 0.03912821723011807, + "grad_norm": 0.11992309242486954, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 10280 + }, + { + "epoch": 0.039166279698240755, + "grad_norm": 0.1298178732395172, + "learning_rate": 0.0005, + "loss": 2.1518, + "step": 10290 + }, + { + "epoch": 0.03920434216636343, + "grad_norm": 0.12157441675662994, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 10300 + }, + { + "epoch": 0.03924240463448612, + "grad_norm": 0.10330235213041306, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 10310 + }, + { + "epoch": 0.0392804671026088, + "grad_norm": 0.11134987324476242, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 10320 + }, + { + "epoch": 0.03931852957073149, + "grad_norm": 0.11837029457092285, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 10330 + }, + { + "epoch": 0.039356592038854166, + "grad_norm": 0.11824219673871994, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 10340 + }, + { + "epoch": 0.03939465450697685, + "grad_norm": 0.1204075962305069, + "learning_rate": 0.0005, + "loss": 2.1592, + "step": 10350 + }, + { + "epoch": 0.039432716975099535, + "grad_norm": 0.11336227506399155, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 10360 + }, + { + "epoch": 0.03947077944322221, + "grad_norm": 0.12388197332620621, + "learning_rate": 0.0005, + "loss": 2.1541, + "step": 10370 + }, + { + "epoch": 0.0395088419113449, + "grad_norm": 0.11918102204799652, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 10380 + }, + { + "epoch": 0.03954690437946758, + "grad_norm": 0.11205635219812393, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 10390 + }, + { + "epoch": 0.03958496684759027, + "grad_norm": 0.11027996242046356, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 10400 + }, + { + "epoch": 0.039623029315712946, + "grad_norm": 0.13072600960731506, + "learning_rate": 0.0005, + "loss": 2.1583, + "step": 10410 + }, + { + "epoch": 0.03966109178383563, + "grad_norm": 0.11660933494567871, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 10420 + }, + { + "epoch": 0.039699154251958316, + "grad_norm": 0.11653047800064087, + "learning_rate": 0.0005, + "loss": 2.1657, + "step": 10430 + }, + { + "epoch": 0.039737216720080994, + "grad_norm": 0.125066876411438, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 10440 + }, + { + "epoch": 0.03977527918820368, + "grad_norm": 0.11215624958276749, + "learning_rate": 0.0005, + "loss": 2.163, + "step": 10450 + }, + { + "epoch": 0.03981334165632636, + "grad_norm": 0.11980816721916199, + "learning_rate": 0.0005, + "loss": 2.161, + "step": 10460 + }, + { + "epoch": 0.03985140412444905, + "grad_norm": 0.11664776504039764, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 10470 + }, + { + "epoch": 0.039889466592571726, + "grad_norm": 0.1179792582988739, + "learning_rate": 0.0005, + "loss": 2.1569, + "step": 10480 + }, + { + "epoch": 0.03992752906069441, + "grad_norm": 0.13249574601650238, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 10490 + }, + { + "epoch": 0.039965591528817096, + "grad_norm": 0.1165829673409462, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 10500 + }, + { + "epoch": 0.04000365399693978, + "grad_norm": 0.12203454971313477, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 10510 + }, + { + "epoch": 0.04004171646506246, + "grad_norm": 0.11847560852766037, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 10520 + }, + { + "epoch": 0.040079778933185144, + "grad_norm": 0.12019187957048416, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 10530 + }, + { + "epoch": 0.04011784140130783, + "grad_norm": 0.12712723016738892, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 10540 + }, + { + "epoch": 0.040155903869430506, + "grad_norm": 0.17454050481319427, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 10550 + }, + { + "epoch": 0.04019396633755319, + "grad_norm": 0.12277776747941971, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 10560 + }, + { + "epoch": 0.040232028805675876, + "grad_norm": 0.12997326254844666, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 10570 + }, + { + "epoch": 0.04027009127379856, + "grad_norm": 0.12896186113357544, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 10580 + }, + { + "epoch": 0.04030815374192124, + "grad_norm": 0.12204229831695557, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 10590 + }, + { + "epoch": 0.040346216210043924, + "grad_norm": 0.10776589810848236, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 10600 + }, + { + "epoch": 0.04038427867816661, + "grad_norm": 0.11930728703737259, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 10610 + }, + { + "epoch": 0.04042234114628929, + "grad_norm": 0.12586049735546112, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 10620 + }, + { + "epoch": 0.04046040361441197, + "grad_norm": 0.10810121893882751, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 10630 + }, + { + "epoch": 0.040498466082534657, + "grad_norm": 0.11577042937278748, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 10640 + }, + { + "epoch": 0.04053652855065734, + "grad_norm": 0.11528394371271133, + "learning_rate": 0.0005, + "loss": 2.1589, + "step": 10650 + }, + { + "epoch": 0.04057459101878002, + "grad_norm": 0.12058746814727783, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 10660 + }, + { + "epoch": 0.040612653486902704, + "grad_norm": 0.11240281164646149, + "learning_rate": 0.0005, + "loss": 2.1534, + "step": 10670 + }, + { + "epoch": 0.04065071595502539, + "grad_norm": 0.12102147191762924, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 10680 + }, + { + "epoch": 0.040688778423148074, + "grad_norm": 0.12779374420642853, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 10690 + }, + { + "epoch": 0.04072684089127075, + "grad_norm": 0.11621201783418655, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 10700 + }, + { + "epoch": 0.04076490335939344, + "grad_norm": 0.10922065377235413, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 10710 + }, + { + "epoch": 0.04080296582751612, + "grad_norm": 0.11401363462209702, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 10720 + }, + { + "epoch": 0.0408410282956388, + "grad_norm": 0.1241462379693985, + "learning_rate": 0.0005, + "loss": 2.1668, + "step": 10730 + }, + { + "epoch": 0.040879090763761484, + "grad_norm": 0.1269540786743164, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 10740 + }, + { + "epoch": 0.04091715323188417, + "grad_norm": 0.12056253850460052, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 10750 + }, + { + "epoch": 0.040955215700006854, + "grad_norm": 0.11011311411857605, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 10760 + }, + { + "epoch": 0.04099327816812953, + "grad_norm": 0.13204540312290192, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 10770 + }, + { + "epoch": 0.04103134063625222, + "grad_norm": 0.12694178521633148, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 10780 + }, + { + "epoch": 0.0410694031043749, + "grad_norm": 0.11872044205665588, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 10790 + }, + { + "epoch": 0.04110746557249758, + "grad_norm": 0.1134173646569252, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 10800 + }, + { + "epoch": 0.041145528040620265, + "grad_norm": 0.11684293299913406, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 10810 + }, + { + "epoch": 0.04118359050874295, + "grad_norm": 0.11171558499336243, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 10820 + }, + { + "epoch": 0.041221652976865635, + "grad_norm": 0.13105729222297668, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 10830 + }, + { + "epoch": 0.04125971544498831, + "grad_norm": 0.11976274102926254, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 10840 + }, + { + "epoch": 0.041297777913111, + "grad_norm": 0.1136620044708252, + "learning_rate": 0.0005, + "loss": 2.1594, + "step": 10850 + }, + { + "epoch": 0.04133584038123368, + "grad_norm": 0.1374150663614273, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 10860 + }, + { + "epoch": 0.04137390284935637, + "grad_norm": 0.11635814607143402, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 10870 + }, + { + "epoch": 0.041411965317479045, + "grad_norm": 0.11033984273672104, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 10880 + }, + { + "epoch": 0.04145002778560173, + "grad_norm": 0.11242027580738068, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 10890 + }, + { + "epoch": 0.041488090253724415, + "grad_norm": 0.11739082634449005, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 10900 + }, + { + "epoch": 0.04152615272184709, + "grad_norm": 0.12341734021902084, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 10910 + }, + { + "epoch": 0.04156421518996978, + "grad_norm": 0.1177162379026413, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 10920 + }, + { + "epoch": 0.04160227765809246, + "grad_norm": 0.11812618374824524, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 10930 + }, + { + "epoch": 0.04164034012621515, + "grad_norm": 0.11785928159952164, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 10940 + }, + { + "epoch": 0.041678402594337825, + "grad_norm": 0.11944951117038727, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 10950 + }, + { + "epoch": 0.04171646506246051, + "grad_norm": 0.10570940375328064, + "learning_rate": 0.0005, + "loss": 2.1577, + "step": 10960 + }, + { + "epoch": 0.041754527530583195, + "grad_norm": 0.12870647013187408, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 10970 + }, + { + "epoch": 0.04179258999870587, + "grad_norm": 0.11651504039764404, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 10980 + }, + { + "epoch": 0.04183065246682856, + "grad_norm": 0.11421600729227066, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 10990 + }, + { + "epoch": 0.04186871493495124, + "grad_norm": 0.11833158135414124, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 11000 + }, + { + "epoch": 0.04190677740307393, + "grad_norm": 0.11282741278409958, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 11010 + }, + { + "epoch": 0.041944839871196606, + "grad_norm": 0.11734993010759354, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 11020 + }, + { + "epoch": 0.04198290233931929, + "grad_norm": 0.10959252715110779, + "learning_rate": 0.0005, + "loss": 2.1546, + "step": 11030 + }, + { + "epoch": 0.042020964807441975, + "grad_norm": 0.11803598701953888, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 11040 + }, + { + "epoch": 0.04205902727556465, + "grad_norm": 0.12175311893224716, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 11050 + }, + { + "epoch": 0.04209708974368734, + "grad_norm": 0.13541342318058014, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 11060 + }, + { + "epoch": 0.04213515221181002, + "grad_norm": 0.11709780246019363, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 11070 + }, + { + "epoch": 0.04217321467993271, + "grad_norm": 0.11346873641014099, + "learning_rate": 0.0005, + "loss": 2.1683, + "step": 11080 + }, + { + "epoch": 0.042211277148055386, + "grad_norm": 0.10680264979600906, + "learning_rate": 0.0005, + "loss": 2.1558, + "step": 11090 + }, + { + "epoch": 0.04224933961617807, + "grad_norm": 0.11458170413970947, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 11100 + }, + { + "epoch": 0.042287402084300756, + "grad_norm": 0.1096096783876419, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 11110 + }, + { + "epoch": 0.04232546455242344, + "grad_norm": 0.11289811134338379, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 11120 + }, + { + "epoch": 0.04236352702054612, + "grad_norm": 0.1036987230181694, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 11130 + }, + { + "epoch": 0.0424015894886688, + "grad_norm": 0.12004408985376358, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 11140 + }, + { + "epoch": 0.04243965195679149, + "grad_norm": 0.12811318039894104, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 11150 + }, + { + "epoch": 0.042477714424914166, + "grad_norm": 0.11849028617143631, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 11160 + }, + { + "epoch": 0.04251577689303685, + "grad_norm": 0.11515294760465622, + "learning_rate": 0.0005, + "loss": 2.1583, + "step": 11170 + }, + { + "epoch": 0.042553839361159536, + "grad_norm": 0.106211818754673, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 11180 + }, + { + "epoch": 0.04259190182928222, + "grad_norm": 0.10687409341335297, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 11190 + }, + { + "epoch": 0.0426299642974049, + "grad_norm": 0.11422056704759598, + "learning_rate": 0.0005, + "loss": 2.161, + "step": 11200 + }, + { + "epoch": 0.042668026765527584, + "grad_norm": 0.11594432592391968, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 11210 + }, + { + "epoch": 0.04270608923365027, + "grad_norm": 0.11001340299844742, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 11220 + }, + { + "epoch": 0.042744151701772946, + "grad_norm": 0.10638782382011414, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 11230 + }, + { + "epoch": 0.04278221416989563, + "grad_norm": 0.11647048592567444, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 11240 + }, + { + "epoch": 0.042820276638018316, + "grad_norm": 0.10914741456508636, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 11250 + }, + { + "epoch": 0.042858339106141, + "grad_norm": 0.11346109211444855, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 11260 + }, + { + "epoch": 0.04289640157426368, + "grad_norm": 0.11912467330694199, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 11270 + }, + { + "epoch": 0.042934464042386364, + "grad_norm": 0.11510618031024933, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 11280 + }, + { + "epoch": 0.04297252651050905, + "grad_norm": 0.10769151151180267, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 11290 + }, + { + "epoch": 0.043010588978631734, + "grad_norm": 0.1087614893913269, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 11300 + }, + { + "epoch": 0.04304865144675441, + "grad_norm": 0.1045922189950943, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 11310 + }, + { + "epoch": 0.043086713914877096, + "grad_norm": 0.12309058010578156, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 11320 + }, + { + "epoch": 0.04312477638299978, + "grad_norm": 0.11188312619924545, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 11330 + }, + { + "epoch": 0.04316283885112246, + "grad_norm": 0.11572781205177307, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 11340 + }, + { + "epoch": 0.043200901319245144, + "grad_norm": 0.11633489280939102, + "learning_rate": 0.0005, + "loss": 2.1534, + "step": 11350 + }, + { + "epoch": 0.04323896378736783, + "grad_norm": 0.11622288078069687, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 11360 + }, + { + "epoch": 0.043277026255490514, + "grad_norm": 0.11403504759073257, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 11370 + }, + { + "epoch": 0.04331508872361319, + "grad_norm": 0.13158078491687775, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 11380 + }, + { + "epoch": 0.04335315119173588, + "grad_norm": 0.11610095202922821, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 11390 + }, + { + "epoch": 0.04339121365985856, + "grad_norm": 0.10601352155208588, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 11400 + }, + { + "epoch": 0.04342927612798124, + "grad_norm": 0.1139233261346817, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 11410 + }, + { + "epoch": 0.043467338596103924, + "grad_norm": 0.12444206327199936, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 11420 + }, + { + "epoch": 0.04350540106422661, + "grad_norm": 0.12657175958156586, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 11430 + }, + { + "epoch": 0.043543463532349294, + "grad_norm": 0.12856648862361908, + "learning_rate": 0.0005, + "loss": 2.1622, + "step": 11440 + }, + { + "epoch": 0.04358152600047197, + "grad_norm": 0.12082226574420929, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 11450 + }, + { + "epoch": 0.04361958846859466, + "grad_norm": 0.11483483761548996, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 11460 + }, + { + "epoch": 0.04365765093671734, + "grad_norm": 0.11333905905485153, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 11470 + }, + { + "epoch": 0.04369571340484003, + "grad_norm": 0.16719497740268707, + "learning_rate": 0.0005, + "loss": 2.154, + "step": 11480 + }, + { + "epoch": 0.043733775872962705, + "grad_norm": 0.11585808545351028, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 11490 + }, + { + "epoch": 0.04377183834108539, + "grad_norm": 0.13105982542037964, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 11500 + }, + { + "epoch": 0.043809900809208074, + "grad_norm": 0.11006354540586472, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 11510 + }, + { + "epoch": 0.04384796327733075, + "grad_norm": 0.11695243418216705, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 11520 + }, + { + "epoch": 0.04388602574545344, + "grad_norm": 0.12058282643556595, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 11530 + }, + { + "epoch": 0.04392408821357612, + "grad_norm": 0.1141928881406784, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 11540 + }, + { + "epoch": 0.04396215068169881, + "grad_norm": 0.10934638231992722, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 11550 + }, + { + "epoch": 0.044000213149821485, + "grad_norm": 0.11017810553312302, + "learning_rate": 0.0005, + "loss": 2.1612, + "step": 11560 + }, + { + "epoch": 0.04403827561794417, + "grad_norm": 0.10793840140104294, + "learning_rate": 0.0005, + "loss": 2.1637, + "step": 11570 + }, + { + "epoch": 0.044076338086066855, + "grad_norm": 0.12375883013010025, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 11580 + }, + { + "epoch": 0.04411440055418953, + "grad_norm": 0.12532900273799896, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 11590 + }, + { + "epoch": 0.04415246302231222, + "grad_norm": 0.11699268966913223, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 11600 + }, + { + "epoch": 0.0441905254904349, + "grad_norm": 0.1009625494480133, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 11610 + }, + { + "epoch": 0.04422858795855759, + "grad_norm": 0.10361494868993759, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 11620 + }, + { + "epoch": 0.044266650426680265, + "grad_norm": 0.110933817923069, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 11630 + }, + { + "epoch": 0.04430471289480295, + "grad_norm": 0.11680163443088531, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 11640 + }, + { + "epoch": 0.044342775362925635, + "grad_norm": 0.11191077530384064, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 11650 + }, + { + "epoch": 0.04438083783104832, + "grad_norm": 0.11856496334075928, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 11660 + }, + { + "epoch": 0.044418900299171, + "grad_norm": 0.1134980171918869, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 11670 + }, + { + "epoch": 0.04445696276729368, + "grad_norm": 0.12401348352432251, + "learning_rate": 0.0005, + "loss": 2.1572, + "step": 11680 + }, + { + "epoch": 0.04449502523541637, + "grad_norm": 0.12504594027996063, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 11690 + }, + { + "epoch": 0.044533087703539045, + "grad_norm": 0.11235152184963226, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 11700 + }, + { + "epoch": 0.04457115017166173, + "grad_norm": 0.11963079869747162, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 11710 + }, + { + "epoch": 0.044609212639784415, + "grad_norm": 0.11335135996341705, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 11720 + }, + { + "epoch": 0.0446472751079071, + "grad_norm": 0.11719565838575363, + "learning_rate": 0.0005, + "loss": 2.1587, + "step": 11730 + }, + { + "epoch": 0.04468533757602978, + "grad_norm": 0.11393314599990845, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 11740 + }, + { + "epoch": 0.04472340004415246, + "grad_norm": 0.11835304647684097, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 11750 + }, + { + "epoch": 0.04476146251227515, + "grad_norm": 0.15484121441841125, + "learning_rate": 0.0005, + "loss": 2.1536, + "step": 11760 + }, + { + "epoch": 0.044799524980397826, + "grad_norm": 0.12383699417114258, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 11770 + }, + { + "epoch": 0.04483758744852051, + "grad_norm": 0.12207262217998505, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 11780 + }, + { + "epoch": 0.044875649916643195, + "grad_norm": 0.10953963547945023, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 11790 + }, + { + "epoch": 0.04491371238476588, + "grad_norm": 0.12065628916025162, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 11800 + }, + { + "epoch": 0.04495177485288856, + "grad_norm": 0.10965772718191147, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 11810 + }, + { + "epoch": 0.04498983732101124, + "grad_norm": 0.11649373173713684, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 11820 + }, + { + "epoch": 0.04502789978913393, + "grad_norm": 0.10941790044307709, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 11830 + }, + { + "epoch": 0.04506596225725661, + "grad_norm": 0.12691697478294373, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 11840 + }, + { + "epoch": 0.04510402472537929, + "grad_norm": 0.12523461878299713, + "learning_rate": 0.0005, + "loss": 2.1552, + "step": 11850 + }, + { + "epoch": 0.045142087193501976, + "grad_norm": 0.12670059502124786, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 11860 + }, + { + "epoch": 0.04518014966162466, + "grad_norm": 0.11297820508480072, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 11870 + }, + { + "epoch": 0.04521821212974734, + "grad_norm": 0.12396519631147385, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 11880 + }, + { + "epoch": 0.04525627459787002, + "grad_norm": 0.10837090015411377, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 11890 + }, + { + "epoch": 0.04529433706599271, + "grad_norm": 0.11423803120851517, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 11900 + }, + { + "epoch": 0.04533239953411539, + "grad_norm": 0.10724925249814987, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 11910 + }, + { + "epoch": 0.04537046200223807, + "grad_norm": 0.11402925103902817, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 11920 + }, + { + "epoch": 0.045408524470360756, + "grad_norm": 0.11158744245767593, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 11930 + }, + { + "epoch": 0.04544658693848344, + "grad_norm": 0.10439786314964294, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 11940 + }, + { + "epoch": 0.04548464940660612, + "grad_norm": 0.13331472873687744, + "learning_rate": 0.0005, + "loss": 2.1607, + "step": 11950 + }, + { + "epoch": 0.045522711874728804, + "grad_norm": 0.13179340958595276, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 11960 + }, + { + "epoch": 0.04556077434285149, + "grad_norm": 0.11391700804233551, + "learning_rate": 0.0005, + "loss": 2.1687, + "step": 11970 + }, + { + "epoch": 0.04559883681097417, + "grad_norm": 0.11158286780118942, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 11980 + }, + { + "epoch": 0.04563689927909685, + "grad_norm": 0.1332971751689911, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 11990 + }, + { + "epoch": 0.045674961747219536, + "grad_norm": 0.10865537077188492, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 12000 + }, + { + "epoch": 0.04571302421534222, + "grad_norm": 0.11893896758556366, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 12010 + }, + { + "epoch": 0.045751086683464906, + "grad_norm": 0.11757250875234604, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 12020 + }, + { + "epoch": 0.045789149151587584, + "grad_norm": 0.11391414701938629, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 12030 + }, + { + "epoch": 0.04582721161971027, + "grad_norm": 0.11165343970060349, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 12040 + }, + { + "epoch": 0.045865274087832954, + "grad_norm": 0.1106116846203804, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 12050 + }, + { + "epoch": 0.04590333655595563, + "grad_norm": 0.1184442788362503, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 12060 + }, + { + "epoch": 0.04594139902407832, + "grad_norm": 0.11048837751150131, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 12070 + }, + { + "epoch": 0.045979461492201, + "grad_norm": 0.11627710610628128, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 12080 + }, + { + "epoch": 0.046017523960323686, + "grad_norm": 0.12435373663902283, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 12090 + }, + { + "epoch": 0.046055586428446364, + "grad_norm": 0.10093658417463303, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 12100 + }, + { + "epoch": 0.04609364889656905, + "grad_norm": 0.122145876288414, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 12110 + }, + { + "epoch": 0.046131711364691734, + "grad_norm": 0.1312531679868698, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 12120 + }, + { + "epoch": 0.04616977383281441, + "grad_norm": 0.1195501983165741, + "learning_rate": 0.0005, + "loss": 2.1596, + "step": 12130 + }, + { + "epoch": 0.0462078363009371, + "grad_norm": 0.119472935795784, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 12140 + }, + { + "epoch": 0.04624589876905978, + "grad_norm": 0.11523136496543884, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 12150 + }, + { + "epoch": 0.04628396123718247, + "grad_norm": 0.11513196676969528, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 12160 + }, + { + "epoch": 0.046322023705305145, + "grad_norm": 0.11697802692651749, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 12170 + }, + { + "epoch": 0.04636008617342783, + "grad_norm": 0.12429428100585938, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 12180 + }, + { + "epoch": 0.046398148641550514, + "grad_norm": 0.12820155918598175, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 12190 + }, + { + "epoch": 0.04643621110967319, + "grad_norm": 0.1277361959218979, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 12200 + }, + { + "epoch": 0.04647427357779588, + "grad_norm": 0.1044423058629036, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 12210 + }, + { + "epoch": 0.04651233604591856, + "grad_norm": 0.10871636867523193, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 12220 + }, + { + "epoch": 0.04655039851404125, + "grad_norm": 0.11233247816562653, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 12230 + }, + { + "epoch": 0.046588460982163925, + "grad_norm": 0.11828743666410446, + "learning_rate": 0.0005, + "loss": 2.1544, + "step": 12240 + }, + { + "epoch": 0.04662652345028661, + "grad_norm": 0.11883151531219482, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 12250 + }, + { + "epoch": 0.046664585918409295, + "grad_norm": 0.11778468638658524, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 12260 + }, + { + "epoch": 0.04670264838653198, + "grad_norm": 0.12145578116178513, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 12270 + }, + { + "epoch": 0.04674071085465466, + "grad_norm": 0.10665465891361237, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 12280 + }, + { + "epoch": 0.04677877332277734, + "grad_norm": 0.10334984213113785, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 12290 + }, + { + "epoch": 0.04681683579090003, + "grad_norm": 0.13126300275325775, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 12300 + }, + { + "epoch": 0.046854898259022705, + "grad_norm": 0.17418350279331207, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 12310 + }, + { + "epoch": 0.04689296072714539, + "grad_norm": 0.13565418124198914, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 12320 + }, + { + "epoch": 0.046931023195268075, + "grad_norm": 0.13762266933918, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 12330 + }, + { + "epoch": 0.04696908566339076, + "grad_norm": 0.11412933468818665, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 12340 + }, + { + "epoch": 0.04700714813151344, + "grad_norm": 0.10721313953399658, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 12350 + }, + { + "epoch": 0.04704521059963612, + "grad_norm": 0.11282768845558167, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 12360 + }, + { + "epoch": 0.04708327306775881, + "grad_norm": 0.1208154559135437, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 12370 + }, + { + "epoch": 0.047121335535881485, + "grad_norm": 0.14094211161136627, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 12380 + }, + { + "epoch": 0.04715939800400417, + "grad_norm": 0.12425476312637329, + "learning_rate": 0.0005, + "loss": 2.1653, + "step": 12390 + }, + { + "epoch": 0.047197460472126855, + "grad_norm": 0.11844827979803085, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 12400 + }, + { + "epoch": 0.04723552294024954, + "grad_norm": 0.10863589495420456, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 12410 + }, + { + "epoch": 0.04727358540837222, + "grad_norm": 0.13282550871372223, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 12420 + }, + { + "epoch": 0.0473116478764949, + "grad_norm": 0.10495536774396896, + "learning_rate": 0.0005, + "loss": 2.1603, + "step": 12430 + }, + { + "epoch": 0.04734971034461759, + "grad_norm": 0.11081427335739136, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 12440 + }, + { + "epoch": 0.04738777281274027, + "grad_norm": 0.12156649678945541, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 12450 + }, + { + "epoch": 0.04742583528086295, + "grad_norm": 0.13550612330436707, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 12460 + }, + { + "epoch": 0.047463897748985635, + "grad_norm": 0.11600054055452347, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 12470 + }, + { + "epoch": 0.04750196021710832, + "grad_norm": 0.11698690801858902, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 12480 + }, + { + "epoch": 0.047540022685231, + "grad_norm": 0.1163286417722702, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 12490 + }, + { + "epoch": 0.04757808515335368, + "grad_norm": 0.11984337866306305, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 12500 + }, + { + "epoch": 0.04761614762147637, + "grad_norm": 0.11663845181465149, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 12510 + }, + { + "epoch": 0.04765421008959905, + "grad_norm": 0.1164526715874672, + "learning_rate": 0.0005, + "loss": 2.1579, + "step": 12520 + }, + { + "epoch": 0.04769227255772173, + "grad_norm": 0.1177983358502388, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 12530 + }, + { + "epoch": 0.047730335025844416, + "grad_norm": 0.1029103472828865, + "learning_rate": 0.0005, + "loss": 2.1534, + "step": 12540 + }, + { + "epoch": 0.0477683974939671, + "grad_norm": 0.1200849637389183, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 12550 + }, + { + "epoch": 0.04780645996208978, + "grad_norm": 0.12218188494443893, + "learning_rate": 0.0005, + "loss": 2.1572, + "step": 12560 + }, + { + "epoch": 0.04784452243021246, + "grad_norm": 0.11455339193344116, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 12570 + }, + { + "epoch": 0.04788258489833515, + "grad_norm": 0.122074656188488, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 12580 + }, + { + "epoch": 0.04792064736645783, + "grad_norm": 0.12012477964162827, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 12590 + }, + { + "epoch": 0.04795870983458051, + "grad_norm": 0.12606510519981384, + "learning_rate": 0.0005, + "loss": 2.1573, + "step": 12600 + }, + { + "epoch": 0.047996772302703196, + "grad_norm": 0.12740558385849, + "learning_rate": 0.0005, + "loss": 2.1536, + "step": 12610 + }, + { + "epoch": 0.04803483477082588, + "grad_norm": 0.11046471446752548, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 12620 + }, + { + "epoch": 0.048072897238948566, + "grad_norm": 0.13119575381278992, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 12630 + }, + { + "epoch": 0.048110959707071244, + "grad_norm": 0.11584407091140747, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 12640 + }, + { + "epoch": 0.04814902217519393, + "grad_norm": 0.12544192373752594, + "learning_rate": 0.0005, + "loss": 2.1605, + "step": 12650 + }, + { + "epoch": 0.04818708464331661, + "grad_norm": 0.12177877873182297, + "learning_rate": 0.0005, + "loss": 2.1662, + "step": 12660 + }, + { + "epoch": 0.04822514711143929, + "grad_norm": 0.13451693952083588, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 12670 + }, + { + "epoch": 0.048263209579561976, + "grad_norm": 0.12698346376419067, + "learning_rate": 0.0005, + "loss": 2.1589, + "step": 12680 + }, + { + "epoch": 0.04830127204768466, + "grad_norm": 0.11226338148117065, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 12690 + }, + { + "epoch": 0.048339334515807346, + "grad_norm": 0.11505680531263351, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 12700 + }, + { + "epoch": 0.048377396983930024, + "grad_norm": 0.1290256232023239, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 12710 + }, + { + "epoch": 0.04841545945205271, + "grad_norm": 0.11181953549385071, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 12720 + }, + { + "epoch": 0.048453521920175394, + "grad_norm": 0.11853830516338348, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 12730 + }, + { + "epoch": 0.04849158438829807, + "grad_norm": 0.12245513498783112, + "learning_rate": 0.0005, + "loss": 2.1519, + "step": 12740 + }, + { + "epoch": 0.048529646856420756, + "grad_norm": 0.11556795984506607, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 12750 + }, + { + "epoch": 0.04856770932454344, + "grad_norm": 0.11135729402303696, + "learning_rate": 0.0005, + "loss": 2.1578, + "step": 12760 + }, + { + "epoch": 0.048605771792666126, + "grad_norm": 0.13058693706989288, + "learning_rate": 0.0005, + "loss": 2.1583, + "step": 12770 + }, + { + "epoch": 0.048643834260788804, + "grad_norm": 0.12102789431810379, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 12780 + }, + { + "epoch": 0.04868189672891149, + "grad_norm": 0.1127203106880188, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 12790 + }, + { + "epoch": 0.048719959197034174, + "grad_norm": 0.1162179708480835, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 12800 + }, + { + "epoch": 0.04875802166515686, + "grad_norm": 0.12106090039014816, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 12810 + }, + { + "epoch": 0.04879608413327954, + "grad_norm": 0.12847653031349182, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 12820 + }, + { + "epoch": 0.04883414660140222, + "grad_norm": 0.12018883228302002, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 12830 + }, + { + "epoch": 0.048872209069524906, + "grad_norm": 0.12718555331230164, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 12840 + }, + { + "epoch": 0.048910271537647584, + "grad_norm": 0.13112443685531616, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 12850 + }, + { + "epoch": 0.04894833400577027, + "grad_norm": 0.12325897067785263, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 12860 + }, + { + "epoch": 0.048986396473892954, + "grad_norm": 0.11568273603916168, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 12870 + }, + { + "epoch": 0.04902445894201564, + "grad_norm": 0.11961609125137329, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 12880 + }, + { + "epoch": 0.04906252141013832, + "grad_norm": 0.13741044700145721, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 12890 + }, + { + "epoch": 0.049100583878261, + "grad_norm": 0.12998056411743164, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 12900 + }, + { + "epoch": 0.04913864634638369, + "grad_norm": 0.1075034886598587, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 12910 + }, + { + "epoch": 0.049176708814506365, + "grad_norm": 0.11980056762695312, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 12920 + }, + { + "epoch": 0.04921477128262905, + "grad_norm": 0.1278366595506668, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 12930 + }, + { + "epoch": 0.049252833750751734, + "grad_norm": 0.12148536741733551, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 12940 + }, + { + "epoch": 0.04929089621887442, + "grad_norm": 0.13064073026180267, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 12950 + }, + { + "epoch": 0.0493289586869971, + "grad_norm": 0.12474600970745087, + "learning_rate": 0.0005, + "loss": 2.1514, + "step": 12960 + }, + { + "epoch": 0.04936702115511978, + "grad_norm": 0.1313926726579666, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 12970 + }, + { + "epoch": 0.04940508362324247, + "grad_norm": 0.12423042207956314, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 12980 + }, + { + "epoch": 0.04944314609136515, + "grad_norm": 0.12478918582201004, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 12990 + }, + { + "epoch": 0.04948120855948783, + "grad_norm": 0.11432372778654099, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 13000 + }, + { + "epoch": 0.049519271027610515, + "grad_norm": 0.11537044495344162, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 13010 + }, + { + "epoch": 0.0495573334957332, + "grad_norm": 0.12360623478889465, + "learning_rate": 0.0005, + "loss": 2.1575, + "step": 13020 + }, + { + "epoch": 0.04959539596385588, + "grad_norm": 0.12187385559082031, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 13030 + }, + { + "epoch": 0.04963345843197856, + "grad_norm": 0.1254439353942871, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 13040 + }, + { + "epoch": 0.04967152090010125, + "grad_norm": 0.12457789480686188, + "learning_rate": 0.0005, + "loss": 2.1584, + "step": 13050 + }, + { + "epoch": 0.04970958336822393, + "grad_norm": 0.11507994681596756, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 13060 + }, + { + "epoch": 0.04974764583634661, + "grad_norm": 0.12360002845525742, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 13070 + }, + { + "epoch": 0.049785708304469295, + "grad_norm": 0.10747748613357544, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 13080 + }, + { + "epoch": 0.04982377077259198, + "grad_norm": 0.10097293555736542, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 13090 + }, + { + "epoch": 0.04986183324071466, + "grad_norm": 0.12312037497758865, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 13100 + }, + { + "epoch": 0.04989989570883734, + "grad_norm": 0.12179164588451385, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 13110 + }, + { + "epoch": 0.04993795817696003, + "grad_norm": 0.1149076297879219, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 13120 + }, + { + "epoch": 0.04997602064508271, + "grad_norm": 0.11413358896970749, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 13130 + }, + { + "epoch": 0.05001408311320539, + "grad_norm": 0.12448055297136307, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 13140 + }, + { + "epoch": 0.050052145581328075, + "grad_norm": 0.1304500848054886, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 13150 + }, + { + "epoch": 0.05009020804945076, + "grad_norm": 0.10983631759881973, + "learning_rate": 0.0005, + "loss": 2.1514, + "step": 13160 + }, + { + "epoch": 0.050128270517573445, + "grad_norm": 0.11582379788160324, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 13170 + }, + { + "epoch": 0.05016633298569612, + "grad_norm": 0.11181753128767014, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 13180 + }, + { + "epoch": 0.05020439545381881, + "grad_norm": 0.11949356645345688, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 13190 + }, + { + "epoch": 0.05024245792194149, + "grad_norm": 0.111565500497818, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 13200 + }, + { + "epoch": 0.05028052039006417, + "grad_norm": 0.12090222537517548, + "learning_rate": 0.0005, + "loss": 2.1596, + "step": 13210 + }, + { + "epoch": 0.050318582858186855, + "grad_norm": 0.10770760476589203, + "learning_rate": 0.0005, + "loss": 2.1571, + "step": 13220 + }, + { + "epoch": 0.05035664532630954, + "grad_norm": 0.13612627983093262, + "learning_rate": 0.0005, + "loss": 2.1526, + "step": 13230 + }, + { + "epoch": 0.050394707794432225, + "grad_norm": 0.1083691418170929, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 13240 + }, + { + "epoch": 0.0504327702625549, + "grad_norm": 0.1286705583333969, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 13250 + }, + { + "epoch": 0.05047083273067759, + "grad_norm": 0.13741260766983032, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 13260 + }, + { + "epoch": 0.05050889519880027, + "grad_norm": 0.11412322521209717, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 13270 + }, + { + "epoch": 0.05054695766692295, + "grad_norm": 0.11559664458036423, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 13280 + }, + { + "epoch": 0.050585020135045636, + "grad_norm": 0.12051431089639664, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 13290 + }, + { + "epoch": 0.05062308260316832, + "grad_norm": 0.12224344909191132, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 13300 + }, + { + "epoch": 0.050661145071291006, + "grad_norm": 0.11170416325330734, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 13310 + }, + { + "epoch": 0.05069920753941368, + "grad_norm": 0.172870472073555, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 13320 + }, + { + "epoch": 0.05073727000753637, + "grad_norm": 0.1193099096417427, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 13330 + }, + { + "epoch": 0.05077533247565905, + "grad_norm": 0.1267542988061905, + "learning_rate": 0.0005, + "loss": 2.1603, + "step": 13340 + }, + { + "epoch": 0.05081339494378173, + "grad_norm": 0.11663828045129776, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 13350 + }, + { + "epoch": 0.050851457411904416, + "grad_norm": 0.12765999138355255, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 13360 + }, + { + "epoch": 0.0508895198800271, + "grad_norm": 0.12195882946252823, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 13370 + }, + { + "epoch": 0.050927582348149786, + "grad_norm": 0.12730921804904938, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 13380 + }, + { + "epoch": 0.050965644816272464, + "grad_norm": 0.12172224372625351, + "learning_rate": 0.0005, + "loss": 2.1591, + "step": 13390 + }, + { + "epoch": 0.05100370728439515, + "grad_norm": 0.12180347740650177, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 13400 + }, + { + "epoch": 0.051041769752517833, + "grad_norm": 0.10897450149059296, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 13410 + }, + { + "epoch": 0.05107983222064052, + "grad_norm": 0.12180740386247635, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 13420 + }, + { + "epoch": 0.051117894688763196, + "grad_norm": 0.10590329766273499, + "learning_rate": 0.0005, + "loss": 2.1602, + "step": 13430 + }, + { + "epoch": 0.05115595715688588, + "grad_norm": 0.102902851998806, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 13440 + }, + { + "epoch": 0.051194019625008566, + "grad_norm": 0.12098443508148193, + "learning_rate": 0.0005, + "loss": 2.1603, + "step": 13450 + }, + { + "epoch": 0.051232082093131244, + "grad_norm": 0.1269441395998001, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 13460 + }, + { + "epoch": 0.05127014456125393, + "grad_norm": 0.11580880731344223, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 13470 + }, + { + "epoch": 0.051308207029376614, + "grad_norm": 0.10751645267009735, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 13480 + }, + { + "epoch": 0.0513462694974993, + "grad_norm": 0.11714409291744232, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 13490 + }, + { + "epoch": 0.05138433196562198, + "grad_norm": 0.11440756171941757, + "learning_rate": 0.0005, + "loss": 2.1551, + "step": 13500 + }, + { + "epoch": 0.05142239443374466, + "grad_norm": 0.11872199177742004, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 13510 + }, + { + "epoch": 0.051460456901867346, + "grad_norm": 0.11766054481267929, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 13520 + }, + { + "epoch": 0.051498519369990024, + "grad_norm": 0.11872246116399765, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 13530 + }, + { + "epoch": 0.05153658183811271, + "grad_norm": 0.10821834951639175, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 13540 + }, + { + "epoch": 0.051574644306235394, + "grad_norm": 0.11680571734905243, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 13550 + }, + { + "epoch": 0.05161270677435808, + "grad_norm": 0.11038310825824738, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 13560 + }, + { + "epoch": 0.05165076924248076, + "grad_norm": 0.11159025132656097, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 13570 + }, + { + "epoch": 0.05168883171060344, + "grad_norm": 0.11030484735965729, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 13580 + }, + { + "epoch": 0.05172689417872613, + "grad_norm": 0.11383315920829773, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 13590 + }, + { + "epoch": 0.05176495664684881, + "grad_norm": 0.111046202480793, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 13600 + }, + { + "epoch": 0.05180301911497149, + "grad_norm": 0.109249547123909, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 13610 + }, + { + "epoch": 0.051841081583094174, + "grad_norm": 0.11588253825902939, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 13620 + }, + { + "epoch": 0.05187914405121686, + "grad_norm": 0.11833309382200241, + "learning_rate": 0.0005, + "loss": 2.1536, + "step": 13630 + }, + { + "epoch": 0.05191720651933954, + "grad_norm": 0.12221593409776688, + "learning_rate": 0.0005, + "loss": 2.1517, + "step": 13640 + }, + { + "epoch": 0.05195526898746222, + "grad_norm": 0.11224260926246643, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 13650 + }, + { + "epoch": 0.05199333145558491, + "grad_norm": 0.12999624013900757, + "learning_rate": 0.0005, + "loss": 2.1485, + "step": 13660 + }, + { + "epoch": 0.05203139392370759, + "grad_norm": 0.1172730103135109, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 13670 + }, + { + "epoch": 0.05206945639183027, + "grad_norm": 0.14356952905654907, + "learning_rate": 0.0005, + "loss": 2.154, + "step": 13680 + }, + { + "epoch": 0.052107518859952955, + "grad_norm": 0.11941909044981003, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 13690 + }, + { + "epoch": 0.05214558132807564, + "grad_norm": 0.10828124731779099, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 13700 + }, + { + "epoch": 0.05218364379619832, + "grad_norm": 0.11855553090572357, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 13710 + }, + { + "epoch": 0.052221706264321, + "grad_norm": 0.1110978052020073, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 13720 + }, + { + "epoch": 0.05225976873244369, + "grad_norm": 0.11708894371986389, + "learning_rate": 0.0005, + "loss": 2.1673, + "step": 13730 + }, + { + "epoch": 0.05229783120056637, + "grad_norm": 0.11091696470975876, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 13740 + }, + { + "epoch": 0.05233589366868905, + "grad_norm": 0.11971557885408401, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 13750 + }, + { + "epoch": 0.052373956136811735, + "grad_norm": 0.12013959139585495, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 13760 + }, + { + "epoch": 0.05241201860493442, + "grad_norm": 0.12948036193847656, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 13770 + }, + { + "epoch": 0.052450081073057105, + "grad_norm": 0.12004504352807999, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 13780 + }, + { + "epoch": 0.05248814354117978, + "grad_norm": 0.12624917924404144, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 13790 + }, + { + "epoch": 0.05252620600930247, + "grad_norm": 0.12713316082954407, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 13800 + }, + { + "epoch": 0.05256426847742515, + "grad_norm": 0.11220577359199524, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 13810 + }, + { + "epoch": 0.05260233094554783, + "grad_norm": 0.10795598477125168, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 13820 + }, + { + "epoch": 0.052640393413670515, + "grad_norm": 0.11754649877548218, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 13830 + }, + { + "epoch": 0.0526784558817932, + "grad_norm": 0.12715084850788116, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 13840 + }, + { + "epoch": 0.052716518349915885, + "grad_norm": 0.11614597588777542, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 13850 + }, + { + "epoch": 0.05275458081803856, + "grad_norm": 0.12445352226495743, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 13860 + }, + { + "epoch": 0.05279264328616125, + "grad_norm": 0.12445578724145889, + "learning_rate": 0.0005, + "loss": 2.1595, + "step": 13870 + }, + { + "epoch": 0.05283070575428393, + "grad_norm": 0.13612398505210876, + "learning_rate": 0.0005, + "loss": 2.1503, + "step": 13880 + }, + { + "epoch": 0.05286876822240661, + "grad_norm": 0.10932034254074097, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 13890 + }, + { + "epoch": 0.052906830690529295, + "grad_norm": 0.12032169103622437, + "learning_rate": 0.0005, + "loss": 2.1639, + "step": 13900 + }, + { + "epoch": 0.05294489315865198, + "grad_norm": 0.12248440086841583, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 13910 + }, + { + "epoch": 0.052982955626774665, + "grad_norm": 0.11166456341743469, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 13920 + }, + { + "epoch": 0.05302101809489734, + "grad_norm": 0.11123169213533401, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 13930 + }, + { + "epoch": 0.05305908056302003, + "grad_norm": 0.11789651960134506, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 13940 + }, + { + "epoch": 0.05309714303114271, + "grad_norm": 0.11766628175973892, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 13950 + }, + { + "epoch": 0.0531352054992654, + "grad_norm": 0.12354835867881775, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 13960 + }, + { + "epoch": 0.053173267967388076, + "grad_norm": 0.14414092898368835, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 13970 + }, + { + "epoch": 0.05321133043551076, + "grad_norm": 0.12397162616252899, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 13980 + }, + { + "epoch": 0.053249392903633445, + "grad_norm": 0.10753097385168076, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 13990 + }, + { + "epoch": 0.05328745537175612, + "grad_norm": 0.11333264410495758, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 14000 + }, + { + "epoch": 0.05332551783987881, + "grad_norm": 0.11931371688842773, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 14010 + }, + { + "epoch": 0.05336358030800149, + "grad_norm": 0.12130443006753922, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 14020 + }, + { + "epoch": 0.05340164277612418, + "grad_norm": 0.11555348336696625, + "learning_rate": 0.0005, + "loss": 2.1566, + "step": 14030 + }, + { + "epoch": 0.053439705244246856, + "grad_norm": 0.1037520095705986, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 14040 + }, + { + "epoch": 0.05347776771236954, + "grad_norm": 0.11452918499708176, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 14050 + }, + { + "epoch": 0.053515830180492226, + "grad_norm": 0.13714534044265747, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 14060 + }, + { + "epoch": 0.053553892648614904, + "grad_norm": 0.13775818049907684, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 14070 + }, + { + "epoch": 0.05359195511673759, + "grad_norm": 0.119764044880867, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 14080 + }, + { + "epoch": 0.05363001758486027, + "grad_norm": 0.10478755831718445, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 14090 + }, + { + "epoch": 0.05366808005298296, + "grad_norm": 0.12574444711208344, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 14100 + }, + { + "epoch": 0.053706142521105636, + "grad_norm": 0.12435276061296463, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 14110 + }, + { + "epoch": 0.05374420498922832, + "grad_norm": 0.1113581657409668, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 14120 + }, + { + "epoch": 0.053782267457351006, + "grad_norm": 0.1145823672413826, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 14130 + }, + { + "epoch": 0.05382032992547369, + "grad_norm": 0.11234494298696518, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 14140 + }, + { + "epoch": 0.05385839239359637, + "grad_norm": 0.11848796904087067, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 14150 + }, + { + "epoch": 0.053896454861719054, + "grad_norm": 0.12532465159893036, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 14160 + }, + { + "epoch": 0.05393451732984174, + "grad_norm": 0.10763829201459885, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 14170 + }, + { + "epoch": 0.053972579797964416, + "grad_norm": 0.12030988931655884, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 14180 + }, + { + "epoch": 0.0540106422660871, + "grad_norm": 0.1350197196006775, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 14190 + }, + { + "epoch": 0.054048704734209786, + "grad_norm": 0.11541256308555603, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 14200 + }, + { + "epoch": 0.05408676720233247, + "grad_norm": 0.12731333076953888, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 14210 + }, + { + "epoch": 0.05412482967045515, + "grad_norm": 0.12442631274461746, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 14220 + }, + { + "epoch": 0.054162892138577834, + "grad_norm": 0.12272724509239197, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 14230 + }, + { + "epoch": 0.05420095460670052, + "grad_norm": 0.11611298471689224, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 14240 + }, + { + "epoch": 0.0542390170748232, + "grad_norm": 0.10913633555173874, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 14250 + }, + { + "epoch": 0.05427707954294588, + "grad_norm": 0.10878828912973404, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 14260 + }, + { + "epoch": 0.054315142011068566, + "grad_norm": 0.12029829621315002, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 14270 + }, + { + "epoch": 0.05435320447919125, + "grad_norm": 0.11875171959400177, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 14280 + }, + { + "epoch": 0.05439126694731393, + "grad_norm": 0.12768295407295227, + "learning_rate": 0.0005, + "loss": 2.161, + "step": 14290 + }, + { + "epoch": 0.054429329415436614, + "grad_norm": 0.13291147351264954, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 14300 + }, + { + "epoch": 0.0544673918835593, + "grad_norm": 0.12438298016786575, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 14310 + }, + { + "epoch": 0.05450545435168198, + "grad_norm": 0.1304919719696045, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 14320 + }, + { + "epoch": 0.05454351681980466, + "grad_norm": 0.1263260394334793, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 14330 + }, + { + "epoch": 0.05458157928792735, + "grad_norm": 0.1273345649242401, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 14340 + }, + { + "epoch": 0.05461964175605003, + "grad_norm": 0.11910910904407501, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 14350 + }, + { + "epoch": 0.05465770422417271, + "grad_norm": 0.10915575176477432, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 14360 + }, + { + "epoch": 0.054695766692295394, + "grad_norm": 0.11233872920274734, + "learning_rate": 0.0005, + "loss": 2.1608, + "step": 14370 + }, + { + "epoch": 0.05473382916041808, + "grad_norm": 0.1092049777507782, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 14380 + }, + { + "epoch": 0.054771891628540764, + "grad_norm": 0.10772830247879028, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 14390 + }, + { + "epoch": 0.05480995409666344, + "grad_norm": 0.11499619483947754, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 14400 + }, + { + "epoch": 0.05484801656478613, + "grad_norm": 0.12505364418029785, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 14410 + }, + { + "epoch": 0.05488607903290881, + "grad_norm": 0.10888305306434631, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 14420 + }, + { + "epoch": 0.05492414150103149, + "grad_norm": 0.11248484998941422, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 14430 + }, + { + "epoch": 0.054962203969154175, + "grad_norm": 0.12412357330322266, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 14440 + }, + { + "epoch": 0.05500026643727686, + "grad_norm": 0.13320519030094147, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 14450 + }, + { + "epoch": 0.055038328905399544, + "grad_norm": 0.11333293467760086, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 14460 + }, + { + "epoch": 0.05507639137352222, + "grad_norm": 0.12249065935611725, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 14470 + }, + { + "epoch": 0.05511445384164491, + "grad_norm": 0.11434555053710938, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 14480 + }, + { + "epoch": 0.05515251630976759, + "grad_norm": 0.12702178955078125, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 14490 + }, + { + "epoch": 0.05519057877789027, + "grad_norm": 0.10995277017354965, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 14500 + }, + { + "epoch": 0.055228641246012955, + "grad_norm": 0.1193859875202179, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 14510 + }, + { + "epoch": 0.05526670371413564, + "grad_norm": 0.12037155777215958, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 14520 + }, + { + "epoch": 0.055304766182258325, + "grad_norm": 0.11841357499361038, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 14530 + }, + { + "epoch": 0.055342828650381, + "grad_norm": 0.11028337478637695, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 14540 + }, + { + "epoch": 0.05538089111850369, + "grad_norm": 0.11902716010808945, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 14550 + }, + { + "epoch": 0.05541895358662637, + "grad_norm": 0.11138655245304108, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 14560 + }, + { + "epoch": 0.05545701605474906, + "grad_norm": 0.11641339957714081, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 14570 + }, + { + "epoch": 0.055495078522871735, + "grad_norm": 0.13227738440036774, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 14580 + }, + { + "epoch": 0.05553314099099442, + "grad_norm": 0.12470174580812454, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 14590 + }, + { + "epoch": 0.055571203459117105, + "grad_norm": 0.10701868683099747, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 14600 + }, + { + "epoch": 0.05560926592723978, + "grad_norm": 0.12646402418613434, + "learning_rate": 0.0005, + "loss": 2.157, + "step": 14610 + }, + { + "epoch": 0.05564732839536247, + "grad_norm": 0.10631322115659714, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 14620 + }, + { + "epoch": 0.05568539086348515, + "grad_norm": 0.11027562618255615, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 14630 + }, + { + "epoch": 0.05572345333160784, + "grad_norm": 0.12378138303756714, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 14640 + }, + { + "epoch": 0.055761515799730516, + "grad_norm": 0.11404086649417877, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 14650 + }, + { + "epoch": 0.0557995782678532, + "grad_norm": 0.11708112806081772, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 14660 + }, + { + "epoch": 0.055837640735975885, + "grad_norm": 0.12451019138097763, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 14670 + }, + { + "epoch": 0.05587570320409856, + "grad_norm": 0.13042905926704407, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 14680 + }, + { + "epoch": 0.05591376567222125, + "grad_norm": 0.1280081868171692, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 14690 + }, + { + "epoch": 0.05595182814034393, + "grad_norm": 0.1199612095952034, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 14700 + }, + { + "epoch": 0.05598989060846662, + "grad_norm": 0.11185267567634583, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 14710 + }, + { + "epoch": 0.056027953076589296, + "grad_norm": 0.11486394703388214, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 14720 + }, + { + "epoch": 0.05606601554471198, + "grad_norm": 0.11386962980031967, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 14730 + }, + { + "epoch": 0.056104078012834666, + "grad_norm": 0.13496342301368713, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 14740 + }, + { + "epoch": 0.05614214048095735, + "grad_norm": 0.1216830313205719, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 14750 + }, + { + "epoch": 0.05618020294908003, + "grad_norm": 0.1195463240146637, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 14760 + }, + { + "epoch": 0.05621826541720271, + "grad_norm": 0.12027116119861603, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 14770 + }, + { + "epoch": 0.0562563278853254, + "grad_norm": 0.12256968766450882, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 14780 + }, + { + "epoch": 0.056294390353448076, + "grad_norm": 0.12090452015399933, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 14790 + }, + { + "epoch": 0.05633245282157076, + "grad_norm": 0.12250377982854843, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 14800 + }, + { + "epoch": 0.056370515289693446, + "grad_norm": 0.11214666813611984, + "learning_rate": 0.0005, + "loss": 2.1611, + "step": 14810 + }, + { + "epoch": 0.05640857775781613, + "grad_norm": 0.11024061590433121, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 14820 + }, + { + "epoch": 0.05644664022593881, + "grad_norm": 0.120402991771698, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 14830 + }, + { + "epoch": 0.056484702694061494, + "grad_norm": 0.12345562875270844, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 14840 + }, + { + "epoch": 0.05652276516218418, + "grad_norm": 0.1288871020078659, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 14850 + }, + { + "epoch": 0.056560827630306856, + "grad_norm": 0.12485507130622864, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 14860 + }, + { + "epoch": 0.05659889009842954, + "grad_norm": 0.11655443906784058, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 14870 + }, + { + "epoch": 0.056636952566552226, + "grad_norm": 0.136055126786232, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 14880 + }, + { + "epoch": 0.05667501503467491, + "grad_norm": 0.11421075463294983, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 14890 + }, + { + "epoch": 0.05671307750279759, + "grad_norm": 0.11379828304052353, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 14900 + }, + { + "epoch": 0.056751139970920274, + "grad_norm": 0.11044083535671234, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 14910 + }, + { + "epoch": 0.05678920243904296, + "grad_norm": 0.11647764593362808, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 14920 + }, + { + "epoch": 0.056827264907165644, + "grad_norm": 0.1301194727420807, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 14930 + }, + { + "epoch": 0.05686532737528832, + "grad_norm": 0.11834192276000977, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 14940 + }, + { + "epoch": 0.056903389843411006, + "grad_norm": 0.13392376899719238, + "learning_rate": 0.0005, + "loss": 2.1596, + "step": 14950 + }, + { + "epoch": 0.05694145231153369, + "grad_norm": 0.1137416735291481, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 14960 + }, + { + "epoch": 0.05697951477965637, + "grad_norm": 0.12346642464399338, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 14970 + }, + { + "epoch": 0.057017577247779054, + "grad_norm": 0.12635205686092377, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 14980 + }, + { + "epoch": 0.05705563971590174, + "grad_norm": 0.1041107177734375, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 14990 + }, + { + "epoch": 0.057093702184024424, + "grad_norm": 0.11996068060398102, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 15000 + }, + { + "epoch": 0.0571317646521471, + "grad_norm": 0.10924813151359558, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 15010 + }, + { + "epoch": 0.05716982712026979, + "grad_norm": 0.11912663280963898, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 15020 + }, + { + "epoch": 0.05720788958839247, + "grad_norm": 0.10958463698625565, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 15030 + }, + { + "epoch": 0.05724595205651515, + "grad_norm": 0.11127685755491257, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 15040 + }, + { + "epoch": 0.057284014524637834, + "grad_norm": 0.11640200763940811, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 15050 + }, + { + "epoch": 0.05732207699276052, + "grad_norm": 0.10901486128568649, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 15060 + }, + { + "epoch": 0.057360139460883204, + "grad_norm": 0.11258064955472946, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 15070 + }, + { + "epoch": 0.05739820192900588, + "grad_norm": 0.13269981741905212, + "learning_rate": 0.0005, + "loss": 2.1559, + "step": 15080 + }, + { + "epoch": 0.05743626439712857, + "grad_norm": 0.11579059064388275, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 15090 + }, + { + "epoch": 0.05747432686525125, + "grad_norm": 0.10881291329860687, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 15100 + }, + { + "epoch": 0.05751238933337394, + "grad_norm": 0.11739391833543777, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 15110 + }, + { + "epoch": 0.057550451801496615, + "grad_norm": 0.114154152572155, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 15120 + }, + { + "epoch": 0.0575885142696193, + "grad_norm": 0.11408552527427673, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 15130 + }, + { + "epoch": 0.057626576737741984, + "grad_norm": 0.12134097516536713, + "learning_rate": 0.0005, + "loss": 2.1582, + "step": 15140 + }, + { + "epoch": 0.05766463920586466, + "grad_norm": 0.11878085881471634, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 15150 + }, + { + "epoch": 0.05770270167398735, + "grad_norm": 0.11943016946315765, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 15160 + }, + { + "epoch": 0.05774076414211003, + "grad_norm": 0.12231114506721497, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 15170 + }, + { + "epoch": 0.05777882661023272, + "grad_norm": 0.1249522864818573, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 15180 + }, + { + "epoch": 0.057816889078355395, + "grad_norm": 0.12952202558517456, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 15190 + }, + { + "epoch": 0.05785495154647808, + "grad_norm": 0.12828490138053894, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 15200 + }, + { + "epoch": 0.057893014014600765, + "grad_norm": 0.11592209339141846, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 15210 + }, + { + "epoch": 0.05793107648272344, + "grad_norm": 0.11671043932437897, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 15220 + }, + { + "epoch": 0.05796913895084613, + "grad_norm": 0.11773068457841873, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 15230 + }, + { + "epoch": 0.05800720141896881, + "grad_norm": 0.12474120408296585, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 15240 + }, + { + "epoch": 0.0580452638870915, + "grad_norm": 0.11964181810617447, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 15250 + }, + { + "epoch": 0.058083326355214175, + "grad_norm": 0.12345830351114273, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 15260 + }, + { + "epoch": 0.05812138882333686, + "grad_norm": 0.1257256418466568, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 15270 + }, + { + "epoch": 0.058159451291459545, + "grad_norm": 0.10888057947158813, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 15280 + }, + { + "epoch": 0.05819751375958223, + "grad_norm": 0.10951346158981323, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 15290 + }, + { + "epoch": 0.05823557622770491, + "grad_norm": 0.10857826471328735, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 15300 + }, + { + "epoch": 0.05827363869582759, + "grad_norm": 0.11710096895694733, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 15310 + }, + { + "epoch": 0.05831170116395028, + "grad_norm": 0.11304796487092972, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 15320 + }, + { + "epoch": 0.058349763632072955, + "grad_norm": 0.12150050699710846, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 15330 + }, + { + "epoch": 0.05838782610019564, + "grad_norm": 0.11822327971458435, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 15340 + }, + { + "epoch": 0.058425888568318325, + "grad_norm": 0.1256248503923416, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 15350 + }, + { + "epoch": 0.05846395103644101, + "grad_norm": 0.12715673446655273, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 15360 + }, + { + "epoch": 0.05850201350456369, + "grad_norm": 0.12773144245147705, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 15370 + }, + { + "epoch": 0.05854007597268637, + "grad_norm": 0.11277401447296143, + "learning_rate": 0.0005, + "loss": 2.1554, + "step": 15380 + }, + { + "epoch": 0.05857813844080906, + "grad_norm": 0.10849560052156448, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 15390 + }, + { + "epoch": 0.058616200908931736, + "grad_norm": 0.11700870841741562, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 15400 + }, + { + "epoch": 0.05865426337705442, + "grad_norm": 0.12502211332321167, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 15410 + }, + { + "epoch": 0.058692325845177105, + "grad_norm": 0.11653271317481995, + "learning_rate": 0.0005, + "loss": 2.161, + "step": 15420 + }, + { + "epoch": 0.05873038831329979, + "grad_norm": 0.12796089053153992, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 15430 + }, + { + "epoch": 0.05876845078142247, + "grad_norm": 0.12059906125068665, + "learning_rate": 0.0005, + "loss": 2.1537, + "step": 15440 + }, + { + "epoch": 0.05880651324954515, + "grad_norm": 0.11414425820112228, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 15450 + }, + { + "epoch": 0.05884457571766784, + "grad_norm": 0.11696535348892212, + "learning_rate": 0.0005, + "loss": 2.1544, + "step": 15460 + }, + { + "epoch": 0.058882638185790516, + "grad_norm": 0.12169355154037476, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 15470 + }, + { + "epoch": 0.0589207006539132, + "grad_norm": 0.13853904604911804, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 15480 + }, + { + "epoch": 0.058958763122035886, + "grad_norm": 0.11219564080238342, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 15490 + }, + { + "epoch": 0.05899682559015857, + "grad_norm": 0.1147218570113182, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 15500 + }, + { + "epoch": 0.05903488805828125, + "grad_norm": 0.11296788603067398, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 15510 + }, + { + "epoch": 0.05907295052640393, + "grad_norm": 0.11690889298915863, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 15520 + }, + { + "epoch": 0.05911101299452662, + "grad_norm": 0.12081010639667511, + "learning_rate": 0.0005, + "loss": 2.1577, + "step": 15530 + }, + { + "epoch": 0.0591490754626493, + "grad_norm": 0.12585335969924927, + "learning_rate": 0.0005, + "loss": 2.1556, + "step": 15540 + }, + { + "epoch": 0.05918713793077198, + "grad_norm": 0.11698474735021591, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 15550 + }, + { + "epoch": 0.059225200398894666, + "grad_norm": 0.12431799620389938, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 15560 + }, + { + "epoch": 0.05926326286701735, + "grad_norm": 0.1195605993270874, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 15570 + }, + { + "epoch": 0.05930132533514003, + "grad_norm": 0.12227523326873779, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 15580 + }, + { + "epoch": 0.059339387803262714, + "grad_norm": 0.11026965826749802, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 15590 + }, + { + "epoch": 0.0593774502713854, + "grad_norm": 0.12768976390361786, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 15600 + }, + { + "epoch": 0.05941551273950808, + "grad_norm": 0.1162969097495079, + "learning_rate": 0.0005, + "loss": 2.1606, + "step": 15610 + }, + { + "epoch": 0.05945357520763076, + "grad_norm": 0.1282632201910019, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 15620 + }, + { + "epoch": 0.059491637675753446, + "grad_norm": 0.11534926295280457, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 15630 + }, + { + "epoch": 0.05952970014387613, + "grad_norm": 0.12293092906475067, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 15640 + }, + { + "epoch": 0.05956776261199881, + "grad_norm": 0.13364486396312714, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 15650 + }, + { + "epoch": 0.059605825080121494, + "grad_norm": 0.1044587567448616, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 15660 + }, + { + "epoch": 0.05964388754824418, + "grad_norm": 0.11058751493692398, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 15670 + }, + { + "epoch": 0.059681950016366864, + "grad_norm": 0.11776610463857651, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 15680 + }, + { + "epoch": 0.05972001248448954, + "grad_norm": 0.11538823693990707, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 15690 + }, + { + "epoch": 0.059758074952612227, + "grad_norm": 0.11598982661962509, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 15700 + }, + { + "epoch": 0.05979613742073491, + "grad_norm": 0.11938263475894928, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 15710 + }, + { + "epoch": 0.059834199888857596, + "grad_norm": 0.1095249280333519, + "learning_rate": 0.0005, + "loss": 2.1593, + "step": 15720 + }, + { + "epoch": 0.059872262356980274, + "grad_norm": 0.10784026980400085, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 15730 + }, + { + "epoch": 0.05991032482510296, + "grad_norm": 0.11930166929960251, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 15740 + }, + { + "epoch": 0.059948387293225644, + "grad_norm": 0.15975899994373322, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 15750 + }, + { + "epoch": 0.05998644976134832, + "grad_norm": 0.11798959225416183, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 15760 + }, + { + "epoch": 0.06002451222947101, + "grad_norm": 0.12302548438310623, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 15770 + }, + { + "epoch": 0.06006257469759369, + "grad_norm": 0.1195300966501236, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 15780 + }, + { + "epoch": 0.06010063716571638, + "grad_norm": 0.1508270651102066, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 15790 + }, + { + "epoch": 0.060138699633839054, + "grad_norm": 0.11350975930690765, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 15800 + }, + { + "epoch": 0.06017676210196174, + "grad_norm": 0.11031148582696915, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 15810 + }, + { + "epoch": 0.060214824570084424, + "grad_norm": 0.12113554775714874, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 15820 + }, + { + "epoch": 0.0602528870382071, + "grad_norm": 0.11519166082143784, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 15830 + }, + { + "epoch": 0.06029094950632979, + "grad_norm": 0.11963526904582977, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 15840 + }, + { + "epoch": 0.06032901197445247, + "grad_norm": 0.11226295679807663, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 15850 + }, + { + "epoch": 0.06036707444257516, + "grad_norm": 0.11856474727392197, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 15860 + }, + { + "epoch": 0.060405136910697835, + "grad_norm": 0.11463222652673721, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 15870 + }, + { + "epoch": 0.06044319937882052, + "grad_norm": 0.10893730074167252, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 15880 + }, + { + "epoch": 0.060481261846943205, + "grad_norm": 0.12570078670978546, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 15890 + }, + { + "epoch": 0.06051932431506589, + "grad_norm": 0.12432834506034851, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 15900 + }, + { + "epoch": 0.06055738678318857, + "grad_norm": 0.11896125227212906, + "learning_rate": 0.0005, + "loss": 2.1616, + "step": 15910 + }, + { + "epoch": 0.06059544925131125, + "grad_norm": 0.1110902652144432, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 15920 + }, + { + "epoch": 0.06063351171943394, + "grad_norm": 0.1337718665599823, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 15930 + }, + { + "epoch": 0.060671574187556615, + "grad_norm": 0.11414303630590439, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 15940 + }, + { + "epoch": 0.0607096366556793, + "grad_norm": 0.11986953020095825, + "learning_rate": 0.0005, + "loss": 2.1562, + "step": 15950 + }, + { + "epoch": 0.060747699123801985, + "grad_norm": 0.12421073019504547, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 15960 + }, + { + "epoch": 0.06078576159192467, + "grad_norm": 0.12681721150875092, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 15970 + }, + { + "epoch": 0.06082382406004735, + "grad_norm": 0.41695889830589294, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 15980 + }, + { + "epoch": 0.06086188652817003, + "grad_norm": 0.11906461417675018, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 15990 + }, + { + "epoch": 0.06089994899629272, + "grad_norm": 0.10986065119504929, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 16000 + }, + { + "epoch": 0.060938011464415395, + "grad_norm": 0.12073797732591629, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 16010 + }, + { + "epoch": 0.06097607393253808, + "grad_norm": 0.12336317449808121, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 16020 + }, + { + "epoch": 0.061014136400660765, + "grad_norm": 0.11709783971309662, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 16030 + }, + { + "epoch": 0.06105219886878345, + "grad_norm": 0.11148954182863235, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 16040 + }, + { + "epoch": 0.06109026133690613, + "grad_norm": 0.1247798353433609, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 16050 + }, + { + "epoch": 0.06112832380502881, + "grad_norm": 0.11071789264678955, + "learning_rate": 0.0005, + "loss": 2.1561, + "step": 16060 + }, + { + "epoch": 0.0611663862731515, + "grad_norm": 0.27611425518989563, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 16070 + }, + { + "epoch": 0.06120444874127418, + "grad_norm": 0.11537760496139526, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 16080 + }, + { + "epoch": 0.06124251120939686, + "grad_norm": 0.12730243802070618, + "learning_rate": 0.0005, + "loss": 2.1598, + "step": 16090 + }, + { + "epoch": 0.061280573677519545, + "grad_norm": 0.12035439908504486, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 16100 + }, + { + "epoch": 0.06131863614564223, + "grad_norm": 0.1186971440911293, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 16110 + }, + { + "epoch": 0.06135669861376491, + "grad_norm": 0.11212721467018127, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 16120 + }, + { + "epoch": 0.06139476108188759, + "grad_norm": 0.11165232956409454, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 16130 + }, + { + "epoch": 0.06143282355001028, + "grad_norm": 0.12069948017597198, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 16140 + }, + { + "epoch": 0.06147088601813296, + "grad_norm": 0.11381001025438309, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 16150 + }, + { + "epoch": 0.06150894848625564, + "grad_norm": 0.12348002195358276, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 16160 + }, + { + "epoch": 0.061547010954378326, + "grad_norm": 0.11615636944770813, + "learning_rate": 0.0005, + "loss": 2.1544, + "step": 16170 + }, + { + "epoch": 0.06158507342250101, + "grad_norm": 0.111156165599823, + "learning_rate": 0.0005, + "loss": 2.1559, + "step": 16180 + }, + { + "epoch": 0.06162313589062369, + "grad_norm": 0.12605836987495422, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 16190 + }, + { + "epoch": 0.06166119835874637, + "grad_norm": 0.12461890280246735, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 16200 + }, + { + "epoch": 0.06169926082686906, + "grad_norm": 0.12980882823467255, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 16210 + }, + { + "epoch": 0.06173732329499174, + "grad_norm": 0.12242074310779572, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 16220 + }, + { + "epoch": 0.06177538576311442, + "grad_norm": 0.13245800137519836, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 16230 + }, + { + "epoch": 0.061813448231237106, + "grad_norm": 0.11558622121810913, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 16240 + }, + { + "epoch": 0.06185151069935979, + "grad_norm": 0.13138440251350403, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 16250 + }, + { + "epoch": 0.061889573167482476, + "grad_norm": 0.11061680316925049, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 16260 + }, + { + "epoch": 0.061927635635605154, + "grad_norm": 0.12240659445524216, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 16270 + }, + { + "epoch": 0.06196569810372784, + "grad_norm": 0.12258733063936234, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 16280 + }, + { + "epoch": 0.06200376057185052, + "grad_norm": 0.12067878991365433, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 16290 + }, + { + "epoch": 0.0620418230399732, + "grad_norm": 0.10910321027040482, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 16300 + }, + { + "epoch": 0.062079885508095886, + "grad_norm": 0.11242785304784775, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 16310 + }, + { + "epoch": 0.06211794797621857, + "grad_norm": 0.11315638571977615, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 16320 + }, + { + "epoch": 0.062156010444341256, + "grad_norm": 0.11431436985731125, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 16330 + }, + { + "epoch": 0.062194072912463934, + "grad_norm": 0.10935595631599426, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 16340 + }, + { + "epoch": 0.06223213538058662, + "grad_norm": 0.11540095508098602, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 16350 + }, + { + "epoch": 0.062270197848709304, + "grad_norm": 0.1297951191663742, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 16360 + }, + { + "epoch": 0.06230826031683198, + "grad_norm": 0.12086521089076996, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 16370 + }, + { + "epoch": 0.062346322784954666, + "grad_norm": 0.1273258775472641, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 16380 + }, + { + "epoch": 0.06238438525307735, + "grad_norm": 0.12046687304973602, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 16390 + }, + { + "epoch": 0.062422447721200036, + "grad_norm": 0.11739485710859299, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 16400 + }, + { + "epoch": 0.062460510189322714, + "grad_norm": 0.13197965919971466, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 16410 + }, + { + "epoch": 0.0624985726574454, + "grad_norm": 0.11531320214271545, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 16420 + }, + { + "epoch": 0.06253663512556808, + "grad_norm": 0.12141018360853195, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 16430 + }, + { + "epoch": 0.06257469759369076, + "grad_norm": 0.11645390838384628, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 16440 + }, + { + "epoch": 0.06261276006181345, + "grad_norm": 0.11072708666324615, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 16450 + }, + { + "epoch": 0.06265082252993613, + "grad_norm": 0.112320177257061, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 16460 + }, + { + "epoch": 0.06268888499805882, + "grad_norm": 0.12140627950429916, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 16470 + }, + { + "epoch": 0.0627269474661815, + "grad_norm": 0.11296653002500534, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 16480 + }, + { + "epoch": 0.06276500993430419, + "grad_norm": 0.11528851091861725, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 16490 + }, + { + "epoch": 0.06280307240242686, + "grad_norm": 0.1190958246588707, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 16500 + }, + { + "epoch": 0.06284113487054954, + "grad_norm": 0.11648474633693695, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 16510 + }, + { + "epoch": 0.06287919733867223, + "grad_norm": 0.1304817795753479, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 16520 + }, + { + "epoch": 0.06291725980679491, + "grad_norm": 0.2063707560300827, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 16530 + }, + { + "epoch": 0.0629553222749176, + "grad_norm": 0.10847392678260803, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 16540 + }, + { + "epoch": 0.06299338474304028, + "grad_norm": 0.12247609347105026, + "learning_rate": 0.0005, + "loss": 2.1568, + "step": 16550 + }, + { + "epoch": 0.06303144721116297, + "grad_norm": 0.11401928216218948, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 16560 + }, + { + "epoch": 0.06306950967928564, + "grad_norm": 0.12218509614467621, + "learning_rate": 0.0005, + "loss": 2.1565, + "step": 16570 + }, + { + "epoch": 0.06310757214740832, + "grad_norm": 0.12032946199178696, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 16580 + }, + { + "epoch": 0.06314563461553101, + "grad_norm": 0.12295140326023102, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 16590 + }, + { + "epoch": 0.06318369708365369, + "grad_norm": 0.11174926906824112, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 16600 + }, + { + "epoch": 0.06322175955177638, + "grad_norm": 0.12819042801856995, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 16610 + }, + { + "epoch": 0.06325982201989906, + "grad_norm": 0.11440946161746979, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 16620 + }, + { + "epoch": 0.06329788448802175, + "grad_norm": 0.1090090423822403, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 16630 + }, + { + "epoch": 0.06333594695614442, + "grad_norm": 0.13145922124385834, + "learning_rate": 0.0005, + "loss": 2.1518, + "step": 16640 + }, + { + "epoch": 0.0633740094242671, + "grad_norm": 0.1270890235900879, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 16650 + }, + { + "epoch": 0.06341207189238979, + "grad_norm": 0.11212950944900513, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 16660 + }, + { + "epoch": 0.06345013436051247, + "grad_norm": 0.12071997672319412, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 16670 + }, + { + "epoch": 0.06348819682863516, + "grad_norm": 0.12150036543607712, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 16680 + }, + { + "epoch": 0.06352625929675784, + "grad_norm": 0.11558043211698532, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 16690 + }, + { + "epoch": 0.06356432176488053, + "grad_norm": 0.11684717983007431, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 16700 + }, + { + "epoch": 0.0636023842330032, + "grad_norm": 0.10834956914186478, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 16710 + }, + { + "epoch": 0.06364044670112588, + "grad_norm": 0.11303365230560303, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 16720 + }, + { + "epoch": 0.06367850916924857, + "grad_norm": 0.11881110072135925, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 16730 + }, + { + "epoch": 0.06371657163737125, + "grad_norm": 0.1124265193939209, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 16740 + }, + { + "epoch": 0.06375463410549394, + "grad_norm": 0.11897893995046616, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 16750 + }, + { + "epoch": 0.06379269657361662, + "grad_norm": 0.10814017057418823, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 16760 + }, + { + "epoch": 0.06383075904173931, + "grad_norm": 0.12043372541666031, + "learning_rate": 0.0005, + "loss": 2.1636, + "step": 16770 + }, + { + "epoch": 0.06386882150986199, + "grad_norm": 0.108718641102314, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 16780 + }, + { + "epoch": 0.06390688397798466, + "grad_norm": 0.10894080251455307, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 16790 + }, + { + "epoch": 0.06394494644610735, + "grad_norm": 0.12031812220811844, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 16800 + }, + { + "epoch": 0.06398300891423003, + "grad_norm": 0.10882657766342163, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 16810 + }, + { + "epoch": 0.06402107138235272, + "grad_norm": 0.11654344946146011, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 16820 + }, + { + "epoch": 0.0640591338504754, + "grad_norm": 0.1230119913816452, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 16830 + }, + { + "epoch": 0.06409719631859809, + "grad_norm": 0.11708708852529526, + "learning_rate": 0.0005, + "loss": 2.1595, + "step": 16840 + }, + { + "epoch": 0.06413525878672077, + "grad_norm": 0.1183198019862175, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 16850 + }, + { + "epoch": 0.06417332125484344, + "grad_norm": 0.1250128149986267, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 16860 + }, + { + "epoch": 0.06421138372296613, + "grad_norm": 0.12850812077522278, + "learning_rate": 0.0005, + "loss": 2.158, + "step": 16870 + }, + { + "epoch": 0.06424944619108881, + "grad_norm": 0.12004446983337402, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 16880 + }, + { + "epoch": 0.0642875086592115, + "grad_norm": 0.1127396896481514, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 16890 + }, + { + "epoch": 0.06432557112733418, + "grad_norm": 0.13206587731838226, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 16900 + }, + { + "epoch": 0.06436363359545687, + "grad_norm": 0.11783764511346817, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 16910 + }, + { + "epoch": 0.06440169606357955, + "grad_norm": 0.12285258620977402, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 16920 + }, + { + "epoch": 0.06443975853170222, + "grad_norm": 0.13280342519283295, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 16930 + }, + { + "epoch": 0.06447782099982491, + "grad_norm": 0.11519664525985718, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 16940 + }, + { + "epoch": 0.0645158834679476, + "grad_norm": 0.11494658142328262, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 16950 + }, + { + "epoch": 0.06455394593607028, + "grad_norm": 0.10861913114786148, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 16960 + }, + { + "epoch": 0.06459200840419296, + "grad_norm": 0.1098102480173111, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 16970 + }, + { + "epoch": 0.06463007087231565, + "grad_norm": 0.11972511559724808, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 16980 + }, + { + "epoch": 0.06466813334043833, + "grad_norm": 0.11844224482774734, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 16990 + }, + { + "epoch": 0.064706195808561, + "grad_norm": 0.10886506736278534, + "learning_rate": 0.0005, + "loss": 2.1565, + "step": 17000 + }, + { + "epoch": 0.06474425827668369, + "grad_norm": 0.11520258337259293, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 17010 + }, + { + "epoch": 0.06478232074480637, + "grad_norm": 0.113410085439682, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 17020 + }, + { + "epoch": 0.06482038321292906, + "grad_norm": 0.11354994773864746, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 17030 + }, + { + "epoch": 0.06485844568105174, + "grad_norm": 0.1343780755996704, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 17040 + }, + { + "epoch": 0.06489650814917443, + "grad_norm": 0.12449301779270172, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 17050 + }, + { + "epoch": 0.06493457061729711, + "grad_norm": 0.12187916040420532, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 17060 + }, + { + "epoch": 0.06497263308541978, + "grad_norm": 0.11673722416162491, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 17070 + }, + { + "epoch": 0.06501069555354247, + "grad_norm": 0.1161603257060051, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 17080 + }, + { + "epoch": 0.06504875802166515, + "grad_norm": 0.12846381962299347, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 17090 + }, + { + "epoch": 0.06508682048978784, + "grad_norm": 0.13948954641819, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 17100 + }, + { + "epoch": 0.06512488295791052, + "grad_norm": 0.11778617650270462, + "learning_rate": 0.0005, + "loss": 2.1578, + "step": 17110 + }, + { + "epoch": 0.06516294542603321, + "grad_norm": 0.11828217655420303, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 17120 + }, + { + "epoch": 0.0652010078941559, + "grad_norm": 0.1147540882229805, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 17130 + }, + { + "epoch": 0.06523907036227858, + "grad_norm": 0.10472284257411957, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 17140 + }, + { + "epoch": 0.06527713283040125, + "grad_norm": 0.12210440635681152, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 17150 + }, + { + "epoch": 0.06531519529852393, + "grad_norm": 0.1380053013563156, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 17160 + }, + { + "epoch": 0.06535325776664662, + "grad_norm": 0.12499924004077911, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 17170 + }, + { + "epoch": 0.0653913202347693, + "grad_norm": 0.11574744433164597, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 17180 + }, + { + "epoch": 0.06542938270289199, + "grad_norm": 0.11198943108320236, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 17190 + }, + { + "epoch": 0.06546744517101467, + "grad_norm": 0.1090979054570198, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 17200 + }, + { + "epoch": 0.06550550763913736, + "grad_norm": 0.11089687049388885, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 17210 + }, + { + "epoch": 0.06554357010726003, + "grad_norm": 0.1218784749507904, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 17220 + }, + { + "epoch": 0.06558163257538271, + "grad_norm": 0.1282770186662674, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 17230 + }, + { + "epoch": 0.0656196950435054, + "grad_norm": 0.11114176362752914, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 17240 + }, + { + "epoch": 0.06565775751162808, + "grad_norm": 0.118684783577919, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 17250 + }, + { + "epoch": 0.06569581997975077, + "grad_norm": 0.11742344498634338, + "learning_rate": 0.0005, + "loss": 2.1514, + "step": 17260 + }, + { + "epoch": 0.06573388244787345, + "grad_norm": 0.1274501532316208, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 17270 + }, + { + "epoch": 0.06577194491599614, + "grad_norm": 0.11824849992990494, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 17280 + }, + { + "epoch": 0.06581000738411881, + "grad_norm": 0.1180088147521019, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 17290 + }, + { + "epoch": 0.0658480698522415, + "grad_norm": 0.1161024421453476, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 17300 + }, + { + "epoch": 0.06588613232036418, + "grad_norm": 0.13447245955467224, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 17310 + }, + { + "epoch": 0.06592419478848686, + "grad_norm": 0.13925780355930328, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 17320 + }, + { + "epoch": 0.06596225725660955, + "grad_norm": 0.11427681148052216, + "learning_rate": 0.0005, + "loss": 2.1551, + "step": 17330 + }, + { + "epoch": 0.06600031972473223, + "grad_norm": 0.11538650095462799, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 17340 + }, + { + "epoch": 0.06603838219285492, + "grad_norm": 0.11472231894731522, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 17350 + }, + { + "epoch": 0.06607644466097759, + "grad_norm": 0.13605837523937225, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 17360 + }, + { + "epoch": 0.06611450712910028, + "grad_norm": 0.12702429294586182, + "learning_rate": 0.0005, + "loss": 2.1537, + "step": 17370 + }, + { + "epoch": 0.06615256959722296, + "grad_norm": 0.11571726202964783, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 17380 + }, + { + "epoch": 0.06619063206534564, + "grad_norm": 0.11434004455804825, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 17390 + }, + { + "epoch": 0.06622869453346833, + "grad_norm": 0.11851716041564941, + "learning_rate": 0.0005, + "loss": 2.1559, + "step": 17400 + }, + { + "epoch": 0.06626675700159101, + "grad_norm": 0.11230608075857162, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 17410 + }, + { + "epoch": 0.0663048194697137, + "grad_norm": 0.11201955378055573, + "learning_rate": 0.0005, + "loss": 2.1572, + "step": 17420 + }, + { + "epoch": 0.06634288193783637, + "grad_norm": 0.13140922784805298, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 17430 + }, + { + "epoch": 0.06638094440595906, + "grad_norm": 0.1312038004398346, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 17440 + }, + { + "epoch": 0.06641900687408174, + "grad_norm": 0.11670250445604324, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 17450 + }, + { + "epoch": 0.06645706934220443, + "grad_norm": 0.11685652285814285, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 17460 + }, + { + "epoch": 0.06649513181032711, + "grad_norm": 0.12408076226711273, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 17470 + }, + { + "epoch": 0.0665331942784498, + "grad_norm": 0.13604721426963806, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 17480 + }, + { + "epoch": 0.06657125674657248, + "grad_norm": 0.11466772109270096, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 17490 + }, + { + "epoch": 0.06660931921469516, + "grad_norm": 0.12748092412948608, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 17500 + }, + { + "epoch": 0.06664738168281784, + "grad_norm": 0.12154994904994965, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 17510 + }, + { + "epoch": 0.06668544415094052, + "grad_norm": 0.11171988397836685, + "learning_rate": 0.0005, + "loss": 2.1596, + "step": 17520 + }, + { + "epoch": 0.0667235066190632, + "grad_norm": 0.10425330698490143, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 17530 + }, + { + "epoch": 0.06676156908718589, + "grad_norm": 0.13074353337287903, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 17540 + }, + { + "epoch": 0.06679963155530858, + "grad_norm": 0.11248553544282913, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 17550 + }, + { + "epoch": 0.06683769402343126, + "grad_norm": 0.11872132867574692, + "learning_rate": 0.0005, + "loss": 2.1603, + "step": 17560 + }, + { + "epoch": 0.06687575649155394, + "grad_norm": 0.11587295681238174, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 17570 + }, + { + "epoch": 0.06691381895967662, + "grad_norm": 0.11544958502054214, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 17580 + }, + { + "epoch": 0.0669518814277993, + "grad_norm": 0.13210080564022064, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 17590 + }, + { + "epoch": 0.06698994389592199, + "grad_norm": 0.1230560839176178, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 17600 + }, + { + "epoch": 0.06702800636404467, + "grad_norm": 0.13568370044231415, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 17610 + }, + { + "epoch": 0.06706606883216736, + "grad_norm": 0.12618105113506317, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 17620 + }, + { + "epoch": 0.06710413130029004, + "grad_norm": 0.11416677385568619, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 17630 + }, + { + "epoch": 0.06714219376841273, + "grad_norm": 0.11842382699251175, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 17640 + }, + { + "epoch": 0.0671802562365354, + "grad_norm": 0.11482279002666473, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 17650 + }, + { + "epoch": 0.06721831870465808, + "grad_norm": 0.11678756028413773, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 17660 + }, + { + "epoch": 0.06725638117278077, + "grad_norm": 0.12048120051622391, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 17670 + }, + { + "epoch": 0.06729444364090345, + "grad_norm": 0.1227121651172638, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 17680 + }, + { + "epoch": 0.06733250610902614, + "grad_norm": 0.11371784657239914, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 17690 + }, + { + "epoch": 0.06737056857714882, + "grad_norm": 0.10713077336549759, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 17700 + }, + { + "epoch": 0.0674086310452715, + "grad_norm": 0.10683862119913101, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 17710 + }, + { + "epoch": 0.06744669351339418, + "grad_norm": 0.1217755526304245, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 17720 + }, + { + "epoch": 0.06748475598151686, + "grad_norm": 0.13459083437919617, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 17730 + }, + { + "epoch": 0.06752281844963955, + "grad_norm": 0.11992931365966797, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 17740 + }, + { + "epoch": 0.06756088091776223, + "grad_norm": 0.11505747586488724, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 17750 + }, + { + "epoch": 0.06759894338588492, + "grad_norm": 0.11355841904878616, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 17760 + }, + { + "epoch": 0.0676370058540076, + "grad_norm": 0.12791898846626282, + "learning_rate": 0.0005, + "loss": 2.1564, + "step": 17770 + }, + { + "epoch": 0.06767506832213029, + "grad_norm": 0.11727561056613922, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 17780 + }, + { + "epoch": 0.06771313079025296, + "grad_norm": 0.11516810953617096, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 17790 + }, + { + "epoch": 0.06775119325837564, + "grad_norm": 0.11258723586797714, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 17800 + }, + { + "epoch": 0.06778925572649833, + "grad_norm": 0.12778383493423462, + "learning_rate": 0.0005, + "loss": 2.1557, + "step": 17810 + }, + { + "epoch": 0.06782731819462101, + "grad_norm": 0.11237140744924545, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 17820 + }, + { + "epoch": 0.0678653806627437, + "grad_norm": 0.13233919441699982, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 17830 + }, + { + "epoch": 0.06790344313086638, + "grad_norm": 0.1204909235239029, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 17840 + }, + { + "epoch": 0.06794150559898907, + "grad_norm": 0.11478203535079956, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 17850 + }, + { + "epoch": 0.06797956806711174, + "grad_norm": 0.1364944875240326, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 17860 + }, + { + "epoch": 0.06801763053523442, + "grad_norm": 0.11078420281410217, + "learning_rate": 0.0005, + "loss": 2.1566, + "step": 17870 + }, + { + "epoch": 0.0680556930033571, + "grad_norm": 0.12364714592695236, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 17880 + }, + { + "epoch": 0.06809375547147979, + "grad_norm": 0.11979297548532486, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 17890 + }, + { + "epoch": 0.06813181793960248, + "grad_norm": 0.11409571021795273, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 17900 + }, + { + "epoch": 0.06816988040772516, + "grad_norm": 0.12002553045749664, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 17910 + }, + { + "epoch": 0.06820794287584785, + "grad_norm": 0.10696928203105927, + "learning_rate": 0.0005, + "loss": 2.1569, + "step": 17920 + }, + { + "epoch": 0.06824600534397053, + "grad_norm": 0.1150284856557846, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 17930 + }, + { + "epoch": 0.0682840678120932, + "grad_norm": 0.1319814920425415, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 17940 + }, + { + "epoch": 0.06832213028021589, + "grad_norm": 0.11064282804727554, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 17950 + }, + { + "epoch": 0.06836019274833857, + "grad_norm": 0.12755045294761658, + "learning_rate": 0.0005, + "loss": 2.169, + "step": 17960 + }, + { + "epoch": 0.06839825521646126, + "grad_norm": 0.11299976706504822, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 17970 + }, + { + "epoch": 0.06843631768458394, + "grad_norm": 0.12607094645500183, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 17980 + }, + { + "epoch": 0.06847438015270663, + "grad_norm": 0.1116366758942604, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 17990 + }, + { + "epoch": 0.06851244262082931, + "grad_norm": 0.11597177386283875, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 18000 + }, + { + "epoch": 0.06855050508895198, + "grad_norm": 0.12607623636722565, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 18010 + }, + { + "epoch": 0.06858856755707467, + "grad_norm": 0.10983790457248688, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 18020 + }, + { + "epoch": 0.06862663002519735, + "grad_norm": 0.1086328849196434, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 18030 + }, + { + "epoch": 0.06866469249332004, + "grad_norm": 0.1174158975481987, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 18040 + }, + { + "epoch": 0.06870275496144272, + "grad_norm": 0.11614352464675903, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 18050 + }, + { + "epoch": 0.0687408174295654, + "grad_norm": 0.1141098216176033, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 18060 + }, + { + "epoch": 0.06877887989768809, + "grad_norm": 0.1251964271068573, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 18070 + }, + { + "epoch": 0.06881694236581076, + "grad_norm": 0.1291750818490982, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 18080 + }, + { + "epoch": 0.06885500483393345, + "grad_norm": 0.11388098448514938, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 18090 + }, + { + "epoch": 0.06889306730205613, + "grad_norm": 0.10952631384134293, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 18100 + }, + { + "epoch": 0.06893112977017882, + "grad_norm": 0.1275061070919037, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 18110 + }, + { + "epoch": 0.0689691922383015, + "grad_norm": 0.09898626804351807, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 18120 + }, + { + "epoch": 0.06900725470642419, + "grad_norm": 0.11916225403547287, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 18130 + }, + { + "epoch": 0.06904531717454687, + "grad_norm": 0.12321484833955765, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 18140 + }, + { + "epoch": 0.06908337964266954, + "grad_norm": 0.11095452308654785, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 18150 + }, + { + "epoch": 0.06912144211079223, + "grad_norm": 0.1228238046169281, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 18160 + }, + { + "epoch": 0.06915950457891491, + "grad_norm": 0.1137445792555809, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 18170 + }, + { + "epoch": 0.0691975670470376, + "grad_norm": 0.11743122339248657, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 18180 + }, + { + "epoch": 0.06923562951516028, + "grad_norm": 0.129505917429924, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 18190 + }, + { + "epoch": 0.06927369198328297, + "grad_norm": 0.1274467259645462, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 18200 + }, + { + "epoch": 0.06931175445140565, + "grad_norm": 0.12610933184623718, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 18210 + }, + { + "epoch": 0.06934981691952832, + "grad_norm": 0.12468487024307251, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 18220 + }, + { + "epoch": 0.06938787938765101, + "grad_norm": 0.11550614982843399, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 18230 + }, + { + "epoch": 0.06942594185577369, + "grad_norm": 0.12684103846549988, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 18240 + }, + { + "epoch": 0.06946400432389638, + "grad_norm": 0.1284370720386505, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 18250 + }, + { + "epoch": 0.06950206679201906, + "grad_norm": 0.12868432700634003, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 18260 + }, + { + "epoch": 0.06954012926014175, + "grad_norm": 0.13332092761993408, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 18270 + }, + { + "epoch": 0.06957819172826443, + "grad_norm": 0.11784756183624268, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 18280 + }, + { + "epoch": 0.06961625419638712, + "grad_norm": 0.12656499445438385, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 18290 + }, + { + "epoch": 0.06965431666450979, + "grad_norm": 0.1278528869152069, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 18300 + }, + { + "epoch": 0.06969237913263247, + "grad_norm": 0.12106233090162277, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 18310 + }, + { + "epoch": 0.06973044160075516, + "grad_norm": 0.1094827800989151, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 18320 + }, + { + "epoch": 0.06976850406887784, + "grad_norm": 0.12173653393983841, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 18330 + }, + { + "epoch": 0.06980656653700053, + "grad_norm": 0.13432110846042633, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 18340 + }, + { + "epoch": 0.06984462900512321, + "grad_norm": 0.12391926348209381, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 18350 + }, + { + "epoch": 0.0698826914732459, + "grad_norm": 0.11583663523197174, + "learning_rate": 0.0005, + "loss": 2.1492, + "step": 18360 + }, + { + "epoch": 0.06992075394136857, + "grad_norm": 0.13333432376384735, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 18370 + }, + { + "epoch": 0.06995881640949125, + "grad_norm": 0.1078396886587143, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 18380 + }, + { + "epoch": 0.06999687887761394, + "grad_norm": 0.11615791916847229, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 18390 + }, + { + "epoch": 0.07003494134573662, + "grad_norm": 0.11581426858901978, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 18400 + }, + { + "epoch": 0.07007300381385931, + "grad_norm": 0.11066626757383347, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 18410 + }, + { + "epoch": 0.07011106628198199, + "grad_norm": 0.12922365963459015, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 18420 + }, + { + "epoch": 0.07014912875010468, + "grad_norm": 0.11376654356718063, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 18430 + }, + { + "epoch": 0.07018719121822735, + "grad_norm": 0.12547221779823303, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 18440 + }, + { + "epoch": 0.07022525368635003, + "grad_norm": 0.1183745265007019, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 18450 + }, + { + "epoch": 0.07026331615447272, + "grad_norm": 0.11223198473453522, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 18460 + }, + { + "epoch": 0.0703013786225954, + "grad_norm": 0.12431668490171432, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 18470 + }, + { + "epoch": 0.07033944109071809, + "grad_norm": 0.12922313809394836, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 18480 + }, + { + "epoch": 0.07037750355884077, + "grad_norm": 0.11536691337823868, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 18490 + }, + { + "epoch": 0.07041556602696346, + "grad_norm": 0.1122339516878128, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 18500 + }, + { + "epoch": 0.07045362849508613, + "grad_norm": 0.11720927059650421, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 18510 + }, + { + "epoch": 0.07049169096320881, + "grad_norm": 0.10899330675601959, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 18520 + }, + { + "epoch": 0.0705297534313315, + "grad_norm": 0.11351705342531204, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 18530 + }, + { + "epoch": 0.07056781589945418, + "grad_norm": 0.11961089074611664, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 18540 + }, + { + "epoch": 0.07060587836757687, + "grad_norm": 0.11589095741510391, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 18550 + }, + { + "epoch": 0.07064394083569955, + "grad_norm": 0.13567408919334412, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 18560 + }, + { + "epoch": 0.07068200330382224, + "grad_norm": 0.23228104412555695, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 18570 + }, + { + "epoch": 0.07072006577194491, + "grad_norm": 0.1197076290845871, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 18580 + }, + { + "epoch": 0.0707581282400676, + "grad_norm": 0.11958331614732742, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 18590 + }, + { + "epoch": 0.07079619070819028, + "grad_norm": 0.12761662900447845, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 18600 + }, + { + "epoch": 0.07083425317631296, + "grad_norm": 0.12039892375469208, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 18610 + }, + { + "epoch": 0.07087231564443565, + "grad_norm": 0.12191681563854218, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 18620 + }, + { + "epoch": 0.07091037811255833, + "grad_norm": 0.12030960619449615, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 18630 + }, + { + "epoch": 0.07094844058068102, + "grad_norm": 0.11639144271612167, + "learning_rate": 0.0005, + "loss": 2.1562, + "step": 18640 + }, + { + "epoch": 0.0709865030488037, + "grad_norm": 0.12752319872379303, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 18650 + }, + { + "epoch": 0.07102456551692637, + "grad_norm": 0.1367523968219757, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 18660 + }, + { + "epoch": 0.07106262798504906, + "grad_norm": 0.12350116670131683, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 18670 + }, + { + "epoch": 0.07110069045317174, + "grad_norm": 0.1435011476278305, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 18680 + }, + { + "epoch": 0.07113875292129443, + "grad_norm": 0.13752315938472748, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 18690 + }, + { + "epoch": 0.07117681538941711, + "grad_norm": 0.1276029795408249, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 18700 + }, + { + "epoch": 0.0712148778575398, + "grad_norm": 0.11291246861219406, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 18710 + }, + { + "epoch": 0.07125294032566248, + "grad_norm": 0.11970442533493042, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 18720 + }, + { + "epoch": 0.07129100279378515, + "grad_norm": 0.11741726100444794, + "learning_rate": 0.0005, + "loss": 2.1578, + "step": 18730 + }, + { + "epoch": 0.07132906526190784, + "grad_norm": 0.12280002236366272, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 18740 + }, + { + "epoch": 0.07136712773003052, + "grad_norm": 0.1194797158241272, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 18750 + }, + { + "epoch": 0.07140519019815321, + "grad_norm": 0.12653280794620514, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 18760 + }, + { + "epoch": 0.0714432526662759, + "grad_norm": 0.10515428334474564, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 18770 + }, + { + "epoch": 0.07148131513439858, + "grad_norm": 0.12359602004289627, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 18780 + }, + { + "epoch": 0.07151937760252126, + "grad_norm": 0.1203257292509079, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 18790 + }, + { + "epoch": 0.07155744007064394, + "grad_norm": 0.124353788793087, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 18800 + }, + { + "epoch": 0.07159550253876662, + "grad_norm": 0.13274051249027252, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 18810 + }, + { + "epoch": 0.0716335650068893, + "grad_norm": 0.12063852697610855, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 18820 + }, + { + "epoch": 0.07167162747501199, + "grad_norm": 0.13945847749710083, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 18830 + }, + { + "epoch": 0.07170968994313467, + "grad_norm": 0.12185059487819672, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 18840 + }, + { + "epoch": 0.07174775241125736, + "grad_norm": 0.10996271669864655, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 18850 + }, + { + "epoch": 0.07178581487938004, + "grad_norm": 0.11733502149581909, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 18860 + }, + { + "epoch": 0.07182387734750272, + "grad_norm": 0.12420696020126343, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 18870 + }, + { + "epoch": 0.0718619398156254, + "grad_norm": 0.11696764081716537, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 18880 + }, + { + "epoch": 0.07190000228374809, + "grad_norm": 0.11581572145223618, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 18890 + }, + { + "epoch": 0.07193806475187077, + "grad_norm": 0.1266104280948639, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 18900 + }, + { + "epoch": 0.07197612721999345, + "grad_norm": 0.11462007462978363, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 18910 + }, + { + "epoch": 0.07201418968811614, + "grad_norm": 0.1148967519402504, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 18920 + }, + { + "epoch": 0.07205225215623882, + "grad_norm": 0.11684548854827881, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 18930 + }, + { + "epoch": 0.0720903146243615, + "grad_norm": 0.12068881839513779, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 18940 + }, + { + "epoch": 0.07212837709248418, + "grad_norm": 0.13326232135295868, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 18950 + }, + { + "epoch": 0.07216643956060687, + "grad_norm": 0.10820142179727554, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 18960 + }, + { + "epoch": 0.07220450202872955, + "grad_norm": 0.12245963513851166, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 18970 + }, + { + "epoch": 0.07224256449685224, + "grad_norm": 0.1312924176454544, + "learning_rate": 0.0005, + "loss": 2.1556, + "step": 18980 + }, + { + "epoch": 0.07228062696497492, + "grad_norm": 0.12191825360059738, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 18990 + }, + { + "epoch": 0.0723186894330976, + "grad_norm": 0.11584730446338654, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 19000 + }, + { + "epoch": 0.07235675190122028, + "grad_norm": 0.11659805476665497, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 19010 + }, + { + "epoch": 0.07239481436934296, + "grad_norm": 0.11305439472198486, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 19020 + }, + { + "epoch": 0.07243287683746565, + "grad_norm": 0.11925303190946579, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 19030 + }, + { + "epoch": 0.07247093930558833, + "grad_norm": 0.13060851395130157, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 19040 + }, + { + "epoch": 0.07250900177371102, + "grad_norm": 0.11164027452468872, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 19050 + }, + { + "epoch": 0.0725470642418337, + "grad_norm": 0.12202701717615128, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 19060 + }, + { + "epoch": 0.07258512670995639, + "grad_norm": 0.1152314841747284, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 19070 + }, + { + "epoch": 0.07262318917807907, + "grad_norm": 0.11402899026870728, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 19080 + }, + { + "epoch": 0.07266125164620174, + "grad_norm": 0.1244824081659317, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 19090 + }, + { + "epoch": 0.07269931411432443, + "grad_norm": 0.13850195705890656, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 19100 + }, + { + "epoch": 0.07273737658244711, + "grad_norm": 0.11881489306688309, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 19110 + }, + { + "epoch": 0.0727754390505698, + "grad_norm": 0.11745548248291016, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 19120 + }, + { + "epoch": 0.07281350151869248, + "grad_norm": 0.12252794206142426, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 19130 + }, + { + "epoch": 0.07285156398681517, + "grad_norm": 0.12518726289272308, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 19140 + }, + { + "epoch": 0.07288962645493785, + "grad_norm": 0.11924619227647781, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 19150 + }, + { + "epoch": 0.07292768892306052, + "grad_norm": 0.11636649817228317, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 19160 + }, + { + "epoch": 0.0729657513911832, + "grad_norm": 0.11934640258550644, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 19170 + }, + { + "epoch": 0.07300381385930589, + "grad_norm": 0.1170443445444107, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 19180 + }, + { + "epoch": 0.07304187632742858, + "grad_norm": 0.11632074415683746, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 19190 + }, + { + "epoch": 0.07307993879555126, + "grad_norm": 0.12896569073200226, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 19200 + }, + { + "epoch": 0.07311800126367395, + "grad_norm": 0.1291140466928482, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 19210 + }, + { + "epoch": 0.07315606373179663, + "grad_norm": 0.11567611247301102, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 19220 + }, + { + "epoch": 0.0731941261999193, + "grad_norm": 0.11281578242778778, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 19230 + }, + { + "epoch": 0.07323218866804199, + "grad_norm": 0.16732710599899292, + "learning_rate": 0.0005, + "loss": 2.1554, + "step": 19240 + }, + { + "epoch": 0.07327025113616467, + "grad_norm": 0.13350912928581238, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 19250 + }, + { + "epoch": 0.07330831360428736, + "grad_norm": 0.12566125392913818, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 19260 + }, + { + "epoch": 0.07334637607241004, + "grad_norm": 0.12431460618972778, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 19270 + }, + { + "epoch": 0.07338443854053273, + "grad_norm": 0.14875809848308563, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 19280 + }, + { + "epoch": 0.07342250100865541, + "grad_norm": 0.11105544120073318, + "learning_rate": 0.0005, + "loss": 2.1549, + "step": 19290 + }, + { + "epoch": 0.07346056347677808, + "grad_norm": 0.11542293429374695, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 19300 + }, + { + "epoch": 0.07349862594490077, + "grad_norm": 0.10311754047870636, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 19310 + }, + { + "epoch": 0.07353668841302345, + "grad_norm": 0.10644451528787613, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 19320 + }, + { + "epoch": 0.07357475088114614, + "grad_norm": 0.13121920824050903, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 19330 + }, + { + "epoch": 0.07361281334926882, + "grad_norm": 0.12500645220279694, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 19340 + }, + { + "epoch": 0.0736508758173915, + "grad_norm": 0.11441967636346817, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 19350 + }, + { + "epoch": 0.07368893828551419, + "grad_norm": 0.12399695813655853, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 19360 + }, + { + "epoch": 0.07372700075363686, + "grad_norm": 0.10966715216636658, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 19370 + }, + { + "epoch": 0.07376506322175955, + "grad_norm": 0.12043928354978561, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 19380 + }, + { + "epoch": 0.07380312568988223, + "grad_norm": 0.11464910954236984, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 19390 + }, + { + "epoch": 0.07384118815800492, + "grad_norm": 0.11865679919719696, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 19400 + }, + { + "epoch": 0.0738792506261276, + "grad_norm": 0.12201295047998428, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 19410 + }, + { + "epoch": 0.07391731309425029, + "grad_norm": 0.11938714981079102, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 19420 + }, + { + "epoch": 0.07395537556237297, + "grad_norm": 0.1209445670247078, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 19430 + }, + { + "epoch": 0.07399343803049566, + "grad_norm": 0.11020023375749588, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 19440 + }, + { + "epoch": 0.07403150049861833, + "grad_norm": 0.12486252933740616, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 19450 + }, + { + "epoch": 0.07406956296674101, + "grad_norm": 0.12022566050291061, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 19460 + }, + { + "epoch": 0.0741076254348637, + "grad_norm": 0.11662788689136505, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 19470 + }, + { + "epoch": 0.07414568790298638, + "grad_norm": 0.14027582108974457, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 19480 + }, + { + "epoch": 0.07418375037110907, + "grad_norm": 0.11259403824806213, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 19490 + }, + { + "epoch": 0.07422181283923175, + "grad_norm": 0.12038365751504898, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 19500 + }, + { + "epoch": 0.07425987530735444, + "grad_norm": 0.12720441818237305, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 19510 + }, + { + "epoch": 0.07429793777547711, + "grad_norm": 0.11576971411705017, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 19520 + }, + { + "epoch": 0.07433600024359979, + "grad_norm": 0.1155039444565773, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 19530 + }, + { + "epoch": 0.07437406271172248, + "grad_norm": 0.12826068699359894, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 19540 + }, + { + "epoch": 0.07441212517984516, + "grad_norm": 0.11534158885478973, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 19550 + }, + { + "epoch": 0.07445018764796785, + "grad_norm": 0.11677207797765732, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 19560 + }, + { + "epoch": 0.07448825011609053, + "grad_norm": 0.12580077350139618, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 19570 + }, + { + "epoch": 0.07452631258421322, + "grad_norm": 0.12557542324066162, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 19580 + }, + { + "epoch": 0.07456437505233589, + "grad_norm": 0.1145382896065712, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 19590 + }, + { + "epoch": 0.07460243752045857, + "grad_norm": 0.1154215931892395, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 19600 + }, + { + "epoch": 0.07464049998858126, + "grad_norm": 0.11266138404607773, + "learning_rate": 0.0005, + "loss": 2.1514, + "step": 19610 + }, + { + "epoch": 0.07467856245670394, + "grad_norm": 0.11759719252586365, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 19620 + }, + { + "epoch": 0.07471662492482663, + "grad_norm": 0.12315239012241364, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 19630 + }, + { + "epoch": 0.07475468739294931, + "grad_norm": 0.1262696534395218, + "learning_rate": 0.0005, + "loss": 2.1638, + "step": 19640 + }, + { + "epoch": 0.074792749861072, + "grad_norm": 0.12901771068572998, + "learning_rate": 0.0005, + "loss": 2.1587, + "step": 19650 + }, + { + "epoch": 0.07483081232919467, + "grad_norm": 0.12097518891096115, + "learning_rate": 0.0005, + "loss": 2.1526, + "step": 19660 + }, + { + "epoch": 0.07486887479731735, + "grad_norm": 0.11667900532484055, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 19670 + }, + { + "epoch": 0.07490693726544004, + "grad_norm": 0.12947173416614532, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 19680 + }, + { + "epoch": 0.07494499973356272, + "grad_norm": 0.12104139477014542, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 19690 + }, + { + "epoch": 0.07498306220168541, + "grad_norm": 0.12678302824497223, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 19700 + }, + { + "epoch": 0.07502112466980809, + "grad_norm": 0.12390743941068649, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 19710 + }, + { + "epoch": 0.07505918713793078, + "grad_norm": 0.13258126378059387, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 19720 + }, + { + "epoch": 0.07509724960605345, + "grad_norm": 0.1314745992422104, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 19730 + }, + { + "epoch": 0.07513531207417613, + "grad_norm": 0.12059265375137329, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 19740 + }, + { + "epoch": 0.07517337454229882, + "grad_norm": 0.12166786938905716, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 19750 + }, + { + "epoch": 0.0752114370104215, + "grad_norm": 0.12821173667907715, + "learning_rate": 0.0005, + "loss": 2.1538, + "step": 19760 + }, + { + "epoch": 0.07524949947854419, + "grad_norm": 0.12376900017261505, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 19770 + }, + { + "epoch": 0.07528756194666687, + "grad_norm": 0.1299164593219757, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 19780 + }, + { + "epoch": 0.07532562441478956, + "grad_norm": 0.1086520105600357, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 19790 + }, + { + "epoch": 0.07536368688291224, + "grad_norm": 0.11544159054756165, + "learning_rate": 0.0005, + "loss": 2.1503, + "step": 19800 + }, + { + "epoch": 0.07540174935103491, + "grad_norm": 0.12263761460781097, + "learning_rate": 0.0005, + "loss": 2.1581, + "step": 19810 + }, + { + "epoch": 0.0754398118191576, + "grad_norm": 0.17075596749782562, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 19820 + }, + { + "epoch": 0.07547787428728028, + "grad_norm": 0.1148160845041275, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 19830 + }, + { + "epoch": 0.07551593675540297, + "grad_norm": 0.11638422310352325, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 19840 + }, + { + "epoch": 0.07555399922352565, + "grad_norm": 0.10947652906179428, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 19850 + }, + { + "epoch": 0.07559206169164834, + "grad_norm": 0.11749492585659027, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 19860 + }, + { + "epoch": 0.07563012415977102, + "grad_norm": 0.12871646881103516, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 19870 + }, + { + "epoch": 0.0756681866278937, + "grad_norm": 0.119835264980793, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 19880 + }, + { + "epoch": 0.07570624909601638, + "grad_norm": 0.12297997623682022, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 19890 + }, + { + "epoch": 0.07574431156413906, + "grad_norm": 0.11132816970348358, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 19900 + }, + { + "epoch": 0.07578237403226175, + "grad_norm": 0.12448112666606903, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 19910 + }, + { + "epoch": 0.07582043650038443, + "grad_norm": 0.1350453794002533, + "learning_rate": 0.0005, + "loss": 2.156, + "step": 19920 + }, + { + "epoch": 0.07585849896850712, + "grad_norm": 0.11363179236650467, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 19930 + }, + { + "epoch": 0.0758965614366298, + "grad_norm": 0.23336747288703918, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 19940 + }, + { + "epoch": 0.07593462390475247, + "grad_norm": 0.13528256118297577, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 19950 + }, + { + "epoch": 0.07597268637287516, + "grad_norm": 0.11738921701908112, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 19960 + }, + { + "epoch": 0.07601074884099784, + "grad_norm": 0.11897687613964081, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 19970 + }, + { + "epoch": 0.07604881130912053, + "grad_norm": 0.12669093906879425, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 19980 + }, + { + "epoch": 0.07608687377724321, + "grad_norm": 0.11751807481050491, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 19990 + }, + { + "epoch": 0.0761249362453659, + "grad_norm": 0.13250021636486053, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 20000 + }, + { + "epoch": 0.07616299871348858, + "grad_norm": 0.12080192565917969, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 20010 + }, + { + "epoch": 0.07620106118161125, + "grad_norm": 0.13605637848377228, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 20020 + }, + { + "epoch": 0.07623912364973394, + "grad_norm": 0.13749563694000244, + "learning_rate": 0.0005, + "loss": 2.1557, + "step": 20030 + }, + { + "epoch": 0.07627718611785662, + "grad_norm": 0.12290322780609131, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 20040 + }, + { + "epoch": 0.07631524858597931, + "grad_norm": 0.12153260409832001, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 20050 + }, + { + "epoch": 0.076353311054102, + "grad_norm": 0.11187773942947388, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 20060 + }, + { + "epoch": 0.07639137352222468, + "grad_norm": 0.12065640836954117, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 20070 + }, + { + "epoch": 0.07642943599034736, + "grad_norm": 0.13967113196849823, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 20080 + }, + { + "epoch": 0.07646749845847003, + "grad_norm": 0.12397325038909912, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 20090 + }, + { + "epoch": 0.07650556092659272, + "grad_norm": 0.1470394879579544, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 20100 + }, + { + "epoch": 0.0765436233947154, + "grad_norm": 0.11773227900266647, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 20110 + }, + { + "epoch": 0.07658168586283809, + "grad_norm": 0.1158328652381897, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 20120 + }, + { + "epoch": 0.07661974833096077, + "grad_norm": 0.1100144013762474, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 20130 + }, + { + "epoch": 0.07665781079908346, + "grad_norm": 0.113619863986969, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 20140 + }, + { + "epoch": 0.07669587326720614, + "grad_norm": 0.11518946290016174, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 20150 + }, + { + "epoch": 0.07673393573532881, + "grad_norm": 0.1303129643201828, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 20160 + }, + { + "epoch": 0.0767719982034515, + "grad_norm": 0.10941874235868454, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 20170 + }, + { + "epoch": 0.07681006067157418, + "grad_norm": 0.12623172998428345, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 20180 + }, + { + "epoch": 0.07684812313969687, + "grad_norm": 0.1392190307378769, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 20190 + }, + { + "epoch": 0.07688618560781955, + "grad_norm": 0.12022106349468231, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 20200 + }, + { + "epoch": 0.07692424807594224, + "grad_norm": 0.12242559343576431, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 20210 + }, + { + "epoch": 0.07696231054406492, + "grad_norm": 0.1221131682395935, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 20220 + }, + { + "epoch": 0.07700037301218761, + "grad_norm": 0.14182278513908386, + "learning_rate": 0.0005, + "loss": 2.1629, + "step": 20230 + }, + { + "epoch": 0.07703843548031028, + "grad_norm": 0.11639798432588577, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 20240 + }, + { + "epoch": 0.07707649794843296, + "grad_norm": 0.1260325163602829, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 20250 + }, + { + "epoch": 0.07711456041655565, + "grad_norm": 0.1177816316485405, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 20260 + }, + { + "epoch": 0.07715262288467833, + "grad_norm": 0.12593698501586914, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 20270 + }, + { + "epoch": 0.07719068535280102, + "grad_norm": 0.12568175792694092, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 20280 + }, + { + "epoch": 0.0772287478209237, + "grad_norm": 0.12054798752069473, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 20290 + }, + { + "epoch": 0.07726681028904639, + "grad_norm": 0.1292949765920639, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 20300 + }, + { + "epoch": 0.07730487275716906, + "grad_norm": 0.11451704055070877, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 20310 + }, + { + "epoch": 0.07734293522529175, + "grad_norm": 0.12161093950271606, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 20320 + }, + { + "epoch": 0.07738099769341443, + "grad_norm": 0.12055949866771698, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 20330 + }, + { + "epoch": 0.07741906016153711, + "grad_norm": 0.133913055062294, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 20340 + }, + { + "epoch": 0.0774571226296598, + "grad_norm": 0.12069426476955414, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 20350 + }, + { + "epoch": 0.07749518509778248, + "grad_norm": 0.11371367424726486, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 20360 + }, + { + "epoch": 0.07753324756590517, + "grad_norm": 0.13321714103221893, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 20370 + }, + { + "epoch": 0.07757131003402784, + "grad_norm": 0.11763419955968857, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 20380 + }, + { + "epoch": 0.07760937250215053, + "grad_norm": 0.1404077112674713, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 20390 + }, + { + "epoch": 0.07764743497027321, + "grad_norm": 0.11764439940452576, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 20400 + }, + { + "epoch": 0.0776854974383959, + "grad_norm": 0.14752964675426483, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 20410 + }, + { + "epoch": 0.07772355990651858, + "grad_norm": 0.1245315670967102, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 20420 + }, + { + "epoch": 0.07776162237464126, + "grad_norm": 0.1200486496090889, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 20430 + }, + { + "epoch": 0.07779968484276395, + "grad_norm": 0.1259072721004486, + "learning_rate": 0.0005, + "loss": 2.1631, + "step": 20440 + }, + { + "epoch": 0.07783774731088662, + "grad_norm": 0.11854323744773865, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 20450 + }, + { + "epoch": 0.0778758097790093, + "grad_norm": 0.14033794403076172, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 20460 + }, + { + "epoch": 0.07791387224713199, + "grad_norm": 0.11894816905260086, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 20470 + }, + { + "epoch": 0.07795193471525468, + "grad_norm": 0.11670060455799103, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 20480 + }, + { + "epoch": 0.07798999718337736, + "grad_norm": 0.1166292354464531, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 20490 + }, + { + "epoch": 0.07802805965150005, + "grad_norm": 0.12800370156764984, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 20500 + }, + { + "epoch": 0.07806612211962273, + "grad_norm": 0.11468150466680527, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 20510 + }, + { + "epoch": 0.0781041845877454, + "grad_norm": 0.12797722220420837, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 20520 + }, + { + "epoch": 0.07814224705586809, + "grad_norm": 0.1424039900302887, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 20530 + }, + { + "epoch": 0.07818030952399077, + "grad_norm": 0.11964821815490723, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 20540 + }, + { + "epoch": 0.07821837199211346, + "grad_norm": 0.13776551187038422, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 20550 + }, + { + "epoch": 0.07825643446023614, + "grad_norm": 0.13417179882526398, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 20560 + }, + { + "epoch": 0.07829449692835883, + "grad_norm": 0.13099367916584015, + "learning_rate": 0.0005, + "loss": 2.1519, + "step": 20570 + }, + { + "epoch": 0.07833255939648151, + "grad_norm": 0.1310262531042099, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 20580 + }, + { + "epoch": 0.0783706218646042, + "grad_norm": 0.10515132546424866, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 20590 + }, + { + "epoch": 0.07840868433272687, + "grad_norm": 0.13627183437347412, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 20600 + }, + { + "epoch": 0.07844674680084955, + "grad_norm": 0.12111663818359375, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 20610 + }, + { + "epoch": 0.07848480926897224, + "grad_norm": 0.11795415729284286, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 20620 + }, + { + "epoch": 0.07852287173709492, + "grad_norm": 0.11769380420446396, + "learning_rate": 0.0005, + "loss": 2.1698, + "step": 20630 + }, + { + "epoch": 0.0785609342052176, + "grad_norm": 0.12819325923919678, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 20640 + }, + { + "epoch": 0.07859899667334029, + "grad_norm": 0.135583758354187, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 20650 + }, + { + "epoch": 0.07863705914146298, + "grad_norm": 0.13728412985801697, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 20660 + }, + { + "epoch": 0.07867512160958565, + "grad_norm": 0.11297822743654251, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 20670 + }, + { + "epoch": 0.07871318407770833, + "grad_norm": 0.13838008046150208, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 20680 + }, + { + "epoch": 0.07875124654583102, + "grad_norm": 0.11964000761508942, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 20690 + }, + { + "epoch": 0.0787893090139537, + "grad_norm": 0.11306457221508026, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 20700 + }, + { + "epoch": 0.07882737148207639, + "grad_norm": 0.1265837848186493, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 20710 + }, + { + "epoch": 0.07886543395019907, + "grad_norm": 0.13833211362361908, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 20720 + }, + { + "epoch": 0.07890349641832176, + "grad_norm": 0.12850770354270935, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 20730 + }, + { + "epoch": 0.07894155888644443, + "grad_norm": 0.125825434923172, + "learning_rate": 0.0005, + "loss": 2.1485, + "step": 20740 + }, + { + "epoch": 0.07897962135456711, + "grad_norm": 0.11365757882595062, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 20750 + }, + { + "epoch": 0.0790176838226898, + "grad_norm": 0.11496775597333908, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 20760 + }, + { + "epoch": 0.07905574629081248, + "grad_norm": 0.11879292875528336, + "learning_rate": 0.0005, + "loss": 2.1573, + "step": 20770 + }, + { + "epoch": 0.07909380875893517, + "grad_norm": 0.4306030571460724, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 20780 + }, + { + "epoch": 0.07913187122705785, + "grad_norm": 0.12225326895713806, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 20790 + }, + { + "epoch": 0.07916993369518054, + "grad_norm": 0.10665999352931976, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 20800 + }, + { + "epoch": 0.07920799616330321, + "grad_norm": 0.11364707350730896, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 20810 + }, + { + "epoch": 0.07924605863142589, + "grad_norm": 0.11900816112756729, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 20820 + }, + { + "epoch": 0.07928412109954858, + "grad_norm": 0.1321338266134262, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 20830 + }, + { + "epoch": 0.07932218356767126, + "grad_norm": 0.11729200184345245, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 20840 + }, + { + "epoch": 0.07936024603579395, + "grad_norm": 0.1253681480884552, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 20850 + }, + { + "epoch": 0.07939830850391663, + "grad_norm": 0.10999830067157745, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 20860 + }, + { + "epoch": 0.07943637097203932, + "grad_norm": 0.1298547387123108, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 20870 + }, + { + "epoch": 0.07947443344016199, + "grad_norm": 0.11368982493877411, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 20880 + }, + { + "epoch": 0.07951249590828467, + "grad_norm": 0.10747841745615005, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 20890 + }, + { + "epoch": 0.07955055837640736, + "grad_norm": 0.12383821606636047, + "learning_rate": 0.0005, + "loss": 2.1518, + "step": 20900 + }, + { + "epoch": 0.07958862084453004, + "grad_norm": 0.1289435178041458, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 20910 + }, + { + "epoch": 0.07962668331265273, + "grad_norm": 0.1302112489938736, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 20920 + }, + { + "epoch": 0.07966474578077541, + "grad_norm": 0.11910293251276016, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 20930 + }, + { + "epoch": 0.0797028082488981, + "grad_norm": 0.12185361236333847, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 20940 + }, + { + "epoch": 0.07974087071702077, + "grad_norm": 0.12269507348537445, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 20950 + }, + { + "epoch": 0.07977893318514345, + "grad_norm": 0.12856151163578033, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 20960 + }, + { + "epoch": 0.07981699565326614, + "grad_norm": 0.11458587646484375, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 20970 + }, + { + "epoch": 0.07985505812138882, + "grad_norm": 0.11583131551742554, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 20980 + }, + { + "epoch": 0.07989312058951151, + "grad_norm": 0.121613048017025, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 20990 + }, + { + "epoch": 0.07993118305763419, + "grad_norm": 0.11847091466188431, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 21000 + }, + { + "epoch": 0.07996924552575688, + "grad_norm": 0.13026578724384308, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 21010 + }, + { + "epoch": 0.08000730799387956, + "grad_norm": 0.10834717005491257, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 21020 + }, + { + "epoch": 0.08004537046200223, + "grad_norm": 0.12242411822080612, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 21030 + }, + { + "epoch": 0.08008343293012492, + "grad_norm": 0.11778593808412552, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 21040 + }, + { + "epoch": 0.0801214953982476, + "grad_norm": 0.11754197627305984, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 21050 + }, + { + "epoch": 0.08015955786637029, + "grad_norm": 0.11059972643852234, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 21060 + }, + { + "epoch": 0.08019762033449297, + "grad_norm": 0.12480046600103378, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 21070 + }, + { + "epoch": 0.08023568280261566, + "grad_norm": 0.10805027186870575, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 21080 + }, + { + "epoch": 0.08027374527073834, + "grad_norm": 0.12280254811048508, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 21090 + }, + { + "epoch": 0.08031180773886101, + "grad_norm": 0.12043575197458267, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 21100 + }, + { + "epoch": 0.0803498702069837, + "grad_norm": 0.11959745734930038, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 21110 + }, + { + "epoch": 0.08038793267510638, + "grad_norm": 0.11902160197496414, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 21120 + }, + { + "epoch": 0.08042599514322907, + "grad_norm": 0.12124022096395493, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 21130 + }, + { + "epoch": 0.08046405761135175, + "grad_norm": 0.12325721979141235, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 21140 + }, + { + "epoch": 0.08050212007947444, + "grad_norm": 0.12744925916194916, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 21150 + }, + { + "epoch": 0.08054018254759712, + "grad_norm": 0.12019761651754379, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 21160 + }, + { + "epoch": 0.0805782450157198, + "grad_norm": 0.1297874003648758, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 21170 + }, + { + "epoch": 0.08061630748384248, + "grad_norm": 0.11909880489110947, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 21180 + }, + { + "epoch": 0.08065436995196516, + "grad_norm": 0.11740684509277344, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 21190 + }, + { + "epoch": 0.08069243242008785, + "grad_norm": 0.11909528821706772, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 21200 + }, + { + "epoch": 0.08073049488821053, + "grad_norm": 0.12825709581375122, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 21210 + }, + { + "epoch": 0.08076855735633322, + "grad_norm": 0.12015367299318314, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 21220 + }, + { + "epoch": 0.0808066198244559, + "grad_norm": 0.12286186218261719, + "learning_rate": 0.0005, + "loss": 2.1518, + "step": 21230 + }, + { + "epoch": 0.08084468229257857, + "grad_norm": 0.13856559991836548, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 21240 + }, + { + "epoch": 0.08088274476070126, + "grad_norm": 0.12698425352573395, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 21250 + }, + { + "epoch": 0.08092080722882394, + "grad_norm": 0.12557373940944672, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 21260 + }, + { + "epoch": 0.08095886969694663, + "grad_norm": 0.10690121352672577, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 21270 + }, + { + "epoch": 0.08099693216506931, + "grad_norm": 0.10676196217536926, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 21280 + }, + { + "epoch": 0.081034994633192, + "grad_norm": 0.13777486979961395, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 21290 + }, + { + "epoch": 0.08107305710131468, + "grad_norm": 0.1320984661579132, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 21300 + }, + { + "epoch": 0.08111111956943735, + "grad_norm": 0.11867671459913254, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 21310 + }, + { + "epoch": 0.08114918203756004, + "grad_norm": 0.12420900166034698, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 21320 + }, + { + "epoch": 0.08118724450568272, + "grad_norm": 0.12168706208467484, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 21330 + }, + { + "epoch": 0.08122530697380541, + "grad_norm": 0.11081932485103607, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 21340 + }, + { + "epoch": 0.0812633694419281, + "grad_norm": 0.1164914146065712, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 21350 + }, + { + "epoch": 0.08130143191005078, + "grad_norm": 0.11893417686223984, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 21360 + }, + { + "epoch": 0.08133949437817346, + "grad_norm": 0.14425797760486603, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 21370 + }, + { + "epoch": 0.08137755684629615, + "grad_norm": 0.12183461338281631, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 21380 + }, + { + "epoch": 0.08141561931441882, + "grad_norm": 0.12465500086545944, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 21390 + }, + { + "epoch": 0.0814536817825415, + "grad_norm": 0.14262641966342926, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 21400 + }, + { + "epoch": 0.08149174425066419, + "grad_norm": 0.12002576887607574, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 21410 + }, + { + "epoch": 0.08152980671878687, + "grad_norm": 0.12708404660224915, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 21420 + }, + { + "epoch": 0.08156786918690956, + "grad_norm": 0.131977841258049, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 21430 + }, + { + "epoch": 0.08160593165503224, + "grad_norm": 0.1188463419675827, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 21440 + }, + { + "epoch": 0.08164399412315493, + "grad_norm": 0.12172247469425201, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 21450 + }, + { + "epoch": 0.0816820565912776, + "grad_norm": 0.1287251114845276, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 21460 + }, + { + "epoch": 0.08172011905940028, + "grad_norm": 0.12985315918922424, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 21470 + }, + { + "epoch": 0.08175818152752297, + "grad_norm": 0.136166512966156, + "learning_rate": 0.0005, + "loss": 2.1602, + "step": 21480 + }, + { + "epoch": 0.08179624399564565, + "grad_norm": 0.12115434557199478, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 21490 + }, + { + "epoch": 0.08183430646376834, + "grad_norm": 0.11187921464443207, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 21500 + }, + { + "epoch": 0.08187236893189102, + "grad_norm": 0.13115552067756653, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 21510 + }, + { + "epoch": 0.08191043140001371, + "grad_norm": 0.12254226207733154, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 21520 + }, + { + "epoch": 0.08194849386813638, + "grad_norm": 0.11350936442613602, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 21530 + }, + { + "epoch": 0.08198655633625906, + "grad_norm": 0.13715927302837372, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 21540 + }, + { + "epoch": 0.08202461880438175, + "grad_norm": 0.11191634833812714, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 21550 + }, + { + "epoch": 0.08206268127250443, + "grad_norm": 0.12232932448387146, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 21560 + }, + { + "epoch": 0.08210074374062712, + "grad_norm": 0.10877176374197006, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 21570 + }, + { + "epoch": 0.0821388062087498, + "grad_norm": 0.11842214316129684, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 21580 + }, + { + "epoch": 0.08217686867687249, + "grad_norm": 0.1253902018070221, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 21590 + }, + { + "epoch": 0.08221493114499516, + "grad_norm": 0.12462165206670761, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 21600 + }, + { + "epoch": 0.08225299361311784, + "grad_norm": 0.11757472157478333, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 21610 + }, + { + "epoch": 0.08229105608124053, + "grad_norm": 0.12682631611824036, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 21620 + }, + { + "epoch": 0.08232911854936321, + "grad_norm": 0.12222106754779816, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 21630 + }, + { + "epoch": 0.0823671810174859, + "grad_norm": 0.12464617937803268, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 21640 + }, + { + "epoch": 0.08240524348560858, + "grad_norm": 0.11862632632255554, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 21650 + }, + { + "epoch": 0.08244330595373127, + "grad_norm": 0.1316101849079132, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 21660 + }, + { + "epoch": 0.08248136842185394, + "grad_norm": 0.12959465384483337, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 21670 + }, + { + "epoch": 0.08251943088997662, + "grad_norm": 0.1375904530286789, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 21680 + }, + { + "epoch": 0.08255749335809931, + "grad_norm": 0.11778242141008377, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 21690 + }, + { + "epoch": 0.082595555826222, + "grad_norm": 0.13062040507793427, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 21700 + }, + { + "epoch": 0.08263361829434468, + "grad_norm": 0.11083482950925827, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 21710 + }, + { + "epoch": 0.08267168076246736, + "grad_norm": 0.11338507384061813, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 21720 + }, + { + "epoch": 0.08270974323059005, + "grad_norm": 0.12225829064846039, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 21730 + }, + { + "epoch": 0.08274780569871273, + "grad_norm": 0.13471394777297974, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 21740 + }, + { + "epoch": 0.0827858681668354, + "grad_norm": 0.12748363614082336, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 21750 + }, + { + "epoch": 0.08282393063495809, + "grad_norm": 0.15540693700313568, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 21760 + }, + { + "epoch": 0.08286199310308077, + "grad_norm": 0.11434046924114227, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 21770 + }, + { + "epoch": 0.08290005557120346, + "grad_norm": 0.12636405229568481, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 21780 + }, + { + "epoch": 0.08293811803932614, + "grad_norm": 0.1177835464477539, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 21790 + }, + { + "epoch": 0.08297618050744883, + "grad_norm": 0.11512091010808945, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 21800 + }, + { + "epoch": 0.08301424297557151, + "grad_norm": 0.10817056149244308, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 21810 + }, + { + "epoch": 0.08305230544369419, + "grad_norm": 0.1211874708533287, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 21820 + }, + { + "epoch": 0.08309036791181687, + "grad_norm": 0.12446257472038269, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 21830 + }, + { + "epoch": 0.08312843037993956, + "grad_norm": 0.11935817450284958, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 21840 + }, + { + "epoch": 0.08316649284806224, + "grad_norm": 0.12793755531311035, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 21850 + }, + { + "epoch": 0.08320455531618492, + "grad_norm": 0.12205001711845398, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 21860 + }, + { + "epoch": 0.08324261778430761, + "grad_norm": 0.11728016287088394, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 21870 + }, + { + "epoch": 0.0832806802524303, + "grad_norm": 0.12212223559617996, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 21880 + }, + { + "epoch": 0.08331874272055297, + "grad_norm": 0.12426520138978958, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 21890 + }, + { + "epoch": 0.08335680518867565, + "grad_norm": 0.1372053176164627, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 21900 + }, + { + "epoch": 0.08339486765679834, + "grad_norm": 0.1247352659702301, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 21910 + }, + { + "epoch": 0.08343293012492102, + "grad_norm": 0.11587857455015182, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 21920 + }, + { + "epoch": 0.0834709925930437, + "grad_norm": 0.11178795248270035, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 21930 + }, + { + "epoch": 0.08350905506116639, + "grad_norm": 0.12461380660533905, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 21940 + }, + { + "epoch": 0.08354711752928907, + "grad_norm": 0.10774627327919006, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 21950 + }, + { + "epoch": 0.08358517999741175, + "grad_norm": 0.12161616235971451, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 21960 + }, + { + "epoch": 0.08362324246553443, + "grad_norm": 0.12391353398561478, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 21970 + }, + { + "epoch": 0.08366130493365712, + "grad_norm": 0.1828557401895523, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 21980 + }, + { + "epoch": 0.0836993674017798, + "grad_norm": 0.11445656418800354, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 21990 + }, + { + "epoch": 0.08373742986990249, + "grad_norm": 0.12685362994670868, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 22000 + }, + { + "epoch": 0.08377549233802517, + "grad_norm": 0.14515061676502228, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 22010 + }, + { + "epoch": 0.08381355480614786, + "grad_norm": 0.11717475205659866, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 22020 + }, + { + "epoch": 0.08385161727427053, + "grad_norm": 0.1403116136789322, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 22030 + }, + { + "epoch": 0.08388967974239321, + "grad_norm": 0.11670785397291183, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 22040 + }, + { + "epoch": 0.0839277422105159, + "grad_norm": 0.13050960004329681, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 22050 + }, + { + "epoch": 0.08396580467863858, + "grad_norm": 0.11913710087537766, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 22060 + }, + { + "epoch": 0.08400386714676127, + "grad_norm": 0.12010898441076279, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 22070 + }, + { + "epoch": 0.08404192961488395, + "grad_norm": 0.122723788022995, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 22080 + }, + { + "epoch": 0.08407999208300664, + "grad_norm": 0.12780143320560455, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 22090 + }, + { + "epoch": 0.0841180545511293, + "grad_norm": 0.11837394535541534, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 22100 + }, + { + "epoch": 0.08415611701925199, + "grad_norm": 0.1171930804848671, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 22110 + }, + { + "epoch": 0.08419417948737468, + "grad_norm": 0.12089846283197403, + "learning_rate": 0.0005, + "loss": 2.1548, + "step": 22120 + }, + { + "epoch": 0.08423224195549736, + "grad_norm": 0.11761198937892914, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 22130 + }, + { + "epoch": 0.08427030442362005, + "grad_norm": 0.11975211650133133, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 22140 + }, + { + "epoch": 0.08430836689174273, + "grad_norm": 0.12231068313121796, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 22150 + }, + { + "epoch": 0.08434642935986542, + "grad_norm": 0.12235705554485321, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 22160 + }, + { + "epoch": 0.0843844918279881, + "grad_norm": 0.12323001772165298, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 22170 + }, + { + "epoch": 0.08442255429611077, + "grad_norm": 0.14007116854190826, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 22180 + }, + { + "epoch": 0.08446061676423346, + "grad_norm": 0.11277811229228973, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 22190 + }, + { + "epoch": 0.08449867923235614, + "grad_norm": 0.12346283346414566, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 22200 + }, + { + "epoch": 0.08453674170047883, + "grad_norm": 0.1161845400929451, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 22210 + }, + { + "epoch": 0.08457480416860151, + "grad_norm": 0.11737547814846039, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 22220 + }, + { + "epoch": 0.0846128666367242, + "grad_norm": 0.1171225979924202, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 22230 + }, + { + "epoch": 0.08465092910484688, + "grad_norm": 0.11654467135667801, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 22240 + }, + { + "epoch": 0.08468899157296955, + "grad_norm": 0.11299663782119751, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 22250 + }, + { + "epoch": 0.08472705404109224, + "grad_norm": 0.11551041901111603, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 22260 + }, + { + "epoch": 0.08476511650921492, + "grad_norm": 0.11503621935844421, + "learning_rate": 0.0005, + "loss": 2.1582, + "step": 22270 + }, + { + "epoch": 0.0848031789773376, + "grad_norm": 0.12097814679145813, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 22280 + }, + { + "epoch": 0.08484124144546029, + "grad_norm": 0.12010312080383301, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 22290 + }, + { + "epoch": 0.08487930391358298, + "grad_norm": 0.12147096544504166, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 22300 + }, + { + "epoch": 0.08491736638170566, + "grad_norm": 0.12864457070827484, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 22310 + }, + { + "epoch": 0.08495542884982833, + "grad_norm": 0.12055821716785431, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 22320 + }, + { + "epoch": 0.08499349131795102, + "grad_norm": 0.11782816797494888, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 22330 + }, + { + "epoch": 0.0850315537860737, + "grad_norm": 0.1284317523241043, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 22340 + }, + { + "epoch": 0.08506961625419639, + "grad_norm": 0.1229197159409523, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 22350 + }, + { + "epoch": 0.08510767872231907, + "grad_norm": 0.11341395974159241, + "learning_rate": 0.0005, + "loss": 2.1591, + "step": 22360 + }, + { + "epoch": 0.08514574119044176, + "grad_norm": 0.13056255877017975, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 22370 + }, + { + "epoch": 0.08518380365856444, + "grad_norm": 0.13087743520736694, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 22380 + }, + { + "epoch": 0.08522186612668711, + "grad_norm": 0.12757542729377747, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 22390 + }, + { + "epoch": 0.0852599285948098, + "grad_norm": 0.14698006212711334, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 22400 + }, + { + "epoch": 0.08529799106293248, + "grad_norm": 0.11643604189157486, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 22410 + }, + { + "epoch": 0.08533605353105517, + "grad_norm": 0.12725846469402313, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 22420 + }, + { + "epoch": 0.08537411599917785, + "grad_norm": 0.14252601563930511, + "learning_rate": 0.0005, + "loss": 2.1531, + "step": 22430 + }, + { + "epoch": 0.08541217846730054, + "grad_norm": 0.11785967648029327, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 22440 + }, + { + "epoch": 0.08545024093542322, + "grad_norm": 0.11564429104328156, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 22450 + }, + { + "epoch": 0.08548830340354589, + "grad_norm": 0.13538044691085815, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 22460 + }, + { + "epoch": 0.08552636587166858, + "grad_norm": 0.11665502935647964, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 22470 + }, + { + "epoch": 0.08556442833979126, + "grad_norm": 0.12820222973823547, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 22480 + }, + { + "epoch": 0.08560249080791395, + "grad_norm": 0.1172187402844429, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 22490 + }, + { + "epoch": 0.08564055327603663, + "grad_norm": 0.15104034543037415, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 22500 + }, + { + "epoch": 0.08567861574415932, + "grad_norm": 0.13248179852962494, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 22510 + }, + { + "epoch": 0.085716678212282, + "grad_norm": 0.14016737043857574, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 22520 + }, + { + "epoch": 0.08575474068040469, + "grad_norm": 0.1385992020368576, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 22530 + }, + { + "epoch": 0.08579280314852736, + "grad_norm": 0.12560173869132996, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 22540 + }, + { + "epoch": 0.08583086561665004, + "grad_norm": 0.12612128257751465, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 22550 + }, + { + "epoch": 0.08586892808477273, + "grad_norm": 0.13057899475097656, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 22560 + }, + { + "epoch": 0.08590699055289541, + "grad_norm": 0.1250109225511551, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 22570 + }, + { + "epoch": 0.0859450530210181, + "grad_norm": 0.12412979453802109, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 22580 + }, + { + "epoch": 0.08598311548914078, + "grad_norm": 0.13171933591365814, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 22590 + }, + { + "epoch": 0.08602117795726347, + "grad_norm": 0.11354347318410873, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 22600 + }, + { + "epoch": 0.08605924042538614, + "grad_norm": 0.1296190619468689, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 22610 + }, + { + "epoch": 0.08609730289350882, + "grad_norm": 0.12379322201013565, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 22620 + }, + { + "epoch": 0.08613536536163151, + "grad_norm": 0.11893151700496674, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 22630 + }, + { + "epoch": 0.08617342782975419, + "grad_norm": 0.11662375926971436, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 22640 + }, + { + "epoch": 0.08621149029787688, + "grad_norm": 0.12412435561418533, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 22650 + }, + { + "epoch": 0.08624955276599956, + "grad_norm": 0.12220267951488495, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 22660 + }, + { + "epoch": 0.08628761523412225, + "grad_norm": 0.11996627599000931, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 22670 + }, + { + "epoch": 0.08632567770224492, + "grad_norm": 0.13411886990070343, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 22680 + }, + { + "epoch": 0.0863637401703676, + "grad_norm": 0.20231394469738007, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 22690 + }, + { + "epoch": 0.08640180263849029, + "grad_norm": 0.11263010650873184, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 22700 + }, + { + "epoch": 0.08643986510661297, + "grad_norm": 0.11748948693275452, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 22710 + }, + { + "epoch": 0.08647792757473566, + "grad_norm": 0.11988667398691177, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 22720 + }, + { + "epoch": 0.08651599004285834, + "grad_norm": 0.11542542278766632, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 22730 + }, + { + "epoch": 0.08655405251098103, + "grad_norm": 0.11557333171367645, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 22740 + }, + { + "epoch": 0.0865921149791037, + "grad_norm": 0.13377471268177032, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 22750 + }, + { + "epoch": 0.08663017744722638, + "grad_norm": 0.13291336596012115, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 22760 + }, + { + "epoch": 0.08666823991534907, + "grad_norm": 0.13353805243968964, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 22770 + }, + { + "epoch": 0.08670630238347175, + "grad_norm": 0.11050818860530853, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 22780 + }, + { + "epoch": 0.08674436485159444, + "grad_norm": 0.10883437842130661, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 22790 + }, + { + "epoch": 0.08678242731971712, + "grad_norm": 0.12287956476211548, + "learning_rate": 0.0005, + "loss": 2.1584, + "step": 22800 + }, + { + "epoch": 0.08682048978783981, + "grad_norm": 0.11929097026586533, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 22810 + }, + { + "epoch": 0.08685855225596248, + "grad_norm": 0.11532725393772125, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 22820 + }, + { + "epoch": 0.08689661472408516, + "grad_norm": 0.12584735453128815, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 22830 + }, + { + "epoch": 0.08693467719220785, + "grad_norm": 0.13039319217205048, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 22840 + }, + { + "epoch": 0.08697273966033053, + "grad_norm": 0.11363296210765839, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 22850 + }, + { + "epoch": 0.08701080212845322, + "grad_norm": 0.11446147412061691, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 22860 + }, + { + "epoch": 0.0870488645965759, + "grad_norm": 0.13208778202533722, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 22870 + }, + { + "epoch": 0.08708692706469859, + "grad_norm": 0.11517845839262009, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 22880 + }, + { + "epoch": 0.08712498953282127, + "grad_norm": 0.11906059831380844, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 22890 + }, + { + "epoch": 0.08716305200094394, + "grad_norm": 0.1303129941225052, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 22900 + }, + { + "epoch": 0.08720111446906663, + "grad_norm": 0.11496607959270477, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 22910 + }, + { + "epoch": 0.08723917693718931, + "grad_norm": 0.11427242308855057, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 22920 + }, + { + "epoch": 0.087277239405312, + "grad_norm": 0.11897694319486618, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 22930 + }, + { + "epoch": 0.08731530187343468, + "grad_norm": 0.11437114328145981, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 22940 + }, + { + "epoch": 0.08735336434155737, + "grad_norm": 0.12996020913124084, + "learning_rate": 0.0005, + "loss": 2.1606, + "step": 22950 + }, + { + "epoch": 0.08739142680968005, + "grad_norm": 0.11425703763961792, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 22960 + }, + { + "epoch": 0.08742948927780272, + "grad_norm": 0.12618638575077057, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 22970 + }, + { + "epoch": 0.08746755174592541, + "grad_norm": 0.1312779039144516, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 22980 + }, + { + "epoch": 0.0875056142140481, + "grad_norm": 0.11608152091503143, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 22990 + }, + { + "epoch": 0.08754367668217078, + "grad_norm": 0.1256219446659088, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 23000 + }, + { + "epoch": 0.08758173915029346, + "grad_norm": 0.1317347139120102, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 23010 + }, + { + "epoch": 0.08761980161841615, + "grad_norm": 0.11706885695457458, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 23020 + }, + { + "epoch": 0.08765786408653883, + "grad_norm": 0.10700934380292892, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 23030 + }, + { + "epoch": 0.0876959265546615, + "grad_norm": 0.11388903111219406, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 23040 + }, + { + "epoch": 0.08773398902278419, + "grad_norm": 0.12671348452568054, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 23050 + }, + { + "epoch": 0.08777205149090687, + "grad_norm": 0.12644319236278534, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 23060 + }, + { + "epoch": 0.08781011395902956, + "grad_norm": 0.1404653638601303, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 23070 + }, + { + "epoch": 0.08784817642715224, + "grad_norm": 0.12288325279951096, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 23080 + }, + { + "epoch": 0.08788623889527493, + "grad_norm": 0.12028903514146805, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 23090 + }, + { + "epoch": 0.08792430136339761, + "grad_norm": 0.12061980366706848, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 23100 + }, + { + "epoch": 0.08796236383152028, + "grad_norm": 0.13747268915176392, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 23110 + }, + { + "epoch": 0.08800042629964297, + "grad_norm": 0.1254771500825882, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 23120 + }, + { + "epoch": 0.08803848876776565, + "grad_norm": 0.11595059186220169, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 23130 + }, + { + "epoch": 0.08807655123588834, + "grad_norm": 0.11654561758041382, + "learning_rate": 0.0005, + "loss": 2.1539, + "step": 23140 + }, + { + "epoch": 0.08811461370401102, + "grad_norm": 0.12951432168483734, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 23150 + }, + { + "epoch": 0.08815267617213371, + "grad_norm": 0.11898689717054367, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 23160 + }, + { + "epoch": 0.0881907386402564, + "grad_norm": 0.11868039518594742, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 23170 + }, + { + "epoch": 0.08822880110837907, + "grad_norm": 0.1292949616909027, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 23180 + }, + { + "epoch": 0.08826686357650175, + "grad_norm": 0.11264042556285858, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 23190 + }, + { + "epoch": 0.08830492604462443, + "grad_norm": 0.1304251104593277, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 23200 + }, + { + "epoch": 0.08834298851274712, + "grad_norm": 0.11987806111574173, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 23210 + }, + { + "epoch": 0.0883810509808698, + "grad_norm": 0.12419069558382034, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 23220 + }, + { + "epoch": 0.08841911344899249, + "grad_norm": 0.12453112006187439, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 23230 + }, + { + "epoch": 0.08845717591711517, + "grad_norm": 0.1267368644475937, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 23240 + }, + { + "epoch": 0.08849523838523785, + "grad_norm": 0.11241846531629562, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 23250 + }, + { + "epoch": 0.08853330085336053, + "grad_norm": 0.12173543125391006, + "learning_rate": 0.0005, + "loss": 2.1582, + "step": 23260 + }, + { + "epoch": 0.08857136332148322, + "grad_norm": 0.13116681575775146, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 23270 + }, + { + "epoch": 0.0886094257896059, + "grad_norm": 0.11620502173900604, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 23280 + }, + { + "epoch": 0.08864748825772858, + "grad_norm": 0.11876282095909119, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 23290 + }, + { + "epoch": 0.08868555072585127, + "grad_norm": 0.11996292322874069, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 23300 + }, + { + "epoch": 0.08872361319397395, + "grad_norm": 0.1141175851225853, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 23310 + }, + { + "epoch": 0.08876167566209664, + "grad_norm": 0.12724260985851288, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 23320 + }, + { + "epoch": 0.08879973813021931, + "grad_norm": 0.11278630793094635, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 23330 + }, + { + "epoch": 0.088837800598342, + "grad_norm": 0.12381735444068909, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 23340 + }, + { + "epoch": 0.08887586306646468, + "grad_norm": 0.12097107619047165, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 23350 + }, + { + "epoch": 0.08891392553458737, + "grad_norm": 0.12126494944095612, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 23360 + }, + { + "epoch": 0.08895198800271005, + "grad_norm": 0.12865309417247772, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 23370 + }, + { + "epoch": 0.08899005047083274, + "grad_norm": 0.13124258816242218, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 23380 + }, + { + "epoch": 0.08902811293895542, + "grad_norm": 0.12501779198646545, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 23390 + }, + { + "epoch": 0.08906617540707809, + "grad_norm": 0.1225801333785057, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 23400 + }, + { + "epoch": 0.08910423787520078, + "grad_norm": 0.1309002786874771, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 23410 + }, + { + "epoch": 0.08914230034332346, + "grad_norm": 0.11393202096223831, + "learning_rate": 0.0005, + "loss": 2.1563, + "step": 23420 + }, + { + "epoch": 0.08918036281144615, + "grad_norm": 0.12912751734256744, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 23430 + }, + { + "epoch": 0.08921842527956883, + "grad_norm": 0.1320345550775528, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 23440 + }, + { + "epoch": 0.08925648774769152, + "grad_norm": 0.1260627806186676, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 23450 + }, + { + "epoch": 0.0892945502158142, + "grad_norm": 0.12123764306306839, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 23460 + }, + { + "epoch": 0.08933261268393687, + "grad_norm": 0.11679759621620178, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 23470 + }, + { + "epoch": 0.08937067515205956, + "grad_norm": 0.11947986483573914, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 23480 + }, + { + "epoch": 0.08940873762018224, + "grad_norm": 0.11585034430027008, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 23490 + }, + { + "epoch": 0.08944680008830493, + "grad_norm": 0.1223374456167221, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 23500 + }, + { + "epoch": 0.08948486255642761, + "grad_norm": 0.13731849193572998, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 23510 + }, + { + "epoch": 0.0895229250245503, + "grad_norm": 0.12628504633903503, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 23520 + }, + { + "epoch": 0.08956098749267298, + "grad_norm": 0.13378030061721802, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 23530 + }, + { + "epoch": 0.08959904996079565, + "grad_norm": 0.11454793810844421, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 23540 + }, + { + "epoch": 0.08963711242891834, + "grad_norm": 0.11796699464321136, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 23550 + }, + { + "epoch": 0.08967517489704102, + "grad_norm": 0.1294197291135788, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 23560 + }, + { + "epoch": 0.0897132373651637, + "grad_norm": 0.10944836586713791, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 23570 + }, + { + "epoch": 0.08975129983328639, + "grad_norm": 0.14641229808330536, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 23580 + }, + { + "epoch": 0.08978936230140908, + "grad_norm": 0.12429311126470566, + "learning_rate": 0.0005, + "loss": 2.1571, + "step": 23590 + }, + { + "epoch": 0.08982742476953176, + "grad_norm": 0.1437855064868927, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 23600 + }, + { + "epoch": 0.08986548723765443, + "grad_norm": 0.1226692646741867, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 23610 + }, + { + "epoch": 0.08990354970577712, + "grad_norm": 0.13100013136863708, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 23620 + }, + { + "epoch": 0.0899416121738998, + "grad_norm": 0.11815108358860016, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 23630 + }, + { + "epoch": 0.08997967464202249, + "grad_norm": 0.12475232779979706, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 23640 + }, + { + "epoch": 0.09001773711014517, + "grad_norm": 0.11662489920854568, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 23650 + }, + { + "epoch": 0.09005579957826786, + "grad_norm": 0.12282786518335342, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 23660 + }, + { + "epoch": 0.09009386204639054, + "grad_norm": 0.11744583398103714, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 23670 + }, + { + "epoch": 0.09013192451451323, + "grad_norm": 0.14883597195148468, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 23680 + }, + { + "epoch": 0.0901699869826359, + "grad_norm": 0.12501350045204163, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 23690 + }, + { + "epoch": 0.09020804945075858, + "grad_norm": 0.1260724812746048, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 23700 + }, + { + "epoch": 0.09024611191888127, + "grad_norm": 0.11755555868148804, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 23710 + }, + { + "epoch": 0.09028417438700395, + "grad_norm": 0.11762434244155884, + "learning_rate": 0.0005, + "loss": 2.1554, + "step": 23720 + }, + { + "epoch": 0.09032223685512664, + "grad_norm": 0.12634523212909698, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 23730 + }, + { + "epoch": 0.09036029932324932, + "grad_norm": 0.11729571223258972, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 23740 + }, + { + "epoch": 0.090398361791372, + "grad_norm": 0.11078878492116928, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 23750 + }, + { + "epoch": 0.09043642425949468, + "grad_norm": 0.13865669071674347, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 23760 + }, + { + "epoch": 0.09047448672761736, + "grad_norm": 0.11994817107915878, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 23770 + }, + { + "epoch": 0.09051254919574005, + "grad_norm": 0.11820890754461288, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 23780 + }, + { + "epoch": 0.09055061166386273, + "grad_norm": 0.11758625507354736, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 23790 + }, + { + "epoch": 0.09058867413198542, + "grad_norm": 0.12771601974964142, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 23800 + }, + { + "epoch": 0.0906267366001081, + "grad_norm": 0.13300511240959167, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 23810 + }, + { + "epoch": 0.09066479906823079, + "grad_norm": 0.13406731188297272, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 23820 + }, + { + "epoch": 0.09070286153635346, + "grad_norm": 0.12391498684883118, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 23830 + }, + { + "epoch": 0.09074092400447614, + "grad_norm": 0.11452176421880722, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 23840 + }, + { + "epoch": 0.09077898647259883, + "grad_norm": 0.12261798977851868, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 23850 + }, + { + "epoch": 0.09081704894072151, + "grad_norm": 0.1292411983013153, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 23860 + }, + { + "epoch": 0.0908551114088442, + "grad_norm": 0.11482907086610794, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 23870 + }, + { + "epoch": 0.09089317387696688, + "grad_norm": 0.13183918595314026, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 23880 + }, + { + "epoch": 0.09093123634508957, + "grad_norm": 0.1228523850440979, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 23890 + }, + { + "epoch": 0.09096929881321224, + "grad_norm": 0.12467924505472183, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 23900 + }, + { + "epoch": 0.09100736128133492, + "grad_norm": 0.10971734672784805, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 23910 + }, + { + "epoch": 0.09104542374945761, + "grad_norm": 0.1282947063446045, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 23920 + }, + { + "epoch": 0.09108348621758029, + "grad_norm": 0.14029517769813538, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 23930 + }, + { + "epoch": 0.09112154868570298, + "grad_norm": 0.1280580312013626, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 23940 + }, + { + "epoch": 0.09115961115382566, + "grad_norm": 0.13255015015602112, + "learning_rate": 0.0005, + "loss": 2.1531, + "step": 23950 + }, + { + "epoch": 0.09119767362194835, + "grad_norm": 0.12445107102394104, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 23960 + }, + { + "epoch": 0.09123573609007102, + "grad_norm": 0.127155140042305, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 23970 + }, + { + "epoch": 0.0912737985581937, + "grad_norm": 0.12632615864276886, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 23980 + }, + { + "epoch": 0.09131186102631639, + "grad_norm": 0.11836609989404678, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 23990 + }, + { + "epoch": 0.09134992349443907, + "grad_norm": 0.13590745627880096, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 24000 + }, + { + "epoch": 0.09138798596256176, + "grad_norm": 0.1197049468755722, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 24010 + }, + { + "epoch": 0.09142604843068444, + "grad_norm": 0.13206225633621216, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 24020 + }, + { + "epoch": 0.09146411089880713, + "grad_norm": 0.13378150761127472, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 24030 + }, + { + "epoch": 0.09150217336692981, + "grad_norm": 0.11511759459972382, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 24040 + }, + { + "epoch": 0.09154023583505248, + "grad_norm": 0.1247391626238823, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 24050 + }, + { + "epoch": 0.09157829830317517, + "grad_norm": 0.12974436581134796, + "learning_rate": 0.0005, + "loss": 2.1538, + "step": 24060 + }, + { + "epoch": 0.09161636077129785, + "grad_norm": 0.11919248849153519, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 24070 + }, + { + "epoch": 0.09165442323942054, + "grad_norm": 0.1240987479686737, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 24080 + }, + { + "epoch": 0.09169248570754322, + "grad_norm": 0.12819018959999084, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 24090 + }, + { + "epoch": 0.09173054817566591, + "grad_norm": 0.11645195633172989, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 24100 + }, + { + "epoch": 0.09176861064378859, + "grad_norm": 0.1362362951040268, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 24110 + }, + { + "epoch": 0.09180667311191126, + "grad_norm": 0.12752680480480194, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 24120 + }, + { + "epoch": 0.09184473558003395, + "grad_norm": 0.11361732333898544, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 24130 + }, + { + "epoch": 0.09188279804815663, + "grad_norm": 0.11131453514099121, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 24140 + }, + { + "epoch": 0.09192086051627932, + "grad_norm": 0.11890824884176254, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 24150 + }, + { + "epoch": 0.091958922984402, + "grad_norm": 0.11522199213504791, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 24160 + }, + { + "epoch": 0.09199698545252469, + "grad_norm": 0.12634597718715668, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 24170 + }, + { + "epoch": 0.09203504792064737, + "grad_norm": 0.11525263637304306, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 24180 + }, + { + "epoch": 0.09207311038877004, + "grad_norm": 0.11317164450883865, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 24190 + }, + { + "epoch": 0.09211117285689273, + "grad_norm": 0.10907386243343353, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 24200 + }, + { + "epoch": 0.09214923532501541, + "grad_norm": 0.11778762191534042, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 24210 + }, + { + "epoch": 0.0921872977931381, + "grad_norm": 0.11565057933330536, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 24220 + }, + { + "epoch": 0.09222536026126078, + "grad_norm": 0.13700050115585327, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 24230 + }, + { + "epoch": 0.09226342272938347, + "grad_norm": 0.13002556562423706, + "learning_rate": 0.0005, + "loss": 2.161, + "step": 24240 + }, + { + "epoch": 0.09230148519750615, + "grad_norm": 0.11756189167499542, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 24250 + }, + { + "epoch": 0.09233954766562882, + "grad_norm": 0.12187279015779495, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 24260 + }, + { + "epoch": 0.09237761013375151, + "grad_norm": 0.11067736148834229, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 24270 + }, + { + "epoch": 0.0924156726018742, + "grad_norm": 0.1086835041642189, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 24280 + }, + { + "epoch": 0.09245373506999688, + "grad_norm": 0.11878637224435806, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 24290 + }, + { + "epoch": 0.09249179753811956, + "grad_norm": 0.13691183924674988, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 24300 + }, + { + "epoch": 0.09252986000624225, + "grad_norm": 0.13181743025779724, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 24310 + }, + { + "epoch": 0.09256792247436493, + "grad_norm": 0.13542625308036804, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 24320 + }, + { + "epoch": 0.0926059849424876, + "grad_norm": 0.12063945829868317, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 24330 + }, + { + "epoch": 0.09264404741061029, + "grad_norm": 0.12136218696832657, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 24340 + }, + { + "epoch": 0.09268210987873297, + "grad_norm": 0.10949312150478363, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 24350 + }, + { + "epoch": 0.09272017234685566, + "grad_norm": 0.1149957925081253, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 24360 + }, + { + "epoch": 0.09275823481497834, + "grad_norm": 0.1147845983505249, + "learning_rate": 0.0005, + "loss": 2.1533, + "step": 24370 + }, + { + "epoch": 0.09279629728310103, + "grad_norm": 0.1364920735359192, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 24380 + }, + { + "epoch": 0.09283435975122371, + "grad_norm": 0.13210929930210114, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 24390 + }, + { + "epoch": 0.09287242221934638, + "grad_norm": 0.13170039653778076, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 24400 + }, + { + "epoch": 0.09291048468746907, + "grad_norm": 0.12380106002092361, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 24410 + }, + { + "epoch": 0.09294854715559175, + "grad_norm": 0.11916085332632065, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 24420 + }, + { + "epoch": 0.09298660962371444, + "grad_norm": 0.11754573881626129, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 24430 + }, + { + "epoch": 0.09302467209183712, + "grad_norm": 0.11410285532474518, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 24440 + }, + { + "epoch": 0.09306273455995981, + "grad_norm": 0.12948638200759888, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 24450 + }, + { + "epoch": 0.0931007970280825, + "grad_norm": 0.14242137968540192, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 24460 + }, + { + "epoch": 0.09313885949620518, + "grad_norm": 0.11610152572393417, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 24470 + }, + { + "epoch": 0.09317692196432785, + "grad_norm": 0.11516385525465012, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 24480 + }, + { + "epoch": 0.09321498443245053, + "grad_norm": 0.11933036893606186, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 24490 + }, + { + "epoch": 0.09325304690057322, + "grad_norm": 0.11288411915302277, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 24500 + }, + { + "epoch": 0.0932911093686959, + "grad_norm": 0.11823565512895584, + "learning_rate": 0.0005, + "loss": 2.1532, + "step": 24510 + }, + { + "epoch": 0.09332917183681859, + "grad_norm": 0.14338666200637817, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 24520 + }, + { + "epoch": 0.09336723430494127, + "grad_norm": 0.12625204026699066, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 24530 + }, + { + "epoch": 0.09340529677306396, + "grad_norm": 0.13908584415912628, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 24540 + }, + { + "epoch": 0.09344335924118663, + "grad_norm": 0.11579623818397522, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 24550 + }, + { + "epoch": 0.09348142170930931, + "grad_norm": 0.1155904158949852, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 24560 + }, + { + "epoch": 0.093519484177432, + "grad_norm": 0.11854032427072525, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 24570 + }, + { + "epoch": 0.09355754664555468, + "grad_norm": 0.1252460926771164, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 24580 + }, + { + "epoch": 0.09359560911367737, + "grad_norm": 0.1279488205909729, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 24590 + }, + { + "epoch": 0.09363367158180005, + "grad_norm": 0.12807022035121918, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 24600 + }, + { + "epoch": 0.09367173404992274, + "grad_norm": 0.1232905313372612, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 24610 + }, + { + "epoch": 0.09370979651804541, + "grad_norm": 0.13406997919082642, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 24620 + }, + { + "epoch": 0.0937478589861681, + "grad_norm": 0.11872179806232452, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 24630 + }, + { + "epoch": 0.09378592145429078, + "grad_norm": 0.12115013599395752, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 24640 + }, + { + "epoch": 0.09382398392241346, + "grad_norm": 0.12303373962640762, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 24650 + }, + { + "epoch": 0.09386204639053615, + "grad_norm": 0.11456926167011261, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 24660 + }, + { + "epoch": 0.09390010885865883, + "grad_norm": 0.13177600502967834, + "learning_rate": 0.0005, + "loss": 2.1573, + "step": 24670 + }, + { + "epoch": 0.09393817132678152, + "grad_norm": 0.1192450150847435, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 24680 + }, + { + "epoch": 0.09397623379490419, + "grad_norm": 0.11517942696809769, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 24690 + }, + { + "epoch": 0.09401429626302688, + "grad_norm": 0.1183677464723587, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 24700 + }, + { + "epoch": 0.09405235873114956, + "grad_norm": 0.12436609715223312, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 24710 + }, + { + "epoch": 0.09409042119927225, + "grad_norm": 0.12491034716367722, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 24720 + }, + { + "epoch": 0.09412848366739493, + "grad_norm": 0.1131734699010849, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 24730 + }, + { + "epoch": 0.09416654613551761, + "grad_norm": 0.11752540618181229, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 24740 + }, + { + "epoch": 0.0942046086036403, + "grad_norm": 0.12445096671581268, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 24750 + }, + { + "epoch": 0.09424267107176297, + "grad_norm": 0.13811765611171722, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 24760 + }, + { + "epoch": 0.09428073353988566, + "grad_norm": 0.11550748348236084, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 24770 + }, + { + "epoch": 0.09431879600800834, + "grad_norm": 0.12589634954929352, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 24780 + }, + { + "epoch": 0.09435685847613103, + "grad_norm": 0.12445308268070221, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 24790 + }, + { + "epoch": 0.09439492094425371, + "grad_norm": 0.12869003415107727, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 24800 + }, + { + "epoch": 0.0944329834123764, + "grad_norm": 0.12242034077644348, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 24810 + }, + { + "epoch": 0.09447104588049908, + "grad_norm": 0.12127193808555603, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 24820 + }, + { + "epoch": 0.09450910834862176, + "grad_norm": 0.12238039821386337, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 24830 + }, + { + "epoch": 0.09454717081674444, + "grad_norm": 0.12273263186216354, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 24840 + }, + { + "epoch": 0.09458523328486712, + "grad_norm": 0.12089051306247711, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 24850 + }, + { + "epoch": 0.0946232957529898, + "grad_norm": 0.12131716310977936, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 24860 + }, + { + "epoch": 0.09466135822111249, + "grad_norm": 0.13538216054439545, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 24870 + }, + { + "epoch": 0.09469942068923518, + "grad_norm": 0.1291877180337906, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 24880 + }, + { + "epoch": 0.09473748315735786, + "grad_norm": 0.11714226007461548, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 24890 + }, + { + "epoch": 0.09477554562548055, + "grad_norm": 0.12920613586902618, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 24900 + }, + { + "epoch": 0.09481360809360322, + "grad_norm": 0.1269528567790985, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 24910 + }, + { + "epoch": 0.0948516705617259, + "grad_norm": 0.11459819227457047, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 24920 + }, + { + "epoch": 0.09488973302984859, + "grad_norm": 0.1146237924695015, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 24930 + }, + { + "epoch": 0.09492779549797127, + "grad_norm": 0.12081960588693619, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 24940 + }, + { + "epoch": 0.09496585796609396, + "grad_norm": 0.13120268285274506, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 24950 + }, + { + "epoch": 0.09500392043421664, + "grad_norm": 0.12134882062673569, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 24960 + }, + { + "epoch": 0.09504198290233933, + "grad_norm": 0.11924862116575241, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 24970 + }, + { + "epoch": 0.095080045370462, + "grad_norm": 0.13948355615139008, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 24980 + }, + { + "epoch": 0.09511810783858468, + "grad_norm": 0.11267533153295517, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 24990 + }, + { + "epoch": 0.09515617030670737, + "grad_norm": 0.1245555654168129, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 25000 + }, + { + "epoch": 0.09519423277483005, + "grad_norm": 0.12775032222270966, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 25010 + }, + { + "epoch": 0.09523229524295274, + "grad_norm": 0.12445028126239777, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 25020 + }, + { + "epoch": 0.09527035771107542, + "grad_norm": 0.11729653179645538, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 25030 + }, + { + "epoch": 0.0953084201791981, + "grad_norm": 0.11761856824159622, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 25040 + }, + { + "epoch": 0.09534648264732078, + "grad_norm": 0.11484331637620926, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 25050 + }, + { + "epoch": 0.09538454511544346, + "grad_norm": 0.12111509591341019, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 25060 + }, + { + "epoch": 0.09542260758356615, + "grad_norm": 0.11838772892951965, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 25070 + }, + { + "epoch": 0.09546067005168883, + "grad_norm": 0.12760284543037415, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 25080 + }, + { + "epoch": 0.09549873251981152, + "grad_norm": 0.13428868353366852, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 25090 + }, + { + "epoch": 0.0955367949879342, + "grad_norm": 0.13183973729610443, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 25100 + }, + { + "epoch": 0.09557485745605689, + "grad_norm": 0.12350989133119583, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 25110 + }, + { + "epoch": 0.09561291992417956, + "grad_norm": 0.13064606487751007, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 25120 + }, + { + "epoch": 0.09565098239230224, + "grad_norm": 0.11381707340478897, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 25130 + }, + { + "epoch": 0.09568904486042493, + "grad_norm": 0.1271132081747055, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 25140 + }, + { + "epoch": 0.09572710732854761, + "grad_norm": 0.12241170555353165, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 25150 + }, + { + "epoch": 0.0957651697966703, + "grad_norm": 0.27211326360702515, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 25160 + }, + { + "epoch": 0.09580323226479298, + "grad_norm": 0.13097335398197174, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 25170 + }, + { + "epoch": 0.09584129473291567, + "grad_norm": 0.12096305191516876, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 25180 + }, + { + "epoch": 0.09587935720103835, + "grad_norm": 0.12811264395713806, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 25190 + }, + { + "epoch": 0.09591741966916102, + "grad_norm": 0.13960425555706024, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 25200 + }, + { + "epoch": 0.0959554821372837, + "grad_norm": 0.1224188506603241, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 25210 + }, + { + "epoch": 0.09599354460540639, + "grad_norm": 0.12553152441978455, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 25220 + }, + { + "epoch": 0.09603160707352908, + "grad_norm": 0.12428688257932663, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 25230 + }, + { + "epoch": 0.09606966954165176, + "grad_norm": 0.12976130843162537, + "learning_rate": 0.0005, + "loss": 2.1526, + "step": 25240 + }, + { + "epoch": 0.09610773200977445, + "grad_norm": 0.13057775795459747, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 25250 + }, + { + "epoch": 0.09614579447789713, + "grad_norm": 0.13561537861824036, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 25260 + }, + { + "epoch": 0.0961838569460198, + "grad_norm": 0.12290742993354797, + "learning_rate": 0.0005, + "loss": 2.1602, + "step": 25270 + }, + { + "epoch": 0.09622191941414249, + "grad_norm": 0.1177307739853859, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 25280 + }, + { + "epoch": 0.09625998188226517, + "grad_norm": 0.13508401811122894, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 25290 + }, + { + "epoch": 0.09629804435038786, + "grad_norm": 0.11391041427850723, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 25300 + }, + { + "epoch": 0.09633610681851054, + "grad_norm": 0.11419051140546799, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 25310 + }, + { + "epoch": 0.09637416928663323, + "grad_norm": 0.12973779439926147, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 25320 + }, + { + "epoch": 0.09641223175475591, + "grad_norm": 0.13356907665729523, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 25330 + }, + { + "epoch": 0.09645029422287858, + "grad_norm": 0.12621325254440308, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 25340 + }, + { + "epoch": 0.09648835669100127, + "grad_norm": 0.12215954065322876, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 25350 + }, + { + "epoch": 0.09652641915912395, + "grad_norm": 0.1351398378610611, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 25360 + }, + { + "epoch": 0.09656448162724664, + "grad_norm": 0.1220025047659874, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 25370 + }, + { + "epoch": 0.09660254409536932, + "grad_norm": 0.12556889653205872, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 25380 + }, + { + "epoch": 0.09664060656349201, + "grad_norm": 0.13660888373851776, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 25390 + }, + { + "epoch": 0.09667866903161469, + "grad_norm": 0.12122069299221039, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 25400 + }, + { + "epoch": 0.09671673149973736, + "grad_norm": 0.12084165960550308, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 25410 + }, + { + "epoch": 0.09675479396786005, + "grad_norm": 0.12457843124866486, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 25420 + }, + { + "epoch": 0.09679285643598273, + "grad_norm": 0.13408710062503815, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 25430 + }, + { + "epoch": 0.09683091890410542, + "grad_norm": 0.1271134912967682, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 25440 + }, + { + "epoch": 0.0968689813722281, + "grad_norm": 0.11684239655733109, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 25450 + }, + { + "epoch": 0.09690704384035079, + "grad_norm": 0.1285119354724884, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 25460 + }, + { + "epoch": 0.09694510630847347, + "grad_norm": 0.11452360451221466, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 25470 + }, + { + "epoch": 0.09698316877659614, + "grad_norm": 0.1238560900092125, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 25480 + }, + { + "epoch": 0.09702123124471883, + "grad_norm": 0.11912228912115097, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 25490 + }, + { + "epoch": 0.09705929371284151, + "grad_norm": 0.114809051156044, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 25500 + }, + { + "epoch": 0.0970973561809642, + "grad_norm": 0.12524668872356415, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 25510 + }, + { + "epoch": 0.09713541864908688, + "grad_norm": 0.11931375414133072, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 25520 + }, + { + "epoch": 0.09717348111720957, + "grad_norm": 0.12906037271022797, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 25530 + }, + { + "epoch": 0.09721154358533225, + "grad_norm": 0.11984331905841827, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 25540 + }, + { + "epoch": 0.09724960605345492, + "grad_norm": 0.11141162365674973, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 25550 + }, + { + "epoch": 0.09728766852157761, + "grad_norm": 0.1167030781507492, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 25560 + }, + { + "epoch": 0.0973257309897003, + "grad_norm": 0.12758852541446686, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 25570 + }, + { + "epoch": 0.09736379345782298, + "grad_norm": 0.12121187150478363, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 25580 + }, + { + "epoch": 0.09740185592594566, + "grad_norm": 0.12168534845113754, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 25590 + }, + { + "epoch": 0.09743991839406835, + "grad_norm": 0.12349078804254532, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 25600 + }, + { + "epoch": 0.09747798086219103, + "grad_norm": 0.13133080303668976, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 25610 + }, + { + "epoch": 0.09751604333031372, + "grad_norm": 0.12626859545707703, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 25620 + }, + { + "epoch": 0.09755410579843639, + "grad_norm": 0.12265080213546753, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 25630 + }, + { + "epoch": 0.09759216826655907, + "grad_norm": 0.12489853799343109, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 25640 + }, + { + "epoch": 0.09763023073468176, + "grad_norm": 0.1116076335310936, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 25650 + }, + { + "epoch": 0.09766829320280444, + "grad_norm": 0.12283624708652496, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 25660 + }, + { + "epoch": 0.09770635567092713, + "grad_norm": 0.12440288066864014, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 25670 + }, + { + "epoch": 0.09774441813904981, + "grad_norm": 0.1259060502052307, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 25680 + }, + { + "epoch": 0.0977824806071725, + "grad_norm": 0.12068597972393036, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 25690 + }, + { + "epoch": 0.09782054307529517, + "grad_norm": 0.10977241396903992, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 25700 + }, + { + "epoch": 0.09785860554341785, + "grad_norm": 0.11703129857778549, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 25710 + }, + { + "epoch": 0.09789666801154054, + "grad_norm": 0.11471304297447205, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 25720 + }, + { + "epoch": 0.09793473047966322, + "grad_norm": 0.12228024005889893, + "learning_rate": 0.0005, + "loss": 2.1554, + "step": 25730 + }, + { + "epoch": 0.09797279294778591, + "grad_norm": 0.12173251062631607, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 25740 + }, + { + "epoch": 0.0980108554159086, + "grad_norm": 0.14087113738059998, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 25750 + }, + { + "epoch": 0.09804891788403128, + "grad_norm": 0.1175365000963211, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 25760 + }, + { + "epoch": 0.09808698035215395, + "grad_norm": 0.12194423377513885, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 25770 + }, + { + "epoch": 0.09812504282027663, + "grad_norm": 0.10810630023479462, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 25780 + }, + { + "epoch": 0.09816310528839932, + "grad_norm": 0.11678377538919449, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 25790 + }, + { + "epoch": 0.098201167756522, + "grad_norm": 0.11521114408969879, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 25800 + }, + { + "epoch": 0.09823923022464469, + "grad_norm": 0.12144433706998825, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 25810 + }, + { + "epoch": 0.09827729269276737, + "grad_norm": 0.13035742938518524, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 25820 + }, + { + "epoch": 0.09831535516089006, + "grad_norm": 0.12414418160915375, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 25830 + }, + { + "epoch": 0.09835341762901273, + "grad_norm": 0.12440216541290283, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 25840 + }, + { + "epoch": 0.09839148009713541, + "grad_norm": 0.1393919289112091, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 25850 + }, + { + "epoch": 0.0984295425652581, + "grad_norm": 0.13919106125831604, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 25860 + }, + { + "epoch": 0.09846760503338078, + "grad_norm": 0.1080850213766098, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 25870 + }, + { + "epoch": 0.09850566750150347, + "grad_norm": 0.12063482403755188, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 25880 + }, + { + "epoch": 0.09854372996962615, + "grad_norm": 0.11347249150276184, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 25890 + }, + { + "epoch": 0.09858179243774884, + "grad_norm": 0.11986780911684036, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 25900 + }, + { + "epoch": 0.09861985490587151, + "grad_norm": 0.12909086048603058, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 25910 + }, + { + "epoch": 0.0986579173739942, + "grad_norm": 0.12397724390029907, + "learning_rate": 0.0005, + "loss": 2.1589, + "step": 25920 + }, + { + "epoch": 0.09869597984211688, + "grad_norm": 0.12564894556999207, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 25930 + }, + { + "epoch": 0.09873404231023956, + "grad_norm": 0.1232154592871666, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 25940 + }, + { + "epoch": 0.09877210477836225, + "grad_norm": 0.13897015154361725, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 25950 + }, + { + "epoch": 0.09881016724648493, + "grad_norm": 0.12695081532001495, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 25960 + }, + { + "epoch": 0.09884822971460762, + "grad_norm": 0.13669979572296143, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 25970 + }, + { + "epoch": 0.0988862921827303, + "grad_norm": 0.1393309384584427, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 25980 + }, + { + "epoch": 0.09892435465085297, + "grad_norm": 0.12158825248479843, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 25990 + }, + { + "epoch": 0.09896241711897566, + "grad_norm": 0.12393995374441147, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 26000 + }, + { + "epoch": 0.09900047958709834, + "grad_norm": 0.12239037454128265, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 26010 + }, + { + "epoch": 0.09903854205522103, + "grad_norm": 0.12146341055631638, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 26020 + }, + { + "epoch": 0.09907660452334371, + "grad_norm": 0.11653164029121399, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 26030 + }, + { + "epoch": 0.0991146669914664, + "grad_norm": 0.12449685484170914, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 26040 + }, + { + "epoch": 0.09915272945958908, + "grad_norm": 0.11595933139324188, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 26050 + }, + { + "epoch": 0.09919079192771176, + "grad_norm": 0.1267801821231842, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 26060 + }, + { + "epoch": 0.09922885439583444, + "grad_norm": 0.11027488112449646, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 26070 + }, + { + "epoch": 0.09926691686395712, + "grad_norm": 0.15915116667747498, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 26080 + }, + { + "epoch": 0.09930497933207981, + "grad_norm": 0.12621380388736725, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 26090 + }, + { + "epoch": 0.0993430418002025, + "grad_norm": 0.14087072014808655, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 26100 + }, + { + "epoch": 0.09938110426832518, + "grad_norm": 0.12439528107643127, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 26110 + }, + { + "epoch": 0.09941916673644786, + "grad_norm": 0.12714435160160065, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 26120 + }, + { + "epoch": 0.09945722920457054, + "grad_norm": 0.12384948879480362, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 26130 + }, + { + "epoch": 0.09949529167269322, + "grad_norm": 0.15680019557476044, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 26140 + }, + { + "epoch": 0.0995333541408159, + "grad_norm": 0.14214445650577545, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 26150 + }, + { + "epoch": 0.09957141660893859, + "grad_norm": 0.12956596910953522, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 26160 + }, + { + "epoch": 0.09960947907706127, + "grad_norm": 0.11495132744312286, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 26170 + }, + { + "epoch": 0.09964754154518396, + "grad_norm": 0.11616240441799164, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 26180 + }, + { + "epoch": 0.09968560401330664, + "grad_norm": 0.11863888055086136, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 26190 + }, + { + "epoch": 0.09972366648142932, + "grad_norm": 0.1459449827671051, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 26200 + }, + { + "epoch": 0.099761728949552, + "grad_norm": 0.11841373145580292, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 26210 + }, + { + "epoch": 0.09979979141767469, + "grad_norm": 0.11825912445783615, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 26220 + }, + { + "epoch": 0.09983785388579737, + "grad_norm": 0.1280236691236496, + "learning_rate": 0.0005, + "loss": 2.1567, + "step": 26230 + }, + { + "epoch": 0.09987591635392006, + "grad_norm": 0.11771514266729355, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 26240 + }, + { + "epoch": 0.09991397882204274, + "grad_norm": 0.1152941957116127, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 26250 + }, + { + "epoch": 0.09995204129016542, + "grad_norm": 0.11362841725349426, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 26260 + }, + { + "epoch": 0.0999901037582881, + "grad_norm": 0.13889946043491364, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 26270 + }, + { + "epoch": 0.10002816622641078, + "grad_norm": 0.11212174594402313, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 26280 + }, + { + "epoch": 0.10006622869453347, + "grad_norm": 0.12263673543930054, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 26290 + }, + { + "epoch": 0.10010429116265615, + "grad_norm": 0.12970253825187683, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 26300 + }, + { + "epoch": 0.10014235363077884, + "grad_norm": 0.12810315191745758, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 26310 + }, + { + "epoch": 0.10018041609890152, + "grad_norm": 0.1226223036646843, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 26320 + }, + { + "epoch": 0.1002184785670242, + "grad_norm": 0.1228988990187645, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 26330 + }, + { + "epoch": 0.10025654103514689, + "grad_norm": 0.13820667564868927, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 26340 + }, + { + "epoch": 0.10029460350326956, + "grad_norm": 0.1304493099451065, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 26350 + }, + { + "epoch": 0.10033266597139225, + "grad_norm": 0.12311537563800812, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 26360 + }, + { + "epoch": 0.10037072843951493, + "grad_norm": 0.11408624798059464, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 26370 + }, + { + "epoch": 0.10040879090763762, + "grad_norm": 0.16143427789211273, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 26380 + }, + { + "epoch": 0.1004468533757603, + "grad_norm": 0.12645834684371948, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 26390 + }, + { + "epoch": 0.10048491584388299, + "grad_norm": 0.1337132304906845, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 26400 + }, + { + "epoch": 0.10052297831200567, + "grad_norm": 0.1292242407798767, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 26410 + }, + { + "epoch": 0.10056104078012834, + "grad_norm": 0.11790705472230911, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 26420 + }, + { + "epoch": 0.10059910324825103, + "grad_norm": 0.11704066395759583, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 26430 + }, + { + "epoch": 0.10063716571637371, + "grad_norm": 0.11400522291660309, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 26440 + }, + { + "epoch": 0.1006752281844964, + "grad_norm": 0.12126346677541733, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 26450 + }, + { + "epoch": 0.10071329065261908, + "grad_norm": 0.11732926219701767, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 26460 + }, + { + "epoch": 0.10075135312074177, + "grad_norm": 0.12718385457992554, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 26470 + }, + { + "epoch": 0.10078941558886445, + "grad_norm": 0.11345270276069641, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 26480 + }, + { + "epoch": 0.10082747805698712, + "grad_norm": 0.13248255848884583, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 26490 + }, + { + "epoch": 0.1008655405251098, + "grad_norm": 0.12024351954460144, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 26500 + }, + { + "epoch": 0.10090360299323249, + "grad_norm": 0.13912230730056763, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 26510 + }, + { + "epoch": 0.10094166546135518, + "grad_norm": 0.11878001689910889, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 26520 + }, + { + "epoch": 0.10097972792947786, + "grad_norm": 0.12029380351305008, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 26530 + }, + { + "epoch": 0.10101779039760055, + "grad_norm": 0.1200578361749649, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 26540 + }, + { + "epoch": 0.10105585286572323, + "grad_norm": 0.12079061567783356, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 26550 + }, + { + "epoch": 0.1010939153338459, + "grad_norm": 0.13137206435203552, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 26560 + }, + { + "epoch": 0.10113197780196859, + "grad_norm": 0.13583819568157196, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 26570 + }, + { + "epoch": 0.10117004027009127, + "grad_norm": 0.12240844964981079, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 26580 + }, + { + "epoch": 0.10120810273821396, + "grad_norm": 0.14458432793617249, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 26590 + }, + { + "epoch": 0.10124616520633664, + "grad_norm": 0.1237378790974617, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 26600 + }, + { + "epoch": 0.10128422767445933, + "grad_norm": 0.12536272406578064, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 26610 + }, + { + "epoch": 0.10132229014258201, + "grad_norm": 0.14227186143398285, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 26620 + }, + { + "epoch": 0.10136035261070468, + "grad_norm": 0.11502430588006973, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 26630 + }, + { + "epoch": 0.10139841507882737, + "grad_norm": 0.12336134910583496, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 26640 + }, + { + "epoch": 0.10143647754695005, + "grad_norm": 0.4810260832309723, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 26650 + }, + { + "epoch": 0.10147454001507274, + "grad_norm": 0.12972472608089447, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 26660 + }, + { + "epoch": 0.10151260248319542, + "grad_norm": 0.13647037744522095, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 26670 + }, + { + "epoch": 0.1015506649513181, + "grad_norm": 0.12054859101772308, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 26680 + }, + { + "epoch": 0.10158872741944079, + "grad_norm": 0.1270749419927597, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 26690 + }, + { + "epoch": 0.10162678988756346, + "grad_norm": 0.12211668491363525, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 26700 + }, + { + "epoch": 0.10166485235568615, + "grad_norm": 0.11024312674999237, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 26710 + }, + { + "epoch": 0.10170291482380883, + "grad_norm": 0.1252600997686386, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 26720 + }, + { + "epoch": 0.10174097729193152, + "grad_norm": 0.12198615819215775, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 26730 + }, + { + "epoch": 0.1017790397600542, + "grad_norm": 0.11293346434831619, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 26740 + }, + { + "epoch": 0.10181710222817689, + "grad_norm": 0.11615917831659317, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 26750 + }, + { + "epoch": 0.10185516469629957, + "grad_norm": 0.11647096276283264, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 26760 + }, + { + "epoch": 0.10189322716442226, + "grad_norm": 0.1299409121274948, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 26770 + }, + { + "epoch": 0.10193128963254493, + "grad_norm": 0.13532549142837524, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 26780 + }, + { + "epoch": 0.10196935210066761, + "grad_norm": 0.12113643437623978, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 26790 + }, + { + "epoch": 0.1020074145687903, + "grad_norm": 0.12588463723659515, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 26800 + }, + { + "epoch": 0.10204547703691298, + "grad_norm": 0.12778820097446442, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 26810 + }, + { + "epoch": 0.10208353950503567, + "grad_norm": 0.11738457530736923, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 26820 + }, + { + "epoch": 0.10212160197315835, + "grad_norm": 0.1250801533460617, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 26830 + }, + { + "epoch": 0.10215966444128104, + "grad_norm": 0.11063364148139954, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 26840 + }, + { + "epoch": 0.10219772690940371, + "grad_norm": 0.12232097238302231, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 26850 + }, + { + "epoch": 0.10223578937752639, + "grad_norm": 0.12268010526895523, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 26860 + }, + { + "epoch": 0.10227385184564908, + "grad_norm": 0.11671310663223267, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 26870 + }, + { + "epoch": 0.10231191431377176, + "grad_norm": 0.12211109697818756, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 26880 + }, + { + "epoch": 0.10234997678189445, + "grad_norm": 0.12568749487400055, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 26890 + }, + { + "epoch": 0.10238803925001713, + "grad_norm": 0.12220566719770432, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 26900 + }, + { + "epoch": 0.10242610171813982, + "grad_norm": 0.13257314264774323, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 26910 + }, + { + "epoch": 0.10246416418626249, + "grad_norm": 0.14253629744052887, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 26920 + }, + { + "epoch": 0.10250222665438517, + "grad_norm": 0.12506955862045288, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 26930 + }, + { + "epoch": 0.10254028912250786, + "grad_norm": 0.1251208335161209, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 26940 + }, + { + "epoch": 0.10257835159063054, + "grad_norm": 0.1233387216925621, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 26950 + }, + { + "epoch": 0.10261641405875323, + "grad_norm": 0.12301129847764969, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 26960 + }, + { + "epoch": 0.10265447652687591, + "grad_norm": 0.1209215372800827, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 26970 + }, + { + "epoch": 0.1026925389949986, + "grad_norm": 0.11270321160554886, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 26980 + }, + { + "epoch": 0.10273060146312127, + "grad_norm": 0.1325046867132187, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 26990 + }, + { + "epoch": 0.10276866393124395, + "grad_norm": 0.12131819874048233, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 27000 + }, + { + "epoch": 0.10280672639936664, + "grad_norm": 0.12465294450521469, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 27010 + }, + { + "epoch": 0.10284478886748932, + "grad_norm": 0.13112106919288635, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 27020 + }, + { + "epoch": 0.10288285133561201, + "grad_norm": 0.11870895326137543, + "learning_rate": 0.0005, + "loss": 2.1561, + "step": 27030 + }, + { + "epoch": 0.10292091380373469, + "grad_norm": 0.12203743308782578, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 27040 + }, + { + "epoch": 0.10295897627185738, + "grad_norm": 0.11765111237764359, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 27050 + }, + { + "epoch": 0.10299703873998005, + "grad_norm": 0.12385214120149612, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 27060 + }, + { + "epoch": 0.10303510120810273, + "grad_norm": 0.1228310838341713, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 27070 + }, + { + "epoch": 0.10307316367622542, + "grad_norm": 0.11405244469642639, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 27080 + }, + { + "epoch": 0.1031112261443481, + "grad_norm": 0.12276868522167206, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 27090 + }, + { + "epoch": 0.10314928861247079, + "grad_norm": 0.11903540045022964, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 27100 + }, + { + "epoch": 0.10318735108059347, + "grad_norm": 0.14137893915176392, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 27110 + }, + { + "epoch": 0.10322541354871616, + "grad_norm": 0.12298958003520966, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 27120 + }, + { + "epoch": 0.10326347601683884, + "grad_norm": 0.11589296162128448, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 27130 + }, + { + "epoch": 0.10330153848496151, + "grad_norm": 0.10948190093040466, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 27140 + }, + { + "epoch": 0.1033396009530842, + "grad_norm": 0.12524929642677307, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 27150 + }, + { + "epoch": 0.10337766342120688, + "grad_norm": 0.12025979161262512, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 27160 + }, + { + "epoch": 0.10341572588932957, + "grad_norm": 0.1243286207318306, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 27170 + }, + { + "epoch": 0.10345378835745225, + "grad_norm": 0.11419710516929626, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 27180 + }, + { + "epoch": 0.10349185082557494, + "grad_norm": 0.13044367730617523, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 27190 + }, + { + "epoch": 0.10352991329369762, + "grad_norm": 0.12243938446044922, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 27200 + }, + { + "epoch": 0.1035679757618203, + "grad_norm": 0.11654973775148392, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 27210 + }, + { + "epoch": 0.10360603822994298, + "grad_norm": 0.12980137765407562, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 27220 + }, + { + "epoch": 0.10364410069806566, + "grad_norm": 0.11914009600877762, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 27230 + }, + { + "epoch": 0.10368216316618835, + "grad_norm": 0.12766490876674652, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 27240 + }, + { + "epoch": 0.10372022563431103, + "grad_norm": 0.12520195543766022, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 27250 + }, + { + "epoch": 0.10375828810243372, + "grad_norm": 0.13088081777095795, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 27260 + }, + { + "epoch": 0.1037963505705564, + "grad_norm": 0.12486676871776581, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 27270 + }, + { + "epoch": 0.10383441303867907, + "grad_norm": 0.12424474954605103, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 27280 + }, + { + "epoch": 0.10387247550680176, + "grad_norm": 0.17023976147174835, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 27290 + }, + { + "epoch": 0.10391053797492444, + "grad_norm": 0.1305316537618637, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 27300 + }, + { + "epoch": 0.10394860044304713, + "grad_norm": 0.11967485398054123, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 27310 + }, + { + "epoch": 0.10398666291116981, + "grad_norm": 0.13274578750133514, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 27320 + }, + { + "epoch": 0.1040247253792925, + "grad_norm": 0.12439022213220596, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 27330 + }, + { + "epoch": 0.10406278784741518, + "grad_norm": 0.1212477907538414, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 27340 + }, + { + "epoch": 0.10410085031553785, + "grad_norm": 0.12385757267475128, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 27350 + }, + { + "epoch": 0.10413891278366054, + "grad_norm": 0.12493318319320679, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 27360 + }, + { + "epoch": 0.10417697525178322, + "grad_norm": 0.12151855230331421, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 27370 + }, + { + "epoch": 0.10421503771990591, + "grad_norm": 0.11811287701129913, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 27380 + }, + { + "epoch": 0.1042531001880286, + "grad_norm": 0.11648032814264297, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 27390 + }, + { + "epoch": 0.10429116265615128, + "grad_norm": 0.11927644908428192, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 27400 + }, + { + "epoch": 0.10432922512427396, + "grad_norm": 0.12469673901796341, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 27410 + }, + { + "epoch": 0.10436728759239663, + "grad_norm": 0.1321975588798523, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 27420 + }, + { + "epoch": 0.10440535006051932, + "grad_norm": 0.11792142689228058, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 27430 + }, + { + "epoch": 0.104443412528642, + "grad_norm": 0.11825040727853775, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 27440 + }, + { + "epoch": 0.10448147499676469, + "grad_norm": 0.12313884496688843, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 27450 + }, + { + "epoch": 0.10451953746488737, + "grad_norm": 0.130935400724411, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 27460 + }, + { + "epoch": 0.10455759993301006, + "grad_norm": 0.11850696802139282, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 27470 + }, + { + "epoch": 0.10459566240113274, + "grad_norm": 0.1413838118314743, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 27480 + }, + { + "epoch": 0.10463372486925543, + "grad_norm": 0.1255698800086975, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 27490 + }, + { + "epoch": 0.1046717873373781, + "grad_norm": 0.11011414974927902, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 27500 + }, + { + "epoch": 0.10470984980550078, + "grad_norm": 0.12200158834457397, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 27510 + }, + { + "epoch": 0.10474791227362347, + "grad_norm": 0.12272480875253677, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 27520 + }, + { + "epoch": 0.10478597474174615, + "grad_norm": 0.1249915137887001, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 27530 + }, + { + "epoch": 0.10482403720986884, + "grad_norm": 0.11734936386346817, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 27540 + }, + { + "epoch": 0.10486209967799152, + "grad_norm": 0.1365039348602295, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 27550 + }, + { + "epoch": 0.10490016214611421, + "grad_norm": 0.11033467203378677, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 27560 + }, + { + "epoch": 0.10493822461423688, + "grad_norm": 0.1132252886891365, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 27570 + }, + { + "epoch": 0.10497628708235957, + "grad_norm": 0.11575080454349518, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 27580 + }, + { + "epoch": 0.10501434955048225, + "grad_norm": 0.129554882645607, + "learning_rate": 0.0005, + "loss": 2.1567, + "step": 27590 + }, + { + "epoch": 0.10505241201860493, + "grad_norm": 0.11766406148672104, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 27600 + }, + { + "epoch": 0.10509047448672762, + "grad_norm": 0.12703871726989746, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 27610 + }, + { + "epoch": 0.1051285369548503, + "grad_norm": 0.1272251456975937, + "learning_rate": 0.0005, + "loss": 2.1517, + "step": 27620 + }, + { + "epoch": 0.10516659942297299, + "grad_norm": 0.12048023194074631, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 27630 + }, + { + "epoch": 0.10520466189109566, + "grad_norm": 0.12507270276546478, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 27640 + }, + { + "epoch": 0.10524272435921835, + "grad_norm": 0.1116430014371872, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 27650 + }, + { + "epoch": 0.10528078682734103, + "grad_norm": 0.1226653978228569, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 27660 + }, + { + "epoch": 0.10531884929546372, + "grad_norm": 0.1241232305765152, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 27670 + }, + { + "epoch": 0.1053569117635864, + "grad_norm": 0.12949034571647644, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 27680 + }, + { + "epoch": 0.10539497423170908, + "grad_norm": 0.13007037341594696, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 27690 + }, + { + "epoch": 0.10543303669983177, + "grad_norm": 0.29032719135284424, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 27700 + }, + { + "epoch": 0.10547109916795444, + "grad_norm": 0.11723814159631729, + "learning_rate": 0.0005, + "loss": 2.1569, + "step": 27710 + }, + { + "epoch": 0.10550916163607713, + "grad_norm": 0.14339888095855713, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 27720 + }, + { + "epoch": 0.10554722410419981, + "grad_norm": 0.1379217654466629, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 27730 + }, + { + "epoch": 0.1055852865723225, + "grad_norm": 0.1293562948703766, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 27740 + }, + { + "epoch": 0.10562334904044518, + "grad_norm": 0.12406893074512482, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 27750 + }, + { + "epoch": 0.10566141150856787, + "grad_norm": 0.12450039386749268, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 27760 + }, + { + "epoch": 0.10569947397669055, + "grad_norm": 0.11588918417692184, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 27770 + }, + { + "epoch": 0.10573753644481322, + "grad_norm": 0.11539763957262039, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 27780 + }, + { + "epoch": 0.1057755989129359, + "grad_norm": 0.12182429432868958, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 27790 + }, + { + "epoch": 0.10581366138105859, + "grad_norm": 0.11426839977502823, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 27800 + }, + { + "epoch": 0.10585172384918128, + "grad_norm": 0.13639409840106964, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 27810 + }, + { + "epoch": 0.10588978631730396, + "grad_norm": 0.11492707580327988, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 27820 + }, + { + "epoch": 0.10592784878542665, + "grad_norm": 0.11488751322031021, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 27830 + }, + { + "epoch": 0.10596591125354933, + "grad_norm": 0.12409866601228714, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 27840 + }, + { + "epoch": 0.106003973721672, + "grad_norm": 0.11955158412456512, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 27850 + }, + { + "epoch": 0.10604203618979469, + "grad_norm": 0.12790343165397644, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 27860 + }, + { + "epoch": 0.10608009865791737, + "grad_norm": 0.13152866065502167, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 27870 + }, + { + "epoch": 0.10611816112604006, + "grad_norm": 0.13242776691913605, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 27880 + }, + { + "epoch": 0.10615622359416274, + "grad_norm": 0.13706664741039276, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 27890 + }, + { + "epoch": 0.10619428606228543, + "grad_norm": 0.11688551306724548, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 27900 + }, + { + "epoch": 0.10623234853040811, + "grad_norm": 0.14044661819934845, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 27910 + }, + { + "epoch": 0.1062704109985308, + "grad_norm": 0.17804557085037231, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 27920 + }, + { + "epoch": 0.10630847346665347, + "grad_norm": 0.11811062693595886, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 27930 + }, + { + "epoch": 0.10634653593477615, + "grad_norm": 0.1283624917268753, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 27940 + }, + { + "epoch": 0.10638459840289884, + "grad_norm": 0.1313037872314453, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 27950 + }, + { + "epoch": 0.10642266087102152, + "grad_norm": 0.1112949475646019, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 27960 + }, + { + "epoch": 0.1064607233391442, + "grad_norm": 0.1099490225315094, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 27970 + }, + { + "epoch": 0.10649878580726689, + "grad_norm": 0.1101028248667717, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 27980 + }, + { + "epoch": 0.10653684827538958, + "grad_norm": 0.12001043558120728, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 27990 + }, + { + "epoch": 0.10657491074351225, + "grad_norm": 0.11522848159074783, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 28000 + }, + { + "epoch": 0.10661297321163493, + "grad_norm": 0.12180564552545547, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 28010 + }, + { + "epoch": 0.10665103567975762, + "grad_norm": 0.13159476220607758, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 28020 + }, + { + "epoch": 0.1066890981478803, + "grad_norm": 0.12761522829532623, + "learning_rate": 0.0005, + "loss": 2.1548, + "step": 28030 + }, + { + "epoch": 0.10672716061600299, + "grad_norm": 0.11224870383739471, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 28040 + }, + { + "epoch": 0.10676522308412567, + "grad_norm": 0.11419076472520828, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 28050 + }, + { + "epoch": 0.10680328555224836, + "grad_norm": 0.12258830666542053, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 28060 + }, + { + "epoch": 0.10684134802037103, + "grad_norm": 0.11559466272592545, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 28070 + }, + { + "epoch": 0.10687941048849371, + "grad_norm": 0.1134631335735321, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 28080 + }, + { + "epoch": 0.1069174729566164, + "grad_norm": 0.13319198787212372, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 28090 + }, + { + "epoch": 0.10695553542473908, + "grad_norm": 0.1264987289905548, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 28100 + }, + { + "epoch": 0.10699359789286177, + "grad_norm": 0.12210672348737717, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 28110 + }, + { + "epoch": 0.10703166036098445, + "grad_norm": 0.12532632052898407, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 28120 + }, + { + "epoch": 0.10706972282910714, + "grad_norm": 0.11959918588399887, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 28130 + }, + { + "epoch": 0.10710778529722981, + "grad_norm": 0.2721995413303375, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 28140 + }, + { + "epoch": 0.10714584776535249, + "grad_norm": 0.11988692730665207, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 28150 + }, + { + "epoch": 0.10718391023347518, + "grad_norm": 0.12983128428459167, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 28160 + }, + { + "epoch": 0.10722197270159786, + "grad_norm": 0.11977101862430573, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 28170 + }, + { + "epoch": 0.10726003516972055, + "grad_norm": 0.12527891993522644, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 28180 + }, + { + "epoch": 0.10729809763784323, + "grad_norm": 0.12681803107261658, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 28190 + }, + { + "epoch": 0.10733616010596592, + "grad_norm": 0.12825150787830353, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 28200 + }, + { + "epoch": 0.10737422257408859, + "grad_norm": 0.12182246893644333, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 28210 + }, + { + "epoch": 0.10741228504221127, + "grad_norm": 0.13522516191005707, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 28220 + }, + { + "epoch": 0.10745034751033396, + "grad_norm": 0.12266946583986282, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 28230 + }, + { + "epoch": 0.10748840997845664, + "grad_norm": 0.11807511746883392, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 28240 + }, + { + "epoch": 0.10752647244657933, + "grad_norm": 0.1240006610751152, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 28250 + }, + { + "epoch": 0.10756453491470201, + "grad_norm": 0.11999105662107468, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 28260 + }, + { + "epoch": 0.1076025973828247, + "grad_norm": 0.11139243096113205, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 28270 + }, + { + "epoch": 0.10764065985094738, + "grad_norm": 0.1327504962682724, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 28280 + }, + { + "epoch": 0.10767872231907005, + "grad_norm": 0.12326204031705856, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 28290 + }, + { + "epoch": 0.10771678478719274, + "grad_norm": 0.11592575907707214, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 28300 + }, + { + "epoch": 0.10775484725531542, + "grad_norm": 0.12289170175790787, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 28310 + }, + { + "epoch": 0.10779290972343811, + "grad_norm": 0.1368006020784378, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 28320 + }, + { + "epoch": 0.10783097219156079, + "grad_norm": 0.12864282727241516, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 28330 + }, + { + "epoch": 0.10786903465968348, + "grad_norm": 0.12400609254837036, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 28340 + }, + { + "epoch": 0.10790709712780616, + "grad_norm": 0.1271352469921112, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 28350 + }, + { + "epoch": 0.10794515959592883, + "grad_norm": 0.1236211434006691, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 28360 + }, + { + "epoch": 0.10798322206405152, + "grad_norm": 0.1169639304280281, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 28370 + }, + { + "epoch": 0.1080212845321742, + "grad_norm": 0.1226038709282875, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 28380 + }, + { + "epoch": 0.10805934700029689, + "grad_norm": 0.12057903409004211, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 28390 + }, + { + "epoch": 0.10809740946841957, + "grad_norm": 0.12454306334257126, + "learning_rate": 0.0005, + "loss": 2.1536, + "step": 28400 + }, + { + "epoch": 0.10813547193654226, + "grad_norm": 0.10948032885789871, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 28410 + }, + { + "epoch": 0.10817353440466494, + "grad_norm": 0.13948583602905273, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 28420 + }, + { + "epoch": 0.10821159687278761, + "grad_norm": 0.11749628931283951, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 28430 + }, + { + "epoch": 0.1082496593409103, + "grad_norm": 0.1229894757270813, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 28440 + }, + { + "epoch": 0.10828772180903298, + "grad_norm": 0.12310022860765457, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 28450 + }, + { + "epoch": 0.10832578427715567, + "grad_norm": 0.1235974133014679, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 28460 + }, + { + "epoch": 0.10836384674527835, + "grad_norm": 0.12037523090839386, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 28470 + }, + { + "epoch": 0.10840190921340104, + "grad_norm": 0.1255193054676056, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 28480 + }, + { + "epoch": 0.10843997168152372, + "grad_norm": 0.11792249232530594, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 28490 + }, + { + "epoch": 0.1084780341496464, + "grad_norm": 0.11522363871335983, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 28500 + }, + { + "epoch": 0.10851609661776908, + "grad_norm": 0.1397753655910492, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 28510 + }, + { + "epoch": 0.10855415908589176, + "grad_norm": 0.1190962940454483, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 28520 + }, + { + "epoch": 0.10859222155401445, + "grad_norm": 0.12126445770263672, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 28530 + }, + { + "epoch": 0.10863028402213713, + "grad_norm": 0.1267649233341217, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 28540 + }, + { + "epoch": 0.10866834649025982, + "grad_norm": 0.11734792590141296, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 28550 + }, + { + "epoch": 0.1087064089583825, + "grad_norm": 0.13454262912273407, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 28560 + }, + { + "epoch": 0.10874447142650517, + "grad_norm": 0.12421667575836182, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 28570 + }, + { + "epoch": 0.10878253389462786, + "grad_norm": 0.12329269200563431, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 28580 + }, + { + "epoch": 0.10882059636275054, + "grad_norm": 0.1253172904253006, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 28590 + }, + { + "epoch": 0.10885865883087323, + "grad_norm": 0.12167198956012726, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 28600 + }, + { + "epoch": 0.10889672129899591, + "grad_norm": 0.11725557595491409, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 28610 + }, + { + "epoch": 0.1089347837671186, + "grad_norm": 0.14386829733848572, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 28620 + }, + { + "epoch": 0.10897284623524128, + "grad_norm": 0.11399870365858078, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 28630 + }, + { + "epoch": 0.10901090870336395, + "grad_norm": 0.12038716673851013, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 28640 + }, + { + "epoch": 0.10904897117148664, + "grad_norm": 0.12456893920898438, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 28650 + }, + { + "epoch": 0.10908703363960932, + "grad_norm": 0.13040480017662048, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 28660 + }, + { + "epoch": 0.10912509610773201, + "grad_norm": 0.12755368649959564, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 28670 + }, + { + "epoch": 0.1091631585758547, + "grad_norm": 0.1210189163684845, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 28680 + }, + { + "epoch": 0.10920122104397738, + "grad_norm": 0.12757094204425812, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 28690 + }, + { + "epoch": 0.10923928351210006, + "grad_norm": 0.12309765070676804, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 28700 + }, + { + "epoch": 0.10927734598022275, + "grad_norm": 0.1145697832107544, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 28710 + }, + { + "epoch": 0.10931540844834542, + "grad_norm": 0.22067514061927795, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 28720 + }, + { + "epoch": 0.1093534709164681, + "grad_norm": 0.12916387617588043, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 28730 + }, + { + "epoch": 0.10939153338459079, + "grad_norm": 0.13922347128391266, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 28740 + }, + { + "epoch": 0.10942959585271347, + "grad_norm": 0.12103313952684402, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 28750 + }, + { + "epoch": 0.10946765832083616, + "grad_norm": 0.11259905248880386, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 28760 + }, + { + "epoch": 0.10950572078895884, + "grad_norm": 0.11306339502334595, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 28770 + }, + { + "epoch": 0.10954378325708153, + "grad_norm": 0.12106167525053024, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 28780 + }, + { + "epoch": 0.1095818457252042, + "grad_norm": 0.1086219921708107, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 28790 + }, + { + "epoch": 0.10961990819332688, + "grad_norm": 0.12661181390285492, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 28800 + }, + { + "epoch": 0.10965797066144957, + "grad_norm": 0.11590348929166794, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 28810 + }, + { + "epoch": 0.10969603312957225, + "grad_norm": 0.12942777574062347, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 28820 + }, + { + "epoch": 0.10973409559769494, + "grad_norm": 0.12246377021074295, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 28830 + }, + { + "epoch": 0.10977215806581762, + "grad_norm": 0.11831900477409363, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 28840 + }, + { + "epoch": 0.10981022053394031, + "grad_norm": 0.12228821218013763, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 28850 + }, + { + "epoch": 0.10984828300206298, + "grad_norm": 0.12343055009841919, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 28860 + }, + { + "epoch": 0.10988634547018566, + "grad_norm": 0.12655648589134216, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 28870 + }, + { + "epoch": 0.10992440793830835, + "grad_norm": 0.12234000116586685, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 28880 + }, + { + "epoch": 0.10996247040643103, + "grad_norm": 0.12989850342273712, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 28890 + }, + { + "epoch": 0.11000053287455372, + "grad_norm": 0.11979439854621887, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 28900 + }, + { + "epoch": 0.1100385953426764, + "grad_norm": 0.133430615067482, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 28910 + }, + { + "epoch": 0.11007665781079909, + "grad_norm": 0.11913889646530151, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 28920 + }, + { + "epoch": 0.11011472027892176, + "grad_norm": 0.12442494928836823, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 28930 + }, + { + "epoch": 0.11015278274704444, + "grad_norm": 0.14333483576774597, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 28940 + }, + { + "epoch": 0.11019084521516713, + "grad_norm": 0.12277472019195557, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 28950 + }, + { + "epoch": 0.11022890768328981, + "grad_norm": 0.10870832204818726, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 28960 + }, + { + "epoch": 0.1102669701514125, + "grad_norm": 0.12688913941383362, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 28970 + }, + { + "epoch": 0.11030503261953518, + "grad_norm": 0.12579499185085297, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 28980 + }, + { + "epoch": 0.11034309508765787, + "grad_norm": 0.1273648887872696, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 28990 + }, + { + "epoch": 0.11038115755578054, + "grad_norm": 0.12454326450824738, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 29000 + }, + { + "epoch": 0.11041922002390323, + "grad_norm": 0.12940780818462372, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 29010 + }, + { + "epoch": 0.11045728249202591, + "grad_norm": 0.11854497343301773, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 29020 + }, + { + "epoch": 0.1104953449601486, + "grad_norm": 0.11319740861654282, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 29030 + }, + { + "epoch": 0.11053340742827128, + "grad_norm": 0.12677529454231262, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 29040 + }, + { + "epoch": 0.11057146989639396, + "grad_norm": 0.12414832413196564, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 29050 + }, + { + "epoch": 0.11060953236451665, + "grad_norm": 0.12062861770391464, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 29060 + }, + { + "epoch": 0.11064759483263933, + "grad_norm": 0.13710670173168182, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 29070 + }, + { + "epoch": 0.110685657300762, + "grad_norm": 0.13636557757854462, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 29080 + }, + { + "epoch": 0.11072371976888469, + "grad_norm": 0.13088567554950714, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 29090 + }, + { + "epoch": 0.11076178223700738, + "grad_norm": 0.11774623394012451, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 29100 + }, + { + "epoch": 0.11079984470513006, + "grad_norm": 0.13022582232952118, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 29110 + }, + { + "epoch": 0.11083790717325274, + "grad_norm": 0.1218847781419754, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 29120 + }, + { + "epoch": 0.11087596964137543, + "grad_norm": 0.13264073431491852, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 29130 + }, + { + "epoch": 0.11091403210949811, + "grad_norm": 0.12379482388496399, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 29140 + }, + { + "epoch": 0.11095209457762079, + "grad_norm": 0.1278836876153946, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 29150 + }, + { + "epoch": 0.11099015704574347, + "grad_norm": 0.12311187386512756, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 29160 + }, + { + "epoch": 0.11102821951386616, + "grad_norm": 0.11225911229848862, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 29170 + }, + { + "epoch": 0.11106628198198884, + "grad_norm": 0.12489734590053558, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 29180 + }, + { + "epoch": 0.11110434445011153, + "grad_norm": 0.131637841463089, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 29190 + }, + { + "epoch": 0.11114240691823421, + "grad_norm": 0.1125171110033989, + "learning_rate": 0.0005, + "loss": 2.1539, + "step": 29200 + }, + { + "epoch": 0.1111804693863569, + "grad_norm": 0.120730459690094, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 29210 + }, + { + "epoch": 0.11121853185447957, + "grad_norm": 0.13983865082263947, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 29220 + }, + { + "epoch": 0.11125659432260225, + "grad_norm": 0.12853026390075684, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 29230 + }, + { + "epoch": 0.11129465679072494, + "grad_norm": 0.12522001564502716, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 29240 + }, + { + "epoch": 0.11133271925884762, + "grad_norm": 0.15114690363407135, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 29250 + }, + { + "epoch": 0.1113707817269703, + "grad_norm": 0.11838917434215546, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 29260 + }, + { + "epoch": 0.11140884419509299, + "grad_norm": 0.11473576724529266, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 29270 + }, + { + "epoch": 0.11144690666321568, + "grad_norm": 0.11432286351919174, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 29280 + }, + { + "epoch": 0.11148496913133835, + "grad_norm": 0.12724675238132477, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 29290 + }, + { + "epoch": 0.11152303159946103, + "grad_norm": 0.1361140012741089, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 29300 + }, + { + "epoch": 0.11156109406758372, + "grad_norm": 0.11614620685577393, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 29310 + }, + { + "epoch": 0.1115991565357064, + "grad_norm": 0.12610851228237152, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 29320 + }, + { + "epoch": 0.11163721900382909, + "grad_norm": 0.12461981177330017, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 29330 + }, + { + "epoch": 0.11167528147195177, + "grad_norm": 0.11165129393339157, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 29340 + }, + { + "epoch": 0.11171334394007446, + "grad_norm": 0.16419917345046997, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 29350 + }, + { + "epoch": 0.11175140640819713, + "grad_norm": 0.1316666156053543, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 29360 + }, + { + "epoch": 0.11178946887631981, + "grad_norm": 0.12397214025259018, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 29370 + }, + { + "epoch": 0.1118275313444425, + "grad_norm": 0.12347505986690521, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 29380 + }, + { + "epoch": 0.11186559381256518, + "grad_norm": 0.13741114735603333, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 29390 + }, + { + "epoch": 0.11190365628068787, + "grad_norm": 0.13383878767490387, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 29400 + }, + { + "epoch": 0.11194171874881055, + "grad_norm": 0.1291329264640808, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 29410 + }, + { + "epoch": 0.11197978121693324, + "grad_norm": 0.132411926984787, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 29420 + }, + { + "epoch": 0.11201784368505592, + "grad_norm": 0.11728857457637787, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 29430 + }, + { + "epoch": 0.11205590615317859, + "grad_norm": 0.11532270163297653, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 29440 + }, + { + "epoch": 0.11209396862130128, + "grad_norm": 0.11879745870828629, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 29450 + }, + { + "epoch": 0.11213203108942396, + "grad_norm": 0.1252891719341278, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 29460 + }, + { + "epoch": 0.11217009355754665, + "grad_norm": 0.11775480210781097, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 29470 + }, + { + "epoch": 0.11220815602566933, + "grad_norm": 0.13671059906482697, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 29480 + }, + { + "epoch": 0.11224621849379202, + "grad_norm": 0.13583379983901978, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 29490 + }, + { + "epoch": 0.1122842809619147, + "grad_norm": 0.1581609696149826, + "learning_rate": 0.0005, + "loss": 2.1558, + "step": 29500 + }, + { + "epoch": 0.11232234343003737, + "grad_norm": 0.12587234377861023, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 29510 + }, + { + "epoch": 0.11236040589816006, + "grad_norm": 0.12166285514831543, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 29520 + }, + { + "epoch": 0.11239846836628274, + "grad_norm": 0.12624861299991608, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 29530 + }, + { + "epoch": 0.11243653083440543, + "grad_norm": 0.10956565290689468, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 29540 + }, + { + "epoch": 0.11247459330252811, + "grad_norm": 0.12157244980335236, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 29550 + }, + { + "epoch": 0.1125126557706508, + "grad_norm": 0.12983137369155884, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 29560 + }, + { + "epoch": 0.11255071823877348, + "grad_norm": 0.12666325271129608, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 29570 + }, + { + "epoch": 0.11258878070689615, + "grad_norm": 0.1175815686583519, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 29580 + }, + { + "epoch": 0.11262684317501884, + "grad_norm": 0.12979090213775635, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 29590 + }, + { + "epoch": 0.11266490564314152, + "grad_norm": 0.12051022797822952, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 29600 + }, + { + "epoch": 0.1127029681112642, + "grad_norm": 0.11917869746685028, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 29610 + }, + { + "epoch": 0.11274103057938689, + "grad_norm": 0.11437923461198807, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 29620 + }, + { + "epoch": 0.11277909304750958, + "grad_norm": 0.11656700819730759, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 29630 + }, + { + "epoch": 0.11281715551563226, + "grad_norm": 0.11907721310853958, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 29640 + }, + { + "epoch": 0.11285521798375493, + "grad_norm": 0.14156955480575562, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 29650 + }, + { + "epoch": 0.11289328045187762, + "grad_norm": 0.12269634008407593, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 29660 + }, + { + "epoch": 0.1129313429200003, + "grad_norm": 0.11319083720445633, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 29670 + }, + { + "epoch": 0.11296940538812299, + "grad_norm": 0.11652123928070068, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 29680 + }, + { + "epoch": 0.11300746785624567, + "grad_norm": 0.12396130710840225, + "learning_rate": 0.0005, + "loss": 2.1605, + "step": 29690 + }, + { + "epoch": 0.11304553032436836, + "grad_norm": 0.1111893430352211, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 29700 + }, + { + "epoch": 0.11308359279249104, + "grad_norm": 0.12545843422412872, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 29710 + }, + { + "epoch": 0.11312165526061371, + "grad_norm": 0.1228807270526886, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 29720 + }, + { + "epoch": 0.1131597177287364, + "grad_norm": 0.11801854521036148, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 29730 + }, + { + "epoch": 0.11319778019685908, + "grad_norm": 0.12060797214508057, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 29740 + }, + { + "epoch": 0.11323584266498177, + "grad_norm": 0.12158960849046707, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 29750 + }, + { + "epoch": 0.11327390513310445, + "grad_norm": 0.12883733212947845, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 29760 + }, + { + "epoch": 0.11331196760122714, + "grad_norm": 0.12048101425170898, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 29770 + }, + { + "epoch": 0.11335003006934982, + "grad_norm": 0.12671560049057007, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 29780 + }, + { + "epoch": 0.11338809253747249, + "grad_norm": 0.1330437809228897, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 29790 + }, + { + "epoch": 0.11342615500559518, + "grad_norm": 0.12295544892549515, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 29800 + }, + { + "epoch": 0.11346421747371786, + "grad_norm": 0.12095817923545837, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 29810 + }, + { + "epoch": 0.11350227994184055, + "grad_norm": 0.11469054967164993, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 29820 + }, + { + "epoch": 0.11354034240996323, + "grad_norm": 0.12663350999355316, + "learning_rate": 0.0005, + "loss": 2.1492, + "step": 29830 + }, + { + "epoch": 0.11357840487808592, + "grad_norm": 0.12309986352920532, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 29840 + }, + { + "epoch": 0.1136164673462086, + "grad_norm": 0.1423839032649994, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 29850 + }, + { + "epoch": 0.11365452981433129, + "grad_norm": 0.13435962796211243, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 29860 + }, + { + "epoch": 0.11369259228245396, + "grad_norm": 0.12411796301603317, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 29870 + }, + { + "epoch": 0.11373065475057664, + "grad_norm": 0.11389677971601486, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 29880 + }, + { + "epoch": 0.11376871721869933, + "grad_norm": 0.11975105851888657, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 29890 + }, + { + "epoch": 0.11380677968682201, + "grad_norm": 0.11731837689876556, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 29900 + }, + { + "epoch": 0.1138448421549447, + "grad_norm": 0.12533852458000183, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 29910 + }, + { + "epoch": 0.11388290462306738, + "grad_norm": 0.11128509044647217, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 29920 + }, + { + "epoch": 0.11392096709119007, + "grad_norm": 0.1130741611123085, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 29930 + }, + { + "epoch": 0.11395902955931274, + "grad_norm": 0.11497542262077332, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 29940 + }, + { + "epoch": 0.11399709202743542, + "grad_norm": 0.13669130206108093, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 29950 + }, + { + "epoch": 0.11403515449555811, + "grad_norm": 0.1437700241804123, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 29960 + }, + { + "epoch": 0.11407321696368079, + "grad_norm": 0.12841551005840302, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 29970 + }, + { + "epoch": 0.11411127943180348, + "grad_norm": 0.13106156885623932, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 29980 + }, + { + "epoch": 0.11414934189992616, + "grad_norm": 0.12786227464675903, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 29990 + }, + { + "epoch": 0.11418740436804885, + "grad_norm": 0.1266452670097351, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 30000 + }, + { + "epoch": 0.11422546683617152, + "grad_norm": 0.12195294350385666, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 30010 + }, + { + "epoch": 0.1142635293042942, + "grad_norm": 0.1278504729270935, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 30020 + }, + { + "epoch": 0.11430159177241689, + "grad_norm": 0.12020813673734665, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 30030 + }, + { + "epoch": 0.11433965424053957, + "grad_norm": 0.12903952598571777, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 30040 + }, + { + "epoch": 0.11437771670866226, + "grad_norm": 0.12148747593164444, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 30050 + }, + { + "epoch": 0.11441577917678494, + "grad_norm": 0.12210709601640701, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 30060 + }, + { + "epoch": 0.11445384164490763, + "grad_norm": 0.12444771826267242, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 30070 + }, + { + "epoch": 0.1144919041130303, + "grad_norm": 0.12810124456882477, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 30080 + }, + { + "epoch": 0.11452996658115298, + "grad_norm": 0.13028530776500702, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 30090 + }, + { + "epoch": 0.11456802904927567, + "grad_norm": 0.1409108191728592, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 30100 + }, + { + "epoch": 0.11460609151739835, + "grad_norm": 0.12987381219863892, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 30110 + }, + { + "epoch": 0.11464415398552104, + "grad_norm": 0.12352680414915085, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 30120 + }, + { + "epoch": 0.11468221645364372, + "grad_norm": 0.12190854549407959, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 30130 + }, + { + "epoch": 0.11472027892176641, + "grad_norm": 0.13718342781066895, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 30140 + }, + { + "epoch": 0.11475834138988908, + "grad_norm": 0.16431915760040283, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 30150 + }, + { + "epoch": 0.11479640385801176, + "grad_norm": 0.11458581686019897, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 30160 + }, + { + "epoch": 0.11483446632613445, + "grad_norm": 0.1225656121969223, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 30170 + }, + { + "epoch": 0.11487252879425713, + "grad_norm": 0.13149769604206085, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 30180 + }, + { + "epoch": 0.11491059126237982, + "grad_norm": 0.1142268106341362, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 30190 + }, + { + "epoch": 0.1149486537305025, + "grad_norm": 0.12473702430725098, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 30200 + }, + { + "epoch": 0.11498671619862519, + "grad_norm": 0.12434647232294083, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 30210 + }, + { + "epoch": 0.11502477866674787, + "grad_norm": 0.12315231561660767, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 30220 + }, + { + "epoch": 0.11506284113487054, + "grad_norm": 0.11632184684276581, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 30230 + }, + { + "epoch": 0.11510090360299323, + "grad_norm": 0.1280062347650528, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 30240 + }, + { + "epoch": 0.11513896607111591, + "grad_norm": 0.14986898005008698, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 30250 + }, + { + "epoch": 0.1151770285392386, + "grad_norm": 0.12317207455635071, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 30260 + }, + { + "epoch": 0.11521509100736128, + "grad_norm": 0.12557360529899597, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 30270 + }, + { + "epoch": 0.11525315347548397, + "grad_norm": 0.5975351333618164, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 30280 + }, + { + "epoch": 0.11529121594360665, + "grad_norm": 0.12424667179584503, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 30290 + }, + { + "epoch": 0.11532927841172932, + "grad_norm": 0.11362697929143906, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 30300 + }, + { + "epoch": 0.11536734087985201, + "grad_norm": 0.1270321160554886, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 30310 + }, + { + "epoch": 0.1154054033479747, + "grad_norm": 0.12559852004051208, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 30320 + }, + { + "epoch": 0.11544346581609738, + "grad_norm": 0.12761664390563965, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 30330 + }, + { + "epoch": 0.11548152828422006, + "grad_norm": 0.1325635462999344, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 30340 + }, + { + "epoch": 0.11551959075234275, + "grad_norm": 0.10914253443479538, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 30350 + }, + { + "epoch": 0.11555765322046543, + "grad_norm": 0.14419271051883698, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 30360 + }, + { + "epoch": 0.1155957156885881, + "grad_norm": 0.12012334167957306, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 30370 + }, + { + "epoch": 0.11563377815671079, + "grad_norm": 0.14006179571151733, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 30380 + }, + { + "epoch": 0.11567184062483347, + "grad_norm": 0.1367906928062439, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 30390 + }, + { + "epoch": 0.11570990309295616, + "grad_norm": 0.11864303797483444, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 30400 + }, + { + "epoch": 0.11574796556107884, + "grad_norm": 0.12833933532238007, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 30410 + }, + { + "epoch": 0.11578602802920153, + "grad_norm": 0.1643187254667282, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 30420 + }, + { + "epoch": 0.11582409049732421, + "grad_norm": 0.12334811687469482, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 30430 + }, + { + "epoch": 0.11586215296544689, + "grad_norm": 0.11577615141868591, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 30440 + }, + { + "epoch": 0.11590021543356957, + "grad_norm": 0.12428303807973862, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 30450 + }, + { + "epoch": 0.11593827790169225, + "grad_norm": 0.12436975538730621, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 30460 + }, + { + "epoch": 0.11597634036981494, + "grad_norm": 0.12434789538383484, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 30470 + }, + { + "epoch": 0.11601440283793762, + "grad_norm": 0.11997382342815399, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 30480 + }, + { + "epoch": 0.11605246530606031, + "grad_norm": 0.1221473217010498, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 30490 + }, + { + "epoch": 0.116090527774183, + "grad_norm": 0.14918829500675201, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 30500 + }, + { + "epoch": 0.11612859024230567, + "grad_norm": 0.1305091381072998, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 30510 + }, + { + "epoch": 0.11616665271042835, + "grad_norm": 0.1277039498090744, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 30520 + }, + { + "epoch": 0.11620471517855104, + "grad_norm": 0.11043685674667358, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 30530 + }, + { + "epoch": 0.11624277764667372, + "grad_norm": 0.12230376899242401, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 30540 + }, + { + "epoch": 0.1162808401147964, + "grad_norm": 0.11290695518255234, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 30550 + }, + { + "epoch": 0.11631890258291909, + "grad_norm": 0.1329115480184555, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 30560 + }, + { + "epoch": 0.11635696505104177, + "grad_norm": 0.12522819638252258, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 30570 + }, + { + "epoch": 0.11639502751916446, + "grad_norm": 0.1319337785243988, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 30580 + }, + { + "epoch": 0.11643308998728713, + "grad_norm": 0.11786749213933945, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 30590 + }, + { + "epoch": 0.11647115245540982, + "grad_norm": 0.12414077669382095, + "learning_rate": 0.0005, + "loss": 2.1534, + "step": 30600 + }, + { + "epoch": 0.1165092149235325, + "grad_norm": 0.12328702211380005, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 30610 + }, + { + "epoch": 0.11654727739165519, + "grad_norm": 0.12226787209510803, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 30620 + }, + { + "epoch": 0.11658533985977787, + "grad_norm": 0.11650876700878143, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 30630 + }, + { + "epoch": 0.11662340232790055, + "grad_norm": 0.12521617114543915, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 30640 + }, + { + "epoch": 0.11666146479602324, + "grad_norm": 0.12409238517284393, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 30650 + }, + { + "epoch": 0.11669952726414591, + "grad_norm": 0.1295614391565323, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 30660 + }, + { + "epoch": 0.1167375897322686, + "grad_norm": 0.14065662026405334, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 30670 + }, + { + "epoch": 0.11677565220039128, + "grad_norm": 0.12377720326185226, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 30680 + }, + { + "epoch": 0.11681371466851397, + "grad_norm": 0.1270572394132614, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 30690 + }, + { + "epoch": 0.11685177713663665, + "grad_norm": 0.12031006813049316, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 30700 + }, + { + "epoch": 0.11688983960475934, + "grad_norm": 0.1141844317317009, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 30710 + }, + { + "epoch": 0.11692790207288202, + "grad_norm": 0.11144188046455383, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 30720 + }, + { + "epoch": 0.11696596454100469, + "grad_norm": 0.1195073276758194, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 30730 + }, + { + "epoch": 0.11700402700912738, + "grad_norm": 0.11948345601558685, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 30740 + }, + { + "epoch": 0.11704208947725006, + "grad_norm": 0.11476597934961319, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 30750 + }, + { + "epoch": 0.11708015194537275, + "grad_norm": 0.12001378834247589, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 30760 + }, + { + "epoch": 0.11711821441349543, + "grad_norm": 0.13760356605052948, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 30770 + }, + { + "epoch": 0.11715627688161812, + "grad_norm": 0.12049812078475952, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 30780 + }, + { + "epoch": 0.1171943393497408, + "grad_norm": 0.11435163021087646, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 30790 + }, + { + "epoch": 0.11723240181786347, + "grad_norm": 0.1208324208855629, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 30800 + }, + { + "epoch": 0.11727046428598616, + "grad_norm": 0.1224512904882431, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 30810 + }, + { + "epoch": 0.11730852675410884, + "grad_norm": 0.14688503742218018, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 30820 + }, + { + "epoch": 0.11734658922223153, + "grad_norm": 0.12300082296133041, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 30830 + }, + { + "epoch": 0.11738465169035421, + "grad_norm": 0.13545553386211395, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 30840 + }, + { + "epoch": 0.1174227141584769, + "grad_norm": 0.13323470950126648, + "learning_rate": 0.0005, + "loss": 2.1581, + "step": 30850 + }, + { + "epoch": 0.11746077662659958, + "grad_norm": 0.11637146770954132, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 30860 + }, + { + "epoch": 0.11749883909472225, + "grad_norm": 0.11425112932920456, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 30870 + }, + { + "epoch": 0.11753690156284494, + "grad_norm": 0.11201392859220505, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 30880 + }, + { + "epoch": 0.11757496403096762, + "grad_norm": 0.11493717133998871, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 30890 + }, + { + "epoch": 0.1176130264990903, + "grad_norm": 0.11629050970077515, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 30900 + }, + { + "epoch": 0.11765108896721299, + "grad_norm": 0.1346382051706314, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 30910 + }, + { + "epoch": 0.11768915143533568, + "grad_norm": 0.13523519039154053, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 30920 + }, + { + "epoch": 0.11772721390345836, + "grad_norm": 0.13149811327457428, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 30930 + }, + { + "epoch": 0.11776527637158103, + "grad_norm": 0.11538290977478027, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 30940 + }, + { + "epoch": 0.11780333883970372, + "grad_norm": 0.12799760699272156, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 30950 + }, + { + "epoch": 0.1178414013078264, + "grad_norm": 0.11699873208999634, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 30960 + }, + { + "epoch": 0.11787946377594909, + "grad_norm": 0.12116017192602158, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 30970 + }, + { + "epoch": 0.11791752624407177, + "grad_norm": 0.12173012644052505, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 30980 + }, + { + "epoch": 0.11795558871219446, + "grad_norm": 0.12548451125621796, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 30990 + }, + { + "epoch": 0.11799365118031714, + "grad_norm": 0.1358858197927475, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 31000 + }, + { + "epoch": 0.11803171364843983, + "grad_norm": 0.14300884306430817, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 31010 + }, + { + "epoch": 0.1180697761165625, + "grad_norm": 0.14722184836864471, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 31020 + }, + { + "epoch": 0.11810783858468518, + "grad_norm": 0.11908479779958725, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 31030 + }, + { + "epoch": 0.11814590105280787, + "grad_norm": 0.11892859637737274, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 31040 + }, + { + "epoch": 0.11818396352093055, + "grad_norm": 0.11230923235416412, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 31050 + }, + { + "epoch": 0.11822202598905324, + "grad_norm": 0.11231476813554764, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 31060 + }, + { + "epoch": 0.11826008845717592, + "grad_norm": 0.13890352845191956, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 31070 + }, + { + "epoch": 0.1182981509252986, + "grad_norm": 0.12718895077705383, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 31080 + }, + { + "epoch": 0.11833621339342128, + "grad_norm": 0.12791119515895844, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 31090 + }, + { + "epoch": 0.11837427586154396, + "grad_norm": 0.11950281262397766, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 31100 + }, + { + "epoch": 0.11841233832966665, + "grad_norm": 0.12373452633619308, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 31110 + }, + { + "epoch": 0.11845040079778933, + "grad_norm": 0.11845168471336365, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 31120 + }, + { + "epoch": 0.11848846326591202, + "grad_norm": 0.12394628673791885, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 31130 + }, + { + "epoch": 0.1185265257340347, + "grad_norm": 0.13420367240905762, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 31140 + }, + { + "epoch": 0.11856458820215739, + "grad_norm": 0.12310107797384262, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 31150 + }, + { + "epoch": 0.11860265067028006, + "grad_norm": 0.11265326291322708, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 31160 + }, + { + "epoch": 0.11864071313840274, + "grad_norm": 0.11336939036846161, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 31170 + }, + { + "epoch": 0.11867877560652543, + "grad_norm": 0.12562648952007294, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 31180 + }, + { + "epoch": 0.11871683807464811, + "grad_norm": 0.12459778040647507, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 31190 + }, + { + "epoch": 0.1187549005427708, + "grad_norm": 0.1319107562303543, + "learning_rate": 0.0005, + "loss": 2.1517, + "step": 31200 + }, + { + "epoch": 0.11879296301089348, + "grad_norm": 0.1265908181667328, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 31210 + }, + { + "epoch": 0.11883102547901617, + "grad_norm": 0.13179562985897064, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 31220 + }, + { + "epoch": 0.11886908794713884, + "grad_norm": 0.12190480530261993, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 31230 + }, + { + "epoch": 0.11890715041526152, + "grad_norm": 0.12243692576885223, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 31240 + }, + { + "epoch": 0.11894521288338421, + "grad_norm": 0.13950267434120178, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 31250 + }, + { + "epoch": 0.11898327535150689, + "grad_norm": 0.12174614518880844, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 31260 + }, + { + "epoch": 0.11902133781962958, + "grad_norm": 0.12926694750785828, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 31270 + }, + { + "epoch": 0.11905940028775226, + "grad_norm": 0.12084470689296722, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 31280 + }, + { + "epoch": 0.11909746275587495, + "grad_norm": 0.13466468453407288, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 31290 + }, + { + "epoch": 0.11913552522399762, + "grad_norm": 0.12772592902183533, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 31300 + }, + { + "epoch": 0.1191735876921203, + "grad_norm": 0.11330266296863556, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 31310 + }, + { + "epoch": 0.11921165016024299, + "grad_norm": 0.12213649600744247, + "learning_rate": 0.0005, + "loss": 2.1512, + "step": 31320 + }, + { + "epoch": 0.11924971262836567, + "grad_norm": 0.134195476770401, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 31330 + }, + { + "epoch": 0.11928777509648836, + "grad_norm": 0.12048410624265671, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 31340 + }, + { + "epoch": 0.11932583756461104, + "grad_norm": 0.136560320854187, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 31350 + }, + { + "epoch": 0.11936390003273373, + "grad_norm": 0.117899589240551, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 31360 + }, + { + "epoch": 0.11940196250085641, + "grad_norm": 0.13653498888015747, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 31370 + }, + { + "epoch": 0.11944002496897908, + "grad_norm": 0.13502705097198486, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 31380 + }, + { + "epoch": 0.11947808743710177, + "grad_norm": 0.11919333785772324, + "learning_rate": 0.0005, + "loss": 2.162, + "step": 31390 + }, + { + "epoch": 0.11951614990522445, + "grad_norm": 0.13186244666576385, + "learning_rate": 0.0005, + "loss": 2.1608, + "step": 31400 + }, + { + "epoch": 0.11955421237334714, + "grad_norm": 0.11902909725904465, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 31410 + }, + { + "epoch": 0.11959227484146982, + "grad_norm": 0.11676482111215591, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 31420 + }, + { + "epoch": 0.11963033730959251, + "grad_norm": 0.13875927031040192, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 31430 + }, + { + "epoch": 0.11966839977771519, + "grad_norm": 0.1109924167394638, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 31440 + }, + { + "epoch": 0.11970646224583786, + "grad_norm": 0.1171596497297287, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 31450 + }, + { + "epoch": 0.11974452471396055, + "grad_norm": 0.14218342304229736, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 31460 + }, + { + "epoch": 0.11978258718208323, + "grad_norm": 0.1319875717163086, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 31470 + }, + { + "epoch": 0.11982064965020592, + "grad_norm": 0.12911342084407806, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 31480 + }, + { + "epoch": 0.1198587121183286, + "grad_norm": 0.12023784965276718, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 31490 + }, + { + "epoch": 0.11989677458645129, + "grad_norm": 0.12053267657756805, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 31500 + }, + { + "epoch": 0.11993483705457397, + "grad_norm": 0.11411106586456299, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 31510 + }, + { + "epoch": 0.11997289952269664, + "grad_norm": 0.11728163808584213, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 31520 + }, + { + "epoch": 0.12001096199081933, + "grad_norm": 0.12292353808879852, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 31530 + }, + { + "epoch": 0.12004902445894201, + "grad_norm": 0.12793846428394318, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 31540 + }, + { + "epoch": 0.1200870869270647, + "grad_norm": 0.11590106785297394, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 31550 + }, + { + "epoch": 0.12012514939518738, + "grad_norm": 0.12416353076696396, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 31560 + }, + { + "epoch": 0.12016321186331007, + "grad_norm": 0.12791256606578827, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 31570 + }, + { + "epoch": 0.12020127433143275, + "grad_norm": 0.12399394810199738, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 31580 + }, + { + "epoch": 0.12023933679955542, + "grad_norm": 0.11579488962888718, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 31590 + }, + { + "epoch": 0.12027739926767811, + "grad_norm": 0.1366676241159439, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 31600 + }, + { + "epoch": 0.1203154617358008, + "grad_norm": 0.13556742668151855, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 31610 + }, + { + "epoch": 0.12035352420392348, + "grad_norm": 0.11851835995912552, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 31620 + }, + { + "epoch": 0.12039158667204616, + "grad_norm": 0.11656619608402252, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 31630 + }, + { + "epoch": 0.12042964914016885, + "grad_norm": 0.13701976835727692, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 31640 + }, + { + "epoch": 0.12046771160829153, + "grad_norm": 0.13046278059482574, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 31650 + }, + { + "epoch": 0.1205057740764142, + "grad_norm": 0.12305255234241486, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 31660 + }, + { + "epoch": 0.12054383654453689, + "grad_norm": 0.12715394794940948, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 31670 + }, + { + "epoch": 0.12058189901265957, + "grad_norm": 0.11681829392910004, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 31680 + }, + { + "epoch": 0.12061996148078226, + "grad_norm": 0.11660370230674744, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 31690 + }, + { + "epoch": 0.12065802394890494, + "grad_norm": 0.1314779371023178, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 31700 + }, + { + "epoch": 0.12069608641702763, + "grad_norm": 0.13059192895889282, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 31710 + }, + { + "epoch": 0.12073414888515031, + "grad_norm": 0.14255261421203613, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 31720 + }, + { + "epoch": 0.120772211353273, + "grad_norm": 0.12946201860904694, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 31730 + }, + { + "epoch": 0.12081027382139567, + "grad_norm": 0.12346374243497849, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 31740 + }, + { + "epoch": 0.12084833628951835, + "grad_norm": 0.12624172866344452, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 31750 + }, + { + "epoch": 0.12088639875764104, + "grad_norm": 0.11160004884004593, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 31760 + }, + { + "epoch": 0.12092446122576372, + "grad_norm": 0.12799963355064392, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 31770 + }, + { + "epoch": 0.12096252369388641, + "grad_norm": 0.11344436556100845, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 31780 + }, + { + "epoch": 0.1210005861620091, + "grad_norm": 0.12890136241912842, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 31790 + }, + { + "epoch": 0.12103864863013178, + "grad_norm": 0.11942970007658005, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 31800 + }, + { + "epoch": 0.12107671109825445, + "grad_norm": 0.12209248542785645, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 31810 + }, + { + "epoch": 0.12111477356637713, + "grad_norm": 0.123374804854393, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 31820 + }, + { + "epoch": 0.12115283603449982, + "grad_norm": 0.11081527173519135, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 31830 + }, + { + "epoch": 0.1211908985026225, + "grad_norm": 0.11383873969316483, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 31840 + }, + { + "epoch": 0.12122896097074519, + "grad_norm": 0.12558364868164062, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 31850 + }, + { + "epoch": 0.12126702343886787, + "grad_norm": 0.12574754655361176, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 31860 + }, + { + "epoch": 0.12130508590699056, + "grad_norm": 0.12244436889886856, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 31870 + }, + { + "epoch": 0.12134314837511323, + "grad_norm": 0.13229981064796448, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 31880 + }, + { + "epoch": 0.12138121084323591, + "grad_norm": 0.11815565079450607, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 31890 + }, + { + "epoch": 0.1214192733113586, + "grad_norm": 0.13423267006874084, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 31900 + }, + { + "epoch": 0.12145733577948128, + "grad_norm": 0.11861734837293625, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 31910 + }, + { + "epoch": 0.12149539824760397, + "grad_norm": 0.1253175586462021, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 31920 + }, + { + "epoch": 0.12153346071572665, + "grad_norm": 0.124781534075737, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 31930 + }, + { + "epoch": 0.12157152318384934, + "grad_norm": 0.11508560925722122, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 31940 + }, + { + "epoch": 0.12160958565197201, + "grad_norm": 0.1265585869550705, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 31950 + }, + { + "epoch": 0.1216476481200947, + "grad_norm": 0.13155826926231384, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 31960 + }, + { + "epoch": 0.12168571058821738, + "grad_norm": 0.1338793784379959, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 31970 + }, + { + "epoch": 0.12172377305634006, + "grad_norm": 0.1322241872549057, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 31980 + }, + { + "epoch": 0.12176183552446275, + "grad_norm": 0.11637207865715027, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 31990 + }, + { + "epoch": 0.12179989799258543, + "grad_norm": 0.12174341827630997, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 32000 + }, + { + "epoch": 0.12183796046070812, + "grad_norm": 0.12612277269363403, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 32010 + }, + { + "epoch": 0.12187602292883079, + "grad_norm": 0.11656036972999573, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 32020 + }, + { + "epoch": 0.12191408539695348, + "grad_norm": 0.12601672112941742, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 32030 + }, + { + "epoch": 0.12195214786507616, + "grad_norm": 0.11768614500761032, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 32040 + }, + { + "epoch": 0.12199021033319885, + "grad_norm": 0.11926057189702988, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 32050 + }, + { + "epoch": 0.12202827280132153, + "grad_norm": 0.11361296474933624, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 32060 + }, + { + "epoch": 0.12206633526944421, + "grad_norm": 0.1238214448094368, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 32070 + }, + { + "epoch": 0.1221043977375669, + "grad_norm": 0.13550199568271637, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 32080 + }, + { + "epoch": 0.12214246020568957, + "grad_norm": 0.13755367696285248, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 32090 + }, + { + "epoch": 0.12218052267381226, + "grad_norm": 0.12826938927173615, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 32100 + }, + { + "epoch": 0.12221858514193494, + "grad_norm": 0.12358184158802032, + "learning_rate": 0.0005, + "loss": 2.1611, + "step": 32110 + }, + { + "epoch": 0.12225664761005763, + "grad_norm": 0.12005414068698883, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 32120 + }, + { + "epoch": 0.12229471007818031, + "grad_norm": 0.11488837748765945, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 32130 + }, + { + "epoch": 0.122332772546303, + "grad_norm": 0.1216205582022667, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 32140 + }, + { + "epoch": 0.12237083501442568, + "grad_norm": 0.11891251802444458, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 32150 + }, + { + "epoch": 0.12240889748254836, + "grad_norm": 0.13558711111545563, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 32160 + }, + { + "epoch": 0.12244695995067104, + "grad_norm": 0.11588139086961746, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 32170 + }, + { + "epoch": 0.12248502241879372, + "grad_norm": 0.11844465136528015, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 32180 + }, + { + "epoch": 0.1225230848869164, + "grad_norm": 0.11257705837488174, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 32190 + }, + { + "epoch": 0.12256114735503909, + "grad_norm": 0.13820578157901764, + "learning_rate": 0.0005, + "loss": 2.1485, + "step": 32200 + }, + { + "epoch": 0.12259920982316178, + "grad_norm": 0.122981958091259, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 32210 + }, + { + "epoch": 0.12263727229128446, + "grad_norm": 0.1278199404478073, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 32220 + }, + { + "epoch": 0.12267533475940715, + "grad_norm": 0.13852286338806152, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 32230 + }, + { + "epoch": 0.12271339722752982, + "grad_norm": 0.11759456247091293, + "learning_rate": 0.0005, + "loss": 2.1482, + "step": 32240 + }, + { + "epoch": 0.1227514596956525, + "grad_norm": 0.12406472116708755, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 32250 + }, + { + "epoch": 0.12278952216377519, + "grad_norm": 0.11742253601551056, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 32260 + }, + { + "epoch": 0.12282758463189787, + "grad_norm": 0.13278815150260925, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 32270 + }, + { + "epoch": 0.12286564710002056, + "grad_norm": 0.12992510199546814, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 32280 + }, + { + "epoch": 0.12290370956814324, + "grad_norm": 0.13639768958091736, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 32290 + }, + { + "epoch": 0.12294177203626593, + "grad_norm": 0.12147095799446106, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 32300 + }, + { + "epoch": 0.1229798345043886, + "grad_norm": 0.13889503479003906, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 32310 + }, + { + "epoch": 0.12301789697251128, + "grad_norm": 0.1264171451330185, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 32320 + }, + { + "epoch": 0.12305595944063397, + "grad_norm": 0.12271172553300858, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 32330 + }, + { + "epoch": 0.12309402190875665, + "grad_norm": 0.13769884407520294, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 32340 + }, + { + "epoch": 0.12313208437687934, + "grad_norm": 0.11881277710199356, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 32350 + }, + { + "epoch": 0.12317014684500202, + "grad_norm": 0.13204024732112885, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 32360 + }, + { + "epoch": 0.1232082093131247, + "grad_norm": 0.13416194915771484, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 32370 + }, + { + "epoch": 0.12324627178124738, + "grad_norm": 0.12615594267845154, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 32380 + }, + { + "epoch": 0.12328433424937006, + "grad_norm": 0.1237650066614151, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 32390 + }, + { + "epoch": 0.12332239671749275, + "grad_norm": 0.12454275041818619, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 32400 + }, + { + "epoch": 0.12336045918561543, + "grad_norm": 0.12052928656339645, + "learning_rate": 0.0005, + "loss": 2.1533, + "step": 32410 + }, + { + "epoch": 0.12339852165373812, + "grad_norm": 0.10830031335353851, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 32420 + }, + { + "epoch": 0.1234365841218608, + "grad_norm": 0.1310432404279709, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 32430 + }, + { + "epoch": 0.12347464658998349, + "grad_norm": 0.11813438683748245, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 32440 + }, + { + "epoch": 0.12351270905810616, + "grad_norm": 0.13674898445606232, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 32450 + }, + { + "epoch": 0.12355077152622884, + "grad_norm": 0.14288246631622314, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 32460 + }, + { + "epoch": 0.12358883399435153, + "grad_norm": 0.12398666888475418, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 32470 + }, + { + "epoch": 0.12362689646247421, + "grad_norm": 0.11381809413433075, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 32480 + }, + { + "epoch": 0.1236649589305969, + "grad_norm": 0.12265384197235107, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 32490 + }, + { + "epoch": 0.12370302139871958, + "grad_norm": 0.138731449842453, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 32500 + }, + { + "epoch": 0.12374108386684227, + "grad_norm": 0.13886143267154694, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 32510 + }, + { + "epoch": 0.12377914633496495, + "grad_norm": 0.11149519681930542, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 32520 + }, + { + "epoch": 0.12381720880308762, + "grad_norm": 0.11746298521757126, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 32530 + }, + { + "epoch": 0.12385527127121031, + "grad_norm": 0.13325706124305725, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 32540 + }, + { + "epoch": 0.12389333373933299, + "grad_norm": 0.13062238693237305, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 32550 + }, + { + "epoch": 0.12393139620745568, + "grad_norm": 0.11524543911218643, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 32560 + }, + { + "epoch": 0.12396945867557836, + "grad_norm": 0.11860724538564682, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 32570 + }, + { + "epoch": 0.12400752114370105, + "grad_norm": 0.11665958166122437, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 32580 + }, + { + "epoch": 0.12404558361182373, + "grad_norm": 0.1318943053483963, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 32590 + }, + { + "epoch": 0.1240836460799464, + "grad_norm": 0.12036443501710892, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 32600 + }, + { + "epoch": 0.12412170854806909, + "grad_norm": 0.12892819941043854, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 32610 + }, + { + "epoch": 0.12415977101619177, + "grad_norm": 0.137188121676445, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 32620 + }, + { + "epoch": 0.12419783348431446, + "grad_norm": 0.12776799499988556, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 32630 + }, + { + "epoch": 0.12423589595243714, + "grad_norm": 0.1168050542473793, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 32640 + }, + { + "epoch": 0.12427395842055983, + "grad_norm": 0.12604869902133942, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 32650 + }, + { + "epoch": 0.12431202088868251, + "grad_norm": 0.12291269749403, + "learning_rate": 0.0005, + "loss": 2.1559, + "step": 32660 + }, + { + "epoch": 0.12435008335680518, + "grad_norm": 0.11891001462936401, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 32670 + }, + { + "epoch": 0.12438814582492787, + "grad_norm": 0.1177201047539711, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 32680 + }, + { + "epoch": 0.12442620829305055, + "grad_norm": 0.12202750146389008, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 32690 + }, + { + "epoch": 0.12446427076117324, + "grad_norm": 0.12012209743261337, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 32700 + }, + { + "epoch": 0.12450233322929592, + "grad_norm": 0.13094893097877502, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 32710 + }, + { + "epoch": 0.12454039569741861, + "grad_norm": 0.12597648799419403, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 32720 + }, + { + "epoch": 0.12457845816554129, + "grad_norm": 0.13822618126869202, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 32730 + }, + { + "epoch": 0.12461652063366396, + "grad_norm": 0.1339564472436905, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 32740 + }, + { + "epoch": 0.12465458310178665, + "grad_norm": 0.1378975808620453, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 32750 + }, + { + "epoch": 0.12469264556990933, + "grad_norm": 0.11569910496473312, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 32760 + }, + { + "epoch": 0.12473070803803202, + "grad_norm": 0.11427119374275208, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 32770 + }, + { + "epoch": 0.1247687705061547, + "grad_norm": 0.12961478531360626, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 32780 + }, + { + "epoch": 0.12480683297427739, + "grad_norm": 1.0075163841247559, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 32790 + }, + { + "epoch": 0.12484489544240007, + "grad_norm": 0.11409968882799149, + "learning_rate": 0.0005, + "loss": 2.1596, + "step": 32800 + }, + { + "epoch": 0.12488295791052274, + "grad_norm": 0.1265895962715149, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 32810 + }, + { + "epoch": 0.12492102037864543, + "grad_norm": 0.11860281974077225, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 32820 + }, + { + "epoch": 0.12495908284676811, + "grad_norm": 0.12064908444881439, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 32830 + }, + { + "epoch": 0.1249971453148908, + "grad_norm": 0.1199292466044426, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 32840 + }, + { + "epoch": 0.12503520778301347, + "grad_norm": 0.1281987875699997, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 32850 + }, + { + "epoch": 0.12507327025113615, + "grad_norm": 0.12015961110591888, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 32860 + }, + { + "epoch": 0.12511133271925884, + "grad_norm": 0.11467165499925613, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 32870 + }, + { + "epoch": 0.12514939518738152, + "grad_norm": 0.13084529340267181, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 32880 + }, + { + "epoch": 0.1251874576555042, + "grad_norm": 0.1264481097459793, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 32890 + }, + { + "epoch": 0.1252255201236269, + "grad_norm": 0.11985542625188828, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 32900 + }, + { + "epoch": 0.12526358259174958, + "grad_norm": 0.12975460290908813, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 32910 + }, + { + "epoch": 0.12530164505987226, + "grad_norm": 0.11748325824737549, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 32920 + }, + { + "epoch": 0.12533970752799495, + "grad_norm": 0.12457460910081863, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 32930 + }, + { + "epoch": 0.12537776999611763, + "grad_norm": 0.1297825276851654, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 32940 + }, + { + "epoch": 0.12541583246424032, + "grad_norm": 0.14112438261508942, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 32950 + }, + { + "epoch": 0.125453894932363, + "grad_norm": 0.12287167459726334, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 32960 + }, + { + "epoch": 0.1254919574004857, + "grad_norm": 0.12792804837226868, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 32970 + }, + { + "epoch": 0.12553001986860837, + "grad_norm": 0.11716022342443466, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 32980 + }, + { + "epoch": 0.12556808233673103, + "grad_norm": 0.1469336301088333, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 32990 + }, + { + "epoch": 0.12560614480485371, + "grad_norm": 0.1205238625407219, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 33000 + }, + { + "epoch": 0.1256442072729764, + "grad_norm": 0.11697216331958771, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 33010 + }, + { + "epoch": 0.12568226974109908, + "grad_norm": 0.10925181210041046, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 33020 + }, + { + "epoch": 0.12572033220922177, + "grad_norm": 0.1148732453584671, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 33030 + }, + { + "epoch": 0.12575839467734445, + "grad_norm": 0.1138172373175621, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 33040 + }, + { + "epoch": 0.12579645714546714, + "grad_norm": 0.12518808245658875, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 33050 + }, + { + "epoch": 0.12583451961358982, + "grad_norm": 0.12803898751735687, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 33060 + }, + { + "epoch": 0.1258725820817125, + "grad_norm": 0.1226850152015686, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 33070 + }, + { + "epoch": 0.1259106445498352, + "grad_norm": 0.1251627802848816, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 33080 + }, + { + "epoch": 0.12594870701795788, + "grad_norm": 0.12983007729053497, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 33090 + }, + { + "epoch": 0.12598676948608056, + "grad_norm": 0.11952579766511917, + "learning_rate": 0.0005, + "loss": 2.1532, + "step": 33100 + }, + { + "epoch": 0.12602483195420325, + "grad_norm": 0.12259417027235031, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 33110 + }, + { + "epoch": 0.12606289442232593, + "grad_norm": 0.12636563181877136, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 33120 + }, + { + "epoch": 0.12610095689044862, + "grad_norm": 0.1269903928041458, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 33130 + }, + { + "epoch": 0.12613901935857127, + "grad_norm": 0.13562357425689697, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 33140 + }, + { + "epoch": 0.12617708182669396, + "grad_norm": 0.11688810586929321, + "learning_rate": 0.0005, + "loss": 2.1492, + "step": 33150 + }, + { + "epoch": 0.12621514429481664, + "grad_norm": 0.11836229264736176, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 33160 + }, + { + "epoch": 0.12625320676293933, + "grad_norm": 0.12402665615081787, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 33170 + }, + { + "epoch": 0.12629126923106201, + "grad_norm": 0.13359686732292175, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 33180 + }, + { + "epoch": 0.1263293316991847, + "grad_norm": 0.21467049419879913, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 33190 + }, + { + "epoch": 0.12636739416730738, + "grad_norm": 0.1315467804670334, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 33200 + }, + { + "epoch": 0.12640545663543007, + "grad_norm": 0.1368442177772522, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 33210 + }, + { + "epoch": 0.12644351910355275, + "grad_norm": 0.1287236213684082, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 33220 + }, + { + "epoch": 0.12648158157167544, + "grad_norm": 0.12770278751850128, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 33230 + }, + { + "epoch": 0.12651964403979812, + "grad_norm": 0.13059884309768677, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 33240 + }, + { + "epoch": 0.1265577065079208, + "grad_norm": 0.12187020480632782, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 33250 + }, + { + "epoch": 0.1265957689760435, + "grad_norm": 0.12663011252880096, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 33260 + }, + { + "epoch": 0.12663383144416618, + "grad_norm": 0.13077551126480103, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 33270 + }, + { + "epoch": 0.12667189391228884, + "grad_norm": 0.12449769675731659, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 33280 + }, + { + "epoch": 0.12670995638041152, + "grad_norm": 0.12197001278400421, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 33290 + }, + { + "epoch": 0.1267480188485342, + "grad_norm": 0.11997847259044647, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 33300 + }, + { + "epoch": 0.1267860813166569, + "grad_norm": 0.11869657784700394, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 33310 + }, + { + "epoch": 0.12682414378477957, + "grad_norm": 0.14480037987232208, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 33320 + }, + { + "epoch": 0.12686220625290226, + "grad_norm": 0.1121041402220726, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 33330 + }, + { + "epoch": 0.12690026872102494, + "grad_norm": 0.11679790169000626, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 33340 + }, + { + "epoch": 0.12693833118914763, + "grad_norm": 0.11968211829662323, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 33350 + }, + { + "epoch": 0.12697639365727031, + "grad_norm": 0.1225665882229805, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 33360 + }, + { + "epoch": 0.127014456125393, + "grad_norm": 0.13022850453853607, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 33370 + }, + { + "epoch": 0.12705251859351568, + "grad_norm": 0.11416057497262955, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 33380 + }, + { + "epoch": 0.12709058106163837, + "grad_norm": 0.1268990933895111, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 33390 + }, + { + "epoch": 0.12712864352976105, + "grad_norm": 0.11592286080121994, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 33400 + }, + { + "epoch": 0.12716670599788374, + "grad_norm": 0.12762770056724548, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 33410 + }, + { + "epoch": 0.1272047684660064, + "grad_norm": 0.12282517552375793, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 33420 + }, + { + "epoch": 0.12724283093412908, + "grad_norm": 0.1247594878077507, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 33430 + }, + { + "epoch": 0.12728089340225177, + "grad_norm": 0.11615666002035141, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 33440 + }, + { + "epoch": 0.12731895587037445, + "grad_norm": 0.12424363940954208, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 33450 + }, + { + "epoch": 0.12735701833849714, + "grad_norm": 0.1269409954547882, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 33460 + }, + { + "epoch": 0.12739508080661982, + "grad_norm": 0.12168506532907486, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 33470 + }, + { + "epoch": 0.1274331432747425, + "grad_norm": 0.12149317562580109, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 33480 + }, + { + "epoch": 0.1274712057428652, + "grad_norm": 0.13020338118076324, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 33490 + }, + { + "epoch": 0.12750926821098788, + "grad_norm": 0.12166621536016464, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 33500 + }, + { + "epoch": 0.12754733067911056, + "grad_norm": 0.12197873741388321, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 33510 + }, + { + "epoch": 0.12758539314723324, + "grad_norm": 0.115919329226017, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 33520 + }, + { + "epoch": 0.12762345561535593, + "grad_norm": 0.1329682618379593, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 33530 + }, + { + "epoch": 0.12766151808347861, + "grad_norm": 0.12893423438072205, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 33540 + }, + { + "epoch": 0.1276995805516013, + "grad_norm": 0.12900428473949432, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 33550 + }, + { + "epoch": 0.12773764301972398, + "grad_norm": 0.11646492779254913, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 33560 + }, + { + "epoch": 0.12777570548784664, + "grad_norm": 0.11849641054868698, + "learning_rate": 0.0005, + "loss": 2.1527, + "step": 33570 + }, + { + "epoch": 0.12781376795596933, + "grad_norm": 0.12793667614459991, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 33580 + }, + { + "epoch": 0.127851830424092, + "grad_norm": 0.11357719451189041, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 33590 + }, + { + "epoch": 0.1278898928922147, + "grad_norm": 0.12847648561000824, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 33600 + }, + { + "epoch": 0.12792795536033738, + "grad_norm": 0.1309060901403427, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 33610 + }, + { + "epoch": 0.12796601782846007, + "grad_norm": 0.11633670330047607, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 33620 + }, + { + "epoch": 0.12800408029658275, + "grad_norm": 0.12359406799077988, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 33630 + }, + { + "epoch": 0.12804214276470544, + "grad_norm": 0.11715533584356308, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 33640 + }, + { + "epoch": 0.12808020523282812, + "grad_norm": 0.12290675938129425, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 33650 + }, + { + "epoch": 0.1281182677009508, + "grad_norm": 0.13310450315475464, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 33660 + }, + { + "epoch": 0.1281563301690735, + "grad_norm": 0.13879603147506714, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 33670 + }, + { + "epoch": 0.12819439263719618, + "grad_norm": 0.14221248030662537, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 33680 + }, + { + "epoch": 0.12823245510531886, + "grad_norm": 0.12564119696617126, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 33690 + }, + { + "epoch": 0.12827051757344154, + "grad_norm": 0.1257466971874237, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 33700 + }, + { + "epoch": 0.1283085800415642, + "grad_norm": 0.12742167711257935, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 33710 + }, + { + "epoch": 0.1283466425096869, + "grad_norm": 0.1109384074807167, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 33720 + }, + { + "epoch": 0.12838470497780957, + "grad_norm": 0.13379578292369843, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 33730 + }, + { + "epoch": 0.12842276744593226, + "grad_norm": 0.14024749398231506, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 33740 + }, + { + "epoch": 0.12846082991405494, + "grad_norm": 0.15728142857551575, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 33750 + }, + { + "epoch": 0.12849889238217763, + "grad_norm": 0.11820480227470398, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 33760 + }, + { + "epoch": 0.1285369548503003, + "grad_norm": 0.11761283129453659, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 33770 + }, + { + "epoch": 0.128575017318423, + "grad_norm": 0.1218690425157547, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 33780 + }, + { + "epoch": 0.12861307978654568, + "grad_norm": 0.13610930740833282, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 33790 + }, + { + "epoch": 0.12865114225466837, + "grad_norm": 0.1270422637462616, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 33800 + }, + { + "epoch": 0.12868920472279105, + "grad_norm": 0.1524609476327896, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 33810 + }, + { + "epoch": 0.12872726719091374, + "grad_norm": 0.13481594622135162, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 33820 + }, + { + "epoch": 0.12876532965903642, + "grad_norm": 0.1191399097442627, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 33830 + }, + { + "epoch": 0.1288033921271591, + "grad_norm": 0.13578785955905914, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 33840 + }, + { + "epoch": 0.1288414545952818, + "grad_norm": 0.1373569369316101, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 33850 + }, + { + "epoch": 0.12887951706340445, + "grad_norm": 0.12380945682525635, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 33860 + }, + { + "epoch": 0.12891757953152713, + "grad_norm": 0.12317613512277603, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 33870 + }, + { + "epoch": 0.12895564199964982, + "grad_norm": 0.13095170259475708, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 33880 + }, + { + "epoch": 0.1289937044677725, + "grad_norm": 0.17096243798732758, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 33890 + }, + { + "epoch": 0.1290317669358952, + "grad_norm": 0.11582276225090027, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 33900 + }, + { + "epoch": 0.12906982940401787, + "grad_norm": 0.12426700443029404, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 33910 + }, + { + "epoch": 0.12910789187214056, + "grad_norm": 0.12811040878295898, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 33920 + }, + { + "epoch": 0.12914595434026324, + "grad_norm": 0.11992785334587097, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 33930 + }, + { + "epoch": 0.12918401680838593, + "grad_norm": 0.1243828758597374, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 33940 + }, + { + "epoch": 0.1292220792765086, + "grad_norm": 0.11735928058624268, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 33950 + }, + { + "epoch": 0.1292601417446313, + "grad_norm": 0.11512462049722672, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 33960 + }, + { + "epoch": 0.12929820421275398, + "grad_norm": 0.11653441935777664, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 33970 + }, + { + "epoch": 0.12933626668087667, + "grad_norm": 0.11661040037870407, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 33980 + }, + { + "epoch": 0.12937432914899935, + "grad_norm": 0.13248026371002197, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 33990 + }, + { + "epoch": 0.129412391617122, + "grad_norm": 0.1311565339565277, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 34000 + }, + { + "epoch": 0.1294504540852447, + "grad_norm": 0.13021622598171234, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 34010 + }, + { + "epoch": 0.12948851655336738, + "grad_norm": 0.11427046358585358, + "learning_rate": 0.0005, + "loss": 2.1532, + "step": 34020 + }, + { + "epoch": 0.12952657902149006, + "grad_norm": 0.13352714478969574, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 34030 + }, + { + "epoch": 0.12956464148961275, + "grad_norm": 0.10974465310573578, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 34040 + }, + { + "epoch": 0.12960270395773543, + "grad_norm": 0.11958344280719757, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 34050 + }, + { + "epoch": 0.12964076642585812, + "grad_norm": 0.12198679894208908, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 34060 + }, + { + "epoch": 0.1296788288939808, + "grad_norm": 0.1296238899230957, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 34070 + }, + { + "epoch": 0.1297168913621035, + "grad_norm": 0.11404258757829666, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 34080 + }, + { + "epoch": 0.12975495383022617, + "grad_norm": 0.12420056760311127, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 34090 + }, + { + "epoch": 0.12979301629834886, + "grad_norm": 0.1177731528878212, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 34100 + }, + { + "epoch": 0.12983107876647154, + "grad_norm": 0.12449658662080765, + "learning_rate": 0.0005, + "loss": 2.1483, + "step": 34110 + }, + { + "epoch": 0.12986914123459423, + "grad_norm": 0.1282752901315689, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 34120 + }, + { + "epoch": 0.1299072037027169, + "grad_norm": 0.13742892444133759, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 34130 + }, + { + "epoch": 0.12994526617083957, + "grad_norm": 0.12925803661346436, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 34140 + }, + { + "epoch": 0.12998332863896225, + "grad_norm": 0.1282254308462143, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 34150 + }, + { + "epoch": 0.13002139110708494, + "grad_norm": 0.12230316549539566, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 34160 + }, + { + "epoch": 0.13005945357520762, + "grad_norm": 0.1297813206911087, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 34170 + }, + { + "epoch": 0.1300975160433303, + "grad_norm": 0.12948966026306152, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 34180 + }, + { + "epoch": 0.130135578511453, + "grad_norm": 0.1196913793683052, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 34190 + }, + { + "epoch": 0.13017364097957568, + "grad_norm": 0.12581545114517212, + "learning_rate": 0.0005, + "loss": 2.1567, + "step": 34200 + }, + { + "epoch": 0.13021170344769836, + "grad_norm": 0.10900751501321793, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 34210 + }, + { + "epoch": 0.13024976591582105, + "grad_norm": 0.12207692861557007, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 34220 + }, + { + "epoch": 0.13028782838394373, + "grad_norm": 0.12275753170251846, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 34230 + }, + { + "epoch": 0.13032589085206642, + "grad_norm": 0.12474631518125534, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 34240 + }, + { + "epoch": 0.1303639533201891, + "grad_norm": 0.12170256674289703, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 34250 + }, + { + "epoch": 0.1304020157883118, + "grad_norm": 0.1270124763250351, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 34260 + }, + { + "epoch": 0.13044007825643447, + "grad_norm": 0.14917618036270142, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 34270 + }, + { + "epoch": 0.13047814072455716, + "grad_norm": 0.1254420131444931, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 34280 + }, + { + "epoch": 0.1305162031926798, + "grad_norm": 0.12572523951530457, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 34290 + }, + { + "epoch": 0.1305542656608025, + "grad_norm": 0.11523977667093277, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 34300 + }, + { + "epoch": 0.13059232812892518, + "grad_norm": 0.1346060186624527, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 34310 + }, + { + "epoch": 0.13063039059704787, + "grad_norm": 0.12890216708183289, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 34320 + }, + { + "epoch": 0.13066845306517055, + "grad_norm": 0.13144852221012115, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 34330 + }, + { + "epoch": 0.13070651553329324, + "grad_norm": 0.1467370092868805, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 34340 + }, + { + "epoch": 0.13074457800141592, + "grad_norm": 0.12182767689228058, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 34350 + }, + { + "epoch": 0.1307826404695386, + "grad_norm": 0.12545464932918549, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 34360 + }, + { + "epoch": 0.1308207029376613, + "grad_norm": 0.11060311645269394, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 34370 + }, + { + "epoch": 0.13085876540578398, + "grad_norm": 0.13011306524276733, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 34380 + }, + { + "epoch": 0.13089682787390666, + "grad_norm": 0.13094794750213623, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 34390 + }, + { + "epoch": 0.13093489034202935, + "grad_norm": 0.1258162409067154, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 34400 + }, + { + "epoch": 0.13097295281015203, + "grad_norm": 0.12662410736083984, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 34410 + }, + { + "epoch": 0.13101101527827472, + "grad_norm": 0.12168075889348984, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 34420 + }, + { + "epoch": 0.13104907774639737, + "grad_norm": 0.12663094699382782, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 34430 + }, + { + "epoch": 0.13108714021452006, + "grad_norm": 0.11823868006467819, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 34440 + }, + { + "epoch": 0.13112520268264274, + "grad_norm": 0.12802527844905853, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 34450 + }, + { + "epoch": 0.13116326515076543, + "grad_norm": 0.12292591482400894, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 34460 + }, + { + "epoch": 0.13120132761888811, + "grad_norm": 0.13579413294792175, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 34470 + }, + { + "epoch": 0.1312393900870108, + "grad_norm": 0.11999151855707169, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 34480 + }, + { + "epoch": 0.13127745255513348, + "grad_norm": 0.12723271548748016, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 34490 + }, + { + "epoch": 0.13131551502325617, + "grad_norm": 0.13324995338916779, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 34500 + }, + { + "epoch": 0.13135357749137885, + "grad_norm": 0.12204291671514511, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 34510 + }, + { + "epoch": 0.13139163995950154, + "grad_norm": 0.11512850224971771, + "learning_rate": 0.0005, + "loss": 2.1493, + "step": 34520 + }, + { + "epoch": 0.13142970242762422, + "grad_norm": 0.12048737704753876, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 34530 + }, + { + "epoch": 0.1314677648957469, + "grad_norm": 0.12571175396442413, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 34540 + }, + { + "epoch": 0.1315058273638696, + "grad_norm": 0.120302215218544, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 34550 + }, + { + "epoch": 0.13154388983199228, + "grad_norm": 0.13598310947418213, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 34560 + }, + { + "epoch": 0.13158195230011493, + "grad_norm": 0.1138150691986084, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 34570 + }, + { + "epoch": 0.13162001476823762, + "grad_norm": 0.136013001203537, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 34580 + }, + { + "epoch": 0.1316580772363603, + "grad_norm": 0.11748948693275452, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 34590 + }, + { + "epoch": 0.131696139704483, + "grad_norm": 0.11970457434654236, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 34600 + }, + { + "epoch": 0.13173420217260567, + "grad_norm": 0.12416719645261765, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 34610 + }, + { + "epoch": 0.13177226464072836, + "grad_norm": 0.1187528520822525, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 34620 + }, + { + "epoch": 0.13181032710885104, + "grad_norm": 0.12313452363014221, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 34630 + }, + { + "epoch": 0.13184838957697373, + "grad_norm": 0.13389801979064941, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 34640 + }, + { + "epoch": 0.13188645204509641, + "grad_norm": 0.1279163956642151, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 34650 + }, + { + "epoch": 0.1319245145132191, + "grad_norm": 0.13198937475681305, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 34660 + }, + { + "epoch": 0.13196257698134178, + "grad_norm": 0.11823670566082001, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 34670 + }, + { + "epoch": 0.13200063944946447, + "grad_norm": 0.11793782562017441, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 34680 + }, + { + "epoch": 0.13203870191758715, + "grad_norm": 0.1369984745979309, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 34690 + }, + { + "epoch": 0.13207676438570984, + "grad_norm": 0.12443507462739944, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 34700 + }, + { + "epoch": 0.13211482685383252, + "grad_norm": 0.13990186154842377, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 34710 + }, + { + "epoch": 0.13215288932195518, + "grad_norm": 0.13101623952388763, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 34720 + }, + { + "epoch": 0.13219095179007787, + "grad_norm": 0.12870261073112488, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 34730 + }, + { + "epoch": 0.13222901425820055, + "grad_norm": 0.12486039847135544, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 34740 + }, + { + "epoch": 0.13226707672632323, + "grad_norm": 0.11611022800207138, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 34750 + }, + { + "epoch": 0.13230513919444592, + "grad_norm": 0.41225793957710266, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 34760 + }, + { + "epoch": 0.1323432016625686, + "grad_norm": 0.12023355066776276, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 34770 + }, + { + "epoch": 0.1323812641306913, + "grad_norm": 0.11662869900465012, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 34780 + }, + { + "epoch": 0.13241932659881397, + "grad_norm": 0.10957567393779755, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 34790 + }, + { + "epoch": 0.13245738906693666, + "grad_norm": 0.12181458622217178, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 34800 + }, + { + "epoch": 0.13249545153505934, + "grad_norm": 0.1326204389333725, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 34810 + }, + { + "epoch": 0.13253351400318203, + "grad_norm": 0.1400616616010666, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 34820 + }, + { + "epoch": 0.13257157647130471, + "grad_norm": 0.12994307279586792, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 34830 + }, + { + "epoch": 0.1326096389394274, + "grad_norm": 0.1476479470729828, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 34840 + }, + { + "epoch": 0.13264770140755008, + "grad_norm": 0.12488952279090881, + "learning_rate": 0.0005, + "loss": 2.1526, + "step": 34850 + }, + { + "epoch": 0.13268576387567274, + "grad_norm": 0.13037273287773132, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 34860 + }, + { + "epoch": 0.13272382634379543, + "grad_norm": 0.11375074088573456, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 34870 + }, + { + "epoch": 0.1327618888119181, + "grad_norm": 0.11915922164916992, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 34880 + }, + { + "epoch": 0.1327999512800408, + "grad_norm": 0.11518298089504242, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 34890 + }, + { + "epoch": 0.13283801374816348, + "grad_norm": 0.13177339732646942, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 34900 + }, + { + "epoch": 0.13287607621628617, + "grad_norm": 0.12411221116781235, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 34910 + }, + { + "epoch": 0.13291413868440885, + "grad_norm": 0.12184184789657593, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 34920 + }, + { + "epoch": 0.13295220115253154, + "grad_norm": 0.13678424060344696, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 34930 + }, + { + "epoch": 0.13299026362065422, + "grad_norm": 0.1206953153014183, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 34940 + }, + { + "epoch": 0.1330283260887769, + "grad_norm": 0.13750183582305908, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 34950 + }, + { + "epoch": 0.1330663885568996, + "grad_norm": 0.12357515096664429, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 34960 + }, + { + "epoch": 0.13310445102502227, + "grad_norm": 0.11207663267850876, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 34970 + }, + { + "epoch": 0.13314251349314496, + "grad_norm": 0.1267058104276657, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 34980 + }, + { + "epoch": 0.13318057596126764, + "grad_norm": 0.12690705060958862, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 34990 + }, + { + "epoch": 0.13321863842939033, + "grad_norm": 0.1255311369895935, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 35000 + }, + { + "epoch": 0.133256700897513, + "grad_norm": 0.11876732110977173, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 35010 + }, + { + "epoch": 0.13329476336563567, + "grad_norm": 0.12742522358894348, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 35020 + }, + { + "epoch": 0.13333282583375836, + "grad_norm": 0.13061149418354034, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 35030 + }, + { + "epoch": 0.13337088830188104, + "grad_norm": 0.14856840670108795, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 35040 + }, + { + "epoch": 0.13340895077000373, + "grad_norm": 0.11686535179615021, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 35050 + }, + { + "epoch": 0.1334470132381264, + "grad_norm": 0.12009676545858383, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 35060 + }, + { + "epoch": 0.1334850757062491, + "grad_norm": 0.1326339691877365, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 35070 + }, + { + "epoch": 0.13352313817437178, + "grad_norm": 0.11814519762992859, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 35080 + }, + { + "epoch": 0.13356120064249447, + "grad_norm": 0.12367371469736099, + "learning_rate": 0.0005, + "loss": 2.1549, + "step": 35090 + }, + { + "epoch": 0.13359926311061715, + "grad_norm": 0.1246814876794815, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 35100 + }, + { + "epoch": 0.13363732557873984, + "grad_norm": 0.1327364146709442, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 35110 + }, + { + "epoch": 0.13367538804686252, + "grad_norm": 0.12271331250667572, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 35120 + }, + { + "epoch": 0.1337134505149852, + "grad_norm": 0.1177125796675682, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 35130 + }, + { + "epoch": 0.1337515129831079, + "grad_norm": 0.11724243313074112, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 35140 + }, + { + "epoch": 0.13378957545123055, + "grad_norm": 0.13234220445156097, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 35150 + }, + { + "epoch": 0.13382763791935323, + "grad_norm": 0.12565819919109344, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 35160 + }, + { + "epoch": 0.13386570038747592, + "grad_norm": 0.12357509136199951, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 35170 + }, + { + "epoch": 0.1339037628555986, + "grad_norm": 0.12172172218561172, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 35180 + }, + { + "epoch": 0.1339418253237213, + "grad_norm": 0.1319197118282318, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 35190 + }, + { + "epoch": 0.13397988779184397, + "grad_norm": 0.12759651243686676, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 35200 + }, + { + "epoch": 0.13401795025996666, + "grad_norm": 0.14796006679534912, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 35210 + }, + { + "epoch": 0.13405601272808934, + "grad_norm": 0.13388635218143463, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 35220 + }, + { + "epoch": 0.13409407519621203, + "grad_norm": 0.11979969590902328, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 35230 + }, + { + "epoch": 0.1341321376643347, + "grad_norm": 0.11983058601617813, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 35240 + }, + { + "epoch": 0.1341702001324574, + "grad_norm": 0.11468696594238281, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 35250 + }, + { + "epoch": 0.13420826260058008, + "grad_norm": 0.1290093958377838, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 35260 + }, + { + "epoch": 0.13424632506870277, + "grad_norm": 0.12517830729484558, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 35270 + }, + { + "epoch": 0.13428438753682545, + "grad_norm": 0.13029910624027252, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 35280 + }, + { + "epoch": 0.1343224500049481, + "grad_norm": 0.12360113859176636, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 35290 + }, + { + "epoch": 0.1343605124730708, + "grad_norm": 0.13174830377101898, + "learning_rate": 0.0005, + "loss": 2.1544, + "step": 35300 + }, + { + "epoch": 0.13439857494119348, + "grad_norm": 0.1239006370306015, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 35310 + }, + { + "epoch": 0.13443663740931616, + "grad_norm": 0.1369420737028122, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 35320 + }, + { + "epoch": 0.13447469987743885, + "grad_norm": 0.11984249204397202, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 35330 + }, + { + "epoch": 0.13451276234556153, + "grad_norm": 0.1304861605167389, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 35340 + }, + { + "epoch": 0.13455082481368422, + "grad_norm": 0.12857241928577423, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 35350 + }, + { + "epoch": 0.1345888872818069, + "grad_norm": 0.12768466770648956, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 35360 + }, + { + "epoch": 0.1346269497499296, + "grad_norm": 0.12753640115261078, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 35370 + }, + { + "epoch": 0.13466501221805227, + "grad_norm": 0.12317626178264618, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 35380 + }, + { + "epoch": 0.13470307468617496, + "grad_norm": 0.12090485543012619, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 35390 + }, + { + "epoch": 0.13474113715429764, + "grad_norm": 0.13802245259284973, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 35400 + }, + { + "epoch": 0.13477919962242033, + "grad_norm": 0.12055863440036774, + "learning_rate": 0.0005, + "loss": 2.158, + "step": 35410 + }, + { + "epoch": 0.134817262090543, + "grad_norm": 0.13493365049362183, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 35420 + }, + { + "epoch": 0.1348553245586657, + "grad_norm": 0.11872437596321106, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 35430 + }, + { + "epoch": 0.13489338702678835, + "grad_norm": 0.127437025308609, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 35440 + }, + { + "epoch": 0.13493144949491104, + "grad_norm": 0.13511709868907928, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 35450 + }, + { + "epoch": 0.13496951196303372, + "grad_norm": 0.12347541749477386, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 35460 + }, + { + "epoch": 0.1350075744311564, + "grad_norm": 0.12082315236330032, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 35470 + }, + { + "epoch": 0.1350456368992791, + "grad_norm": 0.11401130259037018, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 35480 + }, + { + "epoch": 0.13508369936740178, + "grad_norm": 0.1377699226140976, + "learning_rate": 0.0005, + "loss": 2.1571, + "step": 35490 + }, + { + "epoch": 0.13512176183552446, + "grad_norm": 0.11370331048965454, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 35500 + }, + { + "epoch": 0.13515982430364715, + "grad_norm": 0.1212431862950325, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 35510 + }, + { + "epoch": 0.13519788677176983, + "grad_norm": 0.1107739806175232, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 35520 + }, + { + "epoch": 0.13523594923989252, + "grad_norm": 0.1320618987083435, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 35530 + }, + { + "epoch": 0.1352740117080152, + "grad_norm": 0.14418496191501617, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 35540 + }, + { + "epoch": 0.1353120741761379, + "grad_norm": 0.12587794661521912, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 35550 + }, + { + "epoch": 0.13535013664426057, + "grad_norm": 0.11606061458587646, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 35560 + }, + { + "epoch": 0.13538819911238326, + "grad_norm": 0.1209748387336731, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 35570 + }, + { + "epoch": 0.1354262615805059, + "grad_norm": 0.11753486096858978, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 35580 + }, + { + "epoch": 0.1354643240486286, + "grad_norm": 0.15218910574913025, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 35590 + }, + { + "epoch": 0.13550238651675128, + "grad_norm": 0.12810829281806946, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 35600 + }, + { + "epoch": 0.13554044898487397, + "grad_norm": 0.12554165720939636, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 35610 + }, + { + "epoch": 0.13557851145299665, + "grad_norm": 0.11238068342208862, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 35620 + }, + { + "epoch": 0.13561657392111934, + "grad_norm": 0.11834820359945297, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 35630 + }, + { + "epoch": 0.13565463638924202, + "grad_norm": 0.1300107091665268, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 35640 + }, + { + "epoch": 0.1356926988573647, + "grad_norm": 0.12143900245428085, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 35650 + }, + { + "epoch": 0.1357307613254874, + "grad_norm": 0.11875910311937332, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 35660 + }, + { + "epoch": 0.13576882379361008, + "grad_norm": 0.1321776658296585, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 35670 + }, + { + "epoch": 0.13580688626173276, + "grad_norm": 0.13334231078624725, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 35680 + }, + { + "epoch": 0.13584494872985545, + "grad_norm": 0.14358310401439667, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 35690 + }, + { + "epoch": 0.13588301119797813, + "grad_norm": 0.12222322821617126, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 35700 + }, + { + "epoch": 0.13592107366610082, + "grad_norm": 0.13186433911323547, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 35710 + }, + { + "epoch": 0.13595913613422347, + "grad_norm": 0.12122279405593872, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 35720 + }, + { + "epoch": 0.13599719860234616, + "grad_norm": 0.11469511687755585, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 35730 + }, + { + "epoch": 0.13603526107046884, + "grad_norm": 0.14768067002296448, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 35740 + }, + { + "epoch": 0.13607332353859153, + "grad_norm": 0.11734780669212341, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 35750 + }, + { + "epoch": 0.1361113860067142, + "grad_norm": 0.11367372423410416, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 35760 + }, + { + "epoch": 0.1361494484748369, + "grad_norm": 0.12199106812477112, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 35770 + }, + { + "epoch": 0.13618751094295958, + "grad_norm": 0.11509162187576294, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 35780 + }, + { + "epoch": 0.13622557341108227, + "grad_norm": 0.11662641912698746, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 35790 + }, + { + "epoch": 0.13626363587920495, + "grad_norm": 0.12217991799116135, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 35800 + }, + { + "epoch": 0.13630169834732764, + "grad_norm": 0.11236120760440826, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 35810 + }, + { + "epoch": 0.13633976081545032, + "grad_norm": 0.12871301174163818, + "learning_rate": 0.0005, + "loss": 2.1485, + "step": 35820 + }, + { + "epoch": 0.136377823283573, + "grad_norm": 0.12680701911449432, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 35830 + }, + { + "epoch": 0.1364158857516957, + "grad_norm": 0.12248557060956955, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 35840 + }, + { + "epoch": 0.13645394821981838, + "grad_norm": 0.12114865332841873, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 35850 + }, + { + "epoch": 0.13649201068794106, + "grad_norm": 0.13375422358512878, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 35860 + }, + { + "epoch": 0.13653007315606372, + "grad_norm": 0.1284463107585907, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 35870 + }, + { + "epoch": 0.1365681356241864, + "grad_norm": 0.1267269253730774, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 35880 + }, + { + "epoch": 0.1366061980923091, + "grad_norm": 0.1250191330909729, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 35890 + }, + { + "epoch": 0.13664426056043177, + "grad_norm": 0.1312512457370758, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 35900 + }, + { + "epoch": 0.13668232302855446, + "grad_norm": 0.11584340035915375, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 35910 + }, + { + "epoch": 0.13672038549667714, + "grad_norm": 0.11950317770242691, + "learning_rate": 0.0005, + "loss": 2.1583, + "step": 35920 + }, + { + "epoch": 0.13675844796479983, + "grad_norm": 0.13057249784469604, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 35930 + }, + { + "epoch": 0.1367965104329225, + "grad_norm": 0.12502941489219666, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 35940 + }, + { + "epoch": 0.1368345729010452, + "grad_norm": 0.13216383755207062, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 35950 + }, + { + "epoch": 0.13687263536916788, + "grad_norm": 0.11722370982170105, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 35960 + }, + { + "epoch": 0.13691069783729057, + "grad_norm": 0.12868359684944153, + "learning_rate": 0.0005, + "loss": 2.1474, + "step": 35970 + }, + { + "epoch": 0.13694876030541325, + "grad_norm": 0.12457242608070374, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 35980 + }, + { + "epoch": 0.13698682277353594, + "grad_norm": 0.11647374927997589, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 35990 + }, + { + "epoch": 0.13702488524165862, + "grad_norm": 0.1253417730331421, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 36000 + }, + { + "epoch": 0.13706294770978128, + "grad_norm": 0.13089333474636078, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 36010 + }, + { + "epoch": 0.13710101017790396, + "grad_norm": 0.12575079500675201, + "learning_rate": 0.0005, + "loss": 2.1485, + "step": 36020 + }, + { + "epoch": 0.13713907264602665, + "grad_norm": 0.12303038686513901, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 36030 + }, + { + "epoch": 0.13717713511414933, + "grad_norm": 0.11526139080524445, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 36040 + }, + { + "epoch": 0.13721519758227202, + "grad_norm": 0.12276002019643784, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 36050 + }, + { + "epoch": 0.1372532600503947, + "grad_norm": 0.1275072991847992, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 36060 + }, + { + "epoch": 0.1372913225185174, + "grad_norm": 0.14315102994441986, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 36070 + }, + { + "epoch": 0.13732938498664007, + "grad_norm": 0.11972994357347488, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 36080 + }, + { + "epoch": 0.13736744745476276, + "grad_norm": 0.13286733627319336, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 36090 + }, + { + "epoch": 0.13740550992288544, + "grad_norm": 0.129139244556427, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 36100 + }, + { + "epoch": 0.13744357239100813, + "grad_norm": 0.12458977103233337, + "learning_rate": 0.0005, + "loss": 2.156, + "step": 36110 + }, + { + "epoch": 0.1374816348591308, + "grad_norm": 0.12720626592636108, + "learning_rate": 0.0005, + "loss": 2.1529, + "step": 36120 + }, + { + "epoch": 0.1375196973272535, + "grad_norm": 0.11940285563468933, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 36130 + }, + { + "epoch": 0.13755775979537618, + "grad_norm": 0.1287795752286911, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 36140 + }, + { + "epoch": 0.13759582226349887, + "grad_norm": 0.11979345977306366, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 36150 + }, + { + "epoch": 0.13763388473162153, + "grad_norm": 0.1211363896727562, + "learning_rate": 0.0005, + "loss": 2.1497, + "step": 36160 + }, + { + "epoch": 0.1376719471997442, + "grad_norm": 0.11665554344654083, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 36170 + }, + { + "epoch": 0.1377100096678669, + "grad_norm": 0.11989234387874603, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 36180 + }, + { + "epoch": 0.13774807213598958, + "grad_norm": 0.12173985689878464, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 36190 + }, + { + "epoch": 0.13778613460411226, + "grad_norm": 0.11196370422840118, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 36200 + }, + { + "epoch": 0.13782419707223495, + "grad_norm": 0.1438337117433548, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 36210 + }, + { + "epoch": 0.13786225954035763, + "grad_norm": 0.13081873953342438, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 36220 + }, + { + "epoch": 0.13790032200848032, + "grad_norm": 0.12510628998279572, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 36230 + }, + { + "epoch": 0.137938384476603, + "grad_norm": 0.11957137286663055, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 36240 + }, + { + "epoch": 0.1379764469447257, + "grad_norm": 0.12658755481243134, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 36250 + }, + { + "epoch": 0.13801450941284837, + "grad_norm": 0.13881908357143402, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 36260 + }, + { + "epoch": 0.13805257188097106, + "grad_norm": 0.12685273587703705, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 36270 + }, + { + "epoch": 0.13809063434909374, + "grad_norm": 0.12292812764644623, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 36280 + }, + { + "epoch": 0.13812869681721643, + "grad_norm": 0.1255536824464798, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 36290 + }, + { + "epoch": 0.13816675928533909, + "grad_norm": 0.13109980523586273, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 36300 + }, + { + "epoch": 0.13820482175346177, + "grad_norm": 0.12761881947517395, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 36310 + }, + { + "epoch": 0.13824288422158446, + "grad_norm": 0.1440645009279251, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 36320 + }, + { + "epoch": 0.13828094668970714, + "grad_norm": 0.12873628735542297, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 36330 + }, + { + "epoch": 0.13831900915782983, + "grad_norm": 0.12757310271263123, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 36340 + }, + { + "epoch": 0.1383570716259525, + "grad_norm": 0.14658595621585846, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 36350 + }, + { + "epoch": 0.1383951340940752, + "grad_norm": 0.12868966162204742, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 36360 + }, + { + "epoch": 0.13843319656219788, + "grad_norm": 0.12172853201627731, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 36370 + }, + { + "epoch": 0.13847125903032056, + "grad_norm": 0.12915430963039398, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 36380 + }, + { + "epoch": 0.13850932149844325, + "grad_norm": 0.11664684861898422, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 36390 + }, + { + "epoch": 0.13854738396656593, + "grad_norm": 0.1285259872674942, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 36400 + }, + { + "epoch": 0.13858544643468862, + "grad_norm": 0.12674078345298767, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 36410 + }, + { + "epoch": 0.1386235089028113, + "grad_norm": 0.1258033961057663, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 36420 + }, + { + "epoch": 0.138661571370934, + "grad_norm": 0.13805773854255676, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 36430 + }, + { + "epoch": 0.13869963383905665, + "grad_norm": 0.12224043905735016, + "learning_rate": 0.0005, + "loss": 2.1601, + "step": 36440 + }, + { + "epoch": 0.13873769630717933, + "grad_norm": 0.11138515174388885, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 36450 + }, + { + "epoch": 0.13877575877530202, + "grad_norm": 0.13060350716114044, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 36460 + }, + { + "epoch": 0.1388138212434247, + "grad_norm": 0.12439647316932678, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 36470 + }, + { + "epoch": 0.13885188371154739, + "grad_norm": 0.1259547621011734, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 36480 + }, + { + "epoch": 0.13888994617967007, + "grad_norm": 0.12292451411485672, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 36490 + }, + { + "epoch": 0.13892800864779276, + "grad_norm": 0.11878538131713867, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 36500 + }, + { + "epoch": 0.13896607111591544, + "grad_norm": 0.13011427223682404, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 36510 + }, + { + "epoch": 0.13900413358403813, + "grad_norm": 0.1278398334980011, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 36520 + }, + { + "epoch": 0.1390421960521608, + "grad_norm": 0.1215527355670929, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 36530 + }, + { + "epoch": 0.1390802585202835, + "grad_norm": 0.11837470531463623, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 36540 + }, + { + "epoch": 0.13911832098840618, + "grad_norm": 0.1168607845902443, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 36550 + }, + { + "epoch": 0.13915638345652886, + "grad_norm": 0.12791843712329865, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 36560 + }, + { + "epoch": 0.13919444592465155, + "grad_norm": 0.13247643411159515, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 36570 + }, + { + "epoch": 0.13923250839277423, + "grad_norm": 0.13305790722370148, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 36580 + }, + { + "epoch": 0.1392705708608969, + "grad_norm": 0.1222585067152977, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 36590 + }, + { + "epoch": 0.13930863332901958, + "grad_norm": 0.12503567337989807, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 36600 + }, + { + "epoch": 0.13934669579714226, + "grad_norm": 0.12737417221069336, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 36610 + }, + { + "epoch": 0.13938475826526495, + "grad_norm": 0.19949164986610413, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 36620 + }, + { + "epoch": 0.13942282073338763, + "grad_norm": 0.12841491401195526, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 36630 + }, + { + "epoch": 0.13946088320151032, + "grad_norm": 0.1240854263305664, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 36640 + }, + { + "epoch": 0.139498945669633, + "grad_norm": 0.11256958544254303, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 36650 + }, + { + "epoch": 0.13953700813775569, + "grad_norm": 0.11909143626689911, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 36660 + }, + { + "epoch": 0.13957507060587837, + "grad_norm": 0.11793980002403259, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 36670 + }, + { + "epoch": 0.13961313307400106, + "grad_norm": 0.12352468818426132, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 36680 + }, + { + "epoch": 0.13965119554212374, + "grad_norm": 0.11416994780302048, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 36690 + }, + { + "epoch": 0.13968925801024643, + "grad_norm": 0.11804043501615524, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 36700 + }, + { + "epoch": 0.1397273204783691, + "grad_norm": 0.1268421709537506, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 36710 + }, + { + "epoch": 0.1397653829464918, + "grad_norm": 0.1253766119480133, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 36720 + }, + { + "epoch": 0.13980344541461445, + "grad_norm": 0.13076674938201904, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 36730 + }, + { + "epoch": 0.13984150788273714, + "grad_norm": 0.12516288459300995, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 36740 + }, + { + "epoch": 0.13987957035085982, + "grad_norm": 0.12267495691776276, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 36750 + }, + { + "epoch": 0.1399176328189825, + "grad_norm": 0.13755877315998077, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 36760 + }, + { + "epoch": 0.1399556952871052, + "grad_norm": 0.12464100122451782, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 36770 + }, + { + "epoch": 0.13999375775522788, + "grad_norm": 0.11837789416313171, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 36780 + }, + { + "epoch": 0.14003182022335056, + "grad_norm": 0.12519973516464233, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 36790 + }, + { + "epoch": 0.14006988269147325, + "grad_norm": 0.12340795993804932, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 36800 + }, + { + "epoch": 0.14010794515959593, + "grad_norm": 0.12131775170564651, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 36810 + }, + { + "epoch": 0.14014600762771862, + "grad_norm": 0.12783758342266083, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 36820 + }, + { + "epoch": 0.1401840700958413, + "grad_norm": 0.11167051643133163, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 36830 + }, + { + "epoch": 0.14022213256396399, + "grad_norm": 0.12533417344093323, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 36840 + }, + { + "epoch": 0.14026019503208667, + "grad_norm": 0.13003957271575928, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 36850 + }, + { + "epoch": 0.14029825750020936, + "grad_norm": 0.12811410427093506, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 36860 + }, + { + "epoch": 0.140336319968332, + "grad_norm": 0.13217073678970337, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 36870 + }, + { + "epoch": 0.1403743824364547, + "grad_norm": 0.12067825347185135, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 36880 + }, + { + "epoch": 0.14041244490457738, + "grad_norm": 0.12828949093818665, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 36890 + }, + { + "epoch": 0.14045050737270007, + "grad_norm": 0.11882949620485306, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 36900 + }, + { + "epoch": 0.14048856984082275, + "grad_norm": 0.11670491844415665, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 36910 + }, + { + "epoch": 0.14052663230894544, + "grad_norm": 0.12217088788747787, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 36920 + }, + { + "epoch": 0.14056469477706812, + "grad_norm": 0.10978880524635315, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 36930 + }, + { + "epoch": 0.1406027572451908, + "grad_norm": 0.11447262018918991, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 36940 + }, + { + "epoch": 0.1406408197133135, + "grad_norm": 0.13392773270606995, + "learning_rate": 0.0005, + "loss": 2.1545, + "step": 36950 + }, + { + "epoch": 0.14067888218143618, + "grad_norm": 0.1232706680893898, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 36960 + }, + { + "epoch": 0.14071694464955886, + "grad_norm": 0.13241569697856903, + "learning_rate": 0.0005, + "loss": 2.1537, + "step": 36970 + }, + { + "epoch": 0.14075500711768155, + "grad_norm": 0.11878272891044617, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 36980 + }, + { + "epoch": 0.14079306958580423, + "grad_norm": 0.12405449151992798, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 36990 + }, + { + "epoch": 0.14083113205392692, + "grad_norm": 0.11694375425577164, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 37000 + }, + { + "epoch": 0.1408691945220496, + "grad_norm": 0.11805060505867004, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 37010 + }, + { + "epoch": 0.14090725699017226, + "grad_norm": 0.1309378743171692, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 37020 + }, + { + "epoch": 0.14094531945829494, + "grad_norm": 0.13261401653289795, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 37030 + }, + { + "epoch": 0.14098338192641763, + "grad_norm": 0.14187407493591309, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 37040 + }, + { + "epoch": 0.1410214443945403, + "grad_norm": 0.11595277488231659, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 37050 + }, + { + "epoch": 0.141059506862663, + "grad_norm": 0.12343664467334747, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 37060 + }, + { + "epoch": 0.14109756933078568, + "grad_norm": 0.11963911354541779, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 37070 + }, + { + "epoch": 0.14113563179890837, + "grad_norm": 0.12053116410970688, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 37080 + }, + { + "epoch": 0.14117369426703105, + "grad_norm": 0.1328931450843811, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 37090 + }, + { + "epoch": 0.14121175673515374, + "grad_norm": 0.12884977459907532, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 37100 + }, + { + "epoch": 0.14124981920327642, + "grad_norm": 0.12232869118452072, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 37110 + }, + { + "epoch": 0.1412878816713991, + "grad_norm": 0.12900561094284058, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 37120 + }, + { + "epoch": 0.1413259441395218, + "grad_norm": 0.12357156723737717, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 37130 + }, + { + "epoch": 0.14136400660764448, + "grad_norm": 0.11812389642000198, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 37140 + }, + { + "epoch": 0.14140206907576716, + "grad_norm": 0.12773482501506805, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 37150 + }, + { + "epoch": 0.14144013154388982, + "grad_norm": 0.12107904255390167, + "learning_rate": 0.0005, + "loss": 2.1569, + "step": 37160 + }, + { + "epoch": 0.1414781940120125, + "grad_norm": 0.12494766712188721, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 37170 + }, + { + "epoch": 0.1415162564801352, + "grad_norm": 0.12003045529127121, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 37180 + }, + { + "epoch": 0.14155431894825787, + "grad_norm": 0.1376497894525528, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 37190 + }, + { + "epoch": 0.14159238141638056, + "grad_norm": 0.1384882777929306, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 37200 + }, + { + "epoch": 0.14163044388450324, + "grad_norm": 0.13794715702533722, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 37210 + }, + { + "epoch": 0.14166850635262593, + "grad_norm": 0.12299910932779312, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 37220 + }, + { + "epoch": 0.1417065688207486, + "grad_norm": 0.12569528818130493, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 37230 + }, + { + "epoch": 0.1417446312888713, + "grad_norm": 0.11445607244968414, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 37240 + }, + { + "epoch": 0.14178269375699398, + "grad_norm": 0.13320550322532654, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 37250 + }, + { + "epoch": 0.14182075622511667, + "grad_norm": 0.13344018161296844, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 37260 + }, + { + "epoch": 0.14185881869323935, + "grad_norm": 0.12571915984153748, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 37270 + }, + { + "epoch": 0.14189688116136204, + "grad_norm": 0.13246026635169983, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 37280 + }, + { + "epoch": 0.14193494362948472, + "grad_norm": 0.1409611701965332, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 37290 + }, + { + "epoch": 0.1419730060976074, + "grad_norm": 0.12814782559871674, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 37300 + }, + { + "epoch": 0.14201106856573006, + "grad_norm": 0.14565905928611755, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 37310 + }, + { + "epoch": 0.14204913103385275, + "grad_norm": 0.13787856698036194, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 37320 + }, + { + "epoch": 0.14208719350197543, + "grad_norm": 0.11704063415527344, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 37330 + }, + { + "epoch": 0.14212525597009812, + "grad_norm": 0.12095765024423599, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 37340 + }, + { + "epoch": 0.1421633184382208, + "grad_norm": 0.13464663922786713, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 37350 + }, + { + "epoch": 0.1422013809063435, + "grad_norm": 0.11635299772024155, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 37360 + }, + { + "epoch": 0.14223944337446617, + "grad_norm": 0.12324802577495575, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 37370 + }, + { + "epoch": 0.14227750584258886, + "grad_norm": 0.12421499192714691, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 37380 + }, + { + "epoch": 0.14231556831071154, + "grad_norm": 0.1331142783164978, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 37390 + }, + { + "epoch": 0.14235363077883423, + "grad_norm": 0.1281598061323166, + "learning_rate": 0.0005, + "loss": 2.1496, + "step": 37400 + }, + { + "epoch": 0.1423916932469569, + "grad_norm": 0.12264150381088257, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 37410 + }, + { + "epoch": 0.1424297557150796, + "grad_norm": 0.11806928366422653, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 37420 + }, + { + "epoch": 0.14246781818320228, + "grad_norm": 0.11630085110664368, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 37430 + }, + { + "epoch": 0.14250588065132497, + "grad_norm": 0.1202859953045845, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 37440 + }, + { + "epoch": 0.14254394311944762, + "grad_norm": 0.13352476060390472, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 37450 + }, + { + "epoch": 0.1425820055875703, + "grad_norm": 0.12109289318323135, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 37460 + }, + { + "epoch": 0.142620068055693, + "grad_norm": 0.1269426792860031, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 37470 + }, + { + "epoch": 0.14265813052381568, + "grad_norm": 0.11681367456912994, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 37480 + }, + { + "epoch": 0.14269619299193836, + "grad_norm": 0.1320292204618454, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 37490 + }, + { + "epoch": 0.14273425546006105, + "grad_norm": 0.12286791950464249, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 37500 + }, + { + "epoch": 0.14277231792818373, + "grad_norm": 0.11698734760284424, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 37510 + }, + { + "epoch": 0.14281038039630642, + "grad_norm": 0.12439266592264175, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 37520 + }, + { + "epoch": 0.1428484428644291, + "grad_norm": 0.11994090676307678, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 37530 + }, + { + "epoch": 0.1428865053325518, + "grad_norm": 0.12152058631181717, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 37540 + }, + { + "epoch": 0.14292456780067447, + "grad_norm": 0.13039402663707733, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 37550 + }, + { + "epoch": 0.14296263026879716, + "grad_norm": 0.1288081258535385, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 37560 + }, + { + "epoch": 0.14300069273691984, + "grad_norm": 0.11937260627746582, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 37570 + }, + { + "epoch": 0.14303875520504253, + "grad_norm": 0.13222989439964294, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 37580 + }, + { + "epoch": 0.14307681767316519, + "grad_norm": 0.1243140771985054, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 37590 + }, + { + "epoch": 0.14311488014128787, + "grad_norm": 0.1320112645626068, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 37600 + }, + { + "epoch": 0.14315294260941056, + "grad_norm": 0.12151151895523071, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 37610 + }, + { + "epoch": 0.14319100507753324, + "grad_norm": 0.14852769672870636, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 37620 + }, + { + "epoch": 0.14322906754565592, + "grad_norm": 0.12357402592897415, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 37630 + }, + { + "epoch": 0.1432671300137786, + "grad_norm": 0.12055303901433945, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 37640 + }, + { + "epoch": 0.1433051924819013, + "grad_norm": 0.15052442252635956, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 37650 + }, + { + "epoch": 0.14334325495002398, + "grad_norm": 0.12786470353603363, + "learning_rate": 0.0005, + "loss": 2.1584, + "step": 37660 + }, + { + "epoch": 0.14338131741814666, + "grad_norm": 0.13811320066452026, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 37670 + }, + { + "epoch": 0.14341937988626935, + "grad_norm": 0.1383814960718155, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 37680 + }, + { + "epoch": 0.14345744235439203, + "grad_norm": 0.12483002990484238, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 37690 + }, + { + "epoch": 0.14349550482251472, + "grad_norm": 0.11994721740484238, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 37700 + }, + { + "epoch": 0.1435335672906374, + "grad_norm": 0.13180948793888092, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 37710 + }, + { + "epoch": 0.1435716297587601, + "grad_norm": 0.12397965788841248, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 37720 + }, + { + "epoch": 0.14360969222688277, + "grad_norm": 0.11854337155818939, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 37730 + }, + { + "epoch": 0.14364775469500543, + "grad_norm": 0.11657878011465073, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 37740 + }, + { + "epoch": 0.14368581716312812, + "grad_norm": 0.12216203659772873, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 37750 + }, + { + "epoch": 0.1437238796312508, + "grad_norm": 0.1354895681142807, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 37760 + }, + { + "epoch": 0.14376194209937349, + "grad_norm": 0.1245875358581543, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 37770 + }, + { + "epoch": 0.14380000456749617, + "grad_norm": 0.11784937977790833, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 37780 + }, + { + "epoch": 0.14383806703561886, + "grad_norm": 0.1278153657913208, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 37790 + }, + { + "epoch": 0.14387612950374154, + "grad_norm": 0.12693701684474945, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 37800 + }, + { + "epoch": 0.14391419197186422, + "grad_norm": 0.13049623370170593, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 37810 + }, + { + "epoch": 0.1439522544399869, + "grad_norm": 0.12316485494375229, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 37820 + }, + { + "epoch": 0.1439903169081096, + "grad_norm": 0.1379222273826599, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 37830 + }, + { + "epoch": 0.14402837937623228, + "grad_norm": 0.11370328068733215, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 37840 + }, + { + "epoch": 0.14406644184435496, + "grad_norm": 0.1197470873594284, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 37850 + }, + { + "epoch": 0.14410450431247765, + "grad_norm": 0.12998999655246735, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 37860 + }, + { + "epoch": 0.14414256678060033, + "grad_norm": 0.13060982525348663, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 37870 + }, + { + "epoch": 0.144180629248723, + "grad_norm": 0.11585245281457901, + "learning_rate": 0.0005, + "loss": 2.156, + "step": 37880 + }, + { + "epoch": 0.14421869171684568, + "grad_norm": 0.12489481270313263, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 37890 + }, + { + "epoch": 0.14425675418496836, + "grad_norm": 0.15002131462097168, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 37900 + }, + { + "epoch": 0.14429481665309105, + "grad_norm": 0.12778665125370026, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 37910 + }, + { + "epoch": 0.14433287912121373, + "grad_norm": 0.12881921231746674, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 37920 + }, + { + "epoch": 0.14437094158933642, + "grad_norm": 0.11601099371910095, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 37930 + }, + { + "epoch": 0.1444090040574591, + "grad_norm": 0.12787161767482758, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 37940 + }, + { + "epoch": 0.14444706652558179, + "grad_norm": 0.12424326688051224, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 37950 + }, + { + "epoch": 0.14448512899370447, + "grad_norm": 0.12463422119617462, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 37960 + }, + { + "epoch": 0.14452319146182716, + "grad_norm": 0.11958973109722137, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 37970 + }, + { + "epoch": 0.14456125392994984, + "grad_norm": 0.12186741083860397, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 37980 + }, + { + "epoch": 0.14459931639807252, + "grad_norm": 0.1192806214094162, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 37990 + }, + { + "epoch": 0.1446373788661952, + "grad_norm": 0.11857765913009644, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 38000 + }, + { + "epoch": 0.1446754413343179, + "grad_norm": 0.1343282014131546, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 38010 + }, + { + "epoch": 0.14471350380244055, + "grad_norm": 0.13186195492744446, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 38020 + }, + { + "epoch": 0.14475156627056324, + "grad_norm": 0.12335959076881409, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 38030 + }, + { + "epoch": 0.14478962873868592, + "grad_norm": 0.1687193065881729, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 38040 + }, + { + "epoch": 0.1448276912068086, + "grad_norm": 0.14608801901340485, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 38050 + }, + { + "epoch": 0.1448657536749313, + "grad_norm": 0.13224822282791138, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 38060 + }, + { + "epoch": 0.14490381614305398, + "grad_norm": 0.12992419302463531, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 38070 + }, + { + "epoch": 0.14494187861117666, + "grad_norm": 0.1225738450884819, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 38080 + }, + { + "epoch": 0.14497994107929935, + "grad_norm": 0.12545767426490784, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 38090 + }, + { + "epoch": 0.14501800354742203, + "grad_norm": 0.12390364706516266, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 38100 + }, + { + "epoch": 0.14505606601554472, + "grad_norm": 0.11716514825820923, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 38110 + }, + { + "epoch": 0.1450941284836674, + "grad_norm": 0.1196618601679802, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 38120 + }, + { + "epoch": 0.14513219095179009, + "grad_norm": 0.11519220471382141, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 38130 + }, + { + "epoch": 0.14517025341991277, + "grad_norm": 0.1359359472990036, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 38140 + }, + { + "epoch": 0.14520831588803546, + "grad_norm": 0.132180318236351, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 38150 + }, + { + "epoch": 0.14524637835615814, + "grad_norm": 0.13018040359020233, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 38160 + }, + { + "epoch": 0.1452844408242808, + "grad_norm": 0.12225526571273804, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 38170 + }, + { + "epoch": 0.14532250329240348, + "grad_norm": 0.11801525950431824, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 38180 + }, + { + "epoch": 0.14536056576052617, + "grad_norm": 0.11402857303619385, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 38190 + }, + { + "epoch": 0.14539862822864885, + "grad_norm": 0.11925767362117767, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 38200 + }, + { + "epoch": 0.14543669069677154, + "grad_norm": 0.11742289364337921, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 38210 + }, + { + "epoch": 0.14547475316489422, + "grad_norm": 0.11711934953927994, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 38220 + }, + { + "epoch": 0.1455128156330169, + "grad_norm": 0.10924158990383148, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 38230 + }, + { + "epoch": 0.1455508781011396, + "grad_norm": 0.12829233705997467, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 38240 + }, + { + "epoch": 0.14558894056926228, + "grad_norm": 0.10900772362947464, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 38250 + }, + { + "epoch": 0.14562700303738496, + "grad_norm": 0.1258832812309265, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 38260 + }, + { + "epoch": 0.14566506550550765, + "grad_norm": 0.11658526957035065, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 38270 + }, + { + "epoch": 0.14570312797363033, + "grad_norm": 0.1278267353773117, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 38280 + }, + { + "epoch": 0.14574119044175302, + "grad_norm": 0.1270705610513687, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 38290 + }, + { + "epoch": 0.1457792529098757, + "grad_norm": 0.11987922340631485, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 38300 + }, + { + "epoch": 0.14581731537799836, + "grad_norm": 0.12335024774074554, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 38310 + }, + { + "epoch": 0.14585537784612104, + "grad_norm": 0.12798041105270386, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 38320 + }, + { + "epoch": 0.14589344031424373, + "grad_norm": 0.13565818965435028, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 38330 + }, + { + "epoch": 0.1459315027823664, + "grad_norm": 0.13244055211544037, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 38340 + }, + { + "epoch": 0.1459695652504891, + "grad_norm": 0.13553233444690704, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 38350 + }, + { + "epoch": 0.14600762771861178, + "grad_norm": 0.1146652102470398, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 38360 + }, + { + "epoch": 0.14604569018673447, + "grad_norm": 0.13296912610530853, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 38370 + }, + { + "epoch": 0.14608375265485715, + "grad_norm": 0.13272130489349365, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 38380 + }, + { + "epoch": 0.14612181512297984, + "grad_norm": 0.12927483022212982, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 38390 + }, + { + "epoch": 0.14615987759110252, + "grad_norm": 0.12845762073993683, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 38400 + }, + { + "epoch": 0.1461979400592252, + "grad_norm": 0.11350326985120773, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 38410 + }, + { + "epoch": 0.1462360025273479, + "grad_norm": 0.11804868280887604, + "learning_rate": 0.0005, + "loss": 2.1515, + "step": 38420 + }, + { + "epoch": 0.14627406499547058, + "grad_norm": 0.127664253115654, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 38430 + }, + { + "epoch": 0.14631212746359326, + "grad_norm": 0.13080111145973206, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 38440 + }, + { + "epoch": 0.14635018993171595, + "grad_norm": 0.12366043031215668, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 38450 + }, + { + "epoch": 0.1463882523998386, + "grad_norm": 0.1258617490530014, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 38460 + }, + { + "epoch": 0.1464263148679613, + "grad_norm": 0.12624163925647736, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 38470 + }, + { + "epoch": 0.14646437733608397, + "grad_norm": 0.11812226474285126, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 38480 + }, + { + "epoch": 0.14650243980420666, + "grad_norm": 0.12122759968042374, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 38490 + }, + { + "epoch": 0.14654050227232934, + "grad_norm": 0.12794890999794006, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 38500 + }, + { + "epoch": 0.14657856474045203, + "grad_norm": 0.1506595015525818, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 38510 + }, + { + "epoch": 0.1466166272085747, + "grad_norm": 0.20770882070064545, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 38520 + }, + { + "epoch": 0.1466546896766974, + "grad_norm": 0.1322670727968216, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 38530 + }, + { + "epoch": 0.14669275214482008, + "grad_norm": 0.13230635225772858, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 38540 + }, + { + "epoch": 0.14673081461294277, + "grad_norm": 0.12269840389490128, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 38550 + }, + { + "epoch": 0.14676887708106545, + "grad_norm": 0.12784303724765778, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 38560 + }, + { + "epoch": 0.14680693954918814, + "grad_norm": 0.12518112361431122, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 38570 + }, + { + "epoch": 0.14684500201731082, + "grad_norm": 0.11033365875482559, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 38580 + }, + { + "epoch": 0.1468830644854335, + "grad_norm": 0.11702969670295715, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 38590 + }, + { + "epoch": 0.14692112695355616, + "grad_norm": 0.1252330094575882, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 38600 + }, + { + "epoch": 0.14695918942167885, + "grad_norm": 0.12296465784311295, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 38610 + }, + { + "epoch": 0.14699725188980153, + "grad_norm": 0.1148548498749733, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 38620 + }, + { + "epoch": 0.14703531435792422, + "grad_norm": 0.13031576573848724, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 38630 + }, + { + "epoch": 0.1470733768260469, + "grad_norm": 0.11590581387281418, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 38640 + }, + { + "epoch": 0.1471114392941696, + "grad_norm": 0.12280098348855972, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 38650 + }, + { + "epoch": 0.14714950176229227, + "grad_norm": 0.1293204426765442, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 38660 + }, + { + "epoch": 0.14718756423041496, + "grad_norm": 0.12075809389352798, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 38670 + }, + { + "epoch": 0.14722562669853764, + "grad_norm": 0.13208520412445068, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 38680 + }, + { + "epoch": 0.14726368916666033, + "grad_norm": 0.138469398021698, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 38690 + }, + { + "epoch": 0.147301751634783, + "grad_norm": 0.12534962594509125, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 38700 + }, + { + "epoch": 0.1473398141029057, + "grad_norm": 0.1162012442946434, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 38710 + }, + { + "epoch": 0.14737787657102838, + "grad_norm": 0.1286344975233078, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 38720 + }, + { + "epoch": 0.14741593903915107, + "grad_norm": 0.12214122712612152, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 38730 + }, + { + "epoch": 0.14745400150727372, + "grad_norm": 0.12790422141551971, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 38740 + }, + { + "epoch": 0.1474920639753964, + "grad_norm": 0.13529656827449799, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 38750 + }, + { + "epoch": 0.1475301264435191, + "grad_norm": 0.1380883902311325, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 38760 + }, + { + "epoch": 0.14756818891164178, + "grad_norm": 0.12100613117218018, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 38770 + }, + { + "epoch": 0.14760625137976446, + "grad_norm": 0.13053341209888458, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 38780 + }, + { + "epoch": 0.14764431384788715, + "grad_norm": 0.1276913583278656, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 38790 + }, + { + "epoch": 0.14768237631600983, + "grad_norm": 0.12804895639419556, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 38800 + }, + { + "epoch": 0.14772043878413252, + "grad_norm": 0.1234828382730484, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 38810 + }, + { + "epoch": 0.1477585012522552, + "grad_norm": 0.10741379112005234, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 38820 + }, + { + "epoch": 0.1477965637203779, + "grad_norm": 0.13724245131015778, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 38830 + }, + { + "epoch": 0.14783462618850057, + "grad_norm": 0.11840417236089706, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 38840 + }, + { + "epoch": 0.14787268865662326, + "grad_norm": 0.13324248790740967, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 38850 + }, + { + "epoch": 0.14791075112474594, + "grad_norm": 0.12486392259597778, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 38860 + }, + { + "epoch": 0.14794881359286863, + "grad_norm": 0.13672782480716705, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 38870 + }, + { + "epoch": 0.1479868760609913, + "grad_norm": 0.11927548050880432, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 38880 + }, + { + "epoch": 0.14802493852911397, + "grad_norm": 0.1426343470811844, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 38890 + }, + { + "epoch": 0.14806300099723665, + "grad_norm": 0.1276564598083496, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 38900 + }, + { + "epoch": 0.14810106346535934, + "grad_norm": 0.11925414949655533, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 38910 + }, + { + "epoch": 0.14813912593348202, + "grad_norm": 0.13122230768203735, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 38920 + }, + { + "epoch": 0.1481771884016047, + "grad_norm": 0.12605653703212738, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 38930 + }, + { + "epoch": 0.1482152508697274, + "grad_norm": 0.12071428447961807, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 38940 + }, + { + "epoch": 0.14825331333785008, + "grad_norm": 0.14357160031795502, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 38950 + }, + { + "epoch": 0.14829137580597276, + "grad_norm": 0.118900828063488, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 38960 + }, + { + "epoch": 0.14832943827409545, + "grad_norm": 0.1205672100186348, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 38970 + }, + { + "epoch": 0.14836750074221813, + "grad_norm": 0.11521317064762115, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 38980 + }, + { + "epoch": 0.14840556321034082, + "grad_norm": 0.13125668466091156, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 38990 + }, + { + "epoch": 0.1484436256784635, + "grad_norm": 0.11577267944812775, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 39000 + }, + { + "epoch": 0.1484816881465862, + "grad_norm": 0.13303667306900024, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 39010 + }, + { + "epoch": 0.14851975061470887, + "grad_norm": 0.12673087418079376, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 39020 + }, + { + "epoch": 0.14855781308283153, + "grad_norm": 0.12567567825317383, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 39030 + }, + { + "epoch": 0.14859587555095422, + "grad_norm": 0.1653776913881302, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 39040 + }, + { + "epoch": 0.1486339380190769, + "grad_norm": 0.12336651980876923, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 39050 + }, + { + "epoch": 0.14867200048719958, + "grad_norm": 0.1362055391073227, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 39060 + }, + { + "epoch": 0.14871006295532227, + "grad_norm": 0.12506848573684692, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 39070 + }, + { + "epoch": 0.14874812542344495, + "grad_norm": 0.1321392059326172, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 39080 + }, + { + "epoch": 0.14878618789156764, + "grad_norm": 0.12654539942741394, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 39090 + }, + { + "epoch": 0.14882425035969032, + "grad_norm": 0.13101084530353546, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 39100 + }, + { + "epoch": 0.148862312827813, + "grad_norm": 0.1241462379693985, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 39110 + }, + { + "epoch": 0.1489003752959357, + "grad_norm": 0.1250251829624176, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 39120 + }, + { + "epoch": 0.14893843776405838, + "grad_norm": 0.11190465837717056, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 39130 + }, + { + "epoch": 0.14897650023218106, + "grad_norm": 0.11848768591880798, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 39140 + }, + { + "epoch": 0.14901456270030375, + "grad_norm": 0.1262151002883911, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 39150 + }, + { + "epoch": 0.14905262516842643, + "grad_norm": 0.12781231105327606, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 39160 + }, + { + "epoch": 0.1490906876365491, + "grad_norm": 0.12659291923046112, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 39170 + }, + { + "epoch": 0.14912875010467178, + "grad_norm": 0.11960427463054657, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 39180 + }, + { + "epoch": 0.14916681257279446, + "grad_norm": 0.12238552421331406, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 39190 + }, + { + "epoch": 0.14920487504091715, + "grad_norm": 0.12759989500045776, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 39200 + }, + { + "epoch": 0.14924293750903983, + "grad_norm": 0.13447055220603943, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 39210 + }, + { + "epoch": 0.14928099997716252, + "grad_norm": 0.13414421677589417, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 39220 + }, + { + "epoch": 0.1493190624452852, + "grad_norm": 0.12294954806566238, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 39230 + }, + { + "epoch": 0.14935712491340788, + "grad_norm": 0.12728038430213928, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 39240 + }, + { + "epoch": 0.14939518738153057, + "grad_norm": 0.12434787303209305, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 39250 + }, + { + "epoch": 0.14943324984965325, + "grad_norm": 0.11438928544521332, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 39260 + }, + { + "epoch": 0.14947131231777594, + "grad_norm": 0.11270195990800858, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 39270 + }, + { + "epoch": 0.14950937478589862, + "grad_norm": 0.14034071564674377, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 39280 + }, + { + "epoch": 0.1495474372540213, + "grad_norm": 0.12831617891788483, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 39290 + }, + { + "epoch": 0.149585499722144, + "grad_norm": 0.1376051902770996, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 39300 + }, + { + "epoch": 0.14962356219026668, + "grad_norm": 0.11683588474988937, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 39310 + }, + { + "epoch": 0.14966162465838934, + "grad_norm": 0.11693572252988815, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 39320 + }, + { + "epoch": 0.14969968712651202, + "grad_norm": 0.11618661880493164, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 39330 + }, + { + "epoch": 0.1497377495946347, + "grad_norm": 0.13750776648521423, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 39340 + }, + { + "epoch": 0.1497758120627574, + "grad_norm": 0.128860741853714, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 39350 + }, + { + "epoch": 0.14981387453088008, + "grad_norm": 0.11993896961212158, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 39360 + }, + { + "epoch": 0.14985193699900276, + "grad_norm": 0.12513276934623718, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 39370 + }, + { + "epoch": 0.14988999946712545, + "grad_norm": 0.12072756141424179, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 39380 + }, + { + "epoch": 0.14992806193524813, + "grad_norm": 0.11543644219636917, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 39390 + }, + { + "epoch": 0.14996612440337082, + "grad_norm": 0.12288960069417953, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 39400 + }, + { + "epoch": 0.1500041868714935, + "grad_norm": 0.13772891461849213, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 39410 + }, + { + "epoch": 0.15004224933961618, + "grad_norm": 0.14142270386219025, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 39420 + }, + { + "epoch": 0.15008031180773887, + "grad_norm": 0.13073478639125824, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 39430 + }, + { + "epoch": 0.15011837427586155, + "grad_norm": 0.13670803606510162, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 39440 + }, + { + "epoch": 0.15015643674398424, + "grad_norm": 0.1232762262225151, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 39450 + }, + { + "epoch": 0.1501944992121069, + "grad_norm": 0.11498036235570908, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 39460 + }, + { + "epoch": 0.15023256168022958, + "grad_norm": 0.1252405345439911, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 39470 + }, + { + "epoch": 0.15027062414835227, + "grad_norm": 0.11288294196128845, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 39480 + }, + { + "epoch": 0.15030868661647495, + "grad_norm": 0.12291713058948517, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 39490 + }, + { + "epoch": 0.15034674908459764, + "grad_norm": 0.12331737577915192, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 39500 + }, + { + "epoch": 0.15038481155272032, + "grad_norm": 0.11889985203742981, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 39510 + }, + { + "epoch": 0.150422874020843, + "grad_norm": 0.1344471573829651, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 39520 + }, + { + "epoch": 0.1504609364889657, + "grad_norm": 0.13127154111862183, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 39530 + }, + { + "epoch": 0.15049899895708838, + "grad_norm": 0.13445231318473816, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 39540 + }, + { + "epoch": 0.15053706142521106, + "grad_norm": 0.12982484698295593, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 39550 + }, + { + "epoch": 0.15057512389333375, + "grad_norm": 0.12241829931735992, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 39560 + }, + { + "epoch": 0.15061318636145643, + "grad_norm": 0.12182803452014923, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 39570 + }, + { + "epoch": 0.15065124882957912, + "grad_norm": 0.12285663187503815, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 39580 + }, + { + "epoch": 0.1506893112977018, + "grad_norm": 0.12623947858810425, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 39590 + }, + { + "epoch": 0.15072737376582448, + "grad_norm": 0.126447856426239, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 39600 + }, + { + "epoch": 0.15076543623394714, + "grad_norm": 0.11390262097120285, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 39610 + }, + { + "epoch": 0.15080349870206983, + "grad_norm": 0.1311657577753067, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 39620 + }, + { + "epoch": 0.1508415611701925, + "grad_norm": 0.12902265787124634, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 39630 + }, + { + "epoch": 0.1508796236383152, + "grad_norm": 0.11394213885068893, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 39640 + }, + { + "epoch": 0.15091768610643788, + "grad_norm": 0.12555819749832153, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 39650 + }, + { + "epoch": 0.15095574857456057, + "grad_norm": 0.12388810515403748, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 39660 + }, + { + "epoch": 0.15099381104268325, + "grad_norm": 0.13326005637645721, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 39670 + }, + { + "epoch": 0.15103187351080594, + "grad_norm": 0.1274305284023285, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 39680 + }, + { + "epoch": 0.15106993597892862, + "grad_norm": 0.12226250767707825, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 39690 + }, + { + "epoch": 0.1511079984470513, + "grad_norm": 0.14611788094043732, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 39700 + }, + { + "epoch": 0.151146060915174, + "grad_norm": 0.13780009746551514, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 39710 + }, + { + "epoch": 0.15118412338329668, + "grad_norm": 0.1312699317932129, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 39720 + }, + { + "epoch": 0.15122218585141936, + "grad_norm": 0.1251794397830963, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 39730 + }, + { + "epoch": 0.15126024831954205, + "grad_norm": 0.11780116707086563, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 39740 + }, + { + "epoch": 0.1512983107876647, + "grad_norm": 0.1149701476097107, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 39750 + }, + { + "epoch": 0.1513363732557874, + "grad_norm": 0.13181151449680328, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 39760 + }, + { + "epoch": 0.15137443572391007, + "grad_norm": 0.14071857929229736, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 39770 + }, + { + "epoch": 0.15141249819203276, + "grad_norm": 0.12593907117843628, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 39780 + }, + { + "epoch": 0.15145056066015544, + "grad_norm": 0.13257673382759094, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 39790 + }, + { + "epoch": 0.15148862312827813, + "grad_norm": 0.12204756587743759, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 39800 + }, + { + "epoch": 0.1515266855964008, + "grad_norm": 0.15629100799560547, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 39810 + }, + { + "epoch": 0.1515647480645235, + "grad_norm": 0.13981211185455322, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 39820 + }, + { + "epoch": 0.15160281053264618, + "grad_norm": 0.11483721435070038, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 39830 + }, + { + "epoch": 0.15164087300076887, + "grad_norm": 0.11665428429841995, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 39840 + }, + { + "epoch": 0.15167893546889155, + "grad_norm": 0.11376866698265076, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 39850 + }, + { + "epoch": 0.15171699793701424, + "grad_norm": 0.12335699051618576, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 39860 + }, + { + "epoch": 0.15175506040513692, + "grad_norm": 0.13599348068237305, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 39870 + }, + { + "epoch": 0.1517931228732596, + "grad_norm": 0.11808918416500092, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 39880 + }, + { + "epoch": 0.15183118534138226, + "grad_norm": 0.1326199173927307, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 39890 + }, + { + "epoch": 0.15186924780950495, + "grad_norm": 0.1278129518032074, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 39900 + }, + { + "epoch": 0.15190731027762763, + "grad_norm": 0.12414394319057465, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 39910 + }, + { + "epoch": 0.15194537274575032, + "grad_norm": 0.14067183434963226, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 39920 + }, + { + "epoch": 0.151983435213873, + "grad_norm": 0.13897956907749176, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 39930 + }, + { + "epoch": 0.1520214976819957, + "grad_norm": 0.12281318753957748, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 39940 + }, + { + "epoch": 0.15205956015011837, + "grad_norm": 0.11802121996879578, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 39950 + }, + { + "epoch": 0.15209762261824106, + "grad_norm": 0.14475691318511963, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 39960 + }, + { + "epoch": 0.15213568508636374, + "grad_norm": 0.1253284513950348, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 39970 + }, + { + "epoch": 0.15217374755448643, + "grad_norm": 0.12120725214481354, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 39980 + }, + { + "epoch": 0.1522118100226091, + "grad_norm": 0.12783914804458618, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 39990 + }, + { + "epoch": 0.1522498724907318, + "grad_norm": 0.11306191235780716, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 40000 + }, + { + "epoch": 0.15228793495885448, + "grad_norm": 0.12707388401031494, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 40010 + }, + { + "epoch": 0.15232599742697717, + "grad_norm": 0.11756369471549988, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 40020 + }, + { + "epoch": 0.15236405989509985, + "grad_norm": 0.141278937458992, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 40030 + }, + { + "epoch": 0.1524021223632225, + "grad_norm": 0.12917031347751617, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 40040 + }, + { + "epoch": 0.1524401848313452, + "grad_norm": 0.1284797489643097, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 40050 + }, + { + "epoch": 0.15247824729946788, + "grad_norm": 0.1224033460021019, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 40060 + }, + { + "epoch": 0.15251630976759056, + "grad_norm": 0.12608686089515686, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 40070 + }, + { + "epoch": 0.15255437223571325, + "grad_norm": 0.15755626559257507, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 40080 + }, + { + "epoch": 0.15259243470383593, + "grad_norm": 0.1173790842294693, + "learning_rate": 0.0005, + "loss": 2.1602, + "step": 40090 + }, + { + "epoch": 0.15263049717195862, + "grad_norm": 0.12344210594892502, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 40100 + }, + { + "epoch": 0.1526685596400813, + "grad_norm": 0.12379170209169388, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 40110 + }, + { + "epoch": 0.152706622108204, + "grad_norm": 0.13954222202301025, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 40120 + }, + { + "epoch": 0.15274468457632667, + "grad_norm": 0.12113303691148758, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 40130 + }, + { + "epoch": 0.15278274704444936, + "grad_norm": 0.12871098518371582, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 40140 + }, + { + "epoch": 0.15282080951257204, + "grad_norm": 0.12698256969451904, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 40150 + }, + { + "epoch": 0.15285887198069473, + "grad_norm": 0.12224501371383667, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 40160 + }, + { + "epoch": 0.1528969344488174, + "grad_norm": 0.1328393965959549, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 40170 + }, + { + "epoch": 0.15293499691694007, + "grad_norm": 0.13421137630939484, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 40180 + }, + { + "epoch": 0.15297305938506275, + "grad_norm": 0.13883419334888458, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 40190 + }, + { + "epoch": 0.15301112185318544, + "grad_norm": 0.13581129908561707, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 40200 + }, + { + "epoch": 0.15304918432130812, + "grad_norm": 0.11238997429609299, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 40210 + }, + { + "epoch": 0.1530872467894308, + "grad_norm": 0.13544441759586334, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 40220 + }, + { + "epoch": 0.1531253092575535, + "grad_norm": 0.11856289207935333, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 40230 + }, + { + "epoch": 0.15316337172567618, + "grad_norm": 0.12014134973287582, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 40240 + }, + { + "epoch": 0.15320143419379886, + "grad_norm": 0.11337390542030334, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 40250 + }, + { + "epoch": 0.15323949666192155, + "grad_norm": 0.12547358870506287, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 40260 + }, + { + "epoch": 0.15327755913004423, + "grad_norm": 0.12301641702651978, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 40270 + }, + { + "epoch": 0.15331562159816692, + "grad_norm": 0.1294236034154892, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 40280 + }, + { + "epoch": 0.1533536840662896, + "grad_norm": 0.14758537709712982, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 40290 + }, + { + "epoch": 0.1533917465344123, + "grad_norm": 0.12281176447868347, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 40300 + }, + { + "epoch": 0.15342980900253497, + "grad_norm": 0.11867820471525192, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 40310 + }, + { + "epoch": 0.15346787147065763, + "grad_norm": 0.12238939106464386, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 40320 + }, + { + "epoch": 0.15350593393878031, + "grad_norm": 0.11539949476718903, + "learning_rate": 0.0005, + "loss": 2.1523, + "step": 40330 + }, + { + "epoch": 0.153543996406903, + "grad_norm": 0.1254558265209198, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 40340 + }, + { + "epoch": 0.15358205887502568, + "grad_norm": 0.15088531374931335, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 40350 + }, + { + "epoch": 0.15362012134314837, + "grad_norm": 0.13487495481967926, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 40360 + }, + { + "epoch": 0.15365818381127105, + "grad_norm": 0.136166051030159, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 40370 + }, + { + "epoch": 0.15369624627939374, + "grad_norm": 0.11267966777086258, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 40380 + }, + { + "epoch": 0.15373430874751642, + "grad_norm": 0.11552360653877258, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 40390 + }, + { + "epoch": 0.1537723712156391, + "grad_norm": 0.11731477081775665, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 40400 + }, + { + "epoch": 0.1538104336837618, + "grad_norm": 0.12443472445011139, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 40410 + }, + { + "epoch": 0.15384849615188448, + "grad_norm": 0.11814551055431366, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 40420 + }, + { + "epoch": 0.15388655862000716, + "grad_norm": 0.13402269780635834, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 40430 + }, + { + "epoch": 0.15392462108812985, + "grad_norm": 0.11917763948440552, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 40440 + }, + { + "epoch": 0.15396268355625253, + "grad_norm": 0.12332938611507416, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 40450 + }, + { + "epoch": 0.15400074602437522, + "grad_norm": 0.12632392346858978, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 40460 + }, + { + "epoch": 0.15403880849249788, + "grad_norm": 0.12391570955514908, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 40470 + }, + { + "epoch": 0.15407687096062056, + "grad_norm": 0.11539748311042786, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 40480 + }, + { + "epoch": 0.15411493342874324, + "grad_norm": 0.1276472806930542, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 40490 + }, + { + "epoch": 0.15415299589686593, + "grad_norm": 0.13653264939785004, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 40500 + }, + { + "epoch": 0.15419105836498861, + "grad_norm": 0.12682753801345825, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 40510 + }, + { + "epoch": 0.1542291208331113, + "grad_norm": 0.11323153972625732, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 40520 + }, + { + "epoch": 0.15426718330123398, + "grad_norm": 0.13077697157859802, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 40530 + }, + { + "epoch": 0.15430524576935667, + "grad_norm": 0.1277543604373932, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 40540 + }, + { + "epoch": 0.15434330823747935, + "grad_norm": 0.12148383259773254, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 40550 + }, + { + "epoch": 0.15438137070560204, + "grad_norm": 0.12244511395692825, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 40560 + }, + { + "epoch": 0.15441943317372472, + "grad_norm": 0.12497160583734512, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 40570 + }, + { + "epoch": 0.1544574956418474, + "grad_norm": 0.1199488416314125, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 40580 + }, + { + "epoch": 0.1544955581099701, + "grad_norm": 0.13732865452766418, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 40590 + }, + { + "epoch": 0.15453362057809278, + "grad_norm": 0.14091861248016357, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 40600 + }, + { + "epoch": 0.15457168304621544, + "grad_norm": 0.1148281842470169, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 40610 + }, + { + "epoch": 0.15460974551433812, + "grad_norm": 0.12657825648784637, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 40620 + }, + { + "epoch": 0.1546478079824608, + "grad_norm": 0.11575401574373245, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 40630 + }, + { + "epoch": 0.1546858704505835, + "grad_norm": 0.13155962526798248, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 40640 + }, + { + "epoch": 0.15472393291870618, + "grad_norm": 0.13908006250858307, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 40650 + }, + { + "epoch": 0.15476199538682886, + "grad_norm": 0.12285695225000381, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 40660 + }, + { + "epoch": 0.15480005785495154, + "grad_norm": 0.11925322562456131, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 40670 + }, + { + "epoch": 0.15483812032307423, + "grad_norm": 0.12518733739852905, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 40680 + }, + { + "epoch": 0.15487618279119691, + "grad_norm": 0.11846065521240234, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 40690 + }, + { + "epoch": 0.1549142452593196, + "grad_norm": 0.13635188341140747, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 40700 + }, + { + "epoch": 0.15495230772744228, + "grad_norm": 0.11662866175174713, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 40710 + }, + { + "epoch": 0.15499037019556497, + "grad_norm": 0.13562005758285522, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 40720 + }, + { + "epoch": 0.15502843266368765, + "grad_norm": 0.12767818570137024, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 40730 + }, + { + "epoch": 0.15506649513181034, + "grad_norm": 0.13494707643985748, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 40740 + }, + { + "epoch": 0.155104557599933, + "grad_norm": 0.13277818262577057, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 40750 + }, + { + "epoch": 0.15514262006805568, + "grad_norm": 0.7971560955047607, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 40760 + }, + { + "epoch": 0.15518068253617837, + "grad_norm": 0.12246012687683105, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 40770 + }, + { + "epoch": 0.15521874500430105, + "grad_norm": 0.12156087160110474, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 40780 + }, + { + "epoch": 0.15525680747242374, + "grad_norm": 0.12802185118198395, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 40790 + }, + { + "epoch": 0.15529486994054642, + "grad_norm": 0.11559620499610901, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 40800 + }, + { + "epoch": 0.1553329324086691, + "grad_norm": 0.12793463468551636, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 40810 + }, + { + "epoch": 0.1553709948767918, + "grad_norm": 0.1196453645825386, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 40820 + }, + { + "epoch": 0.15540905734491448, + "grad_norm": 0.13778267800807953, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 40830 + }, + { + "epoch": 0.15544711981303716, + "grad_norm": 0.126423642039299, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 40840 + }, + { + "epoch": 0.15548518228115984, + "grad_norm": 0.12409335374832153, + "learning_rate": 0.0005, + "loss": 2.1521, + "step": 40850 + }, + { + "epoch": 0.15552324474928253, + "grad_norm": 0.1275656372308731, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 40860 + }, + { + "epoch": 0.15556130721740521, + "grad_norm": 0.12488967180252075, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 40870 + }, + { + "epoch": 0.1555993696855279, + "grad_norm": 0.1289723962545395, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 40880 + }, + { + "epoch": 0.15563743215365058, + "grad_norm": 0.12757979333400726, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 40890 + }, + { + "epoch": 0.15567549462177324, + "grad_norm": 0.11737728863954544, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 40900 + }, + { + "epoch": 0.15571355708989593, + "grad_norm": 0.13236309587955475, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 40910 + }, + { + "epoch": 0.1557516195580186, + "grad_norm": 0.12473408877849579, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 40920 + }, + { + "epoch": 0.1557896820261413, + "grad_norm": 0.12784703075885773, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 40930 + }, + { + "epoch": 0.15582774449426398, + "grad_norm": 0.11651159822940826, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 40940 + }, + { + "epoch": 0.15586580696238667, + "grad_norm": 0.13728098571300507, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 40950 + }, + { + "epoch": 0.15590386943050935, + "grad_norm": 0.13878300786018372, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 40960 + }, + { + "epoch": 0.15594193189863204, + "grad_norm": 0.12241476029157639, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 40970 + }, + { + "epoch": 0.15597999436675472, + "grad_norm": 0.13079239428043365, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 40980 + }, + { + "epoch": 0.1560180568348774, + "grad_norm": 0.1226423978805542, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 40990 + }, + { + "epoch": 0.1560561193030001, + "grad_norm": 0.12032759934663773, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 41000 + }, + { + "epoch": 0.15609418177112278, + "grad_norm": 0.12876206636428833, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 41010 + }, + { + "epoch": 0.15613224423924546, + "grad_norm": 0.12257958203554153, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 41020 + }, + { + "epoch": 0.15617030670736815, + "grad_norm": 0.12270672619342804, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 41030 + }, + { + "epoch": 0.1562083691754908, + "grad_norm": 0.11476442217826843, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 41040 + }, + { + "epoch": 0.1562464316436135, + "grad_norm": 0.11680492013692856, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 41050 + }, + { + "epoch": 0.15628449411173617, + "grad_norm": 0.13939838111400604, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 41060 + }, + { + "epoch": 0.15632255657985886, + "grad_norm": 0.13249801099300385, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 41070 + }, + { + "epoch": 0.15636061904798154, + "grad_norm": 0.12342536449432373, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 41080 + }, + { + "epoch": 0.15639868151610423, + "grad_norm": 0.11840111017227173, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 41090 + }, + { + "epoch": 0.1564367439842269, + "grad_norm": 0.12191004306077957, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 41100 + }, + { + "epoch": 0.1564748064523496, + "grad_norm": 0.1340741664171219, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 41110 + }, + { + "epoch": 0.15651286892047228, + "grad_norm": 0.12074489146471024, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 41120 + }, + { + "epoch": 0.15655093138859497, + "grad_norm": 0.12547871470451355, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 41130 + }, + { + "epoch": 0.15658899385671765, + "grad_norm": 0.12773819267749786, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 41140 + }, + { + "epoch": 0.15662705632484034, + "grad_norm": 0.12264664471149445, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 41150 + }, + { + "epoch": 0.15666511879296302, + "grad_norm": 0.12241547554731369, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 41160 + }, + { + "epoch": 0.1567031812610857, + "grad_norm": 0.12571392953395844, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 41170 + }, + { + "epoch": 0.1567412437292084, + "grad_norm": 0.11955336481332779, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 41180 + }, + { + "epoch": 0.15677930619733105, + "grad_norm": 0.13170619308948517, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 41190 + }, + { + "epoch": 0.15681736866545373, + "grad_norm": 0.12544187903404236, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 41200 + }, + { + "epoch": 0.15685543113357642, + "grad_norm": 0.11725877970457077, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 41210 + }, + { + "epoch": 0.1568934936016991, + "grad_norm": 0.11695241928100586, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 41220 + }, + { + "epoch": 0.1569315560698218, + "grad_norm": 0.12395942211151123, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 41230 + }, + { + "epoch": 0.15696961853794447, + "grad_norm": 0.12017051875591278, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 41240 + }, + { + "epoch": 0.15700768100606716, + "grad_norm": 0.11665423214435577, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 41250 + }, + { + "epoch": 0.15704574347418984, + "grad_norm": 0.11894332617521286, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 41260 + }, + { + "epoch": 0.15708380594231253, + "grad_norm": 0.1246785819530487, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 41270 + }, + { + "epoch": 0.1571218684104352, + "grad_norm": 0.12720352411270142, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 41280 + }, + { + "epoch": 0.1571599308785579, + "grad_norm": 0.13201768696308136, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 41290 + }, + { + "epoch": 0.15719799334668058, + "grad_norm": 0.11735563725233078, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 41300 + }, + { + "epoch": 0.15723605581480327, + "grad_norm": 0.13252070546150208, + "learning_rate": 0.0005, + "loss": 2.1549, + "step": 41310 + }, + { + "epoch": 0.15727411828292595, + "grad_norm": 0.1171932965517044, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 41320 + }, + { + "epoch": 0.1573121807510486, + "grad_norm": 0.12152529507875443, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 41330 + }, + { + "epoch": 0.1573502432191713, + "grad_norm": 0.14090144634246826, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 41340 + }, + { + "epoch": 0.15738830568729398, + "grad_norm": 0.13475215435028076, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 41350 + }, + { + "epoch": 0.15742636815541666, + "grad_norm": 0.11995682120323181, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 41360 + }, + { + "epoch": 0.15746443062353935, + "grad_norm": 0.13988709449768066, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 41370 + }, + { + "epoch": 0.15750249309166203, + "grad_norm": 0.12187988311052322, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 41380 + }, + { + "epoch": 0.15754055555978472, + "grad_norm": 0.1212259903550148, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 41390 + }, + { + "epoch": 0.1575786180279074, + "grad_norm": 0.12687280774116516, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 41400 + }, + { + "epoch": 0.1576166804960301, + "grad_norm": 0.1433592289686203, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 41410 + }, + { + "epoch": 0.15765474296415277, + "grad_norm": 0.13183125853538513, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 41420 + }, + { + "epoch": 0.15769280543227546, + "grad_norm": 0.12849292159080505, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 41430 + }, + { + "epoch": 0.15773086790039814, + "grad_norm": 0.11942418664693832, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 41440 + }, + { + "epoch": 0.15776893036852083, + "grad_norm": 0.12183071672916412, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 41450 + }, + { + "epoch": 0.1578069928366435, + "grad_norm": 0.12043503671884537, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 41460 + }, + { + "epoch": 0.15784505530476617, + "grad_norm": 0.10971025377511978, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 41470 + }, + { + "epoch": 0.15788311777288885, + "grad_norm": 0.11589441448450089, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 41480 + }, + { + "epoch": 0.15792118024101154, + "grad_norm": 0.12625034153461456, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 41490 + }, + { + "epoch": 0.15795924270913422, + "grad_norm": 0.1306608021259308, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 41500 + }, + { + "epoch": 0.1579973051772569, + "grad_norm": 0.11992191523313522, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 41510 + }, + { + "epoch": 0.1580353676453796, + "grad_norm": 0.14196783304214478, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 41520 + }, + { + "epoch": 0.15807343011350228, + "grad_norm": 0.11304987221956253, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 41530 + }, + { + "epoch": 0.15811149258162496, + "grad_norm": 0.12227694690227509, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 41540 + }, + { + "epoch": 0.15814955504974765, + "grad_norm": 0.1242867261171341, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 41550 + }, + { + "epoch": 0.15818761751787033, + "grad_norm": 0.13180440664291382, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 41560 + }, + { + "epoch": 0.15822567998599302, + "grad_norm": 0.1172434613108635, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 41570 + }, + { + "epoch": 0.1582637424541157, + "grad_norm": 0.12316454201936722, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 41580 + }, + { + "epoch": 0.1583018049222384, + "grad_norm": 0.1225380226969719, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 41590 + }, + { + "epoch": 0.15833986739036107, + "grad_norm": 0.12523019313812256, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 41600 + }, + { + "epoch": 0.15837792985848376, + "grad_norm": 0.14092890918254852, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 41610 + }, + { + "epoch": 0.15841599232660641, + "grad_norm": 0.1391395926475525, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 41620 + }, + { + "epoch": 0.1584540547947291, + "grad_norm": 0.12023383378982544, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 41630 + }, + { + "epoch": 0.15849211726285178, + "grad_norm": 0.13288703560829163, + "learning_rate": 0.0005, + "loss": 2.1679, + "step": 41640 + }, + { + "epoch": 0.15853017973097447, + "grad_norm": 0.12219773977994919, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 41650 + }, + { + "epoch": 0.15856824219909715, + "grad_norm": 0.11769925057888031, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 41660 + }, + { + "epoch": 0.15860630466721984, + "grad_norm": 0.11972153186798096, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 41670 + }, + { + "epoch": 0.15864436713534252, + "grad_norm": 0.12952840328216553, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 41680 + }, + { + "epoch": 0.1586824296034652, + "grad_norm": 0.11720789968967438, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 41690 + }, + { + "epoch": 0.1587204920715879, + "grad_norm": 0.11736951768398285, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 41700 + }, + { + "epoch": 0.15875855453971058, + "grad_norm": 0.10920219868421555, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 41710 + }, + { + "epoch": 0.15879661700783326, + "grad_norm": 0.12645253539085388, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 41720 + }, + { + "epoch": 0.15883467947595595, + "grad_norm": 0.1483883261680603, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 41730 + }, + { + "epoch": 0.15887274194407863, + "grad_norm": 0.12281257659196854, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 41740 + }, + { + "epoch": 0.15891080441220132, + "grad_norm": 0.12573546171188354, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 41750 + }, + { + "epoch": 0.15894886688032397, + "grad_norm": 0.13053619861602783, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 41760 + }, + { + "epoch": 0.15898692934844666, + "grad_norm": 0.12806281447410583, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 41770 + }, + { + "epoch": 0.15902499181656934, + "grad_norm": 0.13922947645187378, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 41780 + }, + { + "epoch": 0.15906305428469203, + "grad_norm": 0.12998144328594208, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 41790 + }, + { + "epoch": 0.15910111675281471, + "grad_norm": 0.12506332993507385, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 41800 + }, + { + "epoch": 0.1591391792209374, + "grad_norm": 0.1313193440437317, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 41810 + }, + { + "epoch": 0.15917724168906008, + "grad_norm": 0.12198295444250107, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 41820 + }, + { + "epoch": 0.15921530415718277, + "grad_norm": 0.11847379803657532, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 41830 + }, + { + "epoch": 0.15925336662530545, + "grad_norm": 0.14477434754371643, + "learning_rate": 0.0005, + "loss": 2.1511, + "step": 41840 + }, + { + "epoch": 0.15929142909342814, + "grad_norm": 0.12056008726358414, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 41850 + }, + { + "epoch": 0.15932949156155082, + "grad_norm": 0.11610439419746399, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 41860 + }, + { + "epoch": 0.1593675540296735, + "grad_norm": 0.1233954057097435, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 41870 + }, + { + "epoch": 0.1594056164977962, + "grad_norm": 0.11605192720890045, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 41880 + }, + { + "epoch": 0.15944367896591888, + "grad_norm": 0.13223831355571747, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 41890 + }, + { + "epoch": 0.15948174143404154, + "grad_norm": 0.12090417742729187, + "learning_rate": 0.0005, + "loss": 2.1552, + "step": 41900 + }, + { + "epoch": 0.15951980390216422, + "grad_norm": 0.1277896612882614, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 41910 + }, + { + "epoch": 0.1595578663702869, + "grad_norm": 0.12657415866851807, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 41920 + }, + { + "epoch": 0.1595959288384096, + "grad_norm": 0.11490483582019806, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 41930 + }, + { + "epoch": 0.15963399130653227, + "grad_norm": 0.1302410364151001, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 41940 + }, + { + "epoch": 0.15967205377465496, + "grad_norm": 0.12107552587985992, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 41950 + }, + { + "epoch": 0.15971011624277764, + "grad_norm": 0.1330546736717224, + "learning_rate": 0.0005, + "loss": 2.1614, + "step": 41960 + }, + { + "epoch": 0.15974817871090033, + "grad_norm": 0.130593940615654, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 41970 + }, + { + "epoch": 0.15978624117902301, + "grad_norm": 0.11943244934082031, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 41980 + }, + { + "epoch": 0.1598243036471457, + "grad_norm": 0.12528935074806213, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 41990 + }, + { + "epoch": 0.15986236611526838, + "grad_norm": 0.11964350938796997, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 42000 + }, + { + "epoch": 0.15990042858339107, + "grad_norm": 0.1630644053220749, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 42010 + }, + { + "epoch": 0.15993849105151375, + "grad_norm": 0.13449083268642426, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 42020 + }, + { + "epoch": 0.15997655351963644, + "grad_norm": 0.13047908246517181, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 42030 + }, + { + "epoch": 0.16001461598775912, + "grad_norm": 0.14681918919086456, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 42040 + }, + { + "epoch": 0.16005267845588178, + "grad_norm": 0.11368191987276077, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 42050 + }, + { + "epoch": 0.16009074092400447, + "grad_norm": 0.11434902995824814, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 42060 + }, + { + "epoch": 0.16012880339212715, + "grad_norm": 0.11909456551074982, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 42070 + }, + { + "epoch": 0.16016686586024984, + "grad_norm": 0.11459674686193466, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 42080 + }, + { + "epoch": 0.16020492832837252, + "grad_norm": 0.11956145614385605, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 42090 + }, + { + "epoch": 0.1602429907964952, + "grad_norm": 0.12218481302261353, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 42100 + }, + { + "epoch": 0.1602810532646179, + "grad_norm": 0.1367029994726181, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 42110 + }, + { + "epoch": 0.16031911573274057, + "grad_norm": 0.13083545863628387, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 42120 + }, + { + "epoch": 0.16035717820086326, + "grad_norm": 0.14804820716381073, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 42130 + }, + { + "epoch": 0.16039524066898594, + "grad_norm": 0.12293334305286407, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 42140 + }, + { + "epoch": 0.16043330313710863, + "grad_norm": 0.12244443595409393, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 42150 + }, + { + "epoch": 0.16047136560523131, + "grad_norm": 0.11816592514514923, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 42160 + }, + { + "epoch": 0.160509428073354, + "grad_norm": 0.12204094231128693, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 42170 + }, + { + "epoch": 0.16054749054147668, + "grad_norm": 0.1250113993883133, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 42180 + }, + { + "epoch": 0.16058555300959934, + "grad_norm": 0.1222028136253357, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 42190 + }, + { + "epoch": 0.16062361547772203, + "grad_norm": 0.1175677701830864, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 42200 + }, + { + "epoch": 0.1606616779458447, + "grad_norm": 0.11348593980073929, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 42210 + }, + { + "epoch": 0.1606997404139674, + "grad_norm": 0.1285901814699173, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 42220 + }, + { + "epoch": 0.16073780288209008, + "grad_norm": 0.12811289727687836, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 42230 + }, + { + "epoch": 0.16077586535021277, + "grad_norm": 0.12517668306827545, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 42240 + }, + { + "epoch": 0.16081392781833545, + "grad_norm": 0.12036725878715515, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 42250 + }, + { + "epoch": 0.16085199028645814, + "grad_norm": 0.14227166771888733, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 42260 + }, + { + "epoch": 0.16089005275458082, + "grad_norm": 0.12093579024076462, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 42270 + }, + { + "epoch": 0.1609281152227035, + "grad_norm": 0.13048814237117767, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 42280 + }, + { + "epoch": 0.1609661776908262, + "grad_norm": 0.12342751026153564, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 42290 + }, + { + "epoch": 0.16100424015894887, + "grad_norm": 0.12928670644760132, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 42300 + }, + { + "epoch": 0.16104230262707156, + "grad_norm": 0.1347813606262207, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 42310 + }, + { + "epoch": 0.16108036509519424, + "grad_norm": 0.13783428072929382, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 42320 + }, + { + "epoch": 0.16111842756331693, + "grad_norm": 0.12047230452299118, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 42330 + }, + { + "epoch": 0.1611564900314396, + "grad_norm": 0.13066697120666504, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 42340 + }, + { + "epoch": 0.16119455249956227, + "grad_norm": 0.12590228021144867, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 42350 + }, + { + "epoch": 0.16123261496768496, + "grad_norm": 0.13206040859222412, + "learning_rate": 0.0005, + "loss": 2.1579, + "step": 42360 + }, + { + "epoch": 0.16127067743580764, + "grad_norm": 0.12833815813064575, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 42370 + }, + { + "epoch": 0.16130873990393033, + "grad_norm": 0.12771695852279663, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 42380 + }, + { + "epoch": 0.161346802372053, + "grad_norm": 0.12943068146705627, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 42390 + }, + { + "epoch": 0.1613848648401757, + "grad_norm": 0.12026971578598022, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 42400 + }, + { + "epoch": 0.16142292730829838, + "grad_norm": 0.11510438472032547, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 42410 + }, + { + "epoch": 0.16146098977642107, + "grad_norm": 0.13893866539001465, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 42420 + }, + { + "epoch": 0.16149905224454375, + "grad_norm": 0.11227071285247803, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 42430 + }, + { + "epoch": 0.16153711471266644, + "grad_norm": 0.126421719789505, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 42440 + }, + { + "epoch": 0.16157517718078912, + "grad_norm": 0.12309280037879944, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 42450 + }, + { + "epoch": 0.1616132396489118, + "grad_norm": 0.12749263644218445, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 42460 + }, + { + "epoch": 0.1616513021170345, + "grad_norm": 0.122982457280159, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 42470 + }, + { + "epoch": 0.16168936458515715, + "grad_norm": 0.12029185891151428, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 42480 + }, + { + "epoch": 0.16172742705327983, + "grad_norm": 0.12866486608982086, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 42490 + }, + { + "epoch": 0.16176548952140252, + "grad_norm": 0.12323121726512909, + "learning_rate": 0.0005, + "loss": 2.1539, + "step": 42500 + }, + { + "epoch": 0.1618035519895252, + "grad_norm": 0.12656740844249725, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 42510 + }, + { + "epoch": 0.1618416144576479, + "grad_norm": 0.13577412068843842, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 42520 + }, + { + "epoch": 0.16187967692577057, + "grad_norm": 0.12688353657722473, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 42530 + }, + { + "epoch": 0.16191773939389326, + "grad_norm": 0.1538940966129303, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 42540 + }, + { + "epoch": 0.16195580186201594, + "grad_norm": 0.12012957781553268, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 42550 + }, + { + "epoch": 0.16199386433013863, + "grad_norm": 0.11320848762989044, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 42560 + }, + { + "epoch": 0.1620319267982613, + "grad_norm": 0.13399673998355865, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 42570 + }, + { + "epoch": 0.162069989266384, + "grad_norm": 0.12073542922735214, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 42580 + }, + { + "epoch": 0.16210805173450668, + "grad_norm": 0.13058961927890778, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 42590 + }, + { + "epoch": 0.16214611420262937, + "grad_norm": 0.12318974733352661, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 42600 + }, + { + "epoch": 0.16218417667075205, + "grad_norm": 0.12237012386322021, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 42610 + }, + { + "epoch": 0.1622222391388747, + "grad_norm": 0.124078668653965, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 42620 + }, + { + "epoch": 0.1622603016069974, + "grad_norm": 0.12561817467212677, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 42630 + }, + { + "epoch": 0.16229836407512008, + "grad_norm": 0.12398776412010193, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 42640 + }, + { + "epoch": 0.16233642654324276, + "grad_norm": 0.12158331274986267, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 42650 + }, + { + "epoch": 0.16237448901136545, + "grad_norm": 0.11844036728143692, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 42660 + }, + { + "epoch": 0.16241255147948813, + "grad_norm": 0.13241811096668243, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 42670 + }, + { + "epoch": 0.16245061394761082, + "grad_norm": 0.12555848062038422, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 42680 + }, + { + "epoch": 0.1624886764157335, + "grad_norm": 0.12289304286241531, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 42690 + }, + { + "epoch": 0.1625267388838562, + "grad_norm": 0.11282894760370255, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 42700 + }, + { + "epoch": 0.16256480135197887, + "grad_norm": 0.12983134388923645, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 42710 + }, + { + "epoch": 0.16260286382010156, + "grad_norm": 0.1355329006910324, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 42720 + }, + { + "epoch": 0.16264092628822424, + "grad_norm": 0.11260616779327393, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 42730 + }, + { + "epoch": 0.16267898875634693, + "grad_norm": 0.12481415271759033, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 42740 + }, + { + "epoch": 0.1627170512244696, + "grad_norm": 0.14663955569267273, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 42750 + }, + { + "epoch": 0.1627551136925923, + "grad_norm": 0.1387481838464737, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 42760 + }, + { + "epoch": 0.16279317616071495, + "grad_norm": 0.12403888255357742, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 42770 + }, + { + "epoch": 0.16283123862883764, + "grad_norm": 0.1486683040857315, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 42780 + }, + { + "epoch": 0.16286930109696032, + "grad_norm": 0.14229144155979156, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 42790 + }, + { + "epoch": 0.162907363565083, + "grad_norm": 0.1370912492275238, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 42800 + }, + { + "epoch": 0.1629454260332057, + "grad_norm": 0.11742990463972092, + "learning_rate": 0.0005, + "loss": 2.1507, + "step": 42810 + }, + { + "epoch": 0.16298348850132838, + "grad_norm": 0.13674955070018768, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 42820 + }, + { + "epoch": 0.16302155096945106, + "grad_norm": 0.11049659550189972, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 42830 + }, + { + "epoch": 0.16305961343757375, + "grad_norm": 0.11355409026145935, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 42840 + }, + { + "epoch": 0.16309767590569643, + "grad_norm": 0.1293742060661316, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 42850 + }, + { + "epoch": 0.16313573837381912, + "grad_norm": 0.12040603160858154, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 42860 + }, + { + "epoch": 0.1631738008419418, + "grad_norm": 0.11985547095537186, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 42870 + }, + { + "epoch": 0.1632118633100645, + "grad_norm": 0.11281248182058334, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 42880 + }, + { + "epoch": 0.16324992577818717, + "grad_norm": 0.12250163406133652, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 42890 + }, + { + "epoch": 0.16328798824630986, + "grad_norm": 0.13779056072235107, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 42900 + }, + { + "epoch": 0.1633260507144325, + "grad_norm": 0.12871775031089783, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 42910 + }, + { + "epoch": 0.1633641131825552, + "grad_norm": 0.11977288126945496, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 42920 + }, + { + "epoch": 0.16340217565067788, + "grad_norm": 0.12549398839473724, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 42930 + }, + { + "epoch": 0.16344023811880057, + "grad_norm": 0.11586546152830124, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 42940 + }, + { + "epoch": 0.16347830058692325, + "grad_norm": 0.11639491468667984, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 42950 + }, + { + "epoch": 0.16351636305504594, + "grad_norm": 0.1248018816113472, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 42960 + }, + { + "epoch": 0.16355442552316862, + "grad_norm": 0.13171744346618652, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 42970 + }, + { + "epoch": 0.1635924879912913, + "grad_norm": 0.11659903079271317, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 42980 + }, + { + "epoch": 0.163630550459414, + "grad_norm": 0.12795613706111908, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 42990 + }, + { + "epoch": 0.16366861292753668, + "grad_norm": 0.11983273178339005, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 43000 + }, + { + "epoch": 0.16370667539565936, + "grad_norm": 0.11790401488542557, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 43010 + }, + { + "epoch": 0.16374473786378205, + "grad_norm": 0.13478805124759674, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 43020 + }, + { + "epoch": 0.16378280033190473, + "grad_norm": 0.13745801150798798, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 43030 + }, + { + "epoch": 0.16382086280002742, + "grad_norm": 0.13325530290603638, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 43040 + }, + { + "epoch": 0.16385892526815007, + "grad_norm": 0.11879530549049377, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 43050 + }, + { + "epoch": 0.16389698773627276, + "grad_norm": 0.12253537774085999, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 43060 + }, + { + "epoch": 0.16393505020439544, + "grad_norm": 0.1257277876138687, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 43070 + }, + { + "epoch": 0.16397311267251813, + "grad_norm": 0.12686146795749664, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 43080 + }, + { + "epoch": 0.1640111751406408, + "grad_norm": 0.12342148274183273, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 43090 + }, + { + "epoch": 0.1640492376087635, + "grad_norm": 0.12092763930559158, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 43100 + }, + { + "epoch": 0.16408730007688618, + "grad_norm": 0.1287047415971756, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 43110 + }, + { + "epoch": 0.16412536254500887, + "grad_norm": 0.13601644337177277, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 43120 + }, + { + "epoch": 0.16416342501313155, + "grad_norm": 0.12395923584699631, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 43130 + }, + { + "epoch": 0.16420148748125424, + "grad_norm": 0.14422424137592316, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 43140 + }, + { + "epoch": 0.16423954994937692, + "grad_norm": 0.12677833437919617, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 43150 + }, + { + "epoch": 0.1642776124174996, + "grad_norm": 0.12512822449207306, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 43160 + }, + { + "epoch": 0.1643156748856223, + "grad_norm": 0.13113650679588318, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 43170 + }, + { + "epoch": 0.16435373735374498, + "grad_norm": 0.13267137110233307, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 43180 + }, + { + "epoch": 0.16439179982186766, + "grad_norm": 0.12079484015703201, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 43190 + }, + { + "epoch": 0.16442986228999032, + "grad_norm": 0.11671856045722961, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 43200 + }, + { + "epoch": 0.164467924758113, + "grad_norm": 0.11519180983304977, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 43210 + }, + { + "epoch": 0.1645059872262357, + "grad_norm": 0.12597531080245972, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 43220 + }, + { + "epoch": 0.16454404969435837, + "grad_norm": 0.12310968339443207, + "learning_rate": 0.0005, + "loss": 2.1546, + "step": 43230 + }, + { + "epoch": 0.16458211216248106, + "grad_norm": 0.1263173371553421, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 43240 + }, + { + "epoch": 0.16462017463060374, + "grad_norm": 0.13074570894241333, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 43250 + }, + { + "epoch": 0.16465823709872643, + "grad_norm": 0.11949026584625244, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 43260 + }, + { + "epoch": 0.1646962995668491, + "grad_norm": 0.1144823208451271, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 43270 + }, + { + "epoch": 0.1647343620349718, + "grad_norm": 0.12179239839315414, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 43280 + }, + { + "epoch": 0.16477242450309448, + "grad_norm": 0.15474900603294373, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 43290 + }, + { + "epoch": 0.16481048697121717, + "grad_norm": 0.1269451230764389, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 43300 + }, + { + "epoch": 0.16484854943933985, + "grad_norm": 0.11439156532287598, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 43310 + }, + { + "epoch": 0.16488661190746254, + "grad_norm": 0.12010608613491058, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 43320 + }, + { + "epoch": 0.16492467437558522, + "grad_norm": 0.11821283400058746, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 43330 + }, + { + "epoch": 0.16496273684370788, + "grad_norm": 0.13478679955005646, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 43340 + }, + { + "epoch": 0.16500079931183056, + "grad_norm": 0.12389303743839264, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 43350 + }, + { + "epoch": 0.16503886177995325, + "grad_norm": 0.12443176656961441, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 43360 + }, + { + "epoch": 0.16507692424807593, + "grad_norm": 0.1438198685646057, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 43370 + }, + { + "epoch": 0.16511498671619862, + "grad_norm": 0.12440897524356842, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 43380 + }, + { + "epoch": 0.1651530491843213, + "grad_norm": 0.12202125042676926, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 43390 + }, + { + "epoch": 0.165191111652444, + "grad_norm": 0.11001090705394745, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 43400 + }, + { + "epoch": 0.16522917412056667, + "grad_norm": 0.13812753558158875, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 43410 + }, + { + "epoch": 0.16526723658868936, + "grad_norm": 0.1187279224395752, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 43420 + }, + { + "epoch": 0.16530529905681204, + "grad_norm": 0.11560482531785965, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 43430 + }, + { + "epoch": 0.16534336152493473, + "grad_norm": 0.13228921592235565, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 43440 + }, + { + "epoch": 0.1653814239930574, + "grad_norm": 0.1330287605524063, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 43450 + }, + { + "epoch": 0.1654194864611801, + "grad_norm": 0.13445092737674713, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 43460 + }, + { + "epoch": 0.16545754892930278, + "grad_norm": 0.11717677861452103, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 43470 + }, + { + "epoch": 0.16549561139742547, + "grad_norm": 0.14144597947597504, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 43480 + }, + { + "epoch": 0.16553367386554813, + "grad_norm": 0.12285357713699341, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 43490 + }, + { + "epoch": 0.1655717363336708, + "grad_norm": 0.15100809931755066, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 43500 + }, + { + "epoch": 0.1656097988017935, + "grad_norm": 0.12419594079256058, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 43510 + }, + { + "epoch": 0.16564786126991618, + "grad_norm": 0.12664470076560974, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 43520 + }, + { + "epoch": 0.16568592373803886, + "grad_norm": 0.11580910533666611, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 43530 + }, + { + "epoch": 0.16572398620616155, + "grad_norm": 0.12186925858259201, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 43540 + }, + { + "epoch": 0.16576204867428423, + "grad_norm": 0.10877561569213867, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 43550 + }, + { + "epoch": 0.16580011114240692, + "grad_norm": 0.13248339295387268, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 43560 + }, + { + "epoch": 0.1658381736105296, + "grad_norm": 0.14807666838169098, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 43570 + }, + { + "epoch": 0.1658762360786523, + "grad_norm": 0.12139953672885895, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 43580 + }, + { + "epoch": 0.16591429854677497, + "grad_norm": 0.12874317169189453, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 43590 + }, + { + "epoch": 0.16595236101489766, + "grad_norm": 0.1149357259273529, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 43600 + }, + { + "epoch": 0.16599042348302034, + "grad_norm": 0.12108978629112244, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 43610 + }, + { + "epoch": 0.16602848595114303, + "grad_norm": 0.12273205071687698, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 43620 + }, + { + "epoch": 0.16606654841926569, + "grad_norm": 0.11966592818498611, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 43630 + }, + { + "epoch": 0.16610461088738837, + "grad_norm": 0.11612257361412048, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 43640 + }, + { + "epoch": 0.16614267335551106, + "grad_norm": 0.12440352886915207, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 43650 + }, + { + "epoch": 0.16618073582363374, + "grad_norm": 0.11509761214256287, + "learning_rate": 0.0005, + "loss": 2.1577, + "step": 43660 + }, + { + "epoch": 0.16621879829175643, + "grad_norm": 0.11704332381486893, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 43670 + }, + { + "epoch": 0.1662568607598791, + "grad_norm": 0.12189175188541412, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 43680 + }, + { + "epoch": 0.1662949232280018, + "grad_norm": 0.11538931727409363, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 43690 + }, + { + "epoch": 0.16633298569612448, + "grad_norm": 0.12895295023918152, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 43700 + }, + { + "epoch": 0.16637104816424717, + "grad_norm": 0.11244889348745346, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 43710 + }, + { + "epoch": 0.16640911063236985, + "grad_norm": 0.13465918600559235, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 43720 + }, + { + "epoch": 0.16644717310049253, + "grad_norm": 0.13419680297374725, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 43730 + }, + { + "epoch": 0.16648523556861522, + "grad_norm": 0.12253239750862122, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 43740 + }, + { + "epoch": 0.1665232980367379, + "grad_norm": 0.1258445829153061, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 43750 + }, + { + "epoch": 0.1665613605048606, + "grad_norm": 0.1273617297410965, + "learning_rate": 0.0005, + "loss": 2.1495, + "step": 43760 + }, + { + "epoch": 0.16659942297298325, + "grad_norm": 0.13250529766082764, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 43770 + }, + { + "epoch": 0.16663748544110593, + "grad_norm": 0.1205471083521843, + "learning_rate": 0.0005, + "loss": 2.1519, + "step": 43780 + }, + { + "epoch": 0.16667554790922862, + "grad_norm": 0.11968290060758591, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 43790 + }, + { + "epoch": 0.1667136103773513, + "grad_norm": 0.137216255068779, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 43800 + }, + { + "epoch": 0.16675167284547399, + "grad_norm": 0.13451290130615234, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 43810 + }, + { + "epoch": 0.16678973531359667, + "grad_norm": 0.12728510797023773, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 43820 + }, + { + "epoch": 0.16682779778171936, + "grad_norm": 0.116569884121418, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 43830 + }, + { + "epoch": 0.16686586024984204, + "grad_norm": 0.11471624672412872, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 43840 + }, + { + "epoch": 0.16690392271796473, + "grad_norm": 0.12343640625476837, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 43850 + }, + { + "epoch": 0.1669419851860874, + "grad_norm": 0.11519220471382141, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 43860 + }, + { + "epoch": 0.1669800476542101, + "grad_norm": 0.1342388391494751, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 43870 + }, + { + "epoch": 0.16701811012233278, + "grad_norm": 0.12278634309768677, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 43880 + }, + { + "epoch": 0.16705617259045547, + "grad_norm": 0.13932234048843384, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 43890 + }, + { + "epoch": 0.16709423505857815, + "grad_norm": 0.12469235062599182, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 43900 + }, + { + "epoch": 0.16713229752670083, + "grad_norm": 0.12053560465574265, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 43910 + }, + { + "epoch": 0.1671703599948235, + "grad_norm": 0.12285034358501434, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 43920 + }, + { + "epoch": 0.16720842246294618, + "grad_norm": 0.12460073083639145, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 43930 + }, + { + "epoch": 0.16724648493106886, + "grad_norm": 0.11997386068105698, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 43940 + }, + { + "epoch": 0.16728454739919155, + "grad_norm": 0.1238400787115097, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 43950 + }, + { + "epoch": 0.16732260986731423, + "grad_norm": 0.11750750243663788, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 43960 + }, + { + "epoch": 0.16736067233543692, + "grad_norm": 0.1257324367761612, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 43970 + }, + { + "epoch": 0.1673987348035596, + "grad_norm": 0.11816301196813583, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 43980 + }, + { + "epoch": 0.16743679727168229, + "grad_norm": 0.1174677386879921, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 43990 + }, + { + "epoch": 0.16747485973980497, + "grad_norm": 0.11954568326473236, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 44000 + }, + { + "epoch": 0.16751292220792766, + "grad_norm": 0.12409286946058273, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 44010 + }, + { + "epoch": 0.16755098467605034, + "grad_norm": 0.1352391242980957, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 44020 + }, + { + "epoch": 0.16758904714417303, + "grad_norm": 0.1292494535446167, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 44030 + }, + { + "epoch": 0.1676271096122957, + "grad_norm": 0.13378708064556122, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 44040 + }, + { + "epoch": 0.1676651720804184, + "grad_norm": 0.1259700208902359, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 44050 + }, + { + "epoch": 0.16770323454854105, + "grad_norm": 0.12806585431098938, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 44060 + }, + { + "epoch": 0.16774129701666374, + "grad_norm": 0.11934912949800491, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 44070 + }, + { + "epoch": 0.16777935948478642, + "grad_norm": 0.1166028380393982, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 44080 + }, + { + "epoch": 0.1678174219529091, + "grad_norm": 0.11846883594989777, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 44090 + }, + { + "epoch": 0.1678554844210318, + "grad_norm": 0.11931490898132324, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 44100 + }, + { + "epoch": 0.16789354688915448, + "grad_norm": 0.11143594980239868, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 44110 + }, + { + "epoch": 0.16793160935727716, + "grad_norm": 0.11396325379610062, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 44120 + }, + { + "epoch": 0.16796967182539985, + "grad_norm": 0.1326463669538498, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 44130 + }, + { + "epoch": 0.16800773429352253, + "grad_norm": 0.13630157709121704, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 44140 + }, + { + "epoch": 0.16804579676164522, + "grad_norm": 0.11827398091554642, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 44150 + }, + { + "epoch": 0.1680838592297679, + "grad_norm": 0.12105869501829147, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 44160 + }, + { + "epoch": 0.1681219216978906, + "grad_norm": 0.13034142553806305, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 44170 + }, + { + "epoch": 0.16815998416601327, + "grad_norm": 0.1371888369321823, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 44180 + }, + { + "epoch": 0.16819804663413596, + "grad_norm": 0.11673656851053238, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 44190 + }, + { + "epoch": 0.1682361091022586, + "grad_norm": 0.1259164661169052, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 44200 + }, + { + "epoch": 0.1682741715703813, + "grad_norm": 0.12241323292255402, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 44210 + }, + { + "epoch": 0.16831223403850398, + "grad_norm": 0.12541623413562775, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 44220 + }, + { + "epoch": 0.16835029650662667, + "grad_norm": 0.12463592737913132, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 44230 + }, + { + "epoch": 0.16838835897474935, + "grad_norm": 0.11558257043361664, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 44240 + }, + { + "epoch": 0.16842642144287204, + "grad_norm": 0.12510859966278076, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 44250 + }, + { + "epoch": 0.16846448391099472, + "grad_norm": 0.12276134639978409, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 44260 + }, + { + "epoch": 0.1685025463791174, + "grad_norm": 0.1275489777326584, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 44270 + }, + { + "epoch": 0.1685406088472401, + "grad_norm": 0.12813347578048706, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 44280 + }, + { + "epoch": 0.16857867131536278, + "grad_norm": 0.1250738650560379, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 44290 + }, + { + "epoch": 0.16861673378348546, + "grad_norm": 0.11898188292980194, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 44300 + }, + { + "epoch": 0.16865479625160815, + "grad_norm": 0.14133501052856445, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 44310 + }, + { + "epoch": 0.16869285871973083, + "grad_norm": 0.12976321578025818, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 44320 + }, + { + "epoch": 0.16873092118785352, + "grad_norm": 0.12245601415634155, + "learning_rate": 0.0005, + "loss": 2.1542, + "step": 44330 + }, + { + "epoch": 0.1687689836559762, + "grad_norm": 0.12810446321964264, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 44340 + }, + { + "epoch": 0.16880704612409886, + "grad_norm": 0.16562506556510925, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 44350 + }, + { + "epoch": 0.16884510859222154, + "grad_norm": 0.12520432472229004, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 44360 + }, + { + "epoch": 0.16888317106034423, + "grad_norm": 0.14297902584075928, + "learning_rate": 0.0005, + "loss": 2.152, + "step": 44370 + }, + { + "epoch": 0.1689212335284669, + "grad_norm": 0.12294340133666992, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 44380 + }, + { + "epoch": 0.1689592959965896, + "grad_norm": 0.1253482699394226, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 44390 + }, + { + "epoch": 0.16899735846471228, + "grad_norm": 0.12899786233901978, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 44400 + }, + { + "epoch": 0.16903542093283497, + "grad_norm": 0.12162794172763824, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 44410 + }, + { + "epoch": 0.16907348340095765, + "grad_norm": 0.12179070711135864, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 44420 + }, + { + "epoch": 0.16911154586908034, + "grad_norm": 0.11872969567775726, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 44430 + }, + { + "epoch": 0.16914960833720302, + "grad_norm": 0.11570776998996735, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 44440 + }, + { + "epoch": 0.1691876708053257, + "grad_norm": 0.18721316754817963, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 44450 + }, + { + "epoch": 0.1692257332734484, + "grad_norm": 0.11968997120857239, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 44460 + }, + { + "epoch": 0.16926379574157108, + "grad_norm": 0.1322961300611496, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 44470 + }, + { + "epoch": 0.16930185820969376, + "grad_norm": 0.11864107102155685, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 44480 + }, + { + "epoch": 0.16933992067781642, + "grad_norm": 0.133161261677742, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 44490 + }, + { + "epoch": 0.1693779831459391, + "grad_norm": 0.12709060311317444, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 44500 + }, + { + "epoch": 0.1694160456140618, + "grad_norm": 0.127763032913208, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 44510 + }, + { + "epoch": 0.16945410808218447, + "grad_norm": 0.1264316290616989, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 44520 + }, + { + "epoch": 0.16949217055030716, + "grad_norm": 0.1268509477376938, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 44530 + }, + { + "epoch": 0.16953023301842984, + "grad_norm": 0.12328742444515228, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 44540 + }, + { + "epoch": 0.16956829548655253, + "grad_norm": 0.1354341059923172, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 44550 + }, + { + "epoch": 0.1696063579546752, + "grad_norm": 0.11687562614679337, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 44560 + }, + { + "epoch": 0.1696444204227979, + "grad_norm": 0.11326216906309128, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 44570 + }, + { + "epoch": 0.16968248289092058, + "grad_norm": 0.14775125682353973, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 44580 + }, + { + "epoch": 0.16972054535904327, + "grad_norm": 0.13913802802562714, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 44590 + }, + { + "epoch": 0.16975860782716595, + "grad_norm": 0.11591693758964539, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 44600 + }, + { + "epoch": 0.16979667029528864, + "grad_norm": 0.11765572428703308, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 44610 + }, + { + "epoch": 0.16983473276341132, + "grad_norm": 0.1285104751586914, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 44620 + }, + { + "epoch": 0.169872795231534, + "grad_norm": 0.13289599120616913, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 44630 + }, + { + "epoch": 0.16991085769965666, + "grad_norm": 0.11635494977235794, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 44640 + }, + { + "epoch": 0.16994892016777935, + "grad_norm": 0.12370641529560089, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 44650 + }, + { + "epoch": 0.16998698263590203, + "grad_norm": 0.11813714355230331, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 44660 + }, + { + "epoch": 0.17002504510402472, + "grad_norm": 0.1225036233663559, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 44670 + }, + { + "epoch": 0.1700631075721474, + "grad_norm": 0.12147258967161179, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 44680 + }, + { + "epoch": 0.1701011700402701, + "grad_norm": 0.12780791521072388, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 44690 + }, + { + "epoch": 0.17013923250839277, + "grad_norm": 0.12251202017068863, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 44700 + }, + { + "epoch": 0.17017729497651546, + "grad_norm": 0.12178204953670502, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 44710 + }, + { + "epoch": 0.17021535744463814, + "grad_norm": 0.10848259180784225, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 44720 + }, + { + "epoch": 0.17025341991276083, + "grad_norm": 0.1275780349969864, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 44730 + }, + { + "epoch": 0.1702914823808835, + "grad_norm": 0.12610189616680145, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 44740 + }, + { + "epoch": 0.1703295448490062, + "grad_norm": 0.15247602760791779, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 44750 + }, + { + "epoch": 0.17036760731712888, + "grad_norm": 0.1426989883184433, + "learning_rate": 0.0005, + "loss": 2.1514, + "step": 44760 + }, + { + "epoch": 0.17040566978525157, + "grad_norm": 0.13441897928714752, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 44770 + }, + { + "epoch": 0.17044373225337422, + "grad_norm": 0.12179271876811981, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 44780 + }, + { + "epoch": 0.1704817947214969, + "grad_norm": 0.11661059409379959, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 44790 + }, + { + "epoch": 0.1705198571896196, + "grad_norm": 0.12268161028623581, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 44800 + }, + { + "epoch": 0.17055791965774228, + "grad_norm": 0.1112898513674736, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 44810 + }, + { + "epoch": 0.17059598212586496, + "grad_norm": 0.1266551911830902, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 44820 + }, + { + "epoch": 0.17063404459398765, + "grad_norm": 0.13600681722164154, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 44830 + }, + { + "epoch": 0.17067210706211033, + "grad_norm": 0.12621985375881195, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 44840 + }, + { + "epoch": 0.17071016953023302, + "grad_norm": 0.1442611813545227, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 44850 + }, + { + "epoch": 0.1707482319983557, + "grad_norm": 0.12562938034534454, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 44860 + }, + { + "epoch": 0.1707862944664784, + "grad_norm": 0.14552602171897888, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 44870 + }, + { + "epoch": 0.17082435693460107, + "grad_norm": 0.12736286222934723, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 44880 + }, + { + "epoch": 0.17086241940272376, + "grad_norm": 0.13560928404331207, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 44890 + }, + { + "epoch": 0.17090048187084644, + "grad_norm": 0.13860104978084564, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 44900 + }, + { + "epoch": 0.17093854433896913, + "grad_norm": 0.11921971291303635, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 44910 + }, + { + "epoch": 0.17097660680709179, + "grad_norm": 0.13215811550617218, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 44920 + }, + { + "epoch": 0.17101466927521447, + "grad_norm": 0.1387583315372467, + "learning_rate": 0.0005, + "loss": 2.158, + "step": 44930 + }, + { + "epoch": 0.17105273174333716, + "grad_norm": 0.12246926128864288, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 44940 + }, + { + "epoch": 0.17109079421145984, + "grad_norm": 0.11858603358268738, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 44950 + }, + { + "epoch": 0.17112885667958253, + "grad_norm": 0.12365826219320297, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 44960 + }, + { + "epoch": 0.1711669191477052, + "grad_norm": 0.13618750870227814, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 44970 + }, + { + "epoch": 0.1712049816158279, + "grad_norm": 0.12615805864334106, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 44980 + }, + { + "epoch": 0.17124304408395058, + "grad_norm": 0.12646010518074036, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 44990 + }, + { + "epoch": 0.17128110655207326, + "grad_norm": 0.13197092711925507, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 45000 + }, + { + "epoch": 0.17131916902019595, + "grad_norm": 0.12390444427728653, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 45010 + }, + { + "epoch": 0.17135723148831863, + "grad_norm": 0.11571706086397171, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 45020 + }, + { + "epoch": 0.17139529395644132, + "grad_norm": 0.11823263764381409, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 45030 + }, + { + "epoch": 0.171433356424564, + "grad_norm": 0.13923102617263794, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 45040 + }, + { + "epoch": 0.1714714188926867, + "grad_norm": 0.12148989737033844, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 45050 + }, + { + "epoch": 0.17150948136080937, + "grad_norm": 0.11793144047260284, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 45060 + }, + { + "epoch": 0.17154754382893203, + "grad_norm": 0.1269650161266327, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 45070 + }, + { + "epoch": 0.17158560629705472, + "grad_norm": 0.12455953657627106, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 45080 + }, + { + "epoch": 0.1716236687651774, + "grad_norm": 0.1227991133928299, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 45090 + }, + { + "epoch": 0.17166173123330009, + "grad_norm": 0.12377261370420456, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 45100 + }, + { + "epoch": 0.17169979370142277, + "grad_norm": 0.12025581300258636, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 45110 + }, + { + "epoch": 0.17173785616954546, + "grad_norm": 0.12686417996883392, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 45120 + }, + { + "epoch": 0.17177591863766814, + "grad_norm": 0.12684020400047302, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 45130 + }, + { + "epoch": 0.17181398110579083, + "grad_norm": 0.12716318666934967, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 45140 + }, + { + "epoch": 0.1718520435739135, + "grad_norm": 0.11402035504579544, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 45150 + }, + { + "epoch": 0.1718901060420362, + "grad_norm": 0.11905350536108017, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 45160 + }, + { + "epoch": 0.17192816851015888, + "grad_norm": 0.12106286734342575, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 45170 + }, + { + "epoch": 0.17196623097828156, + "grad_norm": 0.12988130748271942, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 45180 + }, + { + "epoch": 0.17200429344640425, + "grad_norm": 0.13417784869670868, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 45190 + }, + { + "epoch": 0.17204235591452693, + "grad_norm": 0.1289902925491333, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 45200 + }, + { + "epoch": 0.1720804183826496, + "grad_norm": 0.1163710206747055, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 45210 + }, + { + "epoch": 0.17211848085077228, + "grad_norm": 0.15872104465961456, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 45220 + }, + { + "epoch": 0.17215654331889496, + "grad_norm": 0.11624854803085327, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 45230 + }, + { + "epoch": 0.17219460578701765, + "grad_norm": 0.11021952331066132, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 45240 + }, + { + "epoch": 0.17223266825514033, + "grad_norm": 0.11777684092521667, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 45250 + }, + { + "epoch": 0.17227073072326302, + "grad_norm": 0.1267765313386917, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 45260 + }, + { + "epoch": 0.1723087931913857, + "grad_norm": 0.12344861775636673, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 45270 + }, + { + "epoch": 0.17234685565950839, + "grad_norm": 0.1236143633723259, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 45280 + }, + { + "epoch": 0.17238491812763107, + "grad_norm": 0.12572093307971954, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 45290 + }, + { + "epoch": 0.17242298059575376, + "grad_norm": 0.13047395646572113, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 45300 + }, + { + "epoch": 0.17246104306387644, + "grad_norm": 0.12851989269256592, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 45310 + }, + { + "epoch": 0.17249910553199913, + "grad_norm": 0.1399923413991928, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 45320 + }, + { + "epoch": 0.1725371680001218, + "grad_norm": 0.12635745108127594, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 45330 + }, + { + "epoch": 0.1725752304682445, + "grad_norm": 0.1274142563343048, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 45340 + }, + { + "epoch": 0.17261329293636715, + "grad_norm": 0.13066533207893372, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 45350 + }, + { + "epoch": 0.17265135540448984, + "grad_norm": 0.11880338191986084, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 45360 + }, + { + "epoch": 0.17268941787261252, + "grad_norm": 0.11296599358320236, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 45370 + }, + { + "epoch": 0.1727274803407352, + "grad_norm": 0.11921131610870361, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 45380 + }, + { + "epoch": 0.1727655428088579, + "grad_norm": 0.12866875529289246, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 45390 + }, + { + "epoch": 0.17280360527698058, + "grad_norm": 0.12386047840118408, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 45400 + }, + { + "epoch": 0.17284166774510326, + "grad_norm": 0.1407991200685501, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 45410 + }, + { + "epoch": 0.17287973021322595, + "grad_norm": 0.11899222433567047, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 45420 + }, + { + "epoch": 0.17291779268134863, + "grad_norm": 0.13968820869922638, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 45430 + }, + { + "epoch": 0.17295585514947132, + "grad_norm": 0.12605208158493042, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 45440 + }, + { + "epoch": 0.172993917617594, + "grad_norm": 0.11998526006937027, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 45450 + }, + { + "epoch": 0.17303198008571669, + "grad_norm": 0.11852370202541351, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 45460 + }, + { + "epoch": 0.17307004255383937, + "grad_norm": 0.12640227377414703, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 45470 + }, + { + "epoch": 0.17310810502196206, + "grad_norm": 0.12370039522647858, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 45480 + }, + { + "epoch": 0.17314616749008474, + "grad_norm": 0.1274411529302597, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 45490 + }, + { + "epoch": 0.1731842299582074, + "grad_norm": 0.11666889488697052, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 45500 + }, + { + "epoch": 0.17322229242633008, + "grad_norm": 0.13356836140155792, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 45510 + }, + { + "epoch": 0.17326035489445277, + "grad_norm": 0.12142832577228546, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 45520 + }, + { + "epoch": 0.17329841736257545, + "grad_norm": 0.12746329605579376, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 45530 + }, + { + "epoch": 0.17333647983069814, + "grad_norm": 0.1255599856376648, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 45540 + }, + { + "epoch": 0.17337454229882082, + "grad_norm": 0.12974071502685547, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 45550 + }, + { + "epoch": 0.1734126047669435, + "grad_norm": 0.13310503959655762, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 45560 + }, + { + "epoch": 0.1734506672350662, + "grad_norm": 0.1318385899066925, + "learning_rate": 0.0005, + "loss": 2.1589, + "step": 45570 + }, + { + "epoch": 0.17348872970318888, + "grad_norm": 0.1205783486366272, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 45580 + }, + { + "epoch": 0.17352679217131156, + "grad_norm": 0.12158027291297913, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 45590 + }, + { + "epoch": 0.17356485463943425, + "grad_norm": 0.12061561644077301, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 45600 + }, + { + "epoch": 0.17360291710755693, + "grad_norm": 0.11390230059623718, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 45610 + }, + { + "epoch": 0.17364097957567962, + "grad_norm": 0.12488873302936554, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 45620 + }, + { + "epoch": 0.1736790420438023, + "grad_norm": 0.12872549891471863, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 45630 + }, + { + "epoch": 0.17371710451192496, + "grad_norm": 0.13098588585853577, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 45640 + }, + { + "epoch": 0.17375516698004764, + "grad_norm": 0.13178899884223938, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 45650 + }, + { + "epoch": 0.17379322944817033, + "grad_norm": 0.12277550995349884, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 45660 + }, + { + "epoch": 0.173831291916293, + "grad_norm": 0.1527359038591385, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 45670 + }, + { + "epoch": 0.1738693543844157, + "grad_norm": 0.12647026777267456, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 45680 + }, + { + "epoch": 0.17390741685253838, + "grad_norm": 0.13957686722278595, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 45690 + }, + { + "epoch": 0.17394547932066107, + "grad_norm": 0.13181020319461823, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 45700 + }, + { + "epoch": 0.17398354178878375, + "grad_norm": 0.12165477126836777, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 45710 + }, + { + "epoch": 0.17402160425690644, + "grad_norm": 0.1272124946117401, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 45720 + }, + { + "epoch": 0.17405966672502912, + "grad_norm": 0.1310453861951828, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 45730 + }, + { + "epoch": 0.1740977291931518, + "grad_norm": 0.1178453266620636, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 45740 + }, + { + "epoch": 0.1741357916612745, + "grad_norm": 0.12335092574357986, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 45750 + }, + { + "epoch": 0.17417385412939718, + "grad_norm": 0.13877379894256592, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 45760 + }, + { + "epoch": 0.17421191659751986, + "grad_norm": 0.11673180013895035, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 45770 + }, + { + "epoch": 0.17424997906564255, + "grad_norm": 0.13545605540275574, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 45780 + }, + { + "epoch": 0.1742880415337652, + "grad_norm": 0.124485544860363, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 45790 + }, + { + "epoch": 0.1743261040018879, + "grad_norm": 0.12190944701433182, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 45800 + }, + { + "epoch": 0.17436416647001057, + "grad_norm": 0.12852436304092407, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 45810 + }, + { + "epoch": 0.17440222893813326, + "grad_norm": 0.12599514424800873, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 45820 + }, + { + "epoch": 0.17444029140625594, + "grad_norm": 0.11554643511772156, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 45830 + }, + { + "epoch": 0.17447835387437863, + "grad_norm": 0.12082778662443161, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 45840 + }, + { + "epoch": 0.1745164163425013, + "grad_norm": 0.12294019758701324, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 45850 + }, + { + "epoch": 0.174554478810624, + "grad_norm": 0.12255796045064926, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 45860 + }, + { + "epoch": 0.17459254127874668, + "grad_norm": 0.1204526275396347, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 45870 + }, + { + "epoch": 0.17463060374686937, + "grad_norm": 0.124061718583107, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 45880 + }, + { + "epoch": 0.17466866621499205, + "grad_norm": 0.11348722875118256, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 45890 + }, + { + "epoch": 0.17470672868311474, + "grad_norm": 0.12603124976158142, + "learning_rate": 0.0005, + "loss": 2.1602, + "step": 45900 + }, + { + "epoch": 0.17474479115123742, + "grad_norm": 0.12413591146469116, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 45910 + }, + { + "epoch": 0.1747828536193601, + "grad_norm": 0.13132306933403015, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 45920 + }, + { + "epoch": 0.17482091608748276, + "grad_norm": 0.12539812922477722, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 45930 + }, + { + "epoch": 0.17485897855560545, + "grad_norm": 0.10985272377729416, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 45940 + }, + { + "epoch": 0.17489704102372813, + "grad_norm": 0.1305789202451706, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 45950 + }, + { + "epoch": 0.17493510349185082, + "grad_norm": 0.1933310478925705, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 45960 + }, + { + "epoch": 0.1749731659599735, + "grad_norm": 0.12040173262357712, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 45970 + }, + { + "epoch": 0.1750112284280962, + "grad_norm": 0.15249690413475037, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 45980 + }, + { + "epoch": 0.17504929089621887, + "grad_norm": 0.1186014786362648, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 45990 + }, + { + "epoch": 0.17508735336434156, + "grad_norm": 0.12150265276432037, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 46000 + }, + { + "epoch": 0.17512541583246424, + "grad_norm": 0.1239115297794342, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 46010 + }, + { + "epoch": 0.17516347830058693, + "grad_norm": 0.13498394191265106, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 46020 + }, + { + "epoch": 0.1752015407687096, + "grad_norm": 0.12338341772556305, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 46030 + }, + { + "epoch": 0.1752396032368323, + "grad_norm": 0.1274867206811905, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 46040 + }, + { + "epoch": 0.17527766570495498, + "grad_norm": 0.11397123336791992, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 46050 + }, + { + "epoch": 0.17531572817307767, + "grad_norm": 0.16468773782253265, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 46060 + }, + { + "epoch": 0.17535379064120032, + "grad_norm": 0.11684907227754593, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 46070 + }, + { + "epoch": 0.175391853109323, + "grad_norm": 0.13277249038219452, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 46080 + }, + { + "epoch": 0.1754299155774457, + "grad_norm": 0.1336735486984253, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 46090 + }, + { + "epoch": 0.17546797804556838, + "grad_norm": 0.1380987912416458, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 46100 + }, + { + "epoch": 0.17550604051369106, + "grad_norm": 0.12328807264566422, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 46110 + }, + { + "epoch": 0.17554410298181375, + "grad_norm": 0.12212718278169632, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 46120 + }, + { + "epoch": 0.17558216544993643, + "grad_norm": 0.11557463556528091, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 46130 + }, + { + "epoch": 0.17562022791805912, + "grad_norm": 0.1220347136259079, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 46140 + }, + { + "epoch": 0.1756582903861818, + "grad_norm": 0.13644815981388092, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 46150 + }, + { + "epoch": 0.1756963528543045, + "grad_norm": 0.13459473848342896, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 46160 + }, + { + "epoch": 0.17573441532242717, + "grad_norm": 0.12515737116336823, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 46170 + }, + { + "epoch": 0.17577247779054986, + "grad_norm": 0.13208098709583282, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 46180 + }, + { + "epoch": 0.17581054025867254, + "grad_norm": 0.11350484192371368, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 46190 + }, + { + "epoch": 0.17584860272679523, + "grad_norm": 0.11895857751369476, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 46200 + }, + { + "epoch": 0.1758866651949179, + "grad_norm": 0.11057837307453156, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 46210 + }, + { + "epoch": 0.17592472766304057, + "grad_norm": 0.1252758502960205, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 46220 + }, + { + "epoch": 0.17596279013116325, + "grad_norm": 0.12299270182847977, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 46230 + }, + { + "epoch": 0.17600085259928594, + "grad_norm": 0.12673571705818176, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 46240 + }, + { + "epoch": 0.17603891506740862, + "grad_norm": 0.10950964689254761, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 46250 + }, + { + "epoch": 0.1760769775355313, + "grad_norm": 0.11511676013469696, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 46260 + }, + { + "epoch": 0.176115040003654, + "grad_norm": 0.12869395315647125, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 46270 + }, + { + "epoch": 0.17615310247177668, + "grad_norm": 0.14847025275230408, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 46280 + }, + { + "epoch": 0.17619116493989936, + "grad_norm": 0.11274388432502747, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 46290 + }, + { + "epoch": 0.17622922740802205, + "grad_norm": 0.12950894236564636, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 46300 + }, + { + "epoch": 0.17626728987614473, + "grad_norm": 0.1113792359828949, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 46310 + }, + { + "epoch": 0.17630535234426742, + "grad_norm": 0.15397079288959503, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 46320 + }, + { + "epoch": 0.1763434148123901, + "grad_norm": 0.13755261898040771, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 46330 + }, + { + "epoch": 0.1763814772805128, + "grad_norm": 0.11487016081809998, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 46340 + }, + { + "epoch": 0.17641953974863547, + "grad_norm": 0.11901956796646118, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 46350 + }, + { + "epoch": 0.17645760221675813, + "grad_norm": 0.12137291580438614, + "learning_rate": 0.0005, + "loss": 2.1573, + "step": 46360 + }, + { + "epoch": 0.17649566468488082, + "grad_norm": 0.11371459066867828, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 46370 + }, + { + "epoch": 0.1765337271530035, + "grad_norm": 0.1144208237528801, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 46380 + }, + { + "epoch": 0.17657178962112619, + "grad_norm": 0.1280364841222763, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 46390 + }, + { + "epoch": 0.17660985208924887, + "grad_norm": 0.12925398349761963, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 46400 + }, + { + "epoch": 0.17664791455737155, + "grad_norm": 0.12857505679130554, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 46410 + }, + { + "epoch": 0.17668597702549424, + "grad_norm": 0.13170984387397766, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 46420 + }, + { + "epoch": 0.17672403949361692, + "grad_norm": 0.12619630992412567, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 46430 + }, + { + "epoch": 0.1767621019617396, + "grad_norm": 0.14194762706756592, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 46440 + }, + { + "epoch": 0.1768001644298623, + "grad_norm": 0.12867200374603271, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 46450 + }, + { + "epoch": 0.17683822689798498, + "grad_norm": 0.12789608538150787, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 46460 + }, + { + "epoch": 0.17687628936610766, + "grad_norm": 0.1245100274682045, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 46470 + }, + { + "epoch": 0.17691435183423035, + "grad_norm": 0.15106020867824554, + "learning_rate": 0.0005, + "loss": 2.1558, + "step": 46480 + }, + { + "epoch": 0.17695241430235303, + "grad_norm": 0.13041353225708008, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 46490 + }, + { + "epoch": 0.1769904767704757, + "grad_norm": 0.13027580082416534, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 46500 + }, + { + "epoch": 0.17702853923859838, + "grad_norm": 0.11549444496631622, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 46510 + }, + { + "epoch": 0.17706660170672106, + "grad_norm": 0.13323059678077698, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 46520 + }, + { + "epoch": 0.17710466417484375, + "grad_norm": 0.1289878934621811, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 46530 + }, + { + "epoch": 0.17714272664296643, + "grad_norm": 0.13852861523628235, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 46540 + }, + { + "epoch": 0.17718078911108912, + "grad_norm": 0.13921350240707397, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 46550 + }, + { + "epoch": 0.1772188515792118, + "grad_norm": 0.12161636352539062, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 46560 + }, + { + "epoch": 0.17725691404733449, + "grad_norm": 0.13465529680252075, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 46570 + }, + { + "epoch": 0.17729497651545717, + "grad_norm": 0.17854134738445282, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 46580 + }, + { + "epoch": 0.17733303898357985, + "grad_norm": 0.12033980339765549, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 46590 + }, + { + "epoch": 0.17737110145170254, + "grad_norm": 0.1267291158437729, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 46600 + }, + { + "epoch": 0.17740916391982522, + "grad_norm": 0.11675343662500381, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 46610 + }, + { + "epoch": 0.1774472263879479, + "grad_norm": 0.12187250703573227, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 46620 + }, + { + "epoch": 0.1774852888560706, + "grad_norm": 0.12096511572599411, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 46630 + }, + { + "epoch": 0.17752335132419328, + "grad_norm": 0.12929442524909973, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 46640 + }, + { + "epoch": 0.17756141379231594, + "grad_norm": 0.12891370058059692, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 46650 + }, + { + "epoch": 0.17759947626043862, + "grad_norm": 0.12333334982395172, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 46660 + }, + { + "epoch": 0.1776375387285613, + "grad_norm": 0.1466992199420929, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 46670 + }, + { + "epoch": 0.177675601196684, + "grad_norm": 0.12020840495824814, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 46680 + }, + { + "epoch": 0.17771366366480668, + "grad_norm": 0.12505066394805908, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 46690 + }, + { + "epoch": 0.17775172613292936, + "grad_norm": 0.1469816416501999, + "learning_rate": 0.0005, + "loss": 2.1555, + "step": 46700 + }, + { + "epoch": 0.17778978860105205, + "grad_norm": 0.12790228426456451, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 46710 + }, + { + "epoch": 0.17782785106917473, + "grad_norm": 0.12211241573095322, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 46720 + }, + { + "epoch": 0.17786591353729742, + "grad_norm": 0.1326773464679718, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 46730 + }, + { + "epoch": 0.1779039760054201, + "grad_norm": 0.14631135761737823, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 46740 + }, + { + "epoch": 0.17794203847354279, + "grad_norm": 0.12743769586086273, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 46750 + }, + { + "epoch": 0.17798010094166547, + "grad_norm": 0.12034343928098679, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 46760 + }, + { + "epoch": 0.17801816340978815, + "grad_norm": 0.146920844912529, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 46770 + }, + { + "epoch": 0.17805622587791084, + "grad_norm": 0.12294947355985641, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 46780 + }, + { + "epoch": 0.1780942883460335, + "grad_norm": 0.13369952142238617, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 46790 + }, + { + "epoch": 0.17813235081415618, + "grad_norm": 0.12316075712442398, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 46800 + }, + { + "epoch": 0.17817041328227887, + "grad_norm": 0.1283416450023651, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 46810 + }, + { + "epoch": 0.17820847575040155, + "grad_norm": 0.11663859337568283, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 46820 + }, + { + "epoch": 0.17824653821852424, + "grad_norm": 0.1286969780921936, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 46830 + }, + { + "epoch": 0.17828460068664692, + "grad_norm": 0.12117967009544373, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 46840 + }, + { + "epoch": 0.1783226631547696, + "grad_norm": 0.1272825002670288, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 46850 + }, + { + "epoch": 0.1783607256228923, + "grad_norm": 0.12218283116817474, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 46860 + }, + { + "epoch": 0.17839878809101498, + "grad_norm": 0.12377354502677917, + "learning_rate": 0.0005, + "loss": 2.1587, + "step": 46870 + }, + { + "epoch": 0.17843685055913766, + "grad_norm": 0.12458238005638123, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 46880 + }, + { + "epoch": 0.17847491302726035, + "grad_norm": 0.12494272738695145, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 46890 + }, + { + "epoch": 0.17851297549538303, + "grad_norm": 0.13676008582115173, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 46900 + }, + { + "epoch": 0.17855103796350572, + "grad_norm": 0.14277496933937073, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 46910 + }, + { + "epoch": 0.1785891004316284, + "grad_norm": 0.12220119684934616, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 46920 + }, + { + "epoch": 0.17862716289975109, + "grad_norm": 0.13440436124801636, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 46930 + }, + { + "epoch": 0.17866522536787374, + "grad_norm": 0.1321924924850464, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 46940 + }, + { + "epoch": 0.17870328783599643, + "grad_norm": 0.13210120797157288, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 46950 + }, + { + "epoch": 0.1787413503041191, + "grad_norm": 0.12004867941141129, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 46960 + }, + { + "epoch": 0.1787794127722418, + "grad_norm": 0.13169914484024048, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 46970 + }, + { + "epoch": 0.17881747524036448, + "grad_norm": 0.12706497311592102, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 46980 + }, + { + "epoch": 0.17885553770848717, + "grad_norm": 0.1386055052280426, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 46990 + }, + { + "epoch": 0.17889360017660985, + "grad_norm": 0.1212029978632927, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 47000 + }, + { + "epoch": 0.17893166264473254, + "grad_norm": 0.12467978894710541, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 47010 + }, + { + "epoch": 0.17896972511285522, + "grad_norm": 0.141191303730011, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 47020 + }, + { + "epoch": 0.1790077875809779, + "grad_norm": 0.13441681861877441, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 47030 + }, + { + "epoch": 0.1790458500491006, + "grad_norm": 0.12653379142284393, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 47040 + }, + { + "epoch": 0.17908391251722328, + "grad_norm": 0.1331082135438919, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 47050 + }, + { + "epoch": 0.17912197498534596, + "grad_norm": 0.12425190955400467, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 47060 + }, + { + "epoch": 0.17916003745346865, + "grad_norm": 0.12457893788814545, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 47070 + }, + { + "epoch": 0.1791980999215913, + "grad_norm": 0.1274491250514984, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 47080 + }, + { + "epoch": 0.179236162389714, + "grad_norm": 0.12396488338708878, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 47090 + }, + { + "epoch": 0.17927422485783667, + "grad_norm": 0.12364597618579865, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 47100 + }, + { + "epoch": 0.17931228732595936, + "grad_norm": 0.11393121629953384, + "learning_rate": 0.0005, + "loss": 2.1472, + "step": 47110 + }, + { + "epoch": 0.17935034979408204, + "grad_norm": 0.14089448750019073, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 47120 + }, + { + "epoch": 0.17938841226220473, + "grad_norm": 0.12793299555778503, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 47130 + }, + { + "epoch": 0.1794264747303274, + "grad_norm": 0.14278213679790497, + "learning_rate": 0.0005, + "loss": 2.1544, + "step": 47140 + }, + { + "epoch": 0.1794645371984501, + "grad_norm": 0.12074515223503113, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 47150 + }, + { + "epoch": 0.17950259966657278, + "grad_norm": 0.11604276299476624, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 47160 + }, + { + "epoch": 0.17954066213469547, + "grad_norm": 0.12846846878528595, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 47170 + }, + { + "epoch": 0.17957872460281815, + "grad_norm": 0.10897751897573471, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 47180 + }, + { + "epoch": 0.17961678707094084, + "grad_norm": 0.12026036530733109, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 47190 + }, + { + "epoch": 0.17965484953906352, + "grad_norm": 0.12652936577796936, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 47200 + }, + { + "epoch": 0.1796929120071862, + "grad_norm": 0.1351630538702011, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 47210 + }, + { + "epoch": 0.17973097447530886, + "grad_norm": 0.15102262794971466, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 47220 + }, + { + "epoch": 0.17976903694343155, + "grad_norm": 0.1387786865234375, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 47230 + }, + { + "epoch": 0.17980709941155423, + "grad_norm": 0.11889027804136276, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 47240 + }, + { + "epoch": 0.17984516187967692, + "grad_norm": 0.1278916597366333, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 47250 + }, + { + "epoch": 0.1798832243477996, + "grad_norm": 0.13283556699752808, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 47260 + }, + { + "epoch": 0.1799212868159223, + "grad_norm": 0.11985579878091812, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 47270 + }, + { + "epoch": 0.17995934928404497, + "grad_norm": 0.11472861468791962, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 47280 + }, + { + "epoch": 0.17999741175216766, + "grad_norm": 0.11842813342809677, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 47290 + }, + { + "epoch": 0.18003547422029034, + "grad_norm": 0.12179480493068695, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 47300 + }, + { + "epoch": 0.18007353668841303, + "grad_norm": 0.11679831147193909, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 47310 + }, + { + "epoch": 0.1801115991565357, + "grad_norm": 0.12437867373228073, + "learning_rate": 0.0005, + "loss": 2.1533, + "step": 47320 + }, + { + "epoch": 0.1801496616246584, + "grad_norm": 0.11317010223865509, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 47330 + }, + { + "epoch": 0.18018772409278108, + "grad_norm": 0.13073226809501648, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 47340 + }, + { + "epoch": 0.18022578656090377, + "grad_norm": 0.12761497497558594, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 47350 + }, + { + "epoch": 0.18026384902902645, + "grad_norm": 0.1252940446138382, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 47360 + }, + { + "epoch": 0.1803019114971491, + "grad_norm": 0.12291482836008072, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 47370 + }, + { + "epoch": 0.1803399739652718, + "grad_norm": 0.12462043762207031, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 47380 + }, + { + "epoch": 0.18037803643339448, + "grad_norm": 0.13734501600265503, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 47390 + }, + { + "epoch": 0.18041609890151716, + "grad_norm": 0.13022781908512115, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 47400 + }, + { + "epoch": 0.18045416136963985, + "grad_norm": 0.13055624067783356, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 47410 + }, + { + "epoch": 0.18049222383776253, + "grad_norm": 0.1153981164097786, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 47420 + }, + { + "epoch": 0.18053028630588522, + "grad_norm": 0.14261126518249512, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 47430 + }, + { + "epoch": 0.1805683487740079, + "grad_norm": 0.15162013471126556, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 47440 + }, + { + "epoch": 0.1806064112421306, + "grad_norm": 0.12845498323440552, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 47450 + }, + { + "epoch": 0.18064447371025327, + "grad_norm": 0.11967799067497253, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 47460 + }, + { + "epoch": 0.18068253617837596, + "grad_norm": 0.12578807771205902, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 47470 + }, + { + "epoch": 0.18072059864649864, + "grad_norm": 0.1274806410074234, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 47480 + }, + { + "epoch": 0.18075866111462133, + "grad_norm": 0.14504271745681763, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 47490 + }, + { + "epoch": 0.180796723582744, + "grad_norm": 0.13173021376132965, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 47500 + }, + { + "epoch": 0.18083478605086667, + "grad_norm": 0.13466207683086395, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 47510 + }, + { + "epoch": 0.18087284851898935, + "grad_norm": 0.1106971949338913, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 47520 + }, + { + "epoch": 0.18091091098711204, + "grad_norm": 0.128569096326828, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 47530 + }, + { + "epoch": 0.18094897345523472, + "grad_norm": 0.13816776871681213, + "learning_rate": 0.0005, + "loss": 2.1538, + "step": 47540 + }, + { + "epoch": 0.1809870359233574, + "grad_norm": 0.14278821647167206, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 47550 + }, + { + "epoch": 0.1810250983914801, + "grad_norm": 0.13838042318820953, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 47560 + }, + { + "epoch": 0.18106316085960278, + "grad_norm": 0.1207902655005455, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 47570 + }, + { + "epoch": 0.18110122332772546, + "grad_norm": 0.12243013083934784, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 47580 + }, + { + "epoch": 0.18113928579584815, + "grad_norm": 0.11340376734733582, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 47590 + }, + { + "epoch": 0.18117734826397083, + "grad_norm": 0.12646237015724182, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 47600 + }, + { + "epoch": 0.18121541073209352, + "grad_norm": 0.13434185087680817, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 47610 + }, + { + "epoch": 0.1812534732002162, + "grad_norm": 0.11621993035078049, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 47620 + }, + { + "epoch": 0.1812915356683389, + "grad_norm": 0.12238717824220657, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 47630 + }, + { + "epoch": 0.18132959813646157, + "grad_norm": 0.12311246991157532, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 47640 + }, + { + "epoch": 0.18136766060458423, + "grad_norm": 0.12371234595775604, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 47650 + }, + { + "epoch": 0.18140572307270691, + "grad_norm": 0.13039273023605347, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 47660 + }, + { + "epoch": 0.1814437855408296, + "grad_norm": 0.12399288266897202, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 47670 + }, + { + "epoch": 0.18148184800895228, + "grad_norm": 0.13051196932792664, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 47680 + }, + { + "epoch": 0.18151991047707497, + "grad_norm": 0.11748791486024857, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 47690 + }, + { + "epoch": 0.18155797294519765, + "grad_norm": 0.11903548985719681, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 47700 + }, + { + "epoch": 0.18159603541332034, + "grad_norm": 0.12035155296325684, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 47710 + }, + { + "epoch": 0.18163409788144302, + "grad_norm": 0.11695858836174011, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 47720 + }, + { + "epoch": 0.1816721603495657, + "grad_norm": 0.12408757954835892, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 47730 + }, + { + "epoch": 0.1817102228176884, + "grad_norm": 0.12435764074325562, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 47740 + }, + { + "epoch": 0.18174828528581108, + "grad_norm": 0.12510110437870026, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 47750 + }, + { + "epoch": 0.18178634775393376, + "grad_norm": 0.12769056856632233, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 47760 + }, + { + "epoch": 0.18182441022205645, + "grad_norm": 0.12242080271244049, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 47770 + }, + { + "epoch": 0.18186247269017913, + "grad_norm": 0.13732148706912994, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 47780 + }, + { + "epoch": 0.18190053515830182, + "grad_norm": 0.13399755954742432, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 47790 + }, + { + "epoch": 0.18193859762642448, + "grad_norm": 0.12308243662118912, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 47800 + }, + { + "epoch": 0.18197666009454716, + "grad_norm": 0.11698419600725174, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 47810 + }, + { + "epoch": 0.18201472256266985, + "grad_norm": 0.10664495825767517, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 47820 + }, + { + "epoch": 0.18205278503079253, + "grad_norm": 0.1362016648054123, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 47830 + }, + { + "epoch": 0.18209084749891521, + "grad_norm": 0.12378685176372528, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 47840 + }, + { + "epoch": 0.1821289099670379, + "grad_norm": 0.11399336904287338, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 47850 + }, + { + "epoch": 0.18216697243516058, + "grad_norm": 0.13297849893569946, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 47860 + }, + { + "epoch": 0.18220503490328327, + "grad_norm": 0.13696694374084473, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 47870 + }, + { + "epoch": 0.18224309737140595, + "grad_norm": 0.13576148450374603, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 47880 + }, + { + "epoch": 0.18228115983952864, + "grad_norm": 0.12247444689273834, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 47890 + }, + { + "epoch": 0.18231922230765132, + "grad_norm": 0.12061848491430283, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 47900 + }, + { + "epoch": 0.182357284775774, + "grad_norm": 0.1310357004404068, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 47910 + }, + { + "epoch": 0.1823953472438967, + "grad_norm": 0.12328796088695526, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 47920 + }, + { + "epoch": 0.18243340971201938, + "grad_norm": 0.13524876534938812, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 47930 + }, + { + "epoch": 0.18247147218014204, + "grad_norm": 0.13573935627937317, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 47940 + }, + { + "epoch": 0.18250953464826472, + "grad_norm": 0.12921947240829468, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 47950 + }, + { + "epoch": 0.1825475971163874, + "grad_norm": 0.12181194871664047, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 47960 + }, + { + "epoch": 0.1825856595845101, + "grad_norm": 0.12479119002819061, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 47970 + }, + { + "epoch": 0.18262372205263278, + "grad_norm": 0.1317525953054428, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 47980 + }, + { + "epoch": 0.18266178452075546, + "grad_norm": 0.12510502338409424, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 47990 + }, + { + "epoch": 0.18269984698887815, + "grad_norm": 0.13090606033802032, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 48000 + }, + { + "epoch": 0.18273790945700083, + "grad_norm": 0.1211482584476471, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 48010 + }, + { + "epoch": 0.18277597192512351, + "grad_norm": 0.12430895119905472, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 48020 + }, + { + "epoch": 0.1828140343932462, + "grad_norm": 0.1278323233127594, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 48030 + }, + { + "epoch": 0.18285209686136888, + "grad_norm": 0.13269633054733276, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 48040 + }, + { + "epoch": 0.18289015932949157, + "grad_norm": 0.1252775341272354, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 48050 + }, + { + "epoch": 0.18292822179761425, + "grad_norm": 0.14520369470119476, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 48060 + }, + { + "epoch": 0.18296628426573694, + "grad_norm": 0.1397608369588852, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 48070 + }, + { + "epoch": 0.18300434673385962, + "grad_norm": 0.1267949640750885, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 48080 + }, + { + "epoch": 0.18304240920198228, + "grad_norm": 0.13453710079193115, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 48090 + }, + { + "epoch": 0.18308047167010497, + "grad_norm": 0.12775638699531555, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 48100 + }, + { + "epoch": 0.18311853413822765, + "grad_norm": 0.1129986122250557, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 48110 + }, + { + "epoch": 0.18315659660635034, + "grad_norm": 0.11675728112459183, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 48120 + }, + { + "epoch": 0.18319465907447302, + "grad_norm": 0.11513984203338623, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 48130 + }, + { + "epoch": 0.1832327215425957, + "grad_norm": 0.12235622853040695, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 48140 + }, + { + "epoch": 0.1832707840107184, + "grad_norm": 0.13701052963733673, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 48150 + }, + { + "epoch": 0.18330884647884108, + "grad_norm": 0.11779025197029114, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 48160 + }, + { + "epoch": 0.18334690894696376, + "grad_norm": 0.11653947830200195, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 48170 + }, + { + "epoch": 0.18338497141508645, + "grad_norm": 0.11426492035388947, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 48180 + }, + { + "epoch": 0.18342303388320913, + "grad_norm": 0.12913554906845093, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 48190 + }, + { + "epoch": 0.18346109635133181, + "grad_norm": 0.11243739724159241, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 48200 + }, + { + "epoch": 0.1834991588194545, + "grad_norm": 0.12019768357276917, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 48210 + }, + { + "epoch": 0.18353722128757718, + "grad_norm": 0.12446115911006927, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 48220 + }, + { + "epoch": 0.18357528375569984, + "grad_norm": 0.12636056542396545, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 48230 + }, + { + "epoch": 0.18361334622382253, + "grad_norm": 0.1382257640361786, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 48240 + }, + { + "epoch": 0.1836514086919452, + "grad_norm": 0.12824708223342896, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 48250 + }, + { + "epoch": 0.1836894711600679, + "grad_norm": 0.11088469624519348, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 48260 + }, + { + "epoch": 0.18372753362819058, + "grad_norm": 0.12594173848628998, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 48270 + }, + { + "epoch": 0.18376559609631327, + "grad_norm": 0.1285485476255417, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 48280 + }, + { + "epoch": 0.18380365856443595, + "grad_norm": 0.13596130907535553, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 48290 + }, + { + "epoch": 0.18384172103255864, + "grad_norm": 0.13879016041755676, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 48300 + }, + { + "epoch": 0.18387978350068132, + "grad_norm": 0.12482644617557526, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 48310 + }, + { + "epoch": 0.183917845968804, + "grad_norm": 0.13269701600074768, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 48320 + }, + { + "epoch": 0.1839559084369267, + "grad_norm": 0.1344316005706787, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 48330 + }, + { + "epoch": 0.18399397090504938, + "grad_norm": 0.1299740970134735, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 48340 + }, + { + "epoch": 0.18403203337317206, + "grad_norm": 0.1218443289399147, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 48350 + }, + { + "epoch": 0.18407009584129475, + "grad_norm": 0.12609827518463135, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 48360 + }, + { + "epoch": 0.1841081583094174, + "grad_norm": 0.12949436902999878, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 48370 + }, + { + "epoch": 0.1841462207775401, + "grad_norm": 0.13706724345684052, + "learning_rate": 0.0005, + "loss": 2.1588, + "step": 48380 + }, + { + "epoch": 0.18418428324566277, + "grad_norm": 0.11350060999393463, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 48390 + }, + { + "epoch": 0.18422234571378546, + "grad_norm": 0.12369034439325333, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 48400 + }, + { + "epoch": 0.18426040818190814, + "grad_norm": 0.13806772232055664, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 48410 + }, + { + "epoch": 0.18429847065003083, + "grad_norm": 0.13240782916545868, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 48420 + }, + { + "epoch": 0.1843365331181535, + "grad_norm": 0.1262694150209427, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 48430 + }, + { + "epoch": 0.1843745955862762, + "grad_norm": 0.13484366238117218, + "learning_rate": 0.0005, + "loss": 2.161, + "step": 48440 + }, + { + "epoch": 0.18441265805439888, + "grad_norm": 0.1304924190044403, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 48450 + }, + { + "epoch": 0.18445072052252157, + "grad_norm": 0.12426736950874329, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 48460 + }, + { + "epoch": 0.18448878299064425, + "grad_norm": 0.13261044025421143, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 48470 + }, + { + "epoch": 0.18452684545876694, + "grad_norm": 0.11962955445051193, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 48480 + }, + { + "epoch": 0.18456490792688962, + "grad_norm": 0.1203152984380722, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 48490 + }, + { + "epoch": 0.1846029703950123, + "grad_norm": 0.12043966352939606, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 48500 + }, + { + "epoch": 0.184641032863135, + "grad_norm": 0.1077461838722229, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 48510 + }, + { + "epoch": 0.18467909533125765, + "grad_norm": 0.125952810049057, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 48520 + }, + { + "epoch": 0.18471715779938033, + "grad_norm": 0.11711869388818741, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 48530 + }, + { + "epoch": 0.18475522026750302, + "grad_norm": 0.12230297923088074, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 48540 + }, + { + "epoch": 0.1847932827356257, + "grad_norm": 0.12001660466194153, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 48550 + }, + { + "epoch": 0.1848313452037484, + "grad_norm": 0.12919172644615173, + "learning_rate": 0.0005, + "loss": 2.1481, + "step": 48560 + }, + { + "epoch": 0.18486940767187107, + "grad_norm": 0.1313171237707138, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 48570 + }, + { + "epoch": 0.18490747013999376, + "grad_norm": 0.9556818604469299, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 48580 + }, + { + "epoch": 0.18494553260811644, + "grad_norm": 0.24016167223453522, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 48590 + }, + { + "epoch": 0.18498359507623913, + "grad_norm": 0.12441037595272064, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 48600 + }, + { + "epoch": 0.1850216575443618, + "grad_norm": 0.13198134303092957, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 48610 + }, + { + "epoch": 0.1850597200124845, + "grad_norm": 0.11687224358320236, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 48620 + }, + { + "epoch": 0.18509778248060718, + "grad_norm": 0.11632097512483597, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 48630 + }, + { + "epoch": 0.18513584494872987, + "grad_norm": 0.12353020161390305, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 48640 + }, + { + "epoch": 0.18517390741685255, + "grad_norm": 0.13218729197978973, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 48650 + }, + { + "epoch": 0.1852119698849752, + "grad_norm": 0.10913823544979095, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 48660 + }, + { + "epoch": 0.1852500323530979, + "grad_norm": 0.12533710896968842, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 48670 + }, + { + "epoch": 0.18528809482122058, + "grad_norm": 0.10914388298988342, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 48680 + }, + { + "epoch": 0.18532615728934326, + "grad_norm": 0.13747212290763855, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 48690 + }, + { + "epoch": 0.18536421975746595, + "grad_norm": 0.140378937125206, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 48700 + }, + { + "epoch": 0.18540228222558863, + "grad_norm": 0.1217515617609024, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 48710 + }, + { + "epoch": 0.18544034469371132, + "grad_norm": 0.11559165269136429, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 48720 + }, + { + "epoch": 0.185478407161834, + "grad_norm": 0.11101264506578445, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 48730 + }, + { + "epoch": 0.1855164696299567, + "grad_norm": 0.12215148657560349, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 48740 + }, + { + "epoch": 0.18555453209807937, + "grad_norm": 0.12364372611045837, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 48750 + }, + { + "epoch": 0.18559259456620206, + "grad_norm": 0.12270593643188477, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 48760 + }, + { + "epoch": 0.18563065703432474, + "grad_norm": 0.13053959608078003, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 48770 + }, + { + "epoch": 0.18566871950244743, + "grad_norm": 0.11770855635404587, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 48780 + }, + { + "epoch": 0.1857067819705701, + "grad_norm": 0.12644435465335846, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 48790 + }, + { + "epoch": 0.18574484443869277, + "grad_norm": 0.1235184296965599, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 48800 + }, + { + "epoch": 0.18578290690681545, + "grad_norm": 0.12732064723968506, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 48810 + }, + { + "epoch": 0.18582096937493814, + "grad_norm": 0.11984202265739441, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 48820 + }, + { + "epoch": 0.18585903184306082, + "grad_norm": 0.1389389932155609, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 48830 + }, + { + "epoch": 0.1858970943111835, + "grad_norm": 0.118854820728302, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 48840 + }, + { + "epoch": 0.1859351567793062, + "grad_norm": 0.134931281208992, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 48850 + }, + { + "epoch": 0.18597321924742888, + "grad_norm": 0.1198066771030426, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 48860 + }, + { + "epoch": 0.18601128171555156, + "grad_norm": 0.11885160207748413, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 48870 + }, + { + "epoch": 0.18604934418367425, + "grad_norm": 0.11900264024734497, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 48880 + }, + { + "epoch": 0.18608740665179693, + "grad_norm": 0.12022736668586731, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 48890 + }, + { + "epoch": 0.18612546911991962, + "grad_norm": 0.12380018830299377, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 48900 + }, + { + "epoch": 0.1861635315880423, + "grad_norm": 0.12020589411258698, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 48910 + }, + { + "epoch": 0.186201594056165, + "grad_norm": 0.11976821720600128, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 48920 + }, + { + "epoch": 0.18623965652428767, + "grad_norm": 0.11832680553197861, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 48930 + }, + { + "epoch": 0.18627771899241036, + "grad_norm": 0.13924725353717804, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 48940 + }, + { + "epoch": 0.18631578146053301, + "grad_norm": 0.13474136590957642, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 48950 + }, + { + "epoch": 0.1863538439286557, + "grad_norm": 0.12513698637485504, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 48960 + }, + { + "epoch": 0.18639190639677838, + "grad_norm": 0.13066360354423523, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 48970 + }, + { + "epoch": 0.18642996886490107, + "grad_norm": 0.12704874575138092, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 48980 + }, + { + "epoch": 0.18646803133302375, + "grad_norm": 0.13729922473430634, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 48990 + }, + { + "epoch": 0.18650609380114644, + "grad_norm": 0.1181914284825325, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 49000 + }, + { + "epoch": 0.18654415626926912, + "grad_norm": 0.11445435136556625, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 49010 + }, + { + "epoch": 0.1865822187373918, + "grad_norm": 0.12288684397935867, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 49020 + }, + { + "epoch": 0.1866202812055145, + "grad_norm": 0.14258378744125366, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 49030 + }, + { + "epoch": 0.18665834367363718, + "grad_norm": 0.12620224058628082, + "learning_rate": 0.0005, + "loss": 2.1516, + "step": 49040 + }, + { + "epoch": 0.18669640614175986, + "grad_norm": 0.12300539016723633, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 49050 + }, + { + "epoch": 0.18673446860988255, + "grad_norm": 0.12325773388147354, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 49060 + }, + { + "epoch": 0.18677253107800523, + "grad_norm": 0.11611814796924591, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 49070 + }, + { + "epoch": 0.18681059354612792, + "grad_norm": 0.1306041181087494, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 49080 + }, + { + "epoch": 0.18684865601425057, + "grad_norm": 0.12352015823125839, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 49090 + }, + { + "epoch": 0.18688671848237326, + "grad_norm": 0.12239983677864075, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 49100 + }, + { + "epoch": 0.18692478095049594, + "grad_norm": 0.12327679991722107, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 49110 + }, + { + "epoch": 0.18696284341861863, + "grad_norm": 0.12638281285762787, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 49120 + }, + { + "epoch": 0.18700090588674131, + "grad_norm": 0.11979183554649353, + "learning_rate": 0.0005, + "loss": 2.1606, + "step": 49130 + }, + { + "epoch": 0.187038968354864, + "grad_norm": 0.11857368052005768, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 49140 + }, + { + "epoch": 0.18707703082298668, + "grad_norm": 0.11690958589315414, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 49150 + }, + { + "epoch": 0.18711509329110937, + "grad_norm": 0.13325461745262146, + "learning_rate": 0.0005, + "loss": 2.1565, + "step": 49160 + }, + { + "epoch": 0.18715315575923205, + "grad_norm": 0.1198049932718277, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 49170 + }, + { + "epoch": 0.18719121822735474, + "grad_norm": 0.12244052439928055, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 49180 + }, + { + "epoch": 0.18722928069547742, + "grad_norm": 0.11625416576862335, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 49190 + }, + { + "epoch": 0.1872673431636001, + "grad_norm": 0.12328100204467773, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 49200 + }, + { + "epoch": 0.1873054056317228, + "grad_norm": 0.1413453072309494, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 49210 + }, + { + "epoch": 0.18734346809984548, + "grad_norm": 0.11608593910932541, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 49220 + }, + { + "epoch": 0.18738153056796816, + "grad_norm": 0.142526313662529, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 49230 + }, + { + "epoch": 0.18741959303609082, + "grad_norm": 0.13891670107841492, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 49240 + }, + { + "epoch": 0.1874576555042135, + "grad_norm": 0.119914211332798, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 49250 + }, + { + "epoch": 0.1874957179723362, + "grad_norm": 0.12706288695335388, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 49260 + }, + { + "epoch": 0.18753378044045887, + "grad_norm": 0.13240917026996613, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 49270 + }, + { + "epoch": 0.18757184290858156, + "grad_norm": 0.1289173662662506, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 49280 + }, + { + "epoch": 0.18760990537670424, + "grad_norm": 0.110454261302948, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 49290 + }, + { + "epoch": 0.18764796784482693, + "grad_norm": 0.10865352302789688, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 49300 + }, + { + "epoch": 0.18768603031294961, + "grad_norm": 0.11935272812843323, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 49310 + }, + { + "epoch": 0.1877240927810723, + "grad_norm": 0.12708429992198944, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 49320 + }, + { + "epoch": 0.18776215524919498, + "grad_norm": 0.13141992688179016, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 49330 + }, + { + "epoch": 0.18780021771731767, + "grad_norm": 0.12144871056079865, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 49340 + }, + { + "epoch": 0.18783828018544035, + "grad_norm": 0.12627474963665009, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 49350 + }, + { + "epoch": 0.18787634265356304, + "grad_norm": 0.11026687920093536, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 49360 + }, + { + "epoch": 0.18791440512168572, + "grad_norm": 0.12230470031499863, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 49370 + }, + { + "epoch": 0.18795246758980838, + "grad_norm": 0.12970934808254242, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 49380 + }, + { + "epoch": 0.18799053005793107, + "grad_norm": 0.13543622195720673, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 49390 + }, + { + "epoch": 0.18802859252605375, + "grad_norm": 0.11476121097803116, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 49400 + }, + { + "epoch": 0.18806665499417644, + "grad_norm": 0.12133664637804031, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 49410 + }, + { + "epoch": 0.18810471746229912, + "grad_norm": 0.12081196159124374, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 49420 + }, + { + "epoch": 0.1881427799304218, + "grad_norm": 0.12121246755123138, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 49430 + }, + { + "epoch": 0.1881808423985445, + "grad_norm": 0.1321023851633072, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 49440 + }, + { + "epoch": 0.18821890486666717, + "grad_norm": 0.12149399518966675, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 49450 + }, + { + "epoch": 0.18825696733478986, + "grad_norm": 0.12127463519573212, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 49460 + }, + { + "epoch": 0.18829502980291254, + "grad_norm": 0.12894226610660553, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 49470 + }, + { + "epoch": 0.18833309227103523, + "grad_norm": 0.11598140746355057, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 49480 + }, + { + "epoch": 0.18837115473915791, + "grad_norm": 0.11026100814342499, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 49490 + }, + { + "epoch": 0.1884092172072806, + "grad_norm": 0.11874309182167053, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 49500 + }, + { + "epoch": 0.18844727967540328, + "grad_norm": 0.12631499767303467, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 49510 + }, + { + "epoch": 0.18848534214352594, + "grad_norm": 0.1200314313173294, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 49520 + }, + { + "epoch": 0.18852340461164863, + "grad_norm": 0.12178215384483337, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 49530 + }, + { + "epoch": 0.1885614670797713, + "grad_norm": 0.11553493142127991, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 49540 + }, + { + "epoch": 0.188599529547894, + "grad_norm": 0.24640120565891266, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 49550 + }, + { + "epoch": 0.18863759201601668, + "grad_norm": 0.12426994740962982, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 49560 + }, + { + "epoch": 0.18867565448413937, + "grad_norm": 0.13868281245231628, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 49570 + }, + { + "epoch": 0.18871371695226205, + "grad_norm": 0.12489303201436996, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 49580 + }, + { + "epoch": 0.18875177942038474, + "grad_norm": 0.1338043361902237, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 49590 + }, + { + "epoch": 0.18878984188850742, + "grad_norm": 0.13678200542926788, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 49600 + }, + { + "epoch": 0.1888279043566301, + "grad_norm": 0.12890838086605072, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 49610 + }, + { + "epoch": 0.1888659668247528, + "grad_norm": 0.11385848373174667, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 49620 + }, + { + "epoch": 0.18890402929287547, + "grad_norm": 0.11929275095462799, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 49630 + }, + { + "epoch": 0.18894209176099816, + "grad_norm": 0.13784067332744598, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 49640 + }, + { + "epoch": 0.18898015422912084, + "grad_norm": 0.12938465178012848, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 49650 + }, + { + "epoch": 0.18901821669724353, + "grad_norm": 0.14009609818458557, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 49660 + }, + { + "epoch": 0.1890562791653662, + "grad_norm": 0.1335345357656479, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 49670 + }, + { + "epoch": 0.18909434163348887, + "grad_norm": 0.12277305871248245, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 49680 + }, + { + "epoch": 0.18913240410161156, + "grad_norm": 0.12641063332557678, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 49690 + }, + { + "epoch": 0.18917046656973424, + "grad_norm": 0.13216422498226166, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 49700 + }, + { + "epoch": 0.18920852903785693, + "grad_norm": 0.1259606033563614, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 49710 + }, + { + "epoch": 0.1892465915059796, + "grad_norm": 0.1319742500782013, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 49720 + }, + { + "epoch": 0.1892846539741023, + "grad_norm": 0.14151360094547272, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 49730 + }, + { + "epoch": 0.18932271644222498, + "grad_norm": 0.1352107673883438, + "learning_rate": 0.0005, + "loss": 2.146, + "step": 49740 + }, + { + "epoch": 0.18936077891034767, + "grad_norm": 0.11739267408847809, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 49750 + }, + { + "epoch": 0.18939884137847035, + "grad_norm": 0.1501825600862503, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 49760 + }, + { + "epoch": 0.18943690384659304, + "grad_norm": 0.1281415820121765, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 49770 + }, + { + "epoch": 0.18947496631471572, + "grad_norm": 0.12109538912773132, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 49780 + }, + { + "epoch": 0.1895130287828384, + "grad_norm": 0.13439509272575378, + "learning_rate": 0.0005, + "loss": 2.156, + "step": 49790 + }, + { + "epoch": 0.1895510912509611, + "grad_norm": 0.11888731271028519, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 49800 + }, + { + "epoch": 0.18958915371908375, + "grad_norm": 0.13279883563518524, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 49810 + }, + { + "epoch": 0.18962721618720643, + "grad_norm": 0.11696521192789078, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 49820 + }, + { + "epoch": 0.18966527865532912, + "grad_norm": 0.11308915913105011, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 49830 + }, + { + "epoch": 0.1897033411234518, + "grad_norm": 0.12376297265291214, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 49840 + }, + { + "epoch": 0.1897414035915745, + "grad_norm": 0.14907288551330566, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 49850 + }, + { + "epoch": 0.18977946605969717, + "grad_norm": 0.11262844502925873, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 49860 + }, + { + "epoch": 0.18981752852781986, + "grad_norm": 0.12050331383943558, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 49870 + }, + { + "epoch": 0.18985559099594254, + "grad_norm": 0.12309075146913528, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 49880 + }, + { + "epoch": 0.18989365346406523, + "grad_norm": 0.12859241664409637, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 49890 + }, + { + "epoch": 0.1899317159321879, + "grad_norm": 0.13295090198516846, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 49900 + }, + { + "epoch": 0.1899697784003106, + "grad_norm": 0.13653841614723206, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 49910 + }, + { + "epoch": 0.19000784086843328, + "grad_norm": 0.12351825833320618, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 49920 + }, + { + "epoch": 0.19004590333655597, + "grad_norm": 0.11759936064481735, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 49930 + }, + { + "epoch": 0.19008396580467865, + "grad_norm": 0.12333490699529648, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 49940 + }, + { + "epoch": 0.1901220282728013, + "grad_norm": 0.12636221945285797, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 49950 + }, + { + "epoch": 0.190160090740924, + "grad_norm": 0.11555564403533936, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 49960 + }, + { + "epoch": 0.19019815320904668, + "grad_norm": 0.11537200212478638, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 49970 + }, + { + "epoch": 0.19023621567716936, + "grad_norm": 0.1284588724374771, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 49980 + }, + { + "epoch": 0.19027427814529205, + "grad_norm": 0.151369109749794, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 49990 + }, + { + "epoch": 0.19031234061341473, + "grad_norm": 0.11686275154352188, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 50000 + }, + { + "epoch": 0.19035040308153742, + "grad_norm": 0.10784438997507095, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 50010 + }, + { + "epoch": 0.1903884655496601, + "grad_norm": 0.22661909461021423, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 50020 + }, + { + "epoch": 0.1904265280177828, + "grad_norm": 0.10580966621637344, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 50030 + }, + { + "epoch": 0.19046459048590547, + "grad_norm": 0.12006790190935135, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 50040 + }, + { + "epoch": 0.19050265295402816, + "grad_norm": 0.1274092197418213, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 50050 + }, + { + "epoch": 0.19054071542215084, + "grad_norm": 0.13993249833583832, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 50060 + }, + { + "epoch": 0.19057877789027353, + "grad_norm": 0.13096733391284943, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 50070 + }, + { + "epoch": 0.1906168403583962, + "grad_norm": 0.11933179944753647, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 50080 + }, + { + "epoch": 0.1906549028265189, + "grad_norm": 0.1263686716556549, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 50090 + }, + { + "epoch": 0.19069296529464155, + "grad_norm": 0.12206216901540756, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 50100 + }, + { + "epoch": 0.19073102776276424, + "grad_norm": 0.13494496047496796, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 50110 + }, + { + "epoch": 0.19076909023088692, + "grad_norm": 0.11924020200967789, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 50120 + }, + { + "epoch": 0.1908071526990096, + "grad_norm": 0.13925659656524658, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 50130 + }, + { + "epoch": 0.1908452151671323, + "grad_norm": 0.12809541821479797, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 50140 + }, + { + "epoch": 0.19088327763525498, + "grad_norm": 0.11883596330881119, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 50150 + }, + { + "epoch": 0.19092134010337766, + "grad_norm": 0.11638153344392776, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 50160 + }, + { + "epoch": 0.19095940257150035, + "grad_norm": 0.12264906615018845, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 50170 + }, + { + "epoch": 0.19099746503962303, + "grad_norm": 0.12147228419780731, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 50180 + }, + { + "epoch": 0.19103552750774572, + "grad_norm": 0.11615349352359772, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 50190 + }, + { + "epoch": 0.1910735899758684, + "grad_norm": 0.1336466521024704, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 50200 + }, + { + "epoch": 0.1911116524439911, + "grad_norm": 0.12595194578170776, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 50210 + }, + { + "epoch": 0.19114971491211377, + "grad_norm": 0.11743763834238052, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 50220 + }, + { + "epoch": 0.19118777738023646, + "grad_norm": 0.11929907649755478, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 50230 + }, + { + "epoch": 0.1912258398483591, + "grad_norm": 0.11452007293701172, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 50240 + }, + { + "epoch": 0.1912639023164818, + "grad_norm": 0.13159744441509247, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 50250 + }, + { + "epoch": 0.19130196478460448, + "grad_norm": 0.13729321956634521, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 50260 + }, + { + "epoch": 0.19134002725272717, + "grad_norm": 0.1385490447282791, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 50270 + }, + { + "epoch": 0.19137808972084985, + "grad_norm": 0.12463898211717606, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 50280 + }, + { + "epoch": 0.19141615218897254, + "grad_norm": 0.13166537880897522, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 50290 + }, + { + "epoch": 0.19145421465709522, + "grad_norm": 0.1303899586200714, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 50300 + }, + { + "epoch": 0.1914922771252179, + "grad_norm": 0.12919609248638153, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 50310 + }, + { + "epoch": 0.1915303395933406, + "grad_norm": 0.13564112782478333, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 50320 + }, + { + "epoch": 0.19156840206146328, + "grad_norm": 0.12312301993370056, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 50330 + }, + { + "epoch": 0.19160646452958596, + "grad_norm": 0.13214318454265594, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 50340 + }, + { + "epoch": 0.19164452699770865, + "grad_norm": 0.12344446033239365, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 50350 + }, + { + "epoch": 0.19168258946583133, + "grad_norm": 0.1164083480834961, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 50360 + }, + { + "epoch": 0.19172065193395402, + "grad_norm": 0.1222231537103653, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 50370 + }, + { + "epoch": 0.1917587144020767, + "grad_norm": 0.13423392176628113, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 50380 + }, + { + "epoch": 0.19179677687019936, + "grad_norm": 0.14245694875717163, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 50390 + }, + { + "epoch": 0.19183483933832204, + "grad_norm": 0.1276567131280899, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 50400 + }, + { + "epoch": 0.19187290180644473, + "grad_norm": 0.1121436059474945, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 50410 + }, + { + "epoch": 0.1919109642745674, + "grad_norm": 0.1311924308538437, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 50420 + }, + { + "epoch": 0.1919490267426901, + "grad_norm": 0.12689100205898285, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 50430 + }, + { + "epoch": 0.19198708921081278, + "grad_norm": 0.1223243772983551, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 50440 + }, + { + "epoch": 0.19202515167893547, + "grad_norm": 0.11202087998390198, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 50450 + }, + { + "epoch": 0.19206321414705815, + "grad_norm": 0.12189372628927231, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 50460 + }, + { + "epoch": 0.19210127661518084, + "grad_norm": 0.1238129734992981, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 50470 + }, + { + "epoch": 0.19213933908330352, + "grad_norm": 0.1346205621957779, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 50480 + }, + { + "epoch": 0.1921774015514262, + "grad_norm": 0.11418620496988297, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 50490 + }, + { + "epoch": 0.1922154640195489, + "grad_norm": 0.12495341897010803, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 50500 + }, + { + "epoch": 0.19225352648767158, + "grad_norm": 0.11481189727783203, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 50510 + }, + { + "epoch": 0.19229158895579426, + "grad_norm": 0.13317109644412994, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 50520 + }, + { + "epoch": 0.19232965142391692, + "grad_norm": 0.12128578871488571, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 50530 + }, + { + "epoch": 0.1923677138920396, + "grad_norm": 0.12536193430423737, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 50540 + }, + { + "epoch": 0.1924057763601623, + "grad_norm": 0.12248624861240387, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 50550 + }, + { + "epoch": 0.19244383882828497, + "grad_norm": 0.11845123022794724, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 50560 + }, + { + "epoch": 0.19248190129640766, + "grad_norm": 0.11631769686937332, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 50570 + }, + { + "epoch": 0.19251996376453034, + "grad_norm": 0.11704082041978836, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 50580 + }, + { + "epoch": 0.19255802623265303, + "grad_norm": 0.11989939212799072, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 50590 + }, + { + "epoch": 0.1925960887007757, + "grad_norm": 0.11196364462375641, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 50600 + }, + { + "epoch": 0.1926341511688984, + "grad_norm": 0.15060563385486603, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 50610 + }, + { + "epoch": 0.19267221363702108, + "grad_norm": 0.1247202679514885, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 50620 + }, + { + "epoch": 0.19271027610514377, + "grad_norm": 0.13385535776615143, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 50630 + }, + { + "epoch": 0.19274833857326645, + "grad_norm": 0.12829150259494781, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 50640 + }, + { + "epoch": 0.19278640104138914, + "grad_norm": 0.11384773999452591, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 50650 + }, + { + "epoch": 0.19282446350951182, + "grad_norm": 0.11690258234739304, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 50660 + }, + { + "epoch": 0.19286252597763448, + "grad_norm": 0.12879469990730286, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 50670 + }, + { + "epoch": 0.19290058844575717, + "grad_norm": 0.13677574694156647, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 50680 + }, + { + "epoch": 0.19293865091387985, + "grad_norm": 0.11285001039505005, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 50690 + }, + { + "epoch": 0.19297671338200253, + "grad_norm": 0.11710193753242493, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 50700 + }, + { + "epoch": 0.19301477585012522, + "grad_norm": 0.13037873804569244, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 50710 + }, + { + "epoch": 0.1930528383182479, + "grad_norm": 0.11687429249286652, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 50720 + }, + { + "epoch": 0.1930909007863706, + "grad_norm": 0.11817038804292679, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 50730 + }, + { + "epoch": 0.19312896325449327, + "grad_norm": 0.12508970499038696, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 50740 + }, + { + "epoch": 0.19316702572261596, + "grad_norm": 0.11719117313623428, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 50750 + }, + { + "epoch": 0.19320508819073864, + "grad_norm": 0.12354373931884766, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 50760 + }, + { + "epoch": 0.19324315065886133, + "grad_norm": 0.5578530430793762, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 50770 + }, + { + "epoch": 0.19328121312698401, + "grad_norm": 0.13043108582496643, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 50780 + }, + { + "epoch": 0.1933192755951067, + "grad_norm": 0.12195798009634018, + "learning_rate": 0.0005, + "loss": 2.1492, + "step": 50790 + }, + { + "epoch": 0.19335733806322938, + "grad_norm": 0.13272693753242493, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 50800 + }, + { + "epoch": 0.19339540053135207, + "grad_norm": 0.1230696439743042, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 50810 + }, + { + "epoch": 0.19343346299947473, + "grad_norm": 0.12242135405540466, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 50820 + }, + { + "epoch": 0.1934715254675974, + "grad_norm": 0.11244331300258636, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 50830 + }, + { + "epoch": 0.1935095879357201, + "grad_norm": 0.1284191906452179, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 50840 + }, + { + "epoch": 0.19354765040384278, + "grad_norm": 0.14463888108730316, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 50850 + }, + { + "epoch": 0.19358571287196547, + "grad_norm": 0.12740777432918549, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 50860 + }, + { + "epoch": 0.19362377534008815, + "grad_norm": 0.1244569942355156, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 50870 + }, + { + "epoch": 0.19366183780821083, + "grad_norm": 0.1358763873577118, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 50880 + }, + { + "epoch": 0.19369990027633352, + "grad_norm": 0.13158994913101196, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 50890 + }, + { + "epoch": 0.1937379627444562, + "grad_norm": 0.12264818698167801, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 50900 + }, + { + "epoch": 0.1937760252125789, + "grad_norm": 0.12059519439935684, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 50910 + }, + { + "epoch": 0.19381408768070157, + "grad_norm": 0.12643907964229584, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 50920 + }, + { + "epoch": 0.19385215014882426, + "grad_norm": 0.12796556949615479, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 50930 + }, + { + "epoch": 0.19389021261694694, + "grad_norm": 0.11849892139434814, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 50940 + }, + { + "epoch": 0.19392827508506963, + "grad_norm": 0.12396416068077087, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 50950 + }, + { + "epoch": 0.1939663375531923, + "grad_norm": 0.1255057454109192, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 50960 + }, + { + "epoch": 0.19400440002131497, + "grad_norm": 0.11904613673686981, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 50970 + }, + { + "epoch": 0.19404246248943766, + "grad_norm": 0.13274280726909637, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 50980 + }, + { + "epoch": 0.19408052495756034, + "grad_norm": 0.11690010130405426, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 50990 + }, + { + "epoch": 0.19411858742568303, + "grad_norm": 0.12247753888368607, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 51000 + }, + { + "epoch": 0.1941566498938057, + "grad_norm": 0.11586964875459671, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 51010 + }, + { + "epoch": 0.1941947123619284, + "grad_norm": 0.12469108402729034, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 51020 + }, + { + "epoch": 0.19423277483005108, + "grad_norm": 0.14118361473083496, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 51030 + }, + { + "epoch": 0.19427083729817377, + "grad_norm": 0.15017235279083252, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 51040 + }, + { + "epoch": 0.19430889976629645, + "grad_norm": 0.12537063658237457, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 51050 + }, + { + "epoch": 0.19434696223441913, + "grad_norm": 0.13568291068077087, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 51060 + }, + { + "epoch": 0.19438502470254182, + "grad_norm": 0.1382315754890442, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 51070 + }, + { + "epoch": 0.1944230871706645, + "grad_norm": 0.35568517446517944, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 51080 + }, + { + "epoch": 0.1944611496387872, + "grad_norm": 0.11623500287532806, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 51090 + }, + { + "epoch": 0.19449921210690985, + "grad_norm": 0.13279907405376434, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 51100 + }, + { + "epoch": 0.19453727457503253, + "grad_norm": 0.1344604790210724, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 51110 + }, + { + "epoch": 0.19457533704315522, + "grad_norm": 0.12161675095558167, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 51120 + }, + { + "epoch": 0.1946133995112779, + "grad_norm": 0.11104878783226013, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 51130 + }, + { + "epoch": 0.1946514619794006, + "grad_norm": 0.12998434901237488, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 51140 + }, + { + "epoch": 0.19468952444752327, + "grad_norm": 0.12012049555778503, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 51150 + }, + { + "epoch": 0.19472758691564596, + "grad_norm": 0.16529619693756104, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 51160 + }, + { + "epoch": 0.19476564938376864, + "grad_norm": 0.11907965689897537, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 51170 + }, + { + "epoch": 0.19480371185189133, + "grad_norm": 0.13431844115257263, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 51180 + }, + { + "epoch": 0.194841774320014, + "grad_norm": 0.12098430842161179, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 51190 + }, + { + "epoch": 0.1948798367881367, + "grad_norm": 0.11998952925205231, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 51200 + }, + { + "epoch": 0.19491789925625938, + "grad_norm": 0.12975624203681946, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 51210 + }, + { + "epoch": 0.19495596172438207, + "grad_norm": 0.1254945546388626, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 51220 + }, + { + "epoch": 0.19499402419250475, + "grad_norm": 0.11360818147659302, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 51230 + }, + { + "epoch": 0.19503208666062744, + "grad_norm": 0.12699361145496368, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 51240 + }, + { + "epoch": 0.1950701491287501, + "grad_norm": 0.11986838281154633, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 51250 + }, + { + "epoch": 0.19510821159687278, + "grad_norm": 0.11837062984704971, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 51260 + }, + { + "epoch": 0.19514627406499546, + "grad_norm": 0.1267804652452469, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 51270 + }, + { + "epoch": 0.19518433653311815, + "grad_norm": 0.12151280045509338, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 51280 + }, + { + "epoch": 0.19522239900124083, + "grad_norm": 0.13360044360160828, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 51290 + }, + { + "epoch": 0.19526046146936352, + "grad_norm": 0.12029470503330231, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 51300 + }, + { + "epoch": 0.1952985239374862, + "grad_norm": 0.1321459412574768, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 51310 + }, + { + "epoch": 0.1953365864056089, + "grad_norm": 0.12111317366361618, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 51320 + }, + { + "epoch": 0.19537464887373157, + "grad_norm": 0.13475729525089264, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 51330 + }, + { + "epoch": 0.19541271134185426, + "grad_norm": 0.13722528517246246, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 51340 + }, + { + "epoch": 0.19545077380997694, + "grad_norm": 0.12443096190690994, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 51350 + }, + { + "epoch": 0.19548883627809963, + "grad_norm": 0.13265202939510345, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 51360 + }, + { + "epoch": 0.1955268987462223, + "grad_norm": 0.12839211523532867, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 51370 + }, + { + "epoch": 0.195564961214345, + "grad_norm": 0.11944104731082916, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 51380 + }, + { + "epoch": 0.19560302368246765, + "grad_norm": 0.12753385305404663, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 51390 + }, + { + "epoch": 0.19564108615059034, + "grad_norm": 0.12177974730730057, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 51400 + }, + { + "epoch": 0.19567914861871302, + "grad_norm": 0.12434303760528564, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 51410 + }, + { + "epoch": 0.1957172110868357, + "grad_norm": 0.13650812208652496, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 51420 + }, + { + "epoch": 0.1957552735549584, + "grad_norm": 0.11836228519678116, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 51430 + }, + { + "epoch": 0.19579333602308108, + "grad_norm": 0.1358971744775772, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 51440 + }, + { + "epoch": 0.19583139849120376, + "grad_norm": 0.1255125254392624, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 51450 + }, + { + "epoch": 0.19586946095932645, + "grad_norm": 0.12064887583255768, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 51460 + }, + { + "epoch": 0.19590752342744913, + "grad_norm": 0.13757894933223724, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 51470 + }, + { + "epoch": 0.19594558589557182, + "grad_norm": 0.12057624012231827, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 51480 + }, + { + "epoch": 0.1959836483636945, + "grad_norm": 0.12316569685935974, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 51490 + }, + { + "epoch": 0.1960217108318172, + "grad_norm": 0.13517151772975922, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 51500 + }, + { + "epoch": 0.19605977329993987, + "grad_norm": 0.12520772218704224, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 51510 + }, + { + "epoch": 0.19609783576806256, + "grad_norm": 0.13712680339813232, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 51520 + }, + { + "epoch": 0.19613589823618524, + "grad_norm": 0.1230507493019104, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 51530 + }, + { + "epoch": 0.1961739607043079, + "grad_norm": 0.12265963107347488, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 51540 + }, + { + "epoch": 0.19621202317243058, + "grad_norm": 0.1328250914812088, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 51550 + }, + { + "epoch": 0.19625008564055327, + "grad_norm": 0.13123562932014465, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 51560 + }, + { + "epoch": 0.19628814810867595, + "grad_norm": 0.12056384980678558, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 51570 + }, + { + "epoch": 0.19632621057679864, + "grad_norm": 0.12020045518875122, + "learning_rate": 0.0005, + "loss": 2.1548, + "step": 51580 + }, + { + "epoch": 0.19636427304492132, + "grad_norm": 0.11768834292888641, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 51590 + }, + { + "epoch": 0.196402335513044, + "grad_norm": 0.12291178852319717, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 51600 + }, + { + "epoch": 0.1964403979811667, + "grad_norm": 0.12086016684770584, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 51610 + }, + { + "epoch": 0.19647846044928938, + "grad_norm": 0.12200043350458145, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 51620 + }, + { + "epoch": 0.19651652291741206, + "grad_norm": 0.1366575062274933, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 51630 + }, + { + "epoch": 0.19655458538553475, + "grad_norm": 0.1327793151140213, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 51640 + }, + { + "epoch": 0.19659264785365743, + "grad_norm": 0.1226876825094223, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 51650 + }, + { + "epoch": 0.19663071032178012, + "grad_norm": 0.11945180594921112, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 51660 + }, + { + "epoch": 0.1966687727899028, + "grad_norm": 0.12915171682834625, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 51670 + }, + { + "epoch": 0.19670683525802546, + "grad_norm": 0.12176530808210373, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 51680 + }, + { + "epoch": 0.19674489772614814, + "grad_norm": 0.11432671546936035, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 51690 + }, + { + "epoch": 0.19678296019427083, + "grad_norm": 0.12373737245798111, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 51700 + }, + { + "epoch": 0.1968210226623935, + "grad_norm": 0.12407080084085464, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 51710 + }, + { + "epoch": 0.1968590851305162, + "grad_norm": 0.11970575898885727, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 51720 + }, + { + "epoch": 0.19689714759863888, + "grad_norm": 0.13847972452640533, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 51730 + }, + { + "epoch": 0.19693521006676157, + "grad_norm": 0.13606782257556915, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 51740 + }, + { + "epoch": 0.19697327253488425, + "grad_norm": 0.11516030877828598, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 51750 + }, + { + "epoch": 0.19701133500300694, + "grad_norm": 0.13698327541351318, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 51760 + }, + { + "epoch": 0.19704939747112962, + "grad_norm": 0.12343499809503555, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 51770 + }, + { + "epoch": 0.1970874599392523, + "grad_norm": 0.11765126883983612, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 51780 + }, + { + "epoch": 0.197125522407375, + "grad_norm": 0.1384083479642868, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 51790 + }, + { + "epoch": 0.19716358487549768, + "grad_norm": 0.12069667130708694, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 51800 + }, + { + "epoch": 0.19720164734362036, + "grad_norm": 0.1360340118408203, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 51810 + }, + { + "epoch": 0.19723970981174302, + "grad_norm": 0.13412491977214813, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 51820 + }, + { + "epoch": 0.1972777722798657, + "grad_norm": 0.14328962564468384, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 51830 + }, + { + "epoch": 0.1973158347479884, + "grad_norm": 0.12440038472414017, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 51840 + }, + { + "epoch": 0.19735389721611107, + "grad_norm": 0.11765279620885849, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 51850 + }, + { + "epoch": 0.19739195968423376, + "grad_norm": 0.12797802686691284, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 51860 + }, + { + "epoch": 0.19743002215235644, + "grad_norm": 0.12895552814006805, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 51870 + }, + { + "epoch": 0.19746808462047913, + "grad_norm": 0.11671297252178192, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 51880 + }, + { + "epoch": 0.1975061470886018, + "grad_norm": 0.13894277811050415, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 51890 + }, + { + "epoch": 0.1975442095567245, + "grad_norm": 0.13398738205432892, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 51900 + }, + { + "epoch": 0.19758227202484718, + "grad_norm": 0.12182886153459549, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 51910 + }, + { + "epoch": 0.19762033449296987, + "grad_norm": 0.11361069977283478, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 51920 + }, + { + "epoch": 0.19765839696109255, + "grad_norm": 0.12386829406023026, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 51930 + }, + { + "epoch": 0.19769645942921524, + "grad_norm": 0.13164442777633667, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 51940 + }, + { + "epoch": 0.19773452189733792, + "grad_norm": 0.11578863859176636, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 51950 + }, + { + "epoch": 0.1977725843654606, + "grad_norm": 0.1270083636045456, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 51960 + }, + { + "epoch": 0.19781064683358326, + "grad_norm": 0.11406195908784866, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 51970 + }, + { + "epoch": 0.19784870930170595, + "grad_norm": 0.12531724572181702, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 51980 + }, + { + "epoch": 0.19788677176982863, + "grad_norm": 0.12408492714166641, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 51990 + }, + { + "epoch": 0.19792483423795132, + "grad_norm": 0.12840083241462708, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 52000 + }, + { + "epoch": 0.197962896706074, + "grad_norm": 0.1234857514500618, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 52010 + }, + { + "epoch": 0.1980009591741967, + "grad_norm": 0.12177139520645142, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 52020 + }, + { + "epoch": 0.19803902164231937, + "grad_norm": 0.13745243847370148, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 52030 + }, + { + "epoch": 0.19807708411044206, + "grad_norm": 0.12879960238933563, + "learning_rate": 0.0005, + "loss": 2.1492, + "step": 52040 + }, + { + "epoch": 0.19811514657856474, + "grad_norm": 0.12872251868247986, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 52050 + }, + { + "epoch": 0.19815320904668743, + "grad_norm": 0.11028557270765305, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 52060 + }, + { + "epoch": 0.1981912715148101, + "grad_norm": 0.12214988470077515, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 52070 + }, + { + "epoch": 0.1982293339829328, + "grad_norm": 0.12096168845891953, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 52080 + }, + { + "epoch": 0.19826739645105548, + "grad_norm": 0.12791207432746887, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 52090 + }, + { + "epoch": 0.19830545891917817, + "grad_norm": 0.12044400721788406, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 52100 + }, + { + "epoch": 0.19834352138730083, + "grad_norm": 0.128586083650589, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 52110 + }, + { + "epoch": 0.1983815838554235, + "grad_norm": 0.12912602722644806, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 52120 + }, + { + "epoch": 0.1984196463235462, + "grad_norm": 0.11891495436429977, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 52130 + }, + { + "epoch": 0.19845770879166888, + "grad_norm": 0.16164763271808624, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 52140 + }, + { + "epoch": 0.19849577125979156, + "grad_norm": 0.12841705977916718, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 52150 + }, + { + "epoch": 0.19853383372791425, + "grad_norm": 0.12408839911222458, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 52160 + }, + { + "epoch": 0.19857189619603693, + "grad_norm": 0.1209225282073021, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 52170 + }, + { + "epoch": 0.19860995866415962, + "grad_norm": 0.1380387544631958, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 52180 + }, + { + "epoch": 0.1986480211322823, + "grad_norm": 0.1275915503501892, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 52190 + }, + { + "epoch": 0.198686083600405, + "grad_norm": 0.11722587794065475, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 52200 + }, + { + "epoch": 0.19872414606852767, + "grad_norm": 0.12306281924247742, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 52210 + }, + { + "epoch": 0.19876220853665036, + "grad_norm": 0.1213437169790268, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 52220 + }, + { + "epoch": 0.19880027100477304, + "grad_norm": 0.12615956366062164, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 52230 + }, + { + "epoch": 0.19883833347289573, + "grad_norm": 0.11721881479024887, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 52240 + }, + { + "epoch": 0.19887639594101839, + "grad_norm": 0.11572463065385818, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 52250 + }, + { + "epoch": 0.19891445840914107, + "grad_norm": 0.12876826524734497, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 52260 + }, + { + "epoch": 0.19895252087726376, + "grad_norm": 0.12575292587280273, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 52270 + }, + { + "epoch": 0.19899058334538644, + "grad_norm": 0.12764307856559753, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 52280 + }, + { + "epoch": 0.19902864581350913, + "grad_norm": 0.1270405650138855, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 52290 + }, + { + "epoch": 0.1990667082816318, + "grad_norm": 0.13285189867019653, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 52300 + }, + { + "epoch": 0.1991047707497545, + "grad_norm": 0.11400274932384491, + "learning_rate": 0.0005, + "loss": 2.1503, + "step": 52310 + }, + { + "epoch": 0.19914283321787718, + "grad_norm": 0.1264936625957489, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 52320 + }, + { + "epoch": 0.19918089568599986, + "grad_norm": 0.13964684307575226, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 52330 + }, + { + "epoch": 0.19921895815412255, + "grad_norm": 0.1267102211713791, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 52340 + }, + { + "epoch": 0.19925702062224523, + "grad_norm": 0.11472469568252563, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 52350 + }, + { + "epoch": 0.19929508309036792, + "grad_norm": 0.11693333089351654, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 52360 + }, + { + "epoch": 0.1993331455584906, + "grad_norm": 0.1316034495830536, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 52370 + }, + { + "epoch": 0.1993712080266133, + "grad_norm": 0.12253102660179138, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 52380 + }, + { + "epoch": 0.19940927049473597, + "grad_norm": 0.11671502143144608, + "learning_rate": 0.0005, + "loss": 2.1509, + "step": 52390 + }, + { + "epoch": 0.19944733296285863, + "grad_norm": 0.11247535049915314, + "learning_rate": 0.0005, + "loss": 2.1463, + "step": 52400 + }, + { + "epoch": 0.19948539543098132, + "grad_norm": 0.11922860145568848, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 52410 + }, + { + "epoch": 0.199523457899104, + "grad_norm": 0.1103215217590332, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 52420 + }, + { + "epoch": 0.19956152036722669, + "grad_norm": 0.16089947521686554, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 52430 + }, + { + "epoch": 0.19959958283534937, + "grad_norm": 0.1370892971754074, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 52440 + }, + { + "epoch": 0.19963764530347206, + "grad_norm": 0.13198357820510864, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 52450 + }, + { + "epoch": 0.19967570777159474, + "grad_norm": 0.11394302546977997, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 52460 + }, + { + "epoch": 0.19971377023971743, + "grad_norm": 0.11629348993301392, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 52470 + }, + { + "epoch": 0.1997518327078401, + "grad_norm": 0.11404618620872498, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 52480 + }, + { + "epoch": 0.1997898951759628, + "grad_norm": 0.13269637525081635, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 52490 + }, + { + "epoch": 0.19982795764408548, + "grad_norm": 0.12785989046096802, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 52500 + }, + { + "epoch": 0.19986602011220816, + "grad_norm": 0.12473263591527939, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 52510 + }, + { + "epoch": 0.19990408258033085, + "grad_norm": 0.12538190186023712, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 52520 + }, + { + "epoch": 0.19994214504845353, + "grad_norm": 0.1268540769815445, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 52530 + }, + { + "epoch": 0.1999802075165762, + "grad_norm": 0.1344127058982849, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 52540 + }, + { + "epoch": 0.20001826998469888, + "grad_norm": 0.12185916304588318, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 52550 + }, + { + "epoch": 0.20005633245282156, + "grad_norm": 0.12213872373104095, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 52560 + }, + { + "epoch": 0.20009439492094425, + "grad_norm": 0.12694446742534637, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 52570 + }, + { + "epoch": 0.20013245738906693, + "grad_norm": 0.13906416296958923, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 52580 + }, + { + "epoch": 0.20017051985718962, + "grad_norm": 0.13742320239543915, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 52590 + }, + { + "epoch": 0.2002085823253123, + "grad_norm": 0.13378769159317017, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 52600 + }, + { + "epoch": 0.20024664479343499, + "grad_norm": 0.13367900252342224, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 52610 + }, + { + "epoch": 0.20028470726155767, + "grad_norm": 0.13418786227703094, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 52620 + }, + { + "epoch": 0.20032276972968036, + "grad_norm": 0.12620803713798523, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 52630 + }, + { + "epoch": 0.20036083219780304, + "grad_norm": 0.12302321195602417, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 52640 + }, + { + "epoch": 0.20039889466592573, + "grad_norm": 0.11489461362361908, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 52650 + }, + { + "epoch": 0.2004369571340484, + "grad_norm": 0.12589262425899506, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 52660 + }, + { + "epoch": 0.2004750196021711, + "grad_norm": 0.12370242923498154, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 52670 + }, + { + "epoch": 0.20051308207029378, + "grad_norm": 0.12273893505334854, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 52680 + }, + { + "epoch": 0.20055114453841644, + "grad_norm": 0.12193376570940018, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 52690 + }, + { + "epoch": 0.20058920700653912, + "grad_norm": 0.125262051820755, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 52700 + }, + { + "epoch": 0.2006272694746618, + "grad_norm": 0.13586099445819855, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 52710 + }, + { + "epoch": 0.2006653319427845, + "grad_norm": 0.11950060725212097, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 52720 + }, + { + "epoch": 0.20070339441090718, + "grad_norm": 0.1289469301700592, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 52730 + }, + { + "epoch": 0.20074145687902986, + "grad_norm": 0.12563686072826385, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 52740 + }, + { + "epoch": 0.20077951934715255, + "grad_norm": 0.11492034047842026, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 52750 + }, + { + "epoch": 0.20081758181527523, + "grad_norm": 0.12467867881059647, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 52760 + }, + { + "epoch": 0.20085564428339792, + "grad_norm": 0.1249057725071907, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 52770 + }, + { + "epoch": 0.2008937067515206, + "grad_norm": 0.13855883479118347, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 52780 + }, + { + "epoch": 0.20093176921964329, + "grad_norm": 0.12252828478813171, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 52790 + }, + { + "epoch": 0.20096983168776597, + "grad_norm": 0.11606127768754959, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 52800 + }, + { + "epoch": 0.20100789415588866, + "grad_norm": 0.1138739287853241, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 52810 + }, + { + "epoch": 0.20104595662401134, + "grad_norm": 0.14339643716812134, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 52820 + }, + { + "epoch": 0.201084019092134, + "grad_norm": 0.13310876488685608, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 52830 + }, + { + "epoch": 0.20112208156025668, + "grad_norm": 0.11275873333215714, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 52840 + }, + { + "epoch": 0.20116014402837937, + "grad_norm": 0.11566092818975449, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 52850 + }, + { + "epoch": 0.20119820649650205, + "grad_norm": 0.14016412198543549, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 52860 + }, + { + "epoch": 0.20123626896462474, + "grad_norm": 0.12313041090965271, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 52870 + }, + { + "epoch": 0.20127433143274742, + "grad_norm": 0.12930551171302795, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 52880 + }, + { + "epoch": 0.2013123939008701, + "grad_norm": 0.13301874697208405, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 52890 + }, + { + "epoch": 0.2013504563689928, + "grad_norm": 0.11824183166027069, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 52900 + }, + { + "epoch": 0.20138851883711548, + "grad_norm": 0.1287565529346466, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 52910 + }, + { + "epoch": 0.20142658130523816, + "grad_norm": 0.13923634588718414, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 52920 + }, + { + "epoch": 0.20146464377336085, + "grad_norm": 0.134573295712471, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 52930 + }, + { + "epoch": 0.20150270624148353, + "grad_norm": 0.1330741047859192, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 52940 + }, + { + "epoch": 0.20154076870960622, + "grad_norm": 0.12362990528345108, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 52950 + }, + { + "epoch": 0.2015788311777289, + "grad_norm": 0.11643882095813751, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 52960 + }, + { + "epoch": 0.20161689364585156, + "grad_norm": 0.13288746774196625, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 52970 + }, + { + "epoch": 0.20165495611397424, + "grad_norm": 0.11521294713020325, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 52980 + }, + { + "epoch": 0.20169301858209693, + "grad_norm": 0.1399589329957962, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 52990 + }, + { + "epoch": 0.2017310810502196, + "grad_norm": 0.12761197984218597, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 53000 + }, + { + "epoch": 0.2017691435183423, + "grad_norm": 0.14741909503936768, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 53010 + }, + { + "epoch": 0.20180720598646498, + "grad_norm": 0.12950895726680756, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 53020 + }, + { + "epoch": 0.20184526845458767, + "grad_norm": 0.1153949499130249, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 53030 + }, + { + "epoch": 0.20188333092271035, + "grad_norm": 0.1166260614991188, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 53040 + }, + { + "epoch": 0.20192139339083304, + "grad_norm": 0.11685400456190109, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 53050 + }, + { + "epoch": 0.20195945585895572, + "grad_norm": 0.1260029524564743, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 53060 + }, + { + "epoch": 0.2019975183270784, + "grad_norm": 0.11610860377550125, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 53070 + }, + { + "epoch": 0.2020355807952011, + "grad_norm": 0.11834888905286789, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 53080 + }, + { + "epoch": 0.20207364326332378, + "grad_norm": 0.1214648187160492, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 53090 + }, + { + "epoch": 0.20211170573144646, + "grad_norm": 0.12263604253530502, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 53100 + }, + { + "epoch": 0.20214976819956915, + "grad_norm": 0.11759454756975174, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 53110 + }, + { + "epoch": 0.2021878306676918, + "grad_norm": 0.1269487589597702, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 53120 + }, + { + "epoch": 0.2022258931358145, + "grad_norm": 0.13687825202941895, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 53130 + }, + { + "epoch": 0.20226395560393717, + "grad_norm": 0.13485699892044067, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 53140 + }, + { + "epoch": 0.20230201807205986, + "grad_norm": 0.12800122797489166, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 53150 + }, + { + "epoch": 0.20234008054018254, + "grad_norm": 0.12188751995563507, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 53160 + }, + { + "epoch": 0.20237814300830523, + "grad_norm": 0.11998622864484787, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 53170 + }, + { + "epoch": 0.2024162054764279, + "grad_norm": 0.12778060138225555, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 53180 + }, + { + "epoch": 0.2024542679445506, + "grad_norm": 0.12257424741983414, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 53190 + }, + { + "epoch": 0.20249233041267328, + "grad_norm": 0.1922028511762619, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 53200 + }, + { + "epoch": 0.20253039288079597, + "grad_norm": 0.11676789075136185, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 53210 + }, + { + "epoch": 0.20256845534891865, + "grad_norm": 0.12324552983045578, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 53220 + }, + { + "epoch": 0.20260651781704134, + "grad_norm": 0.12745507061481476, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 53230 + }, + { + "epoch": 0.20264458028516402, + "grad_norm": 0.13323475420475006, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 53240 + }, + { + "epoch": 0.2026826427532867, + "grad_norm": 0.13235697150230408, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 53250 + }, + { + "epoch": 0.20272070522140936, + "grad_norm": 0.126156285405159, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 53260 + }, + { + "epoch": 0.20275876768953205, + "grad_norm": 0.12333841621875763, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 53270 + }, + { + "epoch": 0.20279683015765473, + "grad_norm": 0.12026585638523102, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 53280 + }, + { + "epoch": 0.20283489262577742, + "grad_norm": 0.13235647976398468, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 53290 + }, + { + "epoch": 0.2028729550939001, + "grad_norm": 0.11956822127103806, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 53300 + }, + { + "epoch": 0.2029110175620228, + "grad_norm": 0.12753432989120483, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 53310 + }, + { + "epoch": 0.20294908003014547, + "grad_norm": 0.11608036607503891, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 53320 + }, + { + "epoch": 0.20298714249826816, + "grad_norm": 0.1314079910516739, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 53330 + }, + { + "epoch": 0.20302520496639084, + "grad_norm": 0.11957383155822754, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 53340 + }, + { + "epoch": 0.20306326743451353, + "grad_norm": 0.14062124490737915, + "learning_rate": 0.0005, + "loss": 2.1506, + "step": 53350 + }, + { + "epoch": 0.2031013299026362, + "grad_norm": 0.12807902693748474, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 53360 + }, + { + "epoch": 0.2031393923707589, + "grad_norm": 0.1215902715921402, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 53370 + }, + { + "epoch": 0.20317745483888158, + "grad_norm": 0.12466635555028915, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 53380 + }, + { + "epoch": 0.20321551730700427, + "grad_norm": 0.12310461699962616, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 53390 + }, + { + "epoch": 0.20325357977512692, + "grad_norm": 0.13518789410591125, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 53400 + }, + { + "epoch": 0.2032916422432496, + "grad_norm": 0.13395272195339203, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 53410 + }, + { + "epoch": 0.2033297047113723, + "grad_norm": 0.12013889849185944, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 53420 + }, + { + "epoch": 0.20336776717949498, + "grad_norm": 0.12862953543663025, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 53430 + }, + { + "epoch": 0.20340582964761766, + "grad_norm": 0.119823157787323, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 53440 + }, + { + "epoch": 0.20344389211574035, + "grad_norm": 0.12746182084083557, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 53450 + }, + { + "epoch": 0.20348195458386303, + "grad_norm": 0.12012957781553268, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 53460 + }, + { + "epoch": 0.20352001705198572, + "grad_norm": 0.12806500494480133, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 53470 + }, + { + "epoch": 0.2035580795201084, + "grad_norm": 0.12169921398162842, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 53480 + }, + { + "epoch": 0.2035961419882311, + "grad_norm": 0.5924624800682068, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 53490 + }, + { + "epoch": 0.20363420445635377, + "grad_norm": 0.1756318360567093, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 53500 + }, + { + "epoch": 0.20367226692447646, + "grad_norm": 0.14018510282039642, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 53510 + }, + { + "epoch": 0.20371032939259914, + "grad_norm": 0.12534722685813904, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 53520 + }, + { + "epoch": 0.20374839186072183, + "grad_norm": 0.1105349212884903, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 53530 + }, + { + "epoch": 0.2037864543288445, + "grad_norm": 0.11550001800060272, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 53540 + }, + { + "epoch": 0.20382451679696717, + "grad_norm": 0.12112603336572647, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 53550 + }, + { + "epoch": 0.20386257926508985, + "grad_norm": 0.12003546953201294, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 53560 + }, + { + "epoch": 0.20390064173321254, + "grad_norm": 0.13228186964988708, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 53570 + }, + { + "epoch": 0.20393870420133522, + "grad_norm": 0.12017644941806793, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 53580 + }, + { + "epoch": 0.2039767666694579, + "grad_norm": 0.12018509954214096, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 53590 + }, + { + "epoch": 0.2040148291375806, + "grad_norm": 0.12411874532699585, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 53600 + }, + { + "epoch": 0.20405289160570328, + "grad_norm": 0.1323518604040146, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 53610 + }, + { + "epoch": 0.20409095407382596, + "grad_norm": 0.12746353447437286, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 53620 + }, + { + "epoch": 0.20412901654194865, + "grad_norm": 0.11377382278442383, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 53630 + }, + { + "epoch": 0.20416707901007133, + "grad_norm": 0.1344747096300125, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 53640 + }, + { + "epoch": 0.20420514147819402, + "grad_norm": 0.12653642892837524, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 53650 + }, + { + "epoch": 0.2042432039463167, + "grad_norm": 0.13320209085941315, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 53660 + }, + { + "epoch": 0.2042812664144394, + "grad_norm": 0.12339047342538834, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 53670 + }, + { + "epoch": 0.20431932888256207, + "grad_norm": 0.1247817873954773, + "learning_rate": 0.0005, + "loss": 2.1501, + "step": 53680 + }, + { + "epoch": 0.20435739135068473, + "grad_norm": 0.11579610407352448, + "learning_rate": 0.0005, + "loss": 2.1519, + "step": 53690 + }, + { + "epoch": 0.20439545381880742, + "grad_norm": 0.11140631884336472, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 53700 + }, + { + "epoch": 0.2044335162869301, + "grad_norm": 0.11983554810285568, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 53710 + }, + { + "epoch": 0.20447157875505279, + "grad_norm": 0.11999595910310745, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 53720 + }, + { + "epoch": 0.20450964122317547, + "grad_norm": 0.11656316369771957, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 53730 + }, + { + "epoch": 0.20454770369129815, + "grad_norm": 0.1222524419426918, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 53740 + }, + { + "epoch": 0.20458576615942084, + "grad_norm": 0.1305672526359558, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 53750 + }, + { + "epoch": 0.20462382862754352, + "grad_norm": 0.13974280655384064, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 53760 + }, + { + "epoch": 0.2046618910956662, + "grad_norm": 0.11578554660081863, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 53770 + }, + { + "epoch": 0.2046999535637889, + "grad_norm": 0.12131907045841217, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 53780 + }, + { + "epoch": 0.20473801603191158, + "grad_norm": 0.36936134099960327, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 53790 + }, + { + "epoch": 0.20477607850003426, + "grad_norm": 0.11620636284351349, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 53800 + }, + { + "epoch": 0.20481414096815695, + "grad_norm": 0.15168805420398712, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 53810 + }, + { + "epoch": 0.20485220343627963, + "grad_norm": 0.1311609297990799, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 53820 + }, + { + "epoch": 0.20489026590440232, + "grad_norm": 0.11689729988574982, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 53830 + }, + { + "epoch": 0.20492832837252498, + "grad_norm": 0.12733341753482819, + "learning_rate": 0.0005, + "loss": 2.1518, + "step": 53840 + }, + { + "epoch": 0.20496639084064766, + "grad_norm": 0.13116632401943207, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 53850 + }, + { + "epoch": 0.20500445330877035, + "grad_norm": 0.13507050275802612, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 53860 + }, + { + "epoch": 0.20504251577689303, + "grad_norm": 0.14825035631656647, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 53870 + }, + { + "epoch": 0.20508057824501572, + "grad_norm": 0.1258191466331482, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 53880 + }, + { + "epoch": 0.2051186407131384, + "grad_norm": 0.12025585025548935, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 53890 + }, + { + "epoch": 0.20515670318126109, + "grad_norm": 0.11997051537036896, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 53900 + }, + { + "epoch": 0.20519476564938377, + "grad_norm": 0.1282651275396347, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 53910 + }, + { + "epoch": 0.20523282811750646, + "grad_norm": 0.11547555774450302, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 53920 + }, + { + "epoch": 0.20527089058562914, + "grad_norm": 0.1203666478395462, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 53930 + }, + { + "epoch": 0.20530895305375182, + "grad_norm": 0.12874288856983185, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 53940 + }, + { + "epoch": 0.2053470155218745, + "grad_norm": 0.10989463329315186, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 53950 + }, + { + "epoch": 0.2053850779899972, + "grad_norm": 0.11719923466444016, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 53960 + }, + { + "epoch": 0.20542314045811988, + "grad_norm": 0.14622372388839722, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 53970 + }, + { + "epoch": 0.20546120292624254, + "grad_norm": 0.12204207479953766, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 53980 + }, + { + "epoch": 0.20549926539436522, + "grad_norm": 0.11967600882053375, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 53990 + }, + { + "epoch": 0.2055373278624879, + "grad_norm": 0.13717426359653473, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 54000 + }, + { + "epoch": 0.2055753903306106, + "grad_norm": 0.12481812387704849, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 54010 + }, + { + "epoch": 0.20561345279873328, + "grad_norm": 0.12184108048677444, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 54020 + }, + { + "epoch": 0.20565151526685596, + "grad_norm": 0.1144619882106781, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 54030 + }, + { + "epoch": 0.20568957773497865, + "grad_norm": 0.12227953970432281, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 54040 + }, + { + "epoch": 0.20572764020310133, + "grad_norm": 0.12673652172088623, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 54050 + }, + { + "epoch": 0.20576570267122402, + "grad_norm": 0.12035863101482391, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 54060 + }, + { + "epoch": 0.2058037651393467, + "grad_norm": 0.1242937445640564, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 54070 + }, + { + "epoch": 0.20584182760746939, + "grad_norm": 0.1221139058470726, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 54080 + }, + { + "epoch": 0.20587989007559207, + "grad_norm": 0.12572316825389862, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 54090 + }, + { + "epoch": 0.20591795254371476, + "grad_norm": 0.10956557840108871, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 54100 + }, + { + "epoch": 0.20595601501183744, + "grad_norm": 0.12266967445611954, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 54110 + }, + { + "epoch": 0.2059940774799601, + "grad_norm": 0.12685400247573853, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 54120 + }, + { + "epoch": 0.20603213994808278, + "grad_norm": 0.12227378040552139, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 54130 + }, + { + "epoch": 0.20607020241620547, + "grad_norm": 0.11356621235609055, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 54140 + }, + { + "epoch": 0.20610826488432815, + "grad_norm": 0.12191888689994812, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 54150 + }, + { + "epoch": 0.20614632735245084, + "grad_norm": 0.14179301261901855, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 54160 + }, + { + "epoch": 0.20618438982057352, + "grad_norm": 0.13185860216617584, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 54170 + }, + { + "epoch": 0.2062224522886962, + "grad_norm": 0.12983562052249908, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 54180 + }, + { + "epoch": 0.2062605147568189, + "grad_norm": 0.13673518598079681, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 54190 + }, + { + "epoch": 0.20629857722494158, + "grad_norm": 0.13771027326583862, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 54200 + }, + { + "epoch": 0.20633663969306426, + "grad_norm": 0.12526799738407135, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 54210 + }, + { + "epoch": 0.20637470216118695, + "grad_norm": 0.11866194754838943, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 54220 + }, + { + "epoch": 0.20641276462930963, + "grad_norm": 0.11927735805511475, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 54230 + }, + { + "epoch": 0.20645082709743232, + "grad_norm": 0.11958098411560059, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 54240 + }, + { + "epoch": 0.206488889565555, + "grad_norm": 0.11478394269943237, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 54250 + }, + { + "epoch": 0.20652695203367769, + "grad_norm": 0.13040174543857574, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 54260 + }, + { + "epoch": 0.20656501450180034, + "grad_norm": 0.13240981101989746, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 54270 + }, + { + "epoch": 0.20660307696992303, + "grad_norm": 0.11602520942687988, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 54280 + }, + { + "epoch": 0.2066411394380457, + "grad_norm": 0.13566020131111145, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 54290 + }, + { + "epoch": 0.2066792019061684, + "grad_norm": 0.1175849512219429, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 54300 + }, + { + "epoch": 0.20671726437429108, + "grad_norm": 0.12190808355808258, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 54310 + }, + { + "epoch": 0.20675532684241377, + "grad_norm": 0.11281128227710724, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 54320 + }, + { + "epoch": 0.20679338931053645, + "grad_norm": 0.11461440473794937, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 54330 + }, + { + "epoch": 0.20683145177865914, + "grad_norm": 0.121035136282444, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 54340 + }, + { + "epoch": 0.20686951424678182, + "grad_norm": 0.12157423049211502, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 54350 + }, + { + "epoch": 0.2069075767149045, + "grad_norm": 0.11936475336551666, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 54360 + }, + { + "epoch": 0.2069456391830272, + "grad_norm": 0.1261560320854187, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 54370 + }, + { + "epoch": 0.20698370165114988, + "grad_norm": 0.12038817256689072, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 54380 + }, + { + "epoch": 0.20702176411927256, + "grad_norm": 0.12115965038537979, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 54390 + }, + { + "epoch": 0.20705982658739525, + "grad_norm": 0.1181049793958664, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 54400 + }, + { + "epoch": 0.2070978890555179, + "grad_norm": 0.11807470768690109, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 54410 + }, + { + "epoch": 0.2071359515236406, + "grad_norm": 0.1191549152135849, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 54420 + }, + { + "epoch": 0.20717401399176327, + "grad_norm": 0.11723997443914413, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 54430 + }, + { + "epoch": 0.20721207645988596, + "grad_norm": 0.13964317739009857, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 54440 + }, + { + "epoch": 0.20725013892800864, + "grad_norm": 0.12053508311510086, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 54450 + }, + { + "epoch": 0.20728820139613133, + "grad_norm": 0.1290549635887146, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 54460 + }, + { + "epoch": 0.207326263864254, + "grad_norm": 0.11613258719444275, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 54470 + }, + { + "epoch": 0.2073643263323767, + "grad_norm": 0.1176428496837616, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 54480 + }, + { + "epoch": 0.20740238880049938, + "grad_norm": 0.12306077033281326, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 54490 + }, + { + "epoch": 0.20744045126862207, + "grad_norm": 0.12678076326847076, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 54500 + }, + { + "epoch": 0.20747851373674475, + "grad_norm": 0.13372944295406342, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 54510 + }, + { + "epoch": 0.20751657620486744, + "grad_norm": 0.1182679682970047, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 54520 + }, + { + "epoch": 0.20755463867299012, + "grad_norm": 0.11917141079902649, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 54530 + }, + { + "epoch": 0.2075927011411128, + "grad_norm": 0.12174999713897705, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 54540 + }, + { + "epoch": 0.20763076360923546, + "grad_norm": 0.12785743176937103, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 54550 + }, + { + "epoch": 0.20766882607735815, + "grad_norm": 0.13062579929828644, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 54560 + }, + { + "epoch": 0.20770688854548083, + "grad_norm": 0.1259687840938568, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 54570 + }, + { + "epoch": 0.20774495101360352, + "grad_norm": 0.1330173909664154, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 54580 + }, + { + "epoch": 0.2077830134817262, + "grad_norm": 0.12119072675704956, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 54590 + }, + { + "epoch": 0.2078210759498489, + "grad_norm": 0.1250128597021103, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 54600 + }, + { + "epoch": 0.20785913841797157, + "grad_norm": 0.12661468982696533, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 54610 + }, + { + "epoch": 0.20789720088609426, + "grad_norm": 0.13943052291870117, + "learning_rate": 0.0005, + "loss": 2.1462, + "step": 54620 + }, + { + "epoch": 0.20793526335421694, + "grad_norm": 0.1279287338256836, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 54630 + }, + { + "epoch": 0.20797332582233963, + "grad_norm": 0.1283564567565918, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 54640 + }, + { + "epoch": 0.2080113882904623, + "grad_norm": 0.12714558839797974, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 54650 + }, + { + "epoch": 0.208049450758585, + "grad_norm": 0.12611687183380127, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 54660 + }, + { + "epoch": 0.20808751322670768, + "grad_norm": 0.11924757808446884, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 54670 + }, + { + "epoch": 0.20812557569483037, + "grad_norm": 0.11965058743953705, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 54680 + }, + { + "epoch": 0.20816363816295305, + "grad_norm": 0.12703274190425873, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 54690 + }, + { + "epoch": 0.2082017006310757, + "grad_norm": 0.10818231105804443, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 54700 + }, + { + "epoch": 0.2082397630991984, + "grad_norm": 0.12397830188274384, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 54710 + }, + { + "epoch": 0.20827782556732108, + "grad_norm": 0.11835771799087524, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 54720 + }, + { + "epoch": 0.20831588803544376, + "grad_norm": 0.12970468401908875, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 54730 + }, + { + "epoch": 0.20835395050356645, + "grad_norm": 0.1369071751832962, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 54740 + }, + { + "epoch": 0.20839201297168913, + "grad_norm": 0.13071531057357788, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 54750 + }, + { + "epoch": 0.20843007543981182, + "grad_norm": 0.12456535547971725, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 54760 + }, + { + "epoch": 0.2084681379079345, + "grad_norm": 0.12861059606075287, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 54770 + }, + { + "epoch": 0.2085062003760572, + "grad_norm": 0.12247523665428162, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 54780 + }, + { + "epoch": 0.20854426284417987, + "grad_norm": 0.12303100526332855, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 54790 + }, + { + "epoch": 0.20858232531230256, + "grad_norm": 0.13596375286579132, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 54800 + }, + { + "epoch": 0.20862038778042524, + "grad_norm": 0.12867873907089233, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 54810 + }, + { + "epoch": 0.20865845024854793, + "grad_norm": 0.12008437514305115, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 54820 + }, + { + "epoch": 0.2086965127166706, + "grad_norm": 0.12185460329055786, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 54830 + }, + { + "epoch": 0.20873457518479327, + "grad_norm": 0.12056107074022293, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 54840 + }, + { + "epoch": 0.20877263765291595, + "grad_norm": 0.11947619169950485, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 54850 + }, + { + "epoch": 0.20881070012103864, + "grad_norm": 0.14452539384365082, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 54860 + }, + { + "epoch": 0.20884876258916132, + "grad_norm": 0.12263192236423492, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 54870 + }, + { + "epoch": 0.208886825057284, + "grad_norm": 0.11858075857162476, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 54880 + }, + { + "epoch": 0.2089248875254067, + "grad_norm": 0.12804926931858063, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 54890 + }, + { + "epoch": 0.20896294999352938, + "grad_norm": 0.14871400594711304, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 54900 + }, + { + "epoch": 0.20900101246165206, + "grad_norm": 0.12267819792032242, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 54910 + }, + { + "epoch": 0.20903907492977475, + "grad_norm": 0.12310918420553207, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 54920 + }, + { + "epoch": 0.20907713739789743, + "grad_norm": 0.12073167413473129, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 54930 + }, + { + "epoch": 0.20911519986602012, + "grad_norm": 0.11783216148614883, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 54940 + }, + { + "epoch": 0.2091532623341428, + "grad_norm": 0.12339416891336441, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 54950 + }, + { + "epoch": 0.2091913248022655, + "grad_norm": 0.14555498957633972, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 54960 + }, + { + "epoch": 0.20922938727038817, + "grad_norm": 0.1503671109676361, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 54970 + }, + { + "epoch": 0.20926744973851086, + "grad_norm": 0.12491462379693985, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 54980 + }, + { + "epoch": 0.20930551220663351, + "grad_norm": 0.117496557533741, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 54990 + }, + { + "epoch": 0.2093435746747562, + "grad_norm": 0.1343589425086975, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 55000 + }, + { + "epoch": 0.20938163714287888, + "grad_norm": 0.12166984379291534, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 55010 + }, + { + "epoch": 0.20941969961100157, + "grad_norm": 0.1231997087597847, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 55020 + }, + { + "epoch": 0.20945776207912425, + "grad_norm": 0.11930809915065765, + "learning_rate": 0.0005, + "loss": 2.1458, + "step": 55030 + }, + { + "epoch": 0.20949582454724694, + "grad_norm": 0.13089969754219055, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 55040 + }, + { + "epoch": 0.20953388701536962, + "grad_norm": 0.12502965331077576, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 55050 + }, + { + "epoch": 0.2095719494834923, + "grad_norm": 0.12578997015953064, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 55060 + }, + { + "epoch": 0.209610011951615, + "grad_norm": 0.1373293399810791, + "learning_rate": 0.0005, + "loss": 2.1513, + "step": 55070 + }, + { + "epoch": 0.20964807441973768, + "grad_norm": 0.11725837737321854, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 55080 + }, + { + "epoch": 0.20968613688786036, + "grad_norm": 0.12207408994436264, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 55090 + }, + { + "epoch": 0.20972419935598305, + "grad_norm": 0.13774704933166504, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 55100 + }, + { + "epoch": 0.20976226182410573, + "grad_norm": 0.13758708536624908, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 55110 + }, + { + "epoch": 0.20980032429222842, + "grad_norm": 0.1205608993768692, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 55120 + }, + { + "epoch": 0.20983838676035108, + "grad_norm": 0.1296151876449585, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 55130 + }, + { + "epoch": 0.20987644922847376, + "grad_norm": 0.1482541412115097, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 55140 + }, + { + "epoch": 0.20991451169659645, + "grad_norm": 0.1319926232099533, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 55150 + }, + { + "epoch": 0.20995257416471913, + "grad_norm": 0.12207752466201782, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 55160 + }, + { + "epoch": 0.20999063663284182, + "grad_norm": 0.13193126022815704, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 55170 + }, + { + "epoch": 0.2100286991009645, + "grad_norm": 0.11937710642814636, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 55180 + }, + { + "epoch": 0.21006676156908718, + "grad_norm": 0.12013963609933853, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 55190 + }, + { + "epoch": 0.21010482403720987, + "grad_norm": 0.12295087426900864, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 55200 + }, + { + "epoch": 0.21014288650533255, + "grad_norm": 0.1291741281747818, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 55210 + }, + { + "epoch": 0.21018094897345524, + "grad_norm": 0.14405788481235504, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 55220 + }, + { + "epoch": 0.21021901144157792, + "grad_norm": 0.12718753516674042, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 55230 + }, + { + "epoch": 0.2102570739097006, + "grad_norm": 0.12040422111749649, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 55240 + }, + { + "epoch": 0.2102951363778233, + "grad_norm": 0.11796865612268448, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 55250 + }, + { + "epoch": 0.21033319884594598, + "grad_norm": 0.11834219098091125, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 55260 + }, + { + "epoch": 0.21037126131406864, + "grad_norm": 0.12916326522827148, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 55270 + }, + { + "epoch": 0.21040932378219132, + "grad_norm": 0.12509098649024963, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 55280 + }, + { + "epoch": 0.210447386250314, + "grad_norm": 0.14400167763233185, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 55290 + }, + { + "epoch": 0.2104854487184367, + "grad_norm": 0.11268658936023712, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 55300 + }, + { + "epoch": 0.21052351118655938, + "grad_norm": 0.1184752807021141, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 55310 + }, + { + "epoch": 0.21056157365468206, + "grad_norm": 0.12232375144958496, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 55320 + }, + { + "epoch": 0.21059963612280475, + "grad_norm": 0.1253381222486496, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 55330 + }, + { + "epoch": 0.21063769859092743, + "grad_norm": 0.1177864670753479, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 55340 + }, + { + "epoch": 0.21067576105905012, + "grad_norm": 0.12369973212480545, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 55350 + }, + { + "epoch": 0.2107138235271728, + "grad_norm": 0.12825240194797516, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 55360 + }, + { + "epoch": 0.21075188599529548, + "grad_norm": 0.12207330763339996, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 55370 + }, + { + "epoch": 0.21078994846341817, + "grad_norm": 0.1274675577878952, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 55380 + }, + { + "epoch": 0.21082801093154085, + "grad_norm": 0.14211657643318176, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 55390 + }, + { + "epoch": 0.21086607339966354, + "grad_norm": 0.14069093763828278, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 55400 + }, + { + "epoch": 0.21090413586778622, + "grad_norm": 0.12487372756004333, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 55410 + }, + { + "epoch": 0.21094219833590888, + "grad_norm": 0.12757901847362518, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 55420 + }, + { + "epoch": 0.21098026080403157, + "grad_norm": 0.14222578704357147, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 55430 + }, + { + "epoch": 0.21101832327215425, + "grad_norm": 0.12456969171762466, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 55440 + }, + { + "epoch": 0.21105638574027694, + "grad_norm": 0.1276063323020935, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 55450 + }, + { + "epoch": 0.21109444820839962, + "grad_norm": 0.14173118770122528, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 55460 + }, + { + "epoch": 0.2111325106765223, + "grad_norm": 0.13104242086410522, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 55470 + }, + { + "epoch": 0.211170573144645, + "grad_norm": 0.31636613607406616, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 55480 + }, + { + "epoch": 0.21120863561276768, + "grad_norm": 0.11914330720901489, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 55490 + }, + { + "epoch": 0.21124669808089036, + "grad_norm": 0.12945348024368286, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 55500 + }, + { + "epoch": 0.21128476054901305, + "grad_norm": 0.13458223640918732, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 55510 + }, + { + "epoch": 0.21132282301713573, + "grad_norm": 0.11938342452049255, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 55520 + }, + { + "epoch": 0.21136088548525842, + "grad_norm": 0.12643754482269287, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 55530 + }, + { + "epoch": 0.2113989479533811, + "grad_norm": 0.11904884874820709, + "learning_rate": 0.0005, + "loss": 2.1452, + "step": 55540 + }, + { + "epoch": 0.21143701042150378, + "grad_norm": 0.12168022245168686, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 55550 + }, + { + "epoch": 0.21147507288962644, + "grad_norm": 0.11386290192604065, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 55560 + }, + { + "epoch": 0.21151313535774913, + "grad_norm": 0.12103776633739471, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 55570 + }, + { + "epoch": 0.2115511978258718, + "grad_norm": 0.12591663002967834, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 55580 + }, + { + "epoch": 0.2115892602939945, + "grad_norm": 0.17933526635169983, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 55590 + }, + { + "epoch": 0.21162732276211718, + "grad_norm": 0.11258309334516525, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 55600 + }, + { + "epoch": 0.21166538523023987, + "grad_norm": 0.12344492226839066, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 55610 + }, + { + "epoch": 0.21170344769836255, + "grad_norm": 0.13639947772026062, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 55620 + }, + { + "epoch": 0.21174151016648524, + "grad_norm": 0.12072647362947464, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 55630 + }, + { + "epoch": 0.21177957263460792, + "grad_norm": 0.12417701631784439, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 55640 + }, + { + "epoch": 0.2118176351027306, + "grad_norm": 0.12251461297273636, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 55650 + }, + { + "epoch": 0.2118556975708533, + "grad_norm": 0.13269966840744019, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 55660 + }, + { + "epoch": 0.21189376003897598, + "grad_norm": 0.12810970842838287, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 55670 + }, + { + "epoch": 0.21193182250709866, + "grad_norm": 0.12790927290916443, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 55680 + }, + { + "epoch": 0.21196988497522135, + "grad_norm": 0.11479417979717255, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 55690 + }, + { + "epoch": 0.212007947443344, + "grad_norm": 0.1199273020029068, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 55700 + }, + { + "epoch": 0.2120460099114667, + "grad_norm": 0.12614107131958008, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 55710 + }, + { + "epoch": 0.21208407237958937, + "grad_norm": 0.13167840242385864, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 55720 + }, + { + "epoch": 0.21212213484771206, + "grad_norm": 0.13410525023937225, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 55730 + }, + { + "epoch": 0.21216019731583474, + "grad_norm": 0.11915973573923111, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 55740 + }, + { + "epoch": 0.21219825978395743, + "grad_norm": 0.11454334855079651, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 55750 + }, + { + "epoch": 0.2122363222520801, + "grad_norm": 0.11463811993598938, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 55760 + }, + { + "epoch": 0.2122743847202028, + "grad_norm": 0.12144269049167633, + "learning_rate": 0.0005, + "loss": 2.1434, + "step": 55770 + }, + { + "epoch": 0.21231244718832548, + "grad_norm": 0.1233169436454773, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 55780 + }, + { + "epoch": 0.21235050965644817, + "grad_norm": 0.12609365582466125, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 55790 + }, + { + "epoch": 0.21238857212457085, + "grad_norm": 0.12684963643550873, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 55800 + }, + { + "epoch": 0.21242663459269354, + "grad_norm": 0.11530327051877975, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 55810 + }, + { + "epoch": 0.21246469706081622, + "grad_norm": 0.1189451664686203, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 55820 + }, + { + "epoch": 0.2125027595289389, + "grad_norm": 0.12495563924312592, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 55830 + }, + { + "epoch": 0.2125408219970616, + "grad_norm": 0.1268143653869629, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 55840 + }, + { + "epoch": 0.21257888446518425, + "grad_norm": 0.12778593599796295, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 55850 + }, + { + "epoch": 0.21261694693330693, + "grad_norm": 0.12761004269123077, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 55860 + }, + { + "epoch": 0.21265500940142962, + "grad_norm": 0.11692184954881668, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 55870 + }, + { + "epoch": 0.2126930718695523, + "grad_norm": 0.13846901059150696, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 55880 + }, + { + "epoch": 0.212731134337675, + "grad_norm": 0.11994395405054092, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 55890 + }, + { + "epoch": 0.21276919680579767, + "grad_norm": 0.11813051998615265, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 55900 + }, + { + "epoch": 0.21280725927392036, + "grad_norm": 0.11680085957050323, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 55910 + }, + { + "epoch": 0.21284532174204304, + "grad_norm": 0.1264512985944748, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 55920 + }, + { + "epoch": 0.21288338421016573, + "grad_norm": 0.12074155360460281, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 55930 + }, + { + "epoch": 0.2129214466782884, + "grad_norm": 0.11390884965658188, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 55940 + }, + { + "epoch": 0.2129595091464111, + "grad_norm": 0.10812906920909882, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 55950 + }, + { + "epoch": 0.21299757161453378, + "grad_norm": 0.11598875373601913, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 55960 + }, + { + "epoch": 0.21303563408265647, + "grad_norm": 0.14625869691371918, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 55970 + }, + { + "epoch": 0.21307369655077915, + "grad_norm": 0.13654451072216034, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 55980 + }, + { + "epoch": 0.2131117590189018, + "grad_norm": 0.12408126890659332, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 55990 + }, + { + "epoch": 0.2131498214870245, + "grad_norm": 0.12511536478996277, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 56000 + }, + { + "epoch": 0.21318788395514718, + "grad_norm": 0.1340360790491104, + "learning_rate": 0.0005, + "loss": 2.1498, + "step": 56010 + }, + { + "epoch": 0.21322594642326986, + "grad_norm": 0.13497580587863922, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 56020 + }, + { + "epoch": 0.21326400889139255, + "grad_norm": 0.12230490148067474, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 56030 + }, + { + "epoch": 0.21330207135951523, + "grad_norm": 0.11444181203842163, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 56040 + }, + { + "epoch": 0.21334013382763792, + "grad_norm": 0.12098285555839539, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 56050 + }, + { + "epoch": 0.2133781962957606, + "grad_norm": 0.13039463758468628, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 56060 + }, + { + "epoch": 0.2134162587638833, + "grad_norm": 0.12801070511341095, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 56070 + }, + { + "epoch": 0.21345432123200597, + "grad_norm": 0.12597905099391937, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 56080 + }, + { + "epoch": 0.21349238370012866, + "grad_norm": 0.1314854919910431, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 56090 + }, + { + "epoch": 0.21353044616825134, + "grad_norm": 0.11816215515136719, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 56100 + }, + { + "epoch": 0.21356850863637403, + "grad_norm": 0.1357698142528534, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 56110 + }, + { + "epoch": 0.2136065711044967, + "grad_norm": 0.13428688049316406, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 56120 + }, + { + "epoch": 0.2136446335726194, + "grad_norm": 0.1258360743522644, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 56130 + }, + { + "epoch": 0.21368269604074205, + "grad_norm": 0.12718361616134644, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 56140 + }, + { + "epoch": 0.21372075850886474, + "grad_norm": 0.1284942328929901, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 56150 + }, + { + "epoch": 0.21375882097698742, + "grad_norm": 0.12394726276397705, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 56160 + }, + { + "epoch": 0.2137968834451101, + "grad_norm": 0.10978233814239502, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 56170 + }, + { + "epoch": 0.2138349459132328, + "grad_norm": 0.1203223243355751, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 56180 + }, + { + "epoch": 0.21387300838135548, + "grad_norm": 0.12922045588493347, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 56190 + }, + { + "epoch": 0.21391107084947816, + "grad_norm": 0.12522399425506592, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 56200 + }, + { + "epoch": 0.21394913331760085, + "grad_norm": 0.12170863896608353, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 56210 + }, + { + "epoch": 0.21398719578572353, + "grad_norm": 0.1256794035434723, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 56220 + }, + { + "epoch": 0.21402525825384622, + "grad_norm": 0.13030460476875305, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 56230 + }, + { + "epoch": 0.2140633207219689, + "grad_norm": 0.11182166635990143, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 56240 + }, + { + "epoch": 0.2141013831900916, + "grad_norm": 0.12376523017883301, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 56250 + }, + { + "epoch": 0.21413944565821427, + "grad_norm": 0.1058717891573906, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 56260 + }, + { + "epoch": 0.21417750812633696, + "grad_norm": 0.11659777164459229, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 56270 + }, + { + "epoch": 0.21421557059445961, + "grad_norm": 0.12966254353523254, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 56280 + }, + { + "epoch": 0.2142536330625823, + "grad_norm": 0.12180996686220169, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 56290 + }, + { + "epoch": 0.21429169553070498, + "grad_norm": 0.1273222416639328, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 56300 + }, + { + "epoch": 0.21432975799882767, + "grad_norm": 0.12970444560050964, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 56310 + }, + { + "epoch": 0.21436782046695035, + "grad_norm": 0.11615218967199326, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 56320 + }, + { + "epoch": 0.21440588293507304, + "grad_norm": 0.13962231576442719, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 56330 + }, + { + "epoch": 0.21444394540319572, + "grad_norm": 0.13303855061531067, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 56340 + }, + { + "epoch": 0.2144820078713184, + "grad_norm": 0.12693250179290771, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 56350 + }, + { + "epoch": 0.2145200703394411, + "grad_norm": 0.11924530565738678, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 56360 + }, + { + "epoch": 0.21455813280756378, + "grad_norm": 0.12394033372402191, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 56370 + }, + { + "epoch": 0.21459619527568646, + "grad_norm": 0.12687383592128754, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 56380 + }, + { + "epoch": 0.21463425774380915, + "grad_norm": 0.1161162480711937, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 56390 + }, + { + "epoch": 0.21467232021193183, + "grad_norm": 0.12959563732147217, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 56400 + }, + { + "epoch": 0.21471038268005452, + "grad_norm": 0.1229131892323494, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 56410 + }, + { + "epoch": 0.21474844514817718, + "grad_norm": 0.12190618366003036, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 56420 + }, + { + "epoch": 0.21478650761629986, + "grad_norm": 0.11919743567705154, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 56430 + }, + { + "epoch": 0.21482457008442254, + "grad_norm": 0.12470883876085281, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 56440 + }, + { + "epoch": 0.21486263255254523, + "grad_norm": 0.12248383462429047, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 56450 + }, + { + "epoch": 0.21490069502066791, + "grad_norm": 0.12124987691640854, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 56460 + }, + { + "epoch": 0.2149387574887906, + "grad_norm": 0.13539837300777435, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 56470 + }, + { + "epoch": 0.21497681995691328, + "grad_norm": 0.12916968762874603, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 56480 + }, + { + "epoch": 0.21501488242503597, + "grad_norm": 0.11934902518987656, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 56490 + }, + { + "epoch": 0.21505294489315865, + "grad_norm": 0.1366918981075287, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 56500 + }, + { + "epoch": 0.21509100736128134, + "grad_norm": 0.11489017307758331, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 56510 + }, + { + "epoch": 0.21512906982940402, + "grad_norm": 0.1359739601612091, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 56520 + }, + { + "epoch": 0.2151671322975267, + "grad_norm": 0.11837109923362732, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 56530 + }, + { + "epoch": 0.2152051947656494, + "grad_norm": 0.11647136509418488, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 56540 + }, + { + "epoch": 0.21524325723377208, + "grad_norm": 0.13424059748649597, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 56550 + }, + { + "epoch": 0.21528131970189476, + "grad_norm": 0.12664872407913208, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 56560 + }, + { + "epoch": 0.21531938217001742, + "grad_norm": 0.12171950936317444, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 56570 + }, + { + "epoch": 0.2153574446381401, + "grad_norm": 0.12074828892946243, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 56580 + }, + { + "epoch": 0.2153955071062628, + "grad_norm": 0.1294257789850235, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 56590 + }, + { + "epoch": 0.21543356957438548, + "grad_norm": 0.11204027384519577, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 56600 + }, + { + "epoch": 0.21547163204250816, + "grad_norm": 0.12621349096298218, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 56610 + }, + { + "epoch": 0.21550969451063084, + "grad_norm": 0.13478349149227142, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 56620 + }, + { + "epoch": 0.21554775697875353, + "grad_norm": 0.11155591160058975, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 56630 + }, + { + "epoch": 0.21558581944687621, + "grad_norm": 0.12927065789699554, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 56640 + }, + { + "epoch": 0.2156238819149989, + "grad_norm": 0.11920293420553207, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 56650 + }, + { + "epoch": 0.21566194438312158, + "grad_norm": 0.12005160003900528, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 56660 + }, + { + "epoch": 0.21570000685124427, + "grad_norm": 0.1394878476858139, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 56670 + }, + { + "epoch": 0.21573806931936695, + "grad_norm": 0.12196838855743408, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 56680 + }, + { + "epoch": 0.21577613178748964, + "grad_norm": 0.11932896822690964, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 56690 + }, + { + "epoch": 0.21581419425561232, + "grad_norm": 0.12393977493047714, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 56700 + }, + { + "epoch": 0.21585225672373498, + "grad_norm": 0.13766314089298248, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 56710 + }, + { + "epoch": 0.21589031919185767, + "grad_norm": 0.13553282618522644, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 56720 + }, + { + "epoch": 0.21592838165998035, + "grad_norm": 0.18950755894184113, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 56730 + }, + { + "epoch": 0.21596644412810304, + "grad_norm": 0.12270800769329071, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 56740 + }, + { + "epoch": 0.21600450659622572, + "grad_norm": 0.11453820019960403, + "learning_rate": 0.0005, + "loss": 2.1551, + "step": 56750 + }, + { + "epoch": 0.2160425690643484, + "grad_norm": 0.13194343447685242, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 56760 + }, + { + "epoch": 0.2160806315324711, + "grad_norm": 0.1288219690322876, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 56770 + }, + { + "epoch": 0.21611869400059378, + "grad_norm": 0.12786085903644562, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 56780 + }, + { + "epoch": 0.21615675646871646, + "grad_norm": 0.1349632740020752, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 56790 + }, + { + "epoch": 0.21619481893683914, + "grad_norm": 0.12566514313220978, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 56800 + }, + { + "epoch": 0.21623288140496183, + "grad_norm": 0.11484618484973907, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 56810 + }, + { + "epoch": 0.21627094387308451, + "grad_norm": 0.12926548719406128, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 56820 + }, + { + "epoch": 0.2163090063412072, + "grad_norm": 0.143005833029747, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 56830 + }, + { + "epoch": 0.21634706880932988, + "grad_norm": 0.11763927340507507, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 56840 + }, + { + "epoch": 0.21638513127745254, + "grad_norm": 0.14074474573135376, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 56850 + }, + { + "epoch": 0.21642319374557523, + "grad_norm": 0.13238893449306488, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 56860 + }, + { + "epoch": 0.2164612562136979, + "grad_norm": 0.11847522109746933, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 56870 + }, + { + "epoch": 0.2164993186818206, + "grad_norm": 0.13169977068901062, + "learning_rate": 0.0005, + "loss": 2.1505, + "step": 56880 + }, + { + "epoch": 0.21653738114994328, + "grad_norm": 0.12500376999378204, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 56890 + }, + { + "epoch": 0.21657544361806597, + "grad_norm": 0.1372148096561432, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 56900 + }, + { + "epoch": 0.21661350608618865, + "grad_norm": 0.13714422285556793, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 56910 + }, + { + "epoch": 0.21665156855431134, + "grad_norm": 0.12532460689544678, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 56920 + }, + { + "epoch": 0.21668963102243402, + "grad_norm": 0.45631900429725647, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 56930 + }, + { + "epoch": 0.2167276934905567, + "grad_norm": 0.13041189312934875, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 56940 + }, + { + "epoch": 0.2167657559586794, + "grad_norm": 0.1290024071931839, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 56950 + }, + { + "epoch": 0.21680381842680208, + "grad_norm": 0.12570427358150482, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 56960 + }, + { + "epoch": 0.21684188089492476, + "grad_norm": 0.12115761637687683, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 56970 + }, + { + "epoch": 0.21687994336304744, + "grad_norm": 0.1134210154414177, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 56980 + }, + { + "epoch": 0.21691800583117013, + "grad_norm": 0.12490539252758026, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 56990 + }, + { + "epoch": 0.2169560682992928, + "grad_norm": 0.11384977400302887, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 57000 + }, + { + "epoch": 0.21699413076741547, + "grad_norm": 0.12341322749853134, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 57010 + }, + { + "epoch": 0.21703219323553816, + "grad_norm": 0.11756706982851028, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 57020 + }, + { + "epoch": 0.21707025570366084, + "grad_norm": 0.11976876109838486, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 57030 + }, + { + "epoch": 0.21710831817178353, + "grad_norm": 0.12024601548910141, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 57040 + }, + { + "epoch": 0.2171463806399062, + "grad_norm": 0.11507564783096313, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 57050 + }, + { + "epoch": 0.2171844431080289, + "grad_norm": 0.14740623533725739, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 57060 + }, + { + "epoch": 0.21722250557615158, + "grad_norm": 0.12662845849990845, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 57070 + }, + { + "epoch": 0.21726056804427427, + "grad_norm": 0.1305474191904068, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 57080 + }, + { + "epoch": 0.21729863051239695, + "grad_norm": 0.1467132717370987, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 57090 + }, + { + "epoch": 0.21733669298051964, + "grad_norm": 0.13564428687095642, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 57100 + }, + { + "epoch": 0.21737475544864232, + "grad_norm": 0.1273927390575409, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 57110 + }, + { + "epoch": 0.217412817916765, + "grad_norm": 0.14611203968524933, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 57120 + }, + { + "epoch": 0.2174508803848877, + "grad_norm": 0.1306157112121582, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 57130 + }, + { + "epoch": 0.21748894285301035, + "grad_norm": 0.12513276934623718, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 57140 + }, + { + "epoch": 0.21752700532113303, + "grad_norm": 0.11249308288097382, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 57150 + }, + { + "epoch": 0.21756506778925572, + "grad_norm": 0.12884452939033508, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 57160 + }, + { + "epoch": 0.2176031302573784, + "grad_norm": 0.1316312849521637, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 57170 + }, + { + "epoch": 0.2176411927255011, + "grad_norm": 0.1316588968038559, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 57180 + }, + { + "epoch": 0.21767925519362377, + "grad_norm": 0.12284323573112488, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 57190 + }, + { + "epoch": 0.21771731766174646, + "grad_norm": 0.11931144446134567, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 57200 + }, + { + "epoch": 0.21775538012986914, + "grad_norm": 0.12624037265777588, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 57210 + }, + { + "epoch": 0.21779344259799183, + "grad_norm": 0.12465736269950867, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 57220 + }, + { + "epoch": 0.2178315050661145, + "grad_norm": 0.12285862863063812, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 57230 + }, + { + "epoch": 0.2178695675342372, + "grad_norm": 0.13169236481189728, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 57240 + }, + { + "epoch": 0.21790763000235988, + "grad_norm": 0.1352112889289856, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 57250 + }, + { + "epoch": 0.21794569247048257, + "grad_norm": 0.11984165757894516, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 57260 + }, + { + "epoch": 0.21798375493860525, + "grad_norm": 0.12893593311309814, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 57270 + }, + { + "epoch": 0.2180218174067279, + "grad_norm": 0.11700885742902756, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 57280 + }, + { + "epoch": 0.2180598798748506, + "grad_norm": 0.24958999454975128, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 57290 + }, + { + "epoch": 0.21809794234297328, + "grad_norm": 0.13076789677143097, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 57300 + }, + { + "epoch": 0.21813600481109596, + "grad_norm": 0.12193410843610764, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 57310 + }, + { + "epoch": 0.21817406727921865, + "grad_norm": 0.1376117765903473, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 57320 + }, + { + "epoch": 0.21821212974734133, + "grad_norm": 0.12163210660219193, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 57330 + }, + { + "epoch": 0.21825019221546402, + "grad_norm": 0.1255846470594406, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 57340 + }, + { + "epoch": 0.2182882546835867, + "grad_norm": 0.13697326183319092, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 57350 + }, + { + "epoch": 0.2183263171517094, + "grad_norm": 0.12124402076005936, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 57360 + }, + { + "epoch": 0.21836437961983207, + "grad_norm": 0.12365327030420303, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 57370 + }, + { + "epoch": 0.21840244208795476, + "grad_norm": 0.11128167808055878, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 57380 + }, + { + "epoch": 0.21844050455607744, + "grad_norm": 0.12737533450126648, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 57390 + }, + { + "epoch": 0.21847856702420013, + "grad_norm": 0.1185334324836731, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 57400 + }, + { + "epoch": 0.2185166294923228, + "grad_norm": 0.12620334327220917, + "learning_rate": 0.0005, + "loss": 2.1447, + "step": 57410 + }, + { + "epoch": 0.2185546919604455, + "grad_norm": 0.11594204604625702, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 57420 + }, + { + "epoch": 0.21859275442856815, + "grad_norm": 0.1219063401222229, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 57430 + }, + { + "epoch": 0.21863081689669084, + "grad_norm": 0.11989148706197739, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 57440 + }, + { + "epoch": 0.21866887936481352, + "grad_norm": 0.12347909063100815, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 57450 + }, + { + "epoch": 0.2187069418329362, + "grad_norm": 0.13595686852931976, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 57460 + }, + { + "epoch": 0.2187450043010589, + "grad_norm": 0.12859046459197998, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 57470 + }, + { + "epoch": 0.21878306676918158, + "grad_norm": 0.12548702955245972, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 57480 + }, + { + "epoch": 0.21882112923730426, + "grad_norm": 0.13332229852676392, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 57490 + }, + { + "epoch": 0.21885919170542695, + "grad_norm": 0.1241702064871788, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 57500 + }, + { + "epoch": 0.21889725417354963, + "grad_norm": 0.1415553241968155, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 57510 + }, + { + "epoch": 0.21893531664167232, + "grad_norm": 0.151481032371521, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 57520 + }, + { + "epoch": 0.218973379109795, + "grad_norm": 0.13601690530776978, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 57530 + }, + { + "epoch": 0.2190114415779177, + "grad_norm": 0.13995510339736938, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 57540 + }, + { + "epoch": 0.21904950404604037, + "grad_norm": 0.1266234964132309, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 57550 + }, + { + "epoch": 0.21908756651416306, + "grad_norm": 0.11919340491294861, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 57560 + }, + { + "epoch": 0.21912562898228571, + "grad_norm": 0.1267729252576828, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 57570 + }, + { + "epoch": 0.2191636914504084, + "grad_norm": 0.11670149862766266, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 57580 + }, + { + "epoch": 0.21920175391853108, + "grad_norm": 0.11498446017503738, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 57590 + }, + { + "epoch": 0.21923981638665377, + "grad_norm": 0.1346731334924698, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 57600 + }, + { + "epoch": 0.21927787885477645, + "grad_norm": 0.12346021831035614, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 57610 + }, + { + "epoch": 0.21931594132289914, + "grad_norm": 0.11403003334999084, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 57620 + }, + { + "epoch": 0.21935400379102182, + "grad_norm": 0.11895783990621567, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 57630 + }, + { + "epoch": 0.2193920662591445, + "grad_norm": 0.13433951139450073, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 57640 + }, + { + "epoch": 0.2194301287272672, + "grad_norm": 0.13205264508724213, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 57650 + }, + { + "epoch": 0.21946819119538988, + "grad_norm": 0.13134273886680603, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 57660 + }, + { + "epoch": 0.21950625366351256, + "grad_norm": 0.12045015394687653, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 57670 + }, + { + "epoch": 0.21954431613163525, + "grad_norm": 0.1349083036184311, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 57680 + }, + { + "epoch": 0.21958237859975793, + "grad_norm": 0.13863080739974976, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 57690 + }, + { + "epoch": 0.21962044106788062, + "grad_norm": 0.13695912063121796, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 57700 + }, + { + "epoch": 0.2196585035360033, + "grad_norm": 0.12278943508863449, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 57710 + }, + { + "epoch": 0.21969656600412596, + "grad_norm": 0.1360238492488861, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 57720 + }, + { + "epoch": 0.21973462847224864, + "grad_norm": 0.14207656681537628, + "learning_rate": 0.0005, + "loss": 2.1477, + "step": 57730 + }, + { + "epoch": 0.21977269094037133, + "grad_norm": 0.13055188953876495, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 57740 + }, + { + "epoch": 0.21981075340849401, + "grad_norm": 0.12562668323516846, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 57750 + }, + { + "epoch": 0.2198488158766167, + "grad_norm": 0.12057308107614517, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 57760 + }, + { + "epoch": 0.21988687834473938, + "grad_norm": 0.14905333518981934, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 57770 + }, + { + "epoch": 0.21992494081286207, + "grad_norm": 0.14193713665008545, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 57780 + }, + { + "epoch": 0.21996300328098475, + "grad_norm": 0.1211729571223259, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 57790 + }, + { + "epoch": 0.22000106574910744, + "grad_norm": 0.12316634505987167, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 57800 + }, + { + "epoch": 0.22003912821723012, + "grad_norm": 0.11444377154111862, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 57810 + }, + { + "epoch": 0.2200771906853528, + "grad_norm": 0.1265476793050766, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 57820 + }, + { + "epoch": 0.2201152531534755, + "grad_norm": 0.12637244164943695, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 57830 + }, + { + "epoch": 0.22015331562159818, + "grad_norm": 0.11900036782026291, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 57840 + }, + { + "epoch": 0.22019137808972086, + "grad_norm": 0.1138107106089592, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 57850 + }, + { + "epoch": 0.22022944055784352, + "grad_norm": 0.11928112804889679, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 57860 + }, + { + "epoch": 0.2202675030259662, + "grad_norm": 0.12178890407085419, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 57870 + }, + { + "epoch": 0.2203055654940889, + "grad_norm": 0.12526173889636993, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 57880 + }, + { + "epoch": 0.22034362796221157, + "grad_norm": 0.11737733334302902, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 57890 + }, + { + "epoch": 0.22038169043033426, + "grad_norm": 0.12308654934167862, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 57900 + }, + { + "epoch": 0.22041975289845694, + "grad_norm": 0.1160750612616539, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 57910 + }, + { + "epoch": 0.22045781536657963, + "grad_norm": 0.11439873278141022, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 57920 + }, + { + "epoch": 0.22049587783470231, + "grad_norm": 0.12854161858558655, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 57930 + }, + { + "epoch": 0.220533940302825, + "grad_norm": 0.11376269161701202, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 57940 + }, + { + "epoch": 0.22057200277094768, + "grad_norm": 0.11966376006603241, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 57950 + }, + { + "epoch": 0.22061006523907037, + "grad_norm": 0.12348086386919022, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 57960 + }, + { + "epoch": 0.22064812770719305, + "grad_norm": 0.11511658877134323, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 57970 + }, + { + "epoch": 0.22068619017531574, + "grad_norm": 0.12527251243591309, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 57980 + }, + { + "epoch": 0.22072425264343842, + "grad_norm": 0.120834581553936, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 57990 + }, + { + "epoch": 0.22076231511156108, + "grad_norm": 0.12000294774770737, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 58000 + }, + { + "epoch": 0.22080037757968377, + "grad_norm": 0.1237744390964508, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 58010 + }, + { + "epoch": 0.22083844004780645, + "grad_norm": 0.132208913564682, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 58020 + }, + { + "epoch": 0.22087650251592914, + "grad_norm": 0.12867999076843262, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 58030 + }, + { + "epoch": 0.22091456498405182, + "grad_norm": 0.12405936419963837, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 58040 + }, + { + "epoch": 0.2209526274521745, + "grad_norm": 0.12567295134067535, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 58050 + }, + { + "epoch": 0.2209906899202972, + "grad_norm": 0.1235484927892685, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 58060 + }, + { + "epoch": 0.22102875238841987, + "grad_norm": 0.13820816576480865, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 58070 + }, + { + "epoch": 0.22106681485654256, + "grad_norm": 0.12887045741081238, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 58080 + }, + { + "epoch": 0.22110487732466524, + "grad_norm": 0.13162380456924438, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 58090 + }, + { + "epoch": 0.22114293979278793, + "grad_norm": 0.11963611096143723, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 58100 + }, + { + "epoch": 0.22118100226091061, + "grad_norm": 0.11864755302667618, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 58110 + }, + { + "epoch": 0.2212190647290333, + "grad_norm": 0.11773239076137543, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 58120 + }, + { + "epoch": 0.22125712719715598, + "grad_norm": 0.135623961687088, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 58130 + }, + { + "epoch": 0.22129518966527867, + "grad_norm": 0.12874767184257507, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 58140 + }, + { + "epoch": 0.22133325213340133, + "grad_norm": 0.1876586377620697, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 58150 + }, + { + "epoch": 0.221371314601524, + "grad_norm": 0.12583576142787933, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 58160 + }, + { + "epoch": 0.2214093770696467, + "grad_norm": 0.13214285671710968, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 58170 + }, + { + "epoch": 0.22144743953776938, + "grad_norm": 0.12951025366783142, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 58180 + }, + { + "epoch": 0.22148550200589207, + "grad_norm": 0.13020943105220795, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 58190 + }, + { + "epoch": 0.22152356447401475, + "grad_norm": 0.11596996337175369, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 58200 + }, + { + "epoch": 0.22156162694213744, + "grad_norm": 0.12256282567977905, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 58210 + }, + { + "epoch": 0.22159968941026012, + "grad_norm": 0.11792580038309097, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 58220 + }, + { + "epoch": 0.2216377518783828, + "grad_norm": 0.11020893603563309, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 58230 + }, + { + "epoch": 0.2216758143465055, + "grad_norm": 0.12330953776836395, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 58240 + }, + { + "epoch": 0.22171387681462817, + "grad_norm": 0.14022402465343475, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 58250 + }, + { + "epoch": 0.22175193928275086, + "grad_norm": 0.129679337143898, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 58260 + }, + { + "epoch": 0.22179000175087354, + "grad_norm": 0.12865565717220306, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 58270 + }, + { + "epoch": 0.22182806421899623, + "grad_norm": 0.13360846042633057, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 58280 + }, + { + "epoch": 0.2218661266871189, + "grad_norm": 0.12509751319885254, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 58290 + }, + { + "epoch": 0.22190418915524157, + "grad_norm": 0.127641499042511, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 58300 + }, + { + "epoch": 0.22194225162336426, + "grad_norm": 0.11071023344993591, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 58310 + }, + { + "epoch": 0.22198031409148694, + "grad_norm": 0.11509322375059128, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 58320 + }, + { + "epoch": 0.22201837655960963, + "grad_norm": 0.11422469466924667, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 58330 + }, + { + "epoch": 0.2220564390277323, + "grad_norm": 0.1354234218597412, + "learning_rate": 0.0005, + "loss": 2.1508, + "step": 58340 + }, + { + "epoch": 0.222094501495855, + "grad_norm": 0.1264037936925888, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 58350 + }, + { + "epoch": 0.22213256396397768, + "grad_norm": 0.12463847547769547, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 58360 + }, + { + "epoch": 0.22217062643210037, + "grad_norm": 0.11444063484668732, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 58370 + }, + { + "epoch": 0.22220868890022305, + "grad_norm": 0.12904338538646698, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 58380 + }, + { + "epoch": 0.22224675136834574, + "grad_norm": 0.1209927424788475, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 58390 + }, + { + "epoch": 0.22228481383646842, + "grad_norm": 0.14710427820682526, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 58400 + }, + { + "epoch": 0.2223228763045911, + "grad_norm": 0.12501521408557892, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 58410 + }, + { + "epoch": 0.2223609387727138, + "grad_norm": 0.11510436236858368, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 58420 + }, + { + "epoch": 0.22239900124083645, + "grad_norm": 0.12269311398267746, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 58430 + }, + { + "epoch": 0.22243706370895913, + "grad_norm": 0.12804774940013885, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 58440 + }, + { + "epoch": 0.22247512617708182, + "grad_norm": 0.12788155674934387, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 58450 + }, + { + "epoch": 0.2225131886452045, + "grad_norm": 0.12186574190855026, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 58460 + }, + { + "epoch": 0.2225512511133272, + "grad_norm": 0.13440869748592377, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 58470 + }, + { + "epoch": 0.22258931358144987, + "grad_norm": 0.1120695099234581, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 58480 + }, + { + "epoch": 0.22262737604957256, + "grad_norm": 0.12774385511875153, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 58490 + }, + { + "epoch": 0.22266543851769524, + "grad_norm": 0.12559452652931213, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 58500 + }, + { + "epoch": 0.22270350098581793, + "grad_norm": 0.1218034029006958, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 58510 + }, + { + "epoch": 0.2227415634539406, + "grad_norm": 0.12162283062934875, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 58520 + }, + { + "epoch": 0.2227796259220633, + "grad_norm": 0.13595223426818848, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 58530 + }, + { + "epoch": 0.22281768839018598, + "grad_norm": 0.14094847440719604, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 58540 + }, + { + "epoch": 0.22285575085830867, + "grad_norm": 0.14060798287391663, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 58550 + }, + { + "epoch": 0.22289381332643135, + "grad_norm": 0.1249428391456604, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 58560 + }, + { + "epoch": 0.22293187579455404, + "grad_norm": 0.12495312839746475, + "learning_rate": 0.0005, + "loss": 2.1449, + "step": 58570 + }, + { + "epoch": 0.2229699382626767, + "grad_norm": 0.12324405461549759, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 58580 + }, + { + "epoch": 0.22300800073079938, + "grad_norm": 0.12775039672851562, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 58590 + }, + { + "epoch": 0.22304606319892206, + "grad_norm": 0.12679044902324677, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 58600 + }, + { + "epoch": 0.22308412566704475, + "grad_norm": 0.13023437559604645, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 58610 + }, + { + "epoch": 0.22312218813516743, + "grad_norm": 0.13190847635269165, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 58620 + }, + { + "epoch": 0.22316025060329012, + "grad_norm": 0.13421021401882172, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 58630 + }, + { + "epoch": 0.2231983130714128, + "grad_norm": 0.12466822564601898, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 58640 + }, + { + "epoch": 0.2232363755395355, + "grad_norm": 0.12487687915563583, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 58650 + }, + { + "epoch": 0.22327443800765817, + "grad_norm": 0.13558396697044373, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 58660 + }, + { + "epoch": 0.22331250047578086, + "grad_norm": 0.12072444707155228, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 58670 + }, + { + "epoch": 0.22335056294390354, + "grad_norm": 0.12258957326412201, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 58680 + }, + { + "epoch": 0.22338862541202623, + "grad_norm": 0.12579022347927094, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 58690 + }, + { + "epoch": 0.2234266878801489, + "grad_norm": 0.13216014206409454, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 58700 + }, + { + "epoch": 0.2234647503482716, + "grad_norm": 0.12116879969835281, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 58710 + }, + { + "epoch": 0.22350281281639425, + "grad_norm": 0.13866692781448364, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 58720 + }, + { + "epoch": 0.22354087528451694, + "grad_norm": 0.13507381081581116, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 58730 + }, + { + "epoch": 0.22357893775263962, + "grad_norm": 0.13417589664459229, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 58740 + }, + { + "epoch": 0.2236170002207623, + "grad_norm": 0.12745191156864166, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 58750 + }, + { + "epoch": 0.223655062688885, + "grad_norm": 0.13191528618335724, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 58760 + }, + { + "epoch": 0.22369312515700768, + "grad_norm": 0.12790720164775848, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 58770 + }, + { + "epoch": 0.22373118762513036, + "grad_norm": 0.1307389736175537, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 58780 + }, + { + "epoch": 0.22376925009325305, + "grad_norm": 0.11681444197893143, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 58790 + }, + { + "epoch": 0.22380731256137573, + "grad_norm": 0.1257818192243576, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 58800 + }, + { + "epoch": 0.22384537502949842, + "grad_norm": 0.12122656404972076, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 58810 + }, + { + "epoch": 0.2238834374976211, + "grad_norm": 0.11498332023620605, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 58820 + }, + { + "epoch": 0.2239214999657438, + "grad_norm": 0.12811604142189026, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 58830 + }, + { + "epoch": 0.22395956243386647, + "grad_norm": 0.1314505636692047, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 58840 + }, + { + "epoch": 0.22399762490198916, + "grad_norm": 0.12542065978050232, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 58850 + }, + { + "epoch": 0.22403568737011184, + "grad_norm": 0.12298982590436935, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 58860 + }, + { + "epoch": 0.2240737498382345, + "grad_norm": 0.12842786312103271, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 58870 + }, + { + "epoch": 0.22411181230635718, + "grad_norm": 0.12505269050598145, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 58880 + }, + { + "epoch": 0.22414987477447987, + "grad_norm": 0.13934390246868134, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 58890 + }, + { + "epoch": 0.22418793724260255, + "grad_norm": 0.13097402453422546, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 58900 + }, + { + "epoch": 0.22422599971072524, + "grad_norm": 0.1333012878894806, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 58910 + }, + { + "epoch": 0.22426406217884792, + "grad_norm": 0.12261933088302612, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 58920 + }, + { + "epoch": 0.2243021246469706, + "grad_norm": 0.13171008229255676, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 58930 + }, + { + "epoch": 0.2243401871150933, + "grad_norm": 0.12285832315683365, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 58940 + }, + { + "epoch": 0.22437824958321598, + "grad_norm": 0.11739574372768402, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 58950 + }, + { + "epoch": 0.22441631205133866, + "grad_norm": 0.12524369359016418, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 58960 + }, + { + "epoch": 0.22445437451946135, + "grad_norm": 0.1167164146900177, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 58970 + }, + { + "epoch": 0.22449243698758403, + "grad_norm": 0.12820903956890106, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 58980 + }, + { + "epoch": 0.22453049945570672, + "grad_norm": 0.1264885663986206, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 58990 + }, + { + "epoch": 0.2245685619238294, + "grad_norm": 0.12076272070407867, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 59000 + }, + { + "epoch": 0.22460662439195206, + "grad_norm": 0.12537932395935059, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 59010 + }, + { + "epoch": 0.22464468686007474, + "grad_norm": 0.1189693734049797, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 59020 + }, + { + "epoch": 0.22468274932819743, + "grad_norm": 0.12680798768997192, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 59030 + }, + { + "epoch": 0.2247208117963201, + "grad_norm": 0.12540461122989655, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 59040 + }, + { + "epoch": 0.2247588742644428, + "grad_norm": 0.12299077957868576, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 59050 + }, + { + "epoch": 0.22479693673256548, + "grad_norm": 0.12139592319726944, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 59060 + }, + { + "epoch": 0.22483499920068817, + "grad_norm": 0.12649716436862946, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 59070 + }, + { + "epoch": 0.22487306166881085, + "grad_norm": 0.12666192650794983, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 59080 + }, + { + "epoch": 0.22491112413693354, + "grad_norm": 0.11255883425474167, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 59090 + }, + { + "epoch": 0.22494918660505622, + "grad_norm": 0.13696226477622986, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 59100 + }, + { + "epoch": 0.2249872490731789, + "grad_norm": 0.12781773507595062, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 59110 + }, + { + "epoch": 0.2250253115413016, + "grad_norm": 0.13813403248786926, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 59120 + }, + { + "epoch": 0.22506337400942428, + "grad_norm": 0.12762613594532013, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 59130 + }, + { + "epoch": 0.22510143647754696, + "grad_norm": 0.13078367710113525, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 59140 + }, + { + "epoch": 0.22513949894566962, + "grad_norm": 0.11561431735754013, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 59150 + }, + { + "epoch": 0.2251775614137923, + "grad_norm": 0.12774159014225006, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 59160 + }, + { + "epoch": 0.225215623881915, + "grad_norm": 0.12542405724525452, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 59170 + }, + { + "epoch": 0.22525368635003767, + "grad_norm": 0.1185683012008667, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 59180 + }, + { + "epoch": 0.22529174881816036, + "grad_norm": 0.13364127278327942, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 59190 + }, + { + "epoch": 0.22532981128628304, + "grad_norm": 0.14398641884326935, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 59200 + }, + { + "epoch": 0.22536787375440573, + "grad_norm": 0.1276194155216217, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 59210 + }, + { + "epoch": 0.2254059362225284, + "grad_norm": 0.11286783963441849, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 59220 + }, + { + "epoch": 0.2254439986906511, + "grad_norm": 0.13871227204799652, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 59230 + }, + { + "epoch": 0.22548206115877378, + "grad_norm": 0.11784695833921432, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 59240 + }, + { + "epoch": 0.22552012362689647, + "grad_norm": 0.10824364423751831, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 59250 + }, + { + "epoch": 0.22555818609501915, + "grad_norm": 0.12316665053367615, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 59260 + }, + { + "epoch": 0.22559624856314184, + "grad_norm": 0.12541303038597107, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 59270 + }, + { + "epoch": 0.22563431103126452, + "grad_norm": 0.11731971800327301, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 59280 + }, + { + "epoch": 0.2256723734993872, + "grad_norm": 0.13416261970996857, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 59290 + }, + { + "epoch": 0.22571043596750986, + "grad_norm": 0.1211361289024353, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 59300 + }, + { + "epoch": 0.22574849843563255, + "grad_norm": 0.11899378895759583, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 59310 + }, + { + "epoch": 0.22578656090375523, + "grad_norm": 0.12704631686210632, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 59320 + }, + { + "epoch": 0.22582462337187792, + "grad_norm": 0.13148818910121918, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 59330 + }, + { + "epoch": 0.2258626858400006, + "grad_norm": 0.12430557608604431, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 59340 + }, + { + "epoch": 0.2259007483081233, + "grad_norm": 0.11459135264158249, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 59350 + }, + { + "epoch": 0.22593881077624597, + "grad_norm": 0.13341468572616577, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 59360 + }, + { + "epoch": 0.22597687324436866, + "grad_norm": 0.1272948682308197, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 59370 + }, + { + "epoch": 0.22601493571249134, + "grad_norm": 0.13307124376296997, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 59380 + }, + { + "epoch": 0.22605299818061403, + "grad_norm": 0.12040016055107117, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 59390 + }, + { + "epoch": 0.2260910606487367, + "grad_norm": 0.13233061134815216, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 59400 + }, + { + "epoch": 0.2261291231168594, + "grad_norm": 0.12807966768741608, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 59410 + }, + { + "epoch": 0.22616718558498208, + "grad_norm": 0.13461320102214813, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 59420 + }, + { + "epoch": 0.22620524805310477, + "grad_norm": 0.13165999948978424, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 59430 + }, + { + "epoch": 0.22624331052122743, + "grad_norm": 0.13798262178897858, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 59440 + }, + { + "epoch": 0.2262813729893501, + "grad_norm": 0.11773668974637985, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 59450 + }, + { + "epoch": 0.2263194354574728, + "grad_norm": 0.12125545740127563, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 59460 + }, + { + "epoch": 0.22635749792559548, + "grad_norm": 0.13386575877666473, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 59470 + }, + { + "epoch": 0.22639556039371816, + "grad_norm": 0.14520864188671112, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 59480 + }, + { + "epoch": 0.22643362286184085, + "grad_norm": 0.1348080039024353, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 59490 + }, + { + "epoch": 0.22647168532996353, + "grad_norm": 0.1281449943780899, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 59500 + }, + { + "epoch": 0.22650974779808622, + "grad_norm": 0.11811570823192596, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 59510 + }, + { + "epoch": 0.2265478102662089, + "grad_norm": 0.128734290599823, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 59520 + }, + { + "epoch": 0.2265858727343316, + "grad_norm": 0.1225765272974968, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 59530 + }, + { + "epoch": 0.22662393520245427, + "grad_norm": 0.12594637274742126, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 59540 + }, + { + "epoch": 0.22666199767057696, + "grad_norm": 0.10884319245815277, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 59550 + }, + { + "epoch": 0.22670006013869964, + "grad_norm": 0.1423255354166031, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 59560 + }, + { + "epoch": 0.22673812260682233, + "grad_norm": 0.12744948267936707, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 59570 + }, + { + "epoch": 0.22677618507494499, + "grad_norm": 0.12403052300214767, + "learning_rate": 0.0005, + "loss": 2.15, + "step": 59580 + }, + { + "epoch": 0.22681424754306767, + "grad_norm": 0.12405093014240265, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 59590 + }, + { + "epoch": 0.22685231001119036, + "grad_norm": 0.12157925963401794, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 59600 + }, + { + "epoch": 0.22689037247931304, + "grad_norm": 0.13508589565753937, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 59610 + }, + { + "epoch": 0.22692843494743573, + "grad_norm": 0.13147228956222534, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 59620 + }, + { + "epoch": 0.2269664974155584, + "grad_norm": 0.12281271070241928, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 59630 + }, + { + "epoch": 0.2270045598836811, + "grad_norm": 0.11494458466768265, + "learning_rate": 0.0005, + "loss": 2.1573, + "step": 59640 + }, + { + "epoch": 0.22704262235180378, + "grad_norm": 0.12068134546279907, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 59650 + }, + { + "epoch": 0.22708068481992646, + "grad_norm": 0.1246868148446083, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 59660 + }, + { + "epoch": 0.22711874728804915, + "grad_norm": 0.1122557520866394, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 59670 + }, + { + "epoch": 0.22715680975617183, + "grad_norm": 0.1379440277814865, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 59680 + }, + { + "epoch": 0.22719487222429452, + "grad_norm": 0.11093917489051819, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 59690 + }, + { + "epoch": 0.2272329346924172, + "grad_norm": 0.13247358798980713, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 59700 + }, + { + "epoch": 0.2272709971605399, + "grad_norm": 0.11783503741025925, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 59710 + }, + { + "epoch": 0.22730905962866257, + "grad_norm": 0.13418060541152954, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 59720 + }, + { + "epoch": 0.22734712209678523, + "grad_norm": 0.11433655768632889, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 59730 + }, + { + "epoch": 0.22738518456490792, + "grad_norm": 0.13238011300563812, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 59740 + }, + { + "epoch": 0.2274232470330306, + "grad_norm": 0.13886448740959167, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 59750 + }, + { + "epoch": 0.22746130950115329, + "grad_norm": 0.12896914780139923, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 59760 + }, + { + "epoch": 0.22749937196927597, + "grad_norm": 0.13450363278388977, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 59770 + }, + { + "epoch": 0.22753743443739866, + "grad_norm": 0.1338476538658142, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 59780 + }, + { + "epoch": 0.22757549690552134, + "grad_norm": 0.1098618283867836, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 59790 + }, + { + "epoch": 0.22761355937364403, + "grad_norm": 0.11677566915750504, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 59800 + }, + { + "epoch": 0.2276516218417667, + "grad_norm": 0.11757448315620422, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 59810 + }, + { + "epoch": 0.2276896843098894, + "grad_norm": 0.12055247277021408, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 59820 + }, + { + "epoch": 0.22772774677801208, + "grad_norm": 0.12822188436985016, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 59830 + }, + { + "epoch": 0.22776580924613476, + "grad_norm": 0.1212920993566513, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 59840 + }, + { + "epoch": 0.22780387171425745, + "grad_norm": 0.1271173506975174, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 59850 + }, + { + "epoch": 0.22784193418238013, + "grad_norm": 0.12114972621202469, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 59860 + }, + { + "epoch": 0.2278799966505028, + "grad_norm": 0.12875624001026154, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 59870 + }, + { + "epoch": 0.22791805911862548, + "grad_norm": 0.1280161440372467, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 59880 + }, + { + "epoch": 0.22795612158674816, + "grad_norm": 0.12320836633443832, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 59890 + }, + { + "epoch": 0.22799418405487085, + "grad_norm": 0.1314483880996704, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 59900 + }, + { + "epoch": 0.22803224652299353, + "grad_norm": 0.13151994347572327, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 59910 + }, + { + "epoch": 0.22807030899111622, + "grad_norm": 0.13985653221607208, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 59920 + }, + { + "epoch": 0.2281083714592389, + "grad_norm": 0.12865976989269257, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 59930 + }, + { + "epoch": 0.22814643392736159, + "grad_norm": 0.12414994090795517, + "learning_rate": 0.0005, + "loss": 2.1468, + "step": 59940 + }, + { + "epoch": 0.22818449639548427, + "grad_norm": 0.13476936519145966, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 59950 + }, + { + "epoch": 0.22822255886360696, + "grad_norm": 0.12013564258813858, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 59960 + }, + { + "epoch": 0.22826062133172964, + "grad_norm": 0.11777539551258087, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 59970 + }, + { + "epoch": 0.22829868379985233, + "grad_norm": 0.1243869811296463, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 59980 + }, + { + "epoch": 0.228336746267975, + "grad_norm": 0.11036140471696854, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 59990 + }, + { + "epoch": 0.2283748087360977, + "grad_norm": 0.11138096451759338, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 60000 + }, + { + "epoch": 0.22841287120422038, + "grad_norm": 0.11755473166704178, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 60010 + }, + { + "epoch": 0.22845093367234304, + "grad_norm": 0.11770655959844589, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 60020 + }, + { + "epoch": 0.22848899614046572, + "grad_norm": 0.11558213084936142, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 60030 + }, + { + "epoch": 0.2285270586085884, + "grad_norm": 0.12601837515830994, + "learning_rate": 0.0005, + "loss": 2.1494, + "step": 60040 + }, + { + "epoch": 0.2285651210767111, + "grad_norm": 0.12481823563575745, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 60050 + }, + { + "epoch": 0.22860318354483378, + "grad_norm": 0.11852145940065384, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 60060 + }, + { + "epoch": 0.22864124601295646, + "grad_norm": 0.12432295083999634, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 60070 + }, + { + "epoch": 0.22867930848107915, + "grad_norm": 0.13385199010372162, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 60080 + }, + { + "epoch": 0.22871737094920183, + "grad_norm": 0.14017799496650696, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 60090 + }, + { + "epoch": 0.22875543341732452, + "grad_norm": 0.11703263968229294, + "learning_rate": 0.0005, + "loss": 2.1489, + "step": 60100 + }, + { + "epoch": 0.2287934958854472, + "grad_norm": 0.11561126261949539, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 60110 + }, + { + "epoch": 0.22883155835356989, + "grad_norm": 0.12033192813396454, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 60120 + }, + { + "epoch": 0.22886962082169257, + "grad_norm": 0.14562615752220154, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 60130 + }, + { + "epoch": 0.22890768328981526, + "grad_norm": 0.11416006088256836, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 60140 + }, + { + "epoch": 0.22894574575793794, + "grad_norm": 0.13422337174415588, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 60150 + }, + { + "epoch": 0.2289838082260606, + "grad_norm": 0.1290620118379593, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 60160 + }, + { + "epoch": 0.22902187069418328, + "grad_norm": 0.11742956936359406, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 60170 + }, + { + "epoch": 0.22905993316230597, + "grad_norm": 0.11944698542356491, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 60180 + }, + { + "epoch": 0.22909799563042865, + "grad_norm": 0.11954623460769653, + "learning_rate": 0.0005, + "loss": 2.1491, + "step": 60190 + }, + { + "epoch": 0.22913605809855134, + "grad_norm": 0.12461133301258087, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 60200 + }, + { + "epoch": 0.22917412056667402, + "grad_norm": 0.12100213766098022, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 60210 + }, + { + "epoch": 0.2292121830347967, + "grad_norm": 0.14461475610733032, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 60220 + }, + { + "epoch": 0.2292502455029194, + "grad_norm": 0.11810819059610367, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 60230 + }, + { + "epoch": 0.22928830797104208, + "grad_norm": 0.1200481727719307, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 60240 + }, + { + "epoch": 0.22932637043916476, + "grad_norm": 0.12129189074039459, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 60250 + }, + { + "epoch": 0.22936443290728745, + "grad_norm": 0.1197345107793808, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 60260 + }, + { + "epoch": 0.22940249537541013, + "grad_norm": 0.13456512987613678, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 60270 + }, + { + "epoch": 0.22944055784353282, + "grad_norm": 0.1252896934747696, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 60280 + }, + { + "epoch": 0.2294786203116555, + "grad_norm": 0.11544041335582733, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 60290 + }, + { + "epoch": 0.22951668277977816, + "grad_norm": 0.13014082610607147, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 60300 + }, + { + "epoch": 0.22955474524790084, + "grad_norm": 0.11489399522542953, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 60310 + }, + { + "epoch": 0.22959280771602353, + "grad_norm": 0.11891859769821167, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 60320 + }, + { + "epoch": 0.2296308701841462, + "grad_norm": 0.11885011196136475, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 60330 + }, + { + "epoch": 0.2296689326522689, + "grad_norm": 0.1199352964758873, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 60340 + }, + { + "epoch": 0.22970699512039158, + "grad_norm": 0.14349274337291718, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 60350 + }, + { + "epoch": 0.22974505758851427, + "grad_norm": 0.15010002255439758, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 60360 + }, + { + "epoch": 0.22978312005663695, + "grad_norm": 0.13680097460746765, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 60370 + }, + { + "epoch": 0.22982118252475964, + "grad_norm": 0.12735390663146973, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 60380 + }, + { + "epoch": 0.22985924499288232, + "grad_norm": 0.13412439823150635, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 60390 + }, + { + "epoch": 0.229897307461005, + "grad_norm": 0.11966119706630707, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 60400 + }, + { + "epoch": 0.2299353699291277, + "grad_norm": 0.11467495560646057, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 60410 + }, + { + "epoch": 0.22997343239725038, + "grad_norm": 0.12715892493724823, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 60420 + }, + { + "epoch": 0.23001149486537306, + "grad_norm": 0.13274915516376495, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 60430 + }, + { + "epoch": 0.23004955733349575, + "grad_norm": 0.12153349071741104, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 60440 + }, + { + "epoch": 0.2300876198016184, + "grad_norm": 0.11797938495874405, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 60450 + }, + { + "epoch": 0.2301256822697411, + "grad_norm": 0.1457214504480362, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 60460 + }, + { + "epoch": 0.23016374473786377, + "grad_norm": 0.12316471338272095, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 60470 + }, + { + "epoch": 0.23020180720598646, + "grad_norm": 0.11603264510631561, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 60480 + }, + { + "epoch": 0.23023986967410914, + "grad_norm": 0.1149735152721405, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 60490 + }, + { + "epoch": 0.23027793214223183, + "grad_norm": 0.1266094446182251, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 60500 + }, + { + "epoch": 0.2303159946103545, + "grad_norm": 0.13430431485176086, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 60510 + }, + { + "epoch": 0.2303540570784772, + "grad_norm": 0.12228171527385712, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 60520 + }, + { + "epoch": 0.23039211954659988, + "grad_norm": 0.12757080793380737, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 60530 + }, + { + "epoch": 0.23043018201472257, + "grad_norm": 0.12125714123249054, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 60540 + }, + { + "epoch": 0.23046824448284525, + "grad_norm": 0.11723601073026657, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 60550 + }, + { + "epoch": 0.23050630695096794, + "grad_norm": 0.11978837847709656, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 60560 + }, + { + "epoch": 0.23054436941909062, + "grad_norm": 0.12238425761461258, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 60570 + }, + { + "epoch": 0.2305824318872133, + "grad_norm": 0.13493356108665466, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 60580 + }, + { + "epoch": 0.23062049435533596, + "grad_norm": 0.1277237832546234, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 60590 + }, + { + "epoch": 0.23065855682345865, + "grad_norm": 0.12236456573009491, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 60600 + }, + { + "epoch": 0.23069661929158133, + "grad_norm": 0.14096243679523468, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 60610 + }, + { + "epoch": 0.23073468175970402, + "grad_norm": 0.13399477303028107, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 60620 + }, + { + "epoch": 0.2307727442278267, + "grad_norm": 0.12829075753688812, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 60630 + }, + { + "epoch": 0.2308108066959494, + "grad_norm": 0.11446889489889145, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 60640 + }, + { + "epoch": 0.23084886916407207, + "grad_norm": 0.12685668468475342, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 60650 + }, + { + "epoch": 0.23088693163219476, + "grad_norm": 0.13323184847831726, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 60660 + }, + { + "epoch": 0.23092499410031744, + "grad_norm": 0.13773372769355774, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 60670 + }, + { + "epoch": 0.23096305656844013, + "grad_norm": 0.1213175356388092, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 60680 + }, + { + "epoch": 0.2310011190365628, + "grad_norm": 0.12439834326505661, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 60690 + }, + { + "epoch": 0.2310391815046855, + "grad_norm": 0.11664614081382751, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 60700 + }, + { + "epoch": 0.23107724397280818, + "grad_norm": 0.12157955765724182, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 60710 + }, + { + "epoch": 0.23111530644093087, + "grad_norm": 0.13951872289180756, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 60720 + }, + { + "epoch": 0.23115336890905352, + "grad_norm": 0.12050854414701462, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 60730 + }, + { + "epoch": 0.2311914313771762, + "grad_norm": 0.13324572145938873, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 60740 + }, + { + "epoch": 0.2312294938452989, + "grad_norm": 0.13760831952095032, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 60750 + }, + { + "epoch": 0.23126755631342158, + "grad_norm": 0.12362300604581833, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 60760 + }, + { + "epoch": 0.23130561878154426, + "grad_norm": 0.12636949121952057, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 60770 + }, + { + "epoch": 0.23134368124966695, + "grad_norm": 0.1140715703368187, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 60780 + }, + { + "epoch": 0.23138174371778963, + "grad_norm": 0.13143981993198395, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 60790 + }, + { + "epoch": 0.23141980618591232, + "grad_norm": 0.1388804316520691, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 60800 + }, + { + "epoch": 0.231457868654035, + "grad_norm": 0.15259552001953125, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 60810 + }, + { + "epoch": 0.2314959311221577, + "grad_norm": 0.130338653922081, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 60820 + }, + { + "epoch": 0.23153399359028037, + "grad_norm": 0.12489153444766998, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 60830 + }, + { + "epoch": 0.23157205605840306, + "grad_norm": 0.133027121424675, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 60840 + }, + { + "epoch": 0.23161011852652574, + "grad_norm": 0.13067150115966797, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 60850 + }, + { + "epoch": 0.23164818099464843, + "grad_norm": 0.11416888982057571, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 60860 + }, + { + "epoch": 0.2316862434627711, + "grad_norm": 0.1309850960969925, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 60870 + }, + { + "epoch": 0.23172430593089377, + "grad_norm": 0.12706001102924347, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 60880 + }, + { + "epoch": 0.23176236839901646, + "grad_norm": 0.12186629325151443, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 60890 + }, + { + "epoch": 0.23180043086713914, + "grad_norm": 0.11978558450937271, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 60900 + }, + { + "epoch": 0.23183849333526182, + "grad_norm": 0.13007833063602448, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 60910 + }, + { + "epoch": 0.2318765558033845, + "grad_norm": 0.12153036892414093, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 60920 + }, + { + "epoch": 0.2319146182715072, + "grad_norm": 0.12330733239650726, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 60930 + }, + { + "epoch": 0.23195268073962988, + "grad_norm": 0.11360269784927368, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 60940 + }, + { + "epoch": 0.23199074320775256, + "grad_norm": 0.11848775297403336, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 60950 + }, + { + "epoch": 0.23202880567587525, + "grad_norm": 0.1268378645181656, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 60960 + }, + { + "epoch": 0.23206686814399793, + "grad_norm": 0.11616990715265274, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 60970 + }, + { + "epoch": 0.23210493061212062, + "grad_norm": 0.11881053447723389, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 60980 + }, + { + "epoch": 0.2321429930802433, + "grad_norm": 0.12208161503076553, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 60990 + }, + { + "epoch": 0.232181055548366, + "grad_norm": 0.12031666934490204, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 61000 + }, + { + "epoch": 0.23221911801648867, + "grad_norm": 0.12577299773693085, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 61010 + }, + { + "epoch": 0.23225718048461133, + "grad_norm": 0.12651118636131287, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 61020 + }, + { + "epoch": 0.23229524295273402, + "grad_norm": 0.1322355568408966, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 61030 + }, + { + "epoch": 0.2323333054208567, + "grad_norm": 0.11694405227899551, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 61040 + }, + { + "epoch": 0.23237136788897939, + "grad_norm": 0.12027653306722641, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 61050 + }, + { + "epoch": 0.23240943035710207, + "grad_norm": 0.1370190531015396, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 61060 + }, + { + "epoch": 0.23244749282522476, + "grad_norm": 0.1162046492099762, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 61070 + }, + { + "epoch": 0.23248555529334744, + "grad_norm": 0.13960926234722137, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 61080 + }, + { + "epoch": 0.23252361776147012, + "grad_norm": 0.1295863837003708, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 61090 + }, + { + "epoch": 0.2325616802295928, + "grad_norm": 0.140008807182312, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 61100 + }, + { + "epoch": 0.2325997426977155, + "grad_norm": 0.13163819909095764, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 61110 + }, + { + "epoch": 0.23263780516583818, + "grad_norm": 0.12477041780948639, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 61120 + }, + { + "epoch": 0.23267586763396086, + "grad_norm": 0.1220005601644516, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 61130 + }, + { + "epoch": 0.23271393010208355, + "grad_norm": 0.11905595660209656, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 61140 + }, + { + "epoch": 0.23275199257020623, + "grad_norm": 0.11650003492832184, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 61150 + }, + { + "epoch": 0.23279005503832892, + "grad_norm": 0.12131864577531815, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 61160 + }, + { + "epoch": 0.23282811750645158, + "grad_norm": 0.12896190583705902, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 61170 + }, + { + "epoch": 0.23286617997457426, + "grad_norm": 0.13321678340435028, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 61180 + }, + { + "epoch": 0.23290424244269695, + "grad_norm": 0.11942190676927567, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 61190 + }, + { + "epoch": 0.23294230491081963, + "grad_norm": 0.11913148313760757, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 61200 + }, + { + "epoch": 0.23298036737894232, + "grad_norm": 0.4585500955581665, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 61210 + }, + { + "epoch": 0.233018429847065, + "grad_norm": 0.12030050158500671, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 61220 + }, + { + "epoch": 0.23305649231518769, + "grad_norm": 0.4939073324203491, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 61230 + }, + { + "epoch": 0.23309455478331037, + "grad_norm": 0.13352811336517334, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 61240 + }, + { + "epoch": 0.23313261725143306, + "grad_norm": 0.1354418694972992, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 61250 + }, + { + "epoch": 0.23317067971955574, + "grad_norm": 0.13711479306221008, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 61260 + }, + { + "epoch": 0.23320874218767843, + "grad_norm": 0.11538518220186234, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 61270 + }, + { + "epoch": 0.2332468046558011, + "grad_norm": 0.13111010193824768, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 61280 + }, + { + "epoch": 0.2332848671239238, + "grad_norm": 0.13194629549980164, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 61290 + }, + { + "epoch": 0.23332292959204648, + "grad_norm": 0.11781266331672668, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 61300 + }, + { + "epoch": 0.23336099206016914, + "grad_norm": 0.12541015446186066, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 61310 + }, + { + "epoch": 0.23339905452829182, + "grad_norm": 0.13061216473579407, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 61320 + }, + { + "epoch": 0.2334371169964145, + "grad_norm": 0.12053248286247253, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 61330 + }, + { + "epoch": 0.2334751794645372, + "grad_norm": 0.12311103194952011, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 61340 + }, + { + "epoch": 0.23351324193265988, + "grad_norm": 0.13860678672790527, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 61350 + }, + { + "epoch": 0.23355130440078256, + "grad_norm": 0.11981858313083649, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 61360 + }, + { + "epoch": 0.23358936686890525, + "grad_norm": 0.146539106965065, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 61370 + }, + { + "epoch": 0.23362742933702793, + "grad_norm": 0.1242060512304306, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 61380 + }, + { + "epoch": 0.23366549180515062, + "grad_norm": 0.12461848556995392, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 61390 + }, + { + "epoch": 0.2337035542732733, + "grad_norm": 0.12711970508098602, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 61400 + }, + { + "epoch": 0.23374161674139599, + "grad_norm": 0.13660112023353577, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 61410 + }, + { + "epoch": 0.23377967920951867, + "grad_norm": 0.13711483776569366, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 61420 + }, + { + "epoch": 0.23381774167764136, + "grad_norm": 0.11872289329767227, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 61430 + }, + { + "epoch": 0.23385580414576404, + "grad_norm": 0.12258458882570267, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 61440 + }, + { + "epoch": 0.2338938666138867, + "grad_norm": 0.1237039715051651, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 61450 + }, + { + "epoch": 0.23393192908200938, + "grad_norm": 0.11179503053426743, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 61460 + }, + { + "epoch": 0.23396999155013207, + "grad_norm": 0.12895576655864716, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 61470 + }, + { + "epoch": 0.23400805401825475, + "grad_norm": 0.12361761927604675, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 61480 + }, + { + "epoch": 0.23404611648637744, + "grad_norm": 0.11232542246580124, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 61490 + }, + { + "epoch": 0.23408417895450012, + "grad_norm": 0.12916137278079987, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 61500 + }, + { + "epoch": 0.2341222414226228, + "grad_norm": 0.12421401590108871, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 61510 + }, + { + "epoch": 0.2341603038907455, + "grad_norm": 0.12263153493404388, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 61520 + }, + { + "epoch": 0.23419836635886818, + "grad_norm": 0.11718087643384933, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 61530 + }, + { + "epoch": 0.23423642882699086, + "grad_norm": 0.13158558309078217, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 61540 + }, + { + "epoch": 0.23427449129511355, + "grad_norm": 0.13179059326648712, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 61550 + }, + { + "epoch": 0.23431255376323623, + "grad_norm": 0.12416069954633713, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 61560 + }, + { + "epoch": 0.23435061623135892, + "grad_norm": 0.12887023389339447, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 61570 + }, + { + "epoch": 0.2343886786994816, + "grad_norm": 0.11714029312133789, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 61580 + }, + { + "epoch": 0.23442674116760429, + "grad_norm": 0.8283962607383728, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 61590 + }, + { + "epoch": 0.23446480363572694, + "grad_norm": 0.12651962041854858, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 61600 + }, + { + "epoch": 0.23450286610384963, + "grad_norm": 0.11088352650403976, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 61610 + }, + { + "epoch": 0.2345409285719723, + "grad_norm": 0.14070887863636017, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 61620 + }, + { + "epoch": 0.234578991040095, + "grad_norm": 0.1269426941871643, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 61630 + }, + { + "epoch": 0.23461705350821768, + "grad_norm": 0.11625342071056366, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 61640 + }, + { + "epoch": 0.23465511597634037, + "grad_norm": 0.1273365020751953, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 61650 + }, + { + "epoch": 0.23469317844446305, + "grad_norm": 0.1282196342945099, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 61660 + }, + { + "epoch": 0.23473124091258574, + "grad_norm": 0.1189308762550354, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 61670 + }, + { + "epoch": 0.23476930338070842, + "grad_norm": 0.12329194694757462, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 61680 + }, + { + "epoch": 0.2348073658488311, + "grad_norm": 0.13189809024333954, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 61690 + }, + { + "epoch": 0.2348454283169538, + "grad_norm": 0.12398454546928406, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 61700 + }, + { + "epoch": 0.23488349078507648, + "grad_norm": 0.14037716388702393, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 61710 + }, + { + "epoch": 0.23492155325319916, + "grad_norm": 0.12988056242465973, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 61720 + }, + { + "epoch": 0.23495961572132185, + "grad_norm": 0.11807936429977417, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 61730 + }, + { + "epoch": 0.2349976781894445, + "grad_norm": 0.12387745827436447, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 61740 + }, + { + "epoch": 0.2350357406575672, + "grad_norm": 0.1267496794462204, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 61750 + }, + { + "epoch": 0.23507380312568987, + "grad_norm": 0.13048484921455383, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 61760 + }, + { + "epoch": 0.23511186559381256, + "grad_norm": 0.12397941201925278, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 61770 + }, + { + "epoch": 0.23514992806193524, + "grad_norm": 0.1166549026966095, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 61780 + }, + { + "epoch": 0.23518799053005793, + "grad_norm": 0.11860918253660202, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 61790 + }, + { + "epoch": 0.2352260529981806, + "grad_norm": 0.1338878870010376, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 61800 + }, + { + "epoch": 0.2352641154663033, + "grad_norm": 0.12876847386360168, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 61810 + }, + { + "epoch": 0.23530217793442598, + "grad_norm": 0.12465507537126541, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 61820 + }, + { + "epoch": 0.23534024040254867, + "grad_norm": 0.11806610971689224, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 61830 + }, + { + "epoch": 0.23537830287067135, + "grad_norm": 0.19733448326587677, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 61840 + }, + { + "epoch": 0.23541636533879404, + "grad_norm": 0.12105879187583923, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 61850 + }, + { + "epoch": 0.23545442780691672, + "grad_norm": 0.11597780138254166, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 61860 + }, + { + "epoch": 0.2354924902750394, + "grad_norm": 0.137285053730011, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 61870 + }, + { + "epoch": 0.23553055274316206, + "grad_norm": 0.12559430301189423, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 61880 + }, + { + "epoch": 0.23556861521128475, + "grad_norm": 0.12780259549617767, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 61890 + }, + { + "epoch": 0.23560667767940743, + "grad_norm": 0.13546954095363617, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 61900 + }, + { + "epoch": 0.23564474014753012, + "grad_norm": 0.12587662041187286, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 61910 + }, + { + "epoch": 0.2356828026156528, + "grad_norm": 0.14555040001869202, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 61920 + }, + { + "epoch": 0.2357208650837755, + "grad_norm": 0.11890752613544464, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 61930 + }, + { + "epoch": 0.23575892755189817, + "grad_norm": 0.1446986049413681, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 61940 + }, + { + "epoch": 0.23579699002002086, + "grad_norm": 0.11749228090047836, + "learning_rate": 0.0005, + "loss": 2.148, + "step": 61950 + }, + { + "epoch": 0.23583505248814354, + "grad_norm": 0.12498153746128082, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 61960 + }, + { + "epoch": 0.23587311495626623, + "grad_norm": 0.13425499200820923, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 61970 + }, + { + "epoch": 0.2359111774243889, + "grad_norm": 0.13690124452114105, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 61980 + }, + { + "epoch": 0.2359492398925116, + "grad_norm": 0.1303202509880066, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 61990 + }, + { + "epoch": 0.23598730236063428, + "grad_norm": 0.12199945747852325, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 62000 + }, + { + "epoch": 0.23602536482875697, + "grad_norm": 0.12263193726539612, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 62010 + }, + { + "epoch": 0.23606342729687965, + "grad_norm": 0.12076544016599655, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 62020 + }, + { + "epoch": 0.2361014897650023, + "grad_norm": 0.12237869948148727, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 62030 + }, + { + "epoch": 0.236139552233125, + "grad_norm": 0.12361596524715424, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 62040 + }, + { + "epoch": 0.23617761470124768, + "grad_norm": 0.12535211443901062, + "learning_rate": 0.0005, + "loss": 2.1528, + "step": 62050 + }, + { + "epoch": 0.23621567716937036, + "grad_norm": 0.1265745609998703, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 62060 + }, + { + "epoch": 0.23625373963749305, + "grad_norm": 0.11701487004756927, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 62070 + }, + { + "epoch": 0.23629180210561573, + "grad_norm": 0.1229015588760376, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 62080 + }, + { + "epoch": 0.23632986457373842, + "grad_norm": 0.1286887526512146, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 62090 + }, + { + "epoch": 0.2363679270418611, + "grad_norm": 0.12128289043903351, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 62100 + }, + { + "epoch": 0.2364059895099838, + "grad_norm": 0.12501858174800873, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 62110 + }, + { + "epoch": 0.23644405197810647, + "grad_norm": 0.14402461051940918, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 62120 + }, + { + "epoch": 0.23648211444622916, + "grad_norm": 0.13072051107883453, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 62130 + }, + { + "epoch": 0.23652017691435184, + "grad_norm": 0.11157186329364777, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 62140 + }, + { + "epoch": 0.23655823938247453, + "grad_norm": 0.1368582844734192, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 62150 + }, + { + "epoch": 0.2365963018505972, + "grad_norm": 0.12076527625322342, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 62160 + }, + { + "epoch": 0.23663436431871987, + "grad_norm": 0.23976096510887146, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 62170 + }, + { + "epoch": 0.23667242678684255, + "grad_norm": 0.14966896176338196, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 62180 + }, + { + "epoch": 0.23671048925496524, + "grad_norm": 0.12820686399936676, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 62190 + }, + { + "epoch": 0.23674855172308792, + "grad_norm": 0.12050171941518784, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 62200 + }, + { + "epoch": 0.2367866141912106, + "grad_norm": 0.12259026616811752, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 62210 + }, + { + "epoch": 0.2368246766593333, + "grad_norm": 0.13420963287353516, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 62220 + }, + { + "epoch": 0.23686273912745598, + "grad_norm": 0.13222679495811462, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 62230 + }, + { + "epoch": 0.23690080159557866, + "grad_norm": 0.11894519627094269, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 62240 + }, + { + "epoch": 0.23693886406370135, + "grad_norm": 0.12254227697849274, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 62250 + }, + { + "epoch": 0.23697692653182403, + "grad_norm": 0.12389254570007324, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 62260 + }, + { + "epoch": 0.23701498899994672, + "grad_norm": 0.11126727610826492, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 62270 + }, + { + "epoch": 0.2370530514680694, + "grad_norm": 0.1380900889635086, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 62280 + }, + { + "epoch": 0.2370911139361921, + "grad_norm": 0.12606385350227356, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 62290 + }, + { + "epoch": 0.23712917640431477, + "grad_norm": 0.12130409479141235, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 62300 + }, + { + "epoch": 0.23716723887243746, + "grad_norm": 0.11956527084112167, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 62310 + }, + { + "epoch": 0.23720530134056012, + "grad_norm": 0.11794503778219223, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 62320 + }, + { + "epoch": 0.2372433638086828, + "grad_norm": 0.12638238072395325, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 62330 + }, + { + "epoch": 0.23728142627680548, + "grad_norm": 0.1229153424501419, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 62340 + }, + { + "epoch": 0.23731948874492817, + "grad_norm": 0.12465435266494751, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 62350 + }, + { + "epoch": 0.23735755121305085, + "grad_norm": 0.11661525815725327, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 62360 + }, + { + "epoch": 0.23739561368117354, + "grad_norm": 0.1256762146949768, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 62370 + }, + { + "epoch": 0.23743367614929622, + "grad_norm": 0.13339322805404663, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 62380 + }, + { + "epoch": 0.2374717386174189, + "grad_norm": 0.12632669508457184, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 62390 + }, + { + "epoch": 0.2375098010855416, + "grad_norm": 0.12359143048524857, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 62400 + }, + { + "epoch": 0.23754786355366428, + "grad_norm": 0.12854132056236267, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 62410 + }, + { + "epoch": 0.23758592602178696, + "grad_norm": 0.13429182767868042, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 62420 + }, + { + "epoch": 0.23762398848990965, + "grad_norm": 0.12218470126390457, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 62430 + }, + { + "epoch": 0.23766205095803233, + "grad_norm": 0.13051879405975342, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 62440 + }, + { + "epoch": 0.23770011342615502, + "grad_norm": 0.12221650779247284, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 62450 + }, + { + "epoch": 0.23773817589427768, + "grad_norm": 0.12356175482273102, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 62460 + }, + { + "epoch": 0.23777623836240036, + "grad_norm": 0.12108529359102249, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 62470 + }, + { + "epoch": 0.23781430083052305, + "grad_norm": 0.1264612227678299, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 62480 + }, + { + "epoch": 0.23785236329864573, + "grad_norm": 0.12323181331157684, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 62490 + }, + { + "epoch": 0.23789042576676842, + "grad_norm": 0.11964374780654907, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 62500 + }, + { + "epoch": 0.2379284882348911, + "grad_norm": 0.12805040180683136, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 62510 + }, + { + "epoch": 0.23796655070301378, + "grad_norm": 0.14057007431983948, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 62520 + }, + { + "epoch": 0.23800461317113647, + "grad_norm": 0.1295660436153412, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 62530 + }, + { + "epoch": 0.23804267563925915, + "grad_norm": 0.14525508880615234, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 62540 + }, + { + "epoch": 0.23808073810738184, + "grad_norm": 0.13063256442546844, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 62550 + }, + { + "epoch": 0.23811880057550452, + "grad_norm": 0.11782248318195343, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 62560 + }, + { + "epoch": 0.2381568630436272, + "grad_norm": 0.13702130317687988, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 62570 + }, + { + "epoch": 0.2381949255117499, + "grad_norm": 0.11541248112916946, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 62580 + }, + { + "epoch": 0.23823298797987258, + "grad_norm": 0.12375971674919128, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 62590 + }, + { + "epoch": 0.23827105044799524, + "grad_norm": 0.1241425946354866, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 62600 + }, + { + "epoch": 0.23830911291611792, + "grad_norm": 0.14192144572734833, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 62610 + }, + { + "epoch": 0.2383471753842406, + "grad_norm": 0.12440304458141327, + "learning_rate": 0.0005, + "loss": 2.1526, + "step": 62620 + }, + { + "epoch": 0.2383852378523633, + "grad_norm": 0.1185927540063858, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 62630 + }, + { + "epoch": 0.23842330032048598, + "grad_norm": 0.11676016449928284, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 62640 + }, + { + "epoch": 0.23846136278860866, + "grad_norm": 0.12082359194755554, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 62650 + }, + { + "epoch": 0.23849942525673135, + "grad_norm": 0.12140568345785141, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 62660 + }, + { + "epoch": 0.23853748772485403, + "grad_norm": 0.11932049691677094, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 62670 + }, + { + "epoch": 0.23857555019297672, + "grad_norm": 0.13034158945083618, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 62680 + }, + { + "epoch": 0.2386136126610994, + "grad_norm": 0.12311530858278275, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 62690 + }, + { + "epoch": 0.23865167512922209, + "grad_norm": 0.1283925622701645, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 62700 + }, + { + "epoch": 0.23868973759734477, + "grad_norm": 0.11840459704399109, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 62710 + }, + { + "epoch": 0.23872780006546745, + "grad_norm": 0.12968389689922333, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 62720 + }, + { + "epoch": 0.23876586253359014, + "grad_norm": 0.11802906543016434, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 62730 + }, + { + "epoch": 0.23880392500171282, + "grad_norm": 0.12069299817085266, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 62740 + }, + { + "epoch": 0.23884198746983548, + "grad_norm": 0.12242452800273895, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 62750 + }, + { + "epoch": 0.23888004993795817, + "grad_norm": 0.13938666880130768, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 62760 + }, + { + "epoch": 0.23891811240608085, + "grad_norm": 0.1349049061536789, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 62770 + }, + { + "epoch": 0.23895617487420354, + "grad_norm": 0.12318526953458786, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 62780 + }, + { + "epoch": 0.23899423734232622, + "grad_norm": 0.11634445190429688, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 62790 + }, + { + "epoch": 0.2390322998104489, + "grad_norm": 0.13093838095664978, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 62800 + }, + { + "epoch": 0.2390703622785716, + "grad_norm": 0.11626966297626495, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 62810 + }, + { + "epoch": 0.23910842474669428, + "grad_norm": 0.12809191644191742, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 62820 + }, + { + "epoch": 0.23914648721481696, + "grad_norm": 0.11999858915805817, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 62830 + }, + { + "epoch": 0.23918454968293965, + "grad_norm": 0.13391542434692383, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 62840 + }, + { + "epoch": 0.23922261215106233, + "grad_norm": 0.12422390282154083, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 62850 + }, + { + "epoch": 0.23926067461918502, + "grad_norm": 0.12316978722810745, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 62860 + }, + { + "epoch": 0.2392987370873077, + "grad_norm": 0.132602259516716, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 62870 + }, + { + "epoch": 0.23933679955543039, + "grad_norm": 0.1147371381521225, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 62880 + }, + { + "epoch": 0.23937486202355304, + "grad_norm": 0.11935495585203171, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 62890 + }, + { + "epoch": 0.23941292449167573, + "grad_norm": 0.11032240092754364, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 62900 + }, + { + "epoch": 0.2394509869597984, + "grad_norm": 0.12904879450798035, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 62910 + }, + { + "epoch": 0.2394890494279211, + "grad_norm": 0.12383519113063812, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 62920 + }, + { + "epoch": 0.23952711189604378, + "grad_norm": 0.1210516095161438, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 62930 + }, + { + "epoch": 0.23956517436416647, + "grad_norm": 0.1306571364402771, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 62940 + }, + { + "epoch": 0.23960323683228915, + "grad_norm": 0.12337831407785416, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 62950 + }, + { + "epoch": 0.23964129930041184, + "grad_norm": 0.12242639809846878, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 62960 + }, + { + "epoch": 0.23967936176853452, + "grad_norm": 0.14051468670368195, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 62970 + }, + { + "epoch": 0.2397174242366572, + "grad_norm": 0.13228604197502136, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 62980 + }, + { + "epoch": 0.2397554867047799, + "grad_norm": 0.11779261380434036, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 62990 + }, + { + "epoch": 0.23979354917290258, + "grad_norm": 0.12763015925884247, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 63000 + }, + { + "epoch": 0.23983161164102526, + "grad_norm": 0.13263078033924103, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 63010 + }, + { + "epoch": 0.23986967410914795, + "grad_norm": 0.12094102799892426, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 63020 + }, + { + "epoch": 0.2399077365772706, + "grad_norm": 0.11784138530492783, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 63030 + }, + { + "epoch": 0.2399457990453933, + "grad_norm": 0.1280038058757782, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 63040 + }, + { + "epoch": 0.23998386151351597, + "grad_norm": 0.12300426512956619, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 63050 + }, + { + "epoch": 0.24002192398163866, + "grad_norm": 0.1237001046538353, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 63060 + }, + { + "epoch": 0.24005998644976134, + "grad_norm": 0.15138910710811615, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 63070 + }, + { + "epoch": 0.24009804891788403, + "grad_norm": 0.15691739320755005, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 63080 + }, + { + "epoch": 0.2401361113860067, + "grad_norm": 0.12051752209663391, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 63090 + }, + { + "epoch": 0.2401741738541294, + "grad_norm": 0.12068517506122589, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 63100 + }, + { + "epoch": 0.24021223632225208, + "grad_norm": 0.12128940969705582, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 63110 + }, + { + "epoch": 0.24025029879037477, + "grad_norm": 0.1229841560125351, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 63120 + }, + { + "epoch": 0.24028836125849745, + "grad_norm": 0.13807646930217743, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 63130 + }, + { + "epoch": 0.24032642372662014, + "grad_norm": 0.13278523087501526, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 63140 + }, + { + "epoch": 0.24036448619474282, + "grad_norm": 0.11325494199991226, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 63150 + }, + { + "epoch": 0.2404025486628655, + "grad_norm": 0.11987940222024918, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 63160 + }, + { + "epoch": 0.2404406111309882, + "grad_norm": 0.14351606369018555, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 63170 + }, + { + "epoch": 0.24047867359911085, + "grad_norm": 0.1169760599732399, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 63180 + }, + { + "epoch": 0.24051673606723353, + "grad_norm": 0.14521263539791107, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 63190 + }, + { + "epoch": 0.24055479853535622, + "grad_norm": 0.14447267353534698, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 63200 + }, + { + "epoch": 0.2405928610034789, + "grad_norm": 0.12431297451257706, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 63210 + }, + { + "epoch": 0.2406309234716016, + "grad_norm": 0.1190367341041565, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 63220 + }, + { + "epoch": 0.24066898593972427, + "grad_norm": 0.11925860494375229, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 63230 + }, + { + "epoch": 0.24070704840784696, + "grad_norm": 0.11938245594501495, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 63240 + }, + { + "epoch": 0.24074511087596964, + "grad_norm": 0.12791714072227478, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 63250 + }, + { + "epoch": 0.24078317334409233, + "grad_norm": 0.12287945300340652, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 63260 + }, + { + "epoch": 0.240821235812215, + "grad_norm": 0.13500581681728363, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 63270 + }, + { + "epoch": 0.2408592982803377, + "grad_norm": 0.14133571088314056, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 63280 + }, + { + "epoch": 0.24089736074846038, + "grad_norm": 0.11892928928136826, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 63290 + }, + { + "epoch": 0.24093542321658307, + "grad_norm": 0.13436049222946167, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 63300 + }, + { + "epoch": 0.24097348568470575, + "grad_norm": 0.12387977540493011, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 63310 + }, + { + "epoch": 0.2410115481528284, + "grad_norm": 0.11793079972267151, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 63320 + }, + { + "epoch": 0.2410496106209511, + "grad_norm": 0.12627609074115753, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 63330 + }, + { + "epoch": 0.24108767308907378, + "grad_norm": 0.12230251729488373, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 63340 + }, + { + "epoch": 0.24112573555719646, + "grad_norm": 0.1256856471300125, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 63350 + }, + { + "epoch": 0.24116379802531915, + "grad_norm": 0.14163659512996674, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 63360 + }, + { + "epoch": 0.24120186049344183, + "grad_norm": 0.12445111572742462, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 63370 + }, + { + "epoch": 0.24123992296156452, + "grad_norm": 0.13497750461101532, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 63380 + }, + { + "epoch": 0.2412779854296872, + "grad_norm": 0.15080420672893524, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 63390 + }, + { + "epoch": 0.2413160478978099, + "grad_norm": 0.1224711537361145, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 63400 + }, + { + "epoch": 0.24135411036593257, + "grad_norm": 0.1363290399312973, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 63410 + }, + { + "epoch": 0.24139217283405526, + "grad_norm": 0.12231992930173874, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 63420 + }, + { + "epoch": 0.24143023530217794, + "grad_norm": 0.12203171104192734, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 63430 + }, + { + "epoch": 0.24146829777030063, + "grad_norm": 0.12021537870168686, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 63440 + }, + { + "epoch": 0.2415063602384233, + "grad_norm": 0.11745815724134445, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 63450 + }, + { + "epoch": 0.241544422706546, + "grad_norm": 0.14927086234092712, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 63460 + }, + { + "epoch": 0.24158248517466865, + "grad_norm": 0.13094308972358704, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 63470 + }, + { + "epoch": 0.24162054764279134, + "grad_norm": 0.12097480893135071, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 63480 + }, + { + "epoch": 0.24165861011091402, + "grad_norm": 0.12082047015428543, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 63490 + }, + { + "epoch": 0.2416966725790367, + "grad_norm": 0.14711803197860718, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 63500 + }, + { + "epoch": 0.2417347350471594, + "grad_norm": 0.13210797309875488, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 63510 + }, + { + "epoch": 0.24177279751528208, + "grad_norm": 0.12244979292154312, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 63520 + }, + { + "epoch": 0.24181085998340476, + "grad_norm": 0.12733928859233856, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 63530 + }, + { + "epoch": 0.24184892245152745, + "grad_norm": 0.14031542837619781, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 63540 + }, + { + "epoch": 0.24188698491965013, + "grad_norm": 0.1467483937740326, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 63550 + }, + { + "epoch": 0.24192504738777282, + "grad_norm": 0.11587478220462799, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 63560 + }, + { + "epoch": 0.2419631098558955, + "grad_norm": 0.11463305354118347, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 63570 + }, + { + "epoch": 0.2420011723240182, + "grad_norm": 0.1248435229063034, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 63580 + }, + { + "epoch": 0.24203923479214087, + "grad_norm": 0.13611268997192383, + "learning_rate": 0.0005, + "loss": 2.1457, + "step": 63590 + }, + { + "epoch": 0.24207729726026356, + "grad_norm": 0.14438386261463165, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 63600 + }, + { + "epoch": 0.24211535972838621, + "grad_norm": 0.13003243505954742, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 63610 + }, + { + "epoch": 0.2421534221965089, + "grad_norm": 0.13023284077644348, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 63620 + }, + { + "epoch": 0.24219148466463158, + "grad_norm": 0.12970775365829468, + "learning_rate": 0.0005, + "loss": 2.1473, + "step": 63630 + }, + { + "epoch": 0.24222954713275427, + "grad_norm": 0.12701945006847382, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 63640 + }, + { + "epoch": 0.24226760960087695, + "grad_norm": 0.12962621450424194, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 63650 + }, + { + "epoch": 0.24230567206899964, + "grad_norm": 0.12591864168643951, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 63660 + }, + { + "epoch": 0.24234373453712232, + "grad_norm": 0.11711875349283218, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 63670 + }, + { + "epoch": 0.242381797005245, + "grad_norm": 0.13258887827396393, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 63680 + }, + { + "epoch": 0.2424198594733677, + "grad_norm": 0.12952230870723724, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 63690 + }, + { + "epoch": 0.24245792194149038, + "grad_norm": 0.12011770904064178, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 63700 + }, + { + "epoch": 0.24249598440961306, + "grad_norm": 0.11629187315702438, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 63710 + }, + { + "epoch": 0.24253404687773575, + "grad_norm": 0.11819379776716232, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 63720 + }, + { + "epoch": 0.24257210934585843, + "grad_norm": 0.14825862646102905, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 63730 + }, + { + "epoch": 0.24261017181398112, + "grad_norm": 0.1542445570230484, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 63740 + }, + { + "epoch": 0.24264823428210378, + "grad_norm": 0.11805212497711182, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 63750 + }, + { + "epoch": 0.24268629675022646, + "grad_norm": 0.14055785536766052, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 63760 + }, + { + "epoch": 0.24272435921834914, + "grad_norm": 0.12182782590389252, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 63770 + }, + { + "epoch": 0.24276242168647183, + "grad_norm": 0.11553092300891876, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 63780 + }, + { + "epoch": 0.24280048415459451, + "grad_norm": 0.11161351203918457, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 63790 + }, + { + "epoch": 0.2428385466227172, + "grad_norm": 0.12397214770317078, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 63800 + }, + { + "epoch": 0.24287660909083988, + "grad_norm": 0.12839215993881226, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 63810 + }, + { + "epoch": 0.24291467155896257, + "grad_norm": 0.12061683088541031, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 63820 + }, + { + "epoch": 0.24295273402708525, + "grad_norm": 0.11531350016593933, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 63830 + }, + { + "epoch": 0.24299079649520794, + "grad_norm": 0.12071649730205536, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 63840 + }, + { + "epoch": 0.24302885896333062, + "grad_norm": 0.11956681311130524, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 63850 + }, + { + "epoch": 0.2430669214314533, + "grad_norm": 0.1328822672367096, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 63860 + }, + { + "epoch": 0.243104983899576, + "grad_norm": 0.13923752307891846, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 63870 + }, + { + "epoch": 0.24314304636769868, + "grad_norm": 0.1264955997467041, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 63880 + }, + { + "epoch": 0.24318110883582136, + "grad_norm": 0.12596023082733154, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 63890 + }, + { + "epoch": 0.24321917130394402, + "grad_norm": 0.11505821347236633, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 63900 + }, + { + "epoch": 0.2432572337720667, + "grad_norm": 0.13088856637477875, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 63910 + }, + { + "epoch": 0.2432952962401894, + "grad_norm": 0.12740778923034668, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 63920 + }, + { + "epoch": 0.24333335870831208, + "grad_norm": 0.12028442323207855, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 63930 + }, + { + "epoch": 0.24337142117643476, + "grad_norm": 0.11218731105327606, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 63940 + }, + { + "epoch": 0.24340948364455745, + "grad_norm": 0.1180717945098877, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 63950 + }, + { + "epoch": 0.24344754611268013, + "grad_norm": 0.12286002188920975, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 63960 + }, + { + "epoch": 0.24348560858080281, + "grad_norm": 0.11139651387929916, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 63970 + }, + { + "epoch": 0.2435236710489255, + "grad_norm": 0.12381156533956528, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 63980 + }, + { + "epoch": 0.24356173351704818, + "grad_norm": 0.14636299014091492, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 63990 + }, + { + "epoch": 0.24359979598517087, + "grad_norm": 0.13266819715499878, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 64000 + }, + { + "epoch": 0.24363785845329355, + "grad_norm": 0.12030017375946045, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 64010 + }, + { + "epoch": 0.24367592092141624, + "grad_norm": 0.1427072137594223, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 64020 + }, + { + "epoch": 0.24371398338953892, + "grad_norm": 0.12498658150434494, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 64030 + }, + { + "epoch": 0.24375204585766158, + "grad_norm": 0.12415233999490738, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 64040 + }, + { + "epoch": 0.24379010832578427, + "grad_norm": 0.12559351325035095, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 64050 + }, + { + "epoch": 0.24382817079390695, + "grad_norm": 0.12085636705160141, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 64060 + }, + { + "epoch": 0.24386623326202964, + "grad_norm": 0.12610945105552673, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 64070 + }, + { + "epoch": 0.24390429573015232, + "grad_norm": 0.11584189534187317, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 64080 + }, + { + "epoch": 0.243942358198275, + "grad_norm": 0.12166000157594681, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 64090 + }, + { + "epoch": 0.2439804206663977, + "grad_norm": 0.1491120308637619, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 64100 + }, + { + "epoch": 0.24401848313452038, + "grad_norm": 0.11442311853170395, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 64110 + }, + { + "epoch": 0.24405654560264306, + "grad_norm": 0.118242047727108, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 64120 + }, + { + "epoch": 0.24409460807076575, + "grad_norm": 0.12931834161281586, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 64130 + }, + { + "epoch": 0.24413267053888843, + "grad_norm": 0.12151516228914261, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 64140 + }, + { + "epoch": 0.24417073300701111, + "grad_norm": 0.11908268928527832, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 64150 + }, + { + "epoch": 0.2442087954751338, + "grad_norm": 0.13682927191257477, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 64160 + }, + { + "epoch": 0.24424685794325648, + "grad_norm": 0.13674937188625336, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 64170 + }, + { + "epoch": 0.24428492041137914, + "grad_norm": 0.12140002101659775, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 64180 + }, + { + "epoch": 0.24432298287950183, + "grad_norm": 0.13333989679813385, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 64190 + }, + { + "epoch": 0.2443610453476245, + "grad_norm": 0.1255248337984085, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 64200 + }, + { + "epoch": 0.2443991078157472, + "grad_norm": 0.11844970285892487, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 64210 + }, + { + "epoch": 0.24443717028386988, + "grad_norm": 0.23616556823253632, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 64220 + }, + { + "epoch": 0.24447523275199257, + "grad_norm": 0.11513097584247589, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 64230 + }, + { + "epoch": 0.24451329522011525, + "grad_norm": 0.12044999748468399, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 64240 + }, + { + "epoch": 0.24455135768823794, + "grad_norm": 0.133218914270401, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 64250 + }, + { + "epoch": 0.24458942015636062, + "grad_norm": 0.15163981914520264, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 64260 + }, + { + "epoch": 0.2446274826244833, + "grad_norm": 0.1269594132900238, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 64270 + }, + { + "epoch": 0.244665545092606, + "grad_norm": 0.12632466852664948, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 64280 + }, + { + "epoch": 0.24470360756072868, + "grad_norm": 0.12292557954788208, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 64290 + }, + { + "epoch": 0.24474167002885136, + "grad_norm": 0.1207164004445076, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 64300 + }, + { + "epoch": 0.24477973249697405, + "grad_norm": 0.1272284984588623, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 64310 + }, + { + "epoch": 0.24481779496509673, + "grad_norm": 0.12717846035957336, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 64320 + }, + { + "epoch": 0.2448558574332194, + "grad_norm": 0.12871286273002625, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 64330 + }, + { + "epoch": 0.24489391990134207, + "grad_norm": 0.1255853921175003, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 64340 + }, + { + "epoch": 0.24493198236946476, + "grad_norm": 0.12132584303617477, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 64350 + }, + { + "epoch": 0.24497004483758744, + "grad_norm": 0.12538164854049683, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 64360 + }, + { + "epoch": 0.24500810730571013, + "grad_norm": 0.13736794888973236, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 64370 + }, + { + "epoch": 0.2450461697738328, + "grad_norm": 0.12930458784103394, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 64380 + }, + { + "epoch": 0.2450842322419555, + "grad_norm": 0.10916830599308014, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 64390 + }, + { + "epoch": 0.24512229471007818, + "grad_norm": 0.128557026386261, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 64400 + }, + { + "epoch": 0.24516035717820087, + "grad_norm": 0.12013334035873413, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 64410 + }, + { + "epoch": 0.24519841964632355, + "grad_norm": 0.13635292649269104, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 64420 + }, + { + "epoch": 0.24523648211444624, + "grad_norm": 0.12600380182266235, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 64430 + }, + { + "epoch": 0.24527454458256892, + "grad_norm": 0.1258964091539383, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 64440 + }, + { + "epoch": 0.2453126070506916, + "grad_norm": 0.12330331653356552, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 64450 + }, + { + "epoch": 0.2453506695188143, + "grad_norm": 0.12577103078365326, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 64460 + }, + { + "epoch": 0.24538873198693695, + "grad_norm": 0.11576177924871445, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 64470 + }, + { + "epoch": 0.24542679445505963, + "grad_norm": 0.169657900929451, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 64480 + }, + { + "epoch": 0.24546485692318232, + "grad_norm": 0.12155858427286148, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 64490 + }, + { + "epoch": 0.245502919391305, + "grad_norm": 0.11039765924215317, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 64500 + }, + { + "epoch": 0.2455409818594277, + "grad_norm": 0.11560600250959396, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 64510 + }, + { + "epoch": 0.24557904432755037, + "grad_norm": 0.12468001991510391, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 64520 + }, + { + "epoch": 0.24561710679567306, + "grad_norm": 0.11813274025917053, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 64530 + }, + { + "epoch": 0.24565516926379574, + "grad_norm": 0.11909537762403488, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 64540 + }, + { + "epoch": 0.24569323173191843, + "grad_norm": 0.1262873411178589, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 64550 + }, + { + "epoch": 0.2457312942000411, + "grad_norm": 0.13570715487003326, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 64560 + }, + { + "epoch": 0.2457693566681638, + "grad_norm": 0.12662003934383392, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 64570 + }, + { + "epoch": 0.24580741913628648, + "grad_norm": 0.12291039526462555, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 64580 + }, + { + "epoch": 0.24584548160440917, + "grad_norm": 0.12123207002878189, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 64590 + }, + { + "epoch": 0.24588354407253185, + "grad_norm": 0.12253370881080627, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 64600 + }, + { + "epoch": 0.24592160654065454, + "grad_norm": 0.1316492259502411, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 64610 + }, + { + "epoch": 0.2459596690087772, + "grad_norm": 0.1356193870306015, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 64620 + }, + { + "epoch": 0.24599773147689988, + "grad_norm": 0.132023423910141, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 64630 + }, + { + "epoch": 0.24603579394502256, + "grad_norm": 0.12114515900611877, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 64640 + }, + { + "epoch": 0.24607385641314525, + "grad_norm": 0.12968000769615173, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 64650 + }, + { + "epoch": 0.24611191888126793, + "grad_norm": 0.13613907992839813, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 64660 + }, + { + "epoch": 0.24614998134939062, + "grad_norm": 0.12374147772789001, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 64670 + }, + { + "epoch": 0.2461880438175133, + "grad_norm": 0.12278860062360764, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 64680 + }, + { + "epoch": 0.246226106285636, + "grad_norm": 0.12701402604579926, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 64690 + }, + { + "epoch": 0.24626416875375867, + "grad_norm": 0.11772578209638596, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 64700 + }, + { + "epoch": 0.24630223122188136, + "grad_norm": 0.11964793503284454, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 64710 + }, + { + "epoch": 0.24634029369000404, + "grad_norm": 0.12465240061283112, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 64720 + }, + { + "epoch": 0.24637835615812673, + "grad_norm": 0.14138975739479065, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 64730 + }, + { + "epoch": 0.2464164186262494, + "grad_norm": 0.13955868780612946, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 64740 + }, + { + "epoch": 0.2464544810943721, + "grad_norm": 0.1234760656952858, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 64750 + }, + { + "epoch": 0.24649254356249475, + "grad_norm": 0.11192571371793747, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 64760 + }, + { + "epoch": 0.24653060603061744, + "grad_norm": 0.11702840775251389, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 64770 + }, + { + "epoch": 0.24656866849874012, + "grad_norm": 0.1265072077512741, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 64780 + }, + { + "epoch": 0.2466067309668628, + "grad_norm": 0.12466669827699661, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 64790 + }, + { + "epoch": 0.2466447934349855, + "grad_norm": 0.12184906005859375, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 64800 + }, + { + "epoch": 0.24668285590310818, + "grad_norm": 0.12810151278972626, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 64810 + }, + { + "epoch": 0.24672091837123086, + "grad_norm": 0.12840422987937927, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 64820 + }, + { + "epoch": 0.24675898083935355, + "grad_norm": 0.11984486877918243, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 64830 + }, + { + "epoch": 0.24679704330747623, + "grad_norm": 0.12411165237426758, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 64840 + }, + { + "epoch": 0.24683510577559892, + "grad_norm": 0.14512589573860168, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 64850 + }, + { + "epoch": 0.2468731682437216, + "grad_norm": 0.4416506886482239, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 64860 + }, + { + "epoch": 0.2469112307118443, + "grad_norm": 0.12932956218719482, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 64870 + }, + { + "epoch": 0.24694929317996697, + "grad_norm": 0.1224246621131897, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 64880 + }, + { + "epoch": 0.24698735564808966, + "grad_norm": 0.11386513710021973, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 64890 + }, + { + "epoch": 0.24702541811621231, + "grad_norm": 0.11869112402200699, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 64900 + }, + { + "epoch": 0.247063480584335, + "grad_norm": 0.11242227256298065, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 64910 + }, + { + "epoch": 0.24710154305245768, + "grad_norm": 0.11348606646060944, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 64920 + }, + { + "epoch": 0.24713960552058037, + "grad_norm": 0.13171638548374176, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 64930 + }, + { + "epoch": 0.24717766798870305, + "grad_norm": 0.12162895500659943, + "learning_rate": 0.0005, + "loss": 2.1406, + "step": 64940 + }, + { + "epoch": 0.24721573045682574, + "grad_norm": 0.1277979016304016, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 64950 + }, + { + "epoch": 0.24725379292494842, + "grad_norm": 0.12796761095523834, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 64960 + }, + { + "epoch": 0.2472918553930711, + "grad_norm": 0.11699617654085159, + "learning_rate": 0.0005, + "loss": 2.151, + "step": 64970 + }, + { + "epoch": 0.2473299178611938, + "grad_norm": 0.113719142973423, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 64980 + }, + { + "epoch": 0.24736798032931648, + "grad_norm": 0.12022378295660019, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 64990 + }, + { + "epoch": 0.24740604279743916, + "grad_norm": 0.11734028160572052, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 65000 + }, + { + "epoch": 0.24744410526556185, + "grad_norm": 0.13308216631412506, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 65010 + }, + { + "epoch": 0.24748216773368453, + "grad_norm": 0.1400652527809143, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 65020 + }, + { + "epoch": 0.24752023020180722, + "grad_norm": 0.12651506066322327, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 65030 + }, + { + "epoch": 0.2475582926699299, + "grad_norm": 0.1413552612066269, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 65040 + }, + { + "epoch": 0.24759635513805256, + "grad_norm": 0.1348857581615448, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 65050 + }, + { + "epoch": 0.24763441760617524, + "grad_norm": 0.12505334615707397, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 65060 + }, + { + "epoch": 0.24767248007429793, + "grad_norm": 0.12925513088703156, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 65070 + }, + { + "epoch": 0.24771054254242061, + "grad_norm": 0.12137854844331741, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 65080 + }, + { + "epoch": 0.2477486050105433, + "grad_norm": 0.12658438086509705, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 65090 + }, + { + "epoch": 0.24778666747866598, + "grad_norm": 0.11648210138082504, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 65100 + }, + { + "epoch": 0.24782472994678867, + "grad_norm": 0.12034095823764801, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 65110 + }, + { + "epoch": 0.24786279241491135, + "grad_norm": 0.1281271129846573, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 65120 + }, + { + "epoch": 0.24790085488303404, + "grad_norm": 0.12572212517261505, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 65130 + }, + { + "epoch": 0.24793891735115672, + "grad_norm": 0.14182905852794647, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 65140 + }, + { + "epoch": 0.2479769798192794, + "grad_norm": 0.12417514622211456, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 65150 + }, + { + "epoch": 0.2480150422874021, + "grad_norm": 0.1255902200937271, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 65160 + }, + { + "epoch": 0.24805310475552478, + "grad_norm": 0.12469867616891861, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 65170 + }, + { + "epoch": 0.24809116722364746, + "grad_norm": 0.12777365744113922, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 65180 + }, + { + "epoch": 0.24812922969177012, + "grad_norm": 0.12414289265871048, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 65190 + }, + { + "epoch": 0.2481672921598928, + "grad_norm": 0.13034619390964508, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 65200 + }, + { + "epoch": 0.2482053546280155, + "grad_norm": 0.1196286529302597, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 65210 + }, + { + "epoch": 0.24824341709613817, + "grad_norm": 0.11457592993974686, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 65220 + }, + { + "epoch": 0.24828147956426086, + "grad_norm": 0.13046908378601074, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 65230 + }, + { + "epoch": 0.24831954203238354, + "grad_norm": 0.12603069841861725, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 65240 + }, + { + "epoch": 0.24835760450050623, + "grad_norm": 0.12878099083900452, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 65250 + }, + { + "epoch": 0.24839566696862891, + "grad_norm": 0.14651422202587128, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 65260 + }, + { + "epoch": 0.2484337294367516, + "grad_norm": 0.12332681566476822, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 65270 + }, + { + "epoch": 0.24847179190487428, + "grad_norm": 0.11837499588727951, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 65280 + }, + { + "epoch": 0.24850985437299697, + "grad_norm": 0.11317744106054306, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 65290 + }, + { + "epoch": 0.24854791684111965, + "grad_norm": 0.13026000559329987, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 65300 + }, + { + "epoch": 0.24858597930924234, + "grad_norm": 0.12601986527442932, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 65310 + }, + { + "epoch": 0.24862404177736502, + "grad_norm": 0.12494723498821259, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 65320 + }, + { + "epoch": 0.24866210424548768, + "grad_norm": 0.1308157593011856, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 65330 + }, + { + "epoch": 0.24870016671361037, + "grad_norm": 0.1284460723400116, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 65340 + }, + { + "epoch": 0.24873822918173305, + "grad_norm": 0.13919104635715485, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 65350 + }, + { + "epoch": 0.24877629164985574, + "grad_norm": 0.1327711045742035, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 65360 + }, + { + "epoch": 0.24881435411797842, + "grad_norm": 0.1231195405125618, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 65370 + }, + { + "epoch": 0.2488524165861011, + "grad_norm": 0.11069672554731369, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 65380 + }, + { + "epoch": 0.2488904790542238, + "grad_norm": 0.14658358693122864, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 65390 + }, + { + "epoch": 0.24892854152234647, + "grad_norm": 0.14038780331611633, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 65400 + }, + { + "epoch": 0.24896660399046916, + "grad_norm": 0.11526990681886673, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 65410 + }, + { + "epoch": 0.24900466645859184, + "grad_norm": 0.12075836211442947, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 65420 + }, + { + "epoch": 0.24904272892671453, + "grad_norm": 0.12481142580509186, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 65430 + }, + { + "epoch": 0.24908079139483721, + "grad_norm": 0.12299896031618118, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 65440 + }, + { + "epoch": 0.2491188538629599, + "grad_norm": 0.13120685517787933, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 65450 + }, + { + "epoch": 0.24915691633108258, + "grad_norm": 0.12218866497278214, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 65460 + }, + { + "epoch": 0.24919497879920527, + "grad_norm": 0.1156734749674797, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 65470 + }, + { + "epoch": 0.24923304126732793, + "grad_norm": 0.12150178849697113, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 65480 + }, + { + "epoch": 0.2492711037354506, + "grad_norm": 0.12679104506969452, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 65490 + }, + { + "epoch": 0.2493091662035733, + "grad_norm": 0.12691758573055267, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 65500 + }, + { + "epoch": 0.24934722867169598, + "grad_norm": 0.11793230473995209, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 65510 + }, + { + "epoch": 0.24938529113981867, + "grad_norm": 0.11975978314876556, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 65520 + }, + { + "epoch": 0.24942335360794135, + "grad_norm": 0.1233128160238266, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 65530 + }, + { + "epoch": 0.24946141607606404, + "grad_norm": 0.13914133608341217, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 65540 + }, + { + "epoch": 0.24949947854418672, + "grad_norm": 0.1286463737487793, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 65550 + }, + { + "epoch": 0.2495375410123094, + "grad_norm": 0.13319668173789978, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 65560 + }, + { + "epoch": 0.2495756034804321, + "grad_norm": 0.1314106285572052, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 65570 + }, + { + "epoch": 0.24961366594855477, + "grad_norm": 0.1226097047328949, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 65580 + }, + { + "epoch": 0.24965172841667746, + "grad_norm": 0.1222294494509697, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 65590 + }, + { + "epoch": 0.24968979088480014, + "grad_norm": 0.1437918245792389, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 65600 + }, + { + "epoch": 0.24972785335292283, + "grad_norm": 0.11622069776058197, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 65610 + }, + { + "epoch": 0.2497659158210455, + "grad_norm": 0.11489139497280121, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 65620 + }, + { + "epoch": 0.24980397828916817, + "grad_norm": 0.12246556580066681, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 65630 + }, + { + "epoch": 0.24984204075729086, + "grad_norm": 0.11942298710346222, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 65640 + }, + { + "epoch": 0.24988010322541354, + "grad_norm": 0.12547263503074646, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 65650 + }, + { + "epoch": 0.24991816569353623, + "grad_norm": 0.12355095148086548, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 65660 + }, + { + "epoch": 0.2499562281616589, + "grad_norm": 0.14005330204963684, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 65670 + }, + { + "epoch": 0.2499942906297816, + "grad_norm": 0.12415273487567902, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 65680 + }, + { + "epoch": 0.2500323530979043, + "grad_norm": 0.1350027173757553, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 65690 + }, + { + "epoch": 0.25007041556602694, + "grad_norm": 0.12149950861930847, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 65700 + }, + { + "epoch": 0.25010847803414965, + "grad_norm": 0.1218823567032814, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 65710 + }, + { + "epoch": 0.2501465405022723, + "grad_norm": 0.1255420595407486, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 65720 + }, + { + "epoch": 0.250184602970395, + "grad_norm": 0.12200768291950226, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 65730 + }, + { + "epoch": 0.2502226654385177, + "grad_norm": 0.11212620884180069, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 65740 + }, + { + "epoch": 0.2502607279066404, + "grad_norm": 0.11211491376161575, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 65750 + }, + { + "epoch": 0.25029879037476305, + "grad_norm": 0.11531613022089005, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 65760 + }, + { + "epoch": 0.25033685284288576, + "grad_norm": 0.1244373545050621, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 65770 + }, + { + "epoch": 0.2503749153110084, + "grad_norm": 0.12171068042516708, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 65780 + }, + { + "epoch": 0.25041297777913113, + "grad_norm": 0.1351395547389984, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 65790 + }, + { + "epoch": 0.2504510402472538, + "grad_norm": 0.12298998981714249, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 65800 + }, + { + "epoch": 0.2504891027153765, + "grad_norm": 0.1542890965938568, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 65810 + }, + { + "epoch": 0.25052716518349916, + "grad_norm": 0.13249491155147552, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 65820 + }, + { + "epoch": 0.25056522765162187, + "grad_norm": 0.12993395328521729, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 65830 + }, + { + "epoch": 0.2506032901197445, + "grad_norm": 0.12128250300884247, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 65840 + }, + { + "epoch": 0.2506413525878672, + "grad_norm": 0.12215390056371689, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 65850 + }, + { + "epoch": 0.2506794150559899, + "grad_norm": 0.12768031656742096, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 65860 + }, + { + "epoch": 0.25071747752411255, + "grad_norm": 0.1210104376077652, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 65870 + }, + { + "epoch": 0.25075553999223527, + "grad_norm": 0.14039009809494019, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 65880 + }, + { + "epoch": 0.2507936024603579, + "grad_norm": 0.1367824524641037, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 65890 + }, + { + "epoch": 0.25083166492848064, + "grad_norm": 0.11305456608533859, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 65900 + }, + { + "epoch": 0.2508697273966033, + "grad_norm": 0.1420050412416458, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 65910 + }, + { + "epoch": 0.250907789864726, + "grad_norm": 0.13083982467651367, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 65920 + }, + { + "epoch": 0.25094585233284866, + "grad_norm": 0.1243060901761055, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 65930 + }, + { + "epoch": 0.2509839148009714, + "grad_norm": 0.13240239024162292, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 65940 + }, + { + "epoch": 0.25102197726909403, + "grad_norm": 0.1207834780216217, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 65950 + }, + { + "epoch": 0.25106003973721674, + "grad_norm": 0.12781934440135956, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 65960 + }, + { + "epoch": 0.2510981022053394, + "grad_norm": 0.11338264495134354, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 65970 + }, + { + "epoch": 0.25113616467346206, + "grad_norm": 0.12367227673530579, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 65980 + }, + { + "epoch": 0.25117422714158477, + "grad_norm": 0.12069065868854523, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 65990 + }, + { + "epoch": 0.25121228960970743, + "grad_norm": 0.1273607313632965, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 66000 + }, + { + "epoch": 0.25125035207783014, + "grad_norm": 0.13285642862319946, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 66010 + }, + { + "epoch": 0.2512884145459528, + "grad_norm": 0.12723097205162048, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 66020 + }, + { + "epoch": 0.2513264770140755, + "grad_norm": 0.1333588808774948, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 66030 + }, + { + "epoch": 0.25136453948219817, + "grad_norm": 0.13743548095226288, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 66040 + }, + { + "epoch": 0.2514026019503209, + "grad_norm": 0.11628317832946777, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 66050 + }, + { + "epoch": 0.25144066441844354, + "grad_norm": 0.14928993582725525, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 66060 + }, + { + "epoch": 0.25147872688656625, + "grad_norm": 0.12406531721353531, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 66070 + }, + { + "epoch": 0.2515167893546889, + "grad_norm": 0.12148601561784744, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 66080 + }, + { + "epoch": 0.2515548518228116, + "grad_norm": 0.11786960810422897, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 66090 + }, + { + "epoch": 0.2515929142909343, + "grad_norm": 0.12175590544939041, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 66100 + }, + { + "epoch": 0.251630976759057, + "grad_norm": 0.1423157900571823, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 66110 + }, + { + "epoch": 0.25166903922717965, + "grad_norm": 0.24454635381698608, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 66120 + }, + { + "epoch": 0.2517071016953023, + "grad_norm": 0.1412813365459442, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 66130 + }, + { + "epoch": 0.251745164163425, + "grad_norm": 0.11888179928064346, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 66140 + }, + { + "epoch": 0.2517832266315477, + "grad_norm": 0.13959814608097076, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 66150 + }, + { + "epoch": 0.2518212890996704, + "grad_norm": 0.12186913937330246, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 66160 + }, + { + "epoch": 0.25185935156779304, + "grad_norm": 0.11685211956501007, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 66170 + }, + { + "epoch": 0.25189741403591576, + "grad_norm": 0.11924509704113007, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 66180 + }, + { + "epoch": 0.2519354765040384, + "grad_norm": 0.1242474839091301, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 66190 + }, + { + "epoch": 0.2519735389721611, + "grad_norm": 0.1427517682313919, + "learning_rate": 0.0005, + "loss": 2.1524, + "step": 66200 + }, + { + "epoch": 0.2520116014402838, + "grad_norm": 0.11653508245944977, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 66210 + }, + { + "epoch": 0.2520496639084065, + "grad_norm": 0.13120757043361664, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 66220 + }, + { + "epoch": 0.25208772637652915, + "grad_norm": 0.12150296568870544, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 66230 + }, + { + "epoch": 0.25212578884465187, + "grad_norm": 0.134052112698555, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 66240 + }, + { + "epoch": 0.2521638513127745, + "grad_norm": 0.12599046528339386, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 66250 + }, + { + "epoch": 0.25220191378089724, + "grad_norm": 0.1288243979215622, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 66260 + }, + { + "epoch": 0.2522399762490199, + "grad_norm": 0.12849846482276917, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 66270 + }, + { + "epoch": 0.25227803871714255, + "grad_norm": 0.1309838891029358, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 66280 + }, + { + "epoch": 0.25231610118526526, + "grad_norm": 0.11939046531915665, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 66290 + }, + { + "epoch": 0.2523541636533879, + "grad_norm": 0.11373452842235565, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 66300 + }, + { + "epoch": 0.25239222612151063, + "grad_norm": 0.12958891689777374, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 66310 + }, + { + "epoch": 0.2524302885896333, + "grad_norm": 0.12233640998601913, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 66320 + }, + { + "epoch": 0.252468351057756, + "grad_norm": 0.1465737372636795, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 66330 + }, + { + "epoch": 0.25250641352587866, + "grad_norm": 0.1597142219543457, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 66340 + }, + { + "epoch": 0.25254447599400137, + "grad_norm": 0.14479586482048035, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 66350 + }, + { + "epoch": 0.25258253846212403, + "grad_norm": 0.12330485880374908, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 66360 + }, + { + "epoch": 0.25262060093024674, + "grad_norm": 0.12251732498407364, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 66370 + }, + { + "epoch": 0.2526586633983694, + "grad_norm": 0.1360514760017395, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 66380 + }, + { + "epoch": 0.2526967258664921, + "grad_norm": 0.12024963647127151, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 66390 + }, + { + "epoch": 0.25273478833461477, + "grad_norm": 0.11666081100702286, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 66400 + }, + { + "epoch": 0.2527728508027374, + "grad_norm": 0.11214052885770798, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 66410 + }, + { + "epoch": 0.25281091327086014, + "grad_norm": 0.13760724663734436, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 66420 + }, + { + "epoch": 0.2528489757389828, + "grad_norm": 0.11763881146907806, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 66430 + }, + { + "epoch": 0.2528870382071055, + "grad_norm": 0.11611486971378326, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 66440 + }, + { + "epoch": 0.25292510067522816, + "grad_norm": 0.13202884793281555, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 66450 + }, + { + "epoch": 0.2529631631433509, + "grad_norm": 0.13429687917232513, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 66460 + }, + { + "epoch": 0.25300122561147353, + "grad_norm": 0.1115928664803505, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 66470 + }, + { + "epoch": 0.25303928807959625, + "grad_norm": 0.11740121245384216, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 66480 + }, + { + "epoch": 0.2530773505477189, + "grad_norm": 0.12334487587213516, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 66490 + }, + { + "epoch": 0.2531154130158416, + "grad_norm": 0.12216371297836304, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 66500 + }, + { + "epoch": 0.2531534754839643, + "grad_norm": 0.13560952246189117, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 66510 + }, + { + "epoch": 0.253191537952087, + "grad_norm": 0.11898113042116165, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 66520 + }, + { + "epoch": 0.25322960042020964, + "grad_norm": 0.1410965621471405, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 66530 + }, + { + "epoch": 0.25326766288833236, + "grad_norm": 0.12651461362838745, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 66540 + }, + { + "epoch": 0.253305725356455, + "grad_norm": 0.12641902267932892, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 66550 + }, + { + "epoch": 0.25334378782457767, + "grad_norm": 0.87174391746521, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 66560 + }, + { + "epoch": 0.2533818502927004, + "grad_norm": 0.12635809183120728, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 66570 + }, + { + "epoch": 0.25341991276082304, + "grad_norm": 0.12377549707889557, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 66580 + }, + { + "epoch": 0.25345797522894575, + "grad_norm": 0.1181359812617302, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 66590 + }, + { + "epoch": 0.2534960376970684, + "grad_norm": 0.1183367669582367, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 66600 + }, + { + "epoch": 0.2535341001651911, + "grad_norm": 0.12796492874622345, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 66610 + }, + { + "epoch": 0.2535721626333138, + "grad_norm": 0.12470392882823944, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 66620 + }, + { + "epoch": 0.2536102251014365, + "grad_norm": 0.12096191942691803, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 66630 + }, + { + "epoch": 0.25364828756955915, + "grad_norm": 0.13148650527000427, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 66640 + }, + { + "epoch": 0.25368635003768186, + "grad_norm": 0.13177184760570526, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 66650 + }, + { + "epoch": 0.2537244125058045, + "grad_norm": 0.1254524439573288, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 66660 + }, + { + "epoch": 0.25376247497392723, + "grad_norm": 0.12147463113069534, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 66670 + }, + { + "epoch": 0.2538005374420499, + "grad_norm": 0.11849434673786163, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 66680 + }, + { + "epoch": 0.2538385999101726, + "grad_norm": 0.12578263878822327, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 66690 + }, + { + "epoch": 0.25387666237829526, + "grad_norm": 0.13222847878932953, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 66700 + }, + { + "epoch": 0.2539147248464179, + "grad_norm": 0.1336478441953659, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 66710 + }, + { + "epoch": 0.25395278731454063, + "grad_norm": 0.12106378376483917, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 66720 + }, + { + "epoch": 0.2539908497826633, + "grad_norm": 0.12488880008459091, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 66730 + }, + { + "epoch": 0.254028912250786, + "grad_norm": 0.1426115781068802, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 66740 + }, + { + "epoch": 0.25406697471890866, + "grad_norm": 0.1336047649383545, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 66750 + }, + { + "epoch": 0.25410503718703137, + "grad_norm": 0.12687437236309052, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 66760 + }, + { + "epoch": 0.254143099655154, + "grad_norm": 0.12486867606639862, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 66770 + }, + { + "epoch": 0.25418116212327674, + "grad_norm": 0.12586849927902222, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 66780 + }, + { + "epoch": 0.2542192245913994, + "grad_norm": 0.12848053872585297, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 66790 + }, + { + "epoch": 0.2542572870595221, + "grad_norm": 0.12215209752321243, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 66800 + }, + { + "epoch": 0.25429534952764477, + "grad_norm": 0.12931972742080688, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 66810 + }, + { + "epoch": 0.2543334119957675, + "grad_norm": 0.12214656919240952, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 66820 + }, + { + "epoch": 0.25437147446389013, + "grad_norm": 0.12178391218185425, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 66830 + }, + { + "epoch": 0.2544095369320128, + "grad_norm": 0.134334996342659, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 66840 + }, + { + "epoch": 0.2544475994001355, + "grad_norm": 0.12711933255195618, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 66850 + }, + { + "epoch": 0.25448566186825816, + "grad_norm": 0.12724703550338745, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 66860 + }, + { + "epoch": 0.2545237243363809, + "grad_norm": 0.15072989463806152, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 66870 + }, + { + "epoch": 0.25456178680450353, + "grad_norm": 0.1301760971546173, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 66880 + }, + { + "epoch": 0.25459984927262624, + "grad_norm": 0.13474169373512268, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 66890 + }, + { + "epoch": 0.2546379117407489, + "grad_norm": 0.1241116151213646, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 66900 + }, + { + "epoch": 0.2546759742088716, + "grad_norm": 0.1309492588043213, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 66910 + }, + { + "epoch": 0.25471403667699427, + "grad_norm": 0.11885924637317657, + "learning_rate": 0.0005, + "loss": 2.1504, + "step": 66920 + }, + { + "epoch": 0.254752099145117, + "grad_norm": 0.127330020070076, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 66930 + }, + { + "epoch": 0.25479016161323964, + "grad_norm": 0.13746142387390137, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 66940 + }, + { + "epoch": 0.25482822408136235, + "grad_norm": 0.12805262207984924, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 66950 + }, + { + "epoch": 0.254866286549485, + "grad_norm": 0.13134385645389557, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 66960 + }, + { + "epoch": 0.2549043490176077, + "grad_norm": 0.12819813191890717, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 66970 + }, + { + "epoch": 0.2549424114857304, + "grad_norm": 0.12185374647378922, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 66980 + }, + { + "epoch": 0.25498047395385304, + "grad_norm": 0.12614475190639496, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 66990 + }, + { + "epoch": 0.25501853642197575, + "grad_norm": 0.114668108522892, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 67000 + }, + { + "epoch": 0.2550565988900984, + "grad_norm": 0.13993407785892487, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 67010 + }, + { + "epoch": 0.2550946613582211, + "grad_norm": 0.129254549741745, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 67020 + }, + { + "epoch": 0.2551327238263438, + "grad_norm": 0.11793850362300873, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 67030 + }, + { + "epoch": 0.2551707862944665, + "grad_norm": 0.4932312071323395, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 67040 + }, + { + "epoch": 0.25520884876258915, + "grad_norm": 0.12364540249109268, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 67050 + }, + { + "epoch": 0.25524691123071186, + "grad_norm": 0.12063073366880417, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 67060 + }, + { + "epoch": 0.2552849736988345, + "grad_norm": 0.11659274995326996, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 67070 + }, + { + "epoch": 0.25532303616695723, + "grad_norm": 0.1296776533126831, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 67080 + }, + { + "epoch": 0.2553610986350799, + "grad_norm": 0.1239800825715065, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 67090 + }, + { + "epoch": 0.2553991611032026, + "grad_norm": 0.12165100127458572, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 67100 + }, + { + "epoch": 0.25543722357132526, + "grad_norm": 0.11917266994714737, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 67110 + }, + { + "epoch": 0.25547528603944797, + "grad_norm": 0.13199719786643982, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 67120 + }, + { + "epoch": 0.2555133485075706, + "grad_norm": 0.1282149851322174, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 67130 + }, + { + "epoch": 0.2555514109756933, + "grad_norm": 0.12987367808818817, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 67140 + }, + { + "epoch": 0.255589473443816, + "grad_norm": 0.13135084509849548, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 67150 + }, + { + "epoch": 0.25562753591193865, + "grad_norm": 0.1275607943534851, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 67160 + }, + { + "epoch": 0.25566559838006137, + "grad_norm": 0.11255636066198349, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 67170 + }, + { + "epoch": 0.255703660848184, + "grad_norm": 0.12683634459972382, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 67180 + }, + { + "epoch": 0.25574172331630673, + "grad_norm": 0.11311507225036621, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 67190 + }, + { + "epoch": 0.2557797857844294, + "grad_norm": 0.12024758756160736, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 67200 + }, + { + "epoch": 0.2558178482525521, + "grad_norm": 0.12955109775066376, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 67210 + }, + { + "epoch": 0.25585591072067476, + "grad_norm": 0.1267477422952652, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 67220 + }, + { + "epoch": 0.2558939731887975, + "grad_norm": 0.1404080092906952, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 67230 + }, + { + "epoch": 0.25593203565692013, + "grad_norm": 0.1284545212984085, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 67240 + }, + { + "epoch": 0.25597009812504284, + "grad_norm": 0.11248300969600677, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 67250 + }, + { + "epoch": 0.2560081605931655, + "grad_norm": 0.11709735542535782, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 67260 + }, + { + "epoch": 0.25604622306128816, + "grad_norm": 0.1246451884508133, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 67270 + }, + { + "epoch": 0.25608428552941087, + "grad_norm": 0.12055303901433945, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 67280 + }, + { + "epoch": 0.25612234799753353, + "grad_norm": 0.1246945932507515, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 67290 + }, + { + "epoch": 0.25616041046565624, + "grad_norm": 0.1278133988380432, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 67300 + }, + { + "epoch": 0.2561984729337789, + "grad_norm": 0.14413084089756012, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 67310 + }, + { + "epoch": 0.2562365354019016, + "grad_norm": 0.13160665333271027, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 67320 + }, + { + "epoch": 0.25627459787002427, + "grad_norm": 0.12283961474895477, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 67330 + }, + { + "epoch": 0.256312660338147, + "grad_norm": 0.1365843117237091, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 67340 + }, + { + "epoch": 0.25635072280626964, + "grad_norm": 0.1217200830578804, + "learning_rate": 0.0005, + "loss": 2.1471, + "step": 67350 + }, + { + "epoch": 0.25638878527439235, + "grad_norm": 0.12052513659000397, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 67360 + }, + { + "epoch": 0.256426847742515, + "grad_norm": 0.125356063246727, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 67370 + }, + { + "epoch": 0.2564649102106377, + "grad_norm": 0.11431968957185745, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 67380 + }, + { + "epoch": 0.2565029726787604, + "grad_norm": 0.11986550688743591, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 67390 + }, + { + "epoch": 0.2565410351468831, + "grad_norm": 0.12903617322444916, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 67400 + }, + { + "epoch": 0.25657909761500575, + "grad_norm": 0.14601320028305054, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 67410 + }, + { + "epoch": 0.2566171600831284, + "grad_norm": 0.12710809707641602, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 67420 + }, + { + "epoch": 0.2566552225512511, + "grad_norm": 0.11787423491477966, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 67430 + }, + { + "epoch": 0.2566932850193738, + "grad_norm": 0.1185850203037262, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 67440 + }, + { + "epoch": 0.2567313474874965, + "grad_norm": 0.1283193826675415, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 67450 + }, + { + "epoch": 0.25676940995561914, + "grad_norm": 0.12702149152755737, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 67460 + }, + { + "epoch": 0.25680747242374186, + "grad_norm": 0.12243399769067764, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 67470 + }, + { + "epoch": 0.2568455348918645, + "grad_norm": 0.12209542095661163, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 67480 + }, + { + "epoch": 0.2568835973599872, + "grad_norm": 0.11669564247131348, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 67490 + }, + { + "epoch": 0.2569216598281099, + "grad_norm": 0.12182539701461792, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 67500 + }, + { + "epoch": 0.2569597222962326, + "grad_norm": 0.1310204267501831, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 67510 + }, + { + "epoch": 0.25699778476435525, + "grad_norm": 0.12333172559738159, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 67520 + }, + { + "epoch": 0.25703584723247797, + "grad_norm": 0.13070839643478394, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 67530 + }, + { + "epoch": 0.2570739097006006, + "grad_norm": 0.13536477088928223, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 67540 + }, + { + "epoch": 0.25711197216872334, + "grad_norm": 0.11995254456996918, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 67550 + }, + { + "epoch": 0.257150034636846, + "grad_norm": 0.12875477969646454, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 67560 + }, + { + "epoch": 0.25718809710496865, + "grad_norm": 0.1281040757894516, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 67570 + }, + { + "epoch": 0.25722615957309136, + "grad_norm": 0.12523114681243896, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 67580 + }, + { + "epoch": 0.257264222041214, + "grad_norm": 0.12587867677211761, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 67590 + }, + { + "epoch": 0.25730228450933673, + "grad_norm": 0.12696507573127747, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 67600 + }, + { + "epoch": 0.2573403469774594, + "grad_norm": 0.12220026552677155, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 67610 + }, + { + "epoch": 0.2573784094455821, + "grad_norm": 0.1330202966928482, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 67620 + }, + { + "epoch": 0.25741647191370476, + "grad_norm": 0.12621164321899414, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 67630 + }, + { + "epoch": 0.25745453438182747, + "grad_norm": 0.12764981389045715, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 67640 + }, + { + "epoch": 0.25749259684995013, + "grad_norm": 0.11342509835958481, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 67650 + }, + { + "epoch": 0.25753065931807284, + "grad_norm": 0.11275508254766464, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 67660 + }, + { + "epoch": 0.2575687217861955, + "grad_norm": 0.12244296073913574, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 67670 + }, + { + "epoch": 0.2576067842543182, + "grad_norm": 0.12804259359836578, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 67680 + }, + { + "epoch": 0.25764484672244087, + "grad_norm": 0.1230596974492073, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 67690 + }, + { + "epoch": 0.2576829091905636, + "grad_norm": 0.12710057199001312, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 67700 + }, + { + "epoch": 0.25772097165868624, + "grad_norm": 0.12617017328739166, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 67710 + }, + { + "epoch": 0.2577590341268089, + "grad_norm": 0.17379063367843628, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 67720 + }, + { + "epoch": 0.2577970965949316, + "grad_norm": 0.1185583844780922, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 67730 + }, + { + "epoch": 0.25783515906305426, + "grad_norm": 0.14808182418346405, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 67740 + }, + { + "epoch": 0.257873221531177, + "grad_norm": 0.12773369252681732, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 67750 + }, + { + "epoch": 0.25791128399929963, + "grad_norm": 0.12895694375038147, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 67760 + }, + { + "epoch": 0.25794934646742235, + "grad_norm": 0.12679018080234528, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 67770 + }, + { + "epoch": 0.257987408935545, + "grad_norm": 0.13147412240505219, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 67780 + }, + { + "epoch": 0.2580254714036677, + "grad_norm": 0.12539303302764893, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 67790 + }, + { + "epoch": 0.2580635338717904, + "grad_norm": 0.11887650191783905, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 67800 + }, + { + "epoch": 0.2581015963399131, + "grad_norm": 0.11561664193868637, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 67810 + }, + { + "epoch": 0.25813965880803574, + "grad_norm": 0.13444383442401886, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 67820 + }, + { + "epoch": 0.25817772127615846, + "grad_norm": 0.13082697987556458, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 67830 + }, + { + "epoch": 0.2582157837442811, + "grad_norm": 0.13210126757621765, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 67840 + }, + { + "epoch": 0.25825384621240377, + "grad_norm": 0.13368277251720428, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 67850 + }, + { + "epoch": 0.2582919086805265, + "grad_norm": 0.13177311420440674, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 67860 + }, + { + "epoch": 0.25832997114864914, + "grad_norm": 0.12315039336681366, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 67870 + }, + { + "epoch": 0.25836803361677185, + "grad_norm": 0.11096197366714478, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 67880 + }, + { + "epoch": 0.2584060960848945, + "grad_norm": 0.11734460294246674, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 67890 + }, + { + "epoch": 0.2584441585530172, + "grad_norm": 0.12845294177532196, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 67900 + }, + { + "epoch": 0.2584822210211399, + "grad_norm": 0.12121350318193436, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 67910 + }, + { + "epoch": 0.2585202834892626, + "grad_norm": 0.12107622623443604, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 67920 + }, + { + "epoch": 0.25855834595738525, + "grad_norm": 0.12794913351535797, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 67930 + }, + { + "epoch": 0.25859640842550796, + "grad_norm": 0.11874082684516907, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 67940 + }, + { + "epoch": 0.2586344708936306, + "grad_norm": 0.158302441239357, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 67950 + }, + { + "epoch": 0.25867253336175333, + "grad_norm": 0.13007745146751404, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 67960 + }, + { + "epoch": 0.258710595829876, + "grad_norm": 0.13091380894184113, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 67970 + }, + { + "epoch": 0.2587486582979987, + "grad_norm": 0.1220938190817833, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 67980 + }, + { + "epoch": 0.25878672076612136, + "grad_norm": 0.12740594148635864, + "learning_rate": 0.0005, + "loss": 2.1451, + "step": 67990 + }, + { + "epoch": 0.258824783234244, + "grad_norm": 0.11739125847816467, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 68000 + }, + { + "epoch": 0.25886284570236673, + "grad_norm": 0.12394890189170837, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 68010 + }, + { + "epoch": 0.2589009081704894, + "grad_norm": 0.12760695815086365, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 68020 + }, + { + "epoch": 0.2589389706386121, + "grad_norm": 0.1216312125325203, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 68030 + }, + { + "epoch": 0.25897703310673476, + "grad_norm": 0.1176496222615242, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 68040 + }, + { + "epoch": 0.25901509557485747, + "grad_norm": 0.1290826052427292, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 68050 + }, + { + "epoch": 0.2590531580429801, + "grad_norm": 0.13374052941799164, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 68060 + }, + { + "epoch": 0.25909122051110284, + "grad_norm": 0.11010095477104187, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 68070 + }, + { + "epoch": 0.2591292829792255, + "grad_norm": 0.13147957623004913, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 68080 + }, + { + "epoch": 0.2591673454473482, + "grad_norm": 0.12701791524887085, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 68090 + }, + { + "epoch": 0.25920540791547086, + "grad_norm": 0.1303391009569168, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 68100 + }, + { + "epoch": 0.2592434703835936, + "grad_norm": 0.29294759035110474, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 68110 + }, + { + "epoch": 0.25928153285171623, + "grad_norm": 0.11597706377506256, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 68120 + }, + { + "epoch": 0.25931959531983895, + "grad_norm": 0.1320647895336151, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 68130 + }, + { + "epoch": 0.2593576577879616, + "grad_norm": 0.11850161850452423, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 68140 + }, + { + "epoch": 0.25939572025608426, + "grad_norm": 0.12103667855262756, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 68150 + }, + { + "epoch": 0.259433782724207, + "grad_norm": 0.12830379605293274, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 68160 + }, + { + "epoch": 0.25947184519232963, + "grad_norm": 0.1356051117181778, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 68170 + }, + { + "epoch": 0.25950990766045234, + "grad_norm": 0.11961928755044937, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 68180 + }, + { + "epoch": 0.259547970128575, + "grad_norm": 0.12661005556583405, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 68190 + }, + { + "epoch": 0.2595860325966977, + "grad_norm": 0.11801932007074356, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 68200 + }, + { + "epoch": 0.25962409506482037, + "grad_norm": 0.11936034262180328, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 68210 + }, + { + "epoch": 0.2596621575329431, + "grad_norm": 0.15334779024124146, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 68220 + }, + { + "epoch": 0.25970022000106574, + "grad_norm": 0.12236060202121735, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 68230 + }, + { + "epoch": 0.25973828246918845, + "grad_norm": 0.11557623744010925, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 68240 + }, + { + "epoch": 0.2597763449373111, + "grad_norm": 0.12068971246480942, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 68250 + }, + { + "epoch": 0.2598144074054338, + "grad_norm": 0.1296335607767105, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 68260 + }, + { + "epoch": 0.2598524698735565, + "grad_norm": 0.13720114529132843, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 68270 + }, + { + "epoch": 0.25989053234167914, + "grad_norm": 0.13136669993400574, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 68280 + }, + { + "epoch": 0.25992859480980185, + "grad_norm": 0.1193876564502716, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 68290 + }, + { + "epoch": 0.2599666572779245, + "grad_norm": 0.11309941112995148, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 68300 + }, + { + "epoch": 0.2600047197460472, + "grad_norm": 0.14278921484947205, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 68310 + }, + { + "epoch": 0.2600427822141699, + "grad_norm": 0.12418635934591293, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 68320 + }, + { + "epoch": 0.2600808446822926, + "grad_norm": 0.12227904051542282, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 68330 + }, + { + "epoch": 0.26011890715041525, + "grad_norm": 0.12280824035406113, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 68340 + }, + { + "epoch": 0.26015696961853796, + "grad_norm": 0.1240249052643776, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 68350 + }, + { + "epoch": 0.2601950320866606, + "grad_norm": 0.13094522058963776, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 68360 + }, + { + "epoch": 0.26023309455478333, + "grad_norm": 0.1325329691171646, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 68370 + }, + { + "epoch": 0.260271157022906, + "grad_norm": 0.12583765387535095, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 68380 + }, + { + "epoch": 0.2603092194910287, + "grad_norm": 0.13313855230808258, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 68390 + }, + { + "epoch": 0.26034728195915136, + "grad_norm": 0.11284197121858597, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 68400 + }, + { + "epoch": 0.26038534442727407, + "grad_norm": 0.11301718652248383, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 68410 + }, + { + "epoch": 0.2604234068953967, + "grad_norm": 0.12848864495754242, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 68420 + }, + { + "epoch": 0.2604614693635194, + "grad_norm": 0.1173487976193428, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 68430 + }, + { + "epoch": 0.2604995318316421, + "grad_norm": 0.12169645726680756, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 68440 + }, + { + "epoch": 0.26053759429976475, + "grad_norm": 0.12304247915744781, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 68450 + }, + { + "epoch": 0.26057565676788746, + "grad_norm": 0.12181452661752701, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 68460 + }, + { + "epoch": 0.2606137192360101, + "grad_norm": 0.1285940557718277, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 68470 + }, + { + "epoch": 0.26065178170413283, + "grad_norm": 0.1419471800327301, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 68480 + }, + { + "epoch": 0.2606898441722555, + "grad_norm": 0.1256123185157776, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 68490 + }, + { + "epoch": 0.2607279066403782, + "grad_norm": 0.11510717868804932, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 68500 + }, + { + "epoch": 0.26076596910850086, + "grad_norm": 0.12129577994346619, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 68510 + }, + { + "epoch": 0.2608040315766236, + "grad_norm": 0.13086970150470734, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 68520 + }, + { + "epoch": 0.26084209404474623, + "grad_norm": 0.13211217522621155, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 68530 + }, + { + "epoch": 0.26088015651286894, + "grad_norm": 0.13078242540359497, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 68540 + }, + { + "epoch": 0.2609182189809916, + "grad_norm": 0.11888111382722855, + "learning_rate": 0.0005, + "loss": 2.1617, + "step": 68550 + }, + { + "epoch": 0.2609562814491143, + "grad_norm": 0.14845970273017883, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 68560 + }, + { + "epoch": 0.26099434391723697, + "grad_norm": 0.12593623995780945, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 68570 + }, + { + "epoch": 0.2610324063853596, + "grad_norm": 0.1280219405889511, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 68580 + }, + { + "epoch": 0.26107046885348234, + "grad_norm": 0.12807269394397736, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 68590 + }, + { + "epoch": 0.261108531321605, + "grad_norm": 0.12324277311563492, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 68600 + }, + { + "epoch": 0.2611465937897277, + "grad_norm": 0.13317665457725525, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 68610 + }, + { + "epoch": 0.26118465625785037, + "grad_norm": 0.12222850322723389, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 68620 + }, + { + "epoch": 0.2612227187259731, + "grad_norm": 0.12694650888442993, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 68630 + }, + { + "epoch": 0.26126078119409574, + "grad_norm": 0.12431015074253082, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 68640 + }, + { + "epoch": 0.26129884366221845, + "grad_norm": 0.12812888622283936, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 68650 + }, + { + "epoch": 0.2613369061303411, + "grad_norm": 0.13000817596912384, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 68660 + }, + { + "epoch": 0.2613749685984638, + "grad_norm": 0.1278466284275055, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 68670 + }, + { + "epoch": 0.2614130310665865, + "grad_norm": 0.1243181899189949, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 68680 + }, + { + "epoch": 0.2614510935347092, + "grad_norm": 0.11554887890815735, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 68690 + }, + { + "epoch": 0.26148915600283185, + "grad_norm": 0.12728217244148254, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 68700 + }, + { + "epoch": 0.2615272184709545, + "grad_norm": 0.11840543150901794, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 68710 + }, + { + "epoch": 0.2615652809390772, + "grad_norm": 0.12170374393463135, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 68720 + }, + { + "epoch": 0.2616033434071999, + "grad_norm": 0.1255207061767578, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 68730 + }, + { + "epoch": 0.2616414058753226, + "grad_norm": 0.11496064066886902, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 68740 + }, + { + "epoch": 0.26167946834344524, + "grad_norm": 0.11680759489536285, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 68750 + }, + { + "epoch": 0.26171753081156796, + "grad_norm": 0.14723053574562073, + "learning_rate": 0.0005, + "loss": 2.1476, + "step": 68760 + }, + { + "epoch": 0.2617555932796906, + "grad_norm": 0.11675665527582169, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 68770 + }, + { + "epoch": 0.2617936557478133, + "grad_norm": 0.11647092550992966, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 68780 + }, + { + "epoch": 0.261831718215936, + "grad_norm": 0.12418477982282639, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 68790 + }, + { + "epoch": 0.2618697806840587, + "grad_norm": 0.12684816122055054, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 68800 + }, + { + "epoch": 0.26190784315218135, + "grad_norm": 0.12460498511791229, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 68810 + }, + { + "epoch": 0.26194590562030406, + "grad_norm": 0.11982579529285431, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 68820 + }, + { + "epoch": 0.2619839680884267, + "grad_norm": 0.12077322602272034, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 68830 + }, + { + "epoch": 0.26202203055654943, + "grad_norm": 0.1211499571800232, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 68840 + }, + { + "epoch": 0.2620600930246721, + "grad_norm": 0.12279724329710007, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 68850 + }, + { + "epoch": 0.26209815549279475, + "grad_norm": 0.14473022520542145, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 68860 + }, + { + "epoch": 0.26213621796091746, + "grad_norm": 0.12201548367738724, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 68870 + }, + { + "epoch": 0.2621742804290401, + "grad_norm": 0.15855537354946136, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 68880 + }, + { + "epoch": 0.26221234289716283, + "grad_norm": 0.1251879632472992, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 68890 + }, + { + "epoch": 0.2622504053652855, + "grad_norm": 0.15588484704494476, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 68900 + }, + { + "epoch": 0.2622884678334082, + "grad_norm": 0.11355319619178772, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 68910 + }, + { + "epoch": 0.26232653030153086, + "grad_norm": 0.12741701304912567, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 68920 + }, + { + "epoch": 0.26236459276965357, + "grad_norm": 0.12800206243991852, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 68930 + }, + { + "epoch": 0.26240265523777623, + "grad_norm": 0.12004125118255615, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 68940 + }, + { + "epoch": 0.26244071770589894, + "grad_norm": 0.14113670587539673, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 68950 + }, + { + "epoch": 0.2624787801740216, + "grad_norm": 0.1426374763250351, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 68960 + }, + { + "epoch": 0.2625168426421443, + "grad_norm": 0.12280486524105072, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 68970 + }, + { + "epoch": 0.26255490511026697, + "grad_norm": 0.13465885818004608, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 68980 + }, + { + "epoch": 0.2625929675783897, + "grad_norm": 0.13163095712661743, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 68990 + }, + { + "epoch": 0.26263103004651234, + "grad_norm": 0.18279962241649628, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 69000 + }, + { + "epoch": 0.262669092514635, + "grad_norm": 0.11554229259490967, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 69010 + }, + { + "epoch": 0.2627071549827577, + "grad_norm": 0.12765353918075562, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 69020 + }, + { + "epoch": 0.26274521745088036, + "grad_norm": 0.11855091154575348, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 69030 + }, + { + "epoch": 0.2627832799190031, + "grad_norm": 0.13786230981349945, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 69040 + }, + { + "epoch": 0.26282134238712573, + "grad_norm": 0.11385150998830795, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 69050 + }, + { + "epoch": 0.26285940485524845, + "grad_norm": 0.11940497159957886, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 69060 + }, + { + "epoch": 0.2628974673233711, + "grad_norm": 0.12230822443962097, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 69070 + }, + { + "epoch": 0.2629355297914938, + "grad_norm": 0.12506653368473053, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 69080 + }, + { + "epoch": 0.2629735922596165, + "grad_norm": 0.12588395178318024, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 69090 + }, + { + "epoch": 0.2630116547277392, + "grad_norm": 0.12764878571033478, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 69100 + }, + { + "epoch": 0.26304971719586184, + "grad_norm": 0.15151946246623993, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 69110 + }, + { + "epoch": 0.26308777966398456, + "grad_norm": 0.1337091028690338, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 69120 + }, + { + "epoch": 0.2631258421321072, + "grad_norm": 0.11346838623285294, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 69130 + }, + { + "epoch": 0.26316390460022987, + "grad_norm": 0.11807180941104889, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 69140 + }, + { + "epoch": 0.2632019670683526, + "grad_norm": 0.12256387621164322, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 69150 + }, + { + "epoch": 0.26324002953647524, + "grad_norm": 0.11840704083442688, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 69160 + }, + { + "epoch": 0.26327809200459795, + "grad_norm": 0.12220650166273117, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 69170 + }, + { + "epoch": 0.2633161544727206, + "grad_norm": 0.13918541371822357, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 69180 + }, + { + "epoch": 0.2633542169408433, + "grad_norm": 0.11918140202760696, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 69190 + }, + { + "epoch": 0.263392279408966, + "grad_norm": 0.12479212880134583, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 69200 + }, + { + "epoch": 0.2634303418770887, + "grad_norm": 0.13087302446365356, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 69210 + }, + { + "epoch": 0.26346840434521135, + "grad_norm": 0.1416410207748413, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 69220 + }, + { + "epoch": 0.26350646681333406, + "grad_norm": 0.12037502229213715, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 69230 + }, + { + "epoch": 0.2635445292814567, + "grad_norm": 0.13915680348873138, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 69240 + }, + { + "epoch": 0.26358259174957943, + "grad_norm": 0.14051872491836548, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 69250 + }, + { + "epoch": 0.2636206542177021, + "grad_norm": 0.12612906098365784, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 69260 + }, + { + "epoch": 0.2636587166858248, + "grad_norm": 0.13273148238658905, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 69270 + }, + { + "epoch": 0.26369677915394746, + "grad_norm": 0.13362795114517212, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 69280 + }, + { + "epoch": 0.2637348416220701, + "grad_norm": 0.12257708609104156, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 69290 + }, + { + "epoch": 0.26377290409019283, + "grad_norm": 0.12788087129592896, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 69300 + }, + { + "epoch": 0.2638109665583155, + "grad_norm": 0.11019985377788544, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 69310 + }, + { + "epoch": 0.2638490290264382, + "grad_norm": 0.12560158967971802, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 69320 + }, + { + "epoch": 0.26388709149456085, + "grad_norm": 0.12579011917114258, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 69330 + }, + { + "epoch": 0.26392515396268357, + "grad_norm": 0.1318081021308899, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 69340 + }, + { + "epoch": 0.2639632164308062, + "grad_norm": 0.12909837067127228, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 69350 + }, + { + "epoch": 0.26400127889892894, + "grad_norm": 0.12274816632270813, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 69360 + }, + { + "epoch": 0.2640393413670516, + "grad_norm": 0.11799930781126022, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 69370 + }, + { + "epoch": 0.2640774038351743, + "grad_norm": 0.13099320232868195, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 69380 + }, + { + "epoch": 0.26411546630329696, + "grad_norm": 0.118326835334301, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 69390 + }, + { + "epoch": 0.2641535287714197, + "grad_norm": 0.12781855463981628, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 69400 + }, + { + "epoch": 0.26419159123954233, + "grad_norm": 0.1174471378326416, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 69410 + }, + { + "epoch": 0.26422965370766505, + "grad_norm": 0.12137634307146072, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 69420 + }, + { + "epoch": 0.2642677161757877, + "grad_norm": 0.12340341508388519, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 69430 + }, + { + "epoch": 0.26430577864391036, + "grad_norm": 0.14514338970184326, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 69440 + }, + { + "epoch": 0.2643438411120331, + "grad_norm": 0.13454660773277283, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 69450 + }, + { + "epoch": 0.26438190358015573, + "grad_norm": 0.13224738836288452, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 69460 + }, + { + "epoch": 0.26441996604827844, + "grad_norm": 0.11716333031654358, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 69470 + }, + { + "epoch": 0.2644580285164011, + "grad_norm": 0.12512919306755066, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 69480 + }, + { + "epoch": 0.2644960909845238, + "grad_norm": 0.12434687465429306, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 69490 + }, + { + "epoch": 0.26453415345264647, + "grad_norm": 0.12177757173776627, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 69500 + }, + { + "epoch": 0.2645722159207692, + "grad_norm": 0.1328718662261963, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 69510 + }, + { + "epoch": 0.26461027838889184, + "grad_norm": 0.12692506611347198, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 69520 + }, + { + "epoch": 0.26464834085701455, + "grad_norm": 0.1363808661699295, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 69530 + }, + { + "epoch": 0.2646864033251372, + "grad_norm": 0.1261531114578247, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 69540 + }, + { + "epoch": 0.2647244657932599, + "grad_norm": 0.11819518357515335, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 69550 + }, + { + "epoch": 0.2647625282613826, + "grad_norm": 0.12026696652173996, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 69560 + }, + { + "epoch": 0.26480059072950524, + "grad_norm": 0.12217105180025101, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 69570 + }, + { + "epoch": 0.26483865319762795, + "grad_norm": 0.11567086726427078, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 69580 + }, + { + "epoch": 0.2648767156657506, + "grad_norm": 0.1255672425031662, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 69590 + }, + { + "epoch": 0.2649147781338733, + "grad_norm": 0.14222608506679535, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 69600 + }, + { + "epoch": 0.264952840601996, + "grad_norm": 0.11926767975091934, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 69610 + }, + { + "epoch": 0.2649909030701187, + "grad_norm": 0.11993763595819473, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 69620 + }, + { + "epoch": 0.26502896553824135, + "grad_norm": 0.13194534182548523, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 69630 + }, + { + "epoch": 0.26506702800636406, + "grad_norm": 0.11919394135475159, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 69640 + }, + { + "epoch": 0.2651050904744867, + "grad_norm": 0.1282067596912384, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 69650 + }, + { + "epoch": 0.26514315294260943, + "grad_norm": 0.12731339037418365, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 69660 + }, + { + "epoch": 0.2651812154107321, + "grad_norm": 0.1202862411737442, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 69670 + }, + { + "epoch": 0.2652192778788548, + "grad_norm": 0.13804534077644348, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 69680 + }, + { + "epoch": 0.26525734034697745, + "grad_norm": 0.12254875898361206, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 69690 + }, + { + "epoch": 0.26529540281510017, + "grad_norm": 0.13759438693523407, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 69700 + }, + { + "epoch": 0.2653334652832228, + "grad_norm": 0.1366581916809082, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 69710 + }, + { + "epoch": 0.2653715277513455, + "grad_norm": 0.1233244389295578, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 69720 + }, + { + "epoch": 0.2654095902194682, + "grad_norm": 0.13405732810497284, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 69730 + }, + { + "epoch": 0.26544765268759085, + "grad_norm": 0.11756180226802826, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 69740 + }, + { + "epoch": 0.26548571515571356, + "grad_norm": 0.1275683045387268, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 69750 + }, + { + "epoch": 0.2655237776238362, + "grad_norm": 0.1181812584400177, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 69760 + }, + { + "epoch": 0.26556184009195893, + "grad_norm": 0.12255300581455231, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 69770 + }, + { + "epoch": 0.2655999025600816, + "grad_norm": 0.12513133883476257, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 69780 + }, + { + "epoch": 0.2656379650282043, + "grad_norm": 0.13193869590759277, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 69790 + }, + { + "epoch": 0.26567602749632696, + "grad_norm": 0.12380549311637878, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 69800 + }, + { + "epoch": 0.2657140899644497, + "grad_norm": 0.15247705578804016, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 69810 + }, + { + "epoch": 0.26575215243257233, + "grad_norm": 0.14371612668037415, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 69820 + }, + { + "epoch": 0.26579021490069504, + "grad_norm": 0.11494036763906479, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 69830 + }, + { + "epoch": 0.2658282773688177, + "grad_norm": 0.12889176607131958, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 69840 + }, + { + "epoch": 0.2658663398369404, + "grad_norm": 0.1366003006696701, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 69850 + }, + { + "epoch": 0.26590440230506307, + "grad_norm": 0.1334492415189743, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 69860 + }, + { + "epoch": 0.2659424647731857, + "grad_norm": 0.12129637598991394, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 69870 + }, + { + "epoch": 0.26598052724130844, + "grad_norm": 0.12232821434736252, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 69880 + }, + { + "epoch": 0.2660185897094311, + "grad_norm": 0.14795225858688354, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 69890 + }, + { + "epoch": 0.2660566521775538, + "grad_norm": 0.13824805617332458, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 69900 + }, + { + "epoch": 0.26609471464567647, + "grad_norm": 0.1196480542421341, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 69910 + }, + { + "epoch": 0.2661327771137992, + "grad_norm": 0.12577514350414276, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 69920 + }, + { + "epoch": 0.26617083958192184, + "grad_norm": 0.1174229308962822, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 69930 + }, + { + "epoch": 0.26620890205004455, + "grad_norm": 0.12444499135017395, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 69940 + }, + { + "epoch": 0.2662469645181672, + "grad_norm": 0.1242179125547409, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 69950 + }, + { + "epoch": 0.2662850269862899, + "grad_norm": 0.14606353640556335, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 69960 + }, + { + "epoch": 0.2663230894544126, + "grad_norm": 0.11545176059007645, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 69970 + }, + { + "epoch": 0.2663611519225353, + "grad_norm": 0.12576058506965637, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 69980 + }, + { + "epoch": 0.26639921439065795, + "grad_norm": 0.12525852024555206, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 69990 + }, + { + "epoch": 0.26643727685878066, + "grad_norm": 0.12224716693162918, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 70000 + }, + { + "epoch": 0.2664753393269033, + "grad_norm": 0.13207238912582397, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 70010 + }, + { + "epoch": 0.266513401795026, + "grad_norm": 0.1505841761827469, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 70020 + }, + { + "epoch": 0.2665514642631487, + "grad_norm": 0.14168912172317505, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 70030 + }, + { + "epoch": 0.26658952673127134, + "grad_norm": 0.12297537177801132, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 70040 + }, + { + "epoch": 0.26662758919939405, + "grad_norm": 0.14183282852172852, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 70050 + }, + { + "epoch": 0.2666656516675167, + "grad_norm": 0.1262883096933365, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 70060 + }, + { + "epoch": 0.2667037141356394, + "grad_norm": 0.13181641697883606, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 70070 + }, + { + "epoch": 0.2667417766037621, + "grad_norm": 0.12226633727550507, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 70080 + }, + { + "epoch": 0.2667798390718848, + "grad_norm": 0.12573277950286865, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 70090 + }, + { + "epoch": 0.26681790154000745, + "grad_norm": 0.13281308114528656, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 70100 + }, + { + "epoch": 0.26685596400813016, + "grad_norm": 0.12960529327392578, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 70110 + }, + { + "epoch": 0.2668940264762528, + "grad_norm": 0.11388522386550903, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 70120 + }, + { + "epoch": 0.26693208894437553, + "grad_norm": 0.12484312802553177, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 70130 + }, + { + "epoch": 0.2669701514124982, + "grad_norm": 0.11258337646722794, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 70140 + }, + { + "epoch": 0.26700821388062085, + "grad_norm": 0.11932699382305145, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 70150 + }, + { + "epoch": 0.26704627634874356, + "grad_norm": 0.12339174002408981, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 70160 + }, + { + "epoch": 0.2670843388168662, + "grad_norm": 0.12048283219337463, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 70170 + }, + { + "epoch": 0.26712240128498893, + "grad_norm": 0.11754105240106583, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 70180 + }, + { + "epoch": 0.2671604637531116, + "grad_norm": 0.12378121167421341, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 70190 + }, + { + "epoch": 0.2671985262212343, + "grad_norm": 0.12185100466012955, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 70200 + }, + { + "epoch": 0.26723658868935696, + "grad_norm": 0.13541337847709656, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 70210 + }, + { + "epoch": 0.26727465115747967, + "grad_norm": 0.12544508278369904, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 70220 + }, + { + "epoch": 0.2673127136256023, + "grad_norm": 0.12439633905887604, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 70230 + }, + { + "epoch": 0.26735077609372504, + "grad_norm": 0.13282085955142975, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 70240 + }, + { + "epoch": 0.2673888385618477, + "grad_norm": 0.11832325905561447, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 70250 + }, + { + "epoch": 0.2674269010299704, + "grad_norm": 0.12853141129016876, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 70260 + }, + { + "epoch": 0.26746496349809307, + "grad_norm": 0.11417865008115768, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 70270 + }, + { + "epoch": 0.2675030259662158, + "grad_norm": 0.12991313636302948, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 70280 + }, + { + "epoch": 0.26754108843433844, + "grad_norm": 0.12928979098796844, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 70290 + }, + { + "epoch": 0.2675791509024611, + "grad_norm": 0.13762488961219788, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 70300 + }, + { + "epoch": 0.2676172133705838, + "grad_norm": 0.14076592028141022, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 70310 + }, + { + "epoch": 0.26765527583870646, + "grad_norm": 0.136482372879982, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 70320 + }, + { + "epoch": 0.2676933383068292, + "grad_norm": 0.13098162412643433, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 70330 + }, + { + "epoch": 0.26773140077495183, + "grad_norm": 0.10979632288217545, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 70340 + }, + { + "epoch": 0.26776946324307455, + "grad_norm": 0.1365809589624405, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 70350 + }, + { + "epoch": 0.2678075257111972, + "grad_norm": 0.12630967795848846, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 70360 + }, + { + "epoch": 0.2678455881793199, + "grad_norm": 0.12642823159694672, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 70370 + }, + { + "epoch": 0.2678836506474426, + "grad_norm": 0.12485835701227188, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 70380 + }, + { + "epoch": 0.2679217131155653, + "grad_norm": 0.12378615885972977, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 70390 + }, + { + "epoch": 0.26795977558368794, + "grad_norm": 0.11807256191968918, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 70400 + }, + { + "epoch": 0.26799783805181066, + "grad_norm": 0.1206602081656456, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 70410 + }, + { + "epoch": 0.2680359005199333, + "grad_norm": 0.13767731189727783, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 70420 + }, + { + "epoch": 0.268073962988056, + "grad_norm": 0.11438094079494476, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 70430 + }, + { + "epoch": 0.2681120254561787, + "grad_norm": 0.15731793642044067, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 70440 + }, + { + "epoch": 0.26815008792430134, + "grad_norm": 0.13722564280033112, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 70450 + }, + { + "epoch": 0.26818815039242405, + "grad_norm": 0.12116382271051407, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 70460 + }, + { + "epoch": 0.2682262128605467, + "grad_norm": 0.125070720911026, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 70470 + }, + { + "epoch": 0.2682642753286694, + "grad_norm": 0.13829511404037476, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 70480 + }, + { + "epoch": 0.2683023377967921, + "grad_norm": 0.1141280084848404, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 70490 + }, + { + "epoch": 0.2683404002649148, + "grad_norm": 0.17980065941810608, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 70500 + }, + { + "epoch": 0.26837846273303745, + "grad_norm": 0.14300209283828735, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 70510 + }, + { + "epoch": 0.26841652520116016, + "grad_norm": 0.15837660431861877, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 70520 + }, + { + "epoch": 0.2684545876692828, + "grad_norm": 0.12317508459091187, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 70530 + }, + { + "epoch": 0.26849265013740553, + "grad_norm": 0.11688899248838425, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 70540 + }, + { + "epoch": 0.2685307126055282, + "grad_norm": 0.12119658291339874, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 70550 + }, + { + "epoch": 0.2685687750736509, + "grad_norm": 0.1074419841170311, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 70560 + }, + { + "epoch": 0.26860683754177356, + "grad_norm": 0.12809164822101593, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 70570 + }, + { + "epoch": 0.2686449000098962, + "grad_norm": 0.13653728365898132, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 70580 + }, + { + "epoch": 0.2686829624780189, + "grad_norm": 0.13394500315189362, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 70590 + }, + { + "epoch": 0.2687210249461416, + "grad_norm": 0.1318061202764511, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 70600 + }, + { + "epoch": 0.2687590874142643, + "grad_norm": 0.24545541405677795, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 70610 + }, + { + "epoch": 0.26879714988238695, + "grad_norm": 0.12553299963474274, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 70620 + }, + { + "epoch": 0.26883521235050967, + "grad_norm": 0.12328213453292847, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 70630 + }, + { + "epoch": 0.2688732748186323, + "grad_norm": 0.125651016831398, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 70640 + }, + { + "epoch": 0.26891133728675504, + "grad_norm": 0.13269735872745514, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 70650 + }, + { + "epoch": 0.2689493997548777, + "grad_norm": 0.12147653847932816, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 70660 + }, + { + "epoch": 0.2689874622230004, + "grad_norm": 0.12299970537424088, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 70670 + }, + { + "epoch": 0.26902552469112306, + "grad_norm": 0.12026827782392502, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 70680 + }, + { + "epoch": 0.2690635871592458, + "grad_norm": 0.13033393025398254, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 70690 + }, + { + "epoch": 0.26910164962736843, + "grad_norm": 0.1141572818160057, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 70700 + }, + { + "epoch": 0.26913971209549115, + "grad_norm": 0.11448405683040619, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 70710 + }, + { + "epoch": 0.2691777745636138, + "grad_norm": 0.1274835169315338, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 70720 + }, + { + "epoch": 0.26921583703173646, + "grad_norm": 0.12299812585115433, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 70730 + }, + { + "epoch": 0.2692538994998592, + "grad_norm": 0.11992338299751282, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 70740 + }, + { + "epoch": 0.26929196196798183, + "grad_norm": 0.11715205758810043, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 70750 + }, + { + "epoch": 0.26933002443610454, + "grad_norm": 0.1122170016169548, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 70760 + }, + { + "epoch": 0.2693680869042272, + "grad_norm": 0.1312478929758072, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 70770 + }, + { + "epoch": 0.2694061493723499, + "grad_norm": 0.12616901099681854, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 70780 + }, + { + "epoch": 0.26944421184047257, + "grad_norm": 0.12851281464099884, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 70790 + }, + { + "epoch": 0.2694822743085953, + "grad_norm": 0.11669650673866272, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 70800 + }, + { + "epoch": 0.26952033677671794, + "grad_norm": 0.12413670867681503, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 70810 + }, + { + "epoch": 0.26955839924484065, + "grad_norm": 0.12353887408971786, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 70820 + }, + { + "epoch": 0.2695964617129633, + "grad_norm": 0.12555578351020813, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 70830 + }, + { + "epoch": 0.269634524181086, + "grad_norm": 0.12032879889011383, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 70840 + }, + { + "epoch": 0.2696725866492087, + "grad_norm": 0.12755469977855682, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 70850 + }, + { + "epoch": 0.2697106491173314, + "grad_norm": 0.12309958040714264, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 70860 + }, + { + "epoch": 0.26974871158545405, + "grad_norm": 0.1292559802532196, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 70870 + }, + { + "epoch": 0.2697867740535767, + "grad_norm": 0.11725734174251556, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 70880 + }, + { + "epoch": 0.2698248365216994, + "grad_norm": 0.12600664794445038, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 70890 + }, + { + "epoch": 0.2698628989898221, + "grad_norm": 0.1374066025018692, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 70900 + }, + { + "epoch": 0.2699009614579448, + "grad_norm": 0.13278096914291382, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 70910 + }, + { + "epoch": 0.26993902392606745, + "grad_norm": 0.125087708234787, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 70920 + }, + { + "epoch": 0.26997708639419016, + "grad_norm": 0.13760273158550262, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 70930 + }, + { + "epoch": 0.2700151488623128, + "grad_norm": 0.1145314872264862, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 70940 + }, + { + "epoch": 0.2700532113304355, + "grad_norm": 0.12293320894241333, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 70950 + }, + { + "epoch": 0.2700912737985582, + "grad_norm": 0.12370413541793823, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 70960 + }, + { + "epoch": 0.2701293362666809, + "grad_norm": 0.12029270082712173, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 70970 + }, + { + "epoch": 0.27016739873480355, + "grad_norm": 0.13946790993213654, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 70980 + }, + { + "epoch": 0.27020546120292627, + "grad_norm": 0.13315635919570923, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 70990 + }, + { + "epoch": 0.2702435236710489, + "grad_norm": 0.1230052262544632, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 71000 + }, + { + "epoch": 0.2702815861391716, + "grad_norm": 0.11799309402704239, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 71010 + }, + { + "epoch": 0.2703196486072943, + "grad_norm": 0.12883540987968445, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 71020 + }, + { + "epoch": 0.27035771107541695, + "grad_norm": 0.12491268664598465, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 71030 + }, + { + "epoch": 0.27039577354353966, + "grad_norm": 0.120393306016922, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 71040 + }, + { + "epoch": 0.2704338360116623, + "grad_norm": 0.11560060828924179, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 71050 + }, + { + "epoch": 0.27047189847978503, + "grad_norm": 0.11167508363723755, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 71060 + }, + { + "epoch": 0.2705099609479077, + "grad_norm": 0.1285915970802307, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 71070 + }, + { + "epoch": 0.2705480234160304, + "grad_norm": 0.13339774310588837, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 71080 + }, + { + "epoch": 0.27058608588415306, + "grad_norm": 0.13140417635440826, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 71090 + }, + { + "epoch": 0.2706241483522758, + "grad_norm": 0.11885833740234375, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 71100 + }, + { + "epoch": 0.27066221082039843, + "grad_norm": 0.12500646710395813, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 71110 + }, + { + "epoch": 0.27070027328852114, + "grad_norm": 0.12917517125606537, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 71120 + }, + { + "epoch": 0.2707383357566438, + "grad_norm": 0.11975311487913132, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 71130 + }, + { + "epoch": 0.2707763982247665, + "grad_norm": 0.12097851186990738, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 71140 + }, + { + "epoch": 0.27081446069288917, + "grad_norm": 0.12181615084409714, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 71150 + }, + { + "epoch": 0.2708525231610118, + "grad_norm": 0.13859118521213531, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 71160 + }, + { + "epoch": 0.27089058562913454, + "grad_norm": 0.133628249168396, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 71170 + }, + { + "epoch": 0.2709286480972572, + "grad_norm": 0.11724966019392014, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 71180 + }, + { + "epoch": 0.2709667105653799, + "grad_norm": 0.12641079723834991, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 71190 + }, + { + "epoch": 0.27100477303350257, + "grad_norm": 0.1277732402086258, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 71200 + }, + { + "epoch": 0.2710428355016253, + "grad_norm": 0.13520966470241547, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 71210 + }, + { + "epoch": 0.27108089796974794, + "grad_norm": 0.13680309057235718, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 71220 + }, + { + "epoch": 0.27111896043787065, + "grad_norm": 0.13837707042694092, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 71230 + }, + { + "epoch": 0.2711570229059933, + "grad_norm": 0.12964460253715515, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 71240 + }, + { + "epoch": 0.271195085374116, + "grad_norm": 0.13072986900806427, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 71250 + }, + { + "epoch": 0.2712331478422387, + "grad_norm": 0.12068648636341095, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 71260 + }, + { + "epoch": 0.2712712103103614, + "grad_norm": 0.12971381843090057, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 71270 + }, + { + "epoch": 0.27130927277848405, + "grad_norm": 0.12840324640274048, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 71280 + }, + { + "epoch": 0.27134733524660676, + "grad_norm": 0.12420535832643509, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 71290 + }, + { + "epoch": 0.2713853977147294, + "grad_norm": 0.12429869174957275, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 71300 + }, + { + "epoch": 0.27142346018285207, + "grad_norm": 0.12602099776268005, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 71310 + }, + { + "epoch": 0.2714615226509748, + "grad_norm": 0.1185278370976448, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 71320 + }, + { + "epoch": 0.27149958511909744, + "grad_norm": 0.13878685235977173, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 71330 + }, + { + "epoch": 0.27153764758722015, + "grad_norm": 0.12120170891284943, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 71340 + }, + { + "epoch": 0.2715757100553428, + "grad_norm": 0.12688519060611725, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 71350 + }, + { + "epoch": 0.2716137725234655, + "grad_norm": 0.13000376522541046, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 71360 + }, + { + "epoch": 0.2716518349915882, + "grad_norm": 0.12218218296766281, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 71370 + }, + { + "epoch": 0.2716898974597109, + "grad_norm": 0.14565366506576538, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 71380 + }, + { + "epoch": 0.27172795992783355, + "grad_norm": 0.13229680061340332, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 71390 + }, + { + "epoch": 0.27176602239595626, + "grad_norm": 0.138389453291893, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 71400 + }, + { + "epoch": 0.2718040848640789, + "grad_norm": 0.136601060628891, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 71410 + }, + { + "epoch": 0.27184214733220163, + "grad_norm": 0.13356034457683563, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 71420 + }, + { + "epoch": 0.2718802098003243, + "grad_norm": 0.12265916913747787, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 71430 + }, + { + "epoch": 0.27191827226844695, + "grad_norm": 0.12345866113901138, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 71440 + }, + { + "epoch": 0.27195633473656966, + "grad_norm": 0.12076914310455322, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 71450 + }, + { + "epoch": 0.2719943972046923, + "grad_norm": 0.12351701408624649, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 71460 + }, + { + "epoch": 0.27203245967281503, + "grad_norm": 0.1288938671350479, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 71470 + }, + { + "epoch": 0.2720705221409377, + "grad_norm": 0.13546644151210785, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 71480 + }, + { + "epoch": 0.2721085846090604, + "grad_norm": 0.11406464874744415, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 71490 + }, + { + "epoch": 0.27214664707718306, + "grad_norm": 0.12579847872257233, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 71500 + }, + { + "epoch": 0.27218470954530577, + "grad_norm": 0.12819981575012207, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 71510 + }, + { + "epoch": 0.2722227720134284, + "grad_norm": 0.12256570160388947, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 71520 + }, + { + "epoch": 0.27226083448155114, + "grad_norm": 0.11458420753479004, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 71530 + }, + { + "epoch": 0.2722988969496738, + "grad_norm": 0.12950736284255981, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 71540 + }, + { + "epoch": 0.2723369594177965, + "grad_norm": 0.11344198882579803, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 71550 + }, + { + "epoch": 0.27237502188591917, + "grad_norm": 0.11599477380514145, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 71560 + }, + { + "epoch": 0.2724130843540419, + "grad_norm": 0.11852739006280899, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 71570 + }, + { + "epoch": 0.27245114682216454, + "grad_norm": 0.11747030168771744, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 71580 + }, + { + "epoch": 0.2724892092902872, + "grad_norm": 0.13440634310245514, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 71590 + }, + { + "epoch": 0.2725272717584099, + "grad_norm": 0.12431393563747406, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 71600 + }, + { + "epoch": 0.27256533422653256, + "grad_norm": 0.1287822425365448, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 71610 + }, + { + "epoch": 0.2726033966946553, + "grad_norm": 0.12533940374851227, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 71620 + }, + { + "epoch": 0.27264145916277793, + "grad_norm": 0.1331218034029007, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 71630 + }, + { + "epoch": 0.27267952163090065, + "grad_norm": 0.11935290694236755, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 71640 + }, + { + "epoch": 0.2727175840990233, + "grad_norm": 0.12688718736171722, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 71650 + }, + { + "epoch": 0.272755646567146, + "grad_norm": 0.12275033444166183, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 71660 + }, + { + "epoch": 0.27279370903526867, + "grad_norm": 0.14148494601249695, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 71670 + }, + { + "epoch": 0.2728317715033914, + "grad_norm": 0.13131959736347198, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 71680 + }, + { + "epoch": 0.27286983397151404, + "grad_norm": 0.12532196938991547, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 71690 + }, + { + "epoch": 0.27290789643963675, + "grad_norm": 0.1329348236322403, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 71700 + }, + { + "epoch": 0.2729459589077594, + "grad_norm": 0.1316693127155304, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 71710 + }, + { + "epoch": 0.2729840213758821, + "grad_norm": 0.13339757919311523, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 71720 + }, + { + "epoch": 0.2730220838440048, + "grad_norm": 0.1378227323293686, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 71730 + }, + { + "epoch": 0.27306014631212744, + "grad_norm": 0.12965930998325348, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 71740 + }, + { + "epoch": 0.27309820878025015, + "grad_norm": 0.11927662044763565, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 71750 + }, + { + "epoch": 0.2731362712483728, + "grad_norm": 0.12863266468048096, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 71760 + }, + { + "epoch": 0.2731743337164955, + "grad_norm": 0.13583002984523773, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 71770 + }, + { + "epoch": 0.2732123961846182, + "grad_norm": 0.132333904504776, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 71780 + }, + { + "epoch": 0.2732504586527409, + "grad_norm": 0.12701696157455444, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 71790 + }, + { + "epoch": 0.27328852112086355, + "grad_norm": 0.13651852309703827, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 71800 + }, + { + "epoch": 0.27332658358898626, + "grad_norm": 0.13848647475242615, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 71810 + }, + { + "epoch": 0.2733646460571089, + "grad_norm": 0.11856270581483841, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 71820 + }, + { + "epoch": 0.27340270852523163, + "grad_norm": 0.12606863677501678, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 71830 + }, + { + "epoch": 0.2734407709933543, + "grad_norm": 0.14228320121765137, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 71840 + }, + { + "epoch": 0.273478833461477, + "grad_norm": 0.1278725564479828, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 71850 + }, + { + "epoch": 0.27351689592959966, + "grad_norm": 0.13307037949562073, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 71860 + }, + { + "epoch": 0.2735549583977223, + "grad_norm": 0.6107332110404968, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 71870 + }, + { + "epoch": 0.273593020865845, + "grad_norm": 0.12317752838134766, + "learning_rate": 0.0005, + "loss": 2.1467, + "step": 71880 + }, + { + "epoch": 0.2736310833339677, + "grad_norm": 0.12442834675312042, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 71890 + }, + { + "epoch": 0.2736691458020904, + "grad_norm": 0.11339490860700607, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 71900 + }, + { + "epoch": 0.27370720827021305, + "grad_norm": 0.1318211406469345, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 71910 + }, + { + "epoch": 0.27374527073833577, + "grad_norm": 0.1129222959280014, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 71920 + }, + { + "epoch": 0.2737833332064584, + "grad_norm": 0.14258186519145966, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 71930 + }, + { + "epoch": 0.27382139567458114, + "grad_norm": 0.1269596368074417, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 71940 + }, + { + "epoch": 0.2738594581427038, + "grad_norm": 0.13796262443065643, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 71950 + }, + { + "epoch": 0.2738975206108265, + "grad_norm": 0.1308310478925705, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 71960 + }, + { + "epoch": 0.27393558307894916, + "grad_norm": 0.1301732063293457, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 71970 + }, + { + "epoch": 0.2739736455470719, + "grad_norm": 0.11353213340044022, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 71980 + }, + { + "epoch": 0.27401170801519453, + "grad_norm": 0.1273941546678543, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 71990 + }, + { + "epoch": 0.27404977048331725, + "grad_norm": 0.12934479117393494, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 72000 + }, + { + "epoch": 0.2740878329514399, + "grad_norm": 0.11563912779092789, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 72010 + }, + { + "epoch": 0.27412589541956256, + "grad_norm": 0.1264961212873459, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 72020 + }, + { + "epoch": 0.2741639578876853, + "grad_norm": 0.14248254895210266, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 72030 + }, + { + "epoch": 0.27420202035580793, + "grad_norm": 0.11887305229902267, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 72040 + }, + { + "epoch": 0.27424008282393064, + "grad_norm": 0.12073611468076706, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 72050 + }, + { + "epoch": 0.2742781452920533, + "grad_norm": 0.1239795982837677, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 72060 + }, + { + "epoch": 0.274316207760176, + "grad_norm": 0.12811580300331116, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 72070 + }, + { + "epoch": 0.27435427022829867, + "grad_norm": 0.12124840915203094, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 72080 + }, + { + "epoch": 0.2743923326964214, + "grad_norm": 0.12464161962270737, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 72090 + }, + { + "epoch": 0.27443039516454404, + "grad_norm": 0.13844406604766846, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 72100 + }, + { + "epoch": 0.27446845763266675, + "grad_norm": 0.15234865248203278, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 72110 + }, + { + "epoch": 0.2745065201007894, + "grad_norm": 0.10983190685510635, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 72120 + }, + { + "epoch": 0.2745445825689121, + "grad_norm": 0.13060855865478516, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 72130 + }, + { + "epoch": 0.2745826450370348, + "grad_norm": 0.12701942026615143, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 72140 + }, + { + "epoch": 0.2746207075051575, + "grad_norm": 0.11977972090244293, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 72150 + }, + { + "epoch": 0.27465876997328015, + "grad_norm": 0.12274200469255447, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 72160 + }, + { + "epoch": 0.2746968324414028, + "grad_norm": 0.12599435448646545, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 72170 + }, + { + "epoch": 0.2747348949095255, + "grad_norm": 0.13417574763298035, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 72180 + }, + { + "epoch": 0.2747729573776482, + "grad_norm": 0.11986130475997925, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 72190 + }, + { + "epoch": 0.2748110198457709, + "grad_norm": 0.1296272724866867, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 72200 + }, + { + "epoch": 0.27484908231389354, + "grad_norm": 0.11655092984437943, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 72210 + }, + { + "epoch": 0.27488714478201626, + "grad_norm": 0.12658260762691498, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 72220 + }, + { + "epoch": 0.2749252072501389, + "grad_norm": 0.11190526187419891, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 72230 + }, + { + "epoch": 0.2749632697182616, + "grad_norm": 0.12382861971855164, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 72240 + }, + { + "epoch": 0.2750013321863843, + "grad_norm": 0.14002086222171783, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 72250 + }, + { + "epoch": 0.275039394654507, + "grad_norm": 0.12437102198600769, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 72260 + }, + { + "epoch": 0.27507745712262965, + "grad_norm": 0.12130562216043472, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 72270 + }, + { + "epoch": 0.27511551959075237, + "grad_norm": 0.12964333593845367, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 72280 + }, + { + "epoch": 0.275153582058875, + "grad_norm": 0.1255006045103073, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 72290 + }, + { + "epoch": 0.27519164452699774, + "grad_norm": 0.13847501575946808, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 72300 + }, + { + "epoch": 0.2752297069951204, + "grad_norm": 0.1399521380662918, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 72310 + }, + { + "epoch": 0.27526776946324305, + "grad_norm": 0.1263510286808014, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 72320 + }, + { + "epoch": 0.27530583193136576, + "grad_norm": 0.12814727425575256, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 72330 + }, + { + "epoch": 0.2753438943994884, + "grad_norm": 0.12721291184425354, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 72340 + }, + { + "epoch": 0.27538195686761113, + "grad_norm": 0.12828604876995087, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 72350 + }, + { + "epoch": 0.2754200193357338, + "grad_norm": 0.13843713700771332, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 72360 + }, + { + "epoch": 0.2754580818038565, + "grad_norm": 0.12065796554088593, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 72370 + }, + { + "epoch": 0.27549614427197916, + "grad_norm": 0.1309119164943695, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 72380 + }, + { + "epoch": 0.2755342067401019, + "grad_norm": 0.12360799312591553, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 72390 + }, + { + "epoch": 0.27557226920822453, + "grad_norm": 0.11458812654018402, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 72400 + }, + { + "epoch": 0.27561033167634724, + "grad_norm": 0.12010073661804199, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 72410 + }, + { + "epoch": 0.2756483941444699, + "grad_norm": 0.1172177717089653, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 72420 + }, + { + "epoch": 0.2756864566125926, + "grad_norm": 0.1232071965932846, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 72430 + }, + { + "epoch": 0.27572451908071527, + "grad_norm": 0.12564972043037415, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 72440 + }, + { + "epoch": 0.2757625815488379, + "grad_norm": 0.11357922106981277, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 72450 + }, + { + "epoch": 0.27580064401696064, + "grad_norm": 0.13264420628547668, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 72460 + }, + { + "epoch": 0.2758387064850833, + "grad_norm": 0.1259356290102005, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 72470 + }, + { + "epoch": 0.275876768953206, + "grad_norm": 0.1306888610124588, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 72480 + }, + { + "epoch": 0.27591483142132867, + "grad_norm": 0.12248992919921875, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 72490 + }, + { + "epoch": 0.2759528938894514, + "grad_norm": 0.11357472091913223, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 72500 + }, + { + "epoch": 0.27599095635757404, + "grad_norm": 0.11590695381164551, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 72510 + }, + { + "epoch": 0.27602901882569675, + "grad_norm": 0.12227820605039597, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 72520 + }, + { + "epoch": 0.2760670812938194, + "grad_norm": 0.11770544946193695, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 72530 + }, + { + "epoch": 0.2761051437619421, + "grad_norm": 0.12528224289417267, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 72540 + }, + { + "epoch": 0.2761432062300648, + "grad_norm": 0.12878407537937164, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 72550 + }, + { + "epoch": 0.2761812686981875, + "grad_norm": 0.13306809961795807, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 72560 + }, + { + "epoch": 0.27621933116631014, + "grad_norm": 0.13325989246368408, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 72570 + }, + { + "epoch": 0.27625739363443286, + "grad_norm": 0.1145695373415947, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 72580 + }, + { + "epoch": 0.2762954561025555, + "grad_norm": 0.11948937922716141, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 72590 + }, + { + "epoch": 0.27633351857067817, + "grad_norm": 0.11720909178256989, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 72600 + }, + { + "epoch": 0.2763715810388009, + "grad_norm": 0.12367577850818634, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 72610 + }, + { + "epoch": 0.27640964350692354, + "grad_norm": 0.13133859634399414, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 72620 + }, + { + "epoch": 0.27644770597504625, + "grad_norm": 0.12476001679897308, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 72630 + }, + { + "epoch": 0.2764857684431689, + "grad_norm": 0.12994538247585297, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 72640 + }, + { + "epoch": 0.2765238309112916, + "grad_norm": 0.13551446795463562, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 72650 + }, + { + "epoch": 0.2765618933794143, + "grad_norm": 0.13254836201667786, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 72660 + }, + { + "epoch": 0.276599955847537, + "grad_norm": 0.13123776018619537, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 72670 + }, + { + "epoch": 0.27663801831565965, + "grad_norm": 0.11721350997686386, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 72680 + }, + { + "epoch": 0.27667608078378236, + "grad_norm": 0.125374436378479, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 72690 + }, + { + "epoch": 0.276714143251905, + "grad_norm": 0.129308819770813, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 72700 + }, + { + "epoch": 0.27675220572002773, + "grad_norm": 0.14361171424388885, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 72710 + }, + { + "epoch": 0.2767902681881504, + "grad_norm": 0.14105713367462158, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 72720 + }, + { + "epoch": 0.2768283306562731, + "grad_norm": 0.12079322338104248, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 72730 + }, + { + "epoch": 0.27686639312439576, + "grad_norm": 0.12566441297531128, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 72740 + }, + { + "epoch": 0.2769044555925184, + "grad_norm": 0.12056665867567062, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 72750 + }, + { + "epoch": 0.27694251806064113, + "grad_norm": 0.11491995304822922, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 72760 + }, + { + "epoch": 0.2769805805287638, + "grad_norm": 0.12552793323993683, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 72770 + }, + { + "epoch": 0.2770186429968865, + "grad_norm": 0.12376090884208679, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 72780 + }, + { + "epoch": 0.27705670546500916, + "grad_norm": 0.11079999059438705, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 72790 + }, + { + "epoch": 0.27709476793313187, + "grad_norm": 0.11827901005744934, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 72800 + }, + { + "epoch": 0.2771328304012545, + "grad_norm": 0.12451273202896118, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 72810 + }, + { + "epoch": 0.27717089286937724, + "grad_norm": 0.13412445783615112, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 72820 + }, + { + "epoch": 0.2772089553374999, + "grad_norm": 0.12760554254055023, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 72830 + }, + { + "epoch": 0.2772470178056226, + "grad_norm": 0.1256927102804184, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 72840 + }, + { + "epoch": 0.27728508027374527, + "grad_norm": 0.12824873626232147, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 72850 + }, + { + "epoch": 0.277323142741868, + "grad_norm": 0.11432984471321106, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 72860 + }, + { + "epoch": 0.27736120520999064, + "grad_norm": 0.11752847582101822, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 72870 + }, + { + "epoch": 0.2773992676781133, + "grad_norm": 0.12238814681768417, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 72880 + }, + { + "epoch": 0.277437330146236, + "grad_norm": 0.128130242228508, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 72890 + }, + { + "epoch": 0.27747539261435866, + "grad_norm": 0.1231345683336258, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 72900 + }, + { + "epoch": 0.2775134550824814, + "grad_norm": 0.11722811311483383, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 72910 + }, + { + "epoch": 0.27755151755060403, + "grad_norm": 0.1214623749256134, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 72920 + }, + { + "epoch": 0.27758958001872674, + "grad_norm": 0.1423121839761734, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 72930 + }, + { + "epoch": 0.2776276424868494, + "grad_norm": 0.12980258464813232, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 72940 + }, + { + "epoch": 0.2776657049549721, + "grad_norm": 0.14954525232315063, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 72950 + }, + { + "epoch": 0.27770376742309477, + "grad_norm": 0.12002082914113998, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 72960 + }, + { + "epoch": 0.2777418298912175, + "grad_norm": 0.11208499222993851, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 72970 + }, + { + "epoch": 0.27777989235934014, + "grad_norm": 0.12585601210594177, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 72980 + }, + { + "epoch": 0.27781795482746285, + "grad_norm": 0.135234072804451, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 72990 + }, + { + "epoch": 0.2778560172955855, + "grad_norm": 0.11779285222291946, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 73000 + }, + { + "epoch": 0.2778940797637082, + "grad_norm": 0.13158877193927765, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 73010 + }, + { + "epoch": 0.2779321422318309, + "grad_norm": 0.31607118248939514, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 73020 + }, + { + "epoch": 0.27797020469995354, + "grad_norm": 0.11737809330224991, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 73030 + }, + { + "epoch": 0.27800826716807625, + "grad_norm": 0.12738829851150513, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 73040 + }, + { + "epoch": 0.2780463296361989, + "grad_norm": 0.13510660827159882, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 73050 + }, + { + "epoch": 0.2780843921043216, + "grad_norm": 0.12289441376924515, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 73060 + }, + { + "epoch": 0.2781224545724443, + "grad_norm": 0.15084408223628998, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 73070 + }, + { + "epoch": 0.278160517040567, + "grad_norm": 0.11964649707078934, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 73080 + }, + { + "epoch": 0.27819857950868965, + "grad_norm": 0.1252226084470749, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 73090 + }, + { + "epoch": 0.27823664197681236, + "grad_norm": 0.12739278376102448, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 73100 + }, + { + "epoch": 0.278274704444935, + "grad_norm": 0.10967393219470978, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 73110 + }, + { + "epoch": 0.27831276691305773, + "grad_norm": 0.13770455121994019, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 73120 + }, + { + "epoch": 0.2783508293811804, + "grad_norm": 0.1319299340248108, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 73130 + }, + { + "epoch": 0.2783888918493031, + "grad_norm": 0.12463415414094925, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 73140 + }, + { + "epoch": 0.27842695431742576, + "grad_norm": 0.12271592766046524, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 73150 + }, + { + "epoch": 0.27846501678554847, + "grad_norm": 0.14967022836208344, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 73160 + }, + { + "epoch": 0.2785030792536711, + "grad_norm": 0.13166722655296326, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 73170 + }, + { + "epoch": 0.2785411417217938, + "grad_norm": 0.13443920016288757, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 73180 + }, + { + "epoch": 0.2785792041899165, + "grad_norm": 0.12196287512779236, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 73190 + }, + { + "epoch": 0.27861726665803915, + "grad_norm": 0.14742255210876465, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 73200 + }, + { + "epoch": 0.27865532912616187, + "grad_norm": 0.11828165501356125, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 73210 + }, + { + "epoch": 0.2786933915942845, + "grad_norm": 0.12427137792110443, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 73220 + }, + { + "epoch": 0.27873145406240724, + "grad_norm": 0.11481276899576187, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 73230 + }, + { + "epoch": 0.2787695165305299, + "grad_norm": 0.13980519771575928, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 73240 + }, + { + "epoch": 0.2788075789986526, + "grad_norm": 0.12095464020967484, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 73250 + }, + { + "epoch": 0.27884564146677526, + "grad_norm": 0.1202949658036232, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 73260 + }, + { + "epoch": 0.278883703934898, + "grad_norm": 0.12349908798933029, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 73270 + }, + { + "epoch": 0.27892176640302063, + "grad_norm": 0.12171231955289841, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 73280 + }, + { + "epoch": 0.27895982887114334, + "grad_norm": 0.12804117798805237, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 73290 + }, + { + "epoch": 0.278997891339266, + "grad_norm": 0.139211505651474, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 73300 + }, + { + "epoch": 0.27903595380738866, + "grad_norm": 0.12075608968734741, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 73310 + }, + { + "epoch": 0.27907401627551137, + "grad_norm": 0.120372474193573, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 73320 + }, + { + "epoch": 0.27911207874363403, + "grad_norm": 0.1180444061756134, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 73330 + }, + { + "epoch": 0.27915014121175674, + "grad_norm": 0.12552054226398468, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 73340 + }, + { + "epoch": 0.2791882036798794, + "grad_norm": 0.15276488661766052, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 73350 + }, + { + "epoch": 0.2792262661480021, + "grad_norm": 0.13675646483898163, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 73360 + }, + { + "epoch": 0.27926432861612477, + "grad_norm": 0.1263941377401352, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 73370 + }, + { + "epoch": 0.2793023910842475, + "grad_norm": 0.12853482365608215, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 73380 + }, + { + "epoch": 0.27934045355237014, + "grad_norm": 0.13050010800361633, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 73390 + }, + { + "epoch": 0.27937851602049285, + "grad_norm": 0.12878821790218353, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 73400 + }, + { + "epoch": 0.2794165784886155, + "grad_norm": 0.12213098257780075, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 73410 + }, + { + "epoch": 0.2794546409567382, + "grad_norm": 0.12876629829406738, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 73420 + }, + { + "epoch": 0.2794927034248609, + "grad_norm": 0.1272810399532318, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 73430 + }, + { + "epoch": 0.2795307658929836, + "grad_norm": 0.12028060853481293, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 73440 + }, + { + "epoch": 0.27956882836110625, + "grad_norm": 0.1291312426328659, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 73450 + }, + { + "epoch": 0.2796068908292289, + "grad_norm": 0.1288689821958542, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 73460 + }, + { + "epoch": 0.2796449532973516, + "grad_norm": 0.1291615068912506, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 73470 + }, + { + "epoch": 0.2796830157654743, + "grad_norm": 0.12141037732362747, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 73480 + }, + { + "epoch": 0.279721078233597, + "grad_norm": 0.14140747487545013, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 73490 + }, + { + "epoch": 0.27975914070171964, + "grad_norm": 0.12675082683563232, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 73500 + }, + { + "epoch": 0.27979720316984236, + "grad_norm": 0.12179327756166458, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 73510 + }, + { + "epoch": 0.279835265637965, + "grad_norm": 0.11616285890340805, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 73520 + }, + { + "epoch": 0.2798733281060877, + "grad_norm": 0.12648041546344757, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 73530 + }, + { + "epoch": 0.2799113905742104, + "grad_norm": 0.12828604876995087, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 73540 + }, + { + "epoch": 0.2799494530423331, + "grad_norm": 0.11238578706979752, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 73550 + }, + { + "epoch": 0.27998751551045575, + "grad_norm": 0.13090801239013672, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 73560 + }, + { + "epoch": 0.28002557797857847, + "grad_norm": 0.13082750141620636, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 73570 + }, + { + "epoch": 0.2800636404467011, + "grad_norm": 0.13069972395896912, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 73580 + }, + { + "epoch": 0.28010170291482384, + "grad_norm": 0.12461374700069427, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 73590 + }, + { + "epoch": 0.2801397653829465, + "grad_norm": 0.13140574097633362, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 73600 + }, + { + "epoch": 0.28017782785106915, + "grad_norm": 0.12235130369663239, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 73610 + }, + { + "epoch": 0.28021589031919186, + "grad_norm": 0.1320742815732956, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 73620 + }, + { + "epoch": 0.2802539527873145, + "grad_norm": 0.13617640733718872, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 73630 + }, + { + "epoch": 0.28029201525543723, + "grad_norm": 0.12380155920982361, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 73640 + }, + { + "epoch": 0.2803300777235599, + "grad_norm": 0.14408768713474274, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 73650 + }, + { + "epoch": 0.2803681401916826, + "grad_norm": 0.11952300369739532, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 73660 + }, + { + "epoch": 0.28040620265980526, + "grad_norm": 0.14055335521697998, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 73670 + }, + { + "epoch": 0.28044426512792797, + "grad_norm": 0.12879961729049683, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 73680 + }, + { + "epoch": 0.28048232759605063, + "grad_norm": 0.1171557605266571, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 73690 + }, + { + "epoch": 0.28052039006417334, + "grad_norm": 0.1254773586988449, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 73700 + }, + { + "epoch": 0.280558452532296, + "grad_norm": 0.14595383405685425, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 73710 + }, + { + "epoch": 0.2805965150004187, + "grad_norm": 0.1298341304063797, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 73720 + }, + { + "epoch": 0.28063457746854137, + "grad_norm": 0.12870018184185028, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 73730 + }, + { + "epoch": 0.280672639936664, + "grad_norm": 0.12347274273633957, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 73740 + }, + { + "epoch": 0.28071070240478674, + "grad_norm": 0.11614564806222916, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 73750 + }, + { + "epoch": 0.2807487648729094, + "grad_norm": 0.120811328291893, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 73760 + }, + { + "epoch": 0.2807868273410321, + "grad_norm": 0.11254284530878067, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 73770 + }, + { + "epoch": 0.28082488980915477, + "grad_norm": 0.12072857469320297, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 73780 + }, + { + "epoch": 0.2808629522772775, + "grad_norm": 0.11864668130874634, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 73790 + }, + { + "epoch": 0.28090101474540013, + "grad_norm": 0.12393586337566376, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 73800 + }, + { + "epoch": 0.28093907721352285, + "grad_norm": 0.11929041892290115, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 73810 + }, + { + "epoch": 0.2809771396816455, + "grad_norm": 0.11602532863616943, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 73820 + }, + { + "epoch": 0.2810152021497682, + "grad_norm": 0.12017183005809784, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 73830 + }, + { + "epoch": 0.2810532646178909, + "grad_norm": 0.131440207362175, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 73840 + }, + { + "epoch": 0.2810913270860136, + "grad_norm": 0.12414850294589996, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 73850 + }, + { + "epoch": 0.28112938955413624, + "grad_norm": 0.1354745477437973, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 73860 + }, + { + "epoch": 0.28116745202225896, + "grad_norm": 0.13583865761756897, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 73870 + }, + { + "epoch": 0.2812055144903816, + "grad_norm": 0.12519274652004242, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 73880 + }, + { + "epoch": 0.28124357695850427, + "grad_norm": 0.12412375956773758, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 73890 + }, + { + "epoch": 0.281281639426627, + "grad_norm": 0.12623530626296997, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 73900 + }, + { + "epoch": 0.28131970189474964, + "grad_norm": 0.12054393440485, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 73910 + }, + { + "epoch": 0.28135776436287235, + "grad_norm": 0.1273050457239151, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 73920 + }, + { + "epoch": 0.281395826830995, + "grad_norm": 0.13887496292591095, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 73930 + }, + { + "epoch": 0.2814338892991177, + "grad_norm": 0.14566300809383392, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 73940 + }, + { + "epoch": 0.2814719517672404, + "grad_norm": 0.11882128566503525, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 73950 + }, + { + "epoch": 0.2815100142353631, + "grad_norm": 0.13641339540481567, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 73960 + }, + { + "epoch": 0.28154807670348575, + "grad_norm": 0.12365715205669403, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 73970 + }, + { + "epoch": 0.28158613917160846, + "grad_norm": 0.15532588958740234, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 73980 + }, + { + "epoch": 0.2816242016397311, + "grad_norm": 0.1406833678483963, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 73990 + }, + { + "epoch": 0.28166226410785383, + "grad_norm": 0.1372503936290741, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 74000 + }, + { + "epoch": 0.2817003265759765, + "grad_norm": 0.11215030401945114, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 74010 + }, + { + "epoch": 0.2817383890440992, + "grad_norm": 0.12551802396774292, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 74020 + }, + { + "epoch": 0.28177645151222186, + "grad_norm": 0.11723242700099945, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 74030 + }, + { + "epoch": 0.2818145139803445, + "grad_norm": 0.12735706567764282, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 74040 + }, + { + "epoch": 0.28185257644846723, + "grad_norm": 0.12368855625391006, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 74050 + }, + { + "epoch": 0.2818906389165899, + "grad_norm": 0.1375809907913208, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 74060 + }, + { + "epoch": 0.2819287013847126, + "grad_norm": 0.128162682056427, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 74070 + }, + { + "epoch": 0.28196676385283526, + "grad_norm": 0.1310570240020752, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 74080 + }, + { + "epoch": 0.28200482632095797, + "grad_norm": 0.11914895474910736, + "learning_rate": 0.0005, + "loss": 2.1428, + "step": 74090 + }, + { + "epoch": 0.2820428887890806, + "grad_norm": 0.12058401107788086, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 74100 + }, + { + "epoch": 0.28208095125720334, + "grad_norm": 0.13332392275333405, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 74110 + }, + { + "epoch": 0.282119013725326, + "grad_norm": 0.130690336227417, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 74120 + }, + { + "epoch": 0.2821570761934487, + "grad_norm": 0.1214623898267746, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 74130 + }, + { + "epoch": 0.28219513866157137, + "grad_norm": 0.1267269253730774, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 74140 + }, + { + "epoch": 0.2822332011296941, + "grad_norm": 0.13244369626045227, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 74150 + }, + { + "epoch": 0.28227126359781674, + "grad_norm": 0.12093862146139145, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 74160 + }, + { + "epoch": 0.2823093260659394, + "grad_norm": 0.13259826600551605, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 74170 + }, + { + "epoch": 0.2823473885340621, + "grad_norm": 0.12631496787071228, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 74180 + }, + { + "epoch": 0.28238545100218476, + "grad_norm": 0.11987242102622986, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 74190 + }, + { + "epoch": 0.2824235134703075, + "grad_norm": 0.12496347725391388, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 74200 + }, + { + "epoch": 0.28246157593843013, + "grad_norm": 0.12754859030246735, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 74210 + }, + { + "epoch": 0.28249963840655284, + "grad_norm": 0.12239838391542435, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 74220 + }, + { + "epoch": 0.2825377008746755, + "grad_norm": 0.13033181428909302, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 74230 + }, + { + "epoch": 0.2825757633427982, + "grad_norm": 0.12822109460830688, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 74240 + }, + { + "epoch": 0.28261382581092087, + "grad_norm": 0.12703551352024078, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 74250 + }, + { + "epoch": 0.2826518882790436, + "grad_norm": 0.11725924909114838, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 74260 + }, + { + "epoch": 0.28268995074716624, + "grad_norm": 0.12533323466777802, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 74270 + }, + { + "epoch": 0.28272801321528895, + "grad_norm": 0.11936778575181961, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 74280 + }, + { + "epoch": 0.2827660756834116, + "grad_norm": 0.1154133751988411, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 74290 + }, + { + "epoch": 0.2828041381515343, + "grad_norm": 0.11242377758026123, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 74300 + }, + { + "epoch": 0.282842200619657, + "grad_norm": 0.12405408173799515, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 74310 + }, + { + "epoch": 0.28288026308777964, + "grad_norm": 0.11994063854217529, + "learning_rate": 0.0005, + "loss": 2.1475, + "step": 74320 + }, + { + "epoch": 0.28291832555590235, + "grad_norm": 0.1216566413640976, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 74330 + }, + { + "epoch": 0.282956388024025, + "grad_norm": 0.13246215879917145, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 74340 + }, + { + "epoch": 0.2829944504921477, + "grad_norm": 0.13144953548908234, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 74350 + }, + { + "epoch": 0.2830325129602704, + "grad_norm": 0.14443819224834442, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 74360 + }, + { + "epoch": 0.2830705754283931, + "grad_norm": 0.13754788041114807, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 74370 + }, + { + "epoch": 0.28310863789651575, + "grad_norm": 0.12103603780269623, + "learning_rate": 0.0005, + "loss": 2.1431, + "step": 74380 + }, + { + "epoch": 0.28314670036463846, + "grad_norm": 0.13121990859508514, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 74390 + }, + { + "epoch": 0.2831847628327611, + "grad_norm": 0.11447214335203171, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 74400 + }, + { + "epoch": 0.28322282530088383, + "grad_norm": 0.12295140326023102, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 74410 + }, + { + "epoch": 0.2832608877690065, + "grad_norm": 0.13450969755649567, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 74420 + }, + { + "epoch": 0.2832989502371292, + "grad_norm": 0.12559343874454498, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 74430 + }, + { + "epoch": 0.28333701270525186, + "grad_norm": 0.11981473118066788, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 74440 + }, + { + "epoch": 0.28337507517337457, + "grad_norm": 0.14503072202205658, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 74450 + }, + { + "epoch": 0.2834131376414972, + "grad_norm": 0.1257060319185257, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 74460 + }, + { + "epoch": 0.2834512001096199, + "grad_norm": 0.12952490150928497, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 74470 + }, + { + "epoch": 0.2834892625777426, + "grad_norm": 0.12097674608230591, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 74480 + }, + { + "epoch": 0.28352732504586525, + "grad_norm": 0.13185162842273712, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 74490 + }, + { + "epoch": 0.28356538751398797, + "grad_norm": 0.14450843632221222, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 74500 + }, + { + "epoch": 0.2836034499821106, + "grad_norm": 0.12296470254659653, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 74510 + }, + { + "epoch": 0.28364151245023334, + "grad_norm": 0.13791923224925995, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 74520 + }, + { + "epoch": 0.283679574918356, + "grad_norm": 0.12625502049922943, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 74530 + }, + { + "epoch": 0.2837176373864787, + "grad_norm": 0.13758540153503418, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 74540 + }, + { + "epoch": 0.28375569985460136, + "grad_norm": 0.11705252528190613, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 74550 + }, + { + "epoch": 0.2837937623227241, + "grad_norm": 0.11666113138198853, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 74560 + }, + { + "epoch": 0.28383182479084673, + "grad_norm": 0.1299813836812973, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 74570 + }, + { + "epoch": 0.28386988725896944, + "grad_norm": 0.3286300003528595, + "learning_rate": 0.0005, + "loss": 2.1568, + "step": 74580 + }, + { + "epoch": 0.2839079497270921, + "grad_norm": 0.1263127326965332, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 74590 + }, + { + "epoch": 0.2839460121952148, + "grad_norm": 0.12550726532936096, + "learning_rate": 0.0005, + "loss": 2.1442, + "step": 74600 + }, + { + "epoch": 0.28398407466333747, + "grad_norm": 0.13197514414787292, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 74610 + }, + { + "epoch": 0.28402213713146013, + "grad_norm": 0.11156166344881058, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 74620 + }, + { + "epoch": 0.28406019959958284, + "grad_norm": 0.12309394776821136, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 74630 + }, + { + "epoch": 0.2840982620677055, + "grad_norm": 0.13029909133911133, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 74640 + }, + { + "epoch": 0.2841363245358282, + "grad_norm": 0.13694219291210175, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 74650 + }, + { + "epoch": 0.28417438700395087, + "grad_norm": 0.12063515186309814, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 74660 + }, + { + "epoch": 0.2842124494720736, + "grad_norm": 0.136690154671669, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 74670 + }, + { + "epoch": 0.28425051194019624, + "grad_norm": 0.1438709944486618, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 74680 + }, + { + "epoch": 0.28428857440831895, + "grad_norm": 0.14761757850646973, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 74690 + }, + { + "epoch": 0.2843266368764416, + "grad_norm": 0.14363446831703186, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 74700 + }, + { + "epoch": 0.2843646993445643, + "grad_norm": 0.13309715688228607, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 74710 + }, + { + "epoch": 0.284402761812687, + "grad_norm": 0.14354118704795837, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 74720 + }, + { + "epoch": 0.2844408242808097, + "grad_norm": 0.11641010642051697, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 74730 + }, + { + "epoch": 0.28447888674893235, + "grad_norm": 0.13731835782527924, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 74740 + }, + { + "epoch": 0.284516949217055, + "grad_norm": 0.13670092821121216, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 74750 + }, + { + "epoch": 0.2845550116851777, + "grad_norm": 0.1361704021692276, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 74760 + }, + { + "epoch": 0.2845930741533004, + "grad_norm": 0.12592118978500366, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 74770 + }, + { + "epoch": 0.2846311366214231, + "grad_norm": 0.12394729256629944, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 74780 + }, + { + "epoch": 0.28466919908954574, + "grad_norm": 0.12797507643699646, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 74790 + }, + { + "epoch": 0.28470726155766846, + "grad_norm": 0.13249561190605164, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 74800 + }, + { + "epoch": 0.2847453240257911, + "grad_norm": 0.13181069493293762, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 74810 + }, + { + "epoch": 0.2847833864939138, + "grad_norm": 0.13363561034202576, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 74820 + }, + { + "epoch": 0.2848214489620365, + "grad_norm": 0.12551195919513702, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 74830 + }, + { + "epoch": 0.2848595114301592, + "grad_norm": 0.12691380083560944, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 74840 + }, + { + "epoch": 0.28489757389828185, + "grad_norm": 0.13347996771335602, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 74850 + }, + { + "epoch": 0.28493563636640457, + "grad_norm": 0.12483879178762436, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 74860 + }, + { + "epoch": 0.2849736988345272, + "grad_norm": 0.11743100732564926, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 74870 + }, + { + "epoch": 0.28501176130264994, + "grad_norm": 0.11756300181150436, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 74880 + }, + { + "epoch": 0.2850498237707726, + "grad_norm": 0.1318596601486206, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 74890 + }, + { + "epoch": 0.28508788623889525, + "grad_norm": 0.13540859520435333, + "learning_rate": 0.0005, + "loss": 2.1436, + "step": 74900 + }, + { + "epoch": 0.28512594870701796, + "grad_norm": 0.13051091134548187, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 74910 + }, + { + "epoch": 0.2851640111751406, + "grad_norm": 0.13236452639102936, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 74920 + }, + { + "epoch": 0.28520207364326333, + "grad_norm": 0.12521909177303314, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 74930 + }, + { + "epoch": 0.285240136111386, + "grad_norm": 0.11868865042924881, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 74940 + }, + { + "epoch": 0.2852781985795087, + "grad_norm": 0.12900525331497192, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 74950 + }, + { + "epoch": 0.28531626104763136, + "grad_norm": 0.12465689331293106, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 74960 + }, + { + "epoch": 0.28535432351575407, + "grad_norm": 0.12036836892366409, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 74970 + }, + { + "epoch": 0.28539238598387673, + "grad_norm": 0.1260298192501068, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 74980 + }, + { + "epoch": 0.28543044845199944, + "grad_norm": 0.12549512088298798, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 74990 + }, + { + "epoch": 0.2854685109201221, + "grad_norm": 0.1338365375995636, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 75000 + }, + { + "epoch": 0.2855065733882448, + "grad_norm": 0.1249644011259079, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 75010 + }, + { + "epoch": 0.28554463585636747, + "grad_norm": 0.11954779922962189, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 75020 + }, + { + "epoch": 0.2855826983244902, + "grad_norm": 0.12321747094392776, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 75030 + }, + { + "epoch": 0.28562076079261284, + "grad_norm": 0.12530706822872162, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 75040 + }, + { + "epoch": 0.2856588232607355, + "grad_norm": 0.11701780557632446, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 75050 + }, + { + "epoch": 0.2856968857288582, + "grad_norm": 0.12913040816783905, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 75060 + }, + { + "epoch": 0.28573494819698086, + "grad_norm": 0.13130801916122437, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 75070 + }, + { + "epoch": 0.2857730106651036, + "grad_norm": 0.12802307307720184, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 75080 + }, + { + "epoch": 0.28581107313322623, + "grad_norm": 0.12605822086334229, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 75090 + }, + { + "epoch": 0.28584913560134895, + "grad_norm": 0.12512341141700745, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 75100 + }, + { + "epoch": 0.2858871980694716, + "grad_norm": 0.13102522492408752, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 75110 + }, + { + "epoch": 0.2859252605375943, + "grad_norm": 0.3720043897628784, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 75120 + }, + { + "epoch": 0.285963323005717, + "grad_norm": 0.11563277244567871, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 75130 + }, + { + "epoch": 0.2860013854738397, + "grad_norm": 0.11799177527427673, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 75140 + }, + { + "epoch": 0.28603944794196234, + "grad_norm": 0.13001975417137146, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 75150 + }, + { + "epoch": 0.28607751041008506, + "grad_norm": 0.13930030167102814, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 75160 + }, + { + "epoch": 0.2861155728782077, + "grad_norm": 0.12200480699539185, + "learning_rate": 0.0005, + "loss": 2.1486, + "step": 75170 + }, + { + "epoch": 0.28615363534633037, + "grad_norm": 0.1231589987874031, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 75180 + }, + { + "epoch": 0.2861916978144531, + "grad_norm": 0.1211152896285057, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 75190 + }, + { + "epoch": 0.28622976028257574, + "grad_norm": 0.1369701474905014, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 75200 + }, + { + "epoch": 0.28626782275069845, + "grad_norm": 0.1483720988035202, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 75210 + }, + { + "epoch": 0.2863058852188211, + "grad_norm": 0.1256931573152542, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 75220 + }, + { + "epoch": 0.2863439476869438, + "grad_norm": 0.12462868541479111, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 75230 + }, + { + "epoch": 0.2863820101550665, + "grad_norm": 0.12703180313110352, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 75240 + }, + { + "epoch": 0.2864200726231892, + "grad_norm": 0.127125084400177, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 75250 + }, + { + "epoch": 0.28645813509131185, + "grad_norm": 0.13014651834964752, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 75260 + }, + { + "epoch": 0.28649619755943456, + "grad_norm": 0.13283497095108032, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 75270 + }, + { + "epoch": 0.2865342600275572, + "grad_norm": 0.13949061930179596, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 75280 + }, + { + "epoch": 0.28657232249567993, + "grad_norm": 0.12150692194700241, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 75290 + }, + { + "epoch": 0.2866103849638026, + "grad_norm": 0.13123773038387299, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 75300 + }, + { + "epoch": 0.2866484474319253, + "grad_norm": 0.1253703385591507, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 75310 + }, + { + "epoch": 0.28668650990004796, + "grad_norm": 0.12628312408924103, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 75320 + }, + { + "epoch": 0.2867245723681706, + "grad_norm": 0.11620379239320755, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 75330 + }, + { + "epoch": 0.28676263483629333, + "grad_norm": 0.12156597524881363, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 75340 + }, + { + "epoch": 0.286800697304416, + "grad_norm": 0.1484280377626419, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 75350 + }, + { + "epoch": 0.2868387597725387, + "grad_norm": 0.14153222739696503, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 75360 + }, + { + "epoch": 0.28687682224066136, + "grad_norm": 0.1369723528623581, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 75370 + }, + { + "epoch": 0.28691488470878407, + "grad_norm": 0.12162962555885315, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 75380 + }, + { + "epoch": 0.2869529471769067, + "grad_norm": 0.12366687506437302, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 75390 + }, + { + "epoch": 0.28699100964502944, + "grad_norm": 0.11675120145082474, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 75400 + }, + { + "epoch": 0.2870290721131521, + "grad_norm": 0.12342153489589691, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 75410 + }, + { + "epoch": 0.2870671345812748, + "grad_norm": 0.14156018197536469, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 75420 + }, + { + "epoch": 0.28710519704939746, + "grad_norm": 0.135422945022583, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 75430 + }, + { + "epoch": 0.2871432595175202, + "grad_norm": 0.13034862279891968, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 75440 + }, + { + "epoch": 0.28718132198564283, + "grad_norm": 0.12067141383886337, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 75450 + }, + { + "epoch": 0.28721938445376555, + "grad_norm": 0.14023612439632416, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 75460 + }, + { + "epoch": 0.2872574469218882, + "grad_norm": 0.1173662468791008, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 75470 + }, + { + "epoch": 0.28729550939001086, + "grad_norm": 0.13125747442245483, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 75480 + }, + { + "epoch": 0.2873335718581336, + "grad_norm": 0.1212615892291069, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 75490 + }, + { + "epoch": 0.28737163432625623, + "grad_norm": 0.12185412645339966, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 75500 + }, + { + "epoch": 0.28740969679437894, + "grad_norm": 0.11994576454162598, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 75510 + }, + { + "epoch": 0.2874477592625016, + "grad_norm": 0.12515906989574432, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 75520 + }, + { + "epoch": 0.2874858217306243, + "grad_norm": 0.12057694792747498, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 75530 + }, + { + "epoch": 0.28752388419874697, + "grad_norm": 0.12256751954555511, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 75540 + }, + { + "epoch": 0.2875619466668697, + "grad_norm": 0.12268295139074326, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 75550 + }, + { + "epoch": 0.28760000913499234, + "grad_norm": 0.1268979161977768, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 75560 + }, + { + "epoch": 0.28763807160311505, + "grad_norm": 0.1251557618379593, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 75570 + }, + { + "epoch": 0.2876761340712377, + "grad_norm": 0.12928272783756256, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 75580 + }, + { + "epoch": 0.2877141965393604, + "grad_norm": 0.11583087593317032, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 75590 + }, + { + "epoch": 0.2877522590074831, + "grad_norm": 0.12400549650192261, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 75600 + }, + { + "epoch": 0.28779032147560574, + "grad_norm": 0.13491888344287872, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 75610 + }, + { + "epoch": 0.28782838394372845, + "grad_norm": 0.1373259276151657, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 75620 + }, + { + "epoch": 0.2878664464118511, + "grad_norm": 0.1513000875711441, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 75630 + }, + { + "epoch": 0.2879045088799738, + "grad_norm": 0.13211409747600555, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 75640 + }, + { + "epoch": 0.2879425713480965, + "grad_norm": 0.11001871526241302, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 75650 + }, + { + "epoch": 0.2879806338162192, + "grad_norm": 0.12871001660823822, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 75660 + }, + { + "epoch": 0.28801869628434185, + "grad_norm": 0.11630739271640778, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 75670 + }, + { + "epoch": 0.28805675875246456, + "grad_norm": 0.12869928777217865, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 75680 + }, + { + "epoch": 0.2880948212205872, + "grad_norm": 0.1421797275543213, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 75690 + }, + { + "epoch": 0.28813288368870993, + "grad_norm": 0.14080357551574707, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 75700 + }, + { + "epoch": 0.2881709461568326, + "grad_norm": 0.12262321263551712, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 75710 + }, + { + "epoch": 0.2882090086249553, + "grad_norm": 0.12388603389263153, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 75720 + }, + { + "epoch": 0.28824707109307796, + "grad_norm": 0.1366262435913086, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 75730 + }, + { + "epoch": 0.28828513356120067, + "grad_norm": 0.11656782776117325, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 75740 + }, + { + "epoch": 0.2883231960293233, + "grad_norm": 0.12055030465126038, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 75750 + }, + { + "epoch": 0.288361258497446, + "grad_norm": 0.12981313467025757, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 75760 + }, + { + "epoch": 0.2883993209655687, + "grad_norm": 0.12313870340585709, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 75770 + }, + { + "epoch": 0.28843738343369135, + "grad_norm": 0.13794687390327454, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 75780 + }, + { + "epoch": 0.28847544590181406, + "grad_norm": 0.12309377640485764, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 75790 + }, + { + "epoch": 0.2885135083699367, + "grad_norm": 0.13513195514678955, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 75800 + }, + { + "epoch": 0.28855157083805943, + "grad_norm": 0.1164679303765297, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 75810 + }, + { + "epoch": 0.2885896333061821, + "grad_norm": 0.1197584792971611, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 75820 + }, + { + "epoch": 0.2886276957743048, + "grad_norm": 0.10965582728385925, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 75830 + }, + { + "epoch": 0.28866575824242746, + "grad_norm": 0.1275942176580429, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 75840 + }, + { + "epoch": 0.2887038207105502, + "grad_norm": 0.11930042505264282, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 75850 + }, + { + "epoch": 0.28874188317867283, + "grad_norm": 0.13442851603031158, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 75860 + }, + { + "epoch": 0.28877994564679554, + "grad_norm": 0.1342979371547699, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 75870 + }, + { + "epoch": 0.2888180081149182, + "grad_norm": 0.12899208068847656, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 75880 + }, + { + "epoch": 0.2888560705830409, + "grad_norm": 0.1196591928601265, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 75890 + }, + { + "epoch": 0.28889413305116357, + "grad_norm": 0.12251043319702148, + "learning_rate": 0.0005, + "loss": 2.1488, + "step": 75900 + }, + { + "epoch": 0.28893219551928623, + "grad_norm": 0.12906859815120697, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 75910 + }, + { + "epoch": 0.28897025798740894, + "grad_norm": 0.12498313933610916, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 75920 + }, + { + "epoch": 0.2890083204555316, + "grad_norm": 0.13862529397010803, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 75930 + }, + { + "epoch": 0.2890463829236543, + "grad_norm": 0.12166710942983627, + "learning_rate": 0.0005, + "loss": 2.1398, + "step": 75940 + }, + { + "epoch": 0.28908444539177697, + "grad_norm": 0.11352758854627609, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 75950 + }, + { + "epoch": 0.2891225078598997, + "grad_norm": 0.1291627287864685, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 75960 + }, + { + "epoch": 0.28916057032802234, + "grad_norm": 0.1307760775089264, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 75970 + }, + { + "epoch": 0.28919863279614505, + "grad_norm": 0.13838613033294678, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 75980 + }, + { + "epoch": 0.2892366952642677, + "grad_norm": 0.1261829286813736, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 75990 + }, + { + "epoch": 0.2892747577323904, + "grad_norm": 0.12509678304195404, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 76000 + }, + { + "epoch": 0.2893128202005131, + "grad_norm": 0.12762987613677979, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 76010 + }, + { + "epoch": 0.2893508826686358, + "grad_norm": 0.14674775302410126, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 76020 + }, + { + "epoch": 0.28938894513675845, + "grad_norm": 0.12226782739162445, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 76030 + }, + { + "epoch": 0.2894270076048811, + "grad_norm": 0.13139107823371887, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 76040 + }, + { + "epoch": 0.2894650700730038, + "grad_norm": 0.12519071996212006, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 76050 + }, + { + "epoch": 0.2895031325411265, + "grad_norm": 0.12543900310993195, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 76060 + }, + { + "epoch": 0.2895411950092492, + "grad_norm": 0.14839930832386017, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 76070 + }, + { + "epoch": 0.28957925747737184, + "grad_norm": 0.12887993454933167, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 76080 + }, + { + "epoch": 0.28961731994549456, + "grad_norm": 0.12014911323785782, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 76090 + }, + { + "epoch": 0.2896553824136172, + "grad_norm": 0.1231941282749176, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 76100 + }, + { + "epoch": 0.2896934448817399, + "grad_norm": 0.14408724009990692, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 76110 + }, + { + "epoch": 0.2897315073498626, + "grad_norm": 0.12259387224912643, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 76120 + }, + { + "epoch": 0.2897695698179853, + "grad_norm": 0.13093988597393036, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 76130 + }, + { + "epoch": 0.28980763228610795, + "grad_norm": 0.12523812055587769, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 76140 + }, + { + "epoch": 0.28984569475423066, + "grad_norm": 0.13562336564064026, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 76150 + }, + { + "epoch": 0.2898837572223533, + "grad_norm": 0.12516431510448456, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 76160 + }, + { + "epoch": 0.28992181969047603, + "grad_norm": 0.12162970751523972, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 76170 + }, + { + "epoch": 0.2899598821585987, + "grad_norm": 0.12627138197422028, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 76180 + }, + { + "epoch": 0.28999794462672135, + "grad_norm": 0.13824544847011566, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 76190 + }, + { + "epoch": 0.29003600709484406, + "grad_norm": 0.12304244190454483, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 76200 + }, + { + "epoch": 0.2900740695629667, + "grad_norm": 0.1312766671180725, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 76210 + }, + { + "epoch": 0.29011213203108943, + "grad_norm": 0.13339751958847046, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 76220 + }, + { + "epoch": 0.2901501944992121, + "grad_norm": 0.1529046893119812, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 76230 + }, + { + "epoch": 0.2901882569673348, + "grad_norm": 0.13795912265777588, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 76240 + }, + { + "epoch": 0.29022631943545746, + "grad_norm": 0.12184420228004456, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 76250 + }, + { + "epoch": 0.29026438190358017, + "grad_norm": 0.12675736844539642, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 76260 + }, + { + "epoch": 0.29030244437170283, + "grad_norm": 0.12017787992954254, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 76270 + }, + { + "epoch": 0.29034050683982554, + "grad_norm": 0.12479761242866516, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 76280 + }, + { + "epoch": 0.2903785693079482, + "grad_norm": 0.1499720960855484, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 76290 + }, + { + "epoch": 0.2904166317760709, + "grad_norm": 0.12147608399391174, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 76300 + }, + { + "epoch": 0.29045469424419357, + "grad_norm": 0.12691138684749603, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 76310 + }, + { + "epoch": 0.2904927567123163, + "grad_norm": 0.14038929343223572, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 76320 + }, + { + "epoch": 0.29053081918043894, + "grad_norm": 0.12324102967977524, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 76330 + }, + { + "epoch": 0.2905688816485616, + "grad_norm": 0.1249835193157196, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 76340 + }, + { + "epoch": 0.2906069441166843, + "grad_norm": 0.12767577171325684, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 76350 + }, + { + "epoch": 0.29064500658480696, + "grad_norm": 0.11480826884508133, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 76360 + }, + { + "epoch": 0.2906830690529297, + "grad_norm": 0.11960236728191376, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 76370 + }, + { + "epoch": 0.29072113152105233, + "grad_norm": 0.1302240490913391, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 76380 + }, + { + "epoch": 0.29075919398917505, + "grad_norm": 0.14519484341144562, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 76390 + }, + { + "epoch": 0.2907972564572977, + "grad_norm": 0.1436435431241989, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 76400 + }, + { + "epoch": 0.2908353189254204, + "grad_norm": 0.11903155595064163, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 76410 + }, + { + "epoch": 0.2908733813935431, + "grad_norm": 0.1267947554588318, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 76420 + }, + { + "epoch": 0.2909114438616658, + "grad_norm": 0.12625740468502045, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 76430 + }, + { + "epoch": 0.29094950632978844, + "grad_norm": 0.11667564511299133, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 76440 + }, + { + "epoch": 0.29098756879791116, + "grad_norm": 0.12369339913129807, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 76450 + }, + { + "epoch": 0.2910256312660338, + "grad_norm": 0.12127465754747391, + "learning_rate": 0.0005, + "loss": 2.1514, + "step": 76460 + }, + { + "epoch": 0.29106369373415647, + "grad_norm": 0.12116898596286774, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 76470 + }, + { + "epoch": 0.2911017562022792, + "grad_norm": 0.11794110387563705, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 76480 + }, + { + "epoch": 0.29113981867040184, + "grad_norm": 0.12648813426494598, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 76490 + }, + { + "epoch": 0.29117788113852455, + "grad_norm": 0.1190585196018219, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 76500 + }, + { + "epoch": 0.2912159436066472, + "grad_norm": 0.13424323499202728, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 76510 + }, + { + "epoch": 0.2912540060747699, + "grad_norm": 0.12387403100728989, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 76520 + }, + { + "epoch": 0.2912920685428926, + "grad_norm": 0.13513913750648499, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 76530 + }, + { + "epoch": 0.2913301310110153, + "grad_norm": 0.12977351248264313, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 76540 + }, + { + "epoch": 0.29136819347913795, + "grad_norm": 0.12603046000003815, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 76550 + }, + { + "epoch": 0.29140625594726066, + "grad_norm": 0.1314680576324463, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 76560 + }, + { + "epoch": 0.2914443184153833, + "grad_norm": 0.1237976998090744, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 76570 + }, + { + "epoch": 0.29148238088350603, + "grad_norm": 0.1232212632894516, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 76580 + }, + { + "epoch": 0.2915204433516287, + "grad_norm": 0.1358116716146469, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 76590 + }, + { + "epoch": 0.2915585058197514, + "grad_norm": 0.1278366893529892, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 76600 + }, + { + "epoch": 0.29159656828787406, + "grad_norm": 0.12315915524959564, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 76610 + }, + { + "epoch": 0.2916346307559967, + "grad_norm": 0.13123762607574463, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 76620 + }, + { + "epoch": 0.29167269322411943, + "grad_norm": 0.12064166367053986, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 76630 + }, + { + "epoch": 0.2917107556922421, + "grad_norm": 0.13507580757141113, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 76640 + }, + { + "epoch": 0.2917488181603648, + "grad_norm": 0.11641356348991394, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 76650 + }, + { + "epoch": 0.29178688062848745, + "grad_norm": 0.14286088943481445, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 76660 + }, + { + "epoch": 0.29182494309661017, + "grad_norm": 0.12444773316383362, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 76670 + }, + { + "epoch": 0.2918630055647328, + "grad_norm": 0.12737885117530823, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 76680 + }, + { + "epoch": 0.29190106803285554, + "grad_norm": 0.1293538510799408, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 76690 + }, + { + "epoch": 0.2919391305009782, + "grad_norm": 0.11138560622930527, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 76700 + }, + { + "epoch": 0.2919771929691009, + "grad_norm": 0.12534061074256897, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 76710 + }, + { + "epoch": 0.29201525543722356, + "grad_norm": 0.12235898524522781, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 76720 + }, + { + "epoch": 0.2920533179053463, + "grad_norm": 0.12848271429538727, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 76730 + }, + { + "epoch": 0.29209138037346893, + "grad_norm": 0.14083117246627808, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 76740 + }, + { + "epoch": 0.29212944284159165, + "grad_norm": 0.12196497619152069, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 76750 + }, + { + "epoch": 0.2921675053097143, + "grad_norm": 0.14936399459838867, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 76760 + }, + { + "epoch": 0.29220556777783696, + "grad_norm": 0.1394042670726776, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 76770 + }, + { + "epoch": 0.2922436302459597, + "grad_norm": 0.13346344232559204, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 76780 + }, + { + "epoch": 0.29228169271408233, + "grad_norm": 0.13088877499103546, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 76790 + }, + { + "epoch": 0.29231975518220504, + "grad_norm": 0.12399112433195114, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 76800 + }, + { + "epoch": 0.2923578176503277, + "grad_norm": 0.12089783698320389, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 76810 + }, + { + "epoch": 0.2923958801184504, + "grad_norm": 0.12687864899635315, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 76820 + }, + { + "epoch": 0.29243394258657307, + "grad_norm": 0.12511956691741943, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 76830 + }, + { + "epoch": 0.2924720050546958, + "grad_norm": 0.11711279302835464, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 76840 + }, + { + "epoch": 0.29251006752281844, + "grad_norm": 0.12532839179039001, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 76850 + }, + { + "epoch": 0.29254812999094115, + "grad_norm": 0.12755703926086426, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 76860 + }, + { + "epoch": 0.2925861924590638, + "grad_norm": 0.1276831328868866, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 76870 + }, + { + "epoch": 0.2926242549271865, + "grad_norm": 0.12448505312204361, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 76880 + }, + { + "epoch": 0.2926623173953092, + "grad_norm": 0.13176031410694122, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 76890 + }, + { + "epoch": 0.2927003798634319, + "grad_norm": 0.1303681880235672, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 76900 + }, + { + "epoch": 0.29273844233155455, + "grad_norm": 0.12910409271717072, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 76910 + }, + { + "epoch": 0.2927765047996772, + "grad_norm": 0.14085090160369873, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 76920 + }, + { + "epoch": 0.2928145672677999, + "grad_norm": 0.1368868201971054, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 76930 + }, + { + "epoch": 0.2928526297359226, + "grad_norm": 0.14073766767978668, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 76940 + }, + { + "epoch": 0.2928906922040453, + "grad_norm": 0.11861219257116318, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 76950 + }, + { + "epoch": 0.29292875467216795, + "grad_norm": 0.12039192020893097, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 76960 + }, + { + "epoch": 0.29296681714029066, + "grad_norm": 0.11999056488275528, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 76970 + }, + { + "epoch": 0.2930048796084133, + "grad_norm": 0.13445483148097992, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 76980 + }, + { + "epoch": 0.29304294207653603, + "grad_norm": 0.1250647008419037, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 76990 + }, + { + "epoch": 0.2930810045446587, + "grad_norm": 0.11895033717155457, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 77000 + }, + { + "epoch": 0.2931190670127814, + "grad_norm": 0.13068489730358124, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 77010 + }, + { + "epoch": 0.29315712948090406, + "grad_norm": 0.12797445058822632, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 77020 + }, + { + "epoch": 0.29319519194902677, + "grad_norm": 0.13003671169281006, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 77030 + }, + { + "epoch": 0.2932332544171494, + "grad_norm": 0.1370018720626831, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 77040 + }, + { + "epoch": 0.2932713168852721, + "grad_norm": 0.13659188151359558, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 77050 + }, + { + "epoch": 0.2933093793533948, + "grad_norm": 0.1316078156232834, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 77060 + }, + { + "epoch": 0.29334744182151745, + "grad_norm": 0.12242893129587173, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 77070 + }, + { + "epoch": 0.29338550428964016, + "grad_norm": 0.14071308076381683, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 77080 + }, + { + "epoch": 0.2934235667577628, + "grad_norm": 0.11522958427667618, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 77090 + }, + { + "epoch": 0.29346162922588553, + "grad_norm": 0.1277889758348465, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 77100 + }, + { + "epoch": 0.2934996916940082, + "grad_norm": 0.12273859977722168, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 77110 + }, + { + "epoch": 0.2935377541621309, + "grad_norm": 0.13316503167152405, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 77120 + }, + { + "epoch": 0.29357581663025356, + "grad_norm": 0.11983367800712585, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 77130 + }, + { + "epoch": 0.2936138790983763, + "grad_norm": 0.11889369785785675, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 77140 + }, + { + "epoch": 0.29365194156649893, + "grad_norm": 0.1265701949596405, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 77150 + }, + { + "epoch": 0.29369000403462164, + "grad_norm": 0.13863658905029297, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 77160 + }, + { + "epoch": 0.2937280665027443, + "grad_norm": 0.14393354952335358, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 77170 + }, + { + "epoch": 0.293766128970867, + "grad_norm": 0.12794476747512817, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 77180 + }, + { + "epoch": 0.29380419143898967, + "grad_norm": 0.12353495508432388, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 77190 + }, + { + "epoch": 0.2938422539071123, + "grad_norm": 0.12665621936321259, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 77200 + }, + { + "epoch": 0.29388031637523504, + "grad_norm": 0.12271159887313843, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 77210 + }, + { + "epoch": 0.2939183788433577, + "grad_norm": 0.14436013996601105, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 77220 + }, + { + "epoch": 0.2939564413114804, + "grad_norm": 0.12181149423122406, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 77230 + }, + { + "epoch": 0.29399450377960307, + "grad_norm": 0.13174769282341003, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 77240 + }, + { + "epoch": 0.2940325662477258, + "grad_norm": 0.1304592788219452, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 77250 + }, + { + "epoch": 0.29407062871584844, + "grad_norm": 0.13582275807857513, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 77260 + }, + { + "epoch": 0.29410869118397115, + "grad_norm": 0.12943530082702637, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 77270 + }, + { + "epoch": 0.2941467536520938, + "grad_norm": 0.12374609708786011, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 77280 + }, + { + "epoch": 0.2941848161202165, + "grad_norm": 0.12450854480266571, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 77290 + }, + { + "epoch": 0.2942228785883392, + "grad_norm": 0.11895159631967545, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 77300 + }, + { + "epoch": 0.2942609410564619, + "grad_norm": 0.12057216465473175, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 77310 + }, + { + "epoch": 0.29429900352458455, + "grad_norm": 0.11752424389123917, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 77320 + }, + { + "epoch": 0.29433706599270726, + "grad_norm": 0.1295253336429596, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 77330 + }, + { + "epoch": 0.2943751284608299, + "grad_norm": 0.1286362111568451, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 77340 + }, + { + "epoch": 0.2944131909289526, + "grad_norm": 0.11909947544336319, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 77350 + }, + { + "epoch": 0.2944512533970753, + "grad_norm": 0.13250653445720673, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 77360 + }, + { + "epoch": 0.29448931586519794, + "grad_norm": 0.1223980188369751, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 77370 + }, + { + "epoch": 0.29452737833332066, + "grad_norm": 0.12595222890377045, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 77380 + }, + { + "epoch": 0.2945654408014433, + "grad_norm": 0.1286715269088745, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 77390 + }, + { + "epoch": 0.294603503269566, + "grad_norm": 0.13232561945915222, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 77400 + }, + { + "epoch": 0.2946415657376887, + "grad_norm": 0.1344754844903946, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 77410 + }, + { + "epoch": 0.2946796282058114, + "grad_norm": 0.12267153710126877, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 77420 + }, + { + "epoch": 0.29471769067393405, + "grad_norm": 0.11879909783601761, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 77430 + }, + { + "epoch": 0.29475575314205676, + "grad_norm": 0.1467399001121521, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 77440 + }, + { + "epoch": 0.2947938156101794, + "grad_norm": 0.12790904939174652, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 77450 + }, + { + "epoch": 0.29483187807830213, + "grad_norm": 0.12110629677772522, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 77460 + }, + { + "epoch": 0.2948699405464248, + "grad_norm": 0.11823920905590057, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 77470 + }, + { + "epoch": 0.29490800301454745, + "grad_norm": 0.11439647525548935, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 77480 + }, + { + "epoch": 0.29494606548267016, + "grad_norm": 0.13713325560092926, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 77490 + }, + { + "epoch": 0.2949841279507928, + "grad_norm": 0.1307079792022705, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 77500 + }, + { + "epoch": 0.29502219041891553, + "grad_norm": 0.12214863300323486, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 77510 + }, + { + "epoch": 0.2950602528870382, + "grad_norm": 0.12139144539833069, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 77520 + }, + { + "epoch": 0.2950983153551609, + "grad_norm": 0.13657598197460175, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 77530 + }, + { + "epoch": 0.29513637782328356, + "grad_norm": 0.12807506322860718, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 77540 + }, + { + "epoch": 0.29517444029140627, + "grad_norm": 0.13245591521263123, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 77550 + }, + { + "epoch": 0.2952125027595289, + "grad_norm": 0.12371648848056793, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 77560 + }, + { + "epoch": 0.29525056522765164, + "grad_norm": 0.11524897068738937, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 77570 + }, + { + "epoch": 0.2952886276957743, + "grad_norm": 0.11479327082633972, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 77580 + }, + { + "epoch": 0.295326690163897, + "grad_norm": 0.14055931568145752, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 77590 + }, + { + "epoch": 0.29536475263201967, + "grad_norm": 0.11579246819019318, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 77600 + }, + { + "epoch": 0.2954028151001424, + "grad_norm": 0.12510639429092407, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 77610 + }, + { + "epoch": 0.29544087756826504, + "grad_norm": 0.15101352334022522, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 77620 + }, + { + "epoch": 0.2954789400363877, + "grad_norm": 0.1322011798620224, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 77630 + }, + { + "epoch": 0.2955170025045104, + "grad_norm": 0.13096703588962555, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 77640 + }, + { + "epoch": 0.29555506497263306, + "grad_norm": 0.13723981380462646, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 77650 + }, + { + "epoch": 0.2955931274407558, + "grad_norm": 0.14213553071022034, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 77660 + }, + { + "epoch": 0.29563118990887843, + "grad_norm": 0.14133940637111664, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 77670 + }, + { + "epoch": 0.29566925237700115, + "grad_norm": 0.1276482492685318, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 77680 + }, + { + "epoch": 0.2957073148451238, + "grad_norm": 0.12077028304338455, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 77690 + }, + { + "epoch": 0.2957453773132465, + "grad_norm": 0.1180778294801712, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 77700 + }, + { + "epoch": 0.2957834397813692, + "grad_norm": 0.12569206953048706, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 77710 + }, + { + "epoch": 0.2958215022494919, + "grad_norm": 0.14068447053432465, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 77720 + }, + { + "epoch": 0.29585956471761454, + "grad_norm": 0.12761713564395905, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 77730 + }, + { + "epoch": 0.29589762718573726, + "grad_norm": 0.1189335286617279, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 77740 + }, + { + "epoch": 0.2959356896538599, + "grad_norm": 0.11740929633378983, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 77750 + }, + { + "epoch": 0.2959737521219826, + "grad_norm": 0.12548650801181793, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 77760 + }, + { + "epoch": 0.2960118145901053, + "grad_norm": 0.13285624980926514, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 77770 + }, + { + "epoch": 0.29604987705822794, + "grad_norm": 0.12186500430107117, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 77780 + }, + { + "epoch": 0.29608793952635065, + "grad_norm": 0.14229805767536163, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 77790 + }, + { + "epoch": 0.2961260019944733, + "grad_norm": 0.12769339978694916, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 77800 + }, + { + "epoch": 0.296164064462596, + "grad_norm": 0.12053908407688141, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 77810 + }, + { + "epoch": 0.2962021269307187, + "grad_norm": 0.12113712728023529, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 77820 + }, + { + "epoch": 0.2962401893988414, + "grad_norm": 0.12331096082925797, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 77830 + }, + { + "epoch": 0.29627825186696405, + "grad_norm": 0.12478657066822052, + "learning_rate": 0.0005, + "loss": 2.1356, + "step": 77840 + }, + { + "epoch": 0.29631631433508676, + "grad_norm": 0.14286023378372192, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 77850 + }, + { + "epoch": 0.2963543768032094, + "grad_norm": 0.11814841628074646, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 77860 + }, + { + "epoch": 0.29639243927133213, + "grad_norm": 0.12050684541463852, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 77870 + }, + { + "epoch": 0.2964305017394548, + "grad_norm": 0.2006334811449051, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 77880 + }, + { + "epoch": 0.2964685642075775, + "grad_norm": 0.1282692402601242, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 77890 + }, + { + "epoch": 0.29650662667570016, + "grad_norm": 0.11931289732456207, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 77900 + }, + { + "epoch": 0.2965446891438228, + "grad_norm": 0.12939167022705078, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 77910 + }, + { + "epoch": 0.2965827516119455, + "grad_norm": 0.1268099993467331, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 77920 + }, + { + "epoch": 0.2966208140800682, + "grad_norm": 0.13123664259910583, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 77930 + }, + { + "epoch": 0.2966588765481909, + "grad_norm": 0.12263063341379166, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 77940 + }, + { + "epoch": 0.29669693901631355, + "grad_norm": 0.12018568813800812, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 77950 + }, + { + "epoch": 0.29673500148443627, + "grad_norm": 0.11666128784418106, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 77960 + }, + { + "epoch": 0.2967730639525589, + "grad_norm": 0.12188054621219635, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 77970 + }, + { + "epoch": 0.29681112642068164, + "grad_norm": 0.12619496881961823, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 77980 + }, + { + "epoch": 0.2968491888888043, + "grad_norm": 0.13010945916175842, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 77990 + }, + { + "epoch": 0.296887251356927, + "grad_norm": 0.1309840977191925, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 78000 + }, + { + "epoch": 0.29692531382504966, + "grad_norm": 0.11490017920732498, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 78010 + }, + { + "epoch": 0.2969633762931724, + "grad_norm": 0.1267571896314621, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 78020 + }, + { + "epoch": 0.29700143876129503, + "grad_norm": 0.12055765092372894, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 78030 + }, + { + "epoch": 0.29703950122941775, + "grad_norm": 0.13392072916030884, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 78040 + }, + { + "epoch": 0.2970775636975404, + "grad_norm": 0.11891845613718033, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 78050 + }, + { + "epoch": 0.29711562616566306, + "grad_norm": 0.12135432660579681, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 78060 + }, + { + "epoch": 0.2971536886337858, + "grad_norm": 0.12567847967147827, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 78070 + }, + { + "epoch": 0.29719175110190843, + "grad_norm": 0.12707142531871796, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 78080 + }, + { + "epoch": 0.29722981357003114, + "grad_norm": 0.11984868347644806, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 78090 + }, + { + "epoch": 0.2972678760381538, + "grad_norm": 0.11494628340005875, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 78100 + }, + { + "epoch": 0.2973059385062765, + "grad_norm": 0.11759883165359497, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 78110 + }, + { + "epoch": 0.29734400097439917, + "grad_norm": 0.1299513429403305, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 78120 + }, + { + "epoch": 0.2973820634425219, + "grad_norm": 0.1307305544614792, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 78130 + }, + { + "epoch": 0.29742012591064454, + "grad_norm": 0.12624263763427734, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 78140 + }, + { + "epoch": 0.29745818837876725, + "grad_norm": 0.1459318846464157, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 78150 + }, + { + "epoch": 0.2974962508468899, + "grad_norm": 0.1276773065328598, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 78160 + }, + { + "epoch": 0.2975343133150126, + "grad_norm": 0.12030398100614548, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 78170 + }, + { + "epoch": 0.2975723757831353, + "grad_norm": 0.12564341723918915, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 78180 + }, + { + "epoch": 0.297610438251258, + "grad_norm": 0.12345995008945465, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 78190 + }, + { + "epoch": 0.29764850071938065, + "grad_norm": 0.1233118325471878, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 78200 + }, + { + "epoch": 0.2976865631875033, + "grad_norm": 0.1251271665096283, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 78210 + }, + { + "epoch": 0.297724625655626, + "grad_norm": 0.11773156374692917, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 78220 + }, + { + "epoch": 0.2977626881237487, + "grad_norm": 0.1261213719844818, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 78230 + }, + { + "epoch": 0.2978007505918714, + "grad_norm": 0.12224511802196503, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 78240 + }, + { + "epoch": 0.29783881305999405, + "grad_norm": 0.12229447066783905, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 78250 + }, + { + "epoch": 0.29787687552811676, + "grad_norm": 0.13535583019256592, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 78260 + }, + { + "epoch": 0.2979149379962394, + "grad_norm": 0.120675228536129, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 78270 + }, + { + "epoch": 0.2979530004643621, + "grad_norm": 0.13022558391094208, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 78280 + }, + { + "epoch": 0.2979910629324848, + "grad_norm": 0.12307160347700119, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 78290 + }, + { + "epoch": 0.2980291254006075, + "grad_norm": 0.1300327628850937, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 78300 + }, + { + "epoch": 0.29806718786873015, + "grad_norm": 0.11864542961120605, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 78310 + }, + { + "epoch": 0.29810525033685287, + "grad_norm": 0.13210463523864746, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 78320 + }, + { + "epoch": 0.2981433128049755, + "grad_norm": 0.11801157891750336, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 78330 + }, + { + "epoch": 0.2981813752730982, + "grad_norm": 0.12514695525169373, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 78340 + }, + { + "epoch": 0.2982194377412209, + "grad_norm": 0.12310615181922913, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 78350 + }, + { + "epoch": 0.29825750020934355, + "grad_norm": 0.13028378784656525, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 78360 + }, + { + "epoch": 0.29829556267746626, + "grad_norm": 0.1156439259648323, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 78370 + }, + { + "epoch": 0.2983336251455889, + "grad_norm": 0.12388564646244049, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 78380 + }, + { + "epoch": 0.29837168761371163, + "grad_norm": 0.13688057661056519, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 78390 + }, + { + "epoch": 0.2984097500818343, + "grad_norm": 0.13023453950881958, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 78400 + }, + { + "epoch": 0.298447812549957, + "grad_norm": 0.12264353781938553, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 78410 + }, + { + "epoch": 0.29848587501807966, + "grad_norm": 0.14520218968391418, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 78420 + }, + { + "epoch": 0.2985239374862024, + "grad_norm": 0.11594079434871674, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 78430 + }, + { + "epoch": 0.29856199995432503, + "grad_norm": 0.12649290263652802, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 78440 + }, + { + "epoch": 0.29860006242244774, + "grad_norm": 0.11950668692588806, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 78450 + }, + { + "epoch": 0.2986381248905704, + "grad_norm": 0.1311403214931488, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 78460 + }, + { + "epoch": 0.2986761873586931, + "grad_norm": 0.14947395026683807, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 78470 + }, + { + "epoch": 0.29871424982681577, + "grad_norm": 0.13090112805366516, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 78480 + }, + { + "epoch": 0.2987523122949384, + "grad_norm": 0.11776856333017349, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 78490 + }, + { + "epoch": 0.29879037476306114, + "grad_norm": 0.16615436971187592, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 78500 + }, + { + "epoch": 0.2988284372311838, + "grad_norm": 0.121536485850811, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 78510 + }, + { + "epoch": 0.2988664996993065, + "grad_norm": 0.12763504683971405, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 78520 + }, + { + "epoch": 0.29890456216742917, + "grad_norm": 0.13472750782966614, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 78530 + }, + { + "epoch": 0.2989426246355519, + "grad_norm": 0.12608040869235992, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 78540 + }, + { + "epoch": 0.29898068710367454, + "grad_norm": 0.12341855466365814, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 78550 + }, + { + "epoch": 0.29901874957179725, + "grad_norm": 0.1237914189696312, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 78560 + }, + { + "epoch": 0.2990568120399199, + "grad_norm": 0.1223236694931984, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 78570 + }, + { + "epoch": 0.2990948745080426, + "grad_norm": 0.12150850892066956, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 78580 + }, + { + "epoch": 0.2991329369761653, + "grad_norm": 0.11814062297344208, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 78590 + }, + { + "epoch": 0.299170999444288, + "grad_norm": 0.1291181743144989, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 78600 + }, + { + "epoch": 0.29920906191241065, + "grad_norm": 0.1198849007487297, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 78610 + }, + { + "epoch": 0.29924712438053336, + "grad_norm": 0.12144499272108078, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 78620 + }, + { + "epoch": 0.299285186848656, + "grad_norm": 0.12497196346521378, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 78630 + }, + { + "epoch": 0.2993232493167787, + "grad_norm": 0.12080928683280945, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 78640 + }, + { + "epoch": 0.2993613117849014, + "grad_norm": 0.14563171565532684, + "learning_rate": 0.0005, + "loss": 2.1541, + "step": 78650 + }, + { + "epoch": 0.29939937425302404, + "grad_norm": 0.13387221097946167, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 78660 + }, + { + "epoch": 0.29943743672114675, + "grad_norm": 0.11875801533460617, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 78670 + }, + { + "epoch": 0.2994754991892694, + "grad_norm": 0.11633630841970444, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 78680 + }, + { + "epoch": 0.2995135616573921, + "grad_norm": 0.12366536259651184, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 78690 + }, + { + "epoch": 0.2995516241255148, + "grad_norm": 0.1260582059621811, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 78700 + }, + { + "epoch": 0.2995896865936375, + "grad_norm": 0.12684305012226105, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 78710 + }, + { + "epoch": 0.29962774906176015, + "grad_norm": 0.1218743845820427, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 78720 + }, + { + "epoch": 0.29966581152988286, + "grad_norm": 0.1152462288737297, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 78730 + }, + { + "epoch": 0.2997038739980055, + "grad_norm": 0.12806940078735352, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 78740 + }, + { + "epoch": 0.29974193646612823, + "grad_norm": 0.11596892774105072, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 78750 + }, + { + "epoch": 0.2997799989342509, + "grad_norm": 0.12253748625516891, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 78760 + }, + { + "epoch": 0.29981806140237355, + "grad_norm": 0.13022203743457794, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 78770 + }, + { + "epoch": 0.29985612387049626, + "grad_norm": 0.12577566504478455, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 78780 + }, + { + "epoch": 0.2998941863386189, + "grad_norm": 0.12384268641471863, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 78790 + }, + { + "epoch": 0.29993224880674163, + "grad_norm": 0.12241975963115692, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 78800 + }, + { + "epoch": 0.2999703112748643, + "grad_norm": 0.12391491234302521, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 78810 + }, + { + "epoch": 0.300008373742987, + "grad_norm": 0.12020351737737656, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 78820 + }, + { + "epoch": 0.30004643621110966, + "grad_norm": 0.11576558649539948, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 78830 + }, + { + "epoch": 0.30008449867923237, + "grad_norm": 0.11263838410377502, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 78840 + }, + { + "epoch": 0.300122561147355, + "grad_norm": 0.1252390593290329, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 78850 + }, + { + "epoch": 0.30016062361547774, + "grad_norm": 0.13681459426879883, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 78860 + }, + { + "epoch": 0.3001986860836004, + "grad_norm": 0.1321314573287964, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 78870 + }, + { + "epoch": 0.3002367485517231, + "grad_norm": 0.12426672130823135, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 78880 + }, + { + "epoch": 0.30027481101984577, + "grad_norm": 0.1215248554944992, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 78890 + }, + { + "epoch": 0.3003128734879685, + "grad_norm": 0.1245889812707901, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 78900 + }, + { + "epoch": 0.30035093595609114, + "grad_norm": 0.12007194012403488, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 78910 + }, + { + "epoch": 0.3003889984242138, + "grad_norm": 0.1228775754570961, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 78920 + }, + { + "epoch": 0.3004270608923365, + "grad_norm": 0.1209733709692955, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 78930 + }, + { + "epoch": 0.30046512336045916, + "grad_norm": 0.11423403024673462, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 78940 + }, + { + "epoch": 0.3005031858285819, + "grad_norm": 0.12101174890995026, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 78950 + }, + { + "epoch": 0.30054124829670453, + "grad_norm": 0.13360558450222015, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 78960 + }, + { + "epoch": 0.30057931076482725, + "grad_norm": 0.11620435863733292, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 78970 + }, + { + "epoch": 0.3006173732329499, + "grad_norm": 0.12485792487859726, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 78980 + }, + { + "epoch": 0.3006554357010726, + "grad_norm": 0.12738344073295593, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 78990 + }, + { + "epoch": 0.3006934981691953, + "grad_norm": 0.1248827874660492, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 79000 + }, + { + "epoch": 0.300731560637318, + "grad_norm": 0.1254810392856598, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 79010 + }, + { + "epoch": 0.30076962310544064, + "grad_norm": 0.13309615850448608, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 79020 + }, + { + "epoch": 0.30080768557356335, + "grad_norm": 0.11738866567611694, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 79030 + }, + { + "epoch": 0.300845748041686, + "grad_norm": 0.12218448519706726, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 79040 + }, + { + "epoch": 0.3008838105098087, + "grad_norm": 0.13277316093444824, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 79050 + }, + { + "epoch": 0.3009218729779314, + "grad_norm": 0.11647707223892212, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 79060 + }, + { + "epoch": 0.30095993544605404, + "grad_norm": 0.12075653672218323, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 79070 + }, + { + "epoch": 0.30099799791417675, + "grad_norm": 0.12462682276964188, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 79080 + }, + { + "epoch": 0.3010360603822994, + "grad_norm": 0.1321898251771927, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 79090 + }, + { + "epoch": 0.3010741228504221, + "grad_norm": 0.12736287713050842, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 79100 + }, + { + "epoch": 0.3011121853185448, + "grad_norm": 0.1307191550731659, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 79110 + }, + { + "epoch": 0.3011502477866675, + "grad_norm": 0.13263042271137238, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 79120 + }, + { + "epoch": 0.30118831025479015, + "grad_norm": 0.12208875268697739, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 79130 + }, + { + "epoch": 0.30122637272291286, + "grad_norm": 0.1463027149438858, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 79140 + }, + { + "epoch": 0.3012644351910355, + "grad_norm": 0.1252584457397461, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 79150 + }, + { + "epoch": 0.30130249765915823, + "grad_norm": 0.1358025074005127, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 79160 + }, + { + "epoch": 0.3013405601272809, + "grad_norm": 0.12210721522569656, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 79170 + }, + { + "epoch": 0.3013786225954036, + "grad_norm": 0.13368509709835052, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 79180 + }, + { + "epoch": 0.30141668506352626, + "grad_norm": 0.1288861781358719, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 79190 + }, + { + "epoch": 0.30145474753164897, + "grad_norm": 0.12868791818618774, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 79200 + }, + { + "epoch": 0.3014928099997716, + "grad_norm": 0.12193211913108826, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 79210 + }, + { + "epoch": 0.3015308724678943, + "grad_norm": 0.11928920447826385, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 79220 + }, + { + "epoch": 0.301568934936017, + "grad_norm": 0.11605922132730484, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 79230 + }, + { + "epoch": 0.30160699740413965, + "grad_norm": 0.12350272387266159, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 79240 + }, + { + "epoch": 0.30164505987226237, + "grad_norm": 0.12360288202762604, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 79250 + }, + { + "epoch": 0.301683122340385, + "grad_norm": 0.11717429757118225, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 79260 + }, + { + "epoch": 0.30172118480850774, + "grad_norm": 0.12328846752643585, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 79270 + }, + { + "epoch": 0.3017592472766304, + "grad_norm": 0.11897186934947968, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 79280 + }, + { + "epoch": 0.3017973097447531, + "grad_norm": 0.1136583536863327, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 79290 + }, + { + "epoch": 0.30183537221287576, + "grad_norm": 0.12160983681678772, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 79300 + }, + { + "epoch": 0.3018734346809985, + "grad_norm": 0.12209036201238632, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 79310 + }, + { + "epoch": 0.30191149714912113, + "grad_norm": 0.1218925416469574, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 79320 + }, + { + "epoch": 0.30194955961724385, + "grad_norm": 0.14273086190223694, + "learning_rate": 0.0005, + "loss": 2.1415, + "step": 79330 + }, + { + "epoch": 0.3019876220853665, + "grad_norm": 0.11334306746721268, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 79340 + }, + { + "epoch": 0.30202568455348916, + "grad_norm": 0.13532891869544983, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 79350 + }, + { + "epoch": 0.3020637470216119, + "grad_norm": 0.13038349151611328, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 79360 + }, + { + "epoch": 0.30210180948973453, + "grad_norm": 0.1337863951921463, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 79370 + }, + { + "epoch": 0.30213987195785724, + "grad_norm": 0.11505336314439774, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 79380 + }, + { + "epoch": 0.3021779344259799, + "grad_norm": 0.13061103224754333, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 79390 + }, + { + "epoch": 0.3022159968941026, + "grad_norm": 0.12964025139808655, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 79400 + }, + { + "epoch": 0.30225405936222527, + "grad_norm": 0.11681259423494339, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 79410 + }, + { + "epoch": 0.302292121830348, + "grad_norm": 0.11721569299697876, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 79420 + }, + { + "epoch": 0.30233018429847064, + "grad_norm": 0.12377406656742096, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 79430 + }, + { + "epoch": 0.30236824676659335, + "grad_norm": 0.13084007799625397, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 79440 + }, + { + "epoch": 0.302406309234716, + "grad_norm": 0.12417054176330566, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 79450 + }, + { + "epoch": 0.3024443717028387, + "grad_norm": 0.15026329457759857, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 79460 + }, + { + "epoch": 0.3024824341709614, + "grad_norm": 0.11741312593221664, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 79470 + }, + { + "epoch": 0.3025204966390841, + "grad_norm": 0.13819944858551025, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 79480 + }, + { + "epoch": 0.30255855910720675, + "grad_norm": 0.12461134791374207, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 79490 + }, + { + "epoch": 0.3025966215753294, + "grad_norm": 0.12379152327775955, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 79500 + }, + { + "epoch": 0.3026346840434521, + "grad_norm": 0.12998011708259583, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 79510 + }, + { + "epoch": 0.3026727465115748, + "grad_norm": 0.12944366037845612, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 79520 + }, + { + "epoch": 0.3027108089796975, + "grad_norm": 0.12776261568069458, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 79530 + }, + { + "epoch": 0.30274887144782014, + "grad_norm": 0.1301940679550171, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 79540 + }, + { + "epoch": 0.30278693391594286, + "grad_norm": 0.14208351075649261, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 79550 + }, + { + "epoch": 0.3028249963840655, + "grad_norm": 0.12232249975204468, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 79560 + }, + { + "epoch": 0.3028630588521882, + "grad_norm": 0.14237213134765625, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 79570 + }, + { + "epoch": 0.3029011213203109, + "grad_norm": 0.12116596847772598, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 79580 + }, + { + "epoch": 0.3029391837884336, + "grad_norm": 0.14186422526836395, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 79590 + }, + { + "epoch": 0.30297724625655625, + "grad_norm": 0.13023512065410614, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 79600 + }, + { + "epoch": 0.30301530872467897, + "grad_norm": 0.12457878142595291, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 79610 + }, + { + "epoch": 0.3030533711928016, + "grad_norm": 0.13523663580417633, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 79620 + }, + { + "epoch": 0.30309143366092434, + "grad_norm": 0.12412901967763901, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 79630 + }, + { + "epoch": 0.303129496129047, + "grad_norm": 0.15414534509181976, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 79640 + }, + { + "epoch": 0.30316755859716965, + "grad_norm": 0.12909327447414398, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 79650 + }, + { + "epoch": 0.30320562106529236, + "grad_norm": 0.11341016739606857, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 79660 + }, + { + "epoch": 0.303243683533415, + "grad_norm": 0.1284220963716507, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 79670 + }, + { + "epoch": 0.30328174600153773, + "grad_norm": 0.12455248087644577, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 79680 + }, + { + "epoch": 0.3033198084696604, + "grad_norm": 0.13200350105762482, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 79690 + }, + { + "epoch": 0.3033578709377831, + "grad_norm": 0.1278725415468216, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 79700 + }, + { + "epoch": 0.30339593340590576, + "grad_norm": 0.12103582173585892, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 79710 + }, + { + "epoch": 0.3034339958740285, + "grad_norm": 0.13383394479751587, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 79720 + }, + { + "epoch": 0.30347205834215113, + "grad_norm": 0.14621300995349884, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 79730 + }, + { + "epoch": 0.30351012081027384, + "grad_norm": 0.12872327864170074, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 79740 + }, + { + "epoch": 0.3035481832783965, + "grad_norm": 0.1543866991996765, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 79750 + }, + { + "epoch": 0.3035862457465192, + "grad_norm": 0.1281316876411438, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 79760 + }, + { + "epoch": 0.30362430821464187, + "grad_norm": 0.13811922073364258, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 79770 + }, + { + "epoch": 0.3036623706827645, + "grad_norm": 0.11999211460351944, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 79780 + }, + { + "epoch": 0.30370043315088724, + "grad_norm": 0.12365420162677765, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 79790 + }, + { + "epoch": 0.3037384956190099, + "grad_norm": 0.11934592574834824, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 79800 + }, + { + "epoch": 0.3037765580871326, + "grad_norm": 0.1247946098446846, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 79810 + }, + { + "epoch": 0.30381462055525527, + "grad_norm": 0.1289442628622055, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 79820 + }, + { + "epoch": 0.303852683023378, + "grad_norm": 0.13009287416934967, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 79830 + }, + { + "epoch": 0.30389074549150064, + "grad_norm": 0.12187264859676361, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 79840 + }, + { + "epoch": 0.30392880795962335, + "grad_norm": 0.129324808716774, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 79850 + }, + { + "epoch": 0.303966870427746, + "grad_norm": 0.13142378628253937, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 79860 + }, + { + "epoch": 0.3040049328958687, + "grad_norm": 0.13151879608631134, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 79870 + }, + { + "epoch": 0.3040429953639914, + "grad_norm": 0.11516924202442169, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 79880 + }, + { + "epoch": 0.3040810578321141, + "grad_norm": 0.13340215384960175, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 79890 + }, + { + "epoch": 0.30411912030023674, + "grad_norm": 0.1257549673318863, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 79900 + }, + { + "epoch": 0.30415718276835946, + "grad_norm": 0.11783862113952637, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 79910 + }, + { + "epoch": 0.3041952452364821, + "grad_norm": 0.12970565259456635, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 79920 + }, + { + "epoch": 0.30423330770460477, + "grad_norm": 0.12826666235923767, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 79930 + }, + { + "epoch": 0.3042713701727275, + "grad_norm": 0.13249367475509644, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 79940 + }, + { + "epoch": 0.30430943264085014, + "grad_norm": 0.12484166771173477, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 79950 + }, + { + "epoch": 0.30434749510897285, + "grad_norm": 0.12426943331956863, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 79960 + }, + { + "epoch": 0.3043855575770955, + "grad_norm": 0.1309194564819336, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 79970 + }, + { + "epoch": 0.3044236200452182, + "grad_norm": 0.12265652418136597, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 79980 + }, + { + "epoch": 0.3044616825133409, + "grad_norm": 0.13268910348415375, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 79990 + }, + { + "epoch": 0.3044997449814636, + "grad_norm": 0.12794676423072815, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 80000 + }, + { + "epoch": 0.30453780744958625, + "grad_norm": 0.1281774789094925, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 80010 + }, + { + "epoch": 0.30457586991770896, + "grad_norm": 0.13531850278377533, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 80020 + }, + { + "epoch": 0.3046139323858316, + "grad_norm": 0.13700567185878754, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 80030 + }, + { + "epoch": 0.30465199485395433, + "grad_norm": 0.13160550594329834, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 80040 + }, + { + "epoch": 0.304690057322077, + "grad_norm": 0.12073525786399841, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 80050 + }, + { + "epoch": 0.3047281197901997, + "grad_norm": 0.12149331718683243, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 80060 + }, + { + "epoch": 0.30476618225832236, + "grad_norm": 0.13199874758720398, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 80070 + }, + { + "epoch": 0.304804244726445, + "grad_norm": 0.14713576436042786, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 80080 + }, + { + "epoch": 0.30484230719456773, + "grad_norm": 0.12162210047245026, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 80090 + }, + { + "epoch": 0.3048803696626904, + "grad_norm": 0.11976277083158493, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 80100 + }, + { + "epoch": 0.3049184321308131, + "grad_norm": 0.11663439124822617, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 80110 + }, + { + "epoch": 0.30495649459893576, + "grad_norm": 0.11407138407230377, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 80120 + }, + { + "epoch": 0.30499455706705847, + "grad_norm": 0.11895322799682617, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 80130 + }, + { + "epoch": 0.3050326195351811, + "grad_norm": 0.12373096495866776, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 80140 + }, + { + "epoch": 0.30507068200330384, + "grad_norm": 0.12218614667654037, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 80150 + }, + { + "epoch": 0.3051087444714265, + "grad_norm": 0.11597719043493271, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 80160 + }, + { + "epoch": 0.3051468069395492, + "grad_norm": 0.12582460045814514, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 80170 + }, + { + "epoch": 0.30518486940767187, + "grad_norm": 0.13839006423950195, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 80180 + }, + { + "epoch": 0.3052229318757946, + "grad_norm": 0.1346767693758011, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 80190 + }, + { + "epoch": 0.30526099434391724, + "grad_norm": 0.13050340116024017, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 80200 + }, + { + "epoch": 0.3052990568120399, + "grad_norm": 0.12204218655824661, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 80210 + }, + { + "epoch": 0.3053371192801626, + "grad_norm": 0.1268157809972763, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 80220 + }, + { + "epoch": 0.30537518174828526, + "grad_norm": 0.12779980897903442, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 80230 + }, + { + "epoch": 0.305413244216408, + "grad_norm": 0.12223111093044281, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 80240 + }, + { + "epoch": 0.30545130668453063, + "grad_norm": 0.11273365467786789, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 80250 + }, + { + "epoch": 0.30548936915265335, + "grad_norm": 0.13877561688423157, + "learning_rate": 0.0005, + "loss": 2.1553, + "step": 80260 + }, + { + "epoch": 0.305527431620776, + "grad_norm": 0.13216367363929749, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 80270 + }, + { + "epoch": 0.3055654940888987, + "grad_norm": 0.12295740097761154, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 80280 + }, + { + "epoch": 0.30560355655702137, + "grad_norm": 0.1392471343278885, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 80290 + }, + { + "epoch": 0.3056416190251441, + "grad_norm": 0.125979945063591, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 80300 + }, + { + "epoch": 0.30567968149326674, + "grad_norm": 0.12661942839622498, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 80310 + }, + { + "epoch": 0.30571774396138945, + "grad_norm": 0.12181428074836731, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 80320 + }, + { + "epoch": 0.3057558064295121, + "grad_norm": 0.11611612141132355, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 80330 + }, + { + "epoch": 0.3057938688976348, + "grad_norm": 0.1311190128326416, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 80340 + }, + { + "epoch": 0.3058319313657575, + "grad_norm": 0.12849178910255432, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 80350 + }, + { + "epoch": 0.30586999383388014, + "grad_norm": 0.14650395512580872, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 80360 + }, + { + "epoch": 0.30590805630200285, + "grad_norm": 0.14445063471794128, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 80370 + }, + { + "epoch": 0.3059461187701255, + "grad_norm": 0.11967745423316956, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 80380 + }, + { + "epoch": 0.3059841812382482, + "grad_norm": 0.12437941133975983, + "learning_rate": 0.0005, + "loss": 2.1525, + "step": 80390 + }, + { + "epoch": 0.3060222437063709, + "grad_norm": 0.11988309025764465, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 80400 + }, + { + "epoch": 0.3060603061744936, + "grad_norm": 0.122268445789814, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 80410 + }, + { + "epoch": 0.30609836864261625, + "grad_norm": 0.11369524151086807, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 80420 + }, + { + "epoch": 0.30613643111073896, + "grad_norm": 0.1246071457862854, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 80430 + }, + { + "epoch": 0.3061744935788616, + "grad_norm": 0.13041529059410095, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 80440 + }, + { + "epoch": 0.30621255604698433, + "grad_norm": 0.12700675427913666, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 80450 + }, + { + "epoch": 0.306250618515107, + "grad_norm": 0.1286431849002838, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 80460 + }, + { + "epoch": 0.3062886809832297, + "grad_norm": 0.12057080119848251, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 80470 + }, + { + "epoch": 0.30632674345135236, + "grad_norm": 0.1279134750366211, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 80480 + }, + { + "epoch": 0.30636480591947507, + "grad_norm": 0.13255710899829865, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 80490 + }, + { + "epoch": 0.3064028683875977, + "grad_norm": 0.11516743898391724, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 80500 + }, + { + "epoch": 0.3064409308557204, + "grad_norm": 0.12610827386379242, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 80510 + }, + { + "epoch": 0.3064789933238431, + "grad_norm": 0.11879970878362656, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 80520 + }, + { + "epoch": 0.30651705579196575, + "grad_norm": 0.11862923949956894, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 80530 + }, + { + "epoch": 0.30655511826008847, + "grad_norm": 0.12239440530538559, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 80540 + }, + { + "epoch": 0.3065931807282111, + "grad_norm": 0.12470649182796478, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 80550 + }, + { + "epoch": 0.30663124319633384, + "grad_norm": 0.12501883506774902, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 80560 + }, + { + "epoch": 0.3066693056644565, + "grad_norm": 0.1407422125339508, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 80570 + }, + { + "epoch": 0.3067073681325792, + "grad_norm": 0.12229721248149872, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 80580 + }, + { + "epoch": 0.30674543060070186, + "grad_norm": 0.11805040389299393, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 80590 + }, + { + "epoch": 0.3067834930688246, + "grad_norm": 0.1299421191215515, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 80600 + }, + { + "epoch": 0.30682155553694723, + "grad_norm": 0.12241193652153015, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 80610 + }, + { + "epoch": 0.30685961800506995, + "grad_norm": 0.13378703594207764, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 80620 + }, + { + "epoch": 0.3068976804731926, + "grad_norm": 0.12666665017604828, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 80630 + }, + { + "epoch": 0.30693574294131526, + "grad_norm": 0.13109880685806274, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 80640 + }, + { + "epoch": 0.30697380540943797, + "grad_norm": 0.12502406537532806, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 80650 + }, + { + "epoch": 0.30701186787756063, + "grad_norm": 0.1233179122209549, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 80660 + }, + { + "epoch": 0.30704993034568334, + "grad_norm": 0.1341659426689148, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 80670 + }, + { + "epoch": 0.307087992813806, + "grad_norm": 0.17359262704849243, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 80680 + }, + { + "epoch": 0.3071260552819287, + "grad_norm": 0.12785649299621582, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 80690 + }, + { + "epoch": 0.30716411775005137, + "grad_norm": 0.12294891476631165, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 80700 + }, + { + "epoch": 0.3072021802181741, + "grad_norm": 0.11376363039016724, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 80710 + }, + { + "epoch": 0.30724024268629674, + "grad_norm": 0.12576399743556976, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 80720 + }, + { + "epoch": 0.30727830515441945, + "grad_norm": 0.12487640976905823, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 80730 + }, + { + "epoch": 0.3073163676225421, + "grad_norm": 0.12618839740753174, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 80740 + }, + { + "epoch": 0.3073544300906648, + "grad_norm": 0.11612705141305923, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 80750 + }, + { + "epoch": 0.3073924925587875, + "grad_norm": 0.12097634375095367, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 80760 + }, + { + "epoch": 0.3074305550269102, + "grad_norm": 0.12826375663280487, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 80770 + }, + { + "epoch": 0.30746861749503285, + "grad_norm": 0.13108542561531067, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 80780 + }, + { + "epoch": 0.3075066799631555, + "grad_norm": 0.12649457156658173, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 80790 + }, + { + "epoch": 0.3075447424312782, + "grad_norm": 0.12309123575687408, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 80800 + }, + { + "epoch": 0.3075828048994009, + "grad_norm": 0.12929083406925201, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 80810 + }, + { + "epoch": 0.3076208673675236, + "grad_norm": 0.11814327538013458, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 80820 + }, + { + "epoch": 0.30765892983564624, + "grad_norm": 0.14209777116775513, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 80830 + }, + { + "epoch": 0.30769699230376896, + "grad_norm": 0.12277386337518692, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 80840 + }, + { + "epoch": 0.3077350547718916, + "grad_norm": 0.14316172897815704, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 80850 + }, + { + "epoch": 0.3077731172400143, + "grad_norm": 0.12181728333234787, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 80860 + }, + { + "epoch": 0.307811179708137, + "grad_norm": 0.12636560201644897, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 80870 + }, + { + "epoch": 0.3078492421762597, + "grad_norm": 0.13825927674770355, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 80880 + }, + { + "epoch": 0.30788730464438235, + "grad_norm": 0.14487019181251526, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 80890 + }, + { + "epoch": 0.30792536711250507, + "grad_norm": 0.1338072568178177, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 80900 + }, + { + "epoch": 0.3079634295806277, + "grad_norm": 0.12863421440124512, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 80910 + }, + { + "epoch": 0.30800149204875044, + "grad_norm": 0.1381281465291977, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 80920 + }, + { + "epoch": 0.3080395545168731, + "grad_norm": 0.12698550522327423, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 80930 + }, + { + "epoch": 0.30807761698499575, + "grad_norm": 0.12528090178966522, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 80940 + }, + { + "epoch": 0.30811567945311846, + "grad_norm": 0.12406053394079208, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 80950 + }, + { + "epoch": 0.3081537419212411, + "grad_norm": 0.12869417667388916, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 80960 + }, + { + "epoch": 0.30819180438936383, + "grad_norm": 0.12129397690296173, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 80970 + }, + { + "epoch": 0.3082298668574865, + "grad_norm": 0.14590273797512054, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 80980 + }, + { + "epoch": 0.3082679293256092, + "grad_norm": 0.12732772529125214, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 80990 + }, + { + "epoch": 0.30830599179373186, + "grad_norm": 0.11924770474433899, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 81000 + }, + { + "epoch": 0.30834405426185457, + "grad_norm": 0.12182494252920151, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 81010 + }, + { + "epoch": 0.30838211672997723, + "grad_norm": 0.11438652127981186, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 81020 + }, + { + "epoch": 0.30842017919809994, + "grad_norm": 0.11833816021680832, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 81030 + }, + { + "epoch": 0.3084582416662226, + "grad_norm": 0.12479960173368454, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 81040 + }, + { + "epoch": 0.3084963041343453, + "grad_norm": 0.11975480616092682, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 81050 + }, + { + "epoch": 0.30853436660246797, + "grad_norm": 0.15223437547683716, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 81060 + }, + { + "epoch": 0.3085724290705906, + "grad_norm": 0.1335378885269165, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 81070 + }, + { + "epoch": 0.30861049153871334, + "grad_norm": 0.13886189460754395, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 81080 + }, + { + "epoch": 0.308648554006836, + "grad_norm": 0.13074654340744019, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 81090 + }, + { + "epoch": 0.3086866164749587, + "grad_norm": 0.13728798925876617, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 81100 + }, + { + "epoch": 0.30872467894308137, + "grad_norm": 0.12100497633218765, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 81110 + }, + { + "epoch": 0.3087627414112041, + "grad_norm": 0.12018078565597534, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 81120 + }, + { + "epoch": 0.30880080387932674, + "grad_norm": 0.12105260044336319, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 81130 + }, + { + "epoch": 0.30883886634744945, + "grad_norm": 0.11997707188129425, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 81140 + }, + { + "epoch": 0.3088769288155721, + "grad_norm": 0.11912636458873749, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 81150 + }, + { + "epoch": 0.3089149912836948, + "grad_norm": 0.1256522387266159, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 81160 + }, + { + "epoch": 0.3089530537518175, + "grad_norm": 0.12447361648082733, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 81170 + }, + { + "epoch": 0.3089911162199402, + "grad_norm": 0.12331148236989975, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 81180 + }, + { + "epoch": 0.30902917868806284, + "grad_norm": 0.12121827155351639, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 81190 + }, + { + "epoch": 0.30906724115618556, + "grad_norm": 0.13635829091072083, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 81200 + }, + { + "epoch": 0.3091053036243082, + "grad_norm": 0.10935920476913452, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 81210 + }, + { + "epoch": 0.30914336609243087, + "grad_norm": 0.12570302188396454, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 81220 + }, + { + "epoch": 0.3091814285605536, + "grad_norm": 0.13077987730503082, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 81230 + }, + { + "epoch": 0.30921949102867624, + "grad_norm": 0.12293568253517151, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 81240 + }, + { + "epoch": 0.30925755349679895, + "grad_norm": 0.14048981666564941, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 81250 + }, + { + "epoch": 0.3092956159649216, + "grad_norm": 0.12315212190151215, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 81260 + }, + { + "epoch": 0.3093336784330443, + "grad_norm": 0.1180586889386177, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 81270 + }, + { + "epoch": 0.309371740901167, + "grad_norm": 0.12581458687782288, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 81280 + }, + { + "epoch": 0.3094098033692897, + "grad_norm": 0.13311411440372467, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 81290 + }, + { + "epoch": 0.30944786583741235, + "grad_norm": 0.12257678806781769, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 81300 + }, + { + "epoch": 0.30948592830553506, + "grad_norm": 0.12836302816867828, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 81310 + }, + { + "epoch": 0.3095239907736577, + "grad_norm": 0.11991007626056671, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 81320 + }, + { + "epoch": 0.30956205324178043, + "grad_norm": 0.12842082977294922, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 81330 + }, + { + "epoch": 0.3096001157099031, + "grad_norm": 0.13021841645240784, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 81340 + }, + { + "epoch": 0.3096381781780258, + "grad_norm": 0.11219491064548492, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 81350 + }, + { + "epoch": 0.30967624064614846, + "grad_norm": 0.13072046637535095, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 81360 + }, + { + "epoch": 0.3097143031142711, + "grad_norm": 0.1303190290927887, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 81370 + }, + { + "epoch": 0.30975236558239383, + "grad_norm": 0.12236102670431137, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 81380 + }, + { + "epoch": 0.3097904280505165, + "grad_norm": 0.12813524901866913, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 81390 + }, + { + "epoch": 0.3098284905186392, + "grad_norm": 0.1337427794933319, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 81400 + }, + { + "epoch": 0.30986655298676186, + "grad_norm": 0.11618053168058395, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 81410 + }, + { + "epoch": 0.30990461545488457, + "grad_norm": 0.11345713585615158, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 81420 + }, + { + "epoch": 0.3099426779230072, + "grad_norm": 0.11531246453523636, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 81430 + }, + { + "epoch": 0.30998074039112994, + "grad_norm": 0.1406860649585724, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 81440 + }, + { + "epoch": 0.3100188028592526, + "grad_norm": 0.11967140436172485, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 81450 + }, + { + "epoch": 0.3100568653273753, + "grad_norm": 0.13636508584022522, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 81460 + }, + { + "epoch": 0.31009492779549797, + "grad_norm": 0.11664880812168121, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 81470 + }, + { + "epoch": 0.3101329902636207, + "grad_norm": 0.1184813529253006, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 81480 + }, + { + "epoch": 0.31017105273174334, + "grad_norm": 0.11842097342014313, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 81490 + }, + { + "epoch": 0.310209115199866, + "grad_norm": 0.11978063732385635, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 81500 + }, + { + "epoch": 0.3102471776679887, + "grad_norm": 0.1305670589208603, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 81510 + }, + { + "epoch": 0.31028524013611136, + "grad_norm": 0.12473088502883911, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 81520 + }, + { + "epoch": 0.3103233026042341, + "grad_norm": 0.13610634207725525, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 81530 + }, + { + "epoch": 0.31036136507235673, + "grad_norm": 0.12722398340702057, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 81540 + }, + { + "epoch": 0.31039942754047944, + "grad_norm": 0.11809282749891281, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 81550 + }, + { + "epoch": 0.3104374900086021, + "grad_norm": 0.11579307168722153, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 81560 + }, + { + "epoch": 0.3104755524767248, + "grad_norm": 0.1184435710310936, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 81570 + }, + { + "epoch": 0.31051361494484747, + "grad_norm": 0.11998499184846878, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 81580 + }, + { + "epoch": 0.3105516774129702, + "grad_norm": 0.11946476250886917, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 81590 + }, + { + "epoch": 0.31058973988109284, + "grad_norm": 0.12768951058387756, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 81600 + }, + { + "epoch": 0.31062780234921555, + "grad_norm": 0.11800798773765564, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 81610 + }, + { + "epoch": 0.3106658648173382, + "grad_norm": 0.1284877508878708, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 81620 + }, + { + "epoch": 0.3107039272854609, + "grad_norm": 0.13244765996932983, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 81630 + }, + { + "epoch": 0.3107419897535836, + "grad_norm": 0.12389449030160904, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 81640 + }, + { + "epoch": 0.31078005222170624, + "grad_norm": 0.1269783228635788, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 81650 + }, + { + "epoch": 0.31081811468982895, + "grad_norm": 0.12645235657691956, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 81660 + }, + { + "epoch": 0.3108561771579516, + "grad_norm": 0.12159260362386703, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 81670 + }, + { + "epoch": 0.3108942396260743, + "grad_norm": 0.140539288520813, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 81680 + }, + { + "epoch": 0.310932302094197, + "grad_norm": 0.12919221818447113, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 81690 + }, + { + "epoch": 0.3109703645623197, + "grad_norm": 0.13578203320503235, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 81700 + }, + { + "epoch": 0.31100842703044235, + "grad_norm": 0.11720879375934601, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 81710 + }, + { + "epoch": 0.31104648949856506, + "grad_norm": 0.11770500987768173, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 81720 + }, + { + "epoch": 0.3110845519666877, + "grad_norm": 0.1371755450963974, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 81730 + }, + { + "epoch": 0.31112261443481043, + "grad_norm": 0.12106727808713913, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 81740 + }, + { + "epoch": 0.3111606769029331, + "grad_norm": 0.1305050551891327, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 81750 + }, + { + "epoch": 0.3111987393710558, + "grad_norm": 0.13445325195789337, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 81760 + }, + { + "epoch": 0.31123680183917846, + "grad_norm": 0.1206723302602768, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 81770 + }, + { + "epoch": 0.31127486430730117, + "grad_norm": 0.1241808757185936, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 81780 + }, + { + "epoch": 0.3113129267754238, + "grad_norm": 0.12647885084152222, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 81790 + }, + { + "epoch": 0.3113509892435465, + "grad_norm": 0.11793207377195358, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 81800 + }, + { + "epoch": 0.3113890517116692, + "grad_norm": 0.13345180451869965, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 81810 + }, + { + "epoch": 0.31142711417979185, + "grad_norm": 0.13315977156162262, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 81820 + }, + { + "epoch": 0.31146517664791457, + "grad_norm": 0.1259089708328247, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 81830 + }, + { + "epoch": 0.3115032391160372, + "grad_norm": 0.11526235193014145, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 81840 + }, + { + "epoch": 0.31154130158415994, + "grad_norm": 0.13554058969020844, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 81850 + }, + { + "epoch": 0.3115793640522826, + "grad_norm": 0.1364506483078003, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 81860 + }, + { + "epoch": 0.3116174265204053, + "grad_norm": 0.1301794946193695, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 81870 + }, + { + "epoch": 0.31165548898852796, + "grad_norm": 0.1580042839050293, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 81880 + }, + { + "epoch": 0.3116935514566507, + "grad_norm": 0.13931454718112946, + "learning_rate": 0.0005, + "loss": 2.145, + "step": 81890 + }, + { + "epoch": 0.31173161392477333, + "grad_norm": 0.13075725734233856, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 81900 + }, + { + "epoch": 0.31176967639289604, + "grad_norm": 0.14682362973690033, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 81910 + }, + { + "epoch": 0.3118077388610187, + "grad_norm": 0.14456802606582642, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 81920 + }, + { + "epoch": 0.3118458013291414, + "grad_norm": 0.13011014461517334, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 81930 + }, + { + "epoch": 0.31188386379726407, + "grad_norm": 0.13531902432441711, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 81940 + }, + { + "epoch": 0.31192192626538673, + "grad_norm": 0.13501720130443573, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 81950 + }, + { + "epoch": 0.31195998873350944, + "grad_norm": 0.12197815626859665, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 81960 + }, + { + "epoch": 0.3119980512016321, + "grad_norm": 0.11486975848674774, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 81970 + }, + { + "epoch": 0.3120361136697548, + "grad_norm": 0.11543624103069305, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 81980 + }, + { + "epoch": 0.31207417613787747, + "grad_norm": 0.14072877168655396, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 81990 + }, + { + "epoch": 0.3121122386060002, + "grad_norm": 0.11827097833156586, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 82000 + }, + { + "epoch": 0.31215030107412284, + "grad_norm": 0.12375832349061966, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 82010 + }, + { + "epoch": 0.31218836354224555, + "grad_norm": 0.12070365995168686, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 82020 + }, + { + "epoch": 0.3122264260103682, + "grad_norm": 0.1299162209033966, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 82030 + }, + { + "epoch": 0.3122644884784909, + "grad_norm": 0.11619587242603302, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 82040 + }, + { + "epoch": 0.3123025509466136, + "grad_norm": 0.13671796023845673, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 82050 + }, + { + "epoch": 0.3123406134147363, + "grad_norm": 0.11353994160890579, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 82060 + }, + { + "epoch": 0.31237867588285895, + "grad_norm": 0.12547942996025085, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 82070 + }, + { + "epoch": 0.3124167383509816, + "grad_norm": 0.12046770751476288, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 82080 + }, + { + "epoch": 0.3124548008191043, + "grad_norm": 0.1242809072136879, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 82090 + }, + { + "epoch": 0.312492863287227, + "grad_norm": 0.12850497663021088, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 82100 + }, + { + "epoch": 0.3125309257553497, + "grad_norm": 0.13980376720428467, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 82110 + }, + { + "epoch": 0.31256898822347234, + "grad_norm": 0.12118227034807205, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 82120 + }, + { + "epoch": 0.31260705069159506, + "grad_norm": 0.13284452259540558, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 82130 + }, + { + "epoch": 0.3126451131597177, + "grad_norm": 0.13698700070381165, + "learning_rate": 0.0005, + "loss": 2.1455, + "step": 82140 + }, + { + "epoch": 0.3126831756278404, + "grad_norm": 0.13114923238754272, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 82150 + }, + { + "epoch": 0.3127212380959631, + "grad_norm": 0.14440645277500153, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 82160 + }, + { + "epoch": 0.3127593005640858, + "grad_norm": 0.1295204907655716, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 82170 + }, + { + "epoch": 0.31279736303220845, + "grad_norm": 0.12142180651426315, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 82180 + }, + { + "epoch": 0.31283542550033117, + "grad_norm": 0.12380526959896088, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 82190 + }, + { + "epoch": 0.3128734879684538, + "grad_norm": 0.1308668702840805, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 82200 + }, + { + "epoch": 0.31291155043657654, + "grad_norm": 0.12716633081436157, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 82210 + }, + { + "epoch": 0.3129496129046992, + "grad_norm": 0.13136553764343262, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 82220 + }, + { + "epoch": 0.31298767537282185, + "grad_norm": 0.12801966071128845, + "learning_rate": 0.0005, + "loss": 2.1465, + "step": 82230 + }, + { + "epoch": 0.31302573784094456, + "grad_norm": 0.12598226964473724, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 82240 + }, + { + "epoch": 0.3130638003090672, + "grad_norm": 0.11136070638895035, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 82250 + }, + { + "epoch": 0.31310186277718993, + "grad_norm": 0.12658236920833588, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 82260 + }, + { + "epoch": 0.3131399252453126, + "grad_norm": 0.1299169957637787, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 82270 + }, + { + "epoch": 0.3131779877134353, + "grad_norm": 0.11735150963068008, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 82280 + }, + { + "epoch": 0.31321605018155796, + "grad_norm": 0.11863543093204498, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 82290 + }, + { + "epoch": 0.31325411264968067, + "grad_norm": 0.1262042224407196, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 82300 + }, + { + "epoch": 0.31329217511780333, + "grad_norm": 0.11971874535083771, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 82310 + }, + { + "epoch": 0.31333023758592604, + "grad_norm": 0.11084333807229996, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 82320 + }, + { + "epoch": 0.3133683000540487, + "grad_norm": 0.13071833550930023, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 82330 + }, + { + "epoch": 0.3134063625221714, + "grad_norm": 0.13296374678611755, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 82340 + }, + { + "epoch": 0.31344442499029407, + "grad_norm": 0.1339769810438156, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 82350 + }, + { + "epoch": 0.3134824874584168, + "grad_norm": 0.12778893113136292, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 82360 + }, + { + "epoch": 0.31352054992653944, + "grad_norm": 0.11761803925037384, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 82370 + }, + { + "epoch": 0.3135586123946621, + "grad_norm": 0.12162280082702637, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 82380 + }, + { + "epoch": 0.3135966748627848, + "grad_norm": 0.12358004599809647, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 82390 + }, + { + "epoch": 0.31363473733090746, + "grad_norm": 0.12462770938873291, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 82400 + }, + { + "epoch": 0.3136727997990302, + "grad_norm": 0.12161625921726227, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 82410 + }, + { + "epoch": 0.31371086226715283, + "grad_norm": 0.12660494446754456, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 82420 + }, + { + "epoch": 0.31374892473527555, + "grad_norm": 0.13155150413513184, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 82430 + }, + { + "epoch": 0.3137869872033982, + "grad_norm": 0.12684141099452972, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 82440 + }, + { + "epoch": 0.3138250496715209, + "grad_norm": 0.11854333430528641, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 82450 + }, + { + "epoch": 0.3138631121396436, + "grad_norm": 0.12792110443115234, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 82460 + }, + { + "epoch": 0.3139011746077663, + "grad_norm": 0.12354903668165207, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 82470 + }, + { + "epoch": 0.31393923707588894, + "grad_norm": 0.12082675844430923, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 82480 + }, + { + "epoch": 0.31397729954401166, + "grad_norm": 0.12705554068088531, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 82490 + }, + { + "epoch": 0.3140153620121343, + "grad_norm": 0.12151361256837845, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 82500 + }, + { + "epoch": 0.31405342448025697, + "grad_norm": 0.11975157260894775, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 82510 + }, + { + "epoch": 0.3140914869483797, + "grad_norm": 0.21595405042171478, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 82520 + }, + { + "epoch": 0.31412954941650234, + "grad_norm": 0.12035899609327316, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 82530 + }, + { + "epoch": 0.31416761188462505, + "grad_norm": 0.14372199773788452, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 82540 + }, + { + "epoch": 0.3142056743527477, + "grad_norm": 0.12237467616796494, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 82550 + }, + { + "epoch": 0.3142437368208704, + "grad_norm": 0.1329762488603592, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 82560 + }, + { + "epoch": 0.3142817992889931, + "grad_norm": 0.13943737745285034, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 82570 + }, + { + "epoch": 0.3143198617571158, + "grad_norm": 0.12585225701332092, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 82580 + }, + { + "epoch": 0.31435792422523845, + "grad_norm": 0.1350608915090561, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 82590 + }, + { + "epoch": 0.31439598669336116, + "grad_norm": 0.12448044866323471, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 82600 + }, + { + "epoch": 0.3144340491614838, + "grad_norm": 0.13429243862628937, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 82610 + }, + { + "epoch": 0.31447211162960653, + "grad_norm": 0.12286079674959183, + "learning_rate": 0.0005, + "loss": 2.1459, + "step": 82620 + }, + { + "epoch": 0.3145101740977292, + "grad_norm": 0.12074466049671173, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 82630 + }, + { + "epoch": 0.3145482365658519, + "grad_norm": 0.12522518634796143, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 82640 + }, + { + "epoch": 0.31458629903397456, + "grad_norm": 0.1266522854566574, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 82650 + }, + { + "epoch": 0.3146243615020972, + "grad_norm": 0.1456829160451889, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 82660 + }, + { + "epoch": 0.31466242397021993, + "grad_norm": 0.12169001996517181, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 82670 + }, + { + "epoch": 0.3147004864383426, + "grad_norm": 0.11195474117994308, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 82680 + }, + { + "epoch": 0.3147385489064653, + "grad_norm": 0.12271256744861603, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 82690 + }, + { + "epoch": 0.31477661137458796, + "grad_norm": 0.1269110143184662, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 82700 + }, + { + "epoch": 0.31481467384271067, + "grad_norm": 0.12153013050556183, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 82710 + }, + { + "epoch": 0.3148527363108333, + "grad_norm": 0.13175609707832336, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 82720 + }, + { + "epoch": 0.31489079877895604, + "grad_norm": 0.1300119310617447, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 82730 + }, + { + "epoch": 0.3149288612470787, + "grad_norm": 0.12770238518714905, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 82740 + }, + { + "epoch": 0.3149669237152014, + "grad_norm": 0.15108972787857056, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 82750 + }, + { + "epoch": 0.31500498618332406, + "grad_norm": 0.12652187049388885, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 82760 + }, + { + "epoch": 0.3150430486514468, + "grad_norm": 0.12813642621040344, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 82770 + }, + { + "epoch": 0.31508111111956943, + "grad_norm": 0.11277160048484802, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 82780 + }, + { + "epoch": 0.31511917358769215, + "grad_norm": 0.12133605778217316, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 82790 + }, + { + "epoch": 0.3151572360558148, + "grad_norm": 0.2113325148820877, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 82800 + }, + { + "epoch": 0.31519529852393746, + "grad_norm": 0.13138943910598755, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 82810 + }, + { + "epoch": 0.3152333609920602, + "grad_norm": 0.13198734819889069, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 82820 + }, + { + "epoch": 0.31527142346018283, + "grad_norm": 0.12452925741672516, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 82830 + }, + { + "epoch": 0.31530948592830554, + "grad_norm": 0.12573881447315216, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 82840 + }, + { + "epoch": 0.3153475483964282, + "grad_norm": 0.11985309422016144, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 82850 + }, + { + "epoch": 0.3153856108645509, + "grad_norm": 0.13780802488327026, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 82860 + }, + { + "epoch": 0.31542367333267357, + "grad_norm": 0.11680909991264343, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 82870 + }, + { + "epoch": 0.3154617358007963, + "grad_norm": 0.1293938308954239, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 82880 + }, + { + "epoch": 0.31549979826891894, + "grad_norm": 0.12560796737670898, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 82890 + }, + { + "epoch": 0.31553786073704165, + "grad_norm": 0.11934496462345123, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 82900 + }, + { + "epoch": 0.3155759232051643, + "grad_norm": 0.12571270763874054, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 82910 + }, + { + "epoch": 0.315613985673287, + "grad_norm": 0.10958436131477356, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 82920 + }, + { + "epoch": 0.3156520481414097, + "grad_norm": 0.13606712222099304, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 82930 + }, + { + "epoch": 0.31569011060953234, + "grad_norm": 0.11906415224075317, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 82940 + }, + { + "epoch": 0.31572817307765505, + "grad_norm": 0.11434603482484818, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 82950 + }, + { + "epoch": 0.3157662355457777, + "grad_norm": 0.1147366389632225, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 82960 + }, + { + "epoch": 0.3158042980139004, + "grad_norm": 0.12987957894802094, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 82970 + }, + { + "epoch": 0.3158423604820231, + "grad_norm": 0.13320422172546387, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 82980 + }, + { + "epoch": 0.3158804229501458, + "grad_norm": 0.12843391299247742, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 82990 + }, + { + "epoch": 0.31591848541826845, + "grad_norm": 0.1171526238322258, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 83000 + }, + { + "epoch": 0.31595654788639116, + "grad_norm": 0.1265607625246048, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 83010 + }, + { + "epoch": 0.3159946103545138, + "grad_norm": 0.13495829701423645, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 83020 + }, + { + "epoch": 0.31603267282263653, + "grad_norm": 0.11635854840278625, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 83030 + }, + { + "epoch": 0.3160707352907592, + "grad_norm": 0.12299887835979462, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 83040 + }, + { + "epoch": 0.3161087977588819, + "grad_norm": 0.12657538056373596, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 83050 + }, + { + "epoch": 0.31614686022700456, + "grad_norm": 0.12187173217535019, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 83060 + }, + { + "epoch": 0.31618492269512727, + "grad_norm": 0.13706451654434204, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 83070 + }, + { + "epoch": 0.3162229851632499, + "grad_norm": 0.11517681926488876, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 83080 + }, + { + "epoch": 0.3162610476313726, + "grad_norm": 0.12629637122154236, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 83090 + }, + { + "epoch": 0.3162991100994953, + "grad_norm": 0.1268184632062912, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 83100 + }, + { + "epoch": 0.31633717256761795, + "grad_norm": 0.12249192595481873, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 83110 + }, + { + "epoch": 0.31637523503574067, + "grad_norm": 0.12270066142082214, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 83120 + }, + { + "epoch": 0.3164132975038633, + "grad_norm": 0.11289820820093155, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 83130 + }, + { + "epoch": 0.31645135997198603, + "grad_norm": 0.14470253884792328, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 83140 + }, + { + "epoch": 0.3164894224401087, + "grad_norm": 0.12982015311717987, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 83150 + }, + { + "epoch": 0.3165274849082314, + "grad_norm": 0.13806003332138062, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 83160 + }, + { + "epoch": 0.31656554737635406, + "grad_norm": 0.12923070788383484, + "learning_rate": 0.0005, + "loss": 2.1423, + "step": 83170 + }, + { + "epoch": 0.3166036098444768, + "grad_norm": 0.113308846950531, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 83180 + }, + { + "epoch": 0.31664167231259943, + "grad_norm": 0.12376850098371506, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 83190 + }, + { + "epoch": 0.31667973478072214, + "grad_norm": 0.12424514442682266, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 83200 + }, + { + "epoch": 0.3167177972488448, + "grad_norm": 0.12274659425020218, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 83210 + }, + { + "epoch": 0.3167558597169675, + "grad_norm": 0.13459163904190063, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 83220 + }, + { + "epoch": 0.31679392218509017, + "grad_norm": 0.12273278087377548, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 83230 + }, + { + "epoch": 0.31683198465321283, + "grad_norm": 0.12100246548652649, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 83240 + }, + { + "epoch": 0.31687004712133554, + "grad_norm": 0.13598744571208954, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 83250 + }, + { + "epoch": 0.3169081095894582, + "grad_norm": 0.130888432264328, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 83260 + }, + { + "epoch": 0.3169461720575809, + "grad_norm": 0.1161246970295906, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 83270 + }, + { + "epoch": 0.31698423452570357, + "grad_norm": 0.1331406831741333, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 83280 + }, + { + "epoch": 0.3170222969938263, + "grad_norm": 0.13716700673103333, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 83290 + }, + { + "epoch": 0.31706035946194894, + "grad_norm": 0.13076649606227875, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 83300 + }, + { + "epoch": 0.31709842193007165, + "grad_norm": 0.12399870902299881, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 83310 + }, + { + "epoch": 0.3171364843981943, + "grad_norm": 0.12629511952400208, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 83320 + }, + { + "epoch": 0.317174546866317, + "grad_norm": 0.13469186425209045, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 83330 + }, + { + "epoch": 0.3172126093344397, + "grad_norm": 0.12043331563472748, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 83340 + }, + { + "epoch": 0.3172506718025624, + "grad_norm": 0.11527103930711746, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 83350 + }, + { + "epoch": 0.31728873427068505, + "grad_norm": 0.12726227939128876, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 83360 + }, + { + "epoch": 0.3173267967388077, + "grad_norm": 0.1191035658121109, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 83370 + }, + { + "epoch": 0.3173648592069304, + "grad_norm": 0.11827237159013748, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 83380 + }, + { + "epoch": 0.3174029216750531, + "grad_norm": 0.1270776093006134, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 83390 + }, + { + "epoch": 0.3174409841431758, + "grad_norm": 0.12138780951499939, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 83400 + }, + { + "epoch": 0.31747904661129844, + "grad_norm": 0.1401662975549698, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 83410 + }, + { + "epoch": 0.31751710907942116, + "grad_norm": 0.12503919005393982, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 83420 + }, + { + "epoch": 0.3175551715475438, + "grad_norm": 0.12604312598705292, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 83430 + }, + { + "epoch": 0.3175932340156665, + "grad_norm": 0.1168069988489151, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 83440 + }, + { + "epoch": 0.3176312964837892, + "grad_norm": 0.12296763062477112, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 83450 + }, + { + "epoch": 0.3176693589519119, + "grad_norm": 0.13476720452308655, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 83460 + }, + { + "epoch": 0.31770742142003455, + "grad_norm": 0.12037523835897446, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 83470 + }, + { + "epoch": 0.31774548388815727, + "grad_norm": 0.12858867645263672, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 83480 + }, + { + "epoch": 0.3177835463562799, + "grad_norm": 0.1294158548116684, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 83490 + }, + { + "epoch": 0.31782160882440263, + "grad_norm": 0.13894596695899963, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 83500 + }, + { + "epoch": 0.3178596712925253, + "grad_norm": 0.12230147421360016, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 83510 + }, + { + "epoch": 0.31789773376064795, + "grad_norm": 0.14162777364253998, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 83520 + }, + { + "epoch": 0.31793579622877066, + "grad_norm": 0.1376306414604187, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 83530 + }, + { + "epoch": 0.3179738586968933, + "grad_norm": 0.12878134846687317, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 83540 + }, + { + "epoch": 0.31801192116501603, + "grad_norm": 0.13225965201854706, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 83550 + }, + { + "epoch": 0.3180499836331387, + "grad_norm": 0.13180622458457947, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 83560 + }, + { + "epoch": 0.3180880461012614, + "grad_norm": 0.14126379787921906, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 83570 + }, + { + "epoch": 0.31812610856938406, + "grad_norm": 0.12663407623767853, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 83580 + }, + { + "epoch": 0.31816417103750677, + "grad_norm": 0.12259071320295334, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 83590 + }, + { + "epoch": 0.31820223350562943, + "grad_norm": 0.12154918909072876, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 83600 + }, + { + "epoch": 0.31824029597375214, + "grad_norm": 0.11901416629552841, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 83610 + }, + { + "epoch": 0.3182783584418748, + "grad_norm": 0.13466015458106995, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 83620 + }, + { + "epoch": 0.3183164209099975, + "grad_norm": 0.12550848722457886, + "learning_rate": 0.0005, + "loss": 2.1419, + "step": 83630 + }, + { + "epoch": 0.31835448337812017, + "grad_norm": 0.12653183937072754, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 83640 + }, + { + "epoch": 0.3183925458462429, + "grad_norm": 0.1187266856431961, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 83650 + }, + { + "epoch": 0.31843060831436554, + "grad_norm": 0.12700314819812775, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 83660 + }, + { + "epoch": 0.3184686707824882, + "grad_norm": 0.12154296040534973, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 83670 + }, + { + "epoch": 0.3185067332506109, + "grad_norm": 0.12859545648097992, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 83680 + }, + { + "epoch": 0.31854479571873356, + "grad_norm": 0.12260852754116058, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 83690 + }, + { + "epoch": 0.3185828581868563, + "grad_norm": 0.12154964357614517, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 83700 + }, + { + "epoch": 0.31862092065497893, + "grad_norm": 0.14406907558441162, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 83710 + }, + { + "epoch": 0.31865898312310165, + "grad_norm": 0.12068268656730652, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 83720 + }, + { + "epoch": 0.3186970455912243, + "grad_norm": 0.12875112891197205, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 83730 + }, + { + "epoch": 0.318735108059347, + "grad_norm": 0.1260383427143097, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 83740 + }, + { + "epoch": 0.3187731705274697, + "grad_norm": 0.118220753967762, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 83750 + }, + { + "epoch": 0.3188112329955924, + "grad_norm": 0.123013436794281, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 83760 + }, + { + "epoch": 0.31884929546371504, + "grad_norm": 0.11908526718616486, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 83770 + }, + { + "epoch": 0.31888735793183776, + "grad_norm": 0.12813295423984528, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 83780 + }, + { + "epoch": 0.3189254203999604, + "grad_norm": 0.11917294561862946, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 83790 + }, + { + "epoch": 0.31896348286808307, + "grad_norm": 0.13365842401981354, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 83800 + }, + { + "epoch": 0.3190015453362058, + "grad_norm": 0.1154765710234642, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 83810 + }, + { + "epoch": 0.31903960780432844, + "grad_norm": 0.12989890575408936, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 83820 + }, + { + "epoch": 0.31907767027245115, + "grad_norm": 0.12488723546266556, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 83830 + }, + { + "epoch": 0.3191157327405738, + "grad_norm": 0.11685290932655334, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 83840 + }, + { + "epoch": 0.3191537952086965, + "grad_norm": 0.12078642845153809, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 83850 + }, + { + "epoch": 0.3191918576768192, + "grad_norm": 0.13583782315254211, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 83860 + }, + { + "epoch": 0.3192299201449419, + "grad_norm": 0.12751080095767975, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 83870 + }, + { + "epoch": 0.31926798261306455, + "grad_norm": 0.12122221291065216, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 83880 + }, + { + "epoch": 0.31930604508118726, + "grad_norm": 0.13013103604316711, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 83890 + }, + { + "epoch": 0.3193441075493099, + "grad_norm": 0.1183476373553276, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 83900 + }, + { + "epoch": 0.31938217001743263, + "grad_norm": 0.11884431540966034, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 83910 + }, + { + "epoch": 0.3194202324855553, + "grad_norm": 0.12388347089290619, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 83920 + }, + { + "epoch": 0.319458294953678, + "grad_norm": 0.11375343799591064, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 83930 + }, + { + "epoch": 0.31949635742180066, + "grad_norm": 0.13243578374385834, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 83940 + }, + { + "epoch": 0.3195344198899233, + "grad_norm": 0.13904094696044922, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 83950 + }, + { + "epoch": 0.31957248235804603, + "grad_norm": 0.14737099409103394, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 83960 + }, + { + "epoch": 0.3196105448261687, + "grad_norm": 0.12814052402973175, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 83970 + }, + { + "epoch": 0.3196486072942914, + "grad_norm": 0.13516543805599213, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 83980 + }, + { + "epoch": 0.31968666976241406, + "grad_norm": 0.11763939261436462, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 83990 + }, + { + "epoch": 0.31972473223053677, + "grad_norm": 0.13765235245227814, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 84000 + }, + { + "epoch": 0.3197627946986594, + "grad_norm": 0.13086718320846558, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 84010 + }, + { + "epoch": 0.31980085716678214, + "grad_norm": 0.12069284170866013, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 84020 + }, + { + "epoch": 0.3198389196349048, + "grad_norm": 0.12064868956804276, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 84030 + }, + { + "epoch": 0.3198769821030275, + "grad_norm": 0.12312685698270798, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 84040 + }, + { + "epoch": 0.31991504457115016, + "grad_norm": 0.12011557072401047, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 84050 + }, + { + "epoch": 0.3199531070392729, + "grad_norm": 0.12177038192749023, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 84060 + }, + { + "epoch": 0.31999116950739553, + "grad_norm": 0.13619963824748993, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 84070 + }, + { + "epoch": 0.32002923197551825, + "grad_norm": 0.1267249584197998, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 84080 + }, + { + "epoch": 0.3200672944436409, + "grad_norm": 0.1230340376496315, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 84090 + }, + { + "epoch": 0.32010535691176356, + "grad_norm": 0.1240105852484703, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 84100 + }, + { + "epoch": 0.3201434193798863, + "grad_norm": 0.12150612473487854, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 84110 + }, + { + "epoch": 0.32018148184800893, + "grad_norm": 0.12242543697357178, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 84120 + }, + { + "epoch": 0.32021954431613164, + "grad_norm": 0.11171294748783112, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 84130 + }, + { + "epoch": 0.3202576067842543, + "grad_norm": 0.11961314082145691, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 84140 + }, + { + "epoch": 0.320295669252377, + "grad_norm": 0.12245552241802216, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 84150 + }, + { + "epoch": 0.32033373172049967, + "grad_norm": 0.13306266069412231, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 84160 + }, + { + "epoch": 0.3203717941886224, + "grad_norm": 0.13139064610004425, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 84170 + }, + { + "epoch": 0.32040985665674504, + "grad_norm": 0.12348007410764694, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 84180 + }, + { + "epoch": 0.32044791912486775, + "grad_norm": 0.1370268613100052, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 84190 + }, + { + "epoch": 0.3204859815929904, + "grad_norm": 0.13645033538341522, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 84200 + }, + { + "epoch": 0.3205240440611131, + "grad_norm": 0.11724669486284256, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 84210 + }, + { + "epoch": 0.3205621065292358, + "grad_norm": 0.13534116744995117, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 84220 + }, + { + "epoch": 0.3206001689973585, + "grad_norm": 0.1163550466299057, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 84230 + }, + { + "epoch": 0.32063823146548115, + "grad_norm": 0.11814413964748383, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 84240 + }, + { + "epoch": 0.3206762939336038, + "grad_norm": 0.13630716502666473, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 84250 + }, + { + "epoch": 0.3207143564017265, + "grad_norm": 0.11421690881252289, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 84260 + }, + { + "epoch": 0.3207524188698492, + "grad_norm": 0.11150612682104111, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 84270 + }, + { + "epoch": 0.3207904813379719, + "grad_norm": 0.1303691864013672, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 84280 + }, + { + "epoch": 0.32082854380609455, + "grad_norm": 0.116021066904068, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 84290 + }, + { + "epoch": 0.32086660627421726, + "grad_norm": 0.12288866192102432, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 84300 + }, + { + "epoch": 0.3209046687423399, + "grad_norm": 0.13716170191764832, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 84310 + }, + { + "epoch": 0.32094273121046263, + "grad_norm": 0.11293182522058487, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 84320 + }, + { + "epoch": 0.3209807936785853, + "grad_norm": 0.12716884911060333, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 84330 + }, + { + "epoch": 0.321018856146708, + "grad_norm": 0.12877345085144043, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 84340 + }, + { + "epoch": 0.32105691861483066, + "grad_norm": 0.12783068418502808, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 84350 + }, + { + "epoch": 0.32109498108295337, + "grad_norm": 0.11733737587928772, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 84360 + }, + { + "epoch": 0.321133043551076, + "grad_norm": 0.13269150257110596, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 84370 + }, + { + "epoch": 0.3211711060191987, + "grad_norm": 0.1295628845691681, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 84380 + }, + { + "epoch": 0.3212091684873214, + "grad_norm": 0.14869417250156403, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 84390 + }, + { + "epoch": 0.32124723095544405, + "grad_norm": 0.11842743307352066, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 84400 + }, + { + "epoch": 0.32128529342356676, + "grad_norm": 0.14121049642562866, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 84410 + }, + { + "epoch": 0.3213233558916894, + "grad_norm": 0.1316528618335724, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 84420 + }, + { + "epoch": 0.32136141835981213, + "grad_norm": 1.0470399856567383, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 84430 + }, + { + "epoch": 0.3213994808279348, + "grad_norm": 0.11911292374134064, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 84440 + }, + { + "epoch": 0.3214375432960575, + "grad_norm": 0.12905144691467285, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 84450 + }, + { + "epoch": 0.32147560576418016, + "grad_norm": 0.13625507056713104, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 84460 + }, + { + "epoch": 0.3215136682323029, + "grad_norm": 0.1250603199005127, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 84470 + }, + { + "epoch": 0.32155173070042553, + "grad_norm": 0.1387881487607956, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 84480 + }, + { + "epoch": 0.32158979316854824, + "grad_norm": 0.12051466107368469, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 84490 + }, + { + "epoch": 0.3216278556366709, + "grad_norm": 0.11220446228981018, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 84500 + }, + { + "epoch": 0.3216659181047936, + "grad_norm": 0.11351220309734344, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 84510 + }, + { + "epoch": 0.32170398057291627, + "grad_norm": 0.12000728398561478, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 84520 + }, + { + "epoch": 0.3217420430410389, + "grad_norm": 0.12402566522359848, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 84530 + }, + { + "epoch": 0.32178010550916164, + "grad_norm": 0.13333047926425934, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 84540 + }, + { + "epoch": 0.3218181679772843, + "grad_norm": 0.11884795874357224, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 84550 + }, + { + "epoch": 0.321856230445407, + "grad_norm": 0.12012158334255219, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 84560 + }, + { + "epoch": 0.32189429291352967, + "grad_norm": 0.12488909810781479, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 84570 + }, + { + "epoch": 0.3219323553816524, + "grad_norm": 0.11611166596412659, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 84580 + }, + { + "epoch": 0.32197041784977504, + "grad_norm": 0.11382875591516495, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 84590 + }, + { + "epoch": 0.32200848031789775, + "grad_norm": 0.12993958592414856, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 84600 + }, + { + "epoch": 0.3220465427860204, + "grad_norm": 0.12190189212560654, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 84610 + }, + { + "epoch": 0.3220846052541431, + "grad_norm": 0.12783260643482208, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 84620 + }, + { + "epoch": 0.3221226677222658, + "grad_norm": 0.12652941048145294, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 84630 + }, + { + "epoch": 0.3221607301903885, + "grad_norm": 0.12840814888477325, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 84640 + }, + { + "epoch": 0.32219879265851115, + "grad_norm": 0.13773638010025024, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 84650 + }, + { + "epoch": 0.32223685512663386, + "grad_norm": 0.11401660740375519, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 84660 + }, + { + "epoch": 0.3222749175947565, + "grad_norm": 0.12564989924430847, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 84670 + }, + { + "epoch": 0.3223129800628792, + "grad_norm": 0.13151569664478302, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 84680 + }, + { + "epoch": 0.3223510425310019, + "grad_norm": 0.13308259844779968, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 84690 + }, + { + "epoch": 0.32238910499912454, + "grad_norm": 0.13082093000411987, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 84700 + }, + { + "epoch": 0.32242716746724726, + "grad_norm": 0.12852692604064941, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 84710 + }, + { + "epoch": 0.3224652299353699, + "grad_norm": 0.11865947395563126, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 84720 + }, + { + "epoch": 0.3225032924034926, + "grad_norm": 0.1241876482963562, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 84730 + }, + { + "epoch": 0.3225413548716153, + "grad_norm": 0.12009057402610779, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 84740 + }, + { + "epoch": 0.322579417339738, + "grad_norm": 0.12893769145011902, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 84750 + }, + { + "epoch": 0.32261747980786065, + "grad_norm": 0.1296892762184143, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 84760 + }, + { + "epoch": 0.32265554227598336, + "grad_norm": 0.13293935358524323, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 84770 + }, + { + "epoch": 0.322693604744106, + "grad_norm": 0.1251651495695114, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 84780 + }, + { + "epoch": 0.32273166721222873, + "grad_norm": 0.12732888758182526, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 84790 + }, + { + "epoch": 0.3227697296803514, + "grad_norm": 0.11490562558174133, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 84800 + }, + { + "epoch": 0.32280779214847405, + "grad_norm": 0.12102054059505463, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 84810 + }, + { + "epoch": 0.32284585461659676, + "grad_norm": 0.12410101294517517, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 84820 + }, + { + "epoch": 0.3228839170847194, + "grad_norm": 0.12932229042053223, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 84830 + }, + { + "epoch": 0.32292197955284213, + "grad_norm": 0.10722965002059937, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 84840 + }, + { + "epoch": 0.3229600420209648, + "grad_norm": 0.13229092955589294, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 84850 + }, + { + "epoch": 0.3229981044890875, + "grad_norm": 0.11781799793243408, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 84860 + }, + { + "epoch": 0.32303616695721016, + "grad_norm": 0.11863507330417633, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 84870 + }, + { + "epoch": 0.32307422942533287, + "grad_norm": 0.12561804056167603, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 84880 + }, + { + "epoch": 0.3231122918934555, + "grad_norm": 0.12400005757808685, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 84890 + }, + { + "epoch": 0.32315035436157824, + "grad_norm": 0.11362763494253159, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 84900 + }, + { + "epoch": 0.3231884168297009, + "grad_norm": 0.11362208425998688, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 84910 + }, + { + "epoch": 0.3232264792978236, + "grad_norm": 0.12535296380519867, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 84920 + }, + { + "epoch": 0.32326454176594627, + "grad_norm": 0.12394122779369354, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 84930 + }, + { + "epoch": 0.323302604234069, + "grad_norm": 0.131861612200737, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 84940 + }, + { + "epoch": 0.32334066670219164, + "grad_norm": 0.11681771278381348, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 84950 + }, + { + "epoch": 0.3233787291703143, + "grad_norm": 0.12034600973129272, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 84960 + }, + { + "epoch": 0.323416791638437, + "grad_norm": 0.11895084381103516, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 84970 + }, + { + "epoch": 0.32345485410655966, + "grad_norm": 0.12538011372089386, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 84980 + }, + { + "epoch": 0.3234929165746824, + "grad_norm": 0.12758684158325195, + "learning_rate": 0.0005, + "loss": 2.1433, + "step": 84990 + }, + { + "epoch": 0.32353097904280503, + "grad_norm": 0.12982970476150513, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 85000 + }, + { + "epoch": 0.32356904151092775, + "grad_norm": 0.12789271771907806, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 85010 + }, + { + "epoch": 0.3236071039790504, + "grad_norm": 0.12262456119060516, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 85020 + }, + { + "epoch": 0.3236451664471731, + "grad_norm": 0.12088648229837418, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 85030 + }, + { + "epoch": 0.3236832289152958, + "grad_norm": 0.11756960302591324, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 85040 + }, + { + "epoch": 0.3237212913834185, + "grad_norm": 0.13255122303962708, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 85050 + }, + { + "epoch": 0.32375935385154114, + "grad_norm": 0.12687934935092926, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 85060 + }, + { + "epoch": 0.32379741631966386, + "grad_norm": 0.12454438954591751, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 85070 + }, + { + "epoch": 0.3238354787877865, + "grad_norm": 0.29794633388519287, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 85080 + }, + { + "epoch": 0.3238735412559092, + "grad_norm": 0.13008928298950195, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 85090 + }, + { + "epoch": 0.3239116037240319, + "grad_norm": 0.12024632096290588, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 85100 + }, + { + "epoch": 0.32394966619215454, + "grad_norm": 0.11600895971059799, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 85110 + }, + { + "epoch": 0.32398772866027725, + "grad_norm": 0.12285245209932327, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 85120 + }, + { + "epoch": 0.3240257911283999, + "grad_norm": 0.1324823647737503, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 85130 + }, + { + "epoch": 0.3240638535965226, + "grad_norm": 0.1279059499502182, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 85140 + }, + { + "epoch": 0.3241019160646453, + "grad_norm": 0.13273939490318298, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 85150 + }, + { + "epoch": 0.324139978532768, + "grad_norm": 0.1353902965784073, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 85160 + }, + { + "epoch": 0.32417804100089065, + "grad_norm": 0.13185188174247742, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 85170 + }, + { + "epoch": 0.32421610346901336, + "grad_norm": 0.13154304027557373, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 85180 + }, + { + "epoch": 0.324254165937136, + "grad_norm": 0.12412336468696594, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 85190 + }, + { + "epoch": 0.32429222840525873, + "grad_norm": 0.1248876303434372, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 85200 + }, + { + "epoch": 0.3243302908733814, + "grad_norm": 0.13276709616184235, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 85210 + }, + { + "epoch": 0.3243683533415041, + "grad_norm": 0.13410021364688873, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 85220 + }, + { + "epoch": 0.32440641580962676, + "grad_norm": 0.11405565589666367, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 85230 + }, + { + "epoch": 0.3244444782777494, + "grad_norm": 0.13638851046562195, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 85240 + }, + { + "epoch": 0.32448254074587213, + "grad_norm": 0.11063786596059799, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 85250 + }, + { + "epoch": 0.3245206032139948, + "grad_norm": 0.12583479285240173, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 85260 + }, + { + "epoch": 0.3245586656821175, + "grad_norm": 0.13034242391586304, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 85270 + }, + { + "epoch": 0.32459672815024015, + "grad_norm": 0.13332554697990417, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 85280 + }, + { + "epoch": 0.32463479061836287, + "grad_norm": 0.11839156597852707, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 85290 + }, + { + "epoch": 0.3246728530864855, + "grad_norm": 0.13876213133335114, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 85300 + }, + { + "epoch": 0.32471091555460824, + "grad_norm": 0.14040637016296387, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 85310 + }, + { + "epoch": 0.3247489780227309, + "grad_norm": 0.12406309694051743, + "learning_rate": 0.0005, + "loss": 2.1362, + "step": 85320 + }, + { + "epoch": 0.3247870404908536, + "grad_norm": 0.1268659383058548, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 85330 + }, + { + "epoch": 0.32482510295897626, + "grad_norm": 0.12533274292945862, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 85340 + }, + { + "epoch": 0.324863165427099, + "grad_norm": 0.11266963928937912, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 85350 + }, + { + "epoch": 0.32490122789522163, + "grad_norm": 0.13690349459648132, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 85360 + }, + { + "epoch": 0.32493929036334435, + "grad_norm": 0.11537948250770569, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 85370 + }, + { + "epoch": 0.324977352831467, + "grad_norm": 0.11801768839359283, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 85380 + }, + { + "epoch": 0.32501541529958966, + "grad_norm": 0.1329495757818222, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 85390 + }, + { + "epoch": 0.3250534777677124, + "grad_norm": 0.14431969821453094, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 85400 + }, + { + "epoch": 0.32509154023583503, + "grad_norm": 0.12717166543006897, + "learning_rate": 0.0005, + "loss": 2.1425, + "step": 85410 + }, + { + "epoch": 0.32512960270395774, + "grad_norm": 0.13233216106891632, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 85420 + }, + { + "epoch": 0.3251676651720804, + "grad_norm": 0.11458507180213928, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 85430 + }, + { + "epoch": 0.3252057276402031, + "grad_norm": 0.12378555536270142, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 85440 + }, + { + "epoch": 0.32524379010832577, + "grad_norm": 0.1393895000219345, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 85450 + }, + { + "epoch": 0.3252818525764485, + "grad_norm": 0.13885247707366943, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 85460 + }, + { + "epoch": 0.32531991504457114, + "grad_norm": 0.12107256054878235, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 85470 + }, + { + "epoch": 0.32535797751269385, + "grad_norm": 0.129132479429245, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 85480 + }, + { + "epoch": 0.3253960399808165, + "grad_norm": 0.12440992891788483, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 85490 + }, + { + "epoch": 0.3254341024489392, + "grad_norm": 0.12130912393331528, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 85500 + }, + { + "epoch": 0.3254721649170619, + "grad_norm": 0.13403859734535217, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 85510 + }, + { + "epoch": 0.3255102273851846, + "grad_norm": 0.12442876398563385, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 85520 + }, + { + "epoch": 0.32554828985330725, + "grad_norm": 0.11490896344184875, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 85530 + }, + { + "epoch": 0.3255863523214299, + "grad_norm": 0.1400223970413208, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 85540 + }, + { + "epoch": 0.3256244147895526, + "grad_norm": 0.12713545560836792, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 85550 + }, + { + "epoch": 0.3256624772576753, + "grad_norm": 0.13676592707633972, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 85560 + }, + { + "epoch": 0.325700539725798, + "grad_norm": 0.12908118963241577, + "learning_rate": 0.0005, + "loss": 2.1407, + "step": 85570 + }, + { + "epoch": 0.32573860219392065, + "grad_norm": 0.38396722078323364, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 85580 + }, + { + "epoch": 0.32577666466204336, + "grad_norm": 0.11782268434762955, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 85590 + }, + { + "epoch": 0.325814727130166, + "grad_norm": 0.15799593925476074, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 85600 + }, + { + "epoch": 0.32585278959828873, + "grad_norm": 0.11854618787765503, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 85610 + }, + { + "epoch": 0.3258908520664114, + "grad_norm": 0.12333806604146957, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 85620 + }, + { + "epoch": 0.3259289145345341, + "grad_norm": 0.11984525620937347, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 85630 + }, + { + "epoch": 0.32596697700265675, + "grad_norm": 0.13362917304039001, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 85640 + }, + { + "epoch": 0.32600503947077947, + "grad_norm": 0.12273038923740387, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 85650 + }, + { + "epoch": 0.3260431019389021, + "grad_norm": 0.12973017990589142, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 85660 + }, + { + "epoch": 0.3260811644070248, + "grad_norm": 0.12842817604541779, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 85670 + }, + { + "epoch": 0.3261192268751475, + "grad_norm": 0.12436743080615997, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 85680 + }, + { + "epoch": 0.32615728934327015, + "grad_norm": 0.12389617413282394, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 85690 + }, + { + "epoch": 0.32619535181139286, + "grad_norm": 0.139614075422287, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 85700 + }, + { + "epoch": 0.3262334142795155, + "grad_norm": 0.1295105516910553, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 85710 + }, + { + "epoch": 0.32627147674763823, + "grad_norm": 0.12636461853981018, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 85720 + }, + { + "epoch": 0.3263095392157609, + "grad_norm": 0.13094234466552734, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 85730 + }, + { + "epoch": 0.3263476016838836, + "grad_norm": 0.13378089666366577, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 85740 + }, + { + "epoch": 0.32638566415200626, + "grad_norm": 0.12451636791229248, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 85750 + }, + { + "epoch": 0.326423726620129, + "grad_norm": 0.12907682359218597, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 85760 + }, + { + "epoch": 0.32646178908825163, + "grad_norm": 0.13439998030662537, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 85770 + }, + { + "epoch": 0.32649985155637434, + "grad_norm": 0.12523990869522095, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 85780 + }, + { + "epoch": 0.326537914024497, + "grad_norm": 0.14395292103290558, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 85790 + }, + { + "epoch": 0.3265759764926197, + "grad_norm": 0.1267700046300888, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 85800 + }, + { + "epoch": 0.32661403896074237, + "grad_norm": 0.12681162357330322, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 85810 + }, + { + "epoch": 0.326652101428865, + "grad_norm": 0.11388924717903137, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 85820 + }, + { + "epoch": 0.32669016389698774, + "grad_norm": 0.1388767659664154, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 85830 + }, + { + "epoch": 0.3267282263651104, + "grad_norm": 0.11999952793121338, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 85840 + }, + { + "epoch": 0.3267662888332331, + "grad_norm": 0.12944869697093964, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 85850 + }, + { + "epoch": 0.32680435130135577, + "grad_norm": 0.12705813348293304, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 85860 + }, + { + "epoch": 0.3268424137694785, + "grad_norm": 0.1447199285030365, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 85870 + }, + { + "epoch": 0.32688047623760114, + "grad_norm": 0.14332392811775208, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 85880 + }, + { + "epoch": 0.32691853870572385, + "grad_norm": 0.1288509964942932, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 85890 + }, + { + "epoch": 0.3269566011738465, + "grad_norm": 0.12456371635198593, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 85900 + }, + { + "epoch": 0.3269946636419692, + "grad_norm": 0.13260520994663239, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 85910 + }, + { + "epoch": 0.3270327261100919, + "grad_norm": 0.12597645819187164, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 85920 + }, + { + "epoch": 0.3270707885782146, + "grad_norm": 0.13826020061969757, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 85930 + }, + { + "epoch": 0.32710885104633725, + "grad_norm": 0.11652007699012756, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 85940 + }, + { + "epoch": 0.32714691351445996, + "grad_norm": 0.1280617117881775, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 85950 + }, + { + "epoch": 0.3271849759825826, + "grad_norm": 0.1129440888762474, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 85960 + }, + { + "epoch": 0.3272230384507053, + "grad_norm": 0.1350637525320053, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 85970 + }, + { + "epoch": 0.327261100918828, + "grad_norm": 0.13293075561523438, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 85980 + }, + { + "epoch": 0.32729916338695064, + "grad_norm": 0.12983955442905426, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 85990 + }, + { + "epoch": 0.32733722585507335, + "grad_norm": 0.1335182785987854, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 86000 + }, + { + "epoch": 0.327375288323196, + "grad_norm": 0.12860321998596191, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 86010 + }, + { + "epoch": 0.3274133507913187, + "grad_norm": 0.12237963825464249, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 86020 + }, + { + "epoch": 0.3274514132594414, + "grad_norm": 0.14233390986919403, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 86030 + }, + { + "epoch": 0.3274894757275641, + "grad_norm": 0.11780981719493866, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 86040 + }, + { + "epoch": 0.32752753819568675, + "grad_norm": 0.1323825567960739, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 86050 + }, + { + "epoch": 0.32756560066380946, + "grad_norm": 0.1262124478816986, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 86060 + }, + { + "epoch": 0.3276036631319321, + "grad_norm": 0.1186787560582161, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 86070 + }, + { + "epoch": 0.32764172560005483, + "grad_norm": 0.11766122281551361, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 86080 + }, + { + "epoch": 0.3276797880681775, + "grad_norm": 0.12597422301769257, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 86090 + }, + { + "epoch": 0.32771785053630015, + "grad_norm": 0.1503901481628418, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 86100 + }, + { + "epoch": 0.32775591300442286, + "grad_norm": 0.11765991151332855, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 86110 + }, + { + "epoch": 0.3277939754725455, + "grad_norm": 0.11862947046756744, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 86120 + }, + { + "epoch": 0.32783203794066823, + "grad_norm": 0.11336734145879745, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 86130 + }, + { + "epoch": 0.3278701004087909, + "grad_norm": 0.12237170338630676, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 86140 + }, + { + "epoch": 0.3279081628769136, + "grad_norm": 0.11985383182764053, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 86150 + }, + { + "epoch": 0.32794622534503626, + "grad_norm": 0.12470897287130356, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 86160 + }, + { + "epoch": 0.32798428781315897, + "grad_norm": 0.11982845515012741, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 86170 + }, + { + "epoch": 0.3280223502812816, + "grad_norm": 0.13332238793373108, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 86180 + }, + { + "epoch": 0.32806041274940434, + "grad_norm": 0.14622122049331665, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 86190 + }, + { + "epoch": 0.328098475217527, + "grad_norm": 0.12992194294929504, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 86200 + }, + { + "epoch": 0.3281365376856497, + "grad_norm": 0.12645530700683594, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 86210 + }, + { + "epoch": 0.32817460015377237, + "grad_norm": 0.12694615125656128, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 86220 + }, + { + "epoch": 0.3282126626218951, + "grad_norm": 0.13553451001644135, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 86230 + }, + { + "epoch": 0.32825072509001774, + "grad_norm": 0.25713515281677246, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 86240 + }, + { + "epoch": 0.3282887875581404, + "grad_norm": 0.13292460143566132, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 86250 + }, + { + "epoch": 0.3283268500262631, + "grad_norm": 0.11702064424753189, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 86260 + }, + { + "epoch": 0.32836491249438576, + "grad_norm": 0.1134921982884407, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 86270 + }, + { + "epoch": 0.3284029749625085, + "grad_norm": 0.12800496816635132, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 86280 + }, + { + "epoch": 0.32844103743063113, + "grad_norm": 0.12007319927215576, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 86290 + }, + { + "epoch": 0.32847909989875385, + "grad_norm": 0.128435418009758, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 86300 + }, + { + "epoch": 0.3285171623668765, + "grad_norm": 0.12666693329811096, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 86310 + }, + { + "epoch": 0.3285552248349992, + "grad_norm": 0.12615017592906952, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 86320 + }, + { + "epoch": 0.3285932873031219, + "grad_norm": 0.1369655877351761, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 86330 + }, + { + "epoch": 0.3286313497712446, + "grad_norm": 0.13197752833366394, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 86340 + }, + { + "epoch": 0.32866941223936724, + "grad_norm": 0.13014714419841766, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 86350 + }, + { + "epoch": 0.32870747470748996, + "grad_norm": 0.12440134584903717, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 86360 + }, + { + "epoch": 0.3287455371756126, + "grad_norm": 0.12102056294679642, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 86370 + }, + { + "epoch": 0.3287835996437353, + "grad_norm": 0.1142989918589592, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 86380 + }, + { + "epoch": 0.328821662111858, + "grad_norm": 0.12405013293027878, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 86390 + }, + { + "epoch": 0.32885972457998064, + "grad_norm": 0.11448971182107925, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 86400 + }, + { + "epoch": 0.32889778704810335, + "grad_norm": 0.1186433881521225, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 86410 + }, + { + "epoch": 0.328935849516226, + "grad_norm": 0.12564805150032043, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 86420 + }, + { + "epoch": 0.3289739119843487, + "grad_norm": 0.12833847105503082, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 86430 + }, + { + "epoch": 0.3290119744524714, + "grad_norm": 0.13104651868343353, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 86440 + }, + { + "epoch": 0.3290500369205941, + "grad_norm": 0.1240994930267334, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 86450 + }, + { + "epoch": 0.32908809938871675, + "grad_norm": 0.12632007896900177, + "learning_rate": 0.0005, + "loss": 2.139, + "step": 86460 + }, + { + "epoch": 0.32912616185683946, + "grad_norm": 0.11452756077051163, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 86470 + }, + { + "epoch": 0.3291642243249621, + "grad_norm": 0.11686919629573822, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 86480 + }, + { + "epoch": 0.32920228679308483, + "grad_norm": 0.13337957859039307, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 86490 + }, + { + "epoch": 0.3292403492612075, + "grad_norm": 0.11861047893762589, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 86500 + }, + { + "epoch": 0.3292784117293302, + "grad_norm": 0.12581536173820496, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 86510 + }, + { + "epoch": 0.32931647419745286, + "grad_norm": 0.11455560475587845, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 86520 + }, + { + "epoch": 0.32935453666557557, + "grad_norm": 0.13580425083637238, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 86530 + }, + { + "epoch": 0.3293925991336982, + "grad_norm": 0.12677060067653656, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 86540 + }, + { + "epoch": 0.3294306616018209, + "grad_norm": 0.13795708119869232, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 86550 + }, + { + "epoch": 0.3294687240699436, + "grad_norm": 0.11908993870019913, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 86560 + }, + { + "epoch": 0.32950678653806625, + "grad_norm": 0.13094303011894226, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 86570 + }, + { + "epoch": 0.32954484900618897, + "grad_norm": 0.12016043812036514, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 86580 + }, + { + "epoch": 0.3295829114743116, + "grad_norm": 0.13373512029647827, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 86590 + }, + { + "epoch": 0.32962097394243434, + "grad_norm": 0.13215504586696625, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 86600 + }, + { + "epoch": 0.329659036410557, + "grad_norm": 0.12117471545934677, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 86610 + }, + { + "epoch": 0.3296970988786797, + "grad_norm": 0.14617256820201874, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 86620 + }, + { + "epoch": 0.32973516134680236, + "grad_norm": 0.12067513167858124, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 86630 + }, + { + "epoch": 0.3297732238149251, + "grad_norm": 0.12430501729249954, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 86640 + }, + { + "epoch": 0.32981128628304773, + "grad_norm": 0.12513650953769684, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 86650 + }, + { + "epoch": 0.32984934875117045, + "grad_norm": 0.1374633014202118, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 86660 + }, + { + "epoch": 0.3298874112192931, + "grad_norm": 0.13568641245365143, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 86670 + }, + { + "epoch": 0.32992547368741576, + "grad_norm": 0.14530439674854279, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 86680 + }, + { + "epoch": 0.3299635361555385, + "grad_norm": 0.13062728941440582, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 86690 + }, + { + "epoch": 0.33000159862366113, + "grad_norm": 0.11495129764080048, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 86700 + }, + { + "epoch": 0.33003966109178384, + "grad_norm": 0.12458905577659607, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 86710 + }, + { + "epoch": 0.3300777235599065, + "grad_norm": 0.12767943739891052, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 86720 + }, + { + "epoch": 0.3301157860280292, + "grad_norm": 0.15019801259040833, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 86730 + }, + { + "epoch": 0.33015384849615187, + "grad_norm": 0.13525643944740295, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 86740 + }, + { + "epoch": 0.3301919109642746, + "grad_norm": 0.12708579003810883, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 86750 + }, + { + "epoch": 0.33022997343239724, + "grad_norm": 0.12505242228507996, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 86760 + }, + { + "epoch": 0.33026803590051995, + "grad_norm": 0.12326527386903763, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 86770 + }, + { + "epoch": 0.3303060983686426, + "grad_norm": 0.12085497379302979, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 86780 + }, + { + "epoch": 0.3303441608367653, + "grad_norm": 0.11513642221689224, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 86790 + }, + { + "epoch": 0.330382223304888, + "grad_norm": 0.1289006471633911, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 86800 + }, + { + "epoch": 0.3304202857730107, + "grad_norm": 0.12522073090076447, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 86810 + }, + { + "epoch": 0.33045834824113335, + "grad_norm": 0.12671121954917908, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 86820 + }, + { + "epoch": 0.330496410709256, + "grad_norm": 0.11735258996486664, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 86830 + }, + { + "epoch": 0.3305344731773787, + "grad_norm": 0.1259993314743042, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 86840 + }, + { + "epoch": 0.3305725356455014, + "grad_norm": 0.10962878912687302, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 86850 + }, + { + "epoch": 0.3306105981136241, + "grad_norm": 0.12624777853488922, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 86860 + }, + { + "epoch": 0.33064866058174675, + "grad_norm": 0.11900725215673447, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 86870 + }, + { + "epoch": 0.33068672304986946, + "grad_norm": 0.13468994200229645, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 86880 + }, + { + "epoch": 0.3307247855179921, + "grad_norm": 0.1127929762005806, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 86890 + }, + { + "epoch": 0.3307628479861148, + "grad_norm": 0.11869233846664429, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 86900 + }, + { + "epoch": 0.3308009104542375, + "grad_norm": 0.10383521020412445, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 86910 + }, + { + "epoch": 0.3308389729223602, + "grad_norm": 0.1302202194929123, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 86920 + }, + { + "epoch": 0.33087703539048285, + "grad_norm": 0.12471529841423035, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 86930 + }, + { + "epoch": 0.33091509785860557, + "grad_norm": 0.11835591495037079, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 86940 + }, + { + "epoch": 0.3309531603267282, + "grad_norm": 0.1361277997493744, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 86950 + }, + { + "epoch": 0.33099122279485094, + "grad_norm": 0.16617321968078613, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 86960 + }, + { + "epoch": 0.3310292852629736, + "grad_norm": 0.11649385094642639, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 86970 + }, + { + "epoch": 0.33106734773109625, + "grad_norm": 0.1311468482017517, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 86980 + }, + { + "epoch": 0.33110541019921896, + "grad_norm": 0.13043631613254547, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 86990 + }, + { + "epoch": 0.3311434726673416, + "grad_norm": 0.1365789771080017, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 87000 + }, + { + "epoch": 0.33118153513546433, + "grad_norm": 0.14579400420188904, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 87010 + }, + { + "epoch": 0.331219597603587, + "grad_norm": 0.10959062725305557, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 87020 + }, + { + "epoch": 0.3312576600717097, + "grad_norm": 0.12297790497541428, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 87030 + }, + { + "epoch": 0.33129572253983236, + "grad_norm": 0.11930500715970993, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 87040 + }, + { + "epoch": 0.3313337850079551, + "grad_norm": 0.12146482616662979, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 87050 + }, + { + "epoch": 0.33137184747607773, + "grad_norm": 0.12271170318126678, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 87060 + }, + { + "epoch": 0.33140990994420044, + "grad_norm": 0.11878249794244766, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 87070 + }, + { + "epoch": 0.3314479724123231, + "grad_norm": 0.1282305121421814, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 87080 + }, + { + "epoch": 0.3314860348804458, + "grad_norm": 0.13456635177135468, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 87090 + }, + { + "epoch": 0.33152409734856847, + "grad_norm": 0.1312885731458664, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 87100 + }, + { + "epoch": 0.3315621598166911, + "grad_norm": 0.1143750250339508, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 87110 + }, + { + "epoch": 0.33160022228481384, + "grad_norm": 0.1455077975988388, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 87120 + }, + { + "epoch": 0.3316382847529365, + "grad_norm": 0.1323305070400238, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 87130 + }, + { + "epoch": 0.3316763472210592, + "grad_norm": 0.1347663253545761, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 87140 + }, + { + "epoch": 0.33171440968918187, + "grad_norm": 0.13488994538784027, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 87150 + }, + { + "epoch": 0.3317524721573046, + "grad_norm": 0.130838081240654, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 87160 + }, + { + "epoch": 0.33179053462542724, + "grad_norm": 0.11739790439605713, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 87170 + }, + { + "epoch": 0.33182859709354995, + "grad_norm": 0.13091625273227692, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 87180 + }, + { + "epoch": 0.3318666595616726, + "grad_norm": 0.12075185775756836, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 87190 + }, + { + "epoch": 0.3319047220297953, + "grad_norm": 0.12035005539655685, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 87200 + }, + { + "epoch": 0.331942784497918, + "grad_norm": 0.12317599356174469, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 87210 + }, + { + "epoch": 0.3319808469660407, + "grad_norm": 0.11673708260059357, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 87220 + }, + { + "epoch": 0.33201890943416335, + "grad_norm": 0.11935292929410934, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 87230 + }, + { + "epoch": 0.33205697190228606, + "grad_norm": 0.12443012744188309, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 87240 + }, + { + "epoch": 0.3320950343704087, + "grad_norm": 0.12056247889995575, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 87250 + }, + { + "epoch": 0.33213309683853137, + "grad_norm": 0.11743541806936264, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 87260 + }, + { + "epoch": 0.3321711593066541, + "grad_norm": 0.12407179921865463, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 87270 + }, + { + "epoch": 0.33220922177477674, + "grad_norm": 0.12037166208028793, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 87280 + }, + { + "epoch": 0.33224728424289945, + "grad_norm": 0.11869774013757706, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 87290 + }, + { + "epoch": 0.3322853467110221, + "grad_norm": 0.1175379678606987, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 87300 + }, + { + "epoch": 0.3323234091791448, + "grad_norm": 0.12499644607305527, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 87310 + }, + { + "epoch": 0.3323614716472675, + "grad_norm": 0.12239792197942734, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 87320 + }, + { + "epoch": 0.3323995341153902, + "grad_norm": 0.1392892301082611, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 87330 + }, + { + "epoch": 0.33243759658351285, + "grad_norm": 0.13349878787994385, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 87340 + }, + { + "epoch": 0.33247565905163556, + "grad_norm": 0.13460364937782288, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 87350 + }, + { + "epoch": 0.3325137215197582, + "grad_norm": 0.1504037231206894, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 87360 + }, + { + "epoch": 0.33255178398788093, + "grad_norm": 0.11479683220386505, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 87370 + }, + { + "epoch": 0.3325898464560036, + "grad_norm": 0.11970684677362442, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 87380 + }, + { + "epoch": 0.3326279089241263, + "grad_norm": 0.11894545704126358, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 87390 + }, + { + "epoch": 0.33266597139224896, + "grad_norm": 0.12392428517341614, + "learning_rate": 0.0005, + "loss": 2.1416, + "step": 87400 + }, + { + "epoch": 0.3327040338603716, + "grad_norm": 0.13159038126468658, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 87410 + }, + { + "epoch": 0.33274209632849433, + "grad_norm": 0.12846173346042633, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 87420 + }, + { + "epoch": 0.332780158796617, + "grad_norm": 0.12723685801029205, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 87430 + }, + { + "epoch": 0.3328182212647397, + "grad_norm": 0.1315564215183258, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 87440 + }, + { + "epoch": 0.33285628373286236, + "grad_norm": 0.12111755460500717, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 87450 + }, + { + "epoch": 0.33289434620098507, + "grad_norm": 0.1542283296585083, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 87460 + }, + { + "epoch": 0.3329324086691077, + "grad_norm": 0.12875567376613617, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 87470 + }, + { + "epoch": 0.33297047113723044, + "grad_norm": 0.1334831416606903, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 87480 + }, + { + "epoch": 0.3330085336053531, + "grad_norm": 0.12709291279315948, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 87490 + }, + { + "epoch": 0.3330465960734758, + "grad_norm": 0.12286421656608582, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 87500 + }, + { + "epoch": 0.33308465854159847, + "grad_norm": 0.1331913024187088, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 87510 + }, + { + "epoch": 0.3331227210097212, + "grad_norm": 0.1212693378329277, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 87520 + }, + { + "epoch": 0.33316078347784384, + "grad_norm": 0.11731184273958206, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 87530 + }, + { + "epoch": 0.3331988459459665, + "grad_norm": 0.12474881857633591, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 87540 + }, + { + "epoch": 0.3332369084140892, + "grad_norm": 0.11683713644742966, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 87550 + }, + { + "epoch": 0.33327497088221186, + "grad_norm": 0.12048343569040298, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 87560 + }, + { + "epoch": 0.3333130333503346, + "grad_norm": 0.12834185361862183, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 87570 + }, + { + "epoch": 0.33335109581845723, + "grad_norm": 0.11707880347967148, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 87580 + }, + { + "epoch": 0.33338915828657995, + "grad_norm": 0.13019756972789764, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 87590 + }, + { + "epoch": 0.3334272207547026, + "grad_norm": 0.12006811797618866, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 87600 + }, + { + "epoch": 0.3334652832228253, + "grad_norm": 0.11884687095880508, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 87610 + }, + { + "epoch": 0.33350334569094797, + "grad_norm": 0.13654226064682007, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 87620 + }, + { + "epoch": 0.3335414081590707, + "grad_norm": 0.12057172507047653, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 87630 + }, + { + "epoch": 0.33357947062719334, + "grad_norm": 0.12495489418506622, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 87640 + }, + { + "epoch": 0.33361753309531605, + "grad_norm": 0.11863405257463455, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 87650 + }, + { + "epoch": 0.3336555955634387, + "grad_norm": 0.12780635058879852, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 87660 + }, + { + "epoch": 0.3336936580315614, + "grad_norm": 0.13394132256507874, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 87670 + }, + { + "epoch": 0.3337317204996841, + "grad_norm": 0.13047321140766144, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 87680 + }, + { + "epoch": 0.33376978296780674, + "grad_norm": 0.1311761736869812, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 87690 + }, + { + "epoch": 0.33380784543592945, + "grad_norm": 0.11715999990701675, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 87700 + }, + { + "epoch": 0.3338459079040521, + "grad_norm": 0.11198131740093231, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 87710 + }, + { + "epoch": 0.3338839703721748, + "grad_norm": 0.12425164878368378, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 87720 + }, + { + "epoch": 0.3339220328402975, + "grad_norm": 0.12112827599048615, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 87730 + }, + { + "epoch": 0.3339600953084202, + "grad_norm": 0.11413715034723282, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 87740 + }, + { + "epoch": 0.33399815777654285, + "grad_norm": 0.13012996315956116, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 87750 + }, + { + "epoch": 0.33403622024466556, + "grad_norm": 0.12076626718044281, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 87760 + }, + { + "epoch": 0.3340742827127882, + "grad_norm": 0.12360948324203491, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 87770 + }, + { + "epoch": 0.33411234518091093, + "grad_norm": 0.11759869009256363, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 87780 + }, + { + "epoch": 0.3341504076490336, + "grad_norm": 0.1118142306804657, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 87790 + }, + { + "epoch": 0.3341884701171563, + "grad_norm": 0.13246989250183105, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 87800 + }, + { + "epoch": 0.33422653258527896, + "grad_norm": 0.11729934811592102, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 87810 + }, + { + "epoch": 0.33426459505340167, + "grad_norm": 0.14080481231212616, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 87820 + }, + { + "epoch": 0.3343026575215243, + "grad_norm": 0.11630220711231232, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 87830 + }, + { + "epoch": 0.334340719989647, + "grad_norm": 0.13849031925201416, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 87840 + }, + { + "epoch": 0.3343787824577697, + "grad_norm": 0.12446040660142899, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 87850 + }, + { + "epoch": 0.33441684492589235, + "grad_norm": 0.11606734991073608, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 87860 + }, + { + "epoch": 0.33445490739401507, + "grad_norm": 0.1261448860168457, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 87870 + }, + { + "epoch": 0.3344929698621377, + "grad_norm": 0.12158872187137604, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 87880 + }, + { + "epoch": 0.33453103233026044, + "grad_norm": 0.1390082687139511, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 87890 + }, + { + "epoch": 0.3345690947983831, + "grad_norm": 0.13956613838672638, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 87900 + }, + { + "epoch": 0.3346071572665058, + "grad_norm": 0.13154958188533783, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 87910 + }, + { + "epoch": 0.33464521973462846, + "grad_norm": 0.12404187023639679, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 87920 + }, + { + "epoch": 0.3346832822027512, + "grad_norm": 0.1383303999900818, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 87930 + }, + { + "epoch": 0.33472134467087383, + "grad_norm": 0.11572207510471344, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 87940 + }, + { + "epoch": 0.33475940713899655, + "grad_norm": 0.1205492839217186, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 87950 + }, + { + "epoch": 0.3347974696071192, + "grad_norm": 0.1155189797282219, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 87960 + }, + { + "epoch": 0.33483553207524186, + "grad_norm": 0.12543031573295593, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 87970 + }, + { + "epoch": 0.33487359454336457, + "grad_norm": 0.1279943287372589, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 87980 + }, + { + "epoch": 0.33491165701148723, + "grad_norm": 0.1298796683549881, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 87990 + }, + { + "epoch": 0.33494971947960994, + "grad_norm": 0.12139620631933212, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 88000 + }, + { + "epoch": 0.3349877819477326, + "grad_norm": 0.12247146666049957, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 88010 + }, + { + "epoch": 0.3350258444158553, + "grad_norm": 0.12139434367418289, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 88020 + }, + { + "epoch": 0.33506390688397797, + "grad_norm": 0.12310953438282013, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 88030 + }, + { + "epoch": 0.3351019693521007, + "grad_norm": 0.12263503670692444, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 88040 + }, + { + "epoch": 0.33514003182022334, + "grad_norm": 0.12221545726060867, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 88050 + }, + { + "epoch": 0.33517809428834605, + "grad_norm": 0.11760221421718597, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 88060 + }, + { + "epoch": 0.3352161567564687, + "grad_norm": 0.13482467830181122, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 88070 + }, + { + "epoch": 0.3352542192245914, + "grad_norm": 0.11433924734592438, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 88080 + }, + { + "epoch": 0.3352922816927141, + "grad_norm": 0.12347762286663055, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 88090 + }, + { + "epoch": 0.3353303441608368, + "grad_norm": 0.12823589146137238, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 88100 + }, + { + "epoch": 0.33536840662895945, + "grad_norm": 0.12408977746963501, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 88110 + }, + { + "epoch": 0.3354064690970821, + "grad_norm": 0.12411817908287048, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 88120 + }, + { + "epoch": 0.3354445315652048, + "grad_norm": 0.13858748972415924, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 88130 + }, + { + "epoch": 0.3354825940333275, + "grad_norm": 0.13134264945983887, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 88140 + }, + { + "epoch": 0.3355206565014502, + "grad_norm": 0.12682613730430603, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 88150 + }, + { + "epoch": 0.33555871896957284, + "grad_norm": 0.12588344514369965, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 88160 + }, + { + "epoch": 0.33559678143769556, + "grad_norm": 0.11390385776758194, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 88170 + }, + { + "epoch": 0.3356348439058182, + "grad_norm": 0.13311776518821716, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 88180 + }, + { + "epoch": 0.3356729063739409, + "grad_norm": 0.1160586029291153, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 88190 + }, + { + "epoch": 0.3357109688420636, + "grad_norm": 0.13641859591007233, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 88200 + }, + { + "epoch": 0.3357490313101863, + "grad_norm": 0.12794005870819092, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 88210 + }, + { + "epoch": 0.33578709377830895, + "grad_norm": 0.11959764361381531, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 88220 + }, + { + "epoch": 0.33582515624643167, + "grad_norm": 0.1298590749502182, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 88230 + }, + { + "epoch": 0.3358632187145543, + "grad_norm": 0.11784891039133072, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 88240 + }, + { + "epoch": 0.33590128118267704, + "grad_norm": 0.13400663435459137, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 88250 + }, + { + "epoch": 0.3359393436507997, + "grad_norm": 0.13214954733848572, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 88260 + }, + { + "epoch": 0.33597740611892235, + "grad_norm": 0.13531705737113953, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 88270 + }, + { + "epoch": 0.33601546858704506, + "grad_norm": 0.1262243390083313, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 88280 + }, + { + "epoch": 0.3360535310551677, + "grad_norm": 0.11508592963218689, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 88290 + }, + { + "epoch": 0.33609159352329043, + "grad_norm": 0.1250382363796234, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 88300 + }, + { + "epoch": 0.3361296559914131, + "grad_norm": 0.11667264252901077, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 88310 + }, + { + "epoch": 0.3361677184595358, + "grad_norm": 0.11802863329648972, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 88320 + }, + { + "epoch": 0.33620578092765846, + "grad_norm": 0.11546743661165237, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 88330 + }, + { + "epoch": 0.3362438433957812, + "grad_norm": 0.13666917383670807, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 88340 + }, + { + "epoch": 0.33628190586390383, + "grad_norm": 0.13605958223342896, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 88350 + }, + { + "epoch": 0.33631996833202654, + "grad_norm": 0.12794260680675507, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 88360 + }, + { + "epoch": 0.3363580308001492, + "grad_norm": 0.11411519348621368, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 88370 + }, + { + "epoch": 0.3363960932682719, + "grad_norm": 0.11841807514429092, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 88380 + }, + { + "epoch": 0.33643415573639457, + "grad_norm": 0.10895001888275146, + "learning_rate": 0.0005, + "loss": 2.1479, + "step": 88390 + }, + { + "epoch": 0.3364722182045172, + "grad_norm": 0.12098965793848038, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 88400 + }, + { + "epoch": 0.33651028067263994, + "grad_norm": 0.13581803441047668, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 88410 + }, + { + "epoch": 0.3365483431407626, + "grad_norm": 0.12655942142009735, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 88420 + }, + { + "epoch": 0.3365864056088853, + "grad_norm": 0.12174165993928909, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 88430 + }, + { + "epoch": 0.33662446807700797, + "grad_norm": 0.14150436222553253, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 88440 + }, + { + "epoch": 0.3366625305451307, + "grad_norm": 0.12937967479228973, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 88450 + }, + { + "epoch": 0.33670059301325334, + "grad_norm": 0.12161042541265488, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 88460 + }, + { + "epoch": 0.33673865548137605, + "grad_norm": 0.1310884952545166, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 88470 + }, + { + "epoch": 0.3367767179494987, + "grad_norm": 0.1272687315940857, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 88480 + }, + { + "epoch": 0.3368147804176214, + "grad_norm": 0.12220699340105057, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 88490 + }, + { + "epoch": 0.3368528428857441, + "grad_norm": 0.1358078271150589, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 88500 + }, + { + "epoch": 0.3368909053538668, + "grad_norm": 0.11216282099485397, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 88510 + }, + { + "epoch": 0.33692896782198944, + "grad_norm": 0.12000705301761627, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 88520 + }, + { + "epoch": 0.33696703029011216, + "grad_norm": 0.13831983506679535, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 88530 + }, + { + "epoch": 0.3370050927582348, + "grad_norm": 0.11740361899137497, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 88540 + }, + { + "epoch": 0.33704315522635747, + "grad_norm": 0.11932548135519028, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 88550 + }, + { + "epoch": 0.3370812176944802, + "grad_norm": 0.11470109224319458, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 88560 + }, + { + "epoch": 0.33711928016260284, + "grad_norm": 0.1229340210556984, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 88570 + }, + { + "epoch": 0.33715734263072555, + "grad_norm": 0.12469479441642761, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 88580 + }, + { + "epoch": 0.3371954050988482, + "grad_norm": 0.12042783200740814, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 88590 + }, + { + "epoch": 0.3372334675669709, + "grad_norm": 0.12501518428325653, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 88600 + }, + { + "epoch": 0.3372715300350936, + "grad_norm": 0.12341202050447464, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 88610 + }, + { + "epoch": 0.3373095925032163, + "grad_norm": 0.1101434975862503, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 88620 + }, + { + "epoch": 0.33734765497133895, + "grad_norm": 0.13599303364753723, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 88630 + }, + { + "epoch": 0.33738571743946166, + "grad_norm": 0.1298111379146576, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 88640 + }, + { + "epoch": 0.3374237799075843, + "grad_norm": 0.1254395842552185, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 88650 + }, + { + "epoch": 0.33746184237570703, + "grad_norm": 0.13073401153087616, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 88660 + }, + { + "epoch": 0.3374999048438297, + "grad_norm": 0.12592065334320068, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 88670 + }, + { + "epoch": 0.3375379673119524, + "grad_norm": 0.13244742155075073, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 88680 + }, + { + "epoch": 0.33757602978007506, + "grad_norm": 0.12912435829639435, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 88690 + }, + { + "epoch": 0.3376140922481977, + "grad_norm": 0.13592544198036194, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 88700 + }, + { + "epoch": 0.33765215471632043, + "grad_norm": 0.1403253972530365, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 88710 + }, + { + "epoch": 0.3376902171844431, + "grad_norm": 0.14164945483207703, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 88720 + }, + { + "epoch": 0.3377282796525658, + "grad_norm": 0.12777163088321686, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 88730 + }, + { + "epoch": 0.33776634212068846, + "grad_norm": 0.11780541390180588, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 88740 + }, + { + "epoch": 0.33780440458881117, + "grad_norm": 0.1289723813533783, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 88750 + }, + { + "epoch": 0.3378424670569338, + "grad_norm": 0.12288056313991547, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 88760 + }, + { + "epoch": 0.33788052952505654, + "grad_norm": 0.11306439340114594, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 88770 + }, + { + "epoch": 0.3379185919931792, + "grad_norm": 0.12616066634655, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 88780 + }, + { + "epoch": 0.3379566544613019, + "grad_norm": 0.12859830260276794, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 88790 + }, + { + "epoch": 0.33799471692942457, + "grad_norm": 0.11339247971773148, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 88800 + }, + { + "epoch": 0.3380327793975473, + "grad_norm": 0.1621411293745041, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 88810 + }, + { + "epoch": 0.33807084186566994, + "grad_norm": 0.11376577615737915, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 88820 + }, + { + "epoch": 0.33810890433379265, + "grad_norm": 0.14351233839988708, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 88830 + }, + { + "epoch": 0.3381469668019153, + "grad_norm": 0.12574923038482666, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 88840 + }, + { + "epoch": 0.33818502927003796, + "grad_norm": 0.1207105964422226, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 88850 + }, + { + "epoch": 0.3382230917381607, + "grad_norm": 0.128650963306427, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 88860 + }, + { + "epoch": 0.33826115420628333, + "grad_norm": 0.13034354150295258, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 88870 + }, + { + "epoch": 0.33829921667440604, + "grad_norm": 0.1216464415192604, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 88880 + }, + { + "epoch": 0.3383372791425287, + "grad_norm": 0.12070560455322266, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 88890 + }, + { + "epoch": 0.3383753416106514, + "grad_norm": 0.11654239892959595, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 88900 + }, + { + "epoch": 0.33841340407877407, + "grad_norm": 0.13917605578899384, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 88910 + }, + { + "epoch": 0.3384514665468968, + "grad_norm": 0.10812585055828094, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 88920 + }, + { + "epoch": 0.33848952901501944, + "grad_norm": 0.11935912817716599, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 88930 + }, + { + "epoch": 0.33852759148314215, + "grad_norm": 0.1279357522726059, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 88940 + }, + { + "epoch": 0.3385656539512648, + "grad_norm": 0.1319705843925476, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 88950 + }, + { + "epoch": 0.3386037164193875, + "grad_norm": 0.129435196518898, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 88960 + }, + { + "epoch": 0.3386417788875102, + "grad_norm": 0.13411614298820496, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 88970 + }, + { + "epoch": 0.33867984135563284, + "grad_norm": 0.12597118318080902, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 88980 + }, + { + "epoch": 0.33871790382375555, + "grad_norm": 0.1208195760846138, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 88990 + }, + { + "epoch": 0.3387559662918782, + "grad_norm": 0.12813125550746918, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 89000 + }, + { + "epoch": 0.3387940287600009, + "grad_norm": 0.11439248919487, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 89010 + }, + { + "epoch": 0.3388320912281236, + "grad_norm": 0.12252800166606903, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 89020 + }, + { + "epoch": 0.3388701536962463, + "grad_norm": 0.12275537848472595, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 89030 + }, + { + "epoch": 0.33890821616436895, + "grad_norm": 0.12595809996128082, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 89040 + }, + { + "epoch": 0.33894627863249166, + "grad_norm": 0.12871016561985016, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 89050 + }, + { + "epoch": 0.3389843411006143, + "grad_norm": 0.16490459442138672, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 89060 + }, + { + "epoch": 0.33902240356873703, + "grad_norm": 0.16081076860427856, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 89070 + }, + { + "epoch": 0.3390604660368597, + "grad_norm": 0.12105927616357803, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 89080 + }, + { + "epoch": 0.3390985285049824, + "grad_norm": 0.1160271167755127, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 89090 + }, + { + "epoch": 0.33913659097310506, + "grad_norm": 0.11818355321884155, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 89100 + }, + { + "epoch": 0.33917465344122777, + "grad_norm": 0.12066550552845001, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 89110 + }, + { + "epoch": 0.3392127159093504, + "grad_norm": 0.11974430084228516, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 89120 + }, + { + "epoch": 0.3392507783774731, + "grad_norm": 0.1406737118959427, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 89130 + }, + { + "epoch": 0.3392888408455958, + "grad_norm": 0.13470028340816498, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 89140 + }, + { + "epoch": 0.33932690331371845, + "grad_norm": 0.13147686421871185, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 89150 + }, + { + "epoch": 0.33936496578184117, + "grad_norm": 0.11957060545682907, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 89160 + }, + { + "epoch": 0.3394030282499638, + "grad_norm": 0.12157806009054184, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 89170 + }, + { + "epoch": 0.33944109071808654, + "grad_norm": 0.1386110782623291, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 89180 + }, + { + "epoch": 0.3394791531862092, + "grad_norm": 0.11687417328357697, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 89190 + }, + { + "epoch": 0.3395172156543319, + "grad_norm": 0.1264970302581787, + "learning_rate": 0.0005, + "loss": 2.1444, + "step": 89200 + }, + { + "epoch": 0.33955527812245456, + "grad_norm": 0.13229866325855255, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 89210 + }, + { + "epoch": 0.3395933405905773, + "grad_norm": 0.14732640981674194, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 89220 + }, + { + "epoch": 0.33963140305869993, + "grad_norm": 0.12414790689945221, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 89230 + }, + { + "epoch": 0.33966946552682264, + "grad_norm": 0.13430891931056976, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 89240 + }, + { + "epoch": 0.3397075279949453, + "grad_norm": 0.11227301508188248, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 89250 + }, + { + "epoch": 0.339745590463068, + "grad_norm": 0.14740775525569916, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 89260 + }, + { + "epoch": 0.33978365293119067, + "grad_norm": 0.1203657016158104, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 89270 + }, + { + "epoch": 0.33982171539931333, + "grad_norm": 0.12663409113883972, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 89280 + }, + { + "epoch": 0.33985977786743604, + "grad_norm": 0.12828192114830017, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 89290 + }, + { + "epoch": 0.3398978403355587, + "grad_norm": 0.1220042034983635, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 89300 + }, + { + "epoch": 0.3399359028036814, + "grad_norm": 0.13182102143764496, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 89310 + }, + { + "epoch": 0.33997396527180407, + "grad_norm": 0.13868823647499084, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 89320 + }, + { + "epoch": 0.3400120277399268, + "grad_norm": 0.11847683787345886, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 89330 + }, + { + "epoch": 0.34005009020804944, + "grad_norm": 0.11458242684602737, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 89340 + }, + { + "epoch": 0.34008815267617215, + "grad_norm": 0.12344954907894135, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 89350 + }, + { + "epoch": 0.3401262151442948, + "grad_norm": 0.12169872969388962, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 89360 + }, + { + "epoch": 0.3401642776124175, + "grad_norm": 0.1187531128525734, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 89370 + }, + { + "epoch": 0.3402023400805402, + "grad_norm": 0.1197567880153656, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 89380 + }, + { + "epoch": 0.3402404025486629, + "grad_norm": 0.11214889585971832, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 89390 + }, + { + "epoch": 0.34027846501678555, + "grad_norm": 0.12737496197223663, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 89400 + }, + { + "epoch": 0.3403165274849082, + "grad_norm": 0.12622858583927155, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 89410 + }, + { + "epoch": 0.3403545899530309, + "grad_norm": 0.12642420828342438, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 89420 + }, + { + "epoch": 0.3403926524211536, + "grad_norm": 0.1310669183731079, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 89430 + }, + { + "epoch": 0.3404307148892763, + "grad_norm": 0.22463572025299072, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 89440 + }, + { + "epoch": 0.34046877735739894, + "grad_norm": 0.12545807659626007, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 89450 + }, + { + "epoch": 0.34050683982552166, + "grad_norm": 0.13150864839553833, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 89460 + }, + { + "epoch": 0.3405449022936443, + "grad_norm": 0.12236011028289795, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 89470 + }, + { + "epoch": 0.340582964761767, + "grad_norm": 0.1419258564710617, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 89480 + }, + { + "epoch": 0.3406210272298897, + "grad_norm": 0.13559618592262268, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 89490 + }, + { + "epoch": 0.3406590896980124, + "grad_norm": 0.13465294241905212, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 89500 + }, + { + "epoch": 0.34069715216613505, + "grad_norm": 0.11645340174436569, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 89510 + }, + { + "epoch": 0.34073521463425777, + "grad_norm": 0.1335042268037796, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 89520 + }, + { + "epoch": 0.3407732771023804, + "grad_norm": 0.11522427201271057, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 89530 + }, + { + "epoch": 0.34081133957050314, + "grad_norm": 0.11598663032054901, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 89540 + }, + { + "epoch": 0.3408494020386258, + "grad_norm": 0.1153661459684372, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 89550 + }, + { + "epoch": 0.34088746450674845, + "grad_norm": 0.13112501800060272, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 89560 + }, + { + "epoch": 0.34092552697487116, + "grad_norm": 0.15203246474266052, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 89570 + }, + { + "epoch": 0.3409635894429938, + "grad_norm": 0.11616481095552444, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 89580 + }, + { + "epoch": 0.34100165191111653, + "grad_norm": 0.11862931400537491, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 89590 + }, + { + "epoch": 0.3410397143792392, + "grad_norm": 0.1210283637046814, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 89600 + }, + { + "epoch": 0.3410777768473619, + "grad_norm": 0.13654400408267975, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 89610 + }, + { + "epoch": 0.34111583931548456, + "grad_norm": 0.1305961310863495, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 89620 + }, + { + "epoch": 0.34115390178360727, + "grad_norm": 0.12389110773801804, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 89630 + }, + { + "epoch": 0.34119196425172993, + "grad_norm": 0.11891574412584305, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 89640 + }, + { + "epoch": 0.34123002671985264, + "grad_norm": 0.1385021209716797, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 89650 + }, + { + "epoch": 0.3412680891879753, + "grad_norm": 0.12695041298866272, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 89660 + }, + { + "epoch": 0.341306151656098, + "grad_norm": 0.1301238238811493, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 89670 + }, + { + "epoch": 0.34134421412422067, + "grad_norm": 0.1111309602856636, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 89680 + }, + { + "epoch": 0.3413822765923434, + "grad_norm": 0.11588682234287262, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 89690 + }, + { + "epoch": 0.34142033906046604, + "grad_norm": 0.12726850807666779, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 89700 + }, + { + "epoch": 0.3414584015285887, + "grad_norm": 0.13112305104732513, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 89710 + }, + { + "epoch": 0.3414964639967114, + "grad_norm": 0.12595675885677338, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 89720 + }, + { + "epoch": 0.34153452646483407, + "grad_norm": 0.1312045007944107, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 89730 + }, + { + "epoch": 0.3415725889329568, + "grad_norm": 0.1294623613357544, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 89740 + }, + { + "epoch": 0.34161065140107943, + "grad_norm": 0.11580775678157806, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 89750 + }, + { + "epoch": 0.34164871386920215, + "grad_norm": 0.11577937006950378, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 89760 + }, + { + "epoch": 0.3416867763373248, + "grad_norm": 0.13450708985328674, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 89770 + }, + { + "epoch": 0.3417248388054475, + "grad_norm": 0.12490272521972656, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 89780 + }, + { + "epoch": 0.3417629012735702, + "grad_norm": 0.14420974254608154, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 89790 + }, + { + "epoch": 0.3418009637416929, + "grad_norm": 0.12905338406562805, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 89800 + }, + { + "epoch": 0.34183902620981554, + "grad_norm": 0.12013151496648788, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 89810 + }, + { + "epoch": 0.34187708867793826, + "grad_norm": 0.12324290722608566, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 89820 + }, + { + "epoch": 0.3419151511460609, + "grad_norm": 0.12030022591352463, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 89830 + }, + { + "epoch": 0.34195321361418357, + "grad_norm": 0.12494122236967087, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 89840 + }, + { + "epoch": 0.3419912760823063, + "grad_norm": 0.11866031587123871, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 89850 + }, + { + "epoch": 0.34202933855042894, + "grad_norm": 0.12402195483446121, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 89860 + }, + { + "epoch": 0.34206740101855165, + "grad_norm": 0.12285647541284561, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 89870 + }, + { + "epoch": 0.3421054634866743, + "grad_norm": 0.1358739733695984, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 89880 + }, + { + "epoch": 0.342143525954797, + "grad_norm": 0.12848158180713654, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 89890 + }, + { + "epoch": 0.3421815884229197, + "grad_norm": 0.1267717331647873, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 89900 + }, + { + "epoch": 0.3422196508910424, + "grad_norm": 0.13753780722618103, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 89910 + }, + { + "epoch": 0.34225771335916505, + "grad_norm": 0.12299558520317078, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 89920 + }, + { + "epoch": 0.34229577582728776, + "grad_norm": 0.12542882561683655, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 89930 + }, + { + "epoch": 0.3423338382954104, + "grad_norm": 0.14531518518924713, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 89940 + }, + { + "epoch": 0.34237190076353313, + "grad_norm": 0.1264420747756958, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 89950 + }, + { + "epoch": 0.3424099632316558, + "grad_norm": 0.12023679912090302, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 89960 + }, + { + "epoch": 0.3424480256997785, + "grad_norm": 0.11134123802185059, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 89970 + }, + { + "epoch": 0.34248608816790116, + "grad_norm": 0.12393639236688614, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 89980 + }, + { + "epoch": 0.3425241506360238, + "grad_norm": 0.12131131440401077, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 89990 + }, + { + "epoch": 0.34256221310414653, + "grad_norm": 0.12403381615877151, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 90000 + }, + { + "epoch": 0.3426002755722692, + "grad_norm": 0.11932516098022461, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 90010 + }, + { + "epoch": 0.3426383380403919, + "grad_norm": 0.12505844235420227, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 90020 + }, + { + "epoch": 0.34267640050851456, + "grad_norm": 0.13085445761680603, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 90030 + }, + { + "epoch": 0.34271446297663727, + "grad_norm": 0.12016001343727112, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 90040 + }, + { + "epoch": 0.3427525254447599, + "grad_norm": 0.13243511319160461, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 90050 + }, + { + "epoch": 0.34279058791288264, + "grad_norm": 0.12323072552680969, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 90060 + }, + { + "epoch": 0.3428286503810053, + "grad_norm": 0.12400073558092117, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 90070 + }, + { + "epoch": 0.342866712849128, + "grad_norm": 0.12671291828155518, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 90080 + }, + { + "epoch": 0.34290477531725067, + "grad_norm": 0.1255476474761963, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 90090 + }, + { + "epoch": 0.3429428377853734, + "grad_norm": 0.1331200748682022, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 90100 + }, + { + "epoch": 0.34298090025349603, + "grad_norm": 0.12602712213993073, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 90110 + }, + { + "epoch": 0.34301896272161875, + "grad_norm": 0.12500569224357605, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 90120 + }, + { + "epoch": 0.3430570251897414, + "grad_norm": 0.12748053669929504, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 90130 + }, + { + "epoch": 0.34309508765786406, + "grad_norm": 0.12714222073554993, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 90140 + }, + { + "epoch": 0.3431331501259868, + "grad_norm": 0.1465597301721573, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 90150 + }, + { + "epoch": 0.34317121259410943, + "grad_norm": 0.13284499943256378, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 90160 + }, + { + "epoch": 0.34320927506223214, + "grad_norm": 0.12137099355459213, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 90170 + }, + { + "epoch": 0.3432473375303548, + "grad_norm": 0.12611745297908783, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 90180 + }, + { + "epoch": 0.3432853999984775, + "grad_norm": 0.12008702009916306, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 90190 + }, + { + "epoch": 0.34332346246660017, + "grad_norm": 0.12618735432624817, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 90200 + }, + { + "epoch": 0.3433615249347229, + "grad_norm": 0.13112717866897583, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 90210 + }, + { + "epoch": 0.34339958740284554, + "grad_norm": 0.12244976311922073, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 90220 + }, + { + "epoch": 0.34343764987096825, + "grad_norm": 0.12034299969673157, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 90230 + }, + { + "epoch": 0.3434757123390909, + "grad_norm": 0.1241462454199791, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 90240 + }, + { + "epoch": 0.3435137748072136, + "grad_norm": 0.11790863424539566, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 90250 + }, + { + "epoch": 0.3435518372753363, + "grad_norm": 0.13333600759506226, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 90260 + }, + { + "epoch": 0.34358989974345894, + "grad_norm": 0.1280602663755417, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 90270 + }, + { + "epoch": 0.34362796221158165, + "grad_norm": 0.13984990119934082, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 90280 + }, + { + "epoch": 0.3436660246797043, + "grad_norm": 0.1236758604645729, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 90290 + }, + { + "epoch": 0.343704087147827, + "grad_norm": 0.11663752049207687, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 90300 + }, + { + "epoch": 0.3437421496159497, + "grad_norm": 0.10988294333219528, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 90310 + }, + { + "epoch": 0.3437802120840724, + "grad_norm": 0.11985557526350021, + "learning_rate": 0.0005, + "loss": 2.1404, + "step": 90320 + }, + { + "epoch": 0.34381827455219505, + "grad_norm": 0.13246393203735352, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 90330 + }, + { + "epoch": 0.34385633702031776, + "grad_norm": 0.11569251865148544, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 90340 + }, + { + "epoch": 0.3438943994884404, + "grad_norm": 0.12747006118297577, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 90350 + }, + { + "epoch": 0.34393246195656313, + "grad_norm": 0.12384822964668274, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 90360 + }, + { + "epoch": 0.3439705244246858, + "grad_norm": 0.1292153000831604, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 90370 + }, + { + "epoch": 0.3440085868928085, + "grad_norm": 0.1250191479921341, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 90380 + }, + { + "epoch": 0.34404664936093116, + "grad_norm": 0.12791825830936432, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 90390 + }, + { + "epoch": 0.34408471182905387, + "grad_norm": 0.1329050213098526, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 90400 + }, + { + "epoch": 0.3441227742971765, + "grad_norm": 0.12665830552577972, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 90410 + }, + { + "epoch": 0.3441608367652992, + "grad_norm": 0.12122794985771179, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 90420 + }, + { + "epoch": 0.3441988992334219, + "grad_norm": 0.12537881731987, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 90430 + }, + { + "epoch": 0.34423696170154455, + "grad_norm": 0.11910541355609894, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 90440 + }, + { + "epoch": 0.34427502416966727, + "grad_norm": 0.1352231651544571, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 90450 + }, + { + "epoch": 0.3443130866377899, + "grad_norm": 0.1324281394481659, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 90460 + }, + { + "epoch": 0.34435114910591264, + "grad_norm": 0.11365267634391785, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 90470 + }, + { + "epoch": 0.3443892115740353, + "grad_norm": 0.12739905714988708, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 90480 + }, + { + "epoch": 0.344427274042158, + "grad_norm": 0.11905629932880402, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 90490 + }, + { + "epoch": 0.34446533651028066, + "grad_norm": 0.11469315737485886, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 90500 + }, + { + "epoch": 0.3445033989784034, + "grad_norm": 0.11756088584661484, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 90510 + }, + { + "epoch": 0.34454146144652603, + "grad_norm": 0.12300854176282883, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 90520 + }, + { + "epoch": 0.34457952391464874, + "grad_norm": 0.1245260089635849, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 90530 + }, + { + "epoch": 0.3446175863827714, + "grad_norm": 0.12596364319324493, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 90540 + }, + { + "epoch": 0.3446556488508941, + "grad_norm": 0.12460871785879135, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 90550 + }, + { + "epoch": 0.34469371131901677, + "grad_norm": 0.1315487176179886, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 90560 + }, + { + "epoch": 0.34473177378713943, + "grad_norm": 0.12347906827926636, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 90570 + }, + { + "epoch": 0.34476983625526214, + "grad_norm": 0.11345997452735901, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 90580 + }, + { + "epoch": 0.3448078987233848, + "grad_norm": 0.13715878129005432, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 90590 + }, + { + "epoch": 0.3448459611915075, + "grad_norm": 0.13639040291309357, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 90600 + }, + { + "epoch": 0.34488402365963017, + "grad_norm": 0.1313633918762207, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 90610 + }, + { + "epoch": 0.3449220861277529, + "grad_norm": 0.13886944949626923, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 90620 + }, + { + "epoch": 0.34496014859587554, + "grad_norm": 0.12198083847761154, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 90630 + }, + { + "epoch": 0.34499821106399825, + "grad_norm": 0.1195736899971962, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 90640 + }, + { + "epoch": 0.3450362735321209, + "grad_norm": 0.12475568056106567, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 90650 + }, + { + "epoch": 0.3450743360002436, + "grad_norm": 0.12828674912452698, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 90660 + }, + { + "epoch": 0.3451123984683663, + "grad_norm": 0.12407656759023666, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 90670 + }, + { + "epoch": 0.345150460936489, + "grad_norm": 0.13938532769680023, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 90680 + }, + { + "epoch": 0.34518852340461165, + "grad_norm": 0.11502964049577713, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 90690 + }, + { + "epoch": 0.3452265858727343, + "grad_norm": 0.14421029388904572, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 90700 + }, + { + "epoch": 0.345264648340857, + "grad_norm": 0.12795130908489227, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 90710 + }, + { + "epoch": 0.3453027108089797, + "grad_norm": 0.11910250782966614, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 90720 + }, + { + "epoch": 0.3453407732771024, + "grad_norm": 0.11697063595056534, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 90730 + }, + { + "epoch": 0.34537883574522504, + "grad_norm": 0.12007717788219452, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 90740 + }, + { + "epoch": 0.34541689821334776, + "grad_norm": 0.13864076137542725, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 90750 + }, + { + "epoch": 0.3454549606814704, + "grad_norm": 0.13291937112808228, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 90760 + }, + { + "epoch": 0.3454930231495931, + "grad_norm": 0.12697722017765045, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 90770 + }, + { + "epoch": 0.3455310856177158, + "grad_norm": 0.11438869684934616, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 90780 + }, + { + "epoch": 0.3455691480858385, + "grad_norm": 0.14781485497951508, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 90790 + }, + { + "epoch": 0.34560721055396115, + "grad_norm": 0.13403859734535217, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 90800 + }, + { + "epoch": 0.34564527302208387, + "grad_norm": 0.11257542669773102, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 90810 + }, + { + "epoch": 0.3456833354902065, + "grad_norm": 0.1264079213142395, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 90820 + }, + { + "epoch": 0.34572139795832924, + "grad_norm": 0.11463584005832672, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 90830 + }, + { + "epoch": 0.3457594604264519, + "grad_norm": 0.1329016238451004, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 90840 + }, + { + "epoch": 0.34579752289457455, + "grad_norm": 0.13102352619171143, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 90850 + }, + { + "epoch": 0.34583558536269726, + "grad_norm": 0.12339334189891815, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 90860 + }, + { + "epoch": 0.3458736478308199, + "grad_norm": 0.12185164541006088, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 90870 + }, + { + "epoch": 0.34591171029894263, + "grad_norm": 0.13989120721817017, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 90880 + }, + { + "epoch": 0.3459497727670653, + "grad_norm": 0.11912752687931061, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 90890 + }, + { + "epoch": 0.345987835235188, + "grad_norm": 0.12951073050498962, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 90900 + }, + { + "epoch": 0.34602589770331066, + "grad_norm": 0.11748843640089035, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 90910 + }, + { + "epoch": 0.34606396017143337, + "grad_norm": 0.12727637588977814, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 90920 + }, + { + "epoch": 0.34610202263955603, + "grad_norm": 0.12068556994199753, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 90930 + }, + { + "epoch": 0.34614008510767874, + "grad_norm": 0.11873731017112732, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 90940 + }, + { + "epoch": 0.3461781475758014, + "grad_norm": 0.12427987158298492, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 90950 + }, + { + "epoch": 0.3462162100439241, + "grad_norm": 0.13062496483325958, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 90960 + }, + { + "epoch": 0.34625427251204677, + "grad_norm": 0.15881308913230896, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 90970 + }, + { + "epoch": 0.3462923349801695, + "grad_norm": 0.12678012251853943, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 90980 + }, + { + "epoch": 0.34633039744829214, + "grad_norm": 0.12431074678897858, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 90990 + }, + { + "epoch": 0.3463684599164148, + "grad_norm": 0.1260952353477478, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 91000 + }, + { + "epoch": 0.3464065223845375, + "grad_norm": 0.11722179502248764, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 91010 + }, + { + "epoch": 0.34644458485266016, + "grad_norm": 0.12878122925758362, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 91020 + }, + { + "epoch": 0.3464826473207829, + "grad_norm": 0.13498681783676147, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 91030 + }, + { + "epoch": 0.34652070978890553, + "grad_norm": 0.12687091529369354, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 91040 + }, + { + "epoch": 0.34655877225702825, + "grad_norm": 0.1323506087064743, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 91050 + }, + { + "epoch": 0.3465968347251509, + "grad_norm": 0.12858256697654724, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 91060 + }, + { + "epoch": 0.3466348971932736, + "grad_norm": 0.12089366465806961, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 91070 + }, + { + "epoch": 0.3466729596613963, + "grad_norm": 0.1293216347694397, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 91080 + }, + { + "epoch": 0.346711022129519, + "grad_norm": 0.12396867573261261, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 91090 + }, + { + "epoch": 0.34674908459764164, + "grad_norm": 0.1382405161857605, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 91100 + }, + { + "epoch": 0.34678714706576436, + "grad_norm": 0.13462446630001068, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 91110 + }, + { + "epoch": 0.346825209533887, + "grad_norm": 0.1211245208978653, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 91120 + }, + { + "epoch": 0.3468632720020097, + "grad_norm": 0.11953513324260712, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 91130 + }, + { + "epoch": 0.3469013344701324, + "grad_norm": 0.1239633709192276, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 91140 + }, + { + "epoch": 0.34693939693825504, + "grad_norm": 0.13313403725624084, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 91150 + }, + { + "epoch": 0.34697745940637775, + "grad_norm": 0.13562420010566711, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 91160 + }, + { + "epoch": 0.3470155218745004, + "grad_norm": 0.11659242957830429, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 91170 + }, + { + "epoch": 0.3470535843426231, + "grad_norm": 0.1301209181547165, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 91180 + }, + { + "epoch": 0.3470916468107458, + "grad_norm": 0.12229889631271362, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 91190 + }, + { + "epoch": 0.3471297092788685, + "grad_norm": 0.12382488697767258, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 91200 + }, + { + "epoch": 0.34716777174699115, + "grad_norm": 0.1378564089536667, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 91210 + }, + { + "epoch": 0.34720583421511386, + "grad_norm": 0.1293146312236786, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 91220 + }, + { + "epoch": 0.3472438966832365, + "grad_norm": 0.11996651440858841, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 91230 + }, + { + "epoch": 0.34728195915135923, + "grad_norm": 0.12518168985843658, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 91240 + }, + { + "epoch": 0.3473200216194819, + "grad_norm": 0.12766574323177338, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 91250 + }, + { + "epoch": 0.3473580840876046, + "grad_norm": 0.1208205297589302, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 91260 + }, + { + "epoch": 0.34739614655572726, + "grad_norm": 0.11901720613241196, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 91270 + }, + { + "epoch": 0.3474342090238499, + "grad_norm": 0.14112527668476105, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 91280 + }, + { + "epoch": 0.34747227149197263, + "grad_norm": 0.12192676961421967, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 91290 + }, + { + "epoch": 0.3475103339600953, + "grad_norm": 0.11997781693935394, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 91300 + }, + { + "epoch": 0.347548396428218, + "grad_norm": 0.12274051457643509, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 91310 + }, + { + "epoch": 0.34758645889634066, + "grad_norm": 0.13235563039779663, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 91320 + }, + { + "epoch": 0.34762452136446337, + "grad_norm": 0.13290190696716309, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 91330 + }, + { + "epoch": 0.347662583832586, + "grad_norm": 0.11767739802598953, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 91340 + }, + { + "epoch": 0.34770064630070874, + "grad_norm": 0.13191862404346466, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 91350 + }, + { + "epoch": 0.3477387087688314, + "grad_norm": 0.12816275656223297, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 91360 + }, + { + "epoch": 0.3477767712369541, + "grad_norm": 0.11478374153375626, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 91370 + }, + { + "epoch": 0.34781483370507676, + "grad_norm": 0.139333575963974, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 91380 + }, + { + "epoch": 0.3478528961731995, + "grad_norm": 0.12281625717878342, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 91390 + }, + { + "epoch": 0.34789095864132213, + "grad_norm": 0.12818287312984467, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 91400 + }, + { + "epoch": 0.34792902110944485, + "grad_norm": 0.12466471642255783, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 91410 + }, + { + "epoch": 0.3479670835775675, + "grad_norm": 0.1379898339509964, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 91420 + }, + { + "epoch": 0.34800514604569016, + "grad_norm": 0.13826188445091248, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 91430 + }, + { + "epoch": 0.3480432085138129, + "grad_norm": 0.12144706398248672, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 91440 + }, + { + "epoch": 0.34808127098193553, + "grad_norm": 0.12366820126771927, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 91450 + }, + { + "epoch": 0.34811933345005824, + "grad_norm": 0.12699466943740845, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 91460 + }, + { + "epoch": 0.3481573959181809, + "grad_norm": 0.12522569298744202, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 91470 + }, + { + "epoch": 0.3481954583863036, + "grad_norm": 0.1333567202091217, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 91480 + }, + { + "epoch": 0.34823352085442627, + "grad_norm": 0.13545624911785126, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 91490 + }, + { + "epoch": 0.348271583322549, + "grad_norm": 0.12455707043409348, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 91500 + }, + { + "epoch": 0.34830964579067164, + "grad_norm": 0.12056753784418106, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 91510 + }, + { + "epoch": 0.34834770825879435, + "grad_norm": 0.12929877638816833, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 91520 + }, + { + "epoch": 0.348385770726917, + "grad_norm": 0.1246948316693306, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 91530 + }, + { + "epoch": 0.3484238331950397, + "grad_norm": 0.11321297287940979, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 91540 + }, + { + "epoch": 0.3484618956631624, + "grad_norm": 0.12944476306438446, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 91550 + }, + { + "epoch": 0.3484999581312851, + "grad_norm": 0.11018373817205429, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 91560 + }, + { + "epoch": 0.34853802059940775, + "grad_norm": 0.12235118448734283, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 91570 + }, + { + "epoch": 0.3485760830675304, + "grad_norm": 0.12135972082614899, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 91580 + }, + { + "epoch": 0.3486141455356531, + "grad_norm": 0.13681411743164062, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 91590 + }, + { + "epoch": 0.3486522080037758, + "grad_norm": 0.12161792814731598, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 91600 + }, + { + "epoch": 0.3486902704718985, + "grad_norm": 0.11866001039743423, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 91610 + }, + { + "epoch": 0.34872833294002115, + "grad_norm": 0.11633715033531189, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 91620 + }, + { + "epoch": 0.34876639540814386, + "grad_norm": 0.12331175059080124, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 91630 + }, + { + "epoch": 0.3488044578762665, + "grad_norm": 0.13465796411037445, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 91640 + }, + { + "epoch": 0.34884252034438923, + "grad_norm": 0.39071884751319885, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 91650 + }, + { + "epoch": 0.3488805828125119, + "grad_norm": 0.1384720355272293, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 91660 + }, + { + "epoch": 0.3489186452806346, + "grad_norm": 0.12433448433876038, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 91670 + }, + { + "epoch": 0.34895670774875726, + "grad_norm": 0.12369529902935028, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 91680 + }, + { + "epoch": 0.34899477021687997, + "grad_norm": 0.12780912220478058, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 91690 + }, + { + "epoch": 0.3490328326850026, + "grad_norm": 0.13243108987808228, + "learning_rate": 0.0005, + "loss": 2.1443, + "step": 91700 + }, + { + "epoch": 0.3490708951531253, + "grad_norm": 0.12568865716457367, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 91710 + }, + { + "epoch": 0.349108957621248, + "grad_norm": 0.13068446516990662, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 91720 + }, + { + "epoch": 0.34914702008937065, + "grad_norm": 0.13560396432876587, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 91730 + }, + { + "epoch": 0.34918508255749336, + "grad_norm": 0.16247346997261047, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 91740 + }, + { + "epoch": 0.349223145025616, + "grad_norm": 0.13477447628974915, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 91750 + }, + { + "epoch": 0.34926120749373873, + "grad_norm": 0.12002983689308167, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 91760 + }, + { + "epoch": 0.3492992699618614, + "grad_norm": 0.12221094965934753, + "learning_rate": 0.0005, + "loss": 2.1364, + "step": 91770 + }, + { + "epoch": 0.3493373324299841, + "grad_norm": 0.14205977320671082, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 91780 + }, + { + "epoch": 0.34937539489810676, + "grad_norm": 0.11442865431308746, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 91790 + }, + { + "epoch": 0.3494134573662295, + "grad_norm": 0.13150253891944885, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 91800 + }, + { + "epoch": 0.34945151983435213, + "grad_norm": 0.13062317669391632, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 91810 + }, + { + "epoch": 0.34948958230247484, + "grad_norm": 0.12943986058235168, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 91820 + }, + { + "epoch": 0.3495276447705975, + "grad_norm": 0.12897132337093353, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 91830 + }, + { + "epoch": 0.3495657072387202, + "grad_norm": 0.12168525159358978, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 91840 + }, + { + "epoch": 0.34960376970684287, + "grad_norm": 0.12652556598186493, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 91850 + }, + { + "epoch": 0.34964183217496553, + "grad_norm": 0.14768041670322418, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 91860 + }, + { + "epoch": 0.34967989464308824, + "grad_norm": 0.11618136614561081, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 91870 + }, + { + "epoch": 0.3497179571112109, + "grad_norm": 0.12934279441833496, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 91880 + }, + { + "epoch": 0.3497560195793336, + "grad_norm": 0.12195584177970886, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 91890 + }, + { + "epoch": 0.34979408204745627, + "grad_norm": 0.1398545652627945, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 91900 + }, + { + "epoch": 0.349832144515579, + "grad_norm": 0.12671756744384766, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 91910 + }, + { + "epoch": 0.34987020698370164, + "grad_norm": 0.12397027760744095, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 91920 + }, + { + "epoch": 0.34990826945182435, + "grad_norm": 0.11226258426904678, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 91930 + }, + { + "epoch": 0.349946331919947, + "grad_norm": 0.12149665504693985, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 91940 + }, + { + "epoch": 0.3499843943880697, + "grad_norm": 0.11136379092931747, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 91950 + }, + { + "epoch": 0.3500224568561924, + "grad_norm": 0.1287529319524765, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 91960 + }, + { + "epoch": 0.3500605193243151, + "grad_norm": 0.12232114374637604, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 91970 + }, + { + "epoch": 0.35009858179243775, + "grad_norm": 0.12176519632339478, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 91980 + }, + { + "epoch": 0.35013664426056046, + "grad_norm": 0.11770909279584885, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 91990 + }, + { + "epoch": 0.3501747067286831, + "grad_norm": 0.12266077101230621, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 92000 + }, + { + "epoch": 0.3502127691968058, + "grad_norm": 0.13001757860183716, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 92010 + }, + { + "epoch": 0.3502508316649285, + "grad_norm": 0.1514846533536911, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 92020 + }, + { + "epoch": 0.35028889413305114, + "grad_norm": 0.12914305925369263, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 92030 + }, + { + "epoch": 0.35032695660117386, + "grad_norm": 0.12426555901765823, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 92040 + }, + { + "epoch": 0.3503650190692965, + "grad_norm": 0.11400419473648071, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 92050 + }, + { + "epoch": 0.3504030815374192, + "grad_norm": 0.12072566151618958, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 92060 + }, + { + "epoch": 0.3504411440055419, + "grad_norm": 0.12593691051006317, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 92070 + }, + { + "epoch": 0.3504792064736646, + "grad_norm": 0.11472805589437485, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 92080 + }, + { + "epoch": 0.35051726894178725, + "grad_norm": 0.11309553682804108, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 92090 + }, + { + "epoch": 0.35055533140990996, + "grad_norm": 0.12516191601753235, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 92100 + }, + { + "epoch": 0.3505933938780326, + "grad_norm": 0.14064562320709229, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 92110 + }, + { + "epoch": 0.35063145634615533, + "grad_norm": 0.13177159428596497, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 92120 + }, + { + "epoch": 0.350669518814278, + "grad_norm": 0.13190452754497528, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 92130 + }, + { + "epoch": 0.35070758128240065, + "grad_norm": 0.11451554298400879, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 92140 + }, + { + "epoch": 0.35074564375052336, + "grad_norm": 0.11608679592609406, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 92150 + }, + { + "epoch": 0.350783706218646, + "grad_norm": 0.14462372660636902, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 92160 + }, + { + "epoch": 0.35082176868676873, + "grad_norm": 0.12858840823173523, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 92170 + }, + { + "epoch": 0.3508598311548914, + "grad_norm": 0.13132420182228088, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 92180 + }, + { + "epoch": 0.3508978936230141, + "grad_norm": 0.1324557065963745, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 92190 + }, + { + "epoch": 0.35093595609113676, + "grad_norm": 0.1943078637123108, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 92200 + }, + { + "epoch": 0.35097401855925947, + "grad_norm": 0.11292409151792526, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 92210 + }, + { + "epoch": 0.35101208102738213, + "grad_norm": 0.1135173812508583, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 92220 + }, + { + "epoch": 0.35105014349550484, + "grad_norm": 0.1146763414144516, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 92230 + }, + { + "epoch": 0.3510882059636275, + "grad_norm": 0.1284039467573166, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 92240 + }, + { + "epoch": 0.3511262684317502, + "grad_norm": 0.11903073638677597, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 92250 + }, + { + "epoch": 0.35116433089987287, + "grad_norm": 0.12131869047880173, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 92260 + }, + { + "epoch": 0.3512023933679956, + "grad_norm": 0.12632091343402863, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 92270 + }, + { + "epoch": 0.35124045583611824, + "grad_norm": 0.12733492255210876, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 92280 + }, + { + "epoch": 0.3512785183042409, + "grad_norm": 0.12252689898014069, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 92290 + }, + { + "epoch": 0.3513165807723636, + "grad_norm": 0.12945452332496643, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 92300 + }, + { + "epoch": 0.35135464324048626, + "grad_norm": 0.13178656995296478, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 92310 + }, + { + "epoch": 0.351392705708609, + "grad_norm": 0.13196706771850586, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 92320 + }, + { + "epoch": 0.35143076817673163, + "grad_norm": 0.11777035146951675, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 92330 + }, + { + "epoch": 0.35146883064485435, + "grad_norm": 0.12045314908027649, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 92340 + }, + { + "epoch": 0.351506893112977, + "grad_norm": 0.12759415805339813, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 92350 + }, + { + "epoch": 0.3515449555810997, + "grad_norm": 0.1253875195980072, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 92360 + }, + { + "epoch": 0.3515830180492224, + "grad_norm": 0.10823439806699753, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 92370 + }, + { + "epoch": 0.3516210805173451, + "grad_norm": 0.11525721102952957, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 92380 + }, + { + "epoch": 0.35165914298546774, + "grad_norm": 0.1259388029575348, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 92390 + }, + { + "epoch": 0.35169720545359046, + "grad_norm": 0.12903520464897156, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 92400 + }, + { + "epoch": 0.3517352679217131, + "grad_norm": 0.12380896508693695, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 92410 + }, + { + "epoch": 0.3517733303898358, + "grad_norm": 0.12490128725767136, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 92420 + }, + { + "epoch": 0.3518113928579585, + "grad_norm": 0.12302028387784958, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 92430 + }, + { + "epoch": 0.35184945532608114, + "grad_norm": 0.1265297383069992, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 92440 + }, + { + "epoch": 0.35188751779420385, + "grad_norm": 0.12258687615394592, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 92450 + }, + { + "epoch": 0.3519255802623265, + "grad_norm": 0.12869144976139069, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 92460 + }, + { + "epoch": 0.3519636427304492, + "grad_norm": 0.11812689155340195, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 92470 + }, + { + "epoch": 0.3520017051985719, + "grad_norm": 0.12622421979904175, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 92480 + }, + { + "epoch": 0.3520397676666946, + "grad_norm": 0.13387994468212128, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 92490 + }, + { + "epoch": 0.35207783013481725, + "grad_norm": 0.12207360565662384, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 92500 + }, + { + "epoch": 0.35211589260293996, + "grad_norm": 0.12587909400463104, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 92510 + }, + { + "epoch": 0.3521539550710626, + "grad_norm": 0.12186160683631897, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 92520 + }, + { + "epoch": 0.35219201753918533, + "grad_norm": 0.11623741686344147, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 92530 + }, + { + "epoch": 0.352230080007308, + "grad_norm": 0.11479505896568298, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 92540 + }, + { + "epoch": 0.3522681424754307, + "grad_norm": 0.11657773703336716, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 92550 + }, + { + "epoch": 0.35230620494355336, + "grad_norm": 0.14515730738639832, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 92560 + }, + { + "epoch": 0.352344267411676, + "grad_norm": 0.11519404500722885, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 92570 + }, + { + "epoch": 0.35238232987979873, + "grad_norm": 0.11878593266010284, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 92580 + }, + { + "epoch": 0.3524203923479214, + "grad_norm": 0.11939465999603271, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 92590 + }, + { + "epoch": 0.3524584548160441, + "grad_norm": 0.128925159573555, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 92600 + }, + { + "epoch": 0.35249651728416675, + "grad_norm": 0.11379945278167725, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 92610 + }, + { + "epoch": 0.35253457975228947, + "grad_norm": 0.12493978440761566, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 92620 + }, + { + "epoch": 0.3525726422204121, + "grad_norm": 0.12576165795326233, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 92630 + }, + { + "epoch": 0.35261070468853484, + "grad_norm": 0.1327223777770996, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 92640 + }, + { + "epoch": 0.3526487671566575, + "grad_norm": 0.13390015065670013, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 92650 + }, + { + "epoch": 0.3526868296247802, + "grad_norm": 0.13608437776565552, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 92660 + }, + { + "epoch": 0.35272489209290286, + "grad_norm": 0.1304791420698166, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 92670 + }, + { + "epoch": 0.3527629545610256, + "grad_norm": 0.13058798015117645, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 92680 + }, + { + "epoch": 0.35280101702914823, + "grad_norm": 0.12364275008440018, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 92690 + }, + { + "epoch": 0.35283907949727095, + "grad_norm": 0.12345042079687119, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 92700 + }, + { + "epoch": 0.3528771419653936, + "grad_norm": 0.13718070089817047, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 92710 + }, + { + "epoch": 0.35291520443351626, + "grad_norm": 0.12440600246191025, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 92720 + }, + { + "epoch": 0.352953266901639, + "grad_norm": 0.1204405426979065, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 92730 + }, + { + "epoch": 0.35299132936976163, + "grad_norm": 0.1262776255607605, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 92740 + }, + { + "epoch": 0.35302939183788434, + "grad_norm": 0.11918480694293976, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 92750 + }, + { + "epoch": 0.353067454306007, + "grad_norm": 0.13499636948108673, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 92760 + }, + { + "epoch": 0.3531055167741297, + "grad_norm": 0.13470661640167236, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 92770 + }, + { + "epoch": 0.35314357924225237, + "grad_norm": 0.1193150132894516, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 92780 + }, + { + "epoch": 0.3531816417103751, + "grad_norm": 0.13073119521141052, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 92790 + }, + { + "epoch": 0.35321970417849774, + "grad_norm": 0.13316971063613892, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 92800 + }, + { + "epoch": 0.35325776664662045, + "grad_norm": 0.12979532778263092, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 92810 + }, + { + "epoch": 0.3532958291147431, + "grad_norm": 0.1356252282857895, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 92820 + }, + { + "epoch": 0.3533338915828658, + "grad_norm": 0.12186180055141449, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 92830 + }, + { + "epoch": 0.3533719540509885, + "grad_norm": 0.11846879124641418, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 92840 + }, + { + "epoch": 0.3534100165191112, + "grad_norm": 0.1233963742852211, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 92850 + }, + { + "epoch": 0.35344807898723385, + "grad_norm": 0.1270616352558136, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 92860 + }, + { + "epoch": 0.3534861414553565, + "grad_norm": 0.12139040976762772, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 92870 + }, + { + "epoch": 0.3535242039234792, + "grad_norm": 0.1251436024904251, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 92880 + }, + { + "epoch": 0.3535622663916019, + "grad_norm": 0.12637275457382202, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 92890 + }, + { + "epoch": 0.3536003288597246, + "grad_norm": 0.1288340538740158, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 92900 + }, + { + "epoch": 0.35363839132784725, + "grad_norm": 0.13565371930599213, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 92910 + }, + { + "epoch": 0.35367645379596996, + "grad_norm": 0.1296985149383545, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 92920 + }, + { + "epoch": 0.3537145162640926, + "grad_norm": 0.1276034265756607, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 92930 + }, + { + "epoch": 0.35375257873221533, + "grad_norm": 0.13032260537147522, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 92940 + }, + { + "epoch": 0.353790641200338, + "grad_norm": 0.11970576643943787, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 92950 + }, + { + "epoch": 0.3538287036684607, + "grad_norm": 0.11574655026197433, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 92960 + }, + { + "epoch": 0.35386676613658335, + "grad_norm": 0.12114018946886063, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 92970 + }, + { + "epoch": 0.35390482860470607, + "grad_norm": 0.12086793035268784, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 92980 + }, + { + "epoch": 0.3539428910728287, + "grad_norm": 0.12161669135093689, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 92990 + }, + { + "epoch": 0.3539809535409514, + "grad_norm": 0.13499213755130768, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 93000 + }, + { + "epoch": 0.3540190160090741, + "grad_norm": 0.12085911631584167, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 93010 + }, + { + "epoch": 0.35405707847719675, + "grad_norm": 0.11759299039840698, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 93020 + }, + { + "epoch": 0.35409514094531946, + "grad_norm": 0.12203952670097351, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 93030 + }, + { + "epoch": 0.3541332034134421, + "grad_norm": 0.12227179110050201, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 93040 + }, + { + "epoch": 0.35417126588156483, + "grad_norm": 0.12413538992404938, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 93050 + }, + { + "epoch": 0.3542093283496875, + "grad_norm": 0.13979873061180115, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 93060 + }, + { + "epoch": 0.3542473908178102, + "grad_norm": 0.11567272990942001, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 93070 + }, + { + "epoch": 0.35428545328593286, + "grad_norm": 0.11851345747709274, + "learning_rate": 0.0005, + "loss": 2.1397, + "step": 93080 + }, + { + "epoch": 0.3543235157540556, + "grad_norm": 0.13208027184009552, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 93090 + }, + { + "epoch": 0.35436157822217823, + "grad_norm": 0.13782580196857452, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 93100 + }, + { + "epoch": 0.35439964069030094, + "grad_norm": 0.15525998175144196, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 93110 + }, + { + "epoch": 0.3544377031584236, + "grad_norm": 0.12536172568798065, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 93120 + }, + { + "epoch": 0.3544757656265463, + "grad_norm": 0.13931910693645477, + "learning_rate": 0.0005, + "loss": 2.1437, + "step": 93130 + }, + { + "epoch": 0.35451382809466897, + "grad_norm": 0.12072119861841202, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 93140 + }, + { + "epoch": 0.3545518905627916, + "grad_norm": 0.13006591796875, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 93150 + }, + { + "epoch": 0.35458995303091434, + "grad_norm": 0.12391817569732666, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 93160 + }, + { + "epoch": 0.354628015499037, + "grad_norm": 0.12410353124141693, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 93170 + }, + { + "epoch": 0.3546660779671597, + "grad_norm": 0.13124974071979523, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 93180 + }, + { + "epoch": 0.35470414043528237, + "grad_norm": 0.13084763288497925, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 93190 + }, + { + "epoch": 0.3547422029034051, + "grad_norm": 0.1214420273900032, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 93200 + }, + { + "epoch": 0.35478026537152774, + "grad_norm": 0.13202619552612305, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 93210 + }, + { + "epoch": 0.35481832783965045, + "grad_norm": 0.13319921493530273, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 93220 + }, + { + "epoch": 0.3548563903077731, + "grad_norm": 0.13513195514678955, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 93230 + }, + { + "epoch": 0.3548944527758958, + "grad_norm": 0.13121256232261658, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 93240 + }, + { + "epoch": 0.3549325152440185, + "grad_norm": 0.11843890696763992, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 93250 + }, + { + "epoch": 0.3549705777121412, + "grad_norm": 0.13226059079170227, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 93260 + }, + { + "epoch": 0.35500864018026385, + "grad_norm": 0.12598729133605957, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 93270 + }, + { + "epoch": 0.35504670264838656, + "grad_norm": 0.13033129274845123, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 93280 + }, + { + "epoch": 0.3550847651165092, + "grad_norm": 0.12485304474830627, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 93290 + }, + { + "epoch": 0.3551228275846319, + "grad_norm": 0.11563688516616821, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 93300 + }, + { + "epoch": 0.3551608900527546, + "grad_norm": 0.1208873763680458, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 93310 + }, + { + "epoch": 0.35519895252087724, + "grad_norm": 0.11897026002407074, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 93320 + }, + { + "epoch": 0.35523701498899996, + "grad_norm": 0.13029348850250244, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 93330 + }, + { + "epoch": 0.3552750774571226, + "grad_norm": 0.1318322718143463, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 93340 + }, + { + "epoch": 0.3553131399252453, + "grad_norm": 0.12738265097141266, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 93350 + }, + { + "epoch": 0.355351202393368, + "grad_norm": 0.11214686185121536, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 93360 + }, + { + "epoch": 0.3553892648614907, + "grad_norm": 0.12007127702236176, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 93370 + }, + { + "epoch": 0.35542732732961335, + "grad_norm": 0.15367311239242554, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 93380 + }, + { + "epoch": 0.35546538979773606, + "grad_norm": 0.13861685991287231, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 93390 + }, + { + "epoch": 0.3555034522658587, + "grad_norm": 0.13355308771133423, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 93400 + }, + { + "epoch": 0.35554151473398143, + "grad_norm": 0.11894602328538895, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 93410 + }, + { + "epoch": 0.3555795772021041, + "grad_norm": 0.11944910138845444, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 93420 + }, + { + "epoch": 0.3556176396702268, + "grad_norm": 0.11500494927167892, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 93430 + }, + { + "epoch": 0.35565570213834946, + "grad_norm": 0.13844679296016693, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 93440 + }, + { + "epoch": 0.3556937646064721, + "grad_norm": 0.14892104268074036, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 93450 + }, + { + "epoch": 0.35573182707459483, + "grad_norm": 0.12289411574602127, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 93460 + }, + { + "epoch": 0.3557698895427175, + "grad_norm": 0.12943007051944733, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 93470 + }, + { + "epoch": 0.3558079520108402, + "grad_norm": 0.13010786473751068, + "learning_rate": 0.0005, + "loss": 2.1454, + "step": 93480 + }, + { + "epoch": 0.35584601447896286, + "grad_norm": 0.12736776471138, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 93490 + }, + { + "epoch": 0.35588407694708557, + "grad_norm": 0.15535278618335724, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 93500 + }, + { + "epoch": 0.3559221394152082, + "grad_norm": 0.12439072877168655, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 93510 + }, + { + "epoch": 0.35596020188333094, + "grad_norm": 0.11703763902187347, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 93520 + }, + { + "epoch": 0.3559982643514536, + "grad_norm": 0.12202954292297363, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 93530 + }, + { + "epoch": 0.3560363268195763, + "grad_norm": 0.12256588786840439, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 93540 + }, + { + "epoch": 0.35607438928769897, + "grad_norm": 0.14434875547885895, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 93550 + }, + { + "epoch": 0.3561124517558217, + "grad_norm": 0.11983123421669006, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 93560 + }, + { + "epoch": 0.35615051422394434, + "grad_norm": 0.11797554790973663, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 93570 + }, + { + "epoch": 0.356188576692067, + "grad_norm": 0.1287076622247696, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 93580 + }, + { + "epoch": 0.3562266391601897, + "grad_norm": 0.13552536070346832, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 93590 + }, + { + "epoch": 0.35626470162831236, + "grad_norm": 0.12075095623731613, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 93600 + }, + { + "epoch": 0.3563027640964351, + "grad_norm": 0.12171369045972824, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 93610 + }, + { + "epoch": 0.35634082656455773, + "grad_norm": 0.12191025167703629, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 93620 + }, + { + "epoch": 0.35637888903268045, + "grad_norm": 0.11536803096532822, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 93630 + }, + { + "epoch": 0.3564169515008031, + "grad_norm": 0.12162362784147263, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 93640 + }, + { + "epoch": 0.3564550139689258, + "grad_norm": 0.12384018301963806, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 93650 + }, + { + "epoch": 0.3564930764370485, + "grad_norm": 0.11889617145061493, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 93660 + }, + { + "epoch": 0.3565311389051712, + "grad_norm": 0.140035942196846, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 93670 + }, + { + "epoch": 0.35656920137329384, + "grad_norm": 0.14636938273906708, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 93680 + }, + { + "epoch": 0.35660726384141656, + "grad_norm": 0.12248191982507706, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 93690 + }, + { + "epoch": 0.3566453263095392, + "grad_norm": 0.12582148611545563, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 93700 + }, + { + "epoch": 0.3566833887776619, + "grad_norm": 0.11646874994039536, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 93710 + }, + { + "epoch": 0.3567214512457846, + "grad_norm": 0.12089702486991882, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 93720 + }, + { + "epoch": 0.35675951371390724, + "grad_norm": 0.12172345072031021, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 93730 + }, + { + "epoch": 0.35679757618202995, + "grad_norm": 0.1264006495475769, + "learning_rate": 0.0005, + "loss": 2.142, + "step": 93740 + }, + { + "epoch": 0.3568356386501526, + "grad_norm": 0.12129916995763779, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 93750 + }, + { + "epoch": 0.3568737011182753, + "grad_norm": 0.12129603326320648, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 93760 + }, + { + "epoch": 0.356911763586398, + "grad_norm": 0.12828706204891205, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 93770 + }, + { + "epoch": 0.3569498260545207, + "grad_norm": 0.11421766877174377, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 93780 + }, + { + "epoch": 0.35698788852264335, + "grad_norm": 0.12098051607608795, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 93790 + }, + { + "epoch": 0.35702595099076606, + "grad_norm": 0.1217089295387268, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 93800 + }, + { + "epoch": 0.3570640134588887, + "grad_norm": 0.11992437392473221, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 93810 + }, + { + "epoch": 0.35710207592701143, + "grad_norm": 0.12545369565486908, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 93820 + }, + { + "epoch": 0.3571401383951341, + "grad_norm": 0.1325383186340332, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 93830 + }, + { + "epoch": 0.3571782008632568, + "grad_norm": 0.12268385291099548, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 93840 + }, + { + "epoch": 0.35721626333137946, + "grad_norm": 0.12395138293504715, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 93850 + }, + { + "epoch": 0.35725432579950217, + "grad_norm": 0.13802285492420197, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 93860 + }, + { + "epoch": 0.3572923882676248, + "grad_norm": 0.13328036665916443, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 93870 + }, + { + "epoch": 0.3573304507357475, + "grad_norm": 0.1284911185503006, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 93880 + }, + { + "epoch": 0.3573685132038702, + "grad_norm": 0.12316421419382095, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 93890 + }, + { + "epoch": 0.35740657567199285, + "grad_norm": 0.13757121562957764, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 93900 + }, + { + "epoch": 0.35744463814011557, + "grad_norm": 0.12331431359052658, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 93910 + }, + { + "epoch": 0.3574827006082382, + "grad_norm": 0.13232360780239105, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 93920 + }, + { + "epoch": 0.35752076307636094, + "grad_norm": 0.12024432420730591, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 93930 + }, + { + "epoch": 0.3575588255444836, + "grad_norm": 0.11617932468652725, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 93940 + }, + { + "epoch": 0.3575968880126063, + "grad_norm": 0.12654536962509155, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 93950 + }, + { + "epoch": 0.35763495048072896, + "grad_norm": 0.12844809889793396, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 93960 + }, + { + "epoch": 0.3576730129488517, + "grad_norm": 0.12008248269557953, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 93970 + }, + { + "epoch": 0.35771107541697433, + "grad_norm": 0.1400655061006546, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 93980 + }, + { + "epoch": 0.35774913788509705, + "grad_norm": 0.11512552946805954, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 93990 + }, + { + "epoch": 0.3577872003532197, + "grad_norm": 0.12251134216785431, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 94000 + }, + { + "epoch": 0.35782526282134236, + "grad_norm": 0.13219940662384033, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 94010 + }, + { + "epoch": 0.3578633252894651, + "grad_norm": 0.11719072610139847, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 94020 + }, + { + "epoch": 0.35790138775758773, + "grad_norm": 0.127188578248024, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 94030 + }, + { + "epoch": 0.35793945022571044, + "grad_norm": 0.12552744150161743, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 94040 + }, + { + "epoch": 0.3579775126938331, + "grad_norm": 0.12586472928524017, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 94050 + }, + { + "epoch": 0.3580155751619558, + "grad_norm": 0.12713229656219482, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 94060 + }, + { + "epoch": 0.35805363763007847, + "grad_norm": 0.12572598457336426, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 94070 + }, + { + "epoch": 0.3580917000982012, + "grad_norm": 0.12956538796424866, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 94080 + }, + { + "epoch": 0.35812976256632384, + "grad_norm": 0.12850606441497803, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 94090 + }, + { + "epoch": 0.35816782503444655, + "grad_norm": 0.12590013444423676, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 94100 + }, + { + "epoch": 0.3582058875025692, + "grad_norm": 0.12473782151937485, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 94110 + }, + { + "epoch": 0.3582439499706919, + "grad_norm": 0.11266183108091354, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 94120 + }, + { + "epoch": 0.3582820124388146, + "grad_norm": 0.15109241008758545, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 94130 + }, + { + "epoch": 0.3583200749069373, + "grad_norm": 0.11439945548772812, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 94140 + }, + { + "epoch": 0.35835813737505995, + "grad_norm": 0.13953430950641632, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 94150 + }, + { + "epoch": 0.3583961998431826, + "grad_norm": 0.13271038234233856, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 94160 + }, + { + "epoch": 0.3584342623113053, + "grad_norm": 0.13171426951885223, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 94170 + }, + { + "epoch": 0.358472324779428, + "grad_norm": 0.12254917621612549, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 94180 + }, + { + "epoch": 0.3585103872475507, + "grad_norm": 0.11646232008934021, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 94190 + }, + { + "epoch": 0.35854844971567335, + "grad_norm": 0.14167553186416626, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 94200 + }, + { + "epoch": 0.35858651218379606, + "grad_norm": 0.12849067151546478, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 94210 + }, + { + "epoch": 0.3586245746519187, + "grad_norm": 0.13245843350887299, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 94220 + }, + { + "epoch": 0.3586626371200414, + "grad_norm": 0.13472813367843628, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 94230 + }, + { + "epoch": 0.3587006995881641, + "grad_norm": 0.11261474341154099, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 94240 + }, + { + "epoch": 0.3587387620562868, + "grad_norm": 0.13049879670143127, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 94250 + }, + { + "epoch": 0.35877682452440945, + "grad_norm": 0.12245085835456848, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 94260 + }, + { + "epoch": 0.35881488699253217, + "grad_norm": 0.13632707297801971, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 94270 + }, + { + "epoch": 0.3588529494606548, + "grad_norm": 0.12433923035860062, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 94280 + }, + { + "epoch": 0.35889101192877754, + "grad_norm": 0.12938573956489563, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 94290 + }, + { + "epoch": 0.3589290743969002, + "grad_norm": 0.1309065967798233, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 94300 + }, + { + "epoch": 0.35896713686502285, + "grad_norm": 0.12268606573343277, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 94310 + }, + { + "epoch": 0.35900519933314556, + "grad_norm": 0.1260475069284439, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 94320 + }, + { + "epoch": 0.3590432618012682, + "grad_norm": 0.128811776638031, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 94330 + }, + { + "epoch": 0.35908132426939093, + "grad_norm": 0.12514546513557434, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 94340 + }, + { + "epoch": 0.3591193867375136, + "grad_norm": 0.12161403149366379, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 94350 + }, + { + "epoch": 0.3591574492056363, + "grad_norm": 0.1290997415781021, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 94360 + }, + { + "epoch": 0.35919551167375896, + "grad_norm": 0.12186875194311142, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 94370 + }, + { + "epoch": 0.3592335741418817, + "grad_norm": 0.13205765187740326, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 94380 + }, + { + "epoch": 0.35927163661000433, + "grad_norm": 0.12835603952407837, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 94390 + }, + { + "epoch": 0.35930969907812704, + "grad_norm": 0.1322995126247406, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 94400 + }, + { + "epoch": 0.3593477615462497, + "grad_norm": 0.12406829744577408, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 94410 + }, + { + "epoch": 0.3593858240143724, + "grad_norm": 0.12375843524932861, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 94420 + }, + { + "epoch": 0.35942388648249507, + "grad_norm": 0.1409797966480255, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 94430 + }, + { + "epoch": 0.3594619489506177, + "grad_norm": 0.12111986428499222, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 94440 + }, + { + "epoch": 0.35950001141874044, + "grad_norm": 0.11834825575351715, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 94450 + }, + { + "epoch": 0.3595380738868631, + "grad_norm": 0.12570960819721222, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 94460 + }, + { + "epoch": 0.3595761363549858, + "grad_norm": 0.12362303584814072, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 94470 + }, + { + "epoch": 0.35961419882310847, + "grad_norm": 0.1270592361688614, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 94480 + }, + { + "epoch": 0.3596522612912312, + "grad_norm": 0.12973882257938385, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 94490 + }, + { + "epoch": 0.35969032375935384, + "grad_norm": 0.12367293983697891, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 94500 + }, + { + "epoch": 0.35972838622747655, + "grad_norm": 0.1229577362537384, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 94510 + }, + { + "epoch": 0.3597664486955992, + "grad_norm": 0.1185227707028389, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 94520 + }, + { + "epoch": 0.3598045111637219, + "grad_norm": 0.11892779171466827, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 94530 + }, + { + "epoch": 0.3598425736318446, + "grad_norm": 0.12867958843708038, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 94540 + }, + { + "epoch": 0.3598806360999673, + "grad_norm": 0.14746211469173431, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 94550 + }, + { + "epoch": 0.35991869856808995, + "grad_norm": 0.12404578179121017, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 94560 + }, + { + "epoch": 0.35995676103621266, + "grad_norm": 0.12142115831375122, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 94570 + }, + { + "epoch": 0.3599948235043353, + "grad_norm": 0.12541688978672028, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 94580 + }, + { + "epoch": 0.36003288597245797, + "grad_norm": 0.1460968405008316, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 94590 + }, + { + "epoch": 0.3600709484405807, + "grad_norm": 0.12225353717803955, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 94600 + }, + { + "epoch": 0.36010901090870334, + "grad_norm": 0.12600895762443542, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 94610 + }, + { + "epoch": 0.36014707337682605, + "grad_norm": 0.13123752176761627, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 94620 + }, + { + "epoch": 0.3601851358449487, + "grad_norm": 0.12744736671447754, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 94630 + }, + { + "epoch": 0.3602231983130714, + "grad_norm": 0.11714782565832138, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 94640 + }, + { + "epoch": 0.3602612607811941, + "grad_norm": 0.11957629770040512, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 94650 + }, + { + "epoch": 0.3602993232493168, + "grad_norm": 0.12496066093444824, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 94660 + }, + { + "epoch": 0.36033738571743945, + "grad_norm": 0.12312795221805573, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 94670 + }, + { + "epoch": 0.36037544818556216, + "grad_norm": 0.1335715502500534, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 94680 + }, + { + "epoch": 0.3604135106536848, + "grad_norm": 0.12358417361974716, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 94690 + }, + { + "epoch": 0.36045157312180753, + "grad_norm": 0.11705588549375534, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 94700 + }, + { + "epoch": 0.3604896355899302, + "grad_norm": 0.12685005366802216, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 94710 + }, + { + "epoch": 0.3605276980580529, + "grad_norm": 0.13659855723381042, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 94720 + }, + { + "epoch": 0.36056576052617556, + "grad_norm": 0.11961235851049423, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 94730 + }, + { + "epoch": 0.3606038229942982, + "grad_norm": 0.12005238980054855, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 94740 + }, + { + "epoch": 0.36064188546242093, + "grad_norm": 0.13385742902755737, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 94750 + }, + { + "epoch": 0.3606799479305436, + "grad_norm": 0.1370827555656433, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 94760 + }, + { + "epoch": 0.3607180103986663, + "grad_norm": 0.12504178285598755, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 94770 + }, + { + "epoch": 0.36075607286678896, + "grad_norm": 0.127493217587471, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 94780 + }, + { + "epoch": 0.36079413533491167, + "grad_norm": 0.11956959217786789, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 94790 + }, + { + "epoch": 0.3608321978030343, + "grad_norm": 0.1345692276954651, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 94800 + }, + { + "epoch": 0.36087026027115704, + "grad_norm": 0.12534931302070618, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 94810 + }, + { + "epoch": 0.3609083227392797, + "grad_norm": 0.12161685526371002, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 94820 + }, + { + "epoch": 0.3609463852074024, + "grad_norm": 0.13285884261131287, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 94830 + }, + { + "epoch": 0.36098444767552507, + "grad_norm": 0.12510840594768524, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 94840 + }, + { + "epoch": 0.3610225101436478, + "grad_norm": 0.11829493939876556, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 94850 + }, + { + "epoch": 0.36106057261177044, + "grad_norm": 0.1258384734392166, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 94860 + }, + { + "epoch": 0.3610986350798931, + "grad_norm": 0.1173219308257103, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 94870 + }, + { + "epoch": 0.3611366975480158, + "grad_norm": 0.1262979507446289, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 94880 + }, + { + "epoch": 0.36117476001613846, + "grad_norm": 0.12923787534236908, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 94890 + }, + { + "epoch": 0.3612128224842612, + "grad_norm": 0.12655524909496307, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 94900 + }, + { + "epoch": 0.36125088495238383, + "grad_norm": 0.12080805003643036, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 94910 + }, + { + "epoch": 0.36128894742050655, + "grad_norm": 0.11724785715341568, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 94920 + }, + { + "epoch": 0.3613270098886292, + "grad_norm": 0.12395143508911133, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 94930 + }, + { + "epoch": 0.3613650723567519, + "grad_norm": 0.12734508514404297, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 94940 + }, + { + "epoch": 0.3614031348248746, + "grad_norm": 0.11923015862703323, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 94950 + }, + { + "epoch": 0.3614411972929973, + "grad_norm": 0.11938813328742981, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 94960 + }, + { + "epoch": 0.36147925976111994, + "grad_norm": 0.12107255309820175, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 94970 + }, + { + "epoch": 0.36151732222924265, + "grad_norm": 0.11876919120550156, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 94980 + }, + { + "epoch": 0.3615553846973653, + "grad_norm": 0.1253984272480011, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 94990 + }, + { + "epoch": 0.361593447165488, + "grad_norm": 0.15340100228786469, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 95000 + }, + { + "epoch": 0.3616315096336107, + "grad_norm": 0.12683463096618652, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 95010 + }, + { + "epoch": 0.36166957210173334, + "grad_norm": 0.13147446513175964, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 95020 + }, + { + "epoch": 0.36170763456985605, + "grad_norm": 0.11206177622079849, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 95030 + }, + { + "epoch": 0.3617456970379787, + "grad_norm": 0.1205473318696022, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 95040 + }, + { + "epoch": 0.3617837595061014, + "grad_norm": 0.12132357805967331, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 95050 + }, + { + "epoch": 0.3618218219742241, + "grad_norm": 0.1197810173034668, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 95060 + }, + { + "epoch": 0.3618598844423468, + "grad_norm": 0.12061937898397446, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 95070 + }, + { + "epoch": 0.36189794691046945, + "grad_norm": 0.12413901835680008, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 95080 + }, + { + "epoch": 0.36193600937859216, + "grad_norm": 0.13919556140899658, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 95090 + }, + { + "epoch": 0.3619740718467148, + "grad_norm": 0.12080375850200653, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 95100 + }, + { + "epoch": 0.36201213431483753, + "grad_norm": 0.12811049818992615, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 95110 + }, + { + "epoch": 0.3620501967829602, + "grad_norm": 0.1322176456451416, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 95120 + }, + { + "epoch": 0.3620882592510829, + "grad_norm": 0.13195084035396576, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 95130 + }, + { + "epoch": 0.36212632171920556, + "grad_norm": 0.1257820427417755, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 95140 + }, + { + "epoch": 0.36216438418732827, + "grad_norm": 0.12405338138341904, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 95150 + }, + { + "epoch": 0.3622024466554509, + "grad_norm": 0.12733334302902222, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 95160 + }, + { + "epoch": 0.3622405091235736, + "grad_norm": 0.14176435768604279, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 95170 + }, + { + "epoch": 0.3622785715916963, + "grad_norm": 0.11715825647115707, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 95180 + }, + { + "epoch": 0.36231663405981895, + "grad_norm": 0.12377908080816269, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 95190 + }, + { + "epoch": 0.36235469652794167, + "grad_norm": 0.1338021606206894, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 95200 + }, + { + "epoch": 0.3623927589960643, + "grad_norm": 0.1310984194278717, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 95210 + }, + { + "epoch": 0.36243082146418704, + "grad_norm": 0.13077163696289062, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 95220 + }, + { + "epoch": 0.3624688839323097, + "grad_norm": 0.1179090291261673, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 95230 + }, + { + "epoch": 0.3625069464004324, + "grad_norm": 0.12353909015655518, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 95240 + }, + { + "epoch": 0.36254500886855506, + "grad_norm": 0.12887458503246307, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 95250 + }, + { + "epoch": 0.3625830713366778, + "grad_norm": 0.12522532045841217, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 95260 + }, + { + "epoch": 0.36262113380480043, + "grad_norm": 0.12584923207759857, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 95270 + }, + { + "epoch": 0.36265919627292315, + "grad_norm": 0.12872099876403809, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 95280 + }, + { + "epoch": 0.3626972587410458, + "grad_norm": 0.12893973290920258, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 95290 + }, + { + "epoch": 0.36273532120916846, + "grad_norm": 0.12009762972593307, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 95300 + }, + { + "epoch": 0.3627733836772912, + "grad_norm": 0.13374431431293488, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 95310 + }, + { + "epoch": 0.36281144614541383, + "grad_norm": 0.11058095097541809, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 95320 + }, + { + "epoch": 0.36284950861353654, + "grad_norm": 0.13277429342269897, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 95330 + }, + { + "epoch": 0.3628875710816592, + "grad_norm": 0.11551333218812943, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 95340 + }, + { + "epoch": 0.3629256335497819, + "grad_norm": 0.14505021274089813, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 95350 + }, + { + "epoch": 0.36296369601790457, + "grad_norm": 0.12307033687829971, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 95360 + }, + { + "epoch": 0.3630017584860273, + "grad_norm": 0.14109376072883606, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 95370 + }, + { + "epoch": 0.36303982095414994, + "grad_norm": 0.12665635347366333, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 95380 + }, + { + "epoch": 0.36307788342227265, + "grad_norm": 0.1169692873954773, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 95390 + }, + { + "epoch": 0.3631159458903953, + "grad_norm": 0.1254958063364029, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 95400 + }, + { + "epoch": 0.363154008358518, + "grad_norm": 0.13526247441768646, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 95410 + }, + { + "epoch": 0.3631920708266407, + "grad_norm": 0.12990276515483856, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 95420 + }, + { + "epoch": 0.3632301332947634, + "grad_norm": 0.1282341182231903, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 95430 + }, + { + "epoch": 0.36326819576288605, + "grad_norm": 0.12656110525131226, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 95440 + }, + { + "epoch": 0.3633062582310087, + "grad_norm": 0.11851034313440323, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 95450 + }, + { + "epoch": 0.3633443206991314, + "grad_norm": 0.12452059984207153, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 95460 + }, + { + "epoch": 0.3633823831672541, + "grad_norm": 0.1234283372759819, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 95470 + }, + { + "epoch": 0.3634204456353768, + "grad_norm": 0.126150444149971, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 95480 + }, + { + "epoch": 0.36345850810349944, + "grad_norm": 0.12968218326568604, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 95490 + }, + { + "epoch": 0.36349657057162216, + "grad_norm": 0.14867307245731354, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 95500 + }, + { + "epoch": 0.3635346330397448, + "grad_norm": 0.12558214366436005, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 95510 + }, + { + "epoch": 0.3635726955078675, + "grad_norm": 0.12529504299163818, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 95520 + }, + { + "epoch": 0.3636107579759902, + "grad_norm": 0.11289634555578232, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 95530 + }, + { + "epoch": 0.3636488204441129, + "grad_norm": 0.12560494244098663, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 95540 + }, + { + "epoch": 0.36368688291223555, + "grad_norm": 0.13954435288906097, + "learning_rate": 0.0005, + "loss": 2.1409, + "step": 95550 + }, + { + "epoch": 0.36372494538035827, + "grad_norm": 0.12962469458580017, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 95560 + }, + { + "epoch": 0.3637630078484809, + "grad_norm": 0.12025895714759827, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 95570 + }, + { + "epoch": 0.36380107031660364, + "grad_norm": 0.1236664354801178, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 95580 + }, + { + "epoch": 0.3638391327847263, + "grad_norm": 0.12834928929805756, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 95590 + }, + { + "epoch": 0.36387719525284895, + "grad_norm": 0.12065238505601883, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 95600 + }, + { + "epoch": 0.36391525772097166, + "grad_norm": 0.12980596721172333, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 95610 + }, + { + "epoch": 0.3639533201890943, + "grad_norm": 0.12471610307693481, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 95620 + }, + { + "epoch": 0.36399138265721703, + "grad_norm": 0.1360977739095688, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 95630 + }, + { + "epoch": 0.3640294451253397, + "grad_norm": 0.12732630968093872, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 95640 + }, + { + "epoch": 0.3640675075934624, + "grad_norm": 0.12732800841331482, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 95650 + }, + { + "epoch": 0.36410557006158506, + "grad_norm": 0.12266694754362106, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 95660 + }, + { + "epoch": 0.3641436325297078, + "grad_norm": 0.14251597225666046, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 95670 + }, + { + "epoch": 0.36418169499783043, + "grad_norm": 0.1284990906715393, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 95680 + }, + { + "epoch": 0.36421975746595314, + "grad_norm": 0.12726518511772156, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 95690 + }, + { + "epoch": 0.3642578199340758, + "grad_norm": 0.12449892610311508, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 95700 + }, + { + "epoch": 0.3642958824021985, + "grad_norm": 0.11769961565732956, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 95710 + }, + { + "epoch": 0.36433394487032117, + "grad_norm": 0.12047085165977478, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 95720 + }, + { + "epoch": 0.3643720073384439, + "grad_norm": 0.11222784221172333, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 95730 + }, + { + "epoch": 0.36441006980656654, + "grad_norm": 0.12319763749837875, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 95740 + }, + { + "epoch": 0.3644481322746892, + "grad_norm": 0.11679398268461227, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 95750 + }, + { + "epoch": 0.3644861947428119, + "grad_norm": 0.1261565238237381, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 95760 + }, + { + "epoch": 0.36452425721093457, + "grad_norm": 0.12498561292886734, + "learning_rate": 0.0005, + "loss": 2.1421, + "step": 95770 + }, + { + "epoch": 0.3645623196790573, + "grad_norm": 0.12172937393188477, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 95780 + }, + { + "epoch": 0.36460038214717994, + "grad_norm": 0.11952987313270569, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 95790 + }, + { + "epoch": 0.36463844461530265, + "grad_norm": 0.12958964705467224, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 95800 + }, + { + "epoch": 0.3646765070834253, + "grad_norm": 0.12317771464586258, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 95810 + }, + { + "epoch": 0.364714569551548, + "grad_norm": 0.11658994853496552, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 95820 + }, + { + "epoch": 0.3647526320196707, + "grad_norm": 0.11464099586009979, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 95830 + }, + { + "epoch": 0.3647906944877934, + "grad_norm": 0.1333150416612625, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 95840 + }, + { + "epoch": 0.36482875695591604, + "grad_norm": 0.1296979784965515, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 95850 + }, + { + "epoch": 0.36486681942403876, + "grad_norm": 0.12045649439096451, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 95860 + }, + { + "epoch": 0.3649048818921614, + "grad_norm": 0.12735560536384583, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 95870 + }, + { + "epoch": 0.36494294436028407, + "grad_norm": 0.13241159915924072, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 95880 + }, + { + "epoch": 0.3649810068284068, + "grad_norm": 0.1125500351190567, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 95890 + }, + { + "epoch": 0.36501906929652944, + "grad_norm": 0.1374804973602295, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 95900 + }, + { + "epoch": 0.36505713176465215, + "grad_norm": 0.11424729973077774, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 95910 + }, + { + "epoch": 0.3650951942327748, + "grad_norm": 0.12060584872961044, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 95920 + }, + { + "epoch": 0.3651332567008975, + "grad_norm": 0.12491651624441147, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 95930 + }, + { + "epoch": 0.3651713191690202, + "grad_norm": 0.12484659254550934, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 95940 + }, + { + "epoch": 0.3652093816371429, + "grad_norm": 0.11688201874494553, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 95950 + }, + { + "epoch": 0.36524744410526555, + "grad_norm": 0.12248838692903519, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 95960 + }, + { + "epoch": 0.36528550657338826, + "grad_norm": 0.12331437319517136, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 95970 + }, + { + "epoch": 0.3653235690415109, + "grad_norm": 0.11248493939638138, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 95980 + }, + { + "epoch": 0.36536163150963363, + "grad_norm": 0.12089279294013977, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 95990 + }, + { + "epoch": 0.3653996939777563, + "grad_norm": 0.12537023425102234, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 96000 + }, + { + "epoch": 0.365437756445879, + "grad_norm": 0.11896590143442154, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 96010 + }, + { + "epoch": 0.36547581891400166, + "grad_norm": 0.12212135642766953, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 96020 + }, + { + "epoch": 0.3655138813821243, + "grad_norm": 0.1215914934873581, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 96030 + }, + { + "epoch": 0.36555194385024703, + "grad_norm": 0.12669996917247772, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 96040 + }, + { + "epoch": 0.3655900063183697, + "grad_norm": 0.11996883153915405, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 96050 + }, + { + "epoch": 0.3656280687864924, + "grad_norm": 0.11836867034435272, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 96060 + }, + { + "epoch": 0.36566613125461506, + "grad_norm": 0.1243426501750946, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 96070 + }, + { + "epoch": 0.36570419372273777, + "grad_norm": 0.12142115086317062, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 96080 + }, + { + "epoch": 0.3657422561908604, + "grad_norm": 0.12365086376667023, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 96090 + }, + { + "epoch": 0.36578031865898314, + "grad_norm": 0.12358164042234421, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 96100 + }, + { + "epoch": 0.3658183811271058, + "grad_norm": 0.13264146447181702, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 96110 + }, + { + "epoch": 0.3658564435952285, + "grad_norm": 0.1243383064866066, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 96120 + }, + { + "epoch": 0.36589450606335117, + "grad_norm": 0.11371131241321564, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 96130 + }, + { + "epoch": 0.3659325685314739, + "grad_norm": 0.10895078629255295, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 96140 + }, + { + "epoch": 0.36597063099959654, + "grad_norm": 0.11829078942537308, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 96150 + }, + { + "epoch": 0.36600869346771925, + "grad_norm": 0.12832999229431152, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 96160 + }, + { + "epoch": 0.3660467559358419, + "grad_norm": 0.11920901387929916, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 96170 + }, + { + "epoch": 0.36608481840396456, + "grad_norm": 0.129286989569664, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 96180 + }, + { + "epoch": 0.3661228808720873, + "grad_norm": 0.12464142590761185, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 96190 + }, + { + "epoch": 0.36616094334020993, + "grad_norm": 0.12098777294158936, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 96200 + }, + { + "epoch": 0.36619900580833264, + "grad_norm": 0.11722879111766815, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 96210 + }, + { + "epoch": 0.3662370682764553, + "grad_norm": 0.13351798057556152, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 96220 + }, + { + "epoch": 0.366275130744578, + "grad_norm": 0.12770894169807434, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 96230 + }, + { + "epoch": 0.36631319321270067, + "grad_norm": 0.11819111555814743, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 96240 + }, + { + "epoch": 0.3663512556808234, + "grad_norm": 0.12946787476539612, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 96250 + }, + { + "epoch": 0.36638931814894604, + "grad_norm": 0.1193518415093422, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 96260 + }, + { + "epoch": 0.36642738061706875, + "grad_norm": 0.1213909387588501, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 96270 + }, + { + "epoch": 0.3664654430851914, + "grad_norm": 0.13134944438934326, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 96280 + }, + { + "epoch": 0.3665035055533141, + "grad_norm": 0.12113825231790543, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 96290 + }, + { + "epoch": 0.3665415680214368, + "grad_norm": 0.11982209980487823, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 96300 + }, + { + "epoch": 0.36657963048955944, + "grad_norm": 0.13760797679424286, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 96310 + }, + { + "epoch": 0.36661769295768215, + "grad_norm": 0.12309988588094711, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 96320 + }, + { + "epoch": 0.3666557554258048, + "grad_norm": 0.12127954512834549, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 96330 + }, + { + "epoch": 0.3666938178939275, + "grad_norm": 0.12880012392997742, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 96340 + }, + { + "epoch": 0.3667318803620502, + "grad_norm": 0.12680794298648834, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 96350 + }, + { + "epoch": 0.3667699428301729, + "grad_norm": 0.12031067907810211, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 96360 + }, + { + "epoch": 0.36680800529829555, + "grad_norm": 0.1160052940249443, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 96370 + }, + { + "epoch": 0.36684606776641826, + "grad_norm": 0.12784983217716217, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 96380 + }, + { + "epoch": 0.3668841302345409, + "grad_norm": 0.12080098688602448, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 96390 + }, + { + "epoch": 0.36692219270266363, + "grad_norm": 0.14079447090625763, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 96400 + }, + { + "epoch": 0.3669602551707863, + "grad_norm": 0.12535206973552704, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 96410 + }, + { + "epoch": 0.366998317638909, + "grad_norm": 0.13023337721824646, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 96420 + }, + { + "epoch": 0.36703638010703166, + "grad_norm": 0.11447879672050476, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 96430 + }, + { + "epoch": 0.36707444257515437, + "grad_norm": 0.12368776649236679, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 96440 + }, + { + "epoch": 0.367112505043277, + "grad_norm": 0.1184663251042366, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 96450 + }, + { + "epoch": 0.3671505675113997, + "grad_norm": 0.1336742341518402, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 96460 + }, + { + "epoch": 0.3671886299795224, + "grad_norm": 0.12953265011310577, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 96470 + }, + { + "epoch": 0.36722669244764505, + "grad_norm": 0.12758080661296844, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 96480 + }, + { + "epoch": 0.36726475491576777, + "grad_norm": 0.12322807312011719, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 96490 + }, + { + "epoch": 0.3673028173838904, + "grad_norm": 0.12046000361442566, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 96500 + }, + { + "epoch": 0.36734087985201314, + "grad_norm": 0.12544408440589905, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 96510 + }, + { + "epoch": 0.3673789423201358, + "grad_norm": 0.1448279321193695, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 96520 + }, + { + "epoch": 0.3674170047882585, + "grad_norm": 0.13275164365768433, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 96530 + }, + { + "epoch": 0.36745506725638116, + "grad_norm": 0.11812781542539597, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 96540 + }, + { + "epoch": 0.3674931297245039, + "grad_norm": 0.11883515864610672, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 96550 + }, + { + "epoch": 0.36753119219262653, + "grad_norm": 0.12725114822387695, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 96560 + }, + { + "epoch": 0.36756925466074925, + "grad_norm": 0.23462848365306854, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 96570 + }, + { + "epoch": 0.3676073171288719, + "grad_norm": 0.12591758370399475, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 96580 + }, + { + "epoch": 0.3676453795969946, + "grad_norm": 0.1261547952890396, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 96590 + }, + { + "epoch": 0.36768344206511727, + "grad_norm": 0.11906035989522934, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 96600 + }, + { + "epoch": 0.36772150453323993, + "grad_norm": 0.12901455163955688, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 96610 + }, + { + "epoch": 0.36775956700136264, + "grad_norm": 0.12110006809234619, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 96620 + }, + { + "epoch": 0.3677976294694853, + "grad_norm": 0.13759082555770874, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 96630 + }, + { + "epoch": 0.367835691937608, + "grad_norm": 0.1277286857366562, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 96640 + }, + { + "epoch": 0.36787375440573067, + "grad_norm": 0.12250223755836487, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 96650 + }, + { + "epoch": 0.3679118168738534, + "grad_norm": 0.1282496452331543, + "learning_rate": 0.0005, + "loss": 2.1427, + "step": 96660 + }, + { + "epoch": 0.36794987934197604, + "grad_norm": 0.12418284267187119, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 96670 + }, + { + "epoch": 0.36798794181009875, + "grad_norm": 0.12446416169404984, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 96680 + }, + { + "epoch": 0.3680260042782214, + "grad_norm": 0.12037511169910431, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 96690 + }, + { + "epoch": 0.3680640667463441, + "grad_norm": 0.12514127790927887, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 96700 + }, + { + "epoch": 0.3681021292144668, + "grad_norm": 0.12305998057126999, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 96710 + }, + { + "epoch": 0.3681401916825895, + "grad_norm": 0.12293251603841782, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 96720 + }, + { + "epoch": 0.36817825415071215, + "grad_norm": 0.1332688182592392, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 96730 + }, + { + "epoch": 0.3682163166188348, + "grad_norm": 0.13076834380626678, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 96740 + }, + { + "epoch": 0.3682543790869575, + "grad_norm": 0.11440906673669815, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 96750 + }, + { + "epoch": 0.3682924415550802, + "grad_norm": 0.12589947879314423, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 96760 + }, + { + "epoch": 0.3683305040232029, + "grad_norm": 0.1251448392868042, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 96770 + }, + { + "epoch": 0.36836856649132554, + "grad_norm": 0.12304763495922089, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 96780 + }, + { + "epoch": 0.36840662895944826, + "grad_norm": 0.12357057631015778, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 96790 + }, + { + "epoch": 0.3684446914275709, + "grad_norm": 0.12945859134197235, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 96800 + }, + { + "epoch": 0.3684827538956936, + "grad_norm": 0.13287395238876343, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 96810 + }, + { + "epoch": 0.3685208163638163, + "grad_norm": 0.11265331506729126, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 96820 + }, + { + "epoch": 0.368558878831939, + "grad_norm": 0.12684476375579834, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 96830 + }, + { + "epoch": 0.36859694130006165, + "grad_norm": 0.12589377164840698, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 96840 + }, + { + "epoch": 0.36863500376818437, + "grad_norm": 0.12472228705883026, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 96850 + }, + { + "epoch": 0.368673066236307, + "grad_norm": 0.12273325026035309, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 96860 + }, + { + "epoch": 0.36871112870442974, + "grad_norm": 0.11925730854272842, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 96870 + }, + { + "epoch": 0.3687491911725524, + "grad_norm": 0.13875854015350342, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 96880 + }, + { + "epoch": 0.36878725364067505, + "grad_norm": 0.13009196519851685, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 96890 + }, + { + "epoch": 0.36882531610879776, + "grad_norm": 0.11982984095811844, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 96900 + }, + { + "epoch": 0.3688633785769204, + "grad_norm": 0.12012947350740433, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 96910 + }, + { + "epoch": 0.36890144104504313, + "grad_norm": 0.145431786775589, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 96920 + }, + { + "epoch": 0.3689395035131658, + "grad_norm": 0.11987671256065369, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 96930 + }, + { + "epoch": 0.3689775659812885, + "grad_norm": 0.13114087283611298, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 96940 + }, + { + "epoch": 0.36901562844941116, + "grad_norm": 0.12345308065414429, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 96950 + }, + { + "epoch": 0.36905369091753387, + "grad_norm": 0.11484279483556747, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 96960 + }, + { + "epoch": 0.36909175338565653, + "grad_norm": 0.12712082266807556, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 96970 + }, + { + "epoch": 0.36912981585377924, + "grad_norm": 0.11770997196435928, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 96980 + }, + { + "epoch": 0.3691678783219019, + "grad_norm": 0.11867061257362366, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 96990 + }, + { + "epoch": 0.3692059407900246, + "grad_norm": 0.11773667484521866, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 97000 + }, + { + "epoch": 0.36924400325814727, + "grad_norm": 0.12527793645858765, + "learning_rate": 0.0005, + "loss": 2.1389, + "step": 97010 + }, + { + "epoch": 0.36928206572627, + "grad_norm": 0.12575988471508026, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 97020 + }, + { + "epoch": 0.36932012819439264, + "grad_norm": 0.11121707409620285, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 97030 + }, + { + "epoch": 0.3693581906625153, + "grad_norm": 0.12394191324710846, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 97040 + }, + { + "epoch": 0.369396253130638, + "grad_norm": 0.1266230195760727, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 97050 + }, + { + "epoch": 0.36943431559876067, + "grad_norm": 0.11939870566129684, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 97060 + }, + { + "epoch": 0.3694723780668834, + "grad_norm": 0.11312147229909897, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 97070 + }, + { + "epoch": 0.36951044053500604, + "grad_norm": 0.11626769602298737, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 97080 + }, + { + "epoch": 0.36954850300312875, + "grad_norm": 0.1269378960132599, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 97090 + }, + { + "epoch": 0.3695865654712514, + "grad_norm": 0.12174524366855621, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 97100 + }, + { + "epoch": 0.3696246279393741, + "grad_norm": 0.12147214263677597, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 97110 + }, + { + "epoch": 0.3696626904074968, + "grad_norm": 0.11966443061828613, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 97120 + }, + { + "epoch": 0.3697007528756195, + "grad_norm": 0.12414590269327164, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 97130 + }, + { + "epoch": 0.36973881534374214, + "grad_norm": 0.1292896568775177, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 97140 + }, + { + "epoch": 0.36977687781186486, + "grad_norm": 0.1209682747721672, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 97150 + }, + { + "epoch": 0.3698149402799875, + "grad_norm": 0.1336567997932434, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 97160 + }, + { + "epoch": 0.36985300274811017, + "grad_norm": 0.11845371127128601, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 97170 + }, + { + "epoch": 0.3698910652162329, + "grad_norm": 0.12234880775213242, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 97180 + }, + { + "epoch": 0.36992912768435554, + "grad_norm": 0.13724114000797272, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 97190 + }, + { + "epoch": 0.36996719015247825, + "grad_norm": 0.13921892642974854, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 97200 + }, + { + "epoch": 0.3700052526206009, + "grad_norm": 0.12601223587989807, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 97210 + }, + { + "epoch": 0.3700433150887236, + "grad_norm": 0.13368086516857147, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 97220 + }, + { + "epoch": 0.3700813775568463, + "grad_norm": 0.11279631406068802, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 97230 + }, + { + "epoch": 0.370119440024969, + "grad_norm": 0.11834455281496048, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 97240 + }, + { + "epoch": 0.37015750249309165, + "grad_norm": 0.12333115190267563, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 97250 + }, + { + "epoch": 0.37019556496121436, + "grad_norm": 0.12108000367879868, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 97260 + }, + { + "epoch": 0.370233627429337, + "grad_norm": 0.1163671612739563, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 97270 + }, + { + "epoch": 0.37027168989745973, + "grad_norm": 0.15014775097370148, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 97280 + }, + { + "epoch": 0.3703097523655824, + "grad_norm": 0.14235511422157288, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 97290 + }, + { + "epoch": 0.3703478148337051, + "grad_norm": 0.126591756939888, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 97300 + }, + { + "epoch": 0.37038587730182776, + "grad_norm": 0.12709274888038635, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 97310 + }, + { + "epoch": 0.3704239397699504, + "grad_norm": 0.1244237869977951, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 97320 + }, + { + "epoch": 0.37046200223807313, + "grad_norm": 0.11576223373413086, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 97330 + }, + { + "epoch": 0.3705000647061958, + "grad_norm": 0.12289880961179733, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 97340 + }, + { + "epoch": 0.3705381271743185, + "grad_norm": 0.12008960545063019, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 97350 + }, + { + "epoch": 0.37057618964244116, + "grad_norm": 0.1161564290523529, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 97360 + }, + { + "epoch": 0.37061425211056387, + "grad_norm": 0.1414661556482315, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 97370 + }, + { + "epoch": 0.3706523145786865, + "grad_norm": 0.13043402135372162, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 97380 + }, + { + "epoch": 0.37069037704680924, + "grad_norm": 0.12933848798274994, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 97390 + }, + { + "epoch": 0.3707284395149319, + "grad_norm": 0.13210655748844147, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 97400 + }, + { + "epoch": 0.3707665019830546, + "grad_norm": 0.14086242020130157, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 97410 + }, + { + "epoch": 0.37080456445117727, + "grad_norm": 0.14640408754348755, + "learning_rate": 0.0005, + "loss": 2.1412, + "step": 97420 + }, + { + "epoch": 0.3708426269193, + "grad_norm": 0.1357678920030594, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 97430 + }, + { + "epoch": 0.37088068938742264, + "grad_norm": 0.12262952327728271, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 97440 + }, + { + "epoch": 0.37091875185554535, + "grad_norm": 0.1472138911485672, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 97450 + }, + { + "epoch": 0.370956814323668, + "grad_norm": 0.11382775753736496, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 97460 + }, + { + "epoch": 0.37099487679179066, + "grad_norm": 0.13012194633483887, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 97470 + }, + { + "epoch": 0.3710329392599134, + "grad_norm": 0.12605558335781097, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 97480 + }, + { + "epoch": 0.37107100172803603, + "grad_norm": 0.11173474043607712, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 97490 + }, + { + "epoch": 0.37110906419615874, + "grad_norm": 0.14552360773086548, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 97500 + }, + { + "epoch": 0.3711471266642814, + "grad_norm": 0.11996559053659439, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 97510 + }, + { + "epoch": 0.3711851891324041, + "grad_norm": 0.12549810111522675, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 97520 + }, + { + "epoch": 0.37122325160052677, + "grad_norm": 0.13002847135066986, + "learning_rate": 0.0005, + "loss": 2.1403, + "step": 97530 + }, + { + "epoch": 0.3712613140686495, + "grad_norm": 0.11643203347921371, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 97540 + }, + { + "epoch": 0.37129937653677214, + "grad_norm": 0.1239471510052681, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 97550 + }, + { + "epoch": 0.37133743900489485, + "grad_norm": 0.11400319635868073, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 97560 + }, + { + "epoch": 0.3713755014730175, + "grad_norm": 0.11754616349935532, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 97570 + }, + { + "epoch": 0.3714135639411402, + "grad_norm": 0.12323588132858276, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 97580 + }, + { + "epoch": 0.3714516264092629, + "grad_norm": 0.12451520562171936, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 97590 + }, + { + "epoch": 0.37148968887738554, + "grad_norm": 0.11871838569641113, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 97600 + }, + { + "epoch": 0.37152775134550825, + "grad_norm": 0.1297159045934677, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 97610 + }, + { + "epoch": 0.3715658138136309, + "grad_norm": 0.1391223818063736, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 97620 + }, + { + "epoch": 0.3716038762817536, + "grad_norm": 0.1261225789785385, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 97630 + }, + { + "epoch": 0.3716419387498763, + "grad_norm": 0.12182370573282242, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 97640 + }, + { + "epoch": 0.371680001217999, + "grad_norm": 0.12699827551841736, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 97650 + }, + { + "epoch": 0.37171806368612165, + "grad_norm": 0.11652454733848572, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 97660 + }, + { + "epoch": 0.37175612615424436, + "grad_norm": 0.1208571121096611, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 97670 + }, + { + "epoch": 0.371794188622367, + "grad_norm": 0.13136690855026245, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 97680 + }, + { + "epoch": 0.37183225109048973, + "grad_norm": 0.11939571052789688, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 97690 + }, + { + "epoch": 0.3718703135586124, + "grad_norm": 0.12099155783653259, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 97700 + }, + { + "epoch": 0.3719083760267351, + "grad_norm": 0.12223554402589798, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 97710 + }, + { + "epoch": 0.37194643849485776, + "grad_norm": 0.12005815654993057, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 97720 + }, + { + "epoch": 0.37198450096298047, + "grad_norm": 0.12749920785427094, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 97730 + }, + { + "epoch": 0.3720225634311031, + "grad_norm": 0.12131239473819733, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 97740 + }, + { + "epoch": 0.3720606258992258, + "grad_norm": 0.12197811156511307, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 97750 + }, + { + "epoch": 0.3720986883673485, + "grad_norm": 0.13216941058635712, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 97760 + }, + { + "epoch": 0.37213675083547115, + "grad_norm": 0.11797917634248734, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 97770 + }, + { + "epoch": 0.37217481330359387, + "grad_norm": 0.1313115358352661, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 97780 + }, + { + "epoch": 0.3722128757717165, + "grad_norm": 0.11875154823064804, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 97790 + }, + { + "epoch": 0.37225093823983924, + "grad_norm": 0.12649646401405334, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 97800 + }, + { + "epoch": 0.3722890007079619, + "grad_norm": 0.128423273563385, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 97810 + }, + { + "epoch": 0.3723270631760846, + "grad_norm": 0.1393405646085739, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 97820 + }, + { + "epoch": 0.37236512564420726, + "grad_norm": 0.13227048516273499, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 97830 + }, + { + "epoch": 0.37240318811233, + "grad_norm": 0.13055221736431122, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 97840 + }, + { + "epoch": 0.37244125058045263, + "grad_norm": 0.11716745048761368, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 97850 + }, + { + "epoch": 0.37247931304857534, + "grad_norm": 0.12659123539924622, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 97860 + }, + { + "epoch": 0.372517375516698, + "grad_norm": 0.13839322328567505, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 97870 + }, + { + "epoch": 0.3725554379848207, + "grad_norm": 0.12499116361141205, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 97880 + }, + { + "epoch": 0.37259350045294337, + "grad_norm": 0.12236038595438004, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 97890 + }, + { + "epoch": 0.37263156292106603, + "grad_norm": 0.1392621546983719, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 97900 + }, + { + "epoch": 0.37266962538918874, + "grad_norm": 0.1261049211025238, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 97910 + }, + { + "epoch": 0.3727076878573114, + "grad_norm": 0.15030497312545776, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 97920 + }, + { + "epoch": 0.3727457503254341, + "grad_norm": 0.1456809937953949, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 97930 + }, + { + "epoch": 0.37278381279355677, + "grad_norm": 0.14672748744487762, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 97940 + }, + { + "epoch": 0.3728218752616795, + "grad_norm": 0.12252899259328842, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 97950 + }, + { + "epoch": 0.37285993772980214, + "grad_norm": 0.14606109261512756, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 97960 + }, + { + "epoch": 0.37289800019792485, + "grad_norm": 0.12018848210573196, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 97970 + }, + { + "epoch": 0.3729360626660475, + "grad_norm": 0.1229386255145073, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 97980 + }, + { + "epoch": 0.3729741251341702, + "grad_norm": 0.11852700263261795, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 97990 + }, + { + "epoch": 0.3730121876022929, + "grad_norm": 0.13460673391819, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 98000 + }, + { + "epoch": 0.3730502500704156, + "grad_norm": 0.140608549118042, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 98010 + }, + { + "epoch": 0.37308831253853825, + "grad_norm": 0.11561145633459091, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 98020 + }, + { + "epoch": 0.3731263750066609, + "grad_norm": 0.1210857629776001, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 98030 + }, + { + "epoch": 0.3731644374747836, + "grad_norm": 0.1265321522951126, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 98040 + }, + { + "epoch": 0.3732024999429063, + "grad_norm": 0.12456094473600388, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 98050 + }, + { + "epoch": 0.373240562411029, + "grad_norm": 0.1190691590309143, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 98060 + }, + { + "epoch": 0.37327862487915164, + "grad_norm": 0.11460399627685547, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 98070 + }, + { + "epoch": 0.37331668734727436, + "grad_norm": 0.1223832294344902, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 98080 + }, + { + "epoch": 0.373354749815397, + "grad_norm": 0.12511587142944336, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 98090 + }, + { + "epoch": 0.3733928122835197, + "grad_norm": 0.13690148293972015, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 98100 + }, + { + "epoch": 0.3734308747516424, + "grad_norm": 0.12537544965744019, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 98110 + }, + { + "epoch": 0.3734689372197651, + "grad_norm": 0.13309504091739655, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 98120 + }, + { + "epoch": 0.37350699968788775, + "grad_norm": 0.12549757957458496, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 98130 + }, + { + "epoch": 0.37354506215601047, + "grad_norm": 0.11821205914020538, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 98140 + }, + { + "epoch": 0.3735831246241331, + "grad_norm": 0.12350712716579437, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 98150 + }, + { + "epoch": 0.37362118709225584, + "grad_norm": 0.12245646119117737, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 98160 + }, + { + "epoch": 0.3736592495603785, + "grad_norm": 0.11302000284194946, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 98170 + }, + { + "epoch": 0.37369731202850115, + "grad_norm": 0.12072186172008514, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 98180 + }, + { + "epoch": 0.37373537449662386, + "grad_norm": 0.12735703587532043, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 98190 + }, + { + "epoch": 0.3737734369647465, + "grad_norm": 0.12989743053913116, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 98200 + }, + { + "epoch": 0.37381149943286923, + "grad_norm": 0.12953096628189087, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 98210 + }, + { + "epoch": 0.3738495619009919, + "grad_norm": 0.12121468782424927, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 98220 + }, + { + "epoch": 0.3738876243691146, + "grad_norm": 0.11379808187484741, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 98230 + }, + { + "epoch": 0.37392568683723726, + "grad_norm": 0.12749461829662323, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 98240 + }, + { + "epoch": 0.37396374930535997, + "grad_norm": 0.133274644613266, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 98250 + }, + { + "epoch": 0.37400181177348263, + "grad_norm": 0.14356647431850433, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 98260 + }, + { + "epoch": 0.37403987424160534, + "grad_norm": 0.1227138563990593, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 98270 + }, + { + "epoch": 0.374077936709728, + "grad_norm": 0.13839580118656158, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 98280 + }, + { + "epoch": 0.3741159991778507, + "grad_norm": 0.15715768933296204, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 98290 + }, + { + "epoch": 0.37415406164597337, + "grad_norm": 0.13170473277568817, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 98300 + }, + { + "epoch": 0.3741921241140961, + "grad_norm": 0.13743096590042114, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 98310 + }, + { + "epoch": 0.37423018658221874, + "grad_norm": 1.8420484066009521, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 98320 + }, + { + "epoch": 0.3742682490503414, + "grad_norm": 0.1318242996931076, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 98330 + }, + { + "epoch": 0.3743063115184641, + "grad_norm": 0.11519080400466919, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 98340 + }, + { + "epoch": 0.37434437398658676, + "grad_norm": 0.13783110678195953, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 98350 + }, + { + "epoch": 0.3743824364547095, + "grad_norm": 0.18156218528747559, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 98360 + }, + { + "epoch": 0.37442049892283213, + "grad_norm": 0.13677464425563812, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 98370 + }, + { + "epoch": 0.37445856139095485, + "grad_norm": 0.12956520915031433, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 98380 + }, + { + "epoch": 0.3744966238590775, + "grad_norm": 0.1192094013094902, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 98390 + }, + { + "epoch": 0.3745346863272002, + "grad_norm": 0.1302703469991684, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 98400 + }, + { + "epoch": 0.3745727487953229, + "grad_norm": 0.11764563620090485, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 98410 + }, + { + "epoch": 0.3746108112634456, + "grad_norm": 0.12831349670886993, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 98420 + }, + { + "epoch": 0.37464887373156824, + "grad_norm": 0.1260351687669754, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 98430 + }, + { + "epoch": 0.37468693619969096, + "grad_norm": 0.12819142639636993, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 98440 + }, + { + "epoch": 0.3747249986678136, + "grad_norm": 0.14196482300758362, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 98450 + }, + { + "epoch": 0.3747630611359363, + "grad_norm": 0.12788952887058258, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 98460 + }, + { + "epoch": 0.374801123604059, + "grad_norm": 0.13228143751621246, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 98470 + }, + { + "epoch": 0.37483918607218164, + "grad_norm": 0.1315939873456955, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 98480 + }, + { + "epoch": 0.37487724854030435, + "grad_norm": 0.12740527093410492, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 98490 + }, + { + "epoch": 0.374915311008427, + "grad_norm": 0.12354233860969543, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 98500 + }, + { + "epoch": 0.3749533734765497, + "grad_norm": 0.1223762109875679, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 98510 + }, + { + "epoch": 0.3749914359446724, + "grad_norm": 0.13310754299163818, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 98520 + }, + { + "epoch": 0.3750294984127951, + "grad_norm": 0.12507161498069763, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 98530 + }, + { + "epoch": 0.37506756088091775, + "grad_norm": 0.1294754594564438, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 98540 + }, + { + "epoch": 0.37510562334904046, + "grad_norm": 0.13023291528224945, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 98550 + }, + { + "epoch": 0.3751436858171631, + "grad_norm": 0.14100757241249084, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 98560 + }, + { + "epoch": 0.37518174828528583, + "grad_norm": 0.12759803235530853, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 98570 + }, + { + "epoch": 0.3752198107534085, + "grad_norm": 0.12670016288757324, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 98580 + }, + { + "epoch": 0.3752578732215312, + "grad_norm": 0.11383096128702164, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 98590 + }, + { + "epoch": 0.37529593568965386, + "grad_norm": 0.11552945524454117, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 98600 + }, + { + "epoch": 0.3753339981577765, + "grad_norm": 0.10721899569034576, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 98610 + }, + { + "epoch": 0.37537206062589923, + "grad_norm": 0.13379792869091034, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 98620 + }, + { + "epoch": 0.3754101230940219, + "grad_norm": 0.11973940581083298, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 98630 + }, + { + "epoch": 0.3754481855621446, + "grad_norm": 0.12449324131011963, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 98640 + }, + { + "epoch": 0.37548624803026726, + "grad_norm": 0.11978937685489655, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 98650 + }, + { + "epoch": 0.37552431049838997, + "grad_norm": 0.128646120429039, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 98660 + }, + { + "epoch": 0.3755623729665126, + "grad_norm": 0.13415206968784332, + "learning_rate": 0.0005, + "loss": 2.1469, + "step": 98670 + }, + { + "epoch": 0.37560043543463534, + "grad_norm": 0.12093635648488998, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 98680 + }, + { + "epoch": 0.375638497902758, + "grad_norm": 0.11120793223381042, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 98690 + }, + { + "epoch": 0.3756765603708807, + "grad_norm": 0.11673329770565033, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 98700 + }, + { + "epoch": 0.37571462283900336, + "grad_norm": 0.1250150054693222, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 98710 + }, + { + "epoch": 0.3757526853071261, + "grad_norm": 0.13988950848579407, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 98720 + }, + { + "epoch": 0.37579074777524873, + "grad_norm": 0.13316217064857483, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 98730 + }, + { + "epoch": 0.37582881024337145, + "grad_norm": 0.13398806750774384, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 98740 + }, + { + "epoch": 0.3758668727114941, + "grad_norm": 0.1263556182384491, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 98750 + }, + { + "epoch": 0.37590493517961676, + "grad_norm": 0.12224937975406647, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 98760 + }, + { + "epoch": 0.3759429976477395, + "grad_norm": 0.12698322534561157, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 98770 + }, + { + "epoch": 0.37598106011586213, + "grad_norm": 0.12896743416786194, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 98780 + }, + { + "epoch": 0.37601912258398484, + "grad_norm": 0.12525705993175507, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 98790 + }, + { + "epoch": 0.3760571850521075, + "grad_norm": 0.12810900807380676, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 98800 + }, + { + "epoch": 0.3760952475202302, + "grad_norm": 0.1269560009241104, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 98810 + }, + { + "epoch": 0.37613330998835287, + "grad_norm": 0.12469108402729034, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 98820 + }, + { + "epoch": 0.3761713724564756, + "grad_norm": 0.1346503645181656, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 98830 + }, + { + "epoch": 0.37620943492459824, + "grad_norm": 0.11811628192663193, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 98840 + }, + { + "epoch": 0.37624749739272095, + "grad_norm": 0.13591505587100983, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 98850 + }, + { + "epoch": 0.3762855598608436, + "grad_norm": 0.1304771453142166, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 98860 + }, + { + "epoch": 0.3763236223289663, + "grad_norm": 0.15867163240909576, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 98870 + }, + { + "epoch": 0.376361684797089, + "grad_norm": 0.14701250195503235, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 98880 + }, + { + "epoch": 0.3763997472652117, + "grad_norm": 0.12889814376831055, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 98890 + }, + { + "epoch": 0.37643780973333435, + "grad_norm": 0.1383143812417984, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 98900 + }, + { + "epoch": 0.376475872201457, + "grad_norm": 0.1334122270345688, + "learning_rate": 0.0005, + "loss": 2.138, + "step": 98910 + }, + { + "epoch": 0.3765139346695797, + "grad_norm": 0.12685805559158325, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 98920 + }, + { + "epoch": 0.3765519971377024, + "grad_norm": 0.11598322540521622, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 98930 + }, + { + "epoch": 0.3765900596058251, + "grad_norm": 0.12357661873102188, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 98940 + }, + { + "epoch": 0.37662812207394775, + "grad_norm": 0.11699126660823822, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 98950 + }, + { + "epoch": 0.37666618454207046, + "grad_norm": 0.13793028891086578, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 98960 + }, + { + "epoch": 0.3767042470101931, + "grad_norm": 0.12064649164676666, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 98970 + }, + { + "epoch": 0.37674230947831583, + "grad_norm": 0.12668059766292572, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 98980 + }, + { + "epoch": 0.3767803719464385, + "grad_norm": 0.13221696019172668, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 98990 + }, + { + "epoch": 0.3768184344145612, + "grad_norm": 0.12401723116636276, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 99000 + }, + { + "epoch": 0.37685649688268386, + "grad_norm": 0.1281975507736206, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 99010 + }, + { + "epoch": 0.37689455935080657, + "grad_norm": 0.12351026386022568, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 99020 + }, + { + "epoch": 0.3769326218189292, + "grad_norm": 0.1147201880812645, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 99030 + }, + { + "epoch": 0.3769706842870519, + "grad_norm": 0.11766034364700317, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 99040 + }, + { + "epoch": 0.3770087467551746, + "grad_norm": 0.12974408268928528, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 99050 + }, + { + "epoch": 0.37704680922329725, + "grad_norm": 0.1237754076719284, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 99060 + }, + { + "epoch": 0.37708487169141996, + "grad_norm": 0.13523773849010468, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 99070 + }, + { + "epoch": 0.3771229341595426, + "grad_norm": 0.13876040279865265, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 99080 + }, + { + "epoch": 0.37716099662766533, + "grad_norm": 0.13175301253795624, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 99090 + }, + { + "epoch": 0.377199059095788, + "grad_norm": 0.1185319572687149, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 99100 + }, + { + "epoch": 0.3772371215639107, + "grad_norm": 0.12355204671621323, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 99110 + }, + { + "epoch": 0.37727518403203336, + "grad_norm": 0.12566806375980377, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 99120 + }, + { + "epoch": 0.3773132465001561, + "grad_norm": 0.12418641149997711, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 99130 + }, + { + "epoch": 0.37735130896827873, + "grad_norm": 0.14258967339992523, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 99140 + }, + { + "epoch": 0.37738937143640144, + "grad_norm": 0.13339242339134216, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 99150 + }, + { + "epoch": 0.3774274339045241, + "grad_norm": 0.12326198816299438, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 99160 + }, + { + "epoch": 0.3774654963726468, + "grad_norm": 0.13680458068847656, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 99170 + }, + { + "epoch": 0.37750355884076947, + "grad_norm": 0.1330464482307434, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 99180 + }, + { + "epoch": 0.37754162130889213, + "grad_norm": 0.13588985800743103, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 99190 + }, + { + "epoch": 0.37757968377701484, + "grad_norm": 0.1276908963918686, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 99200 + }, + { + "epoch": 0.3776177462451375, + "grad_norm": 0.11521900445222855, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 99210 + }, + { + "epoch": 0.3776558087132602, + "grad_norm": 0.12597927451133728, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 99220 + }, + { + "epoch": 0.37769387118138287, + "grad_norm": 0.1277197152376175, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 99230 + }, + { + "epoch": 0.3777319336495056, + "grad_norm": 0.11337390542030334, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 99240 + }, + { + "epoch": 0.37776999611762824, + "grad_norm": 0.13153591752052307, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 99250 + }, + { + "epoch": 0.37780805858575095, + "grad_norm": 0.12262382358312607, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 99260 + }, + { + "epoch": 0.3778461210538736, + "grad_norm": 0.1202477365732193, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 99270 + }, + { + "epoch": 0.3778841835219963, + "grad_norm": 0.1355462670326233, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 99280 + }, + { + "epoch": 0.377922245990119, + "grad_norm": 0.12229014188051224, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 99290 + }, + { + "epoch": 0.3779603084582417, + "grad_norm": 0.1229465901851654, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 99300 + }, + { + "epoch": 0.37799837092636435, + "grad_norm": 0.11344870924949646, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 99310 + }, + { + "epoch": 0.37803643339448706, + "grad_norm": 0.12172535806894302, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 99320 + }, + { + "epoch": 0.3780744958626097, + "grad_norm": 0.1417960375547409, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 99330 + }, + { + "epoch": 0.3781125583307324, + "grad_norm": 0.12265501916408539, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 99340 + }, + { + "epoch": 0.3781506207988551, + "grad_norm": 0.1277245581150055, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 99350 + }, + { + "epoch": 0.37818868326697774, + "grad_norm": 0.12461922317743301, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 99360 + }, + { + "epoch": 0.37822674573510046, + "grad_norm": 0.11946941912174225, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 99370 + }, + { + "epoch": 0.3782648082032231, + "grad_norm": 0.1216985285282135, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 99380 + }, + { + "epoch": 0.3783028706713458, + "grad_norm": 0.14715443551540375, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 99390 + }, + { + "epoch": 0.3783409331394685, + "grad_norm": 0.12096799165010452, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 99400 + }, + { + "epoch": 0.3783789956075912, + "grad_norm": 0.11835294961929321, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 99410 + }, + { + "epoch": 0.37841705807571385, + "grad_norm": 0.13805124163627625, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 99420 + }, + { + "epoch": 0.37845512054383657, + "grad_norm": 0.12678532302379608, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 99430 + }, + { + "epoch": 0.3784931830119592, + "grad_norm": 0.11587246507406235, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 99440 + }, + { + "epoch": 0.37853124548008193, + "grad_norm": 0.11982610076665878, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 99450 + }, + { + "epoch": 0.3785693079482046, + "grad_norm": 0.1394578516483307, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 99460 + }, + { + "epoch": 0.37860737041632725, + "grad_norm": 0.12834946811199188, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 99470 + }, + { + "epoch": 0.37864543288444996, + "grad_norm": 0.12075355648994446, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 99480 + }, + { + "epoch": 0.3786834953525726, + "grad_norm": 0.11674944311380386, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 99490 + }, + { + "epoch": 0.37872155782069533, + "grad_norm": 0.11884240806102753, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 99500 + }, + { + "epoch": 0.378759620288818, + "grad_norm": 0.12764117121696472, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 99510 + }, + { + "epoch": 0.3787976827569407, + "grad_norm": 0.12957796454429626, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 99520 + }, + { + "epoch": 0.37883574522506336, + "grad_norm": 0.11994970589876175, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 99530 + }, + { + "epoch": 0.37887380769318607, + "grad_norm": 0.1316346526145935, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 99540 + }, + { + "epoch": 0.37891187016130873, + "grad_norm": 0.1294875144958496, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 99550 + }, + { + "epoch": 0.37894993262943144, + "grad_norm": 0.1248297244310379, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 99560 + }, + { + "epoch": 0.3789879950975541, + "grad_norm": 0.11559437960386276, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 99570 + }, + { + "epoch": 0.3790260575656768, + "grad_norm": 0.12500129640102386, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 99580 + }, + { + "epoch": 0.37906412003379947, + "grad_norm": 0.1193610355257988, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 99590 + }, + { + "epoch": 0.3791021825019222, + "grad_norm": 0.13608145713806152, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 99600 + }, + { + "epoch": 0.37914024497004484, + "grad_norm": 0.12819543480873108, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 99610 + }, + { + "epoch": 0.3791783074381675, + "grad_norm": 0.13131417334079742, + "learning_rate": 0.0005, + "loss": 2.1401, + "step": 99620 + }, + { + "epoch": 0.3792163699062902, + "grad_norm": 0.14582496881484985, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 99630 + }, + { + "epoch": 0.37925443237441286, + "grad_norm": 0.12820561230182648, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 99640 + }, + { + "epoch": 0.3792924948425356, + "grad_norm": 0.1195472702383995, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 99650 + }, + { + "epoch": 0.37933055731065823, + "grad_norm": 0.13277654349803925, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 99660 + }, + { + "epoch": 0.37936861977878095, + "grad_norm": 0.10699354112148285, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 99670 + }, + { + "epoch": 0.3794066822469036, + "grad_norm": 0.13318446278572083, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 99680 + }, + { + "epoch": 0.3794447447150263, + "grad_norm": 0.1301574856042862, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 99690 + }, + { + "epoch": 0.379482807183149, + "grad_norm": 0.13273996114730835, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 99700 + }, + { + "epoch": 0.3795208696512717, + "grad_norm": 0.119916170835495, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 99710 + }, + { + "epoch": 0.37955893211939434, + "grad_norm": 0.12175925821065903, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 99720 + }, + { + "epoch": 0.37959699458751706, + "grad_norm": 0.11958014965057373, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 99730 + }, + { + "epoch": 0.3796350570556397, + "grad_norm": 0.12307074666023254, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 99740 + }, + { + "epoch": 0.3796731195237624, + "grad_norm": 0.1192697286605835, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 99750 + }, + { + "epoch": 0.3797111819918851, + "grad_norm": 0.11530807614326477, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 99760 + }, + { + "epoch": 0.37974924446000774, + "grad_norm": 0.12537164986133575, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 99770 + }, + { + "epoch": 0.37978730692813045, + "grad_norm": 0.14167091250419617, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 99780 + }, + { + "epoch": 0.3798253693962531, + "grad_norm": 0.13286343216896057, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 99790 + }, + { + "epoch": 0.3798634318643758, + "grad_norm": 0.13088318705558777, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 99800 + }, + { + "epoch": 0.3799014943324985, + "grad_norm": 0.1335020512342453, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 99810 + }, + { + "epoch": 0.3799395568006212, + "grad_norm": 0.12035942822694778, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 99820 + }, + { + "epoch": 0.37997761926874385, + "grad_norm": 0.13277862966060638, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 99830 + }, + { + "epoch": 0.38001568173686656, + "grad_norm": 0.12266730517148972, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 99840 + }, + { + "epoch": 0.3800537442049892, + "grad_norm": 0.12493303418159485, + "learning_rate": 0.0005, + "loss": 2.1432, + "step": 99850 + }, + { + "epoch": 0.38009180667311193, + "grad_norm": 0.1336122304201126, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 99860 + }, + { + "epoch": 0.3801298691412346, + "grad_norm": 0.12226969748735428, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 99870 + }, + { + "epoch": 0.3801679316093573, + "grad_norm": 0.1311367005109787, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 99880 + }, + { + "epoch": 0.38020599407747996, + "grad_norm": 0.12251698970794678, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 99890 + }, + { + "epoch": 0.3802440565456026, + "grad_norm": 0.1320013850927353, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 99900 + }, + { + "epoch": 0.38028211901372533, + "grad_norm": 0.12274371087551117, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 99910 + }, + { + "epoch": 0.380320181481848, + "grad_norm": 0.12575620412826538, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 99920 + }, + { + "epoch": 0.3803582439499707, + "grad_norm": 0.11021941900253296, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 99930 + }, + { + "epoch": 0.38039630641809336, + "grad_norm": 0.1258212924003601, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 99940 + }, + { + "epoch": 0.38043436888621607, + "grad_norm": 0.14333589375019073, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 99950 + }, + { + "epoch": 0.3804724313543387, + "grad_norm": 0.12342032790184021, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 99960 + }, + { + "epoch": 0.38051049382246144, + "grad_norm": 0.12532587349414825, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 99970 + }, + { + "epoch": 0.3805485562905841, + "grad_norm": 0.12661361694335938, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 99980 + }, + { + "epoch": 0.3805866187587068, + "grad_norm": 0.13249489665031433, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 99990 + }, + { + "epoch": 0.38062468122682946, + "grad_norm": 0.11751711368560791, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 100000 + }, + { + "epoch": 0.3806627436949522, + "grad_norm": 0.1317417472600937, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 100010 + }, + { + "epoch": 0.38070080616307483, + "grad_norm": 0.1290077269077301, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 100020 + }, + { + "epoch": 0.38073886863119755, + "grad_norm": 0.1366860419511795, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 100030 + }, + { + "epoch": 0.3807769310993202, + "grad_norm": 0.11924261599779129, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 100040 + }, + { + "epoch": 0.38081499356744286, + "grad_norm": 0.11624366044998169, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 100050 + }, + { + "epoch": 0.3808530560355656, + "grad_norm": 0.12848447263240814, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 100060 + }, + { + "epoch": 0.38089111850368823, + "grad_norm": 0.13916176557540894, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 100070 + }, + { + "epoch": 0.38092918097181094, + "grad_norm": 0.13948209583759308, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 100080 + }, + { + "epoch": 0.3809672434399336, + "grad_norm": 0.12091422080993652, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 100090 + }, + { + "epoch": 0.3810053059080563, + "grad_norm": 0.12870047986507416, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 100100 + }, + { + "epoch": 0.38104336837617897, + "grad_norm": 0.11702585965394974, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 100110 + }, + { + "epoch": 0.3810814308443017, + "grad_norm": 0.1217094287276268, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 100120 + }, + { + "epoch": 0.38111949331242434, + "grad_norm": 0.13027448952198029, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 100130 + }, + { + "epoch": 0.38115755578054705, + "grad_norm": 0.12258381396532059, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 100140 + }, + { + "epoch": 0.3811956182486697, + "grad_norm": 0.12027046829462051, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 100150 + }, + { + "epoch": 0.3812336807167924, + "grad_norm": 0.13201120495796204, + "learning_rate": 0.0005, + "loss": 2.1395, + "step": 100160 + }, + { + "epoch": 0.3812717431849151, + "grad_norm": 0.12681923806667328, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 100170 + }, + { + "epoch": 0.3813098056530378, + "grad_norm": 0.12848328053951263, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 100180 + }, + { + "epoch": 0.38134786812116045, + "grad_norm": 0.12477723509073257, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 100190 + }, + { + "epoch": 0.3813859305892831, + "grad_norm": 0.13087469339370728, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 100200 + }, + { + "epoch": 0.3814239930574058, + "grad_norm": 0.12919747829437256, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 100210 + }, + { + "epoch": 0.3814620555255285, + "grad_norm": 0.12140429764986038, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 100220 + }, + { + "epoch": 0.3815001179936512, + "grad_norm": 0.11911433190107346, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 100230 + }, + { + "epoch": 0.38153818046177385, + "grad_norm": 0.12428278475999832, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 100240 + }, + { + "epoch": 0.38157624292989656, + "grad_norm": 0.12575286626815796, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 100250 + }, + { + "epoch": 0.3816143053980192, + "grad_norm": 0.14253956079483032, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 100260 + }, + { + "epoch": 0.38165236786614193, + "grad_norm": 0.11677713692188263, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 100270 + }, + { + "epoch": 0.3816904303342646, + "grad_norm": 0.11532783508300781, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 100280 + }, + { + "epoch": 0.3817284928023873, + "grad_norm": 0.12663224339485168, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 100290 + }, + { + "epoch": 0.38176655527050996, + "grad_norm": 0.1268301010131836, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 100300 + }, + { + "epoch": 0.38180461773863267, + "grad_norm": 0.14021086692810059, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 100310 + }, + { + "epoch": 0.3818426802067553, + "grad_norm": 0.1259777545928955, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 100320 + }, + { + "epoch": 0.381880742674878, + "grad_norm": 0.11848200857639313, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 100330 + }, + { + "epoch": 0.3819188051430007, + "grad_norm": 0.12533099949359894, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 100340 + }, + { + "epoch": 0.38195686761112335, + "grad_norm": 0.12691478431224823, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 100350 + }, + { + "epoch": 0.38199493007924606, + "grad_norm": 0.13623826205730438, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 100360 + }, + { + "epoch": 0.3820329925473687, + "grad_norm": 0.11483057588338852, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 100370 + }, + { + "epoch": 0.38207105501549143, + "grad_norm": 0.12684819102287292, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 100380 + }, + { + "epoch": 0.3821091174836141, + "grad_norm": 0.11294738203287125, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 100390 + }, + { + "epoch": 0.3821471799517368, + "grad_norm": 0.12915776669979095, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 100400 + }, + { + "epoch": 0.38218524241985946, + "grad_norm": 0.11914768069982529, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 100410 + }, + { + "epoch": 0.3822233048879822, + "grad_norm": 0.11203866451978683, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 100420 + }, + { + "epoch": 0.38226136735610483, + "grad_norm": 0.1280486285686493, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 100430 + }, + { + "epoch": 0.38229942982422754, + "grad_norm": 0.12039019912481308, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 100440 + }, + { + "epoch": 0.3823374922923502, + "grad_norm": 0.12762552499771118, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 100450 + }, + { + "epoch": 0.3823755547604729, + "grad_norm": 0.1186232715845108, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 100460 + }, + { + "epoch": 0.38241361722859557, + "grad_norm": 0.12590287625789642, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 100470 + }, + { + "epoch": 0.3824516796967182, + "grad_norm": 0.1216677650809288, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 100480 + }, + { + "epoch": 0.38248974216484094, + "grad_norm": 0.12391503155231476, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 100490 + }, + { + "epoch": 0.3825278046329636, + "grad_norm": 0.12275954335927963, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 100500 + }, + { + "epoch": 0.3825658671010863, + "grad_norm": 0.12779061496257782, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 100510 + }, + { + "epoch": 0.38260392956920897, + "grad_norm": 0.12359108030796051, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 100520 + }, + { + "epoch": 0.3826419920373317, + "grad_norm": 0.13737986981868744, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 100530 + }, + { + "epoch": 0.38268005450545434, + "grad_norm": 0.12026502937078476, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 100540 + }, + { + "epoch": 0.38271811697357705, + "grad_norm": 0.13088242709636688, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 100550 + }, + { + "epoch": 0.3827561794416997, + "grad_norm": 0.1281553953886032, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 100560 + }, + { + "epoch": 0.3827942419098224, + "grad_norm": 0.1370796263217926, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 100570 + }, + { + "epoch": 0.3828323043779451, + "grad_norm": 0.12698887288570404, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 100580 + }, + { + "epoch": 0.3828703668460678, + "grad_norm": 0.13594062626361847, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 100590 + }, + { + "epoch": 0.38290842931419045, + "grad_norm": 0.1371491253376007, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 100600 + }, + { + "epoch": 0.38294649178231316, + "grad_norm": 0.11673377454280853, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 100610 + }, + { + "epoch": 0.3829845542504358, + "grad_norm": 0.12888579070568085, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 100620 + }, + { + "epoch": 0.3830226167185585, + "grad_norm": 0.11862296611070633, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 100630 + }, + { + "epoch": 0.3830606791866812, + "grad_norm": 0.1351194977760315, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 100640 + }, + { + "epoch": 0.38309874165480384, + "grad_norm": 0.12605620920658112, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 100650 + }, + { + "epoch": 0.38313680412292656, + "grad_norm": 0.13026510179042816, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 100660 + }, + { + "epoch": 0.3831748665910492, + "grad_norm": 0.13861605525016785, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 100670 + }, + { + "epoch": 0.3832129290591719, + "grad_norm": 0.40944308042526245, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 100680 + }, + { + "epoch": 0.3832509915272946, + "grad_norm": 0.12580206990242004, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 100690 + }, + { + "epoch": 0.3832890539954173, + "grad_norm": 0.13019052147865295, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 100700 + }, + { + "epoch": 0.38332711646353995, + "grad_norm": 0.13055548071861267, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 100710 + }, + { + "epoch": 0.38336517893166266, + "grad_norm": 0.13237163424491882, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 100720 + }, + { + "epoch": 0.3834032413997853, + "grad_norm": 0.1250726729631424, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 100730 + }, + { + "epoch": 0.38344130386790803, + "grad_norm": 0.13131138682365417, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 100740 + }, + { + "epoch": 0.3834793663360307, + "grad_norm": 0.13097605109214783, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 100750 + }, + { + "epoch": 0.3835174288041534, + "grad_norm": 0.12748023867607117, + "learning_rate": 0.0005, + "loss": 2.1371, + "step": 100760 + }, + { + "epoch": 0.38355549127227606, + "grad_norm": 0.13089165091514587, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 100770 + }, + { + "epoch": 0.3835935537403987, + "grad_norm": 0.11884886026382446, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 100780 + }, + { + "epoch": 0.38363161620852143, + "grad_norm": 0.1299116313457489, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 100790 + }, + { + "epoch": 0.3836696786766441, + "grad_norm": 0.12401427328586578, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 100800 + }, + { + "epoch": 0.3837077411447668, + "grad_norm": 0.12213096767663956, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 100810 + }, + { + "epoch": 0.38374580361288946, + "grad_norm": 0.13122443854808807, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 100820 + }, + { + "epoch": 0.38378386608101217, + "grad_norm": 0.1398945450782776, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 100830 + }, + { + "epoch": 0.3838219285491348, + "grad_norm": 0.11701014637947083, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 100840 + }, + { + "epoch": 0.38385999101725754, + "grad_norm": 0.12747806310653687, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 100850 + }, + { + "epoch": 0.3838980534853802, + "grad_norm": 0.13189804553985596, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 100860 + }, + { + "epoch": 0.3839361159535029, + "grad_norm": 0.12226402014493942, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 100870 + }, + { + "epoch": 0.38397417842162557, + "grad_norm": 0.11916231364011765, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 100880 + }, + { + "epoch": 0.3840122408897483, + "grad_norm": 0.12730424106121063, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 100890 + }, + { + "epoch": 0.38405030335787094, + "grad_norm": 0.13549363613128662, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 100900 + }, + { + "epoch": 0.3840883658259936, + "grad_norm": 0.13349735736846924, + "learning_rate": 0.0005, + "loss": 2.1487, + "step": 100910 + }, + { + "epoch": 0.3841264282941163, + "grad_norm": 0.14545609056949615, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 100920 + }, + { + "epoch": 0.38416449076223896, + "grad_norm": 0.12006688117980957, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 100930 + }, + { + "epoch": 0.3842025532303617, + "grad_norm": 0.1416459083557129, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 100940 + }, + { + "epoch": 0.38424061569848433, + "grad_norm": 0.1135077103972435, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 100950 + }, + { + "epoch": 0.38427867816660705, + "grad_norm": 0.12496229261159897, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 100960 + }, + { + "epoch": 0.3843167406347297, + "grad_norm": 0.11518893390893936, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 100970 + }, + { + "epoch": 0.3843548031028524, + "grad_norm": 0.13658462464809418, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 100980 + }, + { + "epoch": 0.3843928655709751, + "grad_norm": 0.13168896734714508, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 100990 + }, + { + "epoch": 0.3844309280390978, + "grad_norm": 0.11835760623216629, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 101000 + }, + { + "epoch": 0.38446899050722044, + "grad_norm": 0.13231638073921204, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 101010 + }, + { + "epoch": 0.38450705297534316, + "grad_norm": 0.1255374252796173, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 101020 + }, + { + "epoch": 0.3845451154434658, + "grad_norm": 0.1154334619641304, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 101030 + }, + { + "epoch": 0.3845831779115885, + "grad_norm": 0.12606540322303772, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 101040 + }, + { + "epoch": 0.3846212403797112, + "grad_norm": 0.12749677896499634, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 101050 + }, + { + "epoch": 0.38465930284783384, + "grad_norm": 0.12091683596372604, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 101060 + }, + { + "epoch": 0.38469736531595655, + "grad_norm": 0.12428902834653854, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 101070 + }, + { + "epoch": 0.3847354277840792, + "grad_norm": 0.12779396772384644, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 101080 + }, + { + "epoch": 0.3847734902522019, + "grad_norm": 0.12357478588819504, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 101090 + }, + { + "epoch": 0.3848115527203246, + "grad_norm": 0.1376105099916458, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 101100 + }, + { + "epoch": 0.3848496151884473, + "grad_norm": 0.11973535269498825, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 101110 + }, + { + "epoch": 0.38488767765656995, + "grad_norm": 0.1322673112154007, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 101120 + }, + { + "epoch": 0.38492574012469266, + "grad_norm": 0.1237492710351944, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 101130 + }, + { + "epoch": 0.3849638025928153, + "grad_norm": 0.12245187908411026, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 101140 + }, + { + "epoch": 0.38500186506093803, + "grad_norm": 0.12443853914737701, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 101150 + }, + { + "epoch": 0.3850399275290607, + "grad_norm": 0.11383303999900818, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 101160 + }, + { + "epoch": 0.3850779899971834, + "grad_norm": 0.12312997877597809, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 101170 + }, + { + "epoch": 0.38511605246530606, + "grad_norm": 0.12528856098651886, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 101180 + }, + { + "epoch": 0.38515411493342877, + "grad_norm": 0.12684626877307892, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 101190 + }, + { + "epoch": 0.3851921774015514, + "grad_norm": 0.12324848026037216, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 101200 + }, + { + "epoch": 0.3852302398696741, + "grad_norm": 0.1210331991314888, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 101210 + }, + { + "epoch": 0.3852683023377968, + "grad_norm": 0.11422744393348694, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 101220 + }, + { + "epoch": 0.38530636480591945, + "grad_norm": 0.13483406603336334, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 101230 + }, + { + "epoch": 0.38534442727404217, + "grad_norm": 0.1397322416305542, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 101240 + }, + { + "epoch": 0.3853824897421648, + "grad_norm": 0.11989077180624008, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 101250 + }, + { + "epoch": 0.38542055221028754, + "grad_norm": 0.13294818997383118, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 101260 + }, + { + "epoch": 0.3854586146784102, + "grad_norm": 0.1253851354122162, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 101270 + }, + { + "epoch": 0.3854966771465329, + "grad_norm": 0.12310951203107834, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 101280 + }, + { + "epoch": 0.38553473961465556, + "grad_norm": 0.13452036678791046, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 101290 + }, + { + "epoch": 0.3855728020827783, + "grad_norm": 0.13898633420467377, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 101300 + }, + { + "epoch": 0.38561086455090093, + "grad_norm": 0.3918830156326294, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 101310 + }, + { + "epoch": 0.38564892701902365, + "grad_norm": 0.11546818912029266, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 101320 + }, + { + "epoch": 0.3856869894871463, + "grad_norm": 0.128716379404068, + "learning_rate": 0.0005, + "loss": 2.1399, + "step": 101330 + }, + { + "epoch": 0.38572505195526896, + "grad_norm": 0.12486767768859863, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 101340 + }, + { + "epoch": 0.3857631144233917, + "grad_norm": 0.12809643149375916, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 101350 + }, + { + "epoch": 0.38580117689151433, + "grad_norm": 0.12099676579236984, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 101360 + }, + { + "epoch": 0.38583923935963704, + "grad_norm": 0.13036486506462097, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 101370 + }, + { + "epoch": 0.3858773018277597, + "grad_norm": 0.13107898831367493, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 101380 + }, + { + "epoch": 0.3859153642958824, + "grad_norm": 0.12261070311069489, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 101390 + }, + { + "epoch": 0.38595342676400507, + "grad_norm": 0.12817129492759705, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 101400 + }, + { + "epoch": 0.3859914892321278, + "grad_norm": 0.12391543388366699, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 101410 + }, + { + "epoch": 0.38602955170025044, + "grad_norm": 0.11720357090234756, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 101420 + }, + { + "epoch": 0.38606761416837315, + "grad_norm": 0.12664619088172913, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 101430 + }, + { + "epoch": 0.3861056766364958, + "grad_norm": 0.12041633576154709, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 101440 + }, + { + "epoch": 0.3861437391046185, + "grad_norm": 0.12451664358377457, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 101450 + }, + { + "epoch": 0.3861818015727412, + "grad_norm": 0.11611100286245346, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 101460 + }, + { + "epoch": 0.3862198640408639, + "grad_norm": 0.1349475085735321, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 101470 + }, + { + "epoch": 0.38625792650898655, + "grad_norm": 0.12914234399795532, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 101480 + }, + { + "epoch": 0.3862959889771092, + "grad_norm": 0.1278100609779358, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 101490 + }, + { + "epoch": 0.3863340514452319, + "grad_norm": 0.12993694841861725, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 101500 + }, + { + "epoch": 0.3863721139133546, + "grad_norm": 0.1148102730512619, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 101510 + }, + { + "epoch": 0.3864101763814773, + "grad_norm": 0.1425301879644394, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 101520 + }, + { + "epoch": 0.38644823884959995, + "grad_norm": 0.12484724074602127, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 101530 + }, + { + "epoch": 0.38648630131772266, + "grad_norm": 0.12289290130138397, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 101540 + }, + { + "epoch": 0.3865243637858453, + "grad_norm": 0.11314371973276138, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 101550 + }, + { + "epoch": 0.38656242625396803, + "grad_norm": 0.12221511453390121, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 101560 + }, + { + "epoch": 0.3866004887220907, + "grad_norm": 0.15185439586639404, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 101570 + }, + { + "epoch": 0.3866385511902134, + "grad_norm": 0.16293543577194214, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 101580 + }, + { + "epoch": 0.38667661365833605, + "grad_norm": 0.13466903567314148, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 101590 + }, + { + "epoch": 0.38671467612645877, + "grad_norm": 0.12571501731872559, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 101600 + }, + { + "epoch": 0.3867527385945814, + "grad_norm": 0.11617813259363174, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 101610 + }, + { + "epoch": 0.38679080106270414, + "grad_norm": 0.11796658486127853, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 101620 + }, + { + "epoch": 0.3868288635308268, + "grad_norm": 0.1331198364496231, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 101630 + }, + { + "epoch": 0.38686692599894945, + "grad_norm": 0.12835271656513214, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 101640 + }, + { + "epoch": 0.38690498846707216, + "grad_norm": 0.11672591418027878, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 101650 + }, + { + "epoch": 0.3869430509351948, + "grad_norm": 0.13207651674747467, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 101660 + }, + { + "epoch": 0.38698111340331753, + "grad_norm": 0.12128318846225739, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 101670 + }, + { + "epoch": 0.3870191758714402, + "grad_norm": 0.12716425955295563, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 101680 + }, + { + "epoch": 0.3870572383395629, + "grad_norm": 0.12909677624702454, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 101690 + }, + { + "epoch": 0.38709530080768556, + "grad_norm": 0.13584424555301666, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 101700 + }, + { + "epoch": 0.3871333632758083, + "grad_norm": 0.12449723482131958, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 101710 + }, + { + "epoch": 0.38717142574393093, + "grad_norm": 0.13110417127609253, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 101720 + }, + { + "epoch": 0.38720948821205364, + "grad_norm": 0.1298656314611435, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 101730 + }, + { + "epoch": 0.3872475506801763, + "grad_norm": 0.11582604050636292, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 101740 + }, + { + "epoch": 0.387285613148299, + "grad_norm": 0.12446392327547073, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 101750 + }, + { + "epoch": 0.38732367561642167, + "grad_norm": 0.1227712333202362, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 101760 + }, + { + "epoch": 0.3873617380845443, + "grad_norm": 0.14874985814094543, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 101770 + }, + { + "epoch": 0.38739980055266704, + "grad_norm": 0.12331117689609528, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 101780 + }, + { + "epoch": 0.3874378630207897, + "grad_norm": 0.11972736567258835, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 101790 + }, + { + "epoch": 0.3874759254889124, + "grad_norm": 0.11666350811719894, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 101800 + }, + { + "epoch": 0.38751398795703507, + "grad_norm": 0.12108370661735535, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 101810 + }, + { + "epoch": 0.3875520504251578, + "grad_norm": 0.13469970226287842, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 101820 + }, + { + "epoch": 0.38759011289328044, + "grad_norm": 0.11954975873231888, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 101830 + }, + { + "epoch": 0.38762817536140315, + "grad_norm": 0.11528520286083221, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 101840 + }, + { + "epoch": 0.3876662378295258, + "grad_norm": 0.126707524061203, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 101850 + }, + { + "epoch": 0.3877043002976485, + "grad_norm": 0.12829038500785828, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 101860 + }, + { + "epoch": 0.3877423627657712, + "grad_norm": 0.13470347225666046, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 101870 + }, + { + "epoch": 0.3877804252338939, + "grad_norm": 0.13755831122398376, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 101880 + }, + { + "epoch": 0.38781848770201655, + "grad_norm": 0.12866328656673431, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 101890 + }, + { + "epoch": 0.38785655017013926, + "grad_norm": 0.12940312922000885, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 101900 + }, + { + "epoch": 0.3878946126382619, + "grad_norm": 0.13062919676303864, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 101910 + }, + { + "epoch": 0.3879326751063846, + "grad_norm": 0.14013129472732544, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 101920 + }, + { + "epoch": 0.3879707375745073, + "grad_norm": 0.13211211562156677, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 101930 + }, + { + "epoch": 0.38800880004262994, + "grad_norm": 0.1241949051618576, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 101940 + }, + { + "epoch": 0.38804686251075265, + "grad_norm": 0.11884935945272446, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 101950 + }, + { + "epoch": 0.3880849249788753, + "grad_norm": 0.12933066487312317, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 101960 + }, + { + "epoch": 0.388122987446998, + "grad_norm": 0.12881408631801605, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 101970 + }, + { + "epoch": 0.3881610499151207, + "grad_norm": 0.12620744109153748, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 101980 + }, + { + "epoch": 0.3881991123832434, + "grad_norm": 0.1269460916519165, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 101990 + }, + { + "epoch": 0.38823717485136605, + "grad_norm": 0.12020882219076157, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 102000 + }, + { + "epoch": 0.38827523731948876, + "grad_norm": 0.11622332036495209, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 102010 + }, + { + "epoch": 0.3883132997876114, + "grad_norm": 0.11607720702886581, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 102020 + }, + { + "epoch": 0.38835136225573413, + "grad_norm": 0.11501598358154297, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 102030 + }, + { + "epoch": 0.3883894247238568, + "grad_norm": 0.11709321290254593, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 102040 + }, + { + "epoch": 0.3884274871919795, + "grad_norm": 0.12677747011184692, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 102050 + }, + { + "epoch": 0.38846554966010216, + "grad_norm": 0.12956194579601288, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 102060 + }, + { + "epoch": 0.3885036121282248, + "grad_norm": 0.17184223234653473, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 102070 + }, + { + "epoch": 0.38854167459634753, + "grad_norm": 0.11873897910118103, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 102080 + }, + { + "epoch": 0.3885797370644702, + "grad_norm": 0.13604381680488586, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 102090 + }, + { + "epoch": 0.3886177995325929, + "grad_norm": 0.1449870616197586, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 102100 + }, + { + "epoch": 0.38865586200071556, + "grad_norm": 0.13565057516098022, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 102110 + }, + { + "epoch": 0.38869392446883827, + "grad_norm": 0.134196937084198, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 102120 + }, + { + "epoch": 0.3887319869369609, + "grad_norm": 0.13982726633548737, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 102130 + }, + { + "epoch": 0.38877004940508364, + "grad_norm": 0.11478301882743835, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 102140 + }, + { + "epoch": 0.3888081118732063, + "grad_norm": 0.111292764544487, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 102150 + }, + { + "epoch": 0.388846174341329, + "grad_norm": 0.12361454218626022, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 102160 + }, + { + "epoch": 0.38888423680945167, + "grad_norm": 0.11410224437713623, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 102170 + }, + { + "epoch": 0.3889222992775744, + "grad_norm": 0.1234268844127655, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 102180 + }, + { + "epoch": 0.38896036174569704, + "grad_norm": 0.11605652421712875, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 102190 + }, + { + "epoch": 0.3889984242138197, + "grad_norm": 0.13466107845306396, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 102200 + }, + { + "epoch": 0.3890364866819424, + "grad_norm": 0.1219257116317749, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 102210 + }, + { + "epoch": 0.38907454915006506, + "grad_norm": 0.13218198716640472, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 102220 + }, + { + "epoch": 0.3891126116181878, + "grad_norm": 0.13117769360542297, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 102230 + }, + { + "epoch": 0.38915067408631043, + "grad_norm": 0.13620539009571075, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 102240 + }, + { + "epoch": 0.38918873655443315, + "grad_norm": 0.11436023563146591, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 102250 + }, + { + "epoch": 0.3892267990225558, + "grad_norm": 0.12375251203775406, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 102260 + }, + { + "epoch": 0.3892648614906785, + "grad_norm": 0.13119354844093323, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 102270 + }, + { + "epoch": 0.3893029239588012, + "grad_norm": 0.1198691576719284, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 102280 + }, + { + "epoch": 0.3893409864269239, + "grad_norm": 0.12678927183151245, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 102290 + }, + { + "epoch": 0.38937904889504654, + "grad_norm": 0.12140702456235886, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 102300 + }, + { + "epoch": 0.38941711136316925, + "grad_norm": 0.1271730661392212, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 102310 + }, + { + "epoch": 0.3894551738312919, + "grad_norm": 0.1230878159403801, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 102320 + }, + { + "epoch": 0.3894932362994146, + "grad_norm": 0.12364904582500458, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 102330 + }, + { + "epoch": 0.3895312987675373, + "grad_norm": 0.12749449908733368, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 102340 + }, + { + "epoch": 0.38956936123565994, + "grad_norm": 0.1394471675157547, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 102350 + }, + { + "epoch": 0.38960742370378265, + "grad_norm": 0.1296152025461197, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 102360 + }, + { + "epoch": 0.3896454861719053, + "grad_norm": 0.11513220518827438, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 102370 + }, + { + "epoch": 0.389683548640028, + "grad_norm": 0.12122727185487747, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 102380 + }, + { + "epoch": 0.3897216111081507, + "grad_norm": 0.11633864790201187, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 102390 + }, + { + "epoch": 0.3897596735762734, + "grad_norm": 0.12338420748710632, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 102400 + }, + { + "epoch": 0.38979773604439605, + "grad_norm": 0.1302909106016159, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 102410 + }, + { + "epoch": 0.38983579851251876, + "grad_norm": 0.12100611627101898, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 102420 + }, + { + "epoch": 0.3898738609806414, + "grad_norm": 0.11967547237873077, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 102430 + }, + { + "epoch": 0.38991192344876413, + "grad_norm": 0.1297510713338852, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 102440 + }, + { + "epoch": 0.3899499859168868, + "grad_norm": 0.13035336136817932, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 102450 + }, + { + "epoch": 0.3899880483850095, + "grad_norm": 0.13591642677783966, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 102460 + }, + { + "epoch": 0.39002611085313216, + "grad_norm": 0.12677662074565887, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 102470 + }, + { + "epoch": 0.39006417332125487, + "grad_norm": 0.11698787659406662, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 102480 + }, + { + "epoch": 0.3901022357893775, + "grad_norm": 0.11306315660476685, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 102490 + }, + { + "epoch": 0.3901402982575002, + "grad_norm": 0.11850694566965103, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 102500 + }, + { + "epoch": 0.3901783607256229, + "grad_norm": 0.13537517189979553, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 102510 + }, + { + "epoch": 0.39021642319374555, + "grad_norm": 0.13141389191150665, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 102520 + }, + { + "epoch": 0.39025448566186827, + "grad_norm": 0.1263415813446045, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 102530 + }, + { + "epoch": 0.3902925481299909, + "grad_norm": 0.12739363312721252, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 102540 + }, + { + "epoch": 0.39033061059811364, + "grad_norm": 0.12390932440757751, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 102550 + }, + { + "epoch": 0.3903686730662363, + "grad_norm": 0.1357351541519165, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 102560 + }, + { + "epoch": 0.390406735534359, + "grad_norm": 0.13923802971839905, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 102570 + }, + { + "epoch": 0.39044479800248166, + "grad_norm": 0.13165980577468872, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 102580 + }, + { + "epoch": 0.3904828604706044, + "grad_norm": 0.12961310148239136, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 102590 + }, + { + "epoch": 0.39052092293872703, + "grad_norm": 0.1298636794090271, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 102600 + }, + { + "epoch": 0.39055898540684975, + "grad_norm": 0.12369903922080994, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 102610 + }, + { + "epoch": 0.3905970478749724, + "grad_norm": 0.12848657369613647, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 102620 + }, + { + "epoch": 0.39063511034309506, + "grad_norm": 0.14106298983097076, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 102630 + }, + { + "epoch": 0.3906731728112178, + "grad_norm": 0.12941746413707733, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 102640 + }, + { + "epoch": 0.39071123527934043, + "grad_norm": 0.12591303884983063, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 102650 + }, + { + "epoch": 0.39074929774746314, + "grad_norm": 0.1679193377494812, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 102660 + }, + { + "epoch": 0.3907873602155858, + "grad_norm": 0.1251562386751175, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 102670 + }, + { + "epoch": 0.3908254226837085, + "grad_norm": 0.12617377936840057, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 102680 + }, + { + "epoch": 0.39086348515183117, + "grad_norm": 0.12458827346563339, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 102690 + }, + { + "epoch": 0.3909015476199539, + "grad_norm": 0.11865270137786865, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 102700 + }, + { + "epoch": 0.39093961008807654, + "grad_norm": 0.12173104286193848, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 102710 + }, + { + "epoch": 0.39097767255619925, + "grad_norm": 0.12546369433403015, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 102720 + }, + { + "epoch": 0.3910157350243219, + "grad_norm": 0.1272427886724472, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 102730 + }, + { + "epoch": 0.3910537974924446, + "grad_norm": 0.11905576288700104, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 102740 + }, + { + "epoch": 0.3910918599605673, + "grad_norm": 0.11975698918104172, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 102750 + }, + { + "epoch": 0.39112992242869, + "grad_norm": 0.12736965715885162, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 102760 + }, + { + "epoch": 0.39116798489681265, + "grad_norm": 0.12499444931745529, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 102770 + }, + { + "epoch": 0.3912060473649353, + "grad_norm": 0.12132129818201065, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 102780 + }, + { + "epoch": 0.391244109833058, + "grad_norm": 0.12807084619998932, + "learning_rate": 0.0005, + "loss": 2.1417, + "step": 102790 + }, + { + "epoch": 0.3912821723011807, + "grad_norm": 0.12466669827699661, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 102800 + }, + { + "epoch": 0.3913202347693034, + "grad_norm": 0.13407708704471588, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 102810 + }, + { + "epoch": 0.39135829723742604, + "grad_norm": 0.12648305296897888, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 102820 + }, + { + "epoch": 0.39139635970554876, + "grad_norm": 0.13111071288585663, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 102830 + }, + { + "epoch": 0.3914344221736714, + "grad_norm": 0.11999868601560593, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 102840 + }, + { + "epoch": 0.3914724846417941, + "grad_norm": 0.11485709995031357, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 102850 + }, + { + "epoch": 0.3915105471099168, + "grad_norm": 0.1268894374370575, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 102860 + }, + { + "epoch": 0.3915486095780395, + "grad_norm": 0.12413586676120758, + "learning_rate": 0.0005, + "loss": 2.1345, + "step": 102870 + }, + { + "epoch": 0.39158667204616215, + "grad_norm": 0.12754595279693604, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 102880 + }, + { + "epoch": 0.39162473451428487, + "grad_norm": 0.3754200339317322, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 102890 + }, + { + "epoch": 0.3916627969824075, + "grad_norm": 0.1305660754442215, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 102900 + }, + { + "epoch": 0.39170085945053024, + "grad_norm": 0.11650175601243973, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 102910 + }, + { + "epoch": 0.3917389219186529, + "grad_norm": 0.12068041414022446, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 102920 + }, + { + "epoch": 0.39177698438677555, + "grad_norm": 0.13195456564426422, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 102930 + }, + { + "epoch": 0.39181504685489826, + "grad_norm": 0.13185672461986542, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 102940 + }, + { + "epoch": 0.3918531093230209, + "grad_norm": 0.12099901586771011, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 102950 + }, + { + "epoch": 0.39189117179114363, + "grad_norm": 0.11593295633792877, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 102960 + }, + { + "epoch": 0.3919292342592663, + "grad_norm": 0.1235249787569046, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 102970 + }, + { + "epoch": 0.391967296727389, + "grad_norm": 0.14133043587207794, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 102980 + }, + { + "epoch": 0.39200535919551166, + "grad_norm": 0.13981376588344574, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 102990 + }, + { + "epoch": 0.3920434216636344, + "grad_norm": 0.12288466095924377, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 103000 + }, + { + "epoch": 0.39208148413175703, + "grad_norm": 0.12503664195537567, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 103010 + }, + { + "epoch": 0.39211954659987974, + "grad_norm": 0.12194748967885971, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 103020 + }, + { + "epoch": 0.3921576090680024, + "grad_norm": 0.13284580409526825, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 103030 + }, + { + "epoch": 0.3921956715361251, + "grad_norm": 0.12611354887485504, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 103040 + }, + { + "epoch": 0.39223373400424777, + "grad_norm": 0.11393626779317856, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 103050 + }, + { + "epoch": 0.3922717964723705, + "grad_norm": 0.11623258143663406, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 103060 + }, + { + "epoch": 0.39230985894049314, + "grad_norm": 0.12657833099365234, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 103070 + }, + { + "epoch": 0.3923479214086158, + "grad_norm": 0.11555872857570648, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 103080 + }, + { + "epoch": 0.3923859838767385, + "grad_norm": 0.12106861919164658, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 103090 + }, + { + "epoch": 0.39242404634486117, + "grad_norm": 0.1341984122991562, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 103100 + }, + { + "epoch": 0.3924621088129839, + "grad_norm": 0.13863204419612885, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 103110 + }, + { + "epoch": 0.39250017128110654, + "grad_norm": 0.1220608800649643, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 103120 + }, + { + "epoch": 0.39253823374922925, + "grad_norm": 0.13805212080478668, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 103130 + }, + { + "epoch": 0.3925762962173519, + "grad_norm": 0.1482156664133072, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 103140 + }, + { + "epoch": 0.3926143586854746, + "grad_norm": 0.1242128238081932, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 103150 + }, + { + "epoch": 0.3926524211535973, + "grad_norm": 0.1172168180346489, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 103160 + }, + { + "epoch": 0.39269048362172, + "grad_norm": 0.11944341659545898, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 103170 + }, + { + "epoch": 0.39272854608984265, + "grad_norm": 0.12237085402011871, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 103180 + }, + { + "epoch": 0.39276660855796536, + "grad_norm": 0.13623057305812836, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 103190 + }, + { + "epoch": 0.392804671026088, + "grad_norm": 0.13276179134845734, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 103200 + }, + { + "epoch": 0.39284273349421067, + "grad_norm": 0.12691223621368408, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 103210 + }, + { + "epoch": 0.3928807959623334, + "grad_norm": 0.1295863837003708, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 103220 + }, + { + "epoch": 0.39291885843045604, + "grad_norm": 0.12674763798713684, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 103230 + }, + { + "epoch": 0.39295692089857875, + "grad_norm": 0.1289980411529541, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 103240 + }, + { + "epoch": 0.3929949833667014, + "grad_norm": 0.126417338848114, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 103250 + }, + { + "epoch": 0.3930330458348241, + "grad_norm": 0.12376362085342407, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 103260 + }, + { + "epoch": 0.3930711083029468, + "grad_norm": 0.1145966425538063, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 103270 + }, + { + "epoch": 0.3931091707710695, + "grad_norm": 0.12255162000656128, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 103280 + }, + { + "epoch": 0.39314723323919215, + "grad_norm": 0.12236989289522171, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 103290 + }, + { + "epoch": 0.39318529570731486, + "grad_norm": 0.11930961161851883, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 103300 + }, + { + "epoch": 0.3932233581754375, + "grad_norm": 0.11663472652435303, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 103310 + }, + { + "epoch": 0.39326142064356023, + "grad_norm": 0.10983003675937653, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 103320 + }, + { + "epoch": 0.3932994831116829, + "grad_norm": 0.1318846344947815, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 103330 + }, + { + "epoch": 0.3933375455798056, + "grad_norm": 0.13595378398895264, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 103340 + }, + { + "epoch": 0.39337560804792826, + "grad_norm": 0.1256626397371292, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 103350 + }, + { + "epoch": 0.3934136705160509, + "grad_norm": 0.11132992804050446, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 103360 + }, + { + "epoch": 0.39345173298417363, + "grad_norm": 0.1152862012386322, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 103370 + }, + { + "epoch": 0.3934897954522963, + "grad_norm": 0.1286613941192627, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 103380 + }, + { + "epoch": 0.393527857920419, + "grad_norm": 0.12287045270204544, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 103390 + }, + { + "epoch": 0.39356592038854166, + "grad_norm": 0.1277402639389038, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 103400 + }, + { + "epoch": 0.39360398285666437, + "grad_norm": 0.12594376504421234, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 103410 + }, + { + "epoch": 0.393642045324787, + "grad_norm": 0.12150339782238007, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 103420 + }, + { + "epoch": 0.39368010779290974, + "grad_norm": 0.12241839617490768, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 103430 + }, + { + "epoch": 0.3937181702610324, + "grad_norm": 0.12070048600435257, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 103440 + }, + { + "epoch": 0.3937562327291551, + "grad_norm": 0.13486874103546143, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 103450 + }, + { + "epoch": 0.39379429519727777, + "grad_norm": 0.12598316371440887, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 103460 + }, + { + "epoch": 0.3938323576654005, + "grad_norm": 0.11616887152194977, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 103470 + }, + { + "epoch": 0.39387042013352314, + "grad_norm": 0.12013692408800125, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 103480 + }, + { + "epoch": 0.39390848260164585, + "grad_norm": 0.12813392281532288, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 103490 + }, + { + "epoch": 0.3939465450697685, + "grad_norm": 0.1240399107336998, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 103500 + }, + { + "epoch": 0.39398460753789116, + "grad_norm": 0.1368178278207779, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 103510 + }, + { + "epoch": 0.3940226700060139, + "grad_norm": 0.11790160089731216, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 103520 + }, + { + "epoch": 0.39406073247413653, + "grad_norm": 0.12600161135196686, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 103530 + }, + { + "epoch": 0.39409879494225925, + "grad_norm": 0.11743983626365662, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 103540 + }, + { + "epoch": 0.3941368574103819, + "grad_norm": 0.11900418996810913, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 103550 + }, + { + "epoch": 0.3941749198785046, + "grad_norm": 0.13106529414653778, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 103560 + }, + { + "epoch": 0.39421298234662727, + "grad_norm": 0.126107320189476, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 103570 + }, + { + "epoch": 0.39425104481475, + "grad_norm": 0.13327786326408386, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 103580 + }, + { + "epoch": 0.39428910728287264, + "grad_norm": 0.13225555419921875, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 103590 + }, + { + "epoch": 0.39432716975099535, + "grad_norm": 0.12251465022563934, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 103600 + }, + { + "epoch": 0.394365232219118, + "grad_norm": 0.13184860348701477, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 103610 + }, + { + "epoch": 0.3944032946872407, + "grad_norm": 0.13471008837223053, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 103620 + }, + { + "epoch": 0.3944413571553634, + "grad_norm": 0.1355457454919815, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 103630 + }, + { + "epoch": 0.39447941962348604, + "grad_norm": 0.12295703589916229, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 103640 + }, + { + "epoch": 0.39451748209160875, + "grad_norm": 0.1327628791332245, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 103650 + }, + { + "epoch": 0.3945555445597314, + "grad_norm": 0.13155631721019745, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 103660 + }, + { + "epoch": 0.3945936070278541, + "grad_norm": 0.13830965757369995, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 103670 + }, + { + "epoch": 0.3946316694959768, + "grad_norm": 0.12834765017032623, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 103680 + }, + { + "epoch": 0.3946697319640995, + "grad_norm": 0.13996703922748566, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 103690 + }, + { + "epoch": 0.39470779443222215, + "grad_norm": 0.13787522912025452, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 103700 + }, + { + "epoch": 0.39474585690034486, + "grad_norm": 0.13497807085514069, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 103710 + }, + { + "epoch": 0.3947839193684675, + "grad_norm": 0.1280667930841446, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 103720 + }, + { + "epoch": 0.39482198183659023, + "grad_norm": 0.12160675972700119, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 103730 + }, + { + "epoch": 0.3948600443047129, + "grad_norm": 0.11742819100618362, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 103740 + }, + { + "epoch": 0.3948981067728356, + "grad_norm": 0.12526483833789825, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 103750 + }, + { + "epoch": 0.39493616924095826, + "grad_norm": 0.12073063105344772, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 103760 + }, + { + "epoch": 0.39497423170908097, + "grad_norm": 0.11773089319467545, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 103770 + }, + { + "epoch": 0.3950122941772036, + "grad_norm": 0.13035206496715546, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 103780 + }, + { + "epoch": 0.3950503566453263, + "grad_norm": 0.1201128363609314, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 103790 + }, + { + "epoch": 0.395088419113449, + "grad_norm": 0.11840377748012543, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 103800 + }, + { + "epoch": 0.39512648158157165, + "grad_norm": 0.1157442033290863, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 103810 + }, + { + "epoch": 0.39516454404969437, + "grad_norm": 0.11903751641511917, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 103820 + }, + { + "epoch": 0.395202606517817, + "grad_norm": 0.12379298359155655, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 103830 + }, + { + "epoch": 0.39524066898593974, + "grad_norm": 0.12372729927301407, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 103840 + }, + { + "epoch": 0.3952787314540624, + "grad_norm": 0.1268322765827179, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 103850 + }, + { + "epoch": 0.3953167939221851, + "grad_norm": 0.13438722491264343, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 103860 + }, + { + "epoch": 0.39535485639030776, + "grad_norm": 0.12297304719686508, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 103870 + }, + { + "epoch": 0.3953929188584305, + "grad_norm": 0.13192100822925568, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 103880 + }, + { + "epoch": 0.39543098132655313, + "grad_norm": 0.1235286295413971, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 103890 + }, + { + "epoch": 0.39546904379467585, + "grad_norm": 0.11537299305200577, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 103900 + }, + { + "epoch": 0.3955071062627985, + "grad_norm": 0.11818663775920868, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 103910 + }, + { + "epoch": 0.3955451687309212, + "grad_norm": 0.1426924765110016, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 103920 + }, + { + "epoch": 0.39558323119904387, + "grad_norm": 0.14220619201660156, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 103930 + }, + { + "epoch": 0.39562129366716653, + "grad_norm": 0.12632210552692413, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 103940 + }, + { + "epoch": 0.39565935613528924, + "grad_norm": 0.12559543550014496, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 103950 + }, + { + "epoch": 0.3956974186034119, + "grad_norm": 0.13188651204109192, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 103960 + }, + { + "epoch": 0.3957354810715346, + "grad_norm": 0.12158432602882385, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 103970 + }, + { + "epoch": 0.39577354353965727, + "grad_norm": 0.13120721280574799, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 103980 + }, + { + "epoch": 0.39581160600778, + "grad_norm": 0.1457386314868927, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 103990 + }, + { + "epoch": 0.39584966847590264, + "grad_norm": 0.12688206136226654, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 104000 + }, + { + "epoch": 0.39588773094402535, + "grad_norm": 0.15381436049938202, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 104010 + }, + { + "epoch": 0.395925793412148, + "grad_norm": 0.11215616017580032, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 104020 + }, + { + "epoch": 0.3959638558802707, + "grad_norm": 0.12374036014080048, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 104030 + }, + { + "epoch": 0.3960019183483934, + "grad_norm": 0.11824839562177658, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 104040 + }, + { + "epoch": 0.3960399808165161, + "grad_norm": 0.14382266998291016, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 104050 + }, + { + "epoch": 0.39607804328463875, + "grad_norm": 0.11462534964084625, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 104060 + }, + { + "epoch": 0.3961161057527614, + "grad_norm": 0.11632947623729706, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 104070 + }, + { + "epoch": 0.3961541682208841, + "grad_norm": 0.13774609565734863, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 104080 + }, + { + "epoch": 0.3961922306890068, + "grad_norm": 0.11715240776538849, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 104090 + }, + { + "epoch": 0.3962302931571295, + "grad_norm": 0.13816210627555847, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 104100 + }, + { + "epoch": 0.39626835562525214, + "grad_norm": 0.11385868489742279, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 104110 + }, + { + "epoch": 0.39630641809337486, + "grad_norm": 0.14412954449653625, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 104120 + }, + { + "epoch": 0.3963444805614975, + "grad_norm": 0.12326811254024506, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 104130 + }, + { + "epoch": 0.3963825430296202, + "grad_norm": 0.11583611369132996, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 104140 + }, + { + "epoch": 0.3964206054977429, + "grad_norm": 0.11942502856254578, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 104150 + }, + { + "epoch": 0.3964586679658656, + "grad_norm": 0.1244032084941864, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 104160 + }, + { + "epoch": 0.39649673043398825, + "grad_norm": 0.13132727146148682, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 104170 + }, + { + "epoch": 0.39653479290211097, + "grad_norm": 0.12686878442764282, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 104180 + }, + { + "epoch": 0.3965728553702336, + "grad_norm": 0.12790274620056152, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 104190 + }, + { + "epoch": 0.39661091783835634, + "grad_norm": 0.125844344496727, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 104200 + }, + { + "epoch": 0.396648980306479, + "grad_norm": 0.12397968024015427, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 104210 + }, + { + "epoch": 0.39668704277460165, + "grad_norm": 0.117226742208004, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 104220 + }, + { + "epoch": 0.39672510524272436, + "grad_norm": 0.12511186301708221, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 104230 + }, + { + "epoch": 0.396763167710847, + "grad_norm": 0.12875999510288239, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 104240 + }, + { + "epoch": 0.39680123017896973, + "grad_norm": 0.1346457302570343, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 104250 + }, + { + "epoch": 0.3968392926470924, + "grad_norm": 0.11340010911226273, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 104260 + }, + { + "epoch": 0.3968773551152151, + "grad_norm": 0.13111717998981476, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 104270 + }, + { + "epoch": 0.39691541758333776, + "grad_norm": 0.12048640102148056, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 104280 + }, + { + "epoch": 0.39695348005146047, + "grad_norm": 0.14155083894729614, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 104290 + }, + { + "epoch": 0.39699154251958313, + "grad_norm": 0.11819305270910263, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 104300 + }, + { + "epoch": 0.39702960498770584, + "grad_norm": 0.11852467060089111, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 104310 + }, + { + "epoch": 0.3970676674558285, + "grad_norm": 0.14167369902133942, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 104320 + }, + { + "epoch": 0.3971057299239512, + "grad_norm": 0.1200118288397789, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 104330 + }, + { + "epoch": 0.39714379239207387, + "grad_norm": 0.12136401236057281, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 104340 + }, + { + "epoch": 0.3971818548601966, + "grad_norm": 0.12574835121631622, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 104350 + }, + { + "epoch": 0.39721991732831924, + "grad_norm": 0.12684336304664612, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 104360 + }, + { + "epoch": 0.3972579797964419, + "grad_norm": 0.12519370019435883, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 104370 + }, + { + "epoch": 0.3972960422645646, + "grad_norm": 0.13009725511074066, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 104380 + }, + { + "epoch": 0.39733410473268727, + "grad_norm": 0.1330563724040985, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 104390 + }, + { + "epoch": 0.39737216720081, + "grad_norm": 0.11972329020500183, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 104400 + }, + { + "epoch": 0.39741022966893264, + "grad_norm": 0.1132497563958168, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 104410 + }, + { + "epoch": 0.39744829213705535, + "grad_norm": 0.12347466498613358, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 104420 + }, + { + "epoch": 0.397486354605178, + "grad_norm": 0.12338844686746597, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 104430 + }, + { + "epoch": 0.3975244170733007, + "grad_norm": 0.13165982067584991, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 104440 + }, + { + "epoch": 0.3975624795414234, + "grad_norm": 0.1271611452102661, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 104450 + }, + { + "epoch": 0.3976005420095461, + "grad_norm": 0.12106011807918549, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 104460 + }, + { + "epoch": 0.39763860447766874, + "grad_norm": 0.12377516180276871, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 104470 + }, + { + "epoch": 0.39767666694579146, + "grad_norm": 0.12067466974258423, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 104480 + }, + { + "epoch": 0.3977147294139141, + "grad_norm": 0.12885430455207825, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 104490 + }, + { + "epoch": 0.39775279188203677, + "grad_norm": 0.11753305047750473, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 104500 + }, + { + "epoch": 0.3977908543501595, + "grad_norm": 0.1275755614042282, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 104510 + }, + { + "epoch": 0.39782891681828214, + "grad_norm": 0.12653808295726776, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 104520 + }, + { + "epoch": 0.39786697928640485, + "grad_norm": 0.1440698355436325, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 104530 + }, + { + "epoch": 0.3979050417545275, + "grad_norm": 0.11419986188411713, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 104540 + }, + { + "epoch": 0.3979431042226502, + "grad_norm": 0.2337425798177719, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 104550 + }, + { + "epoch": 0.3979811666907729, + "grad_norm": 0.1223544031381607, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 104560 + }, + { + "epoch": 0.3980192291588956, + "grad_norm": 0.11273296922445297, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 104570 + }, + { + "epoch": 0.39805729162701825, + "grad_norm": 0.11929282546043396, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 104580 + }, + { + "epoch": 0.39809535409514096, + "grad_norm": 0.13098135590553284, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 104590 + }, + { + "epoch": 0.3981334165632636, + "grad_norm": 0.1531413197517395, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 104600 + }, + { + "epoch": 0.39817147903138633, + "grad_norm": 0.12102365493774414, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 104610 + }, + { + "epoch": 0.398209541499509, + "grad_norm": 0.12185650318861008, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 104620 + }, + { + "epoch": 0.3982476039676317, + "grad_norm": 0.12916898727416992, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 104630 + }, + { + "epoch": 0.39828566643575436, + "grad_norm": 0.12003316730260849, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 104640 + }, + { + "epoch": 0.398323728903877, + "grad_norm": 0.11966529488563538, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 104650 + }, + { + "epoch": 0.39836179137199973, + "grad_norm": 0.12870483100414276, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 104660 + }, + { + "epoch": 0.3983998538401224, + "grad_norm": 0.12053332477807999, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 104670 + }, + { + "epoch": 0.3984379163082451, + "grad_norm": 0.11767672747373581, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 104680 + }, + { + "epoch": 0.39847597877636776, + "grad_norm": 0.1143900528550148, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 104690 + }, + { + "epoch": 0.39851404124449047, + "grad_norm": 0.12328346073627472, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 104700 + }, + { + "epoch": 0.3985521037126131, + "grad_norm": 0.1246117502450943, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 104710 + }, + { + "epoch": 0.39859016618073584, + "grad_norm": 0.13052716851234436, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 104720 + }, + { + "epoch": 0.3986282286488585, + "grad_norm": 0.13421329855918884, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 104730 + }, + { + "epoch": 0.3986662911169812, + "grad_norm": 0.1216406598687172, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 104740 + }, + { + "epoch": 0.39870435358510387, + "grad_norm": 0.12023992091417313, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 104750 + }, + { + "epoch": 0.3987424160532266, + "grad_norm": 0.12383361160755157, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 104760 + }, + { + "epoch": 0.39878047852134924, + "grad_norm": 0.12238840013742447, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 104770 + }, + { + "epoch": 0.39881854098947195, + "grad_norm": 0.11502031236886978, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 104780 + }, + { + "epoch": 0.3988566034575946, + "grad_norm": 0.1243932768702507, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 104790 + }, + { + "epoch": 0.39889466592571726, + "grad_norm": 0.12052633613348007, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 104800 + }, + { + "epoch": 0.39893272839384, + "grad_norm": 0.12813816964626312, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 104810 + }, + { + "epoch": 0.39897079086196263, + "grad_norm": 0.12674903869628906, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 104820 + }, + { + "epoch": 0.39900885333008534, + "grad_norm": 0.12259454280138016, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 104830 + }, + { + "epoch": 0.399046915798208, + "grad_norm": 0.12549050152301788, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 104840 + }, + { + "epoch": 0.3990849782663307, + "grad_norm": 0.12888945639133453, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 104850 + }, + { + "epoch": 0.39912304073445337, + "grad_norm": 0.1267143189907074, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 104860 + }, + { + "epoch": 0.3991611032025761, + "grad_norm": 0.14655642211437225, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 104870 + }, + { + "epoch": 0.39919916567069874, + "grad_norm": 0.12013908475637436, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 104880 + }, + { + "epoch": 0.39923722813882145, + "grad_norm": 0.14582134783267975, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 104890 + }, + { + "epoch": 0.3992752906069441, + "grad_norm": 0.10703765600919724, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 104900 + }, + { + "epoch": 0.3993133530750668, + "grad_norm": 0.13721676170825958, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 104910 + }, + { + "epoch": 0.3993514155431895, + "grad_norm": 0.12658992409706116, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 104920 + }, + { + "epoch": 0.39938947801131214, + "grad_norm": 0.12144620716571808, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 104930 + }, + { + "epoch": 0.39942754047943485, + "grad_norm": 0.1336260586977005, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 104940 + }, + { + "epoch": 0.3994656029475575, + "grad_norm": 0.12390537559986115, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 104950 + }, + { + "epoch": 0.3995036654156802, + "grad_norm": 0.1204170510172844, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 104960 + }, + { + "epoch": 0.3995417278838029, + "grad_norm": 0.12203984707593918, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 104970 + }, + { + "epoch": 0.3995797903519256, + "grad_norm": 0.11647836118936539, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 104980 + }, + { + "epoch": 0.39961785282004825, + "grad_norm": 0.12164629250764847, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 104990 + }, + { + "epoch": 0.39965591528817096, + "grad_norm": 0.1342100352048874, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 105000 + }, + { + "epoch": 0.3996939777562936, + "grad_norm": 0.13156567513942719, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 105010 + }, + { + "epoch": 0.39973204022441633, + "grad_norm": 0.13787831366062164, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 105020 + }, + { + "epoch": 0.399770102692539, + "grad_norm": 0.12670817971229553, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 105030 + }, + { + "epoch": 0.3998081651606617, + "grad_norm": 0.12434558570384979, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 105040 + }, + { + "epoch": 0.39984622762878436, + "grad_norm": 0.13706181943416595, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 105050 + }, + { + "epoch": 0.39988429009690707, + "grad_norm": 0.13545085489749908, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 105060 + }, + { + "epoch": 0.3999223525650297, + "grad_norm": 0.12967762351036072, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 105070 + }, + { + "epoch": 0.3999604150331524, + "grad_norm": 0.12019047141075134, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 105080 + }, + { + "epoch": 0.3999984775012751, + "grad_norm": 0.12411542236804962, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 105090 + }, + { + "epoch": 0.40003653996939775, + "grad_norm": 0.1309502273797989, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 105100 + }, + { + "epoch": 0.40007460243752047, + "grad_norm": 0.11400396376848221, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 105110 + }, + { + "epoch": 0.4001126649056431, + "grad_norm": 0.13012060523033142, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 105120 + }, + { + "epoch": 0.40015072737376584, + "grad_norm": 0.12380818277597427, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 105130 + }, + { + "epoch": 0.4001887898418885, + "grad_norm": 0.11546797305345535, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 105140 + }, + { + "epoch": 0.4002268523100112, + "grad_norm": 0.12413118034601212, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 105150 + }, + { + "epoch": 0.40026491477813386, + "grad_norm": 0.12526023387908936, + "learning_rate": 0.0005, + "loss": 2.1448, + "step": 105160 + }, + { + "epoch": 0.4003029772462566, + "grad_norm": 0.12476836144924164, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 105170 + }, + { + "epoch": 0.40034103971437923, + "grad_norm": 0.12636984884738922, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 105180 + }, + { + "epoch": 0.40037910218250194, + "grad_norm": 0.12862300872802734, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 105190 + }, + { + "epoch": 0.4004171646506246, + "grad_norm": 0.13172148168087006, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 105200 + }, + { + "epoch": 0.4004552271187473, + "grad_norm": 0.1303453892469406, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 105210 + }, + { + "epoch": 0.40049328958686997, + "grad_norm": 0.12228729575872421, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 105220 + }, + { + "epoch": 0.40053135205499263, + "grad_norm": 0.138387992978096, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 105230 + }, + { + "epoch": 0.40056941452311534, + "grad_norm": 0.11740188300609589, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 105240 + }, + { + "epoch": 0.400607476991238, + "grad_norm": 0.12075120210647583, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 105250 + }, + { + "epoch": 0.4006455394593607, + "grad_norm": 0.12485245615243912, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 105260 + }, + { + "epoch": 0.40068360192748337, + "grad_norm": 0.12556962668895721, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 105270 + }, + { + "epoch": 0.4007216643956061, + "grad_norm": 0.14791473746299744, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 105280 + }, + { + "epoch": 0.40075972686372874, + "grad_norm": 0.1255379170179367, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 105290 + }, + { + "epoch": 0.40079778933185145, + "grad_norm": 0.12866927683353424, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 105300 + }, + { + "epoch": 0.4008358517999741, + "grad_norm": 0.126442089676857, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 105310 + }, + { + "epoch": 0.4008739142680968, + "grad_norm": 0.11720888316631317, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 105320 + }, + { + "epoch": 0.4009119767362195, + "grad_norm": 0.12004213035106659, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 105330 + }, + { + "epoch": 0.4009500392043422, + "grad_norm": 0.12049522995948792, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 105340 + }, + { + "epoch": 0.40098810167246485, + "grad_norm": 0.15276920795440674, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 105350 + }, + { + "epoch": 0.40102616414058756, + "grad_norm": 0.12708880007266998, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 105360 + }, + { + "epoch": 0.4010642266087102, + "grad_norm": 0.13746021687984467, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 105370 + }, + { + "epoch": 0.4011022890768329, + "grad_norm": 0.11826682835817337, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 105380 + }, + { + "epoch": 0.4011403515449556, + "grad_norm": 0.12515002489089966, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 105390 + }, + { + "epoch": 0.40117841401307824, + "grad_norm": 0.13323919475078583, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 105400 + }, + { + "epoch": 0.40121647648120096, + "grad_norm": 0.13742923736572266, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 105410 + }, + { + "epoch": 0.4012545389493236, + "grad_norm": 0.11911813914775848, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 105420 + }, + { + "epoch": 0.4012926014174463, + "grad_norm": 0.11971248686313629, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 105430 + }, + { + "epoch": 0.401330663885569, + "grad_norm": 0.1214604452252388, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 105440 + }, + { + "epoch": 0.4013687263536917, + "grad_norm": 0.12025048583745956, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 105450 + }, + { + "epoch": 0.40140678882181435, + "grad_norm": 0.12886670231819153, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 105460 + }, + { + "epoch": 0.40144485128993707, + "grad_norm": 0.11783948540687561, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 105470 + }, + { + "epoch": 0.4014829137580597, + "grad_norm": 0.13330931961536407, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 105480 + }, + { + "epoch": 0.40152097622618244, + "grad_norm": 0.12965691089630127, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 105490 + }, + { + "epoch": 0.4015590386943051, + "grad_norm": 0.13883811235427856, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 105500 + }, + { + "epoch": 0.40159710116242775, + "grad_norm": 0.12219704687595367, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 105510 + }, + { + "epoch": 0.40163516363055046, + "grad_norm": 0.13543589413166046, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 105520 + }, + { + "epoch": 0.4016732260986731, + "grad_norm": 0.12430472671985626, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 105530 + }, + { + "epoch": 0.40171128856679583, + "grad_norm": 0.12211424857378006, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 105540 + }, + { + "epoch": 0.4017493510349185, + "grad_norm": 0.12356506288051605, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 105550 + }, + { + "epoch": 0.4017874135030412, + "grad_norm": 0.12859176099300385, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 105560 + }, + { + "epoch": 0.40182547597116386, + "grad_norm": 0.12038209289312363, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 105570 + }, + { + "epoch": 0.40186353843928657, + "grad_norm": 0.12111588567495346, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 105580 + }, + { + "epoch": 0.40190160090740923, + "grad_norm": 0.13333773612976074, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 105590 + }, + { + "epoch": 0.40193966337553194, + "grad_norm": 0.13389089703559875, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 105600 + }, + { + "epoch": 0.4019777258436546, + "grad_norm": 0.1274549514055252, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 105610 + }, + { + "epoch": 0.4020157883117773, + "grad_norm": 0.13398416340351105, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 105620 + }, + { + "epoch": 0.40205385077989997, + "grad_norm": 0.12145400792360306, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 105630 + }, + { + "epoch": 0.4020919132480227, + "grad_norm": 0.13358928263187408, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 105640 + }, + { + "epoch": 0.40212997571614534, + "grad_norm": 0.11757256835699081, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 105650 + }, + { + "epoch": 0.402168038184268, + "grad_norm": 0.11538097262382507, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 105660 + }, + { + "epoch": 0.4022061006523907, + "grad_norm": 0.13190820813179016, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 105670 + }, + { + "epoch": 0.40224416312051336, + "grad_norm": 0.14979924261569977, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 105680 + }, + { + "epoch": 0.4022822255886361, + "grad_norm": 0.1268017739057541, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 105690 + }, + { + "epoch": 0.40232028805675873, + "grad_norm": 0.12742775678634644, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 105700 + }, + { + "epoch": 0.40235835052488145, + "grad_norm": 0.12942014634609222, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 105710 + }, + { + "epoch": 0.4023964129930041, + "grad_norm": 0.11720619350671768, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 105720 + }, + { + "epoch": 0.4024344754611268, + "grad_norm": 0.13581234216690063, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 105730 + }, + { + "epoch": 0.4024725379292495, + "grad_norm": 0.12838581204414368, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 105740 + }, + { + "epoch": 0.4025106003973722, + "grad_norm": 0.12061332911252975, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 105750 + }, + { + "epoch": 0.40254866286549484, + "grad_norm": 0.12706062197685242, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 105760 + }, + { + "epoch": 0.40258672533361756, + "grad_norm": 0.12208613753318787, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 105770 + }, + { + "epoch": 0.4026247878017402, + "grad_norm": 0.12473037838935852, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 105780 + }, + { + "epoch": 0.4026628502698629, + "grad_norm": 0.11989544332027435, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 105790 + }, + { + "epoch": 0.4027009127379856, + "grad_norm": 0.18878960609436035, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 105800 + }, + { + "epoch": 0.40273897520610824, + "grad_norm": 0.12641625106334686, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 105810 + }, + { + "epoch": 0.40277703767423095, + "grad_norm": 0.12721383571624756, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 105820 + }, + { + "epoch": 0.4028151001423536, + "grad_norm": 0.13350263237953186, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 105830 + }, + { + "epoch": 0.4028531626104763, + "grad_norm": 0.13324423134326935, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 105840 + }, + { + "epoch": 0.402891225078599, + "grad_norm": 0.1388176828622818, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 105850 + }, + { + "epoch": 0.4029292875467217, + "grad_norm": 0.1237429603934288, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 105860 + }, + { + "epoch": 0.40296735001484435, + "grad_norm": 0.12776918709278107, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 105870 + }, + { + "epoch": 0.40300541248296706, + "grad_norm": 0.12410617619752884, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 105880 + }, + { + "epoch": 0.4030434749510897, + "grad_norm": 0.12657538056373596, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 105890 + }, + { + "epoch": 0.40308153741921243, + "grad_norm": 0.12220700085163116, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 105900 + }, + { + "epoch": 0.4031195998873351, + "grad_norm": 0.13133662939071655, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 105910 + }, + { + "epoch": 0.4031576623554578, + "grad_norm": 0.13154169917106628, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 105920 + }, + { + "epoch": 0.40319572482358046, + "grad_norm": 0.1365981101989746, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 105930 + }, + { + "epoch": 0.4032337872917031, + "grad_norm": 0.14633728563785553, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 105940 + }, + { + "epoch": 0.40327184975982583, + "grad_norm": 0.13925231993198395, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 105950 + }, + { + "epoch": 0.4033099122279485, + "grad_norm": 0.12116432934999466, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 105960 + }, + { + "epoch": 0.4033479746960712, + "grad_norm": 0.12359362095594406, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 105970 + }, + { + "epoch": 0.40338603716419386, + "grad_norm": 0.12147456407546997, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 105980 + }, + { + "epoch": 0.40342409963231657, + "grad_norm": 0.11512767523527145, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 105990 + }, + { + "epoch": 0.4034621621004392, + "grad_norm": 0.12523676455020905, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 106000 + }, + { + "epoch": 0.40350022456856194, + "grad_norm": 0.1256232112646103, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 106010 + }, + { + "epoch": 0.4035382870366846, + "grad_norm": 0.15836471319198608, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 106020 + }, + { + "epoch": 0.4035763495048073, + "grad_norm": 0.13333560526371002, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 106030 + }, + { + "epoch": 0.40361441197292997, + "grad_norm": 0.12412805110216141, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 106040 + }, + { + "epoch": 0.4036524744410527, + "grad_norm": 0.12889280915260315, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 106050 + }, + { + "epoch": 0.40369053690917533, + "grad_norm": 0.12141398340463638, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 106060 + }, + { + "epoch": 0.40372859937729805, + "grad_norm": 0.12618468701839447, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 106070 + }, + { + "epoch": 0.4037666618454207, + "grad_norm": 0.13387101888656616, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 106080 + }, + { + "epoch": 0.40380472431354336, + "grad_norm": 0.1238047257065773, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 106090 + }, + { + "epoch": 0.4038427867816661, + "grad_norm": 0.11805254220962524, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 106100 + }, + { + "epoch": 0.40388084924978873, + "grad_norm": 0.12471137195825577, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 106110 + }, + { + "epoch": 0.40391891171791144, + "grad_norm": 0.12302107363939285, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 106120 + }, + { + "epoch": 0.4039569741860341, + "grad_norm": 0.1345338076353073, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 106130 + }, + { + "epoch": 0.4039950366541568, + "grad_norm": 0.12187851220369339, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 106140 + }, + { + "epoch": 0.40403309912227947, + "grad_norm": 0.12177269905805588, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 106150 + }, + { + "epoch": 0.4040711615904022, + "grad_norm": 0.11549049615859985, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 106160 + }, + { + "epoch": 0.40410922405852484, + "grad_norm": 0.15617483854293823, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 106170 + }, + { + "epoch": 0.40414728652664755, + "grad_norm": 0.11969661712646484, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 106180 + }, + { + "epoch": 0.4041853489947702, + "grad_norm": 0.11274225264787674, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 106190 + }, + { + "epoch": 0.4042234114628929, + "grad_norm": 0.13579653203487396, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 106200 + }, + { + "epoch": 0.4042614739310156, + "grad_norm": 0.13461516797542572, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 106210 + }, + { + "epoch": 0.4042995363991383, + "grad_norm": 0.13219502568244934, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 106220 + }, + { + "epoch": 0.40433759886726095, + "grad_norm": 0.11906775832176208, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 106230 + }, + { + "epoch": 0.4043756613353836, + "grad_norm": 0.13577035069465637, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 106240 + }, + { + "epoch": 0.4044137238035063, + "grad_norm": 0.11380638927221298, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 106250 + }, + { + "epoch": 0.404451786271629, + "grad_norm": 0.11306143552064896, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 106260 + }, + { + "epoch": 0.4044898487397517, + "grad_norm": 0.12404890358448029, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 106270 + }, + { + "epoch": 0.40452791120787435, + "grad_norm": 0.12584885954856873, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 106280 + }, + { + "epoch": 0.40456597367599706, + "grad_norm": 0.13881009817123413, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 106290 + }, + { + "epoch": 0.4046040361441197, + "grad_norm": 0.14016923308372498, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 106300 + }, + { + "epoch": 0.40464209861224243, + "grad_norm": 0.14057432115077972, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 106310 + }, + { + "epoch": 0.4046801610803651, + "grad_norm": 0.13729490339756012, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 106320 + }, + { + "epoch": 0.4047182235484878, + "grad_norm": 0.14463871717453003, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 106330 + }, + { + "epoch": 0.40475628601661046, + "grad_norm": 0.14529265463352203, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 106340 + }, + { + "epoch": 0.40479434848473317, + "grad_norm": 0.1335321068763733, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 106350 + }, + { + "epoch": 0.4048324109528558, + "grad_norm": 0.1342439204454422, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 106360 + }, + { + "epoch": 0.4048704734209785, + "grad_norm": 0.12198767811059952, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 106370 + }, + { + "epoch": 0.4049085358891012, + "grad_norm": 0.13221341371536255, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 106380 + }, + { + "epoch": 0.40494659835722385, + "grad_norm": 0.14494569599628448, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 106390 + }, + { + "epoch": 0.40498466082534657, + "grad_norm": 0.12156090885400772, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 106400 + }, + { + "epoch": 0.4050227232934692, + "grad_norm": 0.13109037280082703, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 106410 + }, + { + "epoch": 0.40506078576159193, + "grad_norm": 0.13591161370277405, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 106420 + }, + { + "epoch": 0.4050988482297146, + "grad_norm": 0.13301120698451996, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 106430 + }, + { + "epoch": 0.4051369106978373, + "grad_norm": 0.12221255898475647, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 106440 + }, + { + "epoch": 0.40517497316595996, + "grad_norm": 0.12087249010801315, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 106450 + }, + { + "epoch": 0.4052130356340827, + "grad_norm": 0.1255703717470169, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 106460 + }, + { + "epoch": 0.40525109810220533, + "grad_norm": 0.12779287993907928, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 106470 + }, + { + "epoch": 0.40528916057032804, + "grad_norm": 0.12255334854125977, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 106480 + }, + { + "epoch": 0.4053272230384507, + "grad_norm": 0.1284879446029663, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 106490 + }, + { + "epoch": 0.4053652855065734, + "grad_norm": 0.1315813958644867, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 106500 + }, + { + "epoch": 0.40540334797469607, + "grad_norm": 0.13096173107624054, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 106510 + }, + { + "epoch": 0.40544141044281873, + "grad_norm": 0.1382998824119568, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 106520 + }, + { + "epoch": 0.40547947291094144, + "grad_norm": 0.12591317296028137, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 106530 + }, + { + "epoch": 0.4055175353790641, + "grad_norm": 0.13930954039096832, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 106540 + }, + { + "epoch": 0.4055555978471868, + "grad_norm": 0.11556414514780045, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 106550 + }, + { + "epoch": 0.40559366031530947, + "grad_norm": 0.12952682375907898, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 106560 + }, + { + "epoch": 0.4056317227834322, + "grad_norm": 0.14504283666610718, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 106570 + }, + { + "epoch": 0.40566978525155484, + "grad_norm": 0.13777132332324982, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 106580 + }, + { + "epoch": 0.40570784771967755, + "grad_norm": 0.12666688859462738, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 106590 + }, + { + "epoch": 0.4057459101878002, + "grad_norm": 0.12245593219995499, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 106600 + }, + { + "epoch": 0.4057839726559229, + "grad_norm": 0.14578312635421753, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 106610 + }, + { + "epoch": 0.4058220351240456, + "grad_norm": 0.1222921684384346, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 106620 + }, + { + "epoch": 0.4058600975921683, + "grad_norm": 0.12834157049655914, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 106630 + }, + { + "epoch": 0.40589816006029095, + "grad_norm": 0.12874889373779297, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 106640 + }, + { + "epoch": 0.40593622252841366, + "grad_norm": 0.12798567116260529, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 106650 + }, + { + "epoch": 0.4059742849965363, + "grad_norm": 0.1158149316906929, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 106660 + }, + { + "epoch": 0.406012347464659, + "grad_norm": 0.13018986582756042, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 106670 + }, + { + "epoch": 0.4060504099327817, + "grad_norm": 0.11808565258979797, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 106680 + }, + { + "epoch": 0.40608847240090434, + "grad_norm": 0.12245102971792221, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 106690 + }, + { + "epoch": 0.40612653486902706, + "grad_norm": 0.11302248388528824, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 106700 + }, + { + "epoch": 0.4061645973371497, + "grad_norm": 0.13028843700885773, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 106710 + }, + { + "epoch": 0.4062026598052724, + "grad_norm": 0.1179802417755127, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 106720 + }, + { + "epoch": 0.4062407222733951, + "grad_norm": 0.13540194928646088, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 106730 + }, + { + "epoch": 0.4062787847415178, + "grad_norm": 0.12822499871253967, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 106740 + }, + { + "epoch": 0.40631684720964045, + "grad_norm": 0.11414218693971634, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 106750 + }, + { + "epoch": 0.40635490967776317, + "grad_norm": 0.13179926574230194, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 106760 + }, + { + "epoch": 0.4063929721458858, + "grad_norm": 0.12584665417671204, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 106770 + }, + { + "epoch": 0.40643103461400854, + "grad_norm": 0.1283552050590515, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 106780 + }, + { + "epoch": 0.4064690970821312, + "grad_norm": 0.11601763218641281, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 106790 + }, + { + "epoch": 0.40650715955025385, + "grad_norm": 0.1216268315911293, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 106800 + }, + { + "epoch": 0.40654522201837656, + "grad_norm": 0.12042805552482605, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 106810 + }, + { + "epoch": 0.4065832844864992, + "grad_norm": 0.13717348873615265, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 106820 + }, + { + "epoch": 0.40662134695462193, + "grad_norm": 0.13994011282920837, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 106830 + }, + { + "epoch": 0.4066594094227446, + "grad_norm": 0.13324889540672302, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 106840 + }, + { + "epoch": 0.4066974718908673, + "grad_norm": 0.1207243949174881, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 106850 + }, + { + "epoch": 0.40673553435898996, + "grad_norm": 0.12666364014148712, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 106860 + }, + { + "epoch": 0.40677359682711267, + "grad_norm": 0.1227511316537857, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 106870 + }, + { + "epoch": 0.40681165929523533, + "grad_norm": 0.13686570525169373, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 106880 + }, + { + "epoch": 0.40684972176335804, + "grad_norm": 0.12842904031276703, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 106890 + }, + { + "epoch": 0.4068877842314807, + "grad_norm": 0.12260560691356659, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 106900 + }, + { + "epoch": 0.4069258466996034, + "grad_norm": 0.12146230787038803, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 106910 + }, + { + "epoch": 0.40696390916772607, + "grad_norm": 0.1416623294353485, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 106920 + }, + { + "epoch": 0.4070019716358488, + "grad_norm": 0.14083746075630188, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 106930 + }, + { + "epoch": 0.40704003410397144, + "grad_norm": 0.13980624079704285, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 106940 + }, + { + "epoch": 0.4070780965720941, + "grad_norm": 0.12137874215841293, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 106950 + }, + { + "epoch": 0.4071161590402168, + "grad_norm": 0.12216074019670486, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 106960 + }, + { + "epoch": 0.40715422150833946, + "grad_norm": 0.11431451141834259, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 106970 + }, + { + "epoch": 0.4071922839764622, + "grad_norm": 0.12706711888313293, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 106980 + }, + { + "epoch": 0.40723034644458483, + "grad_norm": 0.12637631595134735, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 106990 + }, + { + "epoch": 0.40726840891270755, + "grad_norm": 0.12496950477361679, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 107000 + }, + { + "epoch": 0.4073064713808302, + "grad_norm": 0.12362261861562729, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 107010 + }, + { + "epoch": 0.4073445338489529, + "grad_norm": 0.13388338685035706, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 107020 + }, + { + "epoch": 0.4073825963170756, + "grad_norm": 0.12871481478214264, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 107030 + }, + { + "epoch": 0.4074206587851983, + "grad_norm": 0.13223305344581604, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 107040 + }, + { + "epoch": 0.40745872125332094, + "grad_norm": 0.12447560578584671, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 107050 + }, + { + "epoch": 0.40749678372144366, + "grad_norm": 0.13165107369422913, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 107060 + }, + { + "epoch": 0.4075348461895663, + "grad_norm": 0.12830142676830292, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 107070 + }, + { + "epoch": 0.407572908657689, + "grad_norm": 0.11964263767004013, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 107080 + }, + { + "epoch": 0.4076109711258117, + "grad_norm": 0.11753566563129425, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 107090 + }, + { + "epoch": 0.40764903359393434, + "grad_norm": 0.12723736464977264, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 107100 + }, + { + "epoch": 0.40768709606205705, + "grad_norm": 0.11674083769321442, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 107110 + }, + { + "epoch": 0.4077251585301797, + "grad_norm": 0.12733396887779236, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 107120 + }, + { + "epoch": 0.4077632209983024, + "grad_norm": 0.13298550248146057, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 107130 + }, + { + "epoch": 0.4078012834664251, + "grad_norm": 0.11968598514795303, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 107140 + }, + { + "epoch": 0.4078393459345478, + "grad_norm": 0.12861406803131104, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 107150 + }, + { + "epoch": 0.40787740840267045, + "grad_norm": 0.1397750824689865, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 107160 + }, + { + "epoch": 0.40791547087079316, + "grad_norm": 0.12403788417577744, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 107170 + }, + { + "epoch": 0.4079535333389158, + "grad_norm": 0.14027903974056244, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 107180 + }, + { + "epoch": 0.40799159580703853, + "grad_norm": 0.1178564578294754, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 107190 + }, + { + "epoch": 0.4080296582751612, + "grad_norm": 0.12243127077817917, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 107200 + }, + { + "epoch": 0.4080677207432839, + "grad_norm": 0.12157271802425385, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 107210 + }, + { + "epoch": 0.40810578321140656, + "grad_norm": 0.12877163290977478, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 107220 + }, + { + "epoch": 0.4081438456795292, + "grad_norm": 0.1127689853310585, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 107230 + }, + { + "epoch": 0.40818190814765193, + "grad_norm": 0.1347082555294037, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 107240 + }, + { + "epoch": 0.4082199706157746, + "grad_norm": 0.1271202713251114, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 107250 + }, + { + "epoch": 0.4082580330838973, + "grad_norm": 0.12755969166755676, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 107260 + }, + { + "epoch": 0.40829609555201996, + "grad_norm": 0.13087047636508942, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 107270 + }, + { + "epoch": 0.40833415802014267, + "grad_norm": 0.13195504248142242, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 107280 + }, + { + "epoch": 0.4083722204882653, + "grad_norm": 0.12616842985153198, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 107290 + }, + { + "epoch": 0.40841028295638804, + "grad_norm": 0.12336033582687378, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 107300 + }, + { + "epoch": 0.4084483454245107, + "grad_norm": 0.13530464470386505, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 107310 + }, + { + "epoch": 0.4084864078926334, + "grad_norm": 0.12252623587846756, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 107320 + }, + { + "epoch": 0.40852447036075606, + "grad_norm": 0.11957842856645584, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 107330 + }, + { + "epoch": 0.4085625328288788, + "grad_norm": 0.12841619551181793, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 107340 + }, + { + "epoch": 0.40860059529700143, + "grad_norm": 0.13767950236797333, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 107350 + }, + { + "epoch": 0.40863865776512415, + "grad_norm": 0.13289736211299896, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 107360 + }, + { + "epoch": 0.4086767202332468, + "grad_norm": 0.1358732134103775, + "learning_rate": 0.0005, + "loss": 2.1359, + "step": 107370 + }, + { + "epoch": 0.40871478270136946, + "grad_norm": 0.127150297164917, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 107380 + }, + { + "epoch": 0.4087528451694922, + "grad_norm": 0.12780392169952393, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 107390 + }, + { + "epoch": 0.40879090763761483, + "grad_norm": 0.13576877117156982, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 107400 + }, + { + "epoch": 0.40882897010573754, + "grad_norm": 0.12100337445735931, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 107410 + }, + { + "epoch": 0.4088670325738602, + "grad_norm": 0.12995721399784088, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 107420 + }, + { + "epoch": 0.4089050950419829, + "grad_norm": 0.12193107604980469, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 107430 + }, + { + "epoch": 0.40894315751010557, + "grad_norm": 0.1287761926651001, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 107440 + }, + { + "epoch": 0.4089812199782283, + "grad_norm": 0.12959787249565125, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 107450 + }, + { + "epoch": 0.40901928244635094, + "grad_norm": 0.11854854971170425, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 107460 + }, + { + "epoch": 0.40905734491447365, + "grad_norm": 0.12470625340938568, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 107470 + }, + { + "epoch": 0.4090954073825963, + "grad_norm": 0.11399077624082565, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 107480 + }, + { + "epoch": 0.409133469850719, + "grad_norm": 0.12666715681552887, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 107490 + }, + { + "epoch": 0.4091715323188417, + "grad_norm": 0.11980894953012466, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 107500 + }, + { + "epoch": 0.4092095947869644, + "grad_norm": 0.11786861717700958, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 107510 + }, + { + "epoch": 0.40924765725508705, + "grad_norm": 0.12244259566068649, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 107520 + }, + { + "epoch": 0.4092857197232097, + "grad_norm": 0.1384306699037552, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 107530 + }, + { + "epoch": 0.4093237821913324, + "grad_norm": 0.12418042868375778, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 107540 + }, + { + "epoch": 0.4093618446594551, + "grad_norm": 0.1298133134841919, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 107550 + }, + { + "epoch": 0.4093999071275778, + "grad_norm": 0.12193987518548965, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 107560 + }, + { + "epoch": 0.40943796959570045, + "grad_norm": 0.12395806610584259, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 107570 + }, + { + "epoch": 0.40947603206382316, + "grad_norm": 0.13407832384109497, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 107580 + }, + { + "epoch": 0.4095140945319458, + "grad_norm": 0.12088234722614288, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 107590 + }, + { + "epoch": 0.40955215700006853, + "grad_norm": 0.12836715579032898, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 107600 + }, + { + "epoch": 0.4095902194681912, + "grad_norm": 0.12157371640205383, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 107610 + }, + { + "epoch": 0.4096282819363139, + "grad_norm": 0.12131499499082565, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 107620 + }, + { + "epoch": 0.40966634440443656, + "grad_norm": 0.1216978132724762, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 107630 + }, + { + "epoch": 0.40970440687255927, + "grad_norm": 0.12099350988864899, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 107640 + }, + { + "epoch": 0.4097424693406819, + "grad_norm": 0.12920355796813965, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 107650 + }, + { + "epoch": 0.40978053180880464, + "grad_norm": 0.1273610144853592, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 107660 + }, + { + "epoch": 0.4098185942769273, + "grad_norm": 0.13866552710533142, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 107670 + }, + { + "epoch": 0.40985665674504995, + "grad_norm": 0.13295462727546692, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 107680 + }, + { + "epoch": 0.40989471921317266, + "grad_norm": 0.12541206181049347, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 107690 + }, + { + "epoch": 0.4099327816812953, + "grad_norm": 0.1251440793275833, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 107700 + }, + { + "epoch": 0.40997084414941803, + "grad_norm": 0.1260979324579239, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 107710 + }, + { + "epoch": 0.4100089066175407, + "grad_norm": 0.11850418895483017, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 107720 + }, + { + "epoch": 0.4100469690856634, + "grad_norm": 0.13400107622146606, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 107730 + }, + { + "epoch": 0.41008503155378606, + "grad_norm": 0.12020386755466461, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 107740 + }, + { + "epoch": 0.4101230940219088, + "grad_norm": 0.12448807805776596, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 107750 + }, + { + "epoch": 0.41016115649003143, + "grad_norm": 0.12401855736970901, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 107760 + }, + { + "epoch": 0.41019921895815414, + "grad_norm": 0.12212575227022171, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 107770 + }, + { + "epoch": 0.4102372814262768, + "grad_norm": 0.1332526057958603, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 107780 + }, + { + "epoch": 0.4102753438943995, + "grad_norm": 0.12568722665309906, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 107790 + }, + { + "epoch": 0.41031340636252217, + "grad_norm": 0.1174527257680893, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 107800 + }, + { + "epoch": 0.4103514688306448, + "grad_norm": 0.12806077301502228, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 107810 + }, + { + "epoch": 0.41038953129876754, + "grad_norm": 0.12083237618207932, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 107820 + }, + { + "epoch": 0.4104275937668902, + "grad_norm": 0.12038543820381165, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 107830 + }, + { + "epoch": 0.4104656562350129, + "grad_norm": 0.12364353239536285, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 107840 + }, + { + "epoch": 0.41050371870313557, + "grad_norm": 0.1154148206114769, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 107850 + }, + { + "epoch": 0.4105417811712583, + "grad_norm": 0.12116552889347076, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 107860 + }, + { + "epoch": 0.41057984363938094, + "grad_norm": 0.1332802176475525, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 107870 + }, + { + "epoch": 0.41061790610750365, + "grad_norm": 0.13961386680603027, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 107880 + }, + { + "epoch": 0.4106559685756263, + "grad_norm": 0.1177176982164383, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 107890 + }, + { + "epoch": 0.410694031043749, + "grad_norm": 0.1382824331521988, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 107900 + }, + { + "epoch": 0.4107320935118717, + "grad_norm": 0.13816149532794952, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 107910 + }, + { + "epoch": 0.4107701559799944, + "grad_norm": 0.13288699090480804, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 107920 + }, + { + "epoch": 0.41080821844811705, + "grad_norm": 0.13014812767505646, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 107930 + }, + { + "epoch": 0.41084628091623976, + "grad_norm": 0.12453297525644302, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 107940 + }, + { + "epoch": 0.4108843433843624, + "grad_norm": 0.11405854672193527, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 107950 + }, + { + "epoch": 0.4109224058524851, + "grad_norm": 0.12348777055740356, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 107960 + }, + { + "epoch": 0.4109604683206078, + "grad_norm": 0.1157868430018425, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 107970 + }, + { + "epoch": 0.41099853078873044, + "grad_norm": 0.12794789671897888, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 107980 + }, + { + "epoch": 0.41103659325685316, + "grad_norm": 0.12420628219842911, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 107990 + }, + { + "epoch": 0.4110746557249758, + "grad_norm": 0.12100957334041595, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 108000 + }, + { + "epoch": 0.4111127181930985, + "grad_norm": 0.1334034949541092, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 108010 + }, + { + "epoch": 0.4111507806612212, + "grad_norm": 0.1218486949801445, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 108020 + }, + { + "epoch": 0.4111888431293439, + "grad_norm": 0.1329689621925354, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 108030 + }, + { + "epoch": 0.41122690559746655, + "grad_norm": 0.12003668397665024, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 108040 + }, + { + "epoch": 0.41126496806558926, + "grad_norm": 0.13905754685401917, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 108050 + }, + { + "epoch": 0.4113030305337119, + "grad_norm": 0.13376957178115845, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 108060 + }, + { + "epoch": 0.41134109300183463, + "grad_norm": 0.13737185299396515, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 108070 + }, + { + "epoch": 0.4113791554699573, + "grad_norm": 0.1212373897433281, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 108080 + }, + { + "epoch": 0.41141721793808, + "grad_norm": 0.14076654613018036, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 108090 + }, + { + "epoch": 0.41145528040620266, + "grad_norm": 0.1278214007616043, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 108100 + }, + { + "epoch": 0.4114933428743253, + "grad_norm": 0.14113424718379974, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 108110 + }, + { + "epoch": 0.41153140534244803, + "grad_norm": 0.13218048214912415, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 108120 + }, + { + "epoch": 0.4115694678105707, + "grad_norm": 0.11836156994104385, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 108130 + }, + { + "epoch": 0.4116075302786934, + "grad_norm": 0.12116833031177521, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 108140 + }, + { + "epoch": 0.41164559274681606, + "grad_norm": 0.126220241189003, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 108150 + }, + { + "epoch": 0.41168365521493877, + "grad_norm": 0.12636734545230865, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 108160 + }, + { + "epoch": 0.41172171768306143, + "grad_norm": 0.1304181069135666, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 108170 + }, + { + "epoch": 0.41175978015118414, + "grad_norm": 0.12628519535064697, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 108180 + }, + { + "epoch": 0.4117978426193068, + "grad_norm": 0.12502655386924744, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 108190 + }, + { + "epoch": 0.4118359050874295, + "grad_norm": 0.12146708369255066, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 108200 + }, + { + "epoch": 0.41187396755555217, + "grad_norm": 0.12809845805168152, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 108210 + }, + { + "epoch": 0.4119120300236749, + "grad_norm": 0.15716975927352905, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 108220 + }, + { + "epoch": 0.41195009249179754, + "grad_norm": 0.1256551891565323, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 108230 + }, + { + "epoch": 0.4119881549599202, + "grad_norm": 0.11105377227067947, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 108240 + }, + { + "epoch": 0.4120262174280429, + "grad_norm": 0.12480401247739792, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 108250 + }, + { + "epoch": 0.41206427989616556, + "grad_norm": 0.13151103258132935, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 108260 + }, + { + "epoch": 0.4121023423642883, + "grad_norm": 0.1231314092874527, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 108270 + }, + { + "epoch": 0.41214040483241093, + "grad_norm": 0.12152253836393356, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 108280 + }, + { + "epoch": 0.41217846730053365, + "grad_norm": 0.11900048702955246, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 108290 + }, + { + "epoch": 0.4122165297686563, + "grad_norm": 0.13020843267440796, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 108300 + }, + { + "epoch": 0.412254592236779, + "grad_norm": 0.1309482455253601, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 108310 + }, + { + "epoch": 0.4122926547049017, + "grad_norm": 0.14295001327991486, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 108320 + }, + { + "epoch": 0.4123307171730244, + "grad_norm": 0.12833385169506073, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 108330 + }, + { + "epoch": 0.41236877964114704, + "grad_norm": 0.11909869313240051, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 108340 + }, + { + "epoch": 0.41240684210926976, + "grad_norm": 0.12089741230010986, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 108350 + }, + { + "epoch": 0.4124449045773924, + "grad_norm": 0.13093233108520508, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 108360 + }, + { + "epoch": 0.4124829670455151, + "grad_norm": 0.11446915566921234, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 108370 + }, + { + "epoch": 0.4125210295136378, + "grad_norm": 0.12707512080669403, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 108380 + }, + { + "epoch": 0.41255909198176044, + "grad_norm": 0.13645118474960327, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 108390 + }, + { + "epoch": 0.41259715444988315, + "grad_norm": 0.11449270695447922, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 108400 + }, + { + "epoch": 0.4126352169180058, + "grad_norm": 0.1177472248673439, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 108410 + }, + { + "epoch": 0.4126732793861285, + "grad_norm": 0.1370553970336914, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 108420 + }, + { + "epoch": 0.4127113418542512, + "grad_norm": 0.1213172897696495, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 108430 + }, + { + "epoch": 0.4127494043223739, + "grad_norm": 0.12861493229866028, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 108440 + }, + { + "epoch": 0.41278746679049655, + "grad_norm": 0.13049571216106415, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 108450 + }, + { + "epoch": 0.41282552925861926, + "grad_norm": 0.13287471234798431, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 108460 + }, + { + "epoch": 0.4128635917267419, + "grad_norm": 0.12993772327899933, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 108470 + }, + { + "epoch": 0.41290165419486463, + "grad_norm": 0.12364847958087921, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 108480 + }, + { + "epoch": 0.4129397166629873, + "grad_norm": 0.1326444149017334, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 108490 + }, + { + "epoch": 0.41297777913111, + "grad_norm": 0.11822287738323212, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 108500 + }, + { + "epoch": 0.41301584159923266, + "grad_norm": 0.12279457598924637, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 108510 + }, + { + "epoch": 0.41305390406735537, + "grad_norm": 0.1423376500606537, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 108520 + }, + { + "epoch": 0.41309196653547803, + "grad_norm": 0.13920536637306213, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 108530 + }, + { + "epoch": 0.4131300290036007, + "grad_norm": 0.11479239165782928, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 108540 + }, + { + "epoch": 0.4131680914717234, + "grad_norm": 0.12645624577999115, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 108550 + }, + { + "epoch": 0.41320615393984605, + "grad_norm": 0.13007549941539764, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 108560 + }, + { + "epoch": 0.41324421640796877, + "grad_norm": 0.12195257842540741, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 108570 + }, + { + "epoch": 0.4132822788760914, + "grad_norm": 0.19788667559623718, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 108580 + }, + { + "epoch": 0.41332034134421414, + "grad_norm": 0.13742467761039734, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 108590 + }, + { + "epoch": 0.4133584038123368, + "grad_norm": 0.13775934278964996, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 108600 + }, + { + "epoch": 0.4133964662804595, + "grad_norm": 0.11977819353342056, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 108610 + }, + { + "epoch": 0.41343452874858216, + "grad_norm": 0.12793083488941193, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 108620 + }, + { + "epoch": 0.4134725912167049, + "grad_norm": 0.14065957069396973, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 108630 + }, + { + "epoch": 0.41351065368482753, + "grad_norm": 0.13145712018013, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 108640 + }, + { + "epoch": 0.41354871615295025, + "grad_norm": 0.11875452101230621, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 108650 + }, + { + "epoch": 0.4135867786210729, + "grad_norm": 0.11871971189975739, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 108660 + }, + { + "epoch": 0.41362484108919556, + "grad_norm": 0.13564030826091766, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 108670 + }, + { + "epoch": 0.4136629035573183, + "grad_norm": 0.12264464795589447, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 108680 + }, + { + "epoch": 0.41370096602544093, + "grad_norm": 0.11105269193649292, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 108690 + }, + { + "epoch": 0.41373902849356364, + "grad_norm": 0.12471351027488708, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 108700 + }, + { + "epoch": 0.4137770909616863, + "grad_norm": 0.14022184908390045, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 108710 + }, + { + "epoch": 0.413815153429809, + "grad_norm": 0.13109190762043, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 108720 + }, + { + "epoch": 0.41385321589793167, + "grad_norm": 0.11493074893951416, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 108730 + }, + { + "epoch": 0.4138912783660544, + "grad_norm": 0.14010822772979736, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 108740 + }, + { + "epoch": 0.41392934083417704, + "grad_norm": 0.12571784853935242, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 108750 + }, + { + "epoch": 0.41396740330229975, + "grad_norm": 0.11984385550022125, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 108760 + }, + { + "epoch": 0.4140054657704224, + "grad_norm": 0.1423097550868988, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 108770 + }, + { + "epoch": 0.4140435282385451, + "grad_norm": 0.14567236602306366, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 108780 + }, + { + "epoch": 0.4140815907066678, + "grad_norm": 0.18437223136425018, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 108790 + }, + { + "epoch": 0.4141196531747905, + "grad_norm": 0.12724503874778748, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 108800 + }, + { + "epoch": 0.41415771564291315, + "grad_norm": 0.13286955654621124, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 108810 + }, + { + "epoch": 0.4141957781110358, + "grad_norm": 0.1133560836315155, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 108820 + }, + { + "epoch": 0.4142338405791585, + "grad_norm": 0.11960550397634506, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 108830 + }, + { + "epoch": 0.4142719030472812, + "grad_norm": 0.13191118836402893, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 108840 + }, + { + "epoch": 0.4143099655154039, + "grad_norm": 0.12312343716621399, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 108850 + }, + { + "epoch": 0.41434802798352655, + "grad_norm": 0.1258789449930191, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 108860 + }, + { + "epoch": 0.41438609045164926, + "grad_norm": 0.12822465598583221, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 108870 + }, + { + "epoch": 0.4144241529197719, + "grad_norm": 0.1322626918554306, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 108880 + }, + { + "epoch": 0.41446221538789463, + "grad_norm": 0.1310059130191803, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 108890 + }, + { + "epoch": 0.4145002778560173, + "grad_norm": 0.12444756925106049, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 108900 + }, + { + "epoch": 0.41453834032414, + "grad_norm": 0.191010981798172, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 108910 + }, + { + "epoch": 0.41457640279226265, + "grad_norm": 0.13076089322566986, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 108920 + }, + { + "epoch": 0.41461446526038537, + "grad_norm": 0.129410058259964, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 108930 + }, + { + "epoch": 0.414652527728508, + "grad_norm": 0.12310182303190231, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 108940 + }, + { + "epoch": 0.41469059019663074, + "grad_norm": 0.11926144361495972, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 108950 + }, + { + "epoch": 0.4147286526647534, + "grad_norm": 0.12537449598312378, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 108960 + }, + { + "epoch": 0.41476671513287605, + "grad_norm": 0.13044698536396027, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 108970 + }, + { + "epoch": 0.41480477760099876, + "grad_norm": 0.1533544659614563, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 108980 + }, + { + "epoch": 0.4148428400691214, + "grad_norm": 0.13309314846992493, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 108990 + }, + { + "epoch": 0.41488090253724413, + "grad_norm": 0.11752888560295105, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 109000 + }, + { + "epoch": 0.4149189650053668, + "grad_norm": 0.12122669070959091, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 109010 + }, + { + "epoch": 0.4149570274734895, + "grad_norm": 0.12491890043020248, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 109020 + }, + { + "epoch": 0.41499508994161216, + "grad_norm": 0.13501355051994324, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 109030 + }, + { + "epoch": 0.4150331524097349, + "grad_norm": 0.1353226602077484, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 109040 + }, + { + "epoch": 0.41507121487785753, + "grad_norm": 0.1288616806268692, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 109050 + }, + { + "epoch": 0.41510927734598024, + "grad_norm": 0.11121494323015213, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 109060 + }, + { + "epoch": 0.4151473398141029, + "grad_norm": 0.13124603033065796, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 109070 + }, + { + "epoch": 0.4151854022822256, + "grad_norm": 0.14034585654735565, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 109080 + }, + { + "epoch": 0.41522346475034827, + "grad_norm": 0.13125751912593842, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 109090 + }, + { + "epoch": 0.4152615272184709, + "grad_norm": 0.13639673590660095, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 109100 + }, + { + "epoch": 0.41529958968659364, + "grad_norm": 0.14386612176895142, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 109110 + }, + { + "epoch": 0.4153376521547163, + "grad_norm": 0.11938704550266266, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 109120 + }, + { + "epoch": 0.415375714622839, + "grad_norm": 0.1277218759059906, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 109130 + }, + { + "epoch": 0.41541377709096167, + "grad_norm": 0.13642755150794983, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 109140 + }, + { + "epoch": 0.4154518395590844, + "grad_norm": 0.11509495973587036, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 109150 + }, + { + "epoch": 0.41548990202720704, + "grad_norm": 0.12859298288822174, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 109160 + }, + { + "epoch": 0.41552796449532975, + "grad_norm": 0.11672429740428925, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 109170 + }, + { + "epoch": 0.4155660269634524, + "grad_norm": 0.12255655974149704, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 109180 + }, + { + "epoch": 0.4156040894315751, + "grad_norm": 0.12491326034069061, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 109190 + }, + { + "epoch": 0.4156421518996978, + "grad_norm": 0.1362716257572174, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 109200 + }, + { + "epoch": 0.4156802143678205, + "grad_norm": 0.13377702236175537, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 109210 + }, + { + "epoch": 0.41571827683594315, + "grad_norm": 0.13841603696346283, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 109220 + }, + { + "epoch": 0.41575633930406586, + "grad_norm": 0.139837846159935, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 109230 + }, + { + "epoch": 0.4157944017721885, + "grad_norm": 0.12048280239105225, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 109240 + }, + { + "epoch": 0.4158324642403112, + "grad_norm": 0.12809380888938904, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 109250 + }, + { + "epoch": 0.4158705267084339, + "grad_norm": 0.13459837436676025, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 109260 + }, + { + "epoch": 0.41590858917655654, + "grad_norm": 0.14781025052070618, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 109270 + }, + { + "epoch": 0.41594665164467925, + "grad_norm": 0.12983685731887817, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 109280 + }, + { + "epoch": 0.4159847141128019, + "grad_norm": 0.12982770800590515, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 109290 + }, + { + "epoch": 0.4160227765809246, + "grad_norm": 0.12221802771091461, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 109300 + }, + { + "epoch": 0.4160608390490473, + "grad_norm": 0.1298155039548874, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 109310 + }, + { + "epoch": 0.41609890151717, + "grad_norm": 0.12300161272287369, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 109320 + }, + { + "epoch": 0.41613696398529265, + "grad_norm": 0.1435967981815338, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 109330 + }, + { + "epoch": 0.41617502645341536, + "grad_norm": 0.1296965479850769, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 109340 + }, + { + "epoch": 0.416213088921538, + "grad_norm": 0.1250772625207901, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 109350 + }, + { + "epoch": 0.41625115138966073, + "grad_norm": 0.12681035697460175, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 109360 + }, + { + "epoch": 0.4162892138577834, + "grad_norm": 0.12260159105062485, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 109370 + }, + { + "epoch": 0.4163272763259061, + "grad_norm": 0.12412714958190918, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 109380 + }, + { + "epoch": 0.41636533879402876, + "grad_norm": 0.13800972700119019, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 109390 + }, + { + "epoch": 0.4164034012621514, + "grad_norm": 0.14430291950702667, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 109400 + }, + { + "epoch": 0.41644146373027413, + "grad_norm": 0.1282130479812622, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 109410 + }, + { + "epoch": 0.4164795261983968, + "grad_norm": 0.12316697835922241, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 109420 + }, + { + "epoch": 0.4165175886665195, + "grad_norm": 0.1190483421087265, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 109430 + }, + { + "epoch": 0.41655565113464216, + "grad_norm": 0.1316487193107605, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 109440 + }, + { + "epoch": 0.41659371360276487, + "grad_norm": 0.1279810070991516, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 109450 + }, + { + "epoch": 0.4166317760708875, + "grad_norm": 0.12146064639091492, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 109460 + }, + { + "epoch": 0.41666983853901024, + "grad_norm": 0.12453345954418182, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 109470 + }, + { + "epoch": 0.4167079010071329, + "grad_norm": 0.12459386140108109, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 109480 + }, + { + "epoch": 0.4167459634752556, + "grad_norm": 0.12696263194084167, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 109490 + }, + { + "epoch": 0.41678402594337827, + "grad_norm": 0.13806460797786713, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 109500 + }, + { + "epoch": 0.416822088411501, + "grad_norm": 0.15819978713989258, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 109510 + }, + { + "epoch": 0.41686015087962364, + "grad_norm": 0.14383956789970398, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 109520 + }, + { + "epoch": 0.4168982133477463, + "grad_norm": 0.1303814798593521, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 109530 + }, + { + "epoch": 0.416936275815869, + "grad_norm": 0.12524403631687164, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 109540 + }, + { + "epoch": 0.41697433828399166, + "grad_norm": 0.13339385390281677, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 109550 + }, + { + "epoch": 0.4170124007521144, + "grad_norm": 0.13040074706077576, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 109560 + }, + { + "epoch": 0.41705046322023703, + "grad_norm": 0.14253298938274384, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 109570 + }, + { + "epoch": 0.41708852568835975, + "grad_norm": 0.13015128672122955, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 109580 + }, + { + "epoch": 0.4171265881564824, + "grad_norm": 0.11817413568496704, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 109590 + }, + { + "epoch": 0.4171646506246051, + "grad_norm": 0.1301771104335785, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 109600 + }, + { + "epoch": 0.4172027130927278, + "grad_norm": 0.1292882263660431, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 109610 + }, + { + "epoch": 0.4172407755608505, + "grad_norm": 0.1226472333073616, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 109620 + }, + { + "epoch": 0.41727883802897314, + "grad_norm": 0.12784558534622192, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 109630 + }, + { + "epoch": 0.41731690049709586, + "grad_norm": 0.12261635810136795, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 109640 + }, + { + "epoch": 0.4173549629652185, + "grad_norm": 0.13003894686698914, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 109650 + }, + { + "epoch": 0.4173930254333412, + "grad_norm": 0.13531725108623505, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 109660 + }, + { + "epoch": 0.4174310879014639, + "grad_norm": 0.1280001699924469, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 109670 + }, + { + "epoch": 0.41746915036958654, + "grad_norm": 0.17585636675357819, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 109680 + }, + { + "epoch": 0.41750721283770925, + "grad_norm": 0.1245546042919159, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 109690 + }, + { + "epoch": 0.4175452753058319, + "grad_norm": 0.14035065472126007, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 109700 + }, + { + "epoch": 0.4175833377739546, + "grad_norm": 0.127082958817482, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 109710 + }, + { + "epoch": 0.4176214002420773, + "grad_norm": 0.12953048944473267, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 109720 + }, + { + "epoch": 0.4176594627102, + "grad_norm": 0.13001421093940735, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 109730 + }, + { + "epoch": 0.41769752517832265, + "grad_norm": 0.130632683634758, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 109740 + }, + { + "epoch": 0.41773558764644536, + "grad_norm": 0.13448363542556763, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 109750 + }, + { + "epoch": 0.417773650114568, + "grad_norm": 0.12333223223686218, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 109760 + }, + { + "epoch": 0.41781171258269073, + "grad_norm": 0.11951614171266556, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 109770 + }, + { + "epoch": 0.4178497750508134, + "grad_norm": 0.14794956147670746, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 109780 + }, + { + "epoch": 0.4178878375189361, + "grad_norm": 0.13375519216060638, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 109790 + }, + { + "epoch": 0.41792589998705876, + "grad_norm": 0.1377677023410797, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 109800 + }, + { + "epoch": 0.41796396245518147, + "grad_norm": 0.12577351927757263, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 109810 + }, + { + "epoch": 0.4180020249233041, + "grad_norm": 0.1207052618265152, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 109820 + }, + { + "epoch": 0.4180400873914268, + "grad_norm": 0.11211320012807846, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 109830 + }, + { + "epoch": 0.4180781498595495, + "grad_norm": 0.12020255625247955, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 109840 + }, + { + "epoch": 0.41811621232767215, + "grad_norm": 0.12813393771648407, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 109850 + }, + { + "epoch": 0.41815427479579487, + "grad_norm": 0.11809521913528442, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 109860 + }, + { + "epoch": 0.4181923372639175, + "grad_norm": 0.1264045387506485, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 109870 + }, + { + "epoch": 0.41823039973204024, + "grad_norm": 0.11605829745531082, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 109880 + }, + { + "epoch": 0.4182684622001629, + "grad_norm": 0.12971541285514832, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 109890 + }, + { + "epoch": 0.4183065246682856, + "grad_norm": 0.11807116121053696, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 109900 + }, + { + "epoch": 0.41834458713640826, + "grad_norm": 0.12423138320446014, + "learning_rate": 0.0005, + "loss": 2.1414, + "step": 109910 + }, + { + "epoch": 0.418382649604531, + "grad_norm": 0.12973003089427948, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 109920 + }, + { + "epoch": 0.41842071207265363, + "grad_norm": 0.1259157955646515, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 109930 + }, + { + "epoch": 0.41845877454077635, + "grad_norm": 0.1268201619386673, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 109940 + }, + { + "epoch": 0.418496837008899, + "grad_norm": 0.1224651113152504, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 109950 + }, + { + "epoch": 0.4185348994770217, + "grad_norm": 0.1276530921459198, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 109960 + }, + { + "epoch": 0.4185729619451444, + "grad_norm": 0.12536892294883728, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 109970 + }, + { + "epoch": 0.41861102441326703, + "grad_norm": 0.12223321944475174, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 109980 + }, + { + "epoch": 0.41864908688138974, + "grad_norm": 0.11520811915397644, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 109990 + }, + { + "epoch": 0.4186871493495124, + "grad_norm": 0.12322820723056793, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 110000 + }, + { + "epoch": 0.4187252118176351, + "grad_norm": 0.12412966787815094, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 110010 + }, + { + "epoch": 0.41876327428575777, + "grad_norm": 0.13063424825668335, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 110020 + }, + { + "epoch": 0.4188013367538805, + "grad_norm": 0.13508233428001404, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 110030 + }, + { + "epoch": 0.41883939922200314, + "grad_norm": 0.11833116412162781, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 110040 + }, + { + "epoch": 0.41887746169012585, + "grad_norm": 0.12042485922574997, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 110050 + }, + { + "epoch": 0.4189155241582485, + "grad_norm": 0.1296975016593933, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 110060 + }, + { + "epoch": 0.4189535866263712, + "grad_norm": 0.12232377380132675, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 110070 + }, + { + "epoch": 0.4189916490944939, + "grad_norm": 0.24385973811149597, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 110080 + }, + { + "epoch": 0.4190297115626166, + "grad_norm": 0.13045060634613037, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 110090 + }, + { + "epoch": 0.41906777403073925, + "grad_norm": 0.1269223541021347, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 110100 + }, + { + "epoch": 0.4191058364988619, + "grad_norm": 0.12370885163545609, + "learning_rate": 0.0005, + "loss": 2.1402, + "step": 110110 + }, + { + "epoch": 0.4191438989669846, + "grad_norm": 0.12285878509283066, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 110120 + }, + { + "epoch": 0.4191819614351073, + "grad_norm": 0.12880626320838928, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 110130 + }, + { + "epoch": 0.41922002390323, + "grad_norm": 0.1254742443561554, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 110140 + }, + { + "epoch": 0.41925808637135265, + "grad_norm": 0.14333873987197876, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 110150 + }, + { + "epoch": 0.41929614883947536, + "grad_norm": 0.13128043711185455, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 110160 + }, + { + "epoch": 0.419334211307598, + "grad_norm": 0.15221351385116577, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 110170 + }, + { + "epoch": 0.4193722737757207, + "grad_norm": 0.11888156831264496, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 110180 + }, + { + "epoch": 0.4194103362438434, + "grad_norm": 0.12805290520191193, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 110190 + }, + { + "epoch": 0.4194483987119661, + "grad_norm": 0.12789416313171387, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 110200 + }, + { + "epoch": 0.41948646118008875, + "grad_norm": 0.12878291308879852, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 110210 + }, + { + "epoch": 0.41952452364821147, + "grad_norm": 0.12528999149799347, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 110220 + }, + { + "epoch": 0.4195625861163341, + "grad_norm": 0.12756268680095673, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 110230 + }, + { + "epoch": 0.41960064858445684, + "grad_norm": 0.1297759711742401, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 110240 + }, + { + "epoch": 0.4196387110525795, + "grad_norm": 0.11695853620767593, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 110250 + }, + { + "epoch": 0.41967677352070215, + "grad_norm": 0.14145225286483765, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 110260 + }, + { + "epoch": 0.41971483598882486, + "grad_norm": 0.13454757630825043, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 110270 + }, + { + "epoch": 0.4197528984569475, + "grad_norm": 0.1235680803656578, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 110280 + }, + { + "epoch": 0.41979096092507023, + "grad_norm": 0.12550152838230133, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 110290 + }, + { + "epoch": 0.4198290233931929, + "grad_norm": 0.1230005994439125, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 110300 + }, + { + "epoch": 0.4198670858613156, + "grad_norm": 0.1317501813173294, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 110310 + }, + { + "epoch": 0.41990514832943826, + "grad_norm": 0.1260969638824463, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 110320 + }, + { + "epoch": 0.419943210797561, + "grad_norm": 0.1307699829339981, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 110330 + }, + { + "epoch": 0.41998127326568363, + "grad_norm": 0.12259736657142639, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 110340 + }, + { + "epoch": 0.42001933573380634, + "grad_norm": 0.12683376669883728, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 110350 + }, + { + "epoch": 0.420057398201929, + "grad_norm": 0.12672072649002075, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 110360 + }, + { + "epoch": 0.4200954606700517, + "grad_norm": 0.12184534966945648, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 110370 + }, + { + "epoch": 0.42013352313817437, + "grad_norm": 0.13169150054454803, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 110380 + }, + { + "epoch": 0.4201715856062971, + "grad_norm": 0.13503016531467438, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 110390 + }, + { + "epoch": 0.42020964807441974, + "grad_norm": 0.12989014387130737, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 110400 + }, + { + "epoch": 0.4202477105425424, + "grad_norm": 0.13313154876232147, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 110410 + }, + { + "epoch": 0.4202857730106651, + "grad_norm": 0.12498737126588821, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 110420 + }, + { + "epoch": 0.42032383547878777, + "grad_norm": 0.1535029113292694, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 110430 + }, + { + "epoch": 0.4203618979469105, + "grad_norm": 0.1109837144613266, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 110440 + }, + { + "epoch": 0.42039996041503314, + "grad_norm": 0.12009678035974503, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 110450 + }, + { + "epoch": 0.42043802288315585, + "grad_norm": 0.12859448790550232, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 110460 + }, + { + "epoch": 0.4204760853512785, + "grad_norm": 0.1498848795890808, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 110470 + }, + { + "epoch": 0.4205141478194012, + "grad_norm": 0.1277942806482315, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 110480 + }, + { + "epoch": 0.4205522102875239, + "grad_norm": 0.12816040217876434, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 110490 + }, + { + "epoch": 0.4205902727556466, + "grad_norm": 0.12099416553974152, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 110500 + }, + { + "epoch": 0.42062833522376925, + "grad_norm": 0.12932099401950836, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 110510 + }, + { + "epoch": 0.42066639769189196, + "grad_norm": 0.13165128231048584, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 110520 + }, + { + "epoch": 0.4207044601600146, + "grad_norm": 0.12216272950172424, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 110530 + }, + { + "epoch": 0.42074252262813727, + "grad_norm": 0.1164298951625824, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 110540 + }, + { + "epoch": 0.42078058509626, + "grad_norm": 0.12431590259075165, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 110550 + }, + { + "epoch": 0.42081864756438264, + "grad_norm": 0.1205281987786293, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 110560 + }, + { + "epoch": 0.42085671003250535, + "grad_norm": 0.12753494083881378, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 110570 + }, + { + "epoch": 0.420894772500628, + "grad_norm": 0.13406816124916077, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 110580 + }, + { + "epoch": 0.4209328349687507, + "grad_norm": 0.13061662018299103, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 110590 + }, + { + "epoch": 0.4209708974368734, + "grad_norm": 0.13684722781181335, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 110600 + }, + { + "epoch": 0.4210089599049961, + "grad_norm": 0.1287647932767868, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 110610 + }, + { + "epoch": 0.42104702237311875, + "grad_norm": 0.14672592282295227, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 110620 + }, + { + "epoch": 0.42108508484124146, + "grad_norm": 0.14222227036952972, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 110630 + }, + { + "epoch": 0.4211231473093641, + "grad_norm": 0.13052073121070862, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 110640 + }, + { + "epoch": 0.42116120977748683, + "grad_norm": 0.12732751667499542, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 110650 + }, + { + "epoch": 0.4211992722456095, + "grad_norm": 0.11703453212976456, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 110660 + }, + { + "epoch": 0.4212373347137322, + "grad_norm": 0.1263093650341034, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 110670 + }, + { + "epoch": 0.42127539718185486, + "grad_norm": 0.12344750761985779, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 110680 + }, + { + "epoch": 0.4213134596499775, + "grad_norm": 0.12565580010414124, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 110690 + }, + { + "epoch": 0.42135152211810023, + "grad_norm": 0.12912888824939728, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 110700 + }, + { + "epoch": 0.4213895845862229, + "grad_norm": 0.12103404849767685, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 110710 + }, + { + "epoch": 0.4214276470543456, + "grad_norm": 0.1245717853307724, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 110720 + }, + { + "epoch": 0.42146570952246826, + "grad_norm": 0.1308036893606186, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 110730 + }, + { + "epoch": 0.42150377199059097, + "grad_norm": 0.133273184299469, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 110740 + }, + { + "epoch": 0.4215418344587136, + "grad_norm": 0.11989910155534744, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 110750 + }, + { + "epoch": 0.42157989692683634, + "grad_norm": 0.12504975497722626, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 110760 + }, + { + "epoch": 0.421617959394959, + "grad_norm": 0.12130273878574371, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 110770 + }, + { + "epoch": 0.4216560218630817, + "grad_norm": 0.12154131382703781, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 110780 + }, + { + "epoch": 0.42169408433120437, + "grad_norm": 0.13687552511692047, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 110790 + }, + { + "epoch": 0.4217321467993271, + "grad_norm": 0.13189378380775452, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 110800 + }, + { + "epoch": 0.42177020926744974, + "grad_norm": 0.14396044611930847, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 110810 + }, + { + "epoch": 0.42180827173557245, + "grad_norm": 0.11544430255889893, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 110820 + }, + { + "epoch": 0.4218463342036951, + "grad_norm": 0.12114161998033524, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 110830 + }, + { + "epoch": 0.42188439667181776, + "grad_norm": 0.1264815628528595, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 110840 + }, + { + "epoch": 0.4219224591399405, + "grad_norm": 0.1236187294125557, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 110850 + }, + { + "epoch": 0.42196052160806313, + "grad_norm": 0.1219010129570961, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 110860 + }, + { + "epoch": 0.42199858407618585, + "grad_norm": 0.11491255462169647, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 110870 + }, + { + "epoch": 0.4220366465443085, + "grad_norm": 0.12548251450061798, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 110880 + }, + { + "epoch": 0.4220747090124312, + "grad_norm": 0.1220545768737793, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 110890 + }, + { + "epoch": 0.42211277148055387, + "grad_norm": 0.12204521894454956, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 110900 + }, + { + "epoch": 0.4221508339486766, + "grad_norm": 0.12067580968141556, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 110910 + }, + { + "epoch": 0.42218889641679924, + "grad_norm": 0.13050217926502228, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 110920 + }, + { + "epoch": 0.42222695888492195, + "grad_norm": 0.11881910264492035, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 110930 + }, + { + "epoch": 0.4222650213530446, + "grad_norm": 0.12050008773803711, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 110940 + }, + { + "epoch": 0.4223030838211673, + "grad_norm": 0.12418985366821289, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 110950 + }, + { + "epoch": 0.42234114628929, + "grad_norm": 0.13076156377792358, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 110960 + }, + { + "epoch": 0.42237920875741264, + "grad_norm": 0.12248754501342773, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 110970 + }, + { + "epoch": 0.42241727122553535, + "grad_norm": 0.12590964138507843, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 110980 + }, + { + "epoch": 0.422455333693658, + "grad_norm": 0.12941862642765045, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 110990 + }, + { + "epoch": 0.4224933961617807, + "grad_norm": 0.12657110393047333, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 111000 + }, + { + "epoch": 0.4225314586299034, + "grad_norm": 0.2467130869626999, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 111010 + }, + { + "epoch": 0.4225695210980261, + "grad_norm": 0.12490589171648026, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 111020 + }, + { + "epoch": 0.42260758356614875, + "grad_norm": 0.5077695250511169, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 111030 + }, + { + "epoch": 0.42264564603427146, + "grad_norm": 0.13496220111846924, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 111040 + }, + { + "epoch": 0.4226837085023941, + "grad_norm": 0.12376722693443298, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 111050 + }, + { + "epoch": 0.42272177097051683, + "grad_norm": 0.12206427752971649, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 111060 + }, + { + "epoch": 0.4227598334386395, + "grad_norm": 0.11902330070734024, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 111070 + }, + { + "epoch": 0.4227978959067622, + "grad_norm": 0.12036629021167755, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 111080 + }, + { + "epoch": 0.42283595837488486, + "grad_norm": 0.11524273455142975, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 111090 + }, + { + "epoch": 0.42287402084300757, + "grad_norm": 0.12553663551807404, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 111100 + }, + { + "epoch": 0.4229120833111302, + "grad_norm": 0.11619086563587189, + "learning_rate": 0.0005, + "loss": 2.1445, + "step": 111110 + }, + { + "epoch": 0.4229501457792529, + "grad_norm": 0.12655623257160187, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 111120 + }, + { + "epoch": 0.4229882082473756, + "grad_norm": 0.14359208941459656, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 111130 + }, + { + "epoch": 0.42302627071549825, + "grad_norm": 0.1263459175825119, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 111140 + }, + { + "epoch": 0.42306433318362097, + "grad_norm": 0.12120271474123001, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 111150 + }, + { + "epoch": 0.4231023956517436, + "grad_norm": 0.14535973966121674, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 111160 + }, + { + "epoch": 0.42314045811986634, + "grad_norm": 0.1274825483560562, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 111170 + }, + { + "epoch": 0.423178520587989, + "grad_norm": 0.1486794650554657, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 111180 + }, + { + "epoch": 0.4232165830561117, + "grad_norm": 0.13074061274528503, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 111190 + }, + { + "epoch": 0.42325464552423436, + "grad_norm": 0.12979604303836823, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 111200 + }, + { + "epoch": 0.4232927079923571, + "grad_norm": 0.1272452175617218, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 111210 + }, + { + "epoch": 0.42333077046047973, + "grad_norm": 0.13075511157512665, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 111220 + }, + { + "epoch": 0.42336883292860245, + "grad_norm": 0.13066555559635162, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 111230 + }, + { + "epoch": 0.4234068953967251, + "grad_norm": 0.11858844757080078, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 111240 + }, + { + "epoch": 0.4234449578648478, + "grad_norm": 0.14340876042842865, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 111250 + }, + { + "epoch": 0.4234830203329705, + "grad_norm": 0.13483618199825287, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 111260 + }, + { + "epoch": 0.42352108280109313, + "grad_norm": 0.12093952298164368, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 111270 + }, + { + "epoch": 0.42355914526921584, + "grad_norm": 0.13822363317012787, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 111280 + }, + { + "epoch": 0.4235972077373385, + "grad_norm": 0.120503731071949, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 111290 + }, + { + "epoch": 0.4236352702054612, + "grad_norm": 0.11571825295686722, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 111300 + }, + { + "epoch": 0.42367333267358387, + "grad_norm": 0.12174150347709656, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 111310 + }, + { + "epoch": 0.4237113951417066, + "grad_norm": 0.12075323611497879, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 111320 + }, + { + "epoch": 0.42374945760982924, + "grad_norm": 0.12751325964927673, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 111330 + }, + { + "epoch": 0.42378752007795195, + "grad_norm": 0.13146482408046722, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 111340 + }, + { + "epoch": 0.4238255825460746, + "grad_norm": 0.1132122352719307, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 111350 + }, + { + "epoch": 0.4238636450141973, + "grad_norm": 0.12749198079109192, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 111360 + }, + { + "epoch": 0.42390170748232, + "grad_norm": 0.14566144347190857, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 111370 + }, + { + "epoch": 0.4239397699504427, + "grad_norm": 0.13449816405773163, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 111380 + }, + { + "epoch": 0.42397783241856535, + "grad_norm": 0.1352243423461914, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 111390 + }, + { + "epoch": 0.424015894886688, + "grad_norm": 0.12569397687911987, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 111400 + }, + { + "epoch": 0.4240539573548107, + "grad_norm": 0.12069183588027954, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 111410 + }, + { + "epoch": 0.4240920198229334, + "grad_norm": 0.13708123564720154, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 111420 + }, + { + "epoch": 0.4241300822910561, + "grad_norm": 0.14026005566120148, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 111430 + }, + { + "epoch": 0.42416814475917874, + "grad_norm": 0.12508703768253326, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 111440 + }, + { + "epoch": 0.42420620722730146, + "grad_norm": 0.12320968508720398, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 111450 + }, + { + "epoch": 0.4242442696954241, + "grad_norm": 0.128920778632164, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 111460 + }, + { + "epoch": 0.4242823321635468, + "grad_norm": 0.13704611361026764, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 111470 + }, + { + "epoch": 0.4243203946316695, + "grad_norm": 0.12090374529361725, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 111480 + }, + { + "epoch": 0.4243584570997922, + "grad_norm": 0.1212400496006012, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 111490 + }, + { + "epoch": 0.42439651956791485, + "grad_norm": 0.13863784074783325, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 111500 + }, + { + "epoch": 0.42443458203603757, + "grad_norm": 0.13154999911785126, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 111510 + }, + { + "epoch": 0.4244726445041602, + "grad_norm": 0.12200278043746948, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 111520 + }, + { + "epoch": 0.42451070697228294, + "grad_norm": 0.11427143961191177, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 111530 + }, + { + "epoch": 0.4245487694404056, + "grad_norm": 0.12608174979686737, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 111540 + }, + { + "epoch": 0.42458683190852825, + "grad_norm": 0.11711114645004272, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 111550 + }, + { + "epoch": 0.42462489437665096, + "grad_norm": 0.13191458582878113, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 111560 + }, + { + "epoch": 0.4246629568447736, + "grad_norm": 0.13757798075675964, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 111570 + }, + { + "epoch": 0.42470101931289633, + "grad_norm": 0.13054348528385162, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 111580 + }, + { + "epoch": 0.424739081781019, + "grad_norm": 0.12154204398393631, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 111590 + }, + { + "epoch": 0.4247771442491417, + "grad_norm": 0.11809223145246506, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 111600 + }, + { + "epoch": 0.42481520671726436, + "grad_norm": 0.11923825740814209, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 111610 + }, + { + "epoch": 0.4248532691853871, + "grad_norm": 0.12535466253757477, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 111620 + }, + { + "epoch": 0.42489133165350973, + "grad_norm": 0.1255112588405609, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 111630 + }, + { + "epoch": 0.42492939412163244, + "grad_norm": 0.12606526911258698, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 111640 + }, + { + "epoch": 0.4249674565897551, + "grad_norm": 0.13942117989063263, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 111650 + }, + { + "epoch": 0.4250055190578778, + "grad_norm": 0.12549205124378204, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 111660 + }, + { + "epoch": 0.42504358152600047, + "grad_norm": 0.11477110534906387, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 111670 + }, + { + "epoch": 0.4250816439941232, + "grad_norm": 0.12400896847248077, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 111680 + }, + { + "epoch": 0.42511970646224584, + "grad_norm": 0.12146724760532379, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 111690 + }, + { + "epoch": 0.4251577689303685, + "grad_norm": 0.14244869351387024, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 111700 + }, + { + "epoch": 0.4251958313984912, + "grad_norm": 0.13024653494358063, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 111710 + }, + { + "epoch": 0.42523389386661387, + "grad_norm": 0.11714961379766464, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 111720 + }, + { + "epoch": 0.4252719563347366, + "grad_norm": 0.11902771890163422, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 111730 + }, + { + "epoch": 0.42531001880285924, + "grad_norm": 0.12405514717102051, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 111740 + }, + { + "epoch": 0.42534808127098195, + "grad_norm": 0.1149262934923172, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 111750 + }, + { + "epoch": 0.4253861437391046, + "grad_norm": 0.12680381536483765, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 111760 + }, + { + "epoch": 0.4254242062072273, + "grad_norm": 0.1344337910413742, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 111770 + }, + { + "epoch": 0.42546226867535, + "grad_norm": 0.12774929404258728, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 111780 + }, + { + "epoch": 0.4255003311434727, + "grad_norm": 0.11900809407234192, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 111790 + }, + { + "epoch": 0.42553839361159534, + "grad_norm": 0.13062550127506256, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 111800 + }, + { + "epoch": 0.42557645607971806, + "grad_norm": 0.1314554363489151, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 111810 + }, + { + "epoch": 0.4256145185478407, + "grad_norm": 0.12474439293146133, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 111820 + }, + { + "epoch": 0.42565258101596337, + "grad_norm": 0.11799319088459015, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 111830 + }, + { + "epoch": 0.4256906434840861, + "grad_norm": 0.13469941914081573, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 111840 + }, + { + "epoch": 0.42572870595220874, + "grad_norm": 0.13602536916732788, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 111850 + }, + { + "epoch": 0.42576676842033145, + "grad_norm": 0.13909238576889038, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 111860 + }, + { + "epoch": 0.4258048308884541, + "grad_norm": 0.11958032101392746, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 111870 + }, + { + "epoch": 0.4258428933565768, + "grad_norm": 0.11667779088020325, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 111880 + }, + { + "epoch": 0.4258809558246995, + "grad_norm": 0.13016003370285034, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 111890 + }, + { + "epoch": 0.4259190182928222, + "grad_norm": 0.12318616360425949, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 111900 + }, + { + "epoch": 0.42595708076094485, + "grad_norm": 0.13127361238002777, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 111910 + }, + { + "epoch": 0.42599514322906756, + "grad_norm": 0.1385347694158554, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 111920 + }, + { + "epoch": 0.4260332056971902, + "grad_norm": 0.11675745248794556, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 111930 + }, + { + "epoch": 0.42607126816531293, + "grad_norm": 0.1306353360414505, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 111940 + }, + { + "epoch": 0.4261093306334356, + "grad_norm": 0.1188969761133194, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 111950 + }, + { + "epoch": 0.4261473931015583, + "grad_norm": 0.12761636078357697, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 111960 + }, + { + "epoch": 0.42618545556968096, + "grad_norm": 0.1282307654619217, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 111970 + }, + { + "epoch": 0.4262235180378036, + "grad_norm": 0.12889324128627777, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 111980 + }, + { + "epoch": 0.42626158050592633, + "grad_norm": 0.12320785969495773, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 111990 + }, + { + "epoch": 0.426299642974049, + "grad_norm": 0.13121485710144043, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 112000 + }, + { + "epoch": 0.4263377054421717, + "grad_norm": 0.1245298683643341, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 112010 + }, + { + "epoch": 0.42637576791029436, + "grad_norm": 0.1187111884355545, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 112020 + }, + { + "epoch": 0.42641383037841707, + "grad_norm": 0.11888144165277481, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 112030 + }, + { + "epoch": 0.4264518928465397, + "grad_norm": 0.12159156799316406, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 112040 + }, + { + "epoch": 0.42648995531466244, + "grad_norm": 0.13763214647769928, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 112050 + }, + { + "epoch": 0.4265280177827851, + "grad_norm": 0.12213977426290512, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 112060 + }, + { + "epoch": 0.4265660802509078, + "grad_norm": 0.13264048099517822, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 112070 + }, + { + "epoch": 0.42660414271903047, + "grad_norm": 0.12648789584636688, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 112080 + }, + { + "epoch": 0.4266422051871532, + "grad_norm": 0.13669641315937042, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 112090 + }, + { + "epoch": 0.42668026765527584, + "grad_norm": 0.12726552784442902, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 112100 + }, + { + "epoch": 0.42671833012339855, + "grad_norm": 0.13499999046325684, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 112110 + }, + { + "epoch": 0.4267563925915212, + "grad_norm": 0.12138763815164566, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 112120 + }, + { + "epoch": 0.42679445505964386, + "grad_norm": 0.11880981177091599, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 112130 + }, + { + "epoch": 0.4268325175277666, + "grad_norm": 0.127463236451149, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 112140 + }, + { + "epoch": 0.42687057999588923, + "grad_norm": 0.13302980363368988, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 112150 + }, + { + "epoch": 0.42690864246401194, + "grad_norm": 0.14324358105659485, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 112160 + }, + { + "epoch": 0.4269467049321346, + "grad_norm": 0.1454227864742279, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 112170 + }, + { + "epoch": 0.4269847674002573, + "grad_norm": 0.12385271489620209, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 112180 + }, + { + "epoch": 0.42702282986837997, + "grad_norm": 0.12873312830924988, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 112190 + }, + { + "epoch": 0.4270608923365027, + "grad_norm": 0.11225327104330063, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 112200 + }, + { + "epoch": 0.42709895480462534, + "grad_norm": 0.12275451421737671, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 112210 + }, + { + "epoch": 0.42713701727274805, + "grad_norm": 0.12081906199455261, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 112220 + }, + { + "epoch": 0.4271750797408707, + "grad_norm": 0.14207085967063904, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 112230 + }, + { + "epoch": 0.4272131422089934, + "grad_norm": 0.12282105535268784, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 112240 + }, + { + "epoch": 0.4272512046771161, + "grad_norm": 0.1292695701122284, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 112250 + }, + { + "epoch": 0.4272892671452388, + "grad_norm": 0.12830670177936554, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 112260 + }, + { + "epoch": 0.42732732961336145, + "grad_norm": 0.12263115495443344, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 112270 + }, + { + "epoch": 0.4273653920814841, + "grad_norm": 0.12189716100692749, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 112280 + }, + { + "epoch": 0.4274034545496068, + "grad_norm": 0.12638132274150848, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 112290 + }, + { + "epoch": 0.4274415170177295, + "grad_norm": 0.12426239252090454, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 112300 + }, + { + "epoch": 0.4274795794858522, + "grad_norm": 0.1269400417804718, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 112310 + }, + { + "epoch": 0.42751764195397485, + "grad_norm": 0.12956169247627258, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 112320 + }, + { + "epoch": 0.42755570442209756, + "grad_norm": 0.14513267576694489, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 112330 + }, + { + "epoch": 0.4275937668902202, + "grad_norm": 0.12880150973796844, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 112340 + }, + { + "epoch": 0.42763182935834293, + "grad_norm": 0.1391003578901291, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 112350 + }, + { + "epoch": 0.4276698918264656, + "grad_norm": 0.12618103623390198, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 112360 + }, + { + "epoch": 0.4277079542945883, + "grad_norm": 0.1251031756401062, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 112370 + }, + { + "epoch": 0.42774601676271096, + "grad_norm": 0.11405282467603683, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 112380 + }, + { + "epoch": 0.42778407923083367, + "grad_norm": 0.13576087355613708, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 112390 + }, + { + "epoch": 0.4278221416989563, + "grad_norm": 0.1274125576019287, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 112400 + }, + { + "epoch": 0.427860204167079, + "grad_norm": 0.11096604913473129, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 112410 + }, + { + "epoch": 0.4278982666352017, + "grad_norm": 0.12114623188972473, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 112420 + }, + { + "epoch": 0.42793632910332435, + "grad_norm": 0.13127700984477997, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 112430 + }, + { + "epoch": 0.42797439157144707, + "grad_norm": 0.1115710511803627, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 112440 + }, + { + "epoch": 0.4280124540395697, + "grad_norm": 0.12019863724708557, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 112450 + }, + { + "epoch": 0.42805051650769244, + "grad_norm": 0.12405234575271606, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 112460 + }, + { + "epoch": 0.4280885789758151, + "grad_norm": 0.11588817089796066, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 112470 + }, + { + "epoch": 0.4281266414439378, + "grad_norm": 0.14037273824214935, + "learning_rate": 0.0005, + "loss": 2.1439, + "step": 112480 + }, + { + "epoch": 0.42816470391206046, + "grad_norm": 0.14530320465564728, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 112490 + }, + { + "epoch": 0.4282027663801832, + "grad_norm": 0.11581036448478699, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 112500 + }, + { + "epoch": 0.42824082884830583, + "grad_norm": 0.12013707309961319, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 112510 + }, + { + "epoch": 0.42827889131642854, + "grad_norm": 0.1147102639079094, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 112520 + }, + { + "epoch": 0.4283169537845512, + "grad_norm": 0.1331748068332672, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 112530 + }, + { + "epoch": 0.4283550162526739, + "grad_norm": 0.13831809163093567, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 112540 + }, + { + "epoch": 0.42839307872079657, + "grad_norm": 0.12723985314369202, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 112550 + }, + { + "epoch": 0.42843114118891923, + "grad_norm": 0.13498298823833466, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 112560 + }, + { + "epoch": 0.42846920365704194, + "grad_norm": 0.11712668091058731, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 112570 + }, + { + "epoch": 0.4285072661251646, + "grad_norm": 0.12820938229560852, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 112580 + }, + { + "epoch": 0.4285453285932873, + "grad_norm": 0.14569640159606934, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 112590 + }, + { + "epoch": 0.42858339106140997, + "grad_norm": 0.13953521847724915, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 112600 + }, + { + "epoch": 0.4286214535295327, + "grad_norm": 0.1234491690993309, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 112610 + }, + { + "epoch": 0.42865951599765534, + "grad_norm": 0.1317553073167801, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 112620 + }, + { + "epoch": 0.42869757846577805, + "grad_norm": 0.11450919508934021, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 112630 + }, + { + "epoch": 0.4287356409339007, + "grad_norm": 0.12273668497800827, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 112640 + }, + { + "epoch": 0.4287737034020234, + "grad_norm": 0.11965305358171463, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 112650 + }, + { + "epoch": 0.4288117658701461, + "grad_norm": 0.12610210478305817, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 112660 + }, + { + "epoch": 0.4288498283382688, + "grad_norm": 0.1309218555688858, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 112670 + }, + { + "epoch": 0.42888789080639145, + "grad_norm": 0.3877588212490082, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 112680 + }, + { + "epoch": 0.42892595327451416, + "grad_norm": 0.1310279667377472, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 112690 + }, + { + "epoch": 0.4289640157426368, + "grad_norm": 0.12395985424518585, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 112700 + }, + { + "epoch": 0.4290020782107595, + "grad_norm": 0.11569735407829285, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 112710 + }, + { + "epoch": 0.4290401406788822, + "grad_norm": 0.11640530824661255, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 112720 + }, + { + "epoch": 0.42907820314700484, + "grad_norm": 0.1238589659333229, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 112730 + }, + { + "epoch": 0.42911626561512756, + "grad_norm": 0.12123946845531464, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 112740 + }, + { + "epoch": 0.4291543280832502, + "grad_norm": 0.13304486870765686, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 112750 + }, + { + "epoch": 0.4291923905513729, + "grad_norm": 0.1193198636174202, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 112760 + }, + { + "epoch": 0.4292304530194956, + "grad_norm": 0.1310795098543167, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 112770 + }, + { + "epoch": 0.4292685154876183, + "grad_norm": 0.13365091383457184, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 112780 + }, + { + "epoch": 0.42930657795574095, + "grad_norm": 0.12833775579929352, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 112790 + }, + { + "epoch": 0.42934464042386367, + "grad_norm": 0.14498308300971985, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 112800 + }, + { + "epoch": 0.4293827028919863, + "grad_norm": 0.12884864211082458, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 112810 + }, + { + "epoch": 0.42942076536010904, + "grad_norm": 0.1267465502023697, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 112820 + }, + { + "epoch": 0.4294588278282317, + "grad_norm": 0.12270502001047134, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 112830 + }, + { + "epoch": 0.42949689029635435, + "grad_norm": 0.13606403768062592, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 112840 + }, + { + "epoch": 0.42953495276447706, + "grad_norm": 0.12270118296146393, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 112850 + }, + { + "epoch": 0.4295730152325997, + "grad_norm": 0.14218850433826447, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 112860 + }, + { + "epoch": 0.42961107770072243, + "grad_norm": 0.12145689129829407, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 112870 + }, + { + "epoch": 0.4296491401688451, + "grad_norm": 0.12895521521568298, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 112880 + }, + { + "epoch": 0.4296872026369678, + "grad_norm": 0.12357887625694275, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 112890 + }, + { + "epoch": 0.42972526510509046, + "grad_norm": 0.13553652167320251, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 112900 + }, + { + "epoch": 0.42976332757321317, + "grad_norm": 0.12917789816856384, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 112910 + }, + { + "epoch": 0.42980139004133583, + "grad_norm": 0.11609556525945663, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 112920 + }, + { + "epoch": 0.42983945250945854, + "grad_norm": 0.1176832914352417, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 112930 + }, + { + "epoch": 0.4298775149775812, + "grad_norm": 0.1269991546869278, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 112940 + }, + { + "epoch": 0.4299155774457039, + "grad_norm": 0.13056680560112, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 112950 + }, + { + "epoch": 0.42995363991382657, + "grad_norm": 0.15302568674087524, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 112960 + }, + { + "epoch": 0.4299917023819493, + "grad_norm": 0.1258421242237091, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 112970 + }, + { + "epoch": 0.43002976485007194, + "grad_norm": 0.1366618275642395, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 112980 + }, + { + "epoch": 0.4300678273181946, + "grad_norm": 0.15595024824142456, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 112990 + }, + { + "epoch": 0.4301058897863173, + "grad_norm": 0.12370587140321732, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 113000 + }, + { + "epoch": 0.43014395225443997, + "grad_norm": 0.12987452745437622, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 113010 + }, + { + "epoch": 0.4301820147225627, + "grad_norm": 0.11600293964147568, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 113020 + }, + { + "epoch": 0.43022007719068533, + "grad_norm": 0.13424547016620636, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 113030 + }, + { + "epoch": 0.43025813965880805, + "grad_norm": 0.1201629564166069, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 113040 + }, + { + "epoch": 0.4302962021269307, + "grad_norm": 0.12029099464416504, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 113050 + }, + { + "epoch": 0.4303342645950534, + "grad_norm": 0.11914312094449997, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 113060 + }, + { + "epoch": 0.4303723270631761, + "grad_norm": 0.12347181886434555, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 113070 + }, + { + "epoch": 0.4304103895312988, + "grad_norm": 0.12621614336967468, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 113080 + }, + { + "epoch": 0.43044845199942144, + "grad_norm": 0.12609432637691498, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 113090 + }, + { + "epoch": 0.43048651446754416, + "grad_norm": 0.13581635057926178, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 113100 + }, + { + "epoch": 0.4305245769356668, + "grad_norm": 0.12690937519073486, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 113110 + }, + { + "epoch": 0.4305626394037895, + "grad_norm": 0.1215238943696022, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 113120 + }, + { + "epoch": 0.4306007018719122, + "grad_norm": 0.12563979625701904, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 113130 + }, + { + "epoch": 0.43063876434003484, + "grad_norm": 0.11845796555280685, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 113140 + }, + { + "epoch": 0.43067682680815755, + "grad_norm": 0.11353427916765213, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 113150 + }, + { + "epoch": 0.4307148892762802, + "grad_norm": 0.1303396373987198, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 113160 + }, + { + "epoch": 0.4307529517444029, + "grad_norm": 0.1315041482448578, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 113170 + }, + { + "epoch": 0.4307910142125256, + "grad_norm": 0.12611375749111176, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 113180 + }, + { + "epoch": 0.4308290766806483, + "grad_norm": 0.12498685717582703, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 113190 + }, + { + "epoch": 0.43086713914877095, + "grad_norm": 0.12216173857450485, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 113200 + }, + { + "epoch": 0.43090520161689366, + "grad_norm": 0.1408940851688385, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 113210 + }, + { + "epoch": 0.4309432640850163, + "grad_norm": 0.1221887543797493, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 113220 + }, + { + "epoch": 0.43098132655313903, + "grad_norm": 0.1298237144947052, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 113230 + }, + { + "epoch": 0.4310193890212617, + "grad_norm": 0.13541021943092346, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 113240 + }, + { + "epoch": 0.4310574514893844, + "grad_norm": 0.13242541253566742, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 113250 + }, + { + "epoch": 0.43109551395750706, + "grad_norm": 0.161054328083992, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 113260 + }, + { + "epoch": 0.4311335764256297, + "grad_norm": 0.12451450526714325, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 113270 + }, + { + "epoch": 0.43117163889375243, + "grad_norm": 0.1456080824136734, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 113280 + }, + { + "epoch": 0.4312097013618751, + "grad_norm": 0.13522973656654358, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 113290 + }, + { + "epoch": 0.4312477638299978, + "grad_norm": 0.13681624829769135, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 113300 + }, + { + "epoch": 0.43128582629812046, + "grad_norm": 0.12967310845851898, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 113310 + }, + { + "epoch": 0.43132388876624317, + "grad_norm": 0.1302393525838852, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 113320 + }, + { + "epoch": 0.4313619512343658, + "grad_norm": 0.12448181957006454, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 113330 + }, + { + "epoch": 0.43140001370248854, + "grad_norm": 0.12567047774791718, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 113340 + }, + { + "epoch": 0.4314380761706112, + "grad_norm": 0.13223430514335632, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 113350 + }, + { + "epoch": 0.4314761386387339, + "grad_norm": 0.12238463014364243, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 113360 + }, + { + "epoch": 0.43151420110685657, + "grad_norm": 0.1190914511680603, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 113370 + }, + { + "epoch": 0.4315522635749793, + "grad_norm": 0.13166844844818115, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 113380 + }, + { + "epoch": 0.43159032604310194, + "grad_norm": 0.12852655351161957, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 113390 + }, + { + "epoch": 0.43162838851122465, + "grad_norm": 0.11806194484233856, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 113400 + }, + { + "epoch": 0.4316664509793473, + "grad_norm": 0.12974756956100464, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 113410 + }, + { + "epoch": 0.43170451344746996, + "grad_norm": 0.1320747286081314, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 113420 + }, + { + "epoch": 0.4317425759155927, + "grad_norm": 0.11793147027492523, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 113430 + }, + { + "epoch": 0.43178063838371533, + "grad_norm": 0.12105303257703781, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 113440 + }, + { + "epoch": 0.43181870085183804, + "grad_norm": 0.1268729716539383, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 113450 + }, + { + "epoch": 0.4318567633199607, + "grad_norm": 0.1271156668663025, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 113460 + }, + { + "epoch": 0.4318948257880834, + "grad_norm": 0.13248351216316223, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 113470 + }, + { + "epoch": 0.43193288825620607, + "grad_norm": 0.13424064218997955, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 113480 + }, + { + "epoch": 0.4319709507243288, + "grad_norm": 0.13179951906204224, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 113490 + }, + { + "epoch": 0.43200901319245144, + "grad_norm": 0.11430752277374268, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 113500 + }, + { + "epoch": 0.43204707566057415, + "grad_norm": 0.12928007543087006, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 113510 + }, + { + "epoch": 0.4320851381286968, + "grad_norm": 0.13041894137859344, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 113520 + }, + { + "epoch": 0.4321232005968195, + "grad_norm": 0.12620759010314941, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 113530 + }, + { + "epoch": 0.4321612630649422, + "grad_norm": 0.12515173852443695, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 113540 + }, + { + "epoch": 0.4321993255330649, + "grad_norm": 0.11779354512691498, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 113550 + }, + { + "epoch": 0.43223738800118755, + "grad_norm": 0.11467306315898895, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 113560 + }, + { + "epoch": 0.4322754504693102, + "grad_norm": 0.13338513672351837, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 113570 + }, + { + "epoch": 0.4323135129374329, + "grad_norm": 0.11753935366868973, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 113580 + }, + { + "epoch": 0.4323515754055556, + "grad_norm": 0.11876005679368973, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 113590 + }, + { + "epoch": 0.4323896378736783, + "grad_norm": 0.14191961288452148, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 113600 + }, + { + "epoch": 0.43242770034180095, + "grad_norm": 0.12320465594530106, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 113610 + }, + { + "epoch": 0.43246576280992366, + "grad_norm": 0.12368667870759964, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 113620 + }, + { + "epoch": 0.4325038252780463, + "grad_norm": 0.1309211701154709, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 113630 + }, + { + "epoch": 0.43254188774616903, + "grad_norm": 0.13724461197853088, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 113640 + }, + { + "epoch": 0.4325799502142917, + "grad_norm": 0.12140762060880661, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 113650 + }, + { + "epoch": 0.4326180126824144, + "grad_norm": 0.12964437901973724, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 113660 + }, + { + "epoch": 0.43265607515053706, + "grad_norm": 0.14219939708709717, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 113670 + }, + { + "epoch": 0.43269413761865977, + "grad_norm": 0.1289874166250229, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 113680 + }, + { + "epoch": 0.4327322000867824, + "grad_norm": 0.12109565734863281, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 113690 + }, + { + "epoch": 0.4327702625549051, + "grad_norm": 0.12907147407531738, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 113700 + }, + { + "epoch": 0.4328083250230278, + "grad_norm": 0.12966330349445343, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 113710 + }, + { + "epoch": 0.43284638749115045, + "grad_norm": 0.1305544078350067, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 113720 + }, + { + "epoch": 0.43288444995927317, + "grad_norm": 0.1428816020488739, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 113730 + }, + { + "epoch": 0.4329225124273958, + "grad_norm": 0.12114094942808151, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 113740 + }, + { + "epoch": 0.43296057489551854, + "grad_norm": 0.14122043550014496, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 113750 + }, + { + "epoch": 0.4329986373636412, + "grad_norm": 0.13917775452136993, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 113760 + }, + { + "epoch": 0.4330366998317639, + "grad_norm": 0.1277030110359192, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 113770 + }, + { + "epoch": 0.43307476229988656, + "grad_norm": 0.14055971801280975, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 113780 + }, + { + "epoch": 0.4331128247680093, + "grad_norm": 0.1363455355167389, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 113790 + }, + { + "epoch": 0.43315088723613193, + "grad_norm": 0.12260470539331436, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 113800 + }, + { + "epoch": 0.43318894970425464, + "grad_norm": 0.13288481533527374, + "learning_rate": 0.0005, + "loss": 2.1418, + "step": 113810 + }, + { + "epoch": 0.4332270121723773, + "grad_norm": 0.1359279304742813, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 113820 + }, + { + "epoch": 0.4332650746405, + "grad_norm": 0.14037518203258514, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 113830 + }, + { + "epoch": 0.43330313710862267, + "grad_norm": 0.1292182356119156, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 113840 + }, + { + "epoch": 0.43334119957674533, + "grad_norm": 0.12281746417284012, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 113850 + }, + { + "epoch": 0.43337926204486804, + "grad_norm": 0.12527795135974884, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 113860 + }, + { + "epoch": 0.4334173245129907, + "grad_norm": 0.12615898251533508, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 113870 + }, + { + "epoch": 0.4334553869811134, + "grad_norm": 0.13604030013084412, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 113880 + }, + { + "epoch": 0.43349344944923607, + "grad_norm": 0.12271784245967865, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 113890 + }, + { + "epoch": 0.4335315119173588, + "grad_norm": 0.12712498009204865, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 113900 + }, + { + "epoch": 0.43356957438548144, + "grad_norm": 0.11638306826353073, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 113910 + }, + { + "epoch": 0.43360763685360415, + "grad_norm": 0.12177691608667374, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 113920 + }, + { + "epoch": 0.4336456993217268, + "grad_norm": 0.13364951312541962, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 113930 + }, + { + "epoch": 0.4336837617898495, + "grad_norm": 0.12290096282958984, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 113940 + }, + { + "epoch": 0.4337218242579722, + "grad_norm": 0.12592563033103943, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 113950 + }, + { + "epoch": 0.4337598867260949, + "grad_norm": 0.12498262524604797, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 113960 + }, + { + "epoch": 0.43379794919421755, + "grad_norm": 0.15119707584381104, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 113970 + }, + { + "epoch": 0.43383601166234026, + "grad_norm": 0.13209018111228943, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 113980 + }, + { + "epoch": 0.4338740741304629, + "grad_norm": 0.13218539953231812, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 113990 + }, + { + "epoch": 0.4339121365985856, + "grad_norm": 0.1241869181394577, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 114000 + }, + { + "epoch": 0.4339501990667083, + "grad_norm": 0.12801161408424377, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 114010 + }, + { + "epoch": 0.43398826153483094, + "grad_norm": 0.1200789138674736, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 114020 + }, + { + "epoch": 0.43402632400295366, + "grad_norm": 0.12127118557691574, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 114030 + }, + { + "epoch": 0.4340643864710763, + "grad_norm": 0.1316060721874237, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 114040 + }, + { + "epoch": 0.434102448939199, + "grad_norm": 0.137965127825737, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 114050 + }, + { + "epoch": 0.4341405114073217, + "grad_norm": 0.12087521702051163, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 114060 + }, + { + "epoch": 0.4341785738754444, + "grad_norm": 0.14593859016895294, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 114070 + }, + { + "epoch": 0.43421663634356705, + "grad_norm": 0.12967590987682343, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 114080 + }, + { + "epoch": 0.43425469881168977, + "grad_norm": 0.1307557374238968, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 114090 + }, + { + "epoch": 0.4342927612798124, + "grad_norm": 0.1126394271850586, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 114100 + }, + { + "epoch": 0.43433082374793514, + "grad_norm": 0.13772854208946228, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 114110 + }, + { + "epoch": 0.4343688862160578, + "grad_norm": 0.11546389758586884, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 114120 + }, + { + "epoch": 0.43440694868418045, + "grad_norm": 0.12694381177425385, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 114130 + }, + { + "epoch": 0.43444501115230316, + "grad_norm": 0.1779816597700119, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 114140 + }, + { + "epoch": 0.4344830736204258, + "grad_norm": 0.13322043418884277, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 114150 + }, + { + "epoch": 0.43452113608854853, + "grad_norm": 0.13278986513614655, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 114160 + }, + { + "epoch": 0.4345591985566712, + "grad_norm": 0.13025440275669098, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 114170 + }, + { + "epoch": 0.4345972610247939, + "grad_norm": 0.12454594671726227, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 114180 + }, + { + "epoch": 0.43463532349291656, + "grad_norm": 0.12686704099178314, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 114190 + }, + { + "epoch": 0.43467338596103927, + "grad_norm": 0.11205738037824631, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 114200 + }, + { + "epoch": 0.43471144842916193, + "grad_norm": 0.1268397867679596, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 114210 + }, + { + "epoch": 0.43474951089728464, + "grad_norm": 0.1218252032995224, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 114220 + }, + { + "epoch": 0.4347875733654073, + "grad_norm": 0.12137608975172043, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 114230 + }, + { + "epoch": 0.43482563583353, + "grad_norm": 0.11886157840490341, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 114240 + }, + { + "epoch": 0.43486369830165267, + "grad_norm": 0.11996449530124664, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 114250 + }, + { + "epoch": 0.4349017607697754, + "grad_norm": 0.13811056315898895, + "learning_rate": 0.0005, + "loss": 2.1339, + "step": 114260 + }, + { + "epoch": 0.43493982323789804, + "grad_norm": 0.11954815685749054, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 114270 + }, + { + "epoch": 0.4349778857060207, + "grad_norm": 0.1311139613389969, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 114280 + }, + { + "epoch": 0.4350159481741434, + "grad_norm": 0.11974254250526428, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 114290 + }, + { + "epoch": 0.43505401064226606, + "grad_norm": 0.13784751296043396, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 114300 + }, + { + "epoch": 0.4350920731103888, + "grad_norm": 0.13397420942783356, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 114310 + }, + { + "epoch": 0.43513013557851143, + "grad_norm": 0.11783301830291748, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 114320 + }, + { + "epoch": 0.43516819804663415, + "grad_norm": 0.12785419821739197, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 114330 + }, + { + "epoch": 0.4352062605147568, + "grad_norm": 0.1297289878129959, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 114340 + }, + { + "epoch": 0.4352443229828795, + "grad_norm": 0.1216789036989212, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 114350 + }, + { + "epoch": 0.4352823854510022, + "grad_norm": 0.1163412556052208, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 114360 + }, + { + "epoch": 0.4353204479191249, + "grad_norm": 0.13293077051639557, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 114370 + }, + { + "epoch": 0.43535851038724754, + "grad_norm": 0.12136878818273544, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 114380 + }, + { + "epoch": 0.43539657285537026, + "grad_norm": 0.1324099898338318, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 114390 + }, + { + "epoch": 0.4354346353234929, + "grad_norm": 0.1286437064409256, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 114400 + }, + { + "epoch": 0.4354726977916156, + "grad_norm": 0.13256339728832245, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 114410 + }, + { + "epoch": 0.4355107602597383, + "grad_norm": 0.1252366006374359, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 114420 + }, + { + "epoch": 0.43554882272786094, + "grad_norm": 0.13053369522094727, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 114430 + }, + { + "epoch": 0.43558688519598365, + "grad_norm": 0.12626564502716064, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 114440 + }, + { + "epoch": 0.4356249476641063, + "grad_norm": 0.12247197329998016, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 114450 + }, + { + "epoch": 0.435663010132229, + "grad_norm": 0.115814208984375, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 114460 + }, + { + "epoch": 0.4357010726003517, + "grad_norm": 0.12693926692008972, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 114470 + }, + { + "epoch": 0.4357391350684744, + "grad_norm": 0.12500695884227753, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 114480 + }, + { + "epoch": 0.43577719753659705, + "grad_norm": 0.1235467940568924, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 114490 + }, + { + "epoch": 0.43581526000471976, + "grad_norm": 0.12184032797813416, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 114500 + }, + { + "epoch": 0.4358533224728424, + "grad_norm": 0.12942011654376984, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 114510 + }, + { + "epoch": 0.43589138494096513, + "grad_norm": 0.13408392667770386, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 114520 + }, + { + "epoch": 0.4359294474090878, + "grad_norm": 0.12868361175060272, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 114530 + }, + { + "epoch": 0.4359675098772105, + "grad_norm": 0.13124553859233856, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 114540 + }, + { + "epoch": 0.43600557234533316, + "grad_norm": 0.1267099529504776, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 114550 + }, + { + "epoch": 0.4360436348134558, + "grad_norm": 0.12259211391210556, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 114560 + }, + { + "epoch": 0.43608169728157853, + "grad_norm": 0.11855865269899368, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 114570 + }, + { + "epoch": 0.4361197597497012, + "grad_norm": 0.12500205636024475, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 114580 + }, + { + "epoch": 0.4361578222178239, + "grad_norm": 0.12759044766426086, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 114590 + }, + { + "epoch": 0.43619588468594656, + "grad_norm": 0.11896944046020508, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 114600 + }, + { + "epoch": 0.43623394715406927, + "grad_norm": 0.12041134387254715, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 114610 + }, + { + "epoch": 0.4362720096221919, + "grad_norm": 0.12240686267614365, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 114620 + }, + { + "epoch": 0.43631007209031464, + "grad_norm": 0.11856778711080551, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 114630 + }, + { + "epoch": 0.4363481345584373, + "grad_norm": 0.1220201849937439, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 114640 + }, + { + "epoch": 0.43638619702656, + "grad_norm": 0.1374635100364685, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 114650 + }, + { + "epoch": 0.43642425949468266, + "grad_norm": 0.13042598962783813, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 114660 + }, + { + "epoch": 0.4364623219628054, + "grad_norm": 0.11881065368652344, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 114670 + }, + { + "epoch": 0.43650038443092803, + "grad_norm": 0.12820273637771606, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 114680 + }, + { + "epoch": 0.43653844689905075, + "grad_norm": 0.12528343498706818, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 114690 + }, + { + "epoch": 0.4365765093671734, + "grad_norm": 0.12068533897399902, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 114700 + }, + { + "epoch": 0.43661457183529606, + "grad_norm": 0.10928542166948318, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 114710 + }, + { + "epoch": 0.4366526343034188, + "grad_norm": 0.1172463670372963, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 114720 + }, + { + "epoch": 0.43669069677154143, + "grad_norm": 0.13667593896389008, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 114730 + }, + { + "epoch": 0.43672875923966414, + "grad_norm": 0.12498161941766739, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 114740 + }, + { + "epoch": 0.4367668217077868, + "grad_norm": 0.1113005205988884, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 114750 + }, + { + "epoch": 0.4368048841759095, + "grad_norm": 0.13782349228858948, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 114760 + }, + { + "epoch": 0.43684294664403217, + "grad_norm": 0.11374063789844513, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 114770 + }, + { + "epoch": 0.4368810091121549, + "grad_norm": 0.114725261926651, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 114780 + }, + { + "epoch": 0.43691907158027754, + "grad_norm": 0.13262836635112762, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 114790 + }, + { + "epoch": 0.43695713404840025, + "grad_norm": 0.11711817979812622, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 114800 + }, + { + "epoch": 0.4369951965165229, + "grad_norm": 0.30082806944847107, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 114810 + }, + { + "epoch": 0.4370332589846456, + "grad_norm": 0.12366364896297455, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 114820 + }, + { + "epoch": 0.4370713214527683, + "grad_norm": 0.13910606503486633, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 114830 + }, + { + "epoch": 0.437109383920891, + "grad_norm": 0.1297772377729416, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 114840 + }, + { + "epoch": 0.43714744638901365, + "grad_norm": 0.1312246471643448, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 114850 + }, + { + "epoch": 0.4371855088571363, + "grad_norm": 0.12575647234916687, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 114860 + }, + { + "epoch": 0.437223571325259, + "grad_norm": 0.11815674602985382, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 114870 + }, + { + "epoch": 0.4372616337933817, + "grad_norm": 0.13090112805366516, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 114880 + }, + { + "epoch": 0.4372996962615044, + "grad_norm": 0.1162065863609314, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 114890 + }, + { + "epoch": 0.43733775872962705, + "grad_norm": 0.12324568629264832, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 114900 + }, + { + "epoch": 0.43737582119774976, + "grad_norm": 0.11646682769060135, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 114910 + }, + { + "epoch": 0.4374138836658724, + "grad_norm": 0.13061708211898804, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 114920 + }, + { + "epoch": 0.43745194613399513, + "grad_norm": 0.138249471783638, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 114930 + }, + { + "epoch": 0.4374900086021178, + "grad_norm": 0.11456823348999023, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 114940 + }, + { + "epoch": 0.4375280710702405, + "grad_norm": 0.13056766986846924, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 114950 + }, + { + "epoch": 0.43756613353836316, + "grad_norm": 0.1262996345758438, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 114960 + }, + { + "epoch": 0.43760419600648587, + "grad_norm": 0.12462644279003143, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 114970 + }, + { + "epoch": 0.4376422584746085, + "grad_norm": 0.11949781328439713, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 114980 + }, + { + "epoch": 0.43768032094273124, + "grad_norm": 0.12507055699825287, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 114990 + }, + { + "epoch": 0.4377183834108539, + "grad_norm": 0.13478222489356995, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 115000 + }, + { + "epoch": 0.43775644587897655, + "grad_norm": 0.12094340473413467, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 115010 + }, + { + "epoch": 0.43779450834709926, + "grad_norm": 0.12723763287067413, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 115020 + }, + { + "epoch": 0.4378325708152219, + "grad_norm": 0.130209282040596, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 115030 + }, + { + "epoch": 0.43787063328334463, + "grad_norm": 0.2890703082084656, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 115040 + }, + { + "epoch": 0.4379086957514673, + "grad_norm": 0.13002605736255646, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 115050 + }, + { + "epoch": 0.43794675821959, + "grad_norm": 0.1323857605457306, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 115060 + }, + { + "epoch": 0.43798482068771266, + "grad_norm": 0.12515832483768463, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 115070 + }, + { + "epoch": 0.4380228831558354, + "grad_norm": 0.12032574415206909, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 115080 + }, + { + "epoch": 0.43806094562395803, + "grad_norm": 0.12321025133132935, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 115090 + }, + { + "epoch": 0.43809900809208074, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 115100 + }, + { + "epoch": 0.4381370705602034, + "grad_norm": 0.1492929458618164, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 115110 + }, + { + "epoch": 0.4381751330283261, + "grad_norm": 0.12361513078212738, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 115120 + }, + { + "epoch": 0.43821319549644877, + "grad_norm": 0.1414598673582077, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 115130 + }, + { + "epoch": 0.43825125796457143, + "grad_norm": 0.12446663528680801, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 115140 + }, + { + "epoch": 0.43828932043269414, + "grad_norm": 0.13060636818408966, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 115150 + }, + { + "epoch": 0.4383273829008168, + "grad_norm": 0.12709937989711761, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 115160 + }, + { + "epoch": 0.4383654453689395, + "grad_norm": 0.12051993608474731, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 115170 + }, + { + "epoch": 0.43840350783706217, + "grad_norm": 0.12320751696825027, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 115180 + }, + { + "epoch": 0.4384415703051849, + "grad_norm": 0.12034042924642563, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 115190 + }, + { + "epoch": 0.43847963277330754, + "grad_norm": 0.11698071658611298, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 115200 + }, + { + "epoch": 0.43851769524143025, + "grad_norm": 0.12866592407226562, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 115210 + }, + { + "epoch": 0.4385557577095529, + "grad_norm": 0.13302206993103027, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 115220 + }, + { + "epoch": 0.4385938201776756, + "grad_norm": 0.13836079835891724, + "learning_rate": 0.0005, + "loss": 2.14, + "step": 115230 + }, + { + "epoch": 0.4386318826457983, + "grad_norm": 0.11796905100345612, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 115240 + }, + { + "epoch": 0.438669945113921, + "grad_norm": 0.12501636147499084, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 115250 + }, + { + "epoch": 0.43870800758204365, + "grad_norm": 0.13494443893432617, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 115260 + }, + { + "epoch": 0.43874607005016636, + "grad_norm": 0.12201546132564545, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 115270 + }, + { + "epoch": 0.438784132518289, + "grad_norm": 0.12451411038637161, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 115280 + }, + { + "epoch": 0.4388221949864117, + "grad_norm": 0.12425916641950607, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 115290 + }, + { + "epoch": 0.4388602574545344, + "grad_norm": 0.12091124802827835, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 115300 + }, + { + "epoch": 0.43889831992265704, + "grad_norm": 0.13333760201931, + "learning_rate": 0.0005, + "loss": 2.1413, + "step": 115310 + }, + { + "epoch": 0.43893638239077976, + "grad_norm": 0.13249099254608154, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 115320 + }, + { + "epoch": 0.4389744448589024, + "grad_norm": 0.11971089243888855, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 115330 + }, + { + "epoch": 0.4390125073270251, + "grad_norm": 0.12167433649301529, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 115340 + }, + { + "epoch": 0.4390505697951478, + "grad_norm": 0.12018298357725143, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 115350 + }, + { + "epoch": 0.4390886322632705, + "grad_norm": 0.13596774637699127, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 115360 + }, + { + "epoch": 0.43912669473139315, + "grad_norm": 0.11545957624912262, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 115370 + }, + { + "epoch": 0.43916475719951586, + "grad_norm": 0.11990311741828918, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 115380 + }, + { + "epoch": 0.4392028196676385, + "grad_norm": 0.1435178518295288, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 115390 + }, + { + "epoch": 0.43924088213576123, + "grad_norm": 0.13450974225997925, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 115400 + }, + { + "epoch": 0.4392789446038839, + "grad_norm": 0.13755057752132416, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 115410 + }, + { + "epoch": 0.4393170070720066, + "grad_norm": 0.13020038604736328, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 115420 + }, + { + "epoch": 0.43935506954012926, + "grad_norm": 0.12308257818222046, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 115430 + }, + { + "epoch": 0.4393931320082519, + "grad_norm": 0.12247392535209656, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 115440 + }, + { + "epoch": 0.43943119447637463, + "grad_norm": 0.12096204608678818, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 115450 + }, + { + "epoch": 0.4394692569444973, + "grad_norm": 0.12142517417669296, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 115460 + }, + { + "epoch": 0.43950731941262, + "grad_norm": 0.12045764923095703, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 115470 + }, + { + "epoch": 0.43954538188074266, + "grad_norm": 0.12330371886491776, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 115480 + }, + { + "epoch": 0.43958344434886537, + "grad_norm": 0.12864790856838226, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 115490 + }, + { + "epoch": 0.43962150681698803, + "grad_norm": 0.12794655561447144, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 115500 + }, + { + "epoch": 0.43965956928511074, + "grad_norm": 0.14096996188163757, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 115510 + }, + { + "epoch": 0.4396976317532334, + "grad_norm": 0.11860933899879456, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 115520 + }, + { + "epoch": 0.4397356942213561, + "grad_norm": 0.11834219098091125, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 115530 + }, + { + "epoch": 0.43977375668947877, + "grad_norm": 0.13245339691638947, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 115540 + }, + { + "epoch": 0.4398118191576015, + "grad_norm": 0.12836237251758575, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 115550 + }, + { + "epoch": 0.43984988162572414, + "grad_norm": 0.13424669206142426, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 115560 + }, + { + "epoch": 0.4398879440938468, + "grad_norm": 0.13692979514598846, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 115570 + }, + { + "epoch": 0.4399260065619695, + "grad_norm": 0.1289914846420288, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 115580 + }, + { + "epoch": 0.43996406903009216, + "grad_norm": 0.11068768054246902, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 115590 + }, + { + "epoch": 0.4400021314982149, + "grad_norm": 0.12041833996772766, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 115600 + }, + { + "epoch": 0.44004019396633753, + "grad_norm": 0.11720996350049973, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 115610 + }, + { + "epoch": 0.44007825643446025, + "grad_norm": 0.12740816175937653, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 115620 + }, + { + "epoch": 0.4401163189025829, + "grad_norm": 0.13115811347961426, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 115630 + }, + { + "epoch": 0.4401543813707056, + "grad_norm": 0.13463477790355682, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 115640 + }, + { + "epoch": 0.4401924438388283, + "grad_norm": 0.1432303637266159, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 115650 + }, + { + "epoch": 0.440230506306951, + "grad_norm": 0.12418785691261292, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 115660 + }, + { + "epoch": 0.44026856877507364, + "grad_norm": 0.13557015359401703, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 115670 + }, + { + "epoch": 0.44030663124319636, + "grad_norm": 0.132261261343956, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 115680 + }, + { + "epoch": 0.440344693711319, + "grad_norm": 0.1169886440038681, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 115690 + }, + { + "epoch": 0.4403827561794417, + "grad_norm": 0.14475859701633453, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 115700 + }, + { + "epoch": 0.4404208186475644, + "grad_norm": 0.13027706742286682, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 115710 + }, + { + "epoch": 0.44045888111568704, + "grad_norm": 0.11329112946987152, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 115720 + }, + { + "epoch": 0.44049694358380975, + "grad_norm": 0.12300235778093338, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 115730 + }, + { + "epoch": 0.4405350060519324, + "grad_norm": 0.12906724214553833, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 115740 + }, + { + "epoch": 0.4405730685200551, + "grad_norm": 0.14534308016300201, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 115750 + }, + { + "epoch": 0.4406111309881778, + "grad_norm": 0.1248600035905838, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 115760 + }, + { + "epoch": 0.4406491934563005, + "grad_norm": 0.12258388847112656, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 115770 + }, + { + "epoch": 0.44068725592442315, + "grad_norm": 0.13123729825019836, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 115780 + }, + { + "epoch": 0.44072531839254586, + "grad_norm": 0.12473517656326294, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 115790 + }, + { + "epoch": 0.4407633808606685, + "grad_norm": 0.13003067672252655, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 115800 + }, + { + "epoch": 0.44080144332879123, + "grad_norm": 0.11639449745416641, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 115810 + }, + { + "epoch": 0.4408395057969139, + "grad_norm": 0.1302078813314438, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 115820 + }, + { + "epoch": 0.4408775682650366, + "grad_norm": 0.1332431435585022, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 115830 + }, + { + "epoch": 0.44091563073315926, + "grad_norm": 0.12588725984096527, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 115840 + }, + { + "epoch": 0.44095369320128197, + "grad_norm": 0.11563403904438019, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 115850 + }, + { + "epoch": 0.44099175566940463, + "grad_norm": 0.13421843945980072, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 115860 + }, + { + "epoch": 0.4410298181375273, + "grad_norm": 0.1310349702835083, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 115870 + }, + { + "epoch": 0.44106788060565, + "grad_norm": 0.11716281622648239, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 115880 + }, + { + "epoch": 0.44110594307377265, + "grad_norm": 0.12190917134284973, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 115890 + }, + { + "epoch": 0.44114400554189537, + "grad_norm": 0.1441725641489029, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 115900 + }, + { + "epoch": 0.441182068010018, + "grad_norm": 0.1274310201406479, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 115910 + }, + { + "epoch": 0.44122013047814074, + "grad_norm": 0.1373417228460312, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 115920 + }, + { + "epoch": 0.4412581929462634, + "grad_norm": 0.1468418836593628, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 115930 + }, + { + "epoch": 0.4412962554143861, + "grad_norm": 0.12073398381471634, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 115940 + }, + { + "epoch": 0.44133431788250876, + "grad_norm": 0.12780290842056274, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 115950 + }, + { + "epoch": 0.4413723803506315, + "grad_norm": 0.13828471302986145, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 115960 + }, + { + "epoch": 0.44141044281875413, + "grad_norm": 0.11829525977373123, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 115970 + }, + { + "epoch": 0.44144850528687685, + "grad_norm": 0.12415596842765808, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 115980 + }, + { + "epoch": 0.4414865677549995, + "grad_norm": 0.12892639636993408, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 115990 + }, + { + "epoch": 0.44152463022312216, + "grad_norm": 0.11956531554460526, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 116000 + }, + { + "epoch": 0.4415626926912449, + "grad_norm": 0.1185465082526207, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 116010 + }, + { + "epoch": 0.44160075515936753, + "grad_norm": 0.12413859367370605, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 116020 + }, + { + "epoch": 0.44163881762749024, + "grad_norm": 0.1255383938550949, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 116030 + }, + { + "epoch": 0.4416768800956129, + "grad_norm": 0.14520691335201263, + "learning_rate": 0.0005, + "loss": 2.1408, + "step": 116040 + }, + { + "epoch": 0.4417149425637356, + "grad_norm": 0.1431220918893814, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 116050 + }, + { + "epoch": 0.44175300503185827, + "grad_norm": 0.11558663100004196, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 116060 + }, + { + "epoch": 0.441791067499981, + "grad_norm": 0.14282682538032532, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 116070 + }, + { + "epoch": 0.44182912996810364, + "grad_norm": 0.12574829161167145, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 116080 + }, + { + "epoch": 0.44186719243622635, + "grad_norm": 0.12052618712186813, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 116090 + }, + { + "epoch": 0.441905254904349, + "grad_norm": 0.11520765721797943, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 116100 + }, + { + "epoch": 0.4419433173724717, + "grad_norm": 0.14078138768672943, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 116110 + }, + { + "epoch": 0.4419813798405944, + "grad_norm": 0.11563374847173691, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 116120 + }, + { + "epoch": 0.4420194423087171, + "grad_norm": 0.11840452253818512, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 116130 + }, + { + "epoch": 0.44205750477683975, + "grad_norm": 0.12553977966308594, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 116140 + }, + { + "epoch": 0.4420955672449624, + "grad_norm": 0.12525105476379395, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 116150 + }, + { + "epoch": 0.4421336297130851, + "grad_norm": 0.11949176341295242, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 116160 + }, + { + "epoch": 0.4421716921812078, + "grad_norm": 0.12283053249120712, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 116170 + }, + { + "epoch": 0.4422097546493305, + "grad_norm": 0.11786118894815445, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 116180 + }, + { + "epoch": 0.44224781711745315, + "grad_norm": 0.12331389635801315, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 116190 + }, + { + "epoch": 0.44228587958557586, + "grad_norm": 0.1577860713005066, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 116200 + }, + { + "epoch": 0.4423239420536985, + "grad_norm": 0.12965896725654602, + "learning_rate": 0.0005, + "loss": 2.1453, + "step": 116210 + }, + { + "epoch": 0.44236200452182123, + "grad_norm": 0.11895183473825455, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 116220 + }, + { + "epoch": 0.4424000669899439, + "grad_norm": 0.11798092722892761, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 116230 + }, + { + "epoch": 0.4424381294580666, + "grad_norm": 0.11680356413125992, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 116240 + }, + { + "epoch": 0.44247619192618926, + "grad_norm": 0.11378675699234009, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 116250 + }, + { + "epoch": 0.44251425439431197, + "grad_norm": 0.14306718111038208, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 116260 + }, + { + "epoch": 0.4425523168624346, + "grad_norm": 0.11955001205205917, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 116270 + }, + { + "epoch": 0.44259037933055734, + "grad_norm": 0.12565234303474426, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 116280 + }, + { + "epoch": 0.44262844179868, + "grad_norm": 0.11439774930477142, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 116290 + }, + { + "epoch": 0.44266650426680265, + "grad_norm": 0.11682023108005524, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 116300 + }, + { + "epoch": 0.44270456673492536, + "grad_norm": 0.1343322992324829, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 116310 + }, + { + "epoch": 0.442742629203048, + "grad_norm": 0.11617587506771088, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 116320 + }, + { + "epoch": 0.44278069167117073, + "grad_norm": 0.11658541113138199, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 116330 + }, + { + "epoch": 0.4428187541392934, + "grad_norm": 0.12785518169403076, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 116340 + }, + { + "epoch": 0.4428568166074161, + "grad_norm": 0.11588647216558456, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 116350 + }, + { + "epoch": 0.44289487907553876, + "grad_norm": 0.1187388226389885, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 116360 + }, + { + "epoch": 0.4429329415436615, + "grad_norm": 0.12749643623828888, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 116370 + }, + { + "epoch": 0.44297100401178413, + "grad_norm": 0.11822959780693054, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 116380 + }, + { + "epoch": 0.44300906647990684, + "grad_norm": 0.13491389155387878, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 116390 + }, + { + "epoch": 0.4430471289480295, + "grad_norm": 0.11838964372873306, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 116400 + }, + { + "epoch": 0.4430851914161522, + "grad_norm": 0.1275443136692047, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 116410 + }, + { + "epoch": 0.44312325388427487, + "grad_norm": 0.12287093698978424, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 116420 + }, + { + "epoch": 0.4431613163523975, + "grad_norm": 0.12383338809013367, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 116430 + }, + { + "epoch": 0.44319937882052024, + "grad_norm": 0.18328292667865753, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 116440 + }, + { + "epoch": 0.4432374412886429, + "grad_norm": 0.12247510254383087, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 116450 + }, + { + "epoch": 0.4432755037567656, + "grad_norm": 0.12218674272298813, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 116460 + }, + { + "epoch": 0.44331356622488827, + "grad_norm": 0.11914005130529404, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 116470 + }, + { + "epoch": 0.443351628693011, + "grad_norm": 0.11963188648223877, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 116480 + }, + { + "epoch": 0.44338969116113364, + "grad_norm": 0.11633283644914627, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 116490 + }, + { + "epoch": 0.44342775362925635, + "grad_norm": 0.12306850403547287, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 116500 + }, + { + "epoch": 0.443465816097379, + "grad_norm": 0.1175755187869072, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 116510 + }, + { + "epoch": 0.4435038785655017, + "grad_norm": 0.13746808469295502, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 116520 + }, + { + "epoch": 0.4435419410336244, + "grad_norm": 0.12363948673009872, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 116530 + }, + { + "epoch": 0.4435800035017471, + "grad_norm": 0.12091385573148727, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 116540 + }, + { + "epoch": 0.44361806596986975, + "grad_norm": 0.12593325972557068, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 116550 + }, + { + "epoch": 0.44365612843799246, + "grad_norm": 0.14146867394447327, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 116560 + }, + { + "epoch": 0.4436941909061151, + "grad_norm": 0.11309035122394562, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 116570 + }, + { + "epoch": 0.4437322533742378, + "grad_norm": 0.11507805436849594, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 116580 + }, + { + "epoch": 0.4437703158423605, + "grad_norm": 0.12811732292175293, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 116590 + }, + { + "epoch": 0.44380837831048314, + "grad_norm": 0.13018152117729187, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 116600 + }, + { + "epoch": 0.44384644077860586, + "grad_norm": 0.12831726670265198, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 116610 + }, + { + "epoch": 0.4438845032467285, + "grad_norm": 0.13873030245304108, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 116620 + }, + { + "epoch": 0.4439225657148512, + "grad_norm": 0.11311579495668411, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 116630 + }, + { + "epoch": 0.4439606281829739, + "grad_norm": 0.12090139091014862, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 116640 + }, + { + "epoch": 0.4439986906510966, + "grad_norm": 0.1276332587003708, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 116650 + }, + { + "epoch": 0.44403675311921925, + "grad_norm": 0.1259016990661621, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 116660 + }, + { + "epoch": 0.44407481558734196, + "grad_norm": 0.130734384059906, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 116670 + }, + { + "epoch": 0.4441128780554646, + "grad_norm": 0.1242118775844574, + "learning_rate": 0.0005, + "loss": 2.1321, + "step": 116680 + }, + { + "epoch": 0.44415094052358733, + "grad_norm": 0.12554258108139038, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 116690 + }, + { + "epoch": 0.44418900299171, + "grad_norm": 0.12029888480901718, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 116700 + }, + { + "epoch": 0.4442270654598327, + "grad_norm": 0.12346048653125763, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 116710 + }, + { + "epoch": 0.44426512792795536, + "grad_norm": 0.13442575931549072, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 116720 + }, + { + "epoch": 0.444303190396078, + "grad_norm": 0.15281645953655243, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 116730 + }, + { + "epoch": 0.44434125286420073, + "grad_norm": 0.13485336303710938, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 116740 + }, + { + "epoch": 0.4443793153323234, + "grad_norm": 0.12896035611629486, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 116750 + }, + { + "epoch": 0.4444173778004461, + "grad_norm": 0.6141171455383301, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 116760 + }, + { + "epoch": 0.44445544026856876, + "grad_norm": 0.27819642424583435, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 116770 + }, + { + "epoch": 0.44449350273669147, + "grad_norm": 0.12349634617567062, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 116780 + }, + { + "epoch": 0.4445315652048141, + "grad_norm": 0.1380884200334549, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 116790 + }, + { + "epoch": 0.44456962767293684, + "grad_norm": 0.1299584060907364, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 116800 + }, + { + "epoch": 0.4446076901410595, + "grad_norm": 0.14807891845703125, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 116810 + }, + { + "epoch": 0.4446457526091822, + "grad_norm": 0.12537524104118347, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 116820 + }, + { + "epoch": 0.44468381507730487, + "grad_norm": 0.12808291614055634, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 116830 + }, + { + "epoch": 0.4447218775454276, + "grad_norm": 0.1370694786310196, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 116840 + }, + { + "epoch": 0.44475994001355024, + "grad_norm": 0.12036896497011185, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 116850 + }, + { + "epoch": 0.4447980024816729, + "grad_norm": 0.13645969331264496, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 116860 + }, + { + "epoch": 0.4448360649497956, + "grad_norm": 0.13020938634872437, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 116870 + }, + { + "epoch": 0.44487412741791826, + "grad_norm": 0.1492963284254074, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 116880 + }, + { + "epoch": 0.444912189886041, + "grad_norm": 0.11927158385515213, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 116890 + }, + { + "epoch": 0.44495025235416363, + "grad_norm": 0.1152534931898117, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 116900 + }, + { + "epoch": 0.44498831482228635, + "grad_norm": 0.11636972427368164, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 116910 + }, + { + "epoch": 0.445026377290409, + "grad_norm": 0.1195891723036766, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 116920 + }, + { + "epoch": 0.4450644397585317, + "grad_norm": 0.11876848340034485, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 116930 + }, + { + "epoch": 0.4451025022266544, + "grad_norm": 0.11512313038110733, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 116940 + }, + { + "epoch": 0.4451405646947771, + "grad_norm": 0.1324792355298996, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 116950 + }, + { + "epoch": 0.44517862716289974, + "grad_norm": 0.13453422486782074, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 116960 + }, + { + "epoch": 0.44521668963102246, + "grad_norm": 0.12057196348905563, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 116970 + }, + { + "epoch": 0.4452547520991451, + "grad_norm": 0.11862435191869736, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 116980 + }, + { + "epoch": 0.4452928145672678, + "grad_norm": 0.1300220936536789, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 116990 + }, + { + "epoch": 0.4453308770353905, + "grad_norm": 0.13250984251499176, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 117000 + }, + { + "epoch": 0.44536893950351314, + "grad_norm": 0.1232600212097168, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 117010 + }, + { + "epoch": 0.44540700197163585, + "grad_norm": 0.12628987431526184, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 117020 + }, + { + "epoch": 0.4454450644397585, + "grad_norm": 0.13797855377197266, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 117030 + }, + { + "epoch": 0.4454831269078812, + "grad_norm": 0.12938253581523895, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 117040 + }, + { + "epoch": 0.4455211893760039, + "grad_norm": 0.1354011744260788, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 117050 + }, + { + "epoch": 0.4455592518441266, + "grad_norm": 0.12878401577472687, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 117060 + }, + { + "epoch": 0.44559731431224925, + "grad_norm": 0.13308361172676086, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 117070 + }, + { + "epoch": 0.44563537678037196, + "grad_norm": 0.1289629489183426, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 117080 + }, + { + "epoch": 0.4456734392484946, + "grad_norm": 0.1313554346561432, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 117090 + }, + { + "epoch": 0.44571150171661733, + "grad_norm": 0.14513030648231506, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 117100 + }, + { + "epoch": 0.44574956418474, + "grad_norm": 0.11918684840202332, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 117110 + }, + { + "epoch": 0.4457876266528627, + "grad_norm": 0.12679529190063477, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 117120 + }, + { + "epoch": 0.44582568912098536, + "grad_norm": 0.11493398994207382, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 117130 + }, + { + "epoch": 0.44586375158910807, + "grad_norm": 0.11769044399261475, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 117140 + }, + { + "epoch": 0.4459018140572307, + "grad_norm": 0.1203865334391594, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 117150 + }, + { + "epoch": 0.4459398765253534, + "grad_norm": 0.13108976185321808, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 117160 + }, + { + "epoch": 0.4459779389934761, + "grad_norm": 0.12912702560424805, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 117170 + }, + { + "epoch": 0.44601600146159875, + "grad_norm": 0.12747284770011902, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 117180 + }, + { + "epoch": 0.44605406392972147, + "grad_norm": 0.11677516996860504, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 117190 + }, + { + "epoch": 0.4460921263978441, + "grad_norm": 0.12442485243082047, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 117200 + }, + { + "epoch": 0.44613018886596684, + "grad_norm": 0.1313370019197464, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 117210 + }, + { + "epoch": 0.4461682513340895, + "grad_norm": 0.12267378717660904, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 117220 + }, + { + "epoch": 0.4462063138022122, + "grad_norm": 0.13062329590320587, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 117230 + }, + { + "epoch": 0.44624437627033486, + "grad_norm": 0.13419316709041595, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 117240 + }, + { + "epoch": 0.4462824387384576, + "grad_norm": 0.12934669852256775, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 117250 + }, + { + "epoch": 0.44632050120658023, + "grad_norm": 0.11728876829147339, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 117260 + }, + { + "epoch": 0.44635856367470295, + "grad_norm": 0.1360110491514206, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 117270 + }, + { + "epoch": 0.4463966261428256, + "grad_norm": 0.1395176500082016, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 117280 + }, + { + "epoch": 0.4464346886109483, + "grad_norm": 0.143223375082016, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 117290 + }, + { + "epoch": 0.446472751079071, + "grad_norm": 0.13000799715518951, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 117300 + }, + { + "epoch": 0.44651081354719363, + "grad_norm": 0.12335141748189926, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 117310 + }, + { + "epoch": 0.44654887601531634, + "grad_norm": 0.11523394286632538, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 117320 + }, + { + "epoch": 0.446586938483439, + "grad_norm": 0.12152241170406342, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 117330 + }, + { + "epoch": 0.4466250009515617, + "grad_norm": 0.12862616777420044, + "learning_rate": 0.0005, + "loss": 2.1394, + "step": 117340 + }, + { + "epoch": 0.44666306341968437, + "grad_norm": 0.12332156300544739, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 117350 + }, + { + "epoch": 0.4467011258878071, + "grad_norm": 0.14175184071063995, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 117360 + }, + { + "epoch": 0.44673918835592974, + "grad_norm": 0.12240368872880936, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 117370 + }, + { + "epoch": 0.44677725082405245, + "grad_norm": 0.13649588823318481, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 117380 + }, + { + "epoch": 0.4468153132921751, + "grad_norm": 0.12765458226203918, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 117390 + }, + { + "epoch": 0.4468533757602978, + "grad_norm": 0.12199946492910385, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 117400 + }, + { + "epoch": 0.4468914382284205, + "grad_norm": 0.11614719778299332, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 117410 + }, + { + "epoch": 0.4469295006965432, + "grad_norm": 0.11507588624954224, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 117420 + }, + { + "epoch": 0.44696756316466585, + "grad_norm": 0.13302651047706604, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 117430 + }, + { + "epoch": 0.4470056256327885, + "grad_norm": 0.1279689520597458, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 117440 + }, + { + "epoch": 0.4470436881009112, + "grad_norm": 0.13449238240718842, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 117450 + }, + { + "epoch": 0.4470817505690339, + "grad_norm": 0.13003098964691162, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 117460 + }, + { + "epoch": 0.4471198130371566, + "grad_norm": 0.1376037746667862, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 117470 + }, + { + "epoch": 0.44715787550527925, + "grad_norm": 0.12537793815135956, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 117480 + }, + { + "epoch": 0.44719593797340196, + "grad_norm": 0.1256958544254303, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 117490 + }, + { + "epoch": 0.4472340004415246, + "grad_norm": 0.1190306544303894, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 117500 + }, + { + "epoch": 0.4472720629096473, + "grad_norm": 0.14315101504325867, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 117510 + }, + { + "epoch": 0.44731012537777, + "grad_norm": 0.12836386263370514, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 117520 + }, + { + "epoch": 0.4473481878458927, + "grad_norm": 0.11426154524087906, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 117530 + }, + { + "epoch": 0.44738625031401535, + "grad_norm": 0.29652196168899536, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 117540 + }, + { + "epoch": 0.44742431278213807, + "grad_norm": 0.13330289721488953, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 117550 + }, + { + "epoch": 0.4474623752502607, + "grad_norm": 0.11894863098859787, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 117560 + }, + { + "epoch": 0.44750043771838344, + "grad_norm": 0.11975117772817612, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 117570 + }, + { + "epoch": 0.4475385001865061, + "grad_norm": 0.12065289169549942, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 117580 + }, + { + "epoch": 0.44757656265462875, + "grad_norm": 0.1413719654083252, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 117590 + }, + { + "epoch": 0.44761462512275146, + "grad_norm": 0.12168973684310913, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 117600 + }, + { + "epoch": 0.4476526875908741, + "grad_norm": 0.1406327337026596, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 117610 + }, + { + "epoch": 0.44769075005899683, + "grad_norm": 0.1254739761352539, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 117620 + }, + { + "epoch": 0.4477288125271195, + "grad_norm": 0.18341228365898132, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 117630 + }, + { + "epoch": 0.4477668749952422, + "grad_norm": 0.36629945039749146, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 117640 + }, + { + "epoch": 0.44780493746336486, + "grad_norm": 0.1329488456249237, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 117650 + }, + { + "epoch": 0.4478429999314876, + "grad_norm": 0.11533725261688232, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 117660 + }, + { + "epoch": 0.44788106239961023, + "grad_norm": 0.12699326872825623, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 117670 + }, + { + "epoch": 0.44791912486773294, + "grad_norm": 0.1253686547279358, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 117680 + }, + { + "epoch": 0.4479571873358556, + "grad_norm": 0.12520533800125122, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 117690 + }, + { + "epoch": 0.4479952498039783, + "grad_norm": 0.12939919531345367, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 117700 + }, + { + "epoch": 0.44803331227210097, + "grad_norm": 0.12653513252735138, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 117710 + }, + { + "epoch": 0.4480713747402237, + "grad_norm": 0.12521785497665405, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 117720 + }, + { + "epoch": 0.44810943720834634, + "grad_norm": 0.1243586465716362, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 117730 + }, + { + "epoch": 0.448147499676469, + "grad_norm": 0.12475720793008804, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 117740 + }, + { + "epoch": 0.4481855621445917, + "grad_norm": 0.12455473840236664, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 117750 + }, + { + "epoch": 0.44822362461271437, + "grad_norm": 0.13083574175834656, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 117760 + }, + { + "epoch": 0.4482616870808371, + "grad_norm": 0.13085252046585083, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 117770 + }, + { + "epoch": 0.44829974954895974, + "grad_norm": 0.12105019390583038, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 117780 + }, + { + "epoch": 0.44833781201708245, + "grad_norm": 0.1281406283378601, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 117790 + }, + { + "epoch": 0.4483758744852051, + "grad_norm": 0.14238356053829193, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 117800 + }, + { + "epoch": 0.4484139369533278, + "grad_norm": 0.13639430701732635, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 117810 + }, + { + "epoch": 0.4484519994214505, + "grad_norm": 0.11508873105049133, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 117820 + }, + { + "epoch": 0.4484900618895732, + "grad_norm": 0.13065040111541748, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 117830 + }, + { + "epoch": 0.44852812435769585, + "grad_norm": 0.1213822215795517, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 117840 + }, + { + "epoch": 0.44856618682581856, + "grad_norm": 0.11827198415994644, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 117850 + }, + { + "epoch": 0.4486042492939412, + "grad_norm": 0.13303664326667786, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 117860 + }, + { + "epoch": 0.4486423117620639, + "grad_norm": 0.12218086421489716, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 117870 + }, + { + "epoch": 0.4486803742301866, + "grad_norm": 0.13076268136501312, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 117880 + }, + { + "epoch": 0.44871843669830924, + "grad_norm": 0.14012432098388672, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 117890 + }, + { + "epoch": 0.44875649916643195, + "grad_norm": 0.12652704119682312, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 117900 + }, + { + "epoch": 0.4487945616345546, + "grad_norm": 0.12594130635261536, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 117910 + }, + { + "epoch": 0.4488326241026773, + "grad_norm": 0.12226948142051697, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 117920 + }, + { + "epoch": 0.4488706865708, + "grad_norm": 0.1363731175661087, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 117930 + }, + { + "epoch": 0.4489087490389227, + "grad_norm": 0.12058139592409134, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 117940 + }, + { + "epoch": 0.44894681150704535, + "grad_norm": 0.12362996488809586, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 117950 + }, + { + "epoch": 0.44898487397516806, + "grad_norm": 0.13131392002105713, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 117960 + }, + { + "epoch": 0.4490229364432907, + "grad_norm": 0.14916491508483887, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 117970 + }, + { + "epoch": 0.44906099891141343, + "grad_norm": 0.14153233170509338, + "learning_rate": 0.0005, + "loss": 2.1338, + "step": 117980 + }, + { + "epoch": 0.4490990613795361, + "grad_norm": 0.12029541283845901, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 117990 + }, + { + "epoch": 0.4491371238476588, + "grad_norm": 0.12881696224212646, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 118000 + }, + { + "epoch": 0.44917518631578146, + "grad_norm": 0.11942708492279053, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 118010 + }, + { + "epoch": 0.4492132487839041, + "grad_norm": 0.12929148972034454, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 118020 + }, + { + "epoch": 0.44925131125202683, + "grad_norm": 0.1250161975622177, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 118030 + }, + { + "epoch": 0.4492893737201495, + "grad_norm": 0.12096728384494781, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 118040 + }, + { + "epoch": 0.4493274361882722, + "grad_norm": 0.12000516802072525, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 118050 + }, + { + "epoch": 0.44936549865639486, + "grad_norm": 0.13474564254283905, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 118060 + }, + { + "epoch": 0.44940356112451757, + "grad_norm": 0.12466412782669067, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 118070 + }, + { + "epoch": 0.4494416235926402, + "grad_norm": 0.12615053355693817, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 118080 + }, + { + "epoch": 0.44947968606076294, + "grad_norm": 0.1310206949710846, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 118090 + }, + { + "epoch": 0.4495177485288856, + "grad_norm": 0.1346968561410904, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 118100 + }, + { + "epoch": 0.4495558109970083, + "grad_norm": 0.1367679089307785, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 118110 + }, + { + "epoch": 0.44959387346513097, + "grad_norm": 0.13630978763103485, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 118120 + }, + { + "epoch": 0.4496319359332537, + "grad_norm": 0.12824587523937225, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 118130 + }, + { + "epoch": 0.44966999840137634, + "grad_norm": 0.13851365447044373, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 118140 + }, + { + "epoch": 0.44970806086949905, + "grad_norm": 0.12221483141183853, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 118150 + }, + { + "epoch": 0.4497461233376217, + "grad_norm": 0.12099538743495941, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 118160 + }, + { + "epoch": 0.44978418580574436, + "grad_norm": 0.133246511220932, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 118170 + }, + { + "epoch": 0.4498222482738671, + "grad_norm": 0.1311062127351761, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 118180 + }, + { + "epoch": 0.44986031074198973, + "grad_norm": 0.12322074174880981, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 118190 + }, + { + "epoch": 0.44989837321011245, + "grad_norm": 0.13975688815116882, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 118200 + }, + { + "epoch": 0.4499364356782351, + "grad_norm": 0.11596209555864334, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 118210 + }, + { + "epoch": 0.4499744981463578, + "grad_norm": 0.12691253423690796, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 118220 + }, + { + "epoch": 0.4500125606144805, + "grad_norm": 0.12624835968017578, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 118230 + }, + { + "epoch": 0.4500506230826032, + "grad_norm": 0.1271945983171463, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 118240 + }, + { + "epoch": 0.45008868555072584, + "grad_norm": 0.13511629402637482, + "learning_rate": 0.0005, + "loss": 2.1372, + "step": 118250 + }, + { + "epoch": 0.45012674801884855, + "grad_norm": 0.1170211210846901, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 118260 + }, + { + "epoch": 0.4501648104869712, + "grad_norm": 0.12537570297718048, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 118270 + }, + { + "epoch": 0.4502028729550939, + "grad_norm": 0.12275852262973785, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 118280 + }, + { + "epoch": 0.4502409354232166, + "grad_norm": 0.12469831109046936, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 118290 + }, + { + "epoch": 0.45027899789133924, + "grad_norm": 0.11219992488622665, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 118300 + }, + { + "epoch": 0.45031706035946195, + "grad_norm": 0.12267205864191055, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 118310 + }, + { + "epoch": 0.4503551228275846, + "grad_norm": 0.1314876824617386, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 118320 + }, + { + "epoch": 0.4503931852957073, + "grad_norm": 0.33807751536369324, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 118330 + }, + { + "epoch": 0.45043124776383, + "grad_norm": 0.1275404542684555, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 118340 + }, + { + "epoch": 0.4504693102319527, + "grad_norm": 0.1214970275759697, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 118350 + }, + { + "epoch": 0.45050737270007535, + "grad_norm": 0.1309918314218521, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 118360 + }, + { + "epoch": 0.45054543516819806, + "grad_norm": 0.1250380277633667, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 118370 + }, + { + "epoch": 0.4505834976363207, + "grad_norm": 0.1208469420671463, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 118380 + }, + { + "epoch": 0.45062156010444343, + "grad_norm": 0.11546842753887177, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 118390 + }, + { + "epoch": 0.4506596225725661, + "grad_norm": 0.12166312336921692, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 118400 + }, + { + "epoch": 0.4506976850406888, + "grad_norm": 0.12632746994495392, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 118410 + }, + { + "epoch": 0.45073574750881146, + "grad_norm": 0.12581978738307953, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 118420 + }, + { + "epoch": 0.45077380997693417, + "grad_norm": 0.13596107065677643, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 118430 + }, + { + "epoch": 0.4508118724450568, + "grad_norm": 0.12639673054218292, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 118440 + }, + { + "epoch": 0.4508499349131795, + "grad_norm": 0.13983288407325745, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 118450 + }, + { + "epoch": 0.4508879973813022, + "grad_norm": 0.11982744187116623, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 118460 + }, + { + "epoch": 0.45092605984942485, + "grad_norm": 0.1236228197813034, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 118470 + }, + { + "epoch": 0.45096412231754757, + "grad_norm": 0.130223348736763, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 118480 + }, + { + "epoch": 0.4510021847856702, + "grad_norm": 0.12643398344516754, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 118490 + }, + { + "epoch": 0.45104024725379294, + "grad_norm": 0.12892575562000275, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 118500 + }, + { + "epoch": 0.4510783097219156, + "grad_norm": 0.12612701952457428, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 118510 + }, + { + "epoch": 0.4511163721900383, + "grad_norm": 0.16297845542430878, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 118520 + }, + { + "epoch": 0.45115443465816096, + "grad_norm": 0.12743675708770752, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 118530 + }, + { + "epoch": 0.4511924971262837, + "grad_norm": 0.12563501298427582, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 118540 + }, + { + "epoch": 0.45123055959440633, + "grad_norm": 0.1309593915939331, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 118550 + }, + { + "epoch": 0.45126862206252905, + "grad_norm": 0.1179802417755127, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 118560 + }, + { + "epoch": 0.4513066845306517, + "grad_norm": 0.11997191607952118, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 118570 + }, + { + "epoch": 0.4513447469987744, + "grad_norm": 0.1290757805109024, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 118580 + }, + { + "epoch": 0.4513828094668971, + "grad_norm": 0.12476909905672073, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 118590 + }, + { + "epoch": 0.45142087193501973, + "grad_norm": 0.1275877058506012, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 118600 + }, + { + "epoch": 0.45145893440314244, + "grad_norm": 0.1198502779006958, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 118610 + }, + { + "epoch": 0.4514969968712651, + "grad_norm": 0.13268321752548218, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 118620 + }, + { + "epoch": 0.4515350593393878, + "grad_norm": 0.12470264732837677, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 118630 + }, + { + "epoch": 0.45157312180751047, + "grad_norm": 0.12971359491348267, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 118640 + }, + { + "epoch": 0.4516111842756332, + "grad_norm": 0.12227847427129745, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 118650 + }, + { + "epoch": 0.45164924674375584, + "grad_norm": 0.12495315819978714, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 118660 + }, + { + "epoch": 0.45168730921187855, + "grad_norm": 0.12900105118751526, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 118670 + }, + { + "epoch": 0.4517253716800012, + "grad_norm": 0.13668887317180634, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 118680 + }, + { + "epoch": 0.4517634341481239, + "grad_norm": 0.13190940022468567, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 118690 + }, + { + "epoch": 0.4518014966162466, + "grad_norm": 0.1314956396818161, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 118700 + }, + { + "epoch": 0.4518395590843693, + "grad_norm": 0.12434843927621841, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 118710 + }, + { + "epoch": 0.45187762155249195, + "grad_norm": 0.12370478361845016, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 118720 + }, + { + "epoch": 0.4519156840206146, + "grad_norm": 0.11864922940731049, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 118730 + }, + { + "epoch": 0.4519537464887373, + "grad_norm": 0.14261092245578766, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 118740 + }, + { + "epoch": 0.45199180895686, + "grad_norm": 0.1361154317855835, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 118750 + }, + { + "epoch": 0.4520298714249827, + "grad_norm": 0.1354462206363678, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 118760 + }, + { + "epoch": 0.45206793389310534, + "grad_norm": 0.12536115944385529, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 118770 + }, + { + "epoch": 0.45210599636122806, + "grad_norm": 0.14076842367649078, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 118780 + }, + { + "epoch": 0.4521440588293507, + "grad_norm": 0.12235935777425766, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 118790 + }, + { + "epoch": 0.4521821212974734, + "grad_norm": 0.13245408236980438, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 118800 + }, + { + "epoch": 0.4522201837655961, + "grad_norm": 0.11906524002552032, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 118810 + }, + { + "epoch": 0.4522582462337188, + "grad_norm": 0.1383557766675949, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 118820 + }, + { + "epoch": 0.45229630870184145, + "grad_norm": 0.1393071711063385, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 118830 + }, + { + "epoch": 0.45233437116996417, + "grad_norm": 0.12187449634075165, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 118840 + }, + { + "epoch": 0.4523724336380868, + "grad_norm": 0.11439521610736847, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 118850 + }, + { + "epoch": 0.45241049610620954, + "grad_norm": 0.12313267588615417, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 118860 + }, + { + "epoch": 0.4524485585743322, + "grad_norm": 0.1316363662481308, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 118870 + }, + { + "epoch": 0.45248662104245485, + "grad_norm": 0.13418705761432648, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 118880 + }, + { + "epoch": 0.45252468351057756, + "grad_norm": 0.12311230599880219, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 118890 + }, + { + "epoch": 0.4525627459787002, + "grad_norm": 0.12272609025239944, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 118900 + }, + { + "epoch": 0.45260080844682293, + "grad_norm": 0.19879673421382904, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 118910 + }, + { + "epoch": 0.4526388709149456, + "grad_norm": 0.11573578417301178, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 118920 + }, + { + "epoch": 0.4526769333830683, + "grad_norm": 0.12728090584278107, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 118930 + }, + { + "epoch": 0.45271499585119096, + "grad_norm": 0.1399592161178589, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 118940 + }, + { + "epoch": 0.4527530583193137, + "grad_norm": 0.1357944756746292, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 118950 + }, + { + "epoch": 0.45279112078743633, + "grad_norm": 0.13509675860404968, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 118960 + }, + { + "epoch": 0.45282918325555904, + "grad_norm": 0.13070659339427948, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 118970 + }, + { + "epoch": 0.4528672457236817, + "grad_norm": 0.13457068800926208, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 118980 + }, + { + "epoch": 0.4529053081918044, + "grad_norm": 0.13255858421325684, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 118990 + }, + { + "epoch": 0.45294337065992707, + "grad_norm": 0.1382453590631485, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 119000 + }, + { + "epoch": 0.4529814331280498, + "grad_norm": 0.12324276566505432, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 119010 + }, + { + "epoch": 0.45301949559617244, + "grad_norm": 0.12160609662532806, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 119020 + }, + { + "epoch": 0.4530575580642951, + "grad_norm": 0.13407763838768005, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 119030 + }, + { + "epoch": 0.4530956205324178, + "grad_norm": 0.1184903234243393, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 119040 + }, + { + "epoch": 0.45313368300054047, + "grad_norm": 0.13084222376346588, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 119050 + }, + { + "epoch": 0.4531717454686632, + "grad_norm": 0.13515977561473846, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 119060 + }, + { + "epoch": 0.45320980793678584, + "grad_norm": 0.12019907683134079, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 119070 + }, + { + "epoch": 0.45324787040490855, + "grad_norm": 0.13739806413650513, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 119080 + }, + { + "epoch": 0.4532859328730312, + "grad_norm": 0.12706531584262848, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 119090 + }, + { + "epoch": 0.4533239953411539, + "grad_norm": 0.13009122014045715, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 119100 + }, + { + "epoch": 0.4533620578092766, + "grad_norm": 0.11646008491516113, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 119110 + }, + { + "epoch": 0.4534001202773993, + "grad_norm": 0.1327752023935318, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 119120 + }, + { + "epoch": 0.45343818274552194, + "grad_norm": 0.12436755001544952, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 119130 + }, + { + "epoch": 0.45347624521364466, + "grad_norm": 0.12094046920537949, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 119140 + }, + { + "epoch": 0.4535143076817673, + "grad_norm": 0.11031058430671692, + "learning_rate": 0.0005, + "loss": 2.1342, + "step": 119150 + }, + { + "epoch": 0.45355237014988997, + "grad_norm": 0.11060208082199097, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 119160 + }, + { + "epoch": 0.4535904326180127, + "grad_norm": 0.1298583447933197, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 119170 + }, + { + "epoch": 0.45362849508613534, + "grad_norm": 0.1342618614435196, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 119180 + }, + { + "epoch": 0.45366655755425805, + "grad_norm": 0.12159561365842819, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 119190 + }, + { + "epoch": 0.4537046200223807, + "grad_norm": 0.12042984366416931, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 119200 + }, + { + "epoch": 0.4537426824905034, + "grad_norm": 0.13506831228733063, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 119210 + }, + { + "epoch": 0.4537807449586261, + "grad_norm": 0.12235353887081146, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 119220 + }, + { + "epoch": 0.4538188074267488, + "grad_norm": 0.12120179831981659, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 119230 + }, + { + "epoch": 0.45385686989487145, + "grad_norm": 0.12782564759254456, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 119240 + }, + { + "epoch": 0.45389493236299416, + "grad_norm": 0.12691909074783325, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 119250 + }, + { + "epoch": 0.4539329948311168, + "grad_norm": 0.12976911664009094, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 119260 + }, + { + "epoch": 0.45397105729923953, + "grad_norm": 0.12581056356430054, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 119270 + }, + { + "epoch": 0.4540091197673622, + "grad_norm": 0.11505091190338135, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 119280 + }, + { + "epoch": 0.4540471822354849, + "grad_norm": 0.1203627660870552, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 119290 + }, + { + "epoch": 0.45408524470360756, + "grad_norm": 0.13732272386550903, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 119300 + }, + { + "epoch": 0.4541233071717302, + "grad_norm": 0.12399745732545853, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 119310 + }, + { + "epoch": 0.45416136963985293, + "grad_norm": 0.13304689526557922, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 119320 + }, + { + "epoch": 0.4541994321079756, + "grad_norm": 0.12508970499038696, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 119330 + }, + { + "epoch": 0.4542374945760983, + "grad_norm": 0.12920893728733063, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 119340 + }, + { + "epoch": 0.45427555704422096, + "grad_norm": 0.12676669657230377, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 119350 + }, + { + "epoch": 0.45431361951234367, + "grad_norm": 0.1385577917098999, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 119360 + }, + { + "epoch": 0.4543516819804663, + "grad_norm": 0.13354946672916412, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 119370 + }, + { + "epoch": 0.45438974444858904, + "grad_norm": 0.11637165397405624, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 119380 + }, + { + "epoch": 0.4544278069167117, + "grad_norm": 0.12723296880722046, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 119390 + }, + { + "epoch": 0.4544658693848344, + "grad_norm": 0.1271418035030365, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 119400 + }, + { + "epoch": 0.45450393185295707, + "grad_norm": 0.1238766461610794, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 119410 + }, + { + "epoch": 0.4545419943210798, + "grad_norm": 0.12604381144046783, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 119420 + }, + { + "epoch": 0.45458005678920244, + "grad_norm": 0.12242569029331207, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 119430 + }, + { + "epoch": 0.45461811925732515, + "grad_norm": 0.12519843876361847, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 119440 + }, + { + "epoch": 0.4546561817254478, + "grad_norm": 0.1285882592201233, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 119450 + }, + { + "epoch": 0.45469424419357046, + "grad_norm": 0.13689425587654114, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 119460 + }, + { + "epoch": 0.4547323066616932, + "grad_norm": 0.11747127771377563, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 119470 + }, + { + "epoch": 0.45477036912981583, + "grad_norm": 0.1399627923965454, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 119480 + }, + { + "epoch": 0.45480843159793855, + "grad_norm": 0.1311253160238266, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 119490 + }, + { + "epoch": 0.4548464940660612, + "grad_norm": 0.1293565183877945, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 119500 + }, + { + "epoch": 0.4548845565341839, + "grad_norm": 0.13139833509922028, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 119510 + }, + { + "epoch": 0.45492261900230657, + "grad_norm": 0.13386030495166779, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 119520 + }, + { + "epoch": 0.4549606814704293, + "grad_norm": 0.12311594933271408, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 119530 + }, + { + "epoch": 0.45499874393855194, + "grad_norm": 0.11787018179893494, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 119540 + }, + { + "epoch": 0.45503680640667465, + "grad_norm": 0.13005182147026062, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 119550 + }, + { + "epoch": 0.4550748688747973, + "grad_norm": 0.13512970507144928, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 119560 + }, + { + "epoch": 0.45511293134292, + "grad_norm": 0.1321474313735962, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 119570 + }, + { + "epoch": 0.4551509938110427, + "grad_norm": 0.12341282516717911, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 119580 + }, + { + "epoch": 0.4551890562791654, + "grad_norm": 0.1225198283791542, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 119590 + }, + { + "epoch": 0.45522711874728805, + "grad_norm": 0.12121542543172836, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 119600 + }, + { + "epoch": 0.4552651812154107, + "grad_norm": 0.12483129650354385, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 119610 + }, + { + "epoch": 0.4553032436835334, + "grad_norm": 0.11805702745914459, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 119620 + }, + { + "epoch": 0.4553413061516561, + "grad_norm": 0.12919245660305023, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 119630 + }, + { + "epoch": 0.4553793686197788, + "grad_norm": 0.14529746770858765, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 119640 + }, + { + "epoch": 0.45541743108790145, + "grad_norm": 0.13631920516490936, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 119650 + }, + { + "epoch": 0.45545549355602416, + "grad_norm": 0.1259685754776001, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 119660 + }, + { + "epoch": 0.4554935560241468, + "grad_norm": 0.12155263125896454, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 119670 + }, + { + "epoch": 0.45553161849226953, + "grad_norm": 0.1261385977268219, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 119680 + }, + { + "epoch": 0.4555696809603922, + "grad_norm": 0.11714118719100952, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 119690 + }, + { + "epoch": 0.4556077434285149, + "grad_norm": 0.12431380152702332, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 119700 + }, + { + "epoch": 0.45564580589663756, + "grad_norm": 0.13070593774318695, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 119710 + }, + { + "epoch": 0.45568386836476027, + "grad_norm": 0.12562495470046997, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 119720 + }, + { + "epoch": 0.4557219308328829, + "grad_norm": 0.11666475981473923, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 119730 + }, + { + "epoch": 0.4557599933010056, + "grad_norm": 0.1263495534658432, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 119740 + }, + { + "epoch": 0.4557980557691283, + "grad_norm": 0.12585152685642242, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 119750 + }, + { + "epoch": 0.45583611823725095, + "grad_norm": 0.14227187633514404, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 119760 + }, + { + "epoch": 0.45587418070537367, + "grad_norm": 0.22283364832401276, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 119770 + }, + { + "epoch": 0.4559122431734963, + "grad_norm": 0.12294365465641022, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 119780 + }, + { + "epoch": 0.45595030564161904, + "grad_norm": 0.11970822513103485, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 119790 + }, + { + "epoch": 0.4559883681097417, + "grad_norm": 0.12036450952291489, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 119800 + }, + { + "epoch": 0.4560264305778644, + "grad_norm": 0.1168990209698677, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 119810 + }, + { + "epoch": 0.45606449304598706, + "grad_norm": 0.12877270579338074, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 119820 + }, + { + "epoch": 0.4561025555141098, + "grad_norm": 0.11663218587636948, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 119830 + }, + { + "epoch": 0.45614061798223243, + "grad_norm": 0.12282679975032806, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 119840 + }, + { + "epoch": 0.45617868045035515, + "grad_norm": 0.12231219559907913, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 119850 + }, + { + "epoch": 0.4562167429184778, + "grad_norm": 0.11906345933675766, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 119860 + }, + { + "epoch": 0.4562548053866005, + "grad_norm": 0.1266046166419983, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 119870 + }, + { + "epoch": 0.45629286785472317, + "grad_norm": 0.1269948035478592, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 119880 + }, + { + "epoch": 0.45633093032284583, + "grad_norm": 0.12274617701768875, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 119890 + }, + { + "epoch": 0.45636899279096854, + "grad_norm": 0.13784544169902802, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 119900 + }, + { + "epoch": 0.4564070552590912, + "grad_norm": 0.1259078085422516, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 119910 + }, + { + "epoch": 0.4564451177272139, + "grad_norm": 0.14160633087158203, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 119920 + }, + { + "epoch": 0.45648318019533657, + "grad_norm": 0.12463352829217911, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 119930 + }, + { + "epoch": 0.4565212426634593, + "grad_norm": 0.1392524242401123, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 119940 + }, + { + "epoch": 0.45655930513158194, + "grad_norm": 0.12912337481975555, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 119950 + }, + { + "epoch": 0.45659736759970465, + "grad_norm": 0.15192027390003204, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 119960 + }, + { + "epoch": 0.4566354300678273, + "grad_norm": 0.13262653350830078, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 119970 + }, + { + "epoch": 0.45667349253595, + "grad_norm": 0.12109184265136719, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 119980 + }, + { + "epoch": 0.4567115550040727, + "grad_norm": 0.1148374006152153, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 119990 + }, + { + "epoch": 0.4567496174721954, + "grad_norm": 0.11489134281873703, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 120000 + }, + { + "epoch": 0.45678767994031805, + "grad_norm": 0.11090307682752609, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 120010 + }, + { + "epoch": 0.45682574240844076, + "grad_norm": 0.12963759899139404, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 120020 + }, + { + "epoch": 0.4568638048765634, + "grad_norm": 0.12337867170572281, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 120030 + }, + { + "epoch": 0.4569018673446861, + "grad_norm": 0.12097255140542984, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 120040 + }, + { + "epoch": 0.4569399298128088, + "grad_norm": 0.1328166127204895, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 120050 + }, + { + "epoch": 0.45697799228093144, + "grad_norm": 0.12114717811346054, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 120060 + }, + { + "epoch": 0.45701605474905416, + "grad_norm": 0.13208146393299103, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 120070 + }, + { + "epoch": 0.4570541172171768, + "grad_norm": 0.12861287593841553, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 120080 + }, + { + "epoch": 0.4570921796852995, + "grad_norm": 0.12194353342056274, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 120090 + }, + { + "epoch": 0.4571302421534222, + "grad_norm": 0.13183648884296417, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 120100 + }, + { + "epoch": 0.4571683046215449, + "grad_norm": 0.12721547484397888, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 120110 + }, + { + "epoch": 0.45720636708966755, + "grad_norm": 0.11995424330234528, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 120120 + }, + { + "epoch": 0.45724442955779027, + "grad_norm": 0.12486982345581055, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 120130 + }, + { + "epoch": 0.4572824920259129, + "grad_norm": 0.12625457346439362, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 120140 + }, + { + "epoch": 0.45732055449403564, + "grad_norm": 0.1323632448911667, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 120150 + }, + { + "epoch": 0.4573586169621583, + "grad_norm": 0.11913406103849411, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 120160 + }, + { + "epoch": 0.45739667943028095, + "grad_norm": 0.13413356244564056, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 120170 + }, + { + "epoch": 0.45743474189840366, + "grad_norm": 0.14971719682216644, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 120180 + }, + { + "epoch": 0.4574728043665263, + "grad_norm": 0.13183917105197906, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 120190 + }, + { + "epoch": 0.45751086683464903, + "grad_norm": 0.12312076985836029, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 120200 + }, + { + "epoch": 0.4575489293027717, + "grad_norm": 0.11903540045022964, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 120210 + }, + { + "epoch": 0.4575869917708944, + "grad_norm": 0.12743575870990753, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 120220 + }, + { + "epoch": 0.45762505423901706, + "grad_norm": 0.11738581955432892, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 120230 + }, + { + "epoch": 0.45766311670713977, + "grad_norm": 0.13653422892093658, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 120240 + }, + { + "epoch": 0.45770117917526243, + "grad_norm": 0.12781554460525513, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 120250 + }, + { + "epoch": 0.45773924164338514, + "grad_norm": 0.14034056663513184, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 120260 + }, + { + "epoch": 0.4577773041115078, + "grad_norm": 0.12680433690547943, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 120270 + }, + { + "epoch": 0.4578153665796305, + "grad_norm": 0.13826939463615417, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 120280 + }, + { + "epoch": 0.45785342904775317, + "grad_norm": 0.1185794547200203, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 120290 + }, + { + "epoch": 0.4578914915158759, + "grad_norm": 0.12192360311746597, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 120300 + }, + { + "epoch": 0.45792955398399854, + "grad_norm": 0.14247040450572968, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 120310 + }, + { + "epoch": 0.4579676164521212, + "grad_norm": 0.1364268809556961, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 120320 + }, + { + "epoch": 0.4580056789202439, + "grad_norm": 0.13730765879154205, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 120330 + }, + { + "epoch": 0.45804374138836657, + "grad_norm": 0.12975889444351196, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 120340 + }, + { + "epoch": 0.4580818038564893, + "grad_norm": 0.12218843400478363, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 120350 + }, + { + "epoch": 0.45811986632461194, + "grad_norm": 0.11997194588184357, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 120360 + }, + { + "epoch": 0.45815792879273465, + "grad_norm": 0.1225312352180481, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 120370 + }, + { + "epoch": 0.4581959912608573, + "grad_norm": 0.1350684016942978, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 120380 + }, + { + "epoch": 0.45823405372898, + "grad_norm": 0.11670040339231491, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 120390 + }, + { + "epoch": 0.4582721161971027, + "grad_norm": 0.1220269724726677, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 120400 + }, + { + "epoch": 0.4583101786652254, + "grad_norm": 0.13793805241584778, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 120410 + }, + { + "epoch": 0.45834824113334804, + "grad_norm": 0.130221426486969, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 120420 + }, + { + "epoch": 0.45838630360147076, + "grad_norm": 0.12842504680156708, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 120430 + }, + { + "epoch": 0.4584243660695934, + "grad_norm": 0.133062481880188, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 120440 + }, + { + "epoch": 0.4584624285377161, + "grad_norm": 0.12013937532901764, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 120450 + }, + { + "epoch": 0.4585004910058388, + "grad_norm": 0.11916226148605347, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 120460 + }, + { + "epoch": 0.45853855347396144, + "grad_norm": 0.11697036027908325, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 120470 + }, + { + "epoch": 0.45857661594208415, + "grad_norm": 0.11921066790819168, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 120480 + }, + { + "epoch": 0.4586146784102068, + "grad_norm": 0.12394513189792633, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 120490 + }, + { + "epoch": 0.4586527408783295, + "grad_norm": 0.11950384080410004, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 120500 + }, + { + "epoch": 0.4586908033464522, + "grad_norm": 0.133575439453125, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 120510 + }, + { + "epoch": 0.4587288658145749, + "grad_norm": 0.12195932120084763, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 120520 + }, + { + "epoch": 0.45876692828269755, + "grad_norm": 0.13978615403175354, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 120530 + }, + { + "epoch": 0.45880499075082026, + "grad_norm": 0.11702223867177963, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 120540 + }, + { + "epoch": 0.4588430532189429, + "grad_norm": 0.15952906012535095, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 120550 + }, + { + "epoch": 0.45888111568706563, + "grad_norm": 0.1334410309791565, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 120560 + }, + { + "epoch": 0.4589191781551883, + "grad_norm": 0.13073650002479553, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 120570 + }, + { + "epoch": 0.458957240623311, + "grad_norm": 0.12061751633882523, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 120580 + }, + { + "epoch": 0.45899530309143366, + "grad_norm": 0.12560215592384338, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 120590 + }, + { + "epoch": 0.4590333655595563, + "grad_norm": 0.13219398260116577, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 120600 + }, + { + "epoch": 0.45907142802767903, + "grad_norm": 0.13976603746414185, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 120610 + }, + { + "epoch": 0.4591094904958017, + "grad_norm": 0.12265753746032715, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 120620 + }, + { + "epoch": 0.4591475529639244, + "grad_norm": 0.1313551366329193, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 120630 + }, + { + "epoch": 0.45918561543204706, + "grad_norm": 0.11972445994615555, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 120640 + }, + { + "epoch": 0.45922367790016977, + "grad_norm": 0.13018518686294556, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 120650 + }, + { + "epoch": 0.4592617403682924, + "grad_norm": 0.14095982909202576, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 120660 + }, + { + "epoch": 0.45929980283641514, + "grad_norm": 0.11395607143640518, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 120670 + }, + { + "epoch": 0.4593378653045378, + "grad_norm": 0.1700298935174942, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 120680 + }, + { + "epoch": 0.4593759277726605, + "grad_norm": 0.1294533908367157, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 120690 + }, + { + "epoch": 0.45941399024078317, + "grad_norm": 0.12129295617341995, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 120700 + }, + { + "epoch": 0.4594520527089059, + "grad_norm": 0.12236347049474716, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 120710 + }, + { + "epoch": 0.45949011517702854, + "grad_norm": 0.12837907671928406, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 120720 + }, + { + "epoch": 0.45952817764515125, + "grad_norm": 0.12483696639537811, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 120730 + }, + { + "epoch": 0.4595662401132739, + "grad_norm": 0.13650977611541748, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 120740 + }, + { + "epoch": 0.45960430258139656, + "grad_norm": 0.13617606461048126, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 120750 + }, + { + "epoch": 0.4596423650495193, + "grad_norm": 0.13370659947395325, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 120760 + }, + { + "epoch": 0.45968042751764193, + "grad_norm": 0.1324436515569687, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 120770 + }, + { + "epoch": 0.45971848998576464, + "grad_norm": 0.11624017357826233, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 120780 + }, + { + "epoch": 0.4597565524538873, + "grad_norm": 0.12417466193437576, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 120790 + }, + { + "epoch": 0.45979461492201, + "grad_norm": 0.13375669717788696, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 120800 + }, + { + "epoch": 0.45983267739013267, + "grad_norm": 0.12934353947639465, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 120810 + }, + { + "epoch": 0.4598707398582554, + "grad_norm": 0.12061566859483719, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 120820 + }, + { + "epoch": 0.45990880232637804, + "grad_norm": 0.1487627923488617, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 120830 + }, + { + "epoch": 0.45994686479450075, + "grad_norm": 0.1293063908815384, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 120840 + }, + { + "epoch": 0.4599849272626234, + "grad_norm": 0.12316817790269852, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 120850 + }, + { + "epoch": 0.4600229897307461, + "grad_norm": 0.128492072224617, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 120860 + }, + { + "epoch": 0.4600610521988688, + "grad_norm": 0.13701295852661133, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 120870 + }, + { + "epoch": 0.4600991146669915, + "grad_norm": 0.12807445228099823, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 120880 + }, + { + "epoch": 0.46013717713511415, + "grad_norm": 0.12884552776813507, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 120890 + }, + { + "epoch": 0.4601752396032368, + "grad_norm": 0.13927030563354492, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 120900 + }, + { + "epoch": 0.4602133020713595, + "grad_norm": 0.14055320620536804, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 120910 + }, + { + "epoch": 0.4602513645394822, + "grad_norm": 0.1349189579486847, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 120920 + }, + { + "epoch": 0.4602894270076049, + "grad_norm": 0.1196657121181488, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 120930 + }, + { + "epoch": 0.46032748947572755, + "grad_norm": 0.11339619010686874, + "learning_rate": 0.0005, + "loss": 2.0867, + "step": 120940 + }, + { + "epoch": 0.46036555194385026, + "grad_norm": 0.13684843480587006, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 120950 + }, + { + "epoch": 0.4604036144119729, + "grad_norm": 0.11891072243452072, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 120960 + }, + { + "epoch": 0.46044167688009563, + "grad_norm": 0.1228720098733902, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 120970 + }, + { + "epoch": 0.4604797393482183, + "grad_norm": 0.11489354074001312, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 120980 + }, + { + "epoch": 0.460517801816341, + "grad_norm": 0.1338176280260086, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 120990 + }, + { + "epoch": 0.46055586428446366, + "grad_norm": 0.12991927564144135, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 121000 + }, + { + "epoch": 0.46059392675258637, + "grad_norm": 0.13514523208141327, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 121010 + }, + { + "epoch": 0.460631989220709, + "grad_norm": 0.12413940578699112, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 121020 + }, + { + "epoch": 0.4606700516888317, + "grad_norm": 0.12618406116962433, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 121030 + }, + { + "epoch": 0.4607081141569544, + "grad_norm": 0.11842025816440582, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 121040 + }, + { + "epoch": 0.46074617662507705, + "grad_norm": 0.13303187489509583, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 121050 + }, + { + "epoch": 0.46078423909319977, + "grad_norm": 0.14305202662944794, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 121060 + }, + { + "epoch": 0.4608223015613224, + "grad_norm": 0.1284281462430954, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 121070 + }, + { + "epoch": 0.46086036402944514, + "grad_norm": 0.13640108704566956, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 121080 + }, + { + "epoch": 0.4608984264975678, + "grad_norm": 0.11827525496482849, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 121090 + }, + { + "epoch": 0.4609364889656905, + "grad_norm": 0.1367952823638916, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 121100 + }, + { + "epoch": 0.46097455143381316, + "grad_norm": 0.12315177172422409, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 121110 + }, + { + "epoch": 0.4610126139019359, + "grad_norm": 0.12004311382770538, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 121120 + }, + { + "epoch": 0.46105067637005853, + "grad_norm": 0.11687660217285156, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 121130 + }, + { + "epoch": 0.46108873883818124, + "grad_norm": 0.11443885415792465, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 121140 + }, + { + "epoch": 0.4611268013063039, + "grad_norm": 0.13030986487865448, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 121150 + }, + { + "epoch": 0.4611648637744266, + "grad_norm": 0.1401316374540329, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 121160 + }, + { + "epoch": 0.46120292624254927, + "grad_norm": 0.12074064463376999, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 121170 + }, + { + "epoch": 0.46124098871067193, + "grad_norm": 0.11939749121665955, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 121180 + }, + { + "epoch": 0.46127905117879464, + "grad_norm": 0.123379185795784, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 121190 + }, + { + "epoch": 0.4613171136469173, + "grad_norm": 0.1268387883901596, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 121200 + }, + { + "epoch": 0.46135517611504, + "grad_norm": 0.14271396398544312, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 121210 + }, + { + "epoch": 0.46139323858316267, + "grad_norm": 0.11226682364940643, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 121220 + }, + { + "epoch": 0.4614313010512854, + "grad_norm": 0.12411001324653625, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 121230 + }, + { + "epoch": 0.46146936351940804, + "grad_norm": 0.12751705944538116, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 121240 + }, + { + "epoch": 0.46150742598753075, + "grad_norm": 0.11743474751710892, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 121250 + }, + { + "epoch": 0.4615454884556534, + "grad_norm": 0.1255132555961609, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 121260 + }, + { + "epoch": 0.4615835509237761, + "grad_norm": 0.12986207008361816, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 121270 + }, + { + "epoch": 0.4616216133918988, + "grad_norm": 0.12577657401561737, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 121280 + }, + { + "epoch": 0.4616596758600215, + "grad_norm": 0.15010912716388702, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 121290 + }, + { + "epoch": 0.46169773832814415, + "grad_norm": 0.13126231729984283, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 121300 + }, + { + "epoch": 0.46173580079626686, + "grad_norm": 0.12573876976966858, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 121310 + }, + { + "epoch": 0.4617738632643895, + "grad_norm": 0.12809449434280396, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 121320 + }, + { + "epoch": 0.4618119257325122, + "grad_norm": 0.11522986739873886, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 121330 + }, + { + "epoch": 0.4618499882006349, + "grad_norm": 0.1317320615053177, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 121340 + }, + { + "epoch": 0.46188805066875754, + "grad_norm": 0.1397280991077423, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 121350 + }, + { + "epoch": 0.46192611313688026, + "grad_norm": 0.12934623658657074, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 121360 + }, + { + "epoch": 0.4619641756050029, + "grad_norm": 0.12387774139642715, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 121370 + }, + { + "epoch": 0.4620022380731256, + "grad_norm": 0.12394656985998154, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 121380 + }, + { + "epoch": 0.4620403005412483, + "grad_norm": 0.12481960654258728, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 121390 + }, + { + "epoch": 0.462078363009371, + "grad_norm": 0.12520074844360352, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 121400 + }, + { + "epoch": 0.46211642547749365, + "grad_norm": 0.12304385006427765, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 121410 + }, + { + "epoch": 0.46215448794561637, + "grad_norm": 0.1355585902929306, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 121420 + }, + { + "epoch": 0.462192550413739, + "grad_norm": 0.11432173103094101, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 121430 + }, + { + "epoch": 0.46223061288186174, + "grad_norm": 0.1187356635928154, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 121440 + }, + { + "epoch": 0.4622686753499844, + "grad_norm": 0.1240907609462738, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 121450 + }, + { + "epoch": 0.46230673781810705, + "grad_norm": 0.12461048364639282, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 121460 + }, + { + "epoch": 0.46234480028622976, + "grad_norm": 0.13123613595962524, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 121470 + }, + { + "epoch": 0.4623828627543524, + "grad_norm": 0.13103175163269043, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 121480 + }, + { + "epoch": 0.46242092522247513, + "grad_norm": 0.12875773012638092, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 121490 + }, + { + "epoch": 0.4624589876905978, + "grad_norm": 0.13313820958137512, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 121500 + }, + { + "epoch": 0.4624970501587205, + "grad_norm": 0.11926737427711487, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 121510 + }, + { + "epoch": 0.46253511262684316, + "grad_norm": 0.11945221573114395, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 121520 + }, + { + "epoch": 0.46257317509496587, + "grad_norm": 0.13894334435462952, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 121530 + }, + { + "epoch": 0.46261123756308853, + "grad_norm": 0.13981805741786957, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 121540 + }, + { + "epoch": 0.46264930003121124, + "grad_norm": 0.12328756600618362, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 121550 + }, + { + "epoch": 0.4626873624993339, + "grad_norm": 0.1285262405872345, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 121560 + }, + { + "epoch": 0.4627254249674566, + "grad_norm": 0.1343710571527481, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 121570 + }, + { + "epoch": 0.46276348743557927, + "grad_norm": 0.1505940854549408, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 121580 + }, + { + "epoch": 0.462801549903702, + "grad_norm": 0.12722337245941162, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 121590 + }, + { + "epoch": 0.46283961237182464, + "grad_norm": 0.11916808038949966, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 121600 + }, + { + "epoch": 0.4628776748399473, + "grad_norm": 0.12827543914318085, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 121610 + }, + { + "epoch": 0.46291573730807, + "grad_norm": 0.11692959070205688, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 121620 + }, + { + "epoch": 0.46295379977619266, + "grad_norm": 0.12335637211799622, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 121630 + }, + { + "epoch": 0.4629918622443154, + "grad_norm": 0.11940497905015945, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 121640 + }, + { + "epoch": 0.46302992471243803, + "grad_norm": 0.11708685755729675, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 121650 + }, + { + "epoch": 0.46306798718056075, + "grad_norm": 0.1382119208574295, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 121660 + }, + { + "epoch": 0.4631060496486834, + "grad_norm": 0.14125585556030273, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 121670 + }, + { + "epoch": 0.4631441121168061, + "grad_norm": 0.11189690232276917, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 121680 + }, + { + "epoch": 0.4631821745849288, + "grad_norm": 0.1348118931055069, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 121690 + }, + { + "epoch": 0.4632202370530515, + "grad_norm": 0.11401959508657455, + "learning_rate": 0.0005, + "loss": 2.1344, + "step": 121700 + }, + { + "epoch": 0.46325829952117414, + "grad_norm": 0.12618707120418549, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 121710 + }, + { + "epoch": 0.46329636198929686, + "grad_norm": 0.13240477442741394, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 121720 + }, + { + "epoch": 0.4633344244574195, + "grad_norm": 0.12720882892608643, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 121730 + }, + { + "epoch": 0.4633724869255422, + "grad_norm": 0.11353076249361038, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 121740 + }, + { + "epoch": 0.4634105493936649, + "grad_norm": 0.1258583813905716, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 121750 + }, + { + "epoch": 0.46344861186178754, + "grad_norm": 0.12236791104078293, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 121760 + }, + { + "epoch": 0.46348667432991025, + "grad_norm": 0.12761059403419495, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 121770 + }, + { + "epoch": 0.4635247367980329, + "grad_norm": 0.12125248461961746, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 121780 + }, + { + "epoch": 0.4635627992661556, + "grad_norm": 0.1282951980829239, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 121790 + }, + { + "epoch": 0.4636008617342783, + "grad_norm": 0.12211728096008301, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 121800 + }, + { + "epoch": 0.463638924202401, + "grad_norm": 0.11740932613611221, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 121810 + }, + { + "epoch": 0.46367698667052365, + "grad_norm": 0.11825814098119736, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 121820 + }, + { + "epoch": 0.46371504913864636, + "grad_norm": 0.1204843744635582, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 121830 + }, + { + "epoch": 0.463753111606769, + "grad_norm": 0.12917695939540863, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 121840 + }, + { + "epoch": 0.46379117407489173, + "grad_norm": 0.12288330495357513, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 121850 + }, + { + "epoch": 0.4638292365430144, + "grad_norm": 0.11910853534936905, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 121860 + }, + { + "epoch": 0.4638672990111371, + "grad_norm": 0.11725097894668579, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 121870 + }, + { + "epoch": 0.46390536147925976, + "grad_norm": 0.11722494661808014, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 121880 + }, + { + "epoch": 0.46394342394738247, + "grad_norm": 0.12103597819805145, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 121890 + }, + { + "epoch": 0.46398148641550513, + "grad_norm": 0.12656059861183167, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 121900 + }, + { + "epoch": 0.4640195488836278, + "grad_norm": 0.1225254163146019, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 121910 + }, + { + "epoch": 0.4640576113517505, + "grad_norm": 0.12554019689559937, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 121920 + }, + { + "epoch": 0.46409567381987316, + "grad_norm": 0.1267976462841034, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 121930 + }, + { + "epoch": 0.46413373628799587, + "grad_norm": 0.13665032386779785, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 121940 + }, + { + "epoch": 0.4641717987561185, + "grad_norm": 0.1262006163597107, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 121950 + }, + { + "epoch": 0.46420986122424124, + "grad_norm": 0.12373677641153336, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 121960 + }, + { + "epoch": 0.4642479236923639, + "grad_norm": 1.3493014574050903, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 121970 + }, + { + "epoch": 0.4642859861604866, + "grad_norm": 0.1411971002817154, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 121980 + }, + { + "epoch": 0.46432404862860926, + "grad_norm": 0.1318981945514679, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 121990 + }, + { + "epoch": 0.464362111096732, + "grad_norm": 0.11528051644563675, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 122000 + }, + { + "epoch": 0.46440017356485463, + "grad_norm": 0.1222640722990036, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 122010 + }, + { + "epoch": 0.46443823603297735, + "grad_norm": 0.12968085706233978, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 122020 + }, + { + "epoch": 0.4644762985011, + "grad_norm": 0.12960520386695862, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 122030 + }, + { + "epoch": 0.46451436096922266, + "grad_norm": 0.12939879298210144, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 122040 + }, + { + "epoch": 0.4645524234373454, + "grad_norm": 0.12176186591386795, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 122050 + }, + { + "epoch": 0.46459048590546803, + "grad_norm": 0.11837892979383469, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 122060 + }, + { + "epoch": 0.46462854837359074, + "grad_norm": 0.12619103491306305, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 122070 + }, + { + "epoch": 0.4646666108417134, + "grad_norm": 0.12157522141933441, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 122080 + }, + { + "epoch": 0.4647046733098361, + "grad_norm": 0.11906483769416809, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 122090 + }, + { + "epoch": 0.46474273577795877, + "grad_norm": 0.13369664549827576, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 122100 + }, + { + "epoch": 0.4647807982460815, + "grad_norm": 0.1242845430970192, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 122110 + }, + { + "epoch": 0.46481886071420414, + "grad_norm": 0.12459714710712433, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 122120 + }, + { + "epoch": 0.46485692318232685, + "grad_norm": 0.12096191197633743, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 122130 + }, + { + "epoch": 0.4648949856504495, + "grad_norm": 0.12581023573875427, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 122140 + }, + { + "epoch": 0.4649330481185722, + "grad_norm": 0.12309510260820389, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 122150 + }, + { + "epoch": 0.4649711105866949, + "grad_norm": 0.12692727148532867, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 122160 + }, + { + "epoch": 0.4650091730548176, + "grad_norm": 0.12823912501335144, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 122170 + }, + { + "epoch": 0.46504723552294025, + "grad_norm": 0.12171664834022522, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 122180 + }, + { + "epoch": 0.4650852979910629, + "grad_norm": 0.14432282745838165, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 122190 + }, + { + "epoch": 0.4651233604591856, + "grad_norm": 0.11886001378297806, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 122200 + }, + { + "epoch": 0.4651614229273083, + "grad_norm": 0.12576742470264435, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 122210 + }, + { + "epoch": 0.465199485395431, + "grad_norm": 0.14525540173053741, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 122220 + }, + { + "epoch": 0.46523754786355365, + "grad_norm": 0.1189890205860138, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 122230 + }, + { + "epoch": 0.46527561033167636, + "grad_norm": 0.12696944177150726, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 122240 + }, + { + "epoch": 0.465313672799799, + "grad_norm": 0.11398856341838837, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 122250 + }, + { + "epoch": 0.46535173526792173, + "grad_norm": 0.11955248564481735, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 122260 + }, + { + "epoch": 0.4653897977360444, + "grad_norm": 0.13834500312805176, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 122270 + }, + { + "epoch": 0.4654278602041671, + "grad_norm": 0.12638892233371735, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 122280 + }, + { + "epoch": 0.46546592267228976, + "grad_norm": 0.12643951177597046, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 122290 + }, + { + "epoch": 0.46550398514041247, + "grad_norm": 0.13313770294189453, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 122300 + }, + { + "epoch": 0.4655420476085351, + "grad_norm": 0.13501465320587158, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 122310 + }, + { + "epoch": 0.46558011007665784, + "grad_norm": 0.13232651352882385, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 122320 + }, + { + "epoch": 0.4656181725447805, + "grad_norm": 0.12018989026546478, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 122330 + }, + { + "epoch": 0.46565623501290315, + "grad_norm": 0.1271781027317047, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 122340 + }, + { + "epoch": 0.46569429748102587, + "grad_norm": 0.12841512262821198, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 122350 + }, + { + "epoch": 0.4657323599491485, + "grad_norm": 0.11983626335859299, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 122360 + }, + { + "epoch": 0.46577042241727123, + "grad_norm": 0.13224050402641296, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 122370 + }, + { + "epoch": 0.4658084848853939, + "grad_norm": 0.12928161025047302, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 122380 + }, + { + "epoch": 0.4658465473535166, + "grad_norm": 0.11840779334306717, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 122390 + }, + { + "epoch": 0.46588460982163926, + "grad_norm": 0.12242060899734497, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 122400 + }, + { + "epoch": 0.465922672289762, + "grad_norm": 0.15045621991157532, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 122410 + }, + { + "epoch": 0.46596073475788463, + "grad_norm": 0.1329570859670639, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 122420 + }, + { + "epoch": 0.46599879722600734, + "grad_norm": 0.12627650797367096, + "learning_rate": 0.0005, + "loss": 2.1426, + "step": 122430 + }, + { + "epoch": 0.46603685969413, + "grad_norm": 0.12650640308856964, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 122440 + }, + { + "epoch": 0.4660749221622527, + "grad_norm": 0.12126204371452332, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 122450 + }, + { + "epoch": 0.46611298463037537, + "grad_norm": 0.12744243443012238, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 122460 + }, + { + "epoch": 0.46615104709849803, + "grad_norm": 0.11687210947275162, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 122470 + }, + { + "epoch": 0.46618910956662074, + "grad_norm": 0.12865188717842102, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 122480 + }, + { + "epoch": 0.4662271720347434, + "grad_norm": 0.14424051344394684, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 122490 + }, + { + "epoch": 0.4662652345028661, + "grad_norm": 0.12110546231269836, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 122500 + }, + { + "epoch": 0.46630329697098877, + "grad_norm": 0.13489185273647308, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 122510 + }, + { + "epoch": 0.4663413594391115, + "grad_norm": 0.1381700485944748, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 122520 + }, + { + "epoch": 0.46637942190723414, + "grad_norm": 0.13211920857429504, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 122530 + }, + { + "epoch": 0.46641748437535685, + "grad_norm": 0.132435604929924, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 122540 + }, + { + "epoch": 0.4664555468434795, + "grad_norm": 0.12245769053697586, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 122550 + }, + { + "epoch": 0.4664936093116022, + "grad_norm": 0.1318317949771881, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 122560 + }, + { + "epoch": 0.4665316717797249, + "grad_norm": 0.13142701983451843, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 122570 + }, + { + "epoch": 0.4665697342478476, + "grad_norm": 0.1299506574869156, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 122580 + }, + { + "epoch": 0.46660779671597025, + "grad_norm": 0.14254699647426605, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 122590 + }, + { + "epoch": 0.46664585918409296, + "grad_norm": 0.11915218085050583, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 122600 + }, + { + "epoch": 0.4666839216522156, + "grad_norm": 0.11730083078145981, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 122610 + }, + { + "epoch": 0.4667219841203383, + "grad_norm": 0.14593356847763062, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 122620 + }, + { + "epoch": 0.466760046588461, + "grad_norm": 0.1312982439994812, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 122630 + }, + { + "epoch": 0.46679810905658364, + "grad_norm": 0.12263674288988113, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 122640 + }, + { + "epoch": 0.46683617152470636, + "grad_norm": 0.13540539145469666, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 122650 + }, + { + "epoch": 0.466874233992829, + "grad_norm": 0.12395121902227402, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 122660 + }, + { + "epoch": 0.4669122964609517, + "grad_norm": 0.14132188260555267, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 122670 + }, + { + "epoch": 0.4669503589290744, + "grad_norm": 0.11930973082780838, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 122680 + }, + { + "epoch": 0.4669884213971971, + "grad_norm": 0.13231684267520905, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 122690 + }, + { + "epoch": 0.46702648386531975, + "grad_norm": 0.14038299024105072, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 122700 + }, + { + "epoch": 0.46706454633344247, + "grad_norm": 0.1272704005241394, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 122710 + }, + { + "epoch": 0.4671026088015651, + "grad_norm": 0.1259326934814453, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 122720 + }, + { + "epoch": 0.46714067126968783, + "grad_norm": 0.11703534424304962, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 122730 + }, + { + "epoch": 0.4671787337378105, + "grad_norm": 0.11842364817857742, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 122740 + }, + { + "epoch": 0.4672167962059332, + "grad_norm": 0.14716917276382446, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 122750 + }, + { + "epoch": 0.46725485867405586, + "grad_norm": 0.12898138165473938, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 122760 + }, + { + "epoch": 0.4672929211421785, + "grad_norm": 0.11034823209047318, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 122770 + }, + { + "epoch": 0.46733098361030123, + "grad_norm": 0.11895115673542023, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 122780 + }, + { + "epoch": 0.4673690460784239, + "grad_norm": 0.12958012521266937, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 122790 + }, + { + "epoch": 0.4674071085465466, + "grad_norm": 0.12227039039134979, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 122800 + }, + { + "epoch": 0.46744517101466926, + "grad_norm": 0.12844082713127136, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 122810 + }, + { + "epoch": 0.46748323348279197, + "grad_norm": 0.11611893773078918, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 122820 + }, + { + "epoch": 0.46752129595091463, + "grad_norm": 0.16009820997714996, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 122830 + }, + { + "epoch": 0.46755935841903734, + "grad_norm": 0.1324053853750229, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 122840 + }, + { + "epoch": 0.46759742088716, + "grad_norm": 0.129500612616539, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 122850 + }, + { + "epoch": 0.4676354833552827, + "grad_norm": 0.1253543645143509, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 122860 + }, + { + "epoch": 0.46767354582340537, + "grad_norm": 0.12207604199647903, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 122870 + }, + { + "epoch": 0.4677116082915281, + "grad_norm": 0.13183759152889252, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 122880 + }, + { + "epoch": 0.46774967075965074, + "grad_norm": 0.13390865921974182, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 122890 + }, + { + "epoch": 0.4677877332277734, + "grad_norm": 0.12353968620300293, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 122900 + }, + { + "epoch": 0.4678257956958961, + "grad_norm": 0.1235695481300354, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 122910 + }, + { + "epoch": 0.46786385816401876, + "grad_norm": 0.1252755969762802, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 122920 + }, + { + "epoch": 0.4679019206321415, + "grad_norm": 0.12756647169589996, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 122930 + }, + { + "epoch": 0.46793998310026413, + "grad_norm": 0.13085618615150452, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 122940 + }, + { + "epoch": 0.46797804556838685, + "grad_norm": 0.12621687352657318, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 122950 + }, + { + "epoch": 0.4680161080365095, + "grad_norm": 0.119481660425663, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 122960 + }, + { + "epoch": 0.4680541705046322, + "grad_norm": 0.12664827704429626, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 122970 + }, + { + "epoch": 0.4680922329727549, + "grad_norm": 0.12512001395225525, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 122980 + }, + { + "epoch": 0.4681302954408776, + "grad_norm": 0.13938689231872559, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 122990 + }, + { + "epoch": 0.46816835790900024, + "grad_norm": 0.1311841458082199, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 123000 + }, + { + "epoch": 0.46820642037712296, + "grad_norm": 0.12570905685424805, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 123010 + }, + { + "epoch": 0.4682444828452456, + "grad_norm": 0.11313197016716003, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 123020 + }, + { + "epoch": 0.4682825453133683, + "grad_norm": 0.11979328840970993, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 123030 + }, + { + "epoch": 0.468320607781491, + "grad_norm": 0.1246039941906929, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 123040 + }, + { + "epoch": 0.46835867024961364, + "grad_norm": 0.11751657724380493, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 123050 + }, + { + "epoch": 0.46839673271773635, + "grad_norm": 0.13147424161434174, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 123060 + }, + { + "epoch": 0.468434795185859, + "grad_norm": 0.119588203728199, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 123070 + }, + { + "epoch": 0.4684728576539817, + "grad_norm": 0.12306036800146103, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 123080 + }, + { + "epoch": 0.4685109201221044, + "grad_norm": 0.1368551254272461, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 123090 + }, + { + "epoch": 0.4685489825902271, + "grad_norm": 0.14483638107776642, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 123100 + }, + { + "epoch": 0.46858704505834975, + "grad_norm": 0.11744410544633865, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 123110 + }, + { + "epoch": 0.46862510752647246, + "grad_norm": 0.1234697699546814, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 123120 + }, + { + "epoch": 0.4686631699945951, + "grad_norm": 0.14475607872009277, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 123130 + }, + { + "epoch": 0.46870123246271783, + "grad_norm": 0.11726272106170654, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 123140 + }, + { + "epoch": 0.4687392949308405, + "grad_norm": 0.11984838545322418, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 123150 + }, + { + "epoch": 0.4687773573989632, + "grad_norm": 0.1178346648812294, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 123160 + }, + { + "epoch": 0.46881541986708586, + "grad_norm": 0.1334547996520996, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 123170 + }, + { + "epoch": 0.46885348233520857, + "grad_norm": 0.13188724219799042, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 123180 + }, + { + "epoch": 0.46889154480333123, + "grad_norm": 0.12499808520078659, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 123190 + }, + { + "epoch": 0.4689296072714539, + "grad_norm": 0.12053222954273224, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 123200 + }, + { + "epoch": 0.4689676697395766, + "grad_norm": 0.12222561240196228, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 123210 + }, + { + "epoch": 0.46900573220769926, + "grad_norm": 0.12280338257551193, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 123220 + }, + { + "epoch": 0.46904379467582197, + "grad_norm": 0.12783880531787872, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 123230 + }, + { + "epoch": 0.4690818571439446, + "grad_norm": 0.15136320888996124, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 123240 + }, + { + "epoch": 0.46911991961206734, + "grad_norm": 0.1327112317085266, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 123250 + }, + { + "epoch": 0.46915798208019, + "grad_norm": 0.11947908252477646, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 123260 + }, + { + "epoch": 0.4691960445483127, + "grad_norm": 0.12162414193153381, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 123270 + }, + { + "epoch": 0.46923410701643536, + "grad_norm": 0.13324034214019775, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 123280 + }, + { + "epoch": 0.4692721694845581, + "grad_norm": 0.12378178536891937, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 123290 + }, + { + "epoch": 0.46931023195268073, + "grad_norm": 0.13062675297260284, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 123300 + }, + { + "epoch": 0.46934829442080345, + "grad_norm": 0.1286676824092865, + "learning_rate": 0.0005, + "loss": 2.1348, + "step": 123310 + }, + { + "epoch": 0.4693863568889261, + "grad_norm": 0.11752732843160629, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 123320 + }, + { + "epoch": 0.46942441935704876, + "grad_norm": 0.14866508543491364, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 123330 + }, + { + "epoch": 0.4694624818251715, + "grad_norm": 0.11668729037046432, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 123340 + }, + { + "epoch": 0.46950054429329413, + "grad_norm": 0.13470356166362762, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 123350 + }, + { + "epoch": 0.46953860676141684, + "grad_norm": 0.14405706524848938, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 123360 + }, + { + "epoch": 0.4695766692295395, + "grad_norm": 0.128583624958992, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 123370 + }, + { + "epoch": 0.4696147316976622, + "grad_norm": 0.1338459551334381, + "learning_rate": 0.0005, + "loss": 2.1377, + "step": 123380 + }, + { + "epoch": 0.46965279416578487, + "grad_norm": 0.12829987704753876, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 123390 + }, + { + "epoch": 0.4696908566339076, + "grad_norm": 0.12520089745521545, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 123400 + }, + { + "epoch": 0.46972891910203024, + "grad_norm": 0.12828919291496277, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 123410 + }, + { + "epoch": 0.46976698157015295, + "grad_norm": 0.12368378788232803, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 123420 + }, + { + "epoch": 0.4698050440382756, + "grad_norm": 0.12954358756542206, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 123430 + }, + { + "epoch": 0.4698431065063983, + "grad_norm": 0.12930402159690857, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 123440 + }, + { + "epoch": 0.469881168974521, + "grad_norm": 0.12339557707309723, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 123450 + }, + { + "epoch": 0.4699192314426437, + "grad_norm": 0.11649967730045319, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 123460 + }, + { + "epoch": 0.46995729391076635, + "grad_norm": 0.1173417717218399, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 123470 + }, + { + "epoch": 0.469995356378889, + "grad_norm": 0.1417657881975174, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 123480 + }, + { + "epoch": 0.4700334188470117, + "grad_norm": 0.13985687494277954, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 123490 + }, + { + "epoch": 0.4700714813151344, + "grad_norm": 0.12331917881965637, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 123500 + }, + { + "epoch": 0.4701095437832571, + "grad_norm": 0.12005551159381866, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 123510 + }, + { + "epoch": 0.47014760625137975, + "grad_norm": 0.11983375996351242, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 123520 + }, + { + "epoch": 0.47018566871950246, + "grad_norm": 0.11790003627538681, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 123530 + }, + { + "epoch": 0.4702237311876251, + "grad_norm": 0.14322248101234436, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 123540 + }, + { + "epoch": 0.47026179365574783, + "grad_norm": 0.12420692294836044, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 123550 + }, + { + "epoch": 0.4702998561238705, + "grad_norm": 0.12818403542041779, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 123560 + }, + { + "epoch": 0.4703379185919932, + "grad_norm": 0.13269901275634766, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 123570 + }, + { + "epoch": 0.47037598106011586, + "grad_norm": 0.12523117661476135, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 123580 + }, + { + "epoch": 0.47041404352823857, + "grad_norm": 0.11941179633140564, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 123590 + }, + { + "epoch": 0.4704521059963612, + "grad_norm": 0.12068691849708557, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 123600 + }, + { + "epoch": 0.47049016846448394, + "grad_norm": 0.12186926603317261, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 123610 + }, + { + "epoch": 0.4705282309326066, + "grad_norm": 0.12164727598428726, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 123620 + }, + { + "epoch": 0.47056629340072925, + "grad_norm": 0.11695064604282379, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 123630 + }, + { + "epoch": 0.47060435586885196, + "grad_norm": 0.12620823085308075, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 123640 + }, + { + "epoch": 0.4706424183369746, + "grad_norm": 0.11972874402999878, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 123650 + }, + { + "epoch": 0.47068048080509733, + "grad_norm": 0.1494438350200653, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 123660 + }, + { + "epoch": 0.47071854327322, + "grad_norm": 0.13811999559402466, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 123670 + }, + { + "epoch": 0.4707566057413427, + "grad_norm": 0.12858720123767853, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 123680 + }, + { + "epoch": 0.47079466820946536, + "grad_norm": 0.12714563310146332, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 123690 + }, + { + "epoch": 0.4708327306775881, + "grad_norm": 0.11671023070812225, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 123700 + }, + { + "epoch": 0.47087079314571073, + "grad_norm": 0.13238412141799927, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 123710 + }, + { + "epoch": 0.47090885561383344, + "grad_norm": 0.11716995388269424, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 123720 + }, + { + "epoch": 0.4709469180819561, + "grad_norm": 0.1207890436053276, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 123730 + }, + { + "epoch": 0.4709849805500788, + "grad_norm": 0.12667891383171082, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 123740 + }, + { + "epoch": 0.47102304301820147, + "grad_norm": 0.12619231641292572, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 123750 + }, + { + "epoch": 0.4710611054863241, + "grad_norm": 0.11664183437824249, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 123760 + }, + { + "epoch": 0.47109916795444684, + "grad_norm": 0.11987679451704025, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 123770 + }, + { + "epoch": 0.4711372304225695, + "grad_norm": 0.13168591260910034, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 123780 + }, + { + "epoch": 0.4711752928906922, + "grad_norm": 0.11626217514276505, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 123790 + }, + { + "epoch": 0.47121335535881487, + "grad_norm": 0.1286042183637619, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 123800 + }, + { + "epoch": 0.4712514178269376, + "grad_norm": 0.12957172095775604, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 123810 + }, + { + "epoch": 0.47128948029506024, + "grad_norm": 0.12456455081701279, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 123820 + }, + { + "epoch": 0.47132754276318295, + "grad_norm": 0.1218956708908081, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 123830 + }, + { + "epoch": 0.4713656052313056, + "grad_norm": 0.11381538212299347, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 123840 + }, + { + "epoch": 0.4714036676994283, + "grad_norm": 0.11608819663524628, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 123850 + }, + { + "epoch": 0.471441730167551, + "grad_norm": 0.13417969644069672, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 123860 + }, + { + "epoch": 0.4714797926356737, + "grad_norm": 0.12660396099090576, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 123870 + }, + { + "epoch": 0.47151785510379635, + "grad_norm": 0.1264629364013672, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 123880 + }, + { + "epoch": 0.47155591757191906, + "grad_norm": 0.1252276450395584, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 123890 + }, + { + "epoch": 0.4715939800400417, + "grad_norm": 0.14274397492408752, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 123900 + }, + { + "epoch": 0.4716320425081644, + "grad_norm": 0.12475555390119553, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 123910 + }, + { + "epoch": 0.4716701049762871, + "grad_norm": 0.12727521359920502, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 123920 + }, + { + "epoch": 0.47170816744440974, + "grad_norm": 0.13684184849262238, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 123930 + }, + { + "epoch": 0.47174622991253246, + "grad_norm": 0.13249535858631134, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 123940 + }, + { + "epoch": 0.4717842923806551, + "grad_norm": 0.1293686330318451, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 123950 + }, + { + "epoch": 0.4718223548487778, + "grad_norm": 0.14200104773044586, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 123960 + }, + { + "epoch": 0.4718604173169005, + "grad_norm": 0.12920936942100525, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 123970 + }, + { + "epoch": 0.4718984797850232, + "grad_norm": 0.1317531168460846, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 123980 + }, + { + "epoch": 0.47193654225314585, + "grad_norm": 0.13654613494873047, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 123990 + }, + { + "epoch": 0.47197460472126856, + "grad_norm": 0.12663967907428741, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 124000 + }, + { + "epoch": 0.4720126671893912, + "grad_norm": 0.12163762748241425, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 124010 + }, + { + "epoch": 0.47205072965751393, + "grad_norm": 0.13067223131656647, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 124020 + }, + { + "epoch": 0.4720887921256366, + "grad_norm": 0.11829733103513718, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 124030 + }, + { + "epoch": 0.4721268545937593, + "grad_norm": 0.12859944999217987, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 124040 + }, + { + "epoch": 0.47216491706188196, + "grad_norm": 0.1161130964756012, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 124050 + }, + { + "epoch": 0.4722029795300046, + "grad_norm": 0.1277155727148056, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 124060 + }, + { + "epoch": 0.47224104199812733, + "grad_norm": 0.10972582548856735, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 124070 + }, + { + "epoch": 0.47227910446625, + "grad_norm": 0.12672588229179382, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 124080 + }, + { + "epoch": 0.4723171669343727, + "grad_norm": 0.14097382128238678, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 124090 + }, + { + "epoch": 0.47235522940249536, + "grad_norm": 0.12108251452445984, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 124100 + }, + { + "epoch": 0.47239329187061807, + "grad_norm": 0.1314038187265396, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 124110 + }, + { + "epoch": 0.4724313543387407, + "grad_norm": 0.13684318959712982, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 124120 + }, + { + "epoch": 0.47246941680686344, + "grad_norm": 0.12464044988155365, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 124130 + }, + { + "epoch": 0.4725074792749861, + "grad_norm": 0.13032224774360657, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 124140 + }, + { + "epoch": 0.4725455417431088, + "grad_norm": 0.1219305470585823, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 124150 + }, + { + "epoch": 0.47258360421123147, + "grad_norm": 0.12953796982765198, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 124160 + }, + { + "epoch": 0.4726216666793542, + "grad_norm": 0.1365276426076889, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 124170 + }, + { + "epoch": 0.47265972914747684, + "grad_norm": 0.12354233115911484, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 124180 + }, + { + "epoch": 0.47269779161559955, + "grad_norm": 0.13295651972293854, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 124190 + }, + { + "epoch": 0.4727358540837222, + "grad_norm": 0.1274101585149765, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 124200 + }, + { + "epoch": 0.47277391655184486, + "grad_norm": 0.12417597323656082, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 124210 + }, + { + "epoch": 0.4728119790199676, + "grad_norm": 0.12661489844322205, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 124220 + }, + { + "epoch": 0.47285004148809023, + "grad_norm": 0.10787620395421982, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 124230 + }, + { + "epoch": 0.47288810395621295, + "grad_norm": 0.12663552165031433, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 124240 + }, + { + "epoch": 0.4729261664243356, + "grad_norm": 0.13825610280036926, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 124250 + }, + { + "epoch": 0.4729642288924583, + "grad_norm": 0.12444397807121277, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 124260 + }, + { + "epoch": 0.473002291360581, + "grad_norm": 0.11867283284664154, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 124270 + }, + { + "epoch": 0.4730403538287037, + "grad_norm": 0.13824884593486786, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 124280 + }, + { + "epoch": 0.47307841629682634, + "grad_norm": 0.1373097002506256, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 124290 + }, + { + "epoch": 0.47311647876494906, + "grad_norm": 0.11826524883508682, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 124300 + }, + { + "epoch": 0.4731545412330717, + "grad_norm": 0.11999563872814178, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 124310 + }, + { + "epoch": 0.4731926037011944, + "grad_norm": 0.11594437062740326, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 124320 + }, + { + "epoch": 0.4732306661693171, + "grad_norm": 0.11913499981164932, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 124330 + }, + { + "epoch": 0.47326872863743974, + "grad_norm": 0.12613172829151154, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 124340 + }, + { + "epoch": 0.47330679110556245, + "grad_norm": 0.11476528644561768, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 124350 + }, + { + "epoch": 0.4733448535736851, + "grad_norm": 0.12161380052566528, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 124360 + }, + { + "epoch": 0.4733829160418078, + "grad_norm": 0.12207907438278198, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 124370 + }, + { + "epoch": 0.4734209785099305, + "grad_norm": 0.129538431763649, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 124380 + }, + { + "epoch": 0.4734590409780532, + "grad_norm": 0.12141788005828857, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 124390 + }, + { + "epoch": 0.47349710344617585, + "grad_norm": 0.13145986199378967, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 124400 + }, + { + "epoch": 0.47353516591429856, + "grad_norm": 0.12737277150154114, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 124410 + }, + { + "epoch": 0.4735732283824212, + "grad_norm": 0.13328434526920319, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 124420 + }, + { + "epoch": 0.47361129085054393, + "grad_norm": 0.1177564188838005, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 124430 + }, + { + "epoch": 0.4736493533186666, + "grad_norm": 0.12407038360834122, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 124440 + }, + { + "epoch": 0.4736874157867893, + "grad_norm": 0.12505443394184113, + "learning_rate": 0.0005, + "loss": 2.1424, + "step": 124450 + }, + { + "epoch": 0.47372547825491196, + "grad_norm": 0.11660947650671005, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 124460 + }, + { + "epoch": 0.47376354072303467, + "grad_norm": 0.1413010209798813, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 124470 + }, + { + "epoch": 0.47380160319115733, + "grad_norm": 0.1457766890525818, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 124480 + }, + { + "epoch": 0.47383966565928, + "grad_norm": 0.11343556642532349, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 124490 + }, + { + "epoch": 0.4738777281274027, + "grad_norm": 0.13399618864059448, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 124500 + }, + { + "epoch": 0.47391579059552535, + "grad_norm": 0.1370176076889038, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 124510 + }, + { + "epoch": 0.47395385306364807, + "grad_norm": 0.1271154284477234, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 124520 + }, + { + "epoch": 0.4739919155317707, + "grad_norm": 0.12623514235019684, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 124530 + }, + { + "epoch": 0.47402997799989344, + "grad_norm": 0.15138548612594604, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 124540 + }, + { + "epoch": 0.4740680404680161, + "grad_norm": 0.1311340481042862, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 124550 + }, + { + "epoch": 0.4741061029361388, + "grad_norm": 0.12341172248125076, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 124560 + }, + { + "epoch": 0.47414416540426146, + "grad_norm": 0.12557727098464966, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 124570 + }, + { + "epoch": 0.4741822278723842, + "grad_norm": 0.12559503316879272, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 124580 + }, + { + "epoch": 0.47422029034050683, + "grad_norm": 0.1338992416858673, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 124590 + }, + { + "epoch": 0.47425835280862955, + "grad_norm": 0.12634088099002838, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 124600 + }, + { + "epoch": 0.4742964152767522, + "grad_norm": 0.12977583706378937, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 124610 + }, + { + "epoch": 0.4743344777448749, + "grad_norm": 0.11903408169746399, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 124620 + }, + { + "epoch": 0.4743725402129976, + "grad_norm": 0.12493615597486496, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 124630 + }, + { + "epoch": 0.47441060268112023, + "grad_norm": 0.12668555974960327, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 124640 + }, + { + "epoch": 0.47444866514924294, + "grad_norm": 0.11522762477397919, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 124650 + }, + { + "epoch": 0.4744867276173656, + "grad_norm": 0.12461698055267334, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 124660 + }, + { + "epoch": 0.4745247900854883, + "grad_norm": 0.11919340491294861, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 124670 + }, + { + "epoch": 0.47456285255361097, + "grad_norm": 0.1142156720161438, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 124680 + }, + { + "epoch": 0.4746009150217337, + "grad_norm": 0.1265660524368286, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 124690 + }, + { + "epoch": 0.47463897748985634, + "grad_norm": 0.1387627124786377, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 124700 + }, + { + "epoch": 0.47467703995797905, + "grad_norm": 0.14065468311309814, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 124710 + }, + { + "epoch": 0.4747151024261017, + "grad_norm": 0.1256401389837265, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 124720 + }, + { + "epoch": 0.4747531648942244, + "grad_norm": 0.12581953406333923, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 124730 + }, + { + "epoch": 0.4747912273623471, + "grad_norm": 0.13003399968147278, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 124740 + }, + { + "epoch": 0.4748292898304698, + "grad_norm": 0.12539975345134735, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 124750 + }, + { + "epoch": 0.47486735229859245, + "grad_norm": 0.1365066021680832, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 124760 + }, + { + "epoch": 0.4749054147667151, + "grad_norm": 0.1297515630722046, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 124770 + }, + { + "epoch": 0.4749434772348378, + "grad_norm": 0.12623704969882965, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 124780 + }, + { + "epoch": 0.4749815397029605, + "grad_norm": 0.12066211551427841, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 124790 + }, + { + "epoch": 0.4750196021710832, + "grad_norm": 0.11945539712905884, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 124800 + }, + { + "epoch": 0.47505766463920585, + "grad_norm": 0.13884441554546356, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 124810 + }, + { + "epoch": 0.47509572710732856, + "grad_norm": 0.1486128270626068, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 124820 + }, + { + "epoch": 0.4751337895754512, + "grad_norm": 0.12276052683591843, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 124830 + }, + { + "epoch": 0.47517185204357393, + "grad_norm": 0.13106083869934082, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 124840 + }, + { + "epoch": 0.4752099145116966, + "grad_norm": 0.1291654109954834, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 124850 + }, + { + "epoch": 0.4752479769798193, + "grad_norm": 0.1254255324602127, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 124860 + }, + { + "epoch": 0.47528603944794195, + "grad_norm": 0.12424609810113907, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 124870 + }, + { + "epoch": 0.47532410191606467, + "grad_norm": 0.1316959261894226, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 124880 + }, + { + "epoch": 0.4753621643841873, + "grad_norm": 0.13111689686775208, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 124890 + }, + { + "epoch": 0.47540022685231004, + "grad_norm": 0.11223283410072327, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 124900 + }, + { + "epoch": 0.4754382893204327, + "grad_norm": 0.1185784563422203, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 124910 + }, + { + "epoch": 0.47547635178855535, + "grad_norm": 0.1251397579908371, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 124920 + }, + { + "epoch": 0.47551441425667806, + "grad_norm": 0.13898153603076935, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 124930 + }, + { + "epoch": 0.4755524767248007, + "grad_norm": 0.12897156178951263, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 124940 + }, + { + "epoch": 0.47559053919292343, + "grad_norm": 0.1365572065114975, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 124950 + }, + { + "epoch": 0.4756286016610461, + "grad_norm": 0.12999938428401947, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 124960 + }, + { + "epoch": 0.4756666641291688, + "grad_norm": 0.14317527413368225, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 124970 + }, + { + "epoch": 0.47570472659729146, + "grad_norm": 0.13054832816123962, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 124980 + }, + { + "epoch": 0.4757427890654142, + "grad_norm": 0.12040127068758011, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 124990 + }, + { + "epoch": 0.47578085153353683, + "grad_norm": 0.15106411278247833, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 125000 + }, + { + "epoch": 0.47581891400165954, + "grad_norm": 0.12881748378276825, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 125010 + }, + { + "epoch": 0.4758569764697822, + "grad_norm": 0.1211240142583847, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 125020 + }, + { + "epoch": 0.4758950389379049, + "grad_norm": 0.1255410760641098, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 125030 + }, + { + "epoch": 0.47593310140602757, + "grad_norm": 0.13183090090751648, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 125040 + }, + { + "epoch": 0.4759711638741503, + "grad_norm": 0.4035550653934479, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 125050 + }, + { + "epoch": 0.47600922634227294, + "grad_norm": 0.12410563975572586, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 125060 + }, + { + "epoch": 0.4760472888103956, + "grad_norm": 0.12635254859924316, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 125070 + }, + { + "epoch": 0.4760853512785183, + "grad_norm": 0.13341419398784637, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 125080 + }, + { + "epoch": 0.47612341374664097, + "grad_norm": 0.13298030197620392, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 125090 + }, + { + "epoch": 0.4761614762147637, + "grad_norm": 0.11619975417852402, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 125100 + }, + { + "epoch": 0.47619953868288634, + "grad_norm": 0.12509651482105255, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 125110 + }, + { + "epoch": 0.47623760115100905, + "grad_norm": 0.1121395081281662, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 125120 + }, + { + "epoch": 0.4762756636191317, + "grad_norm": 0.12973745167255402, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 125130 + }, + { + "epoch": 0.4763137260872544, + "grad_norm": 0.13184334337711334, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 125140 + }, + { + "epoch": 0.4763517885553771, + "grad_norm": 0.11461430788040161, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 125150 + }, + { + "epoch": 0.4763898510234998, + "grad_norm": 0.13077722489833832, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 125160 + }, + { + "epoch": 0.47642791349162245, + "grad_norm": 0.14497019350528717, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 125170 + }, + { + "epoch": 0.47646597595974516, + "grad_norm": 0.12146943807601929, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 125180 + }, + { + "epoch": 0.4765040384278678, + "grad_norm": 0.11859641224145889, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 125190 + }, + { + "epoch": 0.4765421008959905, + "grad_norm": 0.13055294752120972, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 125200 + }, + { + "epoch": 0.4765801633641132, + "grad_norm": 0.1231132373213768, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 125210 + }, + { + "epoch": 0.47661822583223584, + "grad_norm": 0.1298208385705948, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 125220 + }, + { + "epoch": 0.47665628830035855, + "grad_norm": 0.11626437306404114, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 125230 + }, + { + "epoch": 0.4766943507684812, + "grad_norm": 0.13338100910186768, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 125240 + }, + { + "epoch": 0.4767324132366039, + "grad_norm": 0.14445482194423676, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 125250 + }, + { + "epoch": 0.4767704757047266, + "grad_norm": 0.12296000868082047, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 125260 + }, + { + "epoch": 0.4768085381728493, + "grad_norm": 0.11614549905061722, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 125270 + }, + { + "epoch": 0.47684660064097195, + "grad_norm": 0.11743879318237305, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 125280 + }, + { + "epoch": 0.47688466310909466, + "grad_norm": 0.12904691696166992, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 125290 + }, + { + "epoch": 0.4769227255772173, + "grad_norm": 0.13546623289585114, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 125300 + }, + { + "epoch": 0.47696078804534003, + "grad_norm": 0.13174700736999512, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 125310 + }, + { + "epoch": 0.4769988505134627, + "grad_norm": 0.128945454955101, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 125320 + }, + { + "epoch": 0.4770369129815854, + "grad_norm": 0.1436336636543274, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 125330 + }, + { + "epoch": 0.47707497544970806, + "grad_norm": 0.12894588708877563, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 125340 + }, + { + "epoch": 0.4771130379178307, + "grad_norm": 0.1292628049850464, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 125350 + }, + { + "epoch": 0.47715110038595343, + "grad_norm": 0.12030140310525894, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 125360 + }, + { + "epoch": 0.4771891628540761, + "grad_norm": 0.12619850039482117, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 125370 + }, + { + "epoch": 0.4772272253221988, + "grad_norm": 0.3717951774597168, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 125380 + }, + { + "epoch": 0.47726528779032146, + "grad_norm": 0.13587363064289093, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 125390 + }, + { + "epoch": 0.47730335025844417, + "grad_norm": 0.15112850069999695, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 125400 + }, + { + "epoch": 0.4773414127265668, + "grad_norm": 0.12029213458299637, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 125410 + }, + { + "epoch": 0.47737947519468954, + "grad_norm": 0.1370517760515213, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 125420 + }, + { + "epoch": 0.4774175376628122, + "grad_norm": 0.12078516185283661, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 125430 + }, + { + "epoch": 0.4774556001309349, + "grad_norm": 0.117357537150383, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 125440 + }, + { + "epoch": 0.47749366259905757, + "grad_norm": 0.11920157074928284, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 125450 + }, + { + "epoch": 0.4775317250671803, + "grad_norm": 0.1200534850358963, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 125460 + }, + { + "epoch": 0.47756978753530294, + "grad_norm": 0.13385429978370667, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 125470 + }, + { + "epoch": 0.47760785000342565, + "grad_norm": 0.11925206333398819, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 125480 + }, + { + "epoch": 0.4776459124715483, + "grad_norm": 0.13422226905822754, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 125490 + }, + { + "epoch": 0.47768397493967096, + "grad_norm": 0.12674109637737274, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 125500 + }, + { + "epoch": 0.4777220374077937, + "grad_norm": 0.1292978972196579, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 125510 + }, + { + "epoch": 0.47776009987591633, + "grad_norm": 0.14307229220867157, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 125520 + }, + { + "epoch": 0.47779816234403905, + "grad_norm": 0.13327215611934662, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 125530 + }, + { + "epoch": 0.4778362248121617, + "grad_norm": 0.13076509535312653, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 125540 + }, + { + "epoch": 0.4778742872802844, + "grad_norm": 0.11930997669696808, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 125550 + }, + { + "epoch": 0.4779123497484071, + "grad_norm": 0.12345302850008011, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 125560 + }, + { + "epoch": 0.4779504122165298, + "grad_norm": 0.12322073429822922, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 125570 + }, + { + "epoch": 0.47798847468465244, + "grad_norm": 0.12302183359861374, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 125580 + }, + { + "epoch": 0.47802653715277516, + "grad_norm": 0.1490904539823532, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 125590 + }, + { + "epoch": 0.4780645996208978, + "grad_norm": 0.13050857186317444, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 125600 + }, + { + "epoch": 0.4781026620890205, + "grad_norm": 0.13035564124584198, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 125610 + }, + { + "epoch": 0.4781407245571432, + "grad_norm": 0.13813672959804535, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 125620 + }, + { + "epoch": 0.47817878702526584, + "grad_norm": 0.1336815059185028, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 125630 + }, + { + "epoch": 0.47821684949338855, + "grad_norm": 0.12296368181705475, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 125640 + }, + { + "epoch": 0.4782549119615112, + "grad_norm": 0.12488293647766113, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 125650 + }, + { + "epoch": 0.4782929744296339, + "grad_norm": 0.13063012063503265, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 125660 + }, + { + "epoch": 0.4783310368977566, + "grad_norm": 0.1249491274356842, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 125670 + }, + { + "epoch": 0.4783690993658793, + "grad_norm": 0.2116761952638626, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 125680 + }, + { + "epoch": 0.47840716183400195, + "grad_norm": 0.12892261147499084, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 125690 + }, + { + "epoch": 0.47844522430212466, + "grad_norm": 0.13267821073532104, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 125700 + }, + { + "epoch": 0.4784832867702473, + "grad_norm": 0.1281917840242386, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 125710 + }, + { + "epoch": 0.47852134923837003, + "grad_norm": 0.12860079109668732, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 125720 + }, + { + "epoch": 0.4785594117064927, + "grad_norm": 0.13791748881340027, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 125730 + }, + { + "epoch": 0.4785974741746154, + "grad_norm": 0.12474928796291351, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 125740 + }, + { + "epoch": 0.47863553664273806, + "grad_norm": 0.1367192417383194, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 125750 + }, + { + "epoch": 0.47867359911086077, + "grad_norm": 0.12175612151622772, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 125760 + }, + { + "epoch": 0.4787116615789834, + "grad_norm": 0.12259689718484879, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 125770 + }, + { + "epoch": 0.4787497240471061, + "grad_norm": 0.12758487462997437, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 125780 + }, + { + "epoch": 0.4787877865152288, + "grad_norm": 0.13025754690170288, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 125790 + }, + { + "epoch": 0.47882584898335145, + "grad_norm": 0.13321420550346375, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 125800 + }, + { + "epoch": 0.47886391145147417, + "grad_norm": 0.12373219430446625, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 125810 + }, + { + "epoch": 0.4789019739195968, + "grad_norm": 0.14295180141925812, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 125820 + }, + { + "epoch": 0.47894003638771954, + "grad_norm": 0.12274914234876633, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 125830 + }, + { + "epoch": 0.4789780988558422, + "grad_norm": 0.12903110682964325, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 125840 + }, + { + "epoch": 0.4790161613239649, + "grad_norm": 0.1290123164653778, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 125850 + }, + { + "epoch": 0.47905422379208756, + "grad_norm": 0.12053056061267853, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 125860 + }, + { + "epoch": 0.4790922862602103, + "grad_norm": 0.13491888344287872, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 125870 + }, + { + "epoch": 0.47913034872833293, + "grad_norm": 0.12733201682567596, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 125880 + }, + { + "epoch": 0.47916841119645565, + "grad_norm": 0.12354981154203415, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 125890 + }, + { + "epoch": 0.4792064736645783, + "grad_norm": 0.12990646064281464, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 125900 + }, + { + "epoch": 0.479244536132701, + "grad_norm": 0.11663618683815002, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 125910 + }, + { + "epoch": 0.4792825986008237, + "grad_norm": 0.1272948831319809, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 125920 + }, + { + "epoch": 0.47932066106894633, + "grad_norm": 0.12534311413764954, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 125930 + }, + { + "epoch": 0.47935872353706904, + "grad_norm": 0.1488637626171112, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 125940 + }, + { + "epoch": 0.4793967860051917, + "grad_norm": 0.13527126610279083, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 125950 + }, + { + "epoch": 0.4794348484733144, + "grad_norm": 0.1611463725566864, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 125960 + }, + { + "epoch": 0.47947291094143707, + "grad_norm": 0.11967337131500244, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 125970 + }, + { + "epoch": 0.4795109734095598, + "grad_norm": 0.13560950756072998, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 125980 + }, + { + "epoch": 0.47954903587768244, + "grad_norm": 0.1264272928237915, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 125990 + }, + { + "epoch": 0.47958709834580515, + "grad_norm": 0.1372808814048767, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 126000 + }, + { + "epoch": 0.4796251608139278, + "grad_norm": 0.13430854678153992, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 126010 + }, + { + "epoch": 0.4796632232820505, + "grad_norm": 0.14079371094703674, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 126020 + }, + { + "epoch": 0.4797012857501732, + "grad_norm": 0.13879041373729706, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 126030 + }, + { + "epoch": 0.4797393482182959, + "grad_norm": 0.12993349134922028, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 126040 + }, + { + "epoch": 0.47977741068641855, + "grad_norm": 0.11303086578845978, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 126050 + }, + { + "epoch": 0.4798154731545412, + "grad_norm": 0.11333756148815155, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 126060 + }, + { + "epoch": 0.4798535356226639, + "grad_norm": 0.11820187419652939, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 126070 + }, + { + "epoch": 0.4798915980907866, + "grad_norm": 0.14163796603679657, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 126080 + }, + { + "epoch": 0.4799296605589093, + "grad_norm": 0.13567952811717987, + "learning_rate": 0.0005, + "loss": 2.1366, + "step": 126090 + }, + { + "epoch": 0.47996772302703195, + "grad_norm": 0.12436114996671677, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 126100 + }, + { + "epoch": 0.48000578549515466, + "grad_norm": 0.12871521711349487, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 126110 + }, + { + "epoch": 0.4800438479632773, + "grad_norm": 0.12559446692466736, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 126120 + }, + { + "epoch": 0.4800819104314, + "grad_norm": 0.11897538602352142, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 126130 + }, + { + "epoch": 0.4801199728995227, + "grad_norm": 0.13987573981285095, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 126140 + }, + { + "epoch": 0.4801580353676454, + "grad_norm": 0.13216280937194824, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 126150 + }, + { + "epoch": 0.48019609783576805, + "grad_norm": 0.11237651854753494, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 126160 + }, + { + "epoch": 0.48023416030389077, + "grad_norm": 0.12070129811763763, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 126170 + }, + { + "epoch": 0.4802722227720134, + "grad_norm": 0.12533554434776306, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 126180 + }, + { + "epoch": 0.48031028524013614, + "grad_norm": 0.12822876870632172, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 126190 + }, + { + "epoch": 0.4803483477082588, + "grad_norm": 0.12372897565364838, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 126200 + }, + { + "epoch": 0.48038641017638145, + "grad_norm": 0.1299714893102646, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 126210 + }, + { + "epoch": 0.48042447264450416, + "grad_norm": 0.14231672883033752, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 126220 + }, + { + "epoch": 0.4804625351126268, + "grad_norm": 0.12293112277984619, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 126230 + }, + { + "epoch": 0.48050059758074953, + "grad_norm": 0.12258761376142502, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 126240 + }, + { + "epoch": 0.4805386600488722, + "grad_norm": 0.12533219158649445, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 126250 + }, + { + "epoch": 0.4805767225169949, + "grad_norm": 0.30654239654541016, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 126260 + }, + { + "epoch": 0.48061478498511756, + "grad_norm": 0.13637426495552063, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 126270 + }, + { + "epoch": 0.4806528474532403, + "grad_norm": 0.13394691050052643, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 126280 + }, + { + "epoch": 0.48069090992136293, + "grad_norm": 0.12441105395555496, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 126290 + }, + { + "epoch": 0.48072897238948564, + "grad_norm": 0.11978069692850113, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 126300 + }, + { + "epoch": 0.4807670348576083, + "grad_norm": 0.14189858734607697, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 126310 + }, + { + "epoch": 0.480805097325731, + "grad_norm": 0.1284215748310089, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 126320 + }, + { + "epoch": 0.48084315979385367, + "grad_norm": 0.13876661658287048, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 126330 + }, + { + "epoch": 0.4808812222619764, + "grad_norm": 0.1364854872226715, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 126340 + }, + { + "epoch": 0.48091928473009904, + "grad_norm": 0.13453872501850128, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 126350 + }, + { + "epoch": 0.4809573471982217, + "grad_norm": 0.12328676879405975, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 126360 + }, + { + "epoch": 0.4809954096663444, + "grad_norm": 0.12543216347694397, + "learning_rate": 0.0005, + "loss": 2.1337, + "step": 126370 + }, + { + "epoch": 0.48103347213446707, + "grad_norm": 0.1285954713821411, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 126380 + }, + { + "epoch": 0.4810715346025898, + "grad_norm": 0.13536769151687622, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 126390 + }, + { + "epoch": 0.48110959707071244, + "grad_norm": 0.12796925008296967, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 126400 + }, + { + "epoch": 0.48114765953883515, + "grad_norm": 0.13553667068481445, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 126410 + }, + { + "epoch": 0.4811857220069578, + "grad_norm": 0.12393445521593094, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 126420 + }, + { + "epoch": 0.4812237844750805, + "grad_norm": 0.138578861951828, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 126430 + }, + { + "epoch": 0.4812618469432032, + "grad_norm": 0.13088813424110413, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 126440 + }, + { + "epoch": 0.4812999094113259, + "grad_norm": 0.12858811020851135, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 126450 + }, + { + "epoch": 0.48133797187944855, + "grad_norm": 0.11651439219713211, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 126460 + }, + { + "epoch": 0.48137603434757126, + "grad_norm": 0.1188209056854248, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 126470 + }, + { + "epoch": 0.4814140968156939, + "grad_norm": 0.119675412774086, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 126480 + }, + { + "epoch": 0.4814521592838166, + "grad_norm": 0.13475286960601807, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 126490 + }, + { + "epoch": 0.4814902217519393, + "grad_norm": 0.12447642534971237, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 126500 + }, + { + "epoch": 0.48152828422006194, + "grad_norm": 0.12670594453811646, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 126510 + }, + { + "epoch": 0.48156634668818465, + "grad_norm": 0.12636840343475342, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 126520 + }, + { + "epoch": 0.4816044091563073, + "grad_norm": 0.13792423903942108, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 126530 + }, + { + "epoch": 0.48164247162443, + "grad_norm": 0.13417412340641022, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 126540 + }, + { + "epoch": 0.4816805340925527, + "grad_norm": 0.14853769540786743, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 126550 + }, + { + "epoch": 0.4817185965606754, + "grad_norm": 0.12657450139522552, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 126560 + }, + { + "epoch": 0.48175665902879805, + "grad_norm": 0.16439884901046753, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 126570 + }, + { + "epoch": 0.48179472149692076, + "grad_norm": 0.1257026195526123, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 126580 + }, + { + "epoch": 0.4818327839650434, + "grad_norm": 0.12009840458631516, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 126590 + }, + { + "epoch": 0.48187084643316613, + "grad_norm": 0.12126778811216354, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 126600 + }, + { + "epoch": 0.4819089089012888, + "grad_norm": 0.13086417317390442, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 126610 + }, + { + "epoch": 0.4819469713694115, + "grad_norm": 0.13769873976707458, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 126620 + }, + { + "epoch": 0.48198503383753416, + "grad_norm": 0.14477433264255524, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 126630 + }, + { + "epoch": 0.4820230963056568, + "grad_norm": 0.12043140828609467, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 126640 + }, + { + "epoch": 0.48206115877377953, + "grad_norm": 0.12772685289382935, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 126650 + }, + { + "epoch": 0.4820992212419022, + "grad_norm": 0.12375368177890778, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 126660 + }, + { + "epoch": 0.4821372837100249, + "grad_norm": 0.12025720626115799, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 126670 + }, + { + "epoch": 0.48217534617814756, + "grad_norm": 0.1275482028722763, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 126680 + }, + { + "epoch": 0.48221340864627027, + "grad_norm": 0.12488540261983871, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 126690 + }, + { + "epoch": 0.4822514711143929, + "grad_norm": 0.12372930347919464, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 126700 + }, + { + "epoch": 0.48228953358251564, + "grad_norm": 0.13978280127048492, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 126710 + }, + { + "epoch": 0.4823275960506383, + "grad_norm": 0.13022901117801666, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 126720 + }, + { + "epoch": 0.482365658518761, + "grad_norm": 0.12196984887123108, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 126730 + }, + { + "epoch": 0.48240372098688367, + "grad_norm": 0.127203106880188, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 126740 + }, + { + "epoch": 0.4824417834550064, + "grad_norm": 0.1250368058681488, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 126750 + }, + { + "epoch": 0.48247984592312904, + "grad_norm": 0.11303237825632095, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 126760 + }, + { + "epoch": 0.48251790839125175, + "grad_norm": 0.12539029121398926, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 126770 + }, + { + "epoch": 0.4825559708593744, + "grad_norm": 0.12309008091688156, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 126780 + }, + { + "epoch": 0.48259403332749706, + "grad_norm": 0.1341366022825241, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 126790 + }, + { + "epoch": 0.4826320957956198, + "grad_norm": 0.12358230352401733, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 126800 + }, + { + "epoch": 0.48267015826374243, + "grad_norm": 0.11779564619064331, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 126810 + }, + { + "epoch": 0.48270822073186515, + "grad_norm": 0.13021689653396606, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 126820 + }, + { + "epoch": 0.4827462831999878, + "grad_norm": 0.12274055927991867, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 126830 + }, + { + "epoch": 0.4827843456681105, + "grad_norm": 0.12138562649488449, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 126840 + }, + { + "epoch": 0.48282240813623317, + "grad_norm": 0.11908987909555435, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 126850 + }, + { + "epoch": 0.4828604706043559, + "grad_norm": 0.12997077405452728, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 126860 + }, + { + "epoch": 0.48289853307247854, + "grad_norm": 0.11264729499816895, + "learning_rate": 0.0005, + "loss": 2.1336, + "step": 126870 + }, + { + "epoch": 0.48293659554060125, + "grad_norm": 0.12156654894351959, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 126880 + }, + { + "epoch": 0.4829746580087239, + "grad_norm": 0.12737615406513214, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 126890 + }, + { + "epoch": 0.4830127204768466, + "grad_norm": 0.11980323493480682, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 126900 + }, + { + "epoch": 0.4830507829449693, + "grad_norm": 0.13023775815963745, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 126910 + }, + { + "epoch": 0.483088845413092, + "grad_norm": 0.12090695649385452, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 126920 + }, + { + "epoch": 0.48312690788121465, + "grad_norm": 0.13404586911201477, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 126930 + }, + { + "epoch": 0.4831649703493373, + "grad_norm": 0.12173371016979218, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 126940 + }, + { + "epoch": 0.48320303281746, + "grad_norm": 0.13910695910453796, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 126950 + }, + { + "epoch": 0.4832410952855827, + "grad_norm": 0.12336437404155731, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 126960 + }, + { + "epoch": 0.4832791577537054, + "grad_norm": 0.12722080945968628, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 126970 + }, + { + "epoch": 0.48331722022182805, + "grad_norm": 0.11827809363603592, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 126980 + }, + { + "epoch": 0.48335528268995076, + "grad_norm": 0.13443544507026672, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 126990 + }, + { + "epoch": 0.4833933451580734, + "grad_norm": 0.11773506551980972, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 127000 + }, + { + "epoch": 0.48343140762619613, + "grad_norm": 0.13921083509922028, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 127010 + }, + { + "epoch": 0.4834694700943188, + "grad_norm": 0.1361696422100067, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 127020 + }, + { + "epoch": 0.4835075325624415, + "grad_norm": 0.15075117349624634, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 127030 + }, + { + "epoch": 0.48354559503056416, + "grad_norm": 0.11024720221757889, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 127040 + }, + { + "epoch": 0.48358365749868687, + "grad_norm": 0.11815572530031204, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 127050 + }, + { + "epoch": 0.4836217199668095, + "grad_norm": 0.13710758090019226, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 127060 + }, + { + "epoch": 0.4836597824349322, + "grad_norm": 0.13314233720302582, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 127070 + }, + { + "epoch": 0.4836978449030549, + "grad_norm": 0.12891244888305664, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 127080 + }, + { + "epoch": 0.48373590737117755, + "grad_norm": 0.14799803495407104, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 127090 + }, + { + "epoch": 0.48377396983930027, + "grad_norm": 0.1257518082857132, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 127100 + }, + { + "epoch": 0.4838120323074229, + "grad_norm": 0.12361308932304382, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 127110 + }, + { + "epoch": 0.48385009477554564, + "grad_norm": 0.1231234073638916, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 127120 + }, + { + "epoch": 0.4838881572436683, + "grad_norm": 0.1312589794397354, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 127130 + }, + { + "epoch": 0.483926219711791, + "grad_norm": 0.13112683594226837, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 127140 + }, + { + "epoch": 0.48396428217991366, + "grad_norm": 0.11283516138792038, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 127150 + }, + { + "epoch": 0.4840023446480364, + "grad_norm": 0.12356321513652802, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 127160 + }, + { + "epoch": 0.48404040711615903, + "grad_norm": 0.12498544901609421, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 127170 + }, + { + "epoch": 0.48407846958428175, + "grad_norm": 0.12278742343187332, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 127180 + }, + { + "epoch": 0.4841165320524044, + "grad_norm": 0.12722855806350708, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 127190 + }, + { + "epoch": 0.4841545945205271, + "grad_norm": 0.11609535664319992, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 127200 + }, + { + "epoch": 0.48419265698864977, + "grad_norm": 0.1401669681072235, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 127210 + }, + { + "epoch": 0.48423071945677243, + "grad_norm": 0.13876022398471832, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 127220 + }, + { + "epoch": 0.48426878192489514, + "grad_norm": 0.13365916907787323, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 127230 + }, + { + "epoch": 0.4843068443930178, + "grad_norm": 0.12094181030988693, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 127240 + }, + { + "epoch": 0.4843449068611405, + "grad_norm": 0.1237097904086113, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 127250 + }, + { + "epoch": 0.48438296932926317, + "grad_norm": 0.1223485916852951, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 127260 + }, + { + "epoch": 0.4844210317973859, + "grad_norm": 0.12210199981927872, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 127270 + }, + { + "epoch": 0.48445909426550854, + "grad_norm": 0.12790727615356445, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 127280 + }, + { + "epoch": 0.48449715673363125, + "grad_norm": 0.11236969381570816, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 127290 + }, + { + "epoch": 0.4845352192017539, + "grad_norm": 0.12637045979499817, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 127300 + }, + { + "epoch": 0.4845732816698766, + "grad_norm": 0.12643814086914062, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 127310 + }, + { + "epoch": 0.4846113441379993, + "grad_norm": 0.11725429445505142, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 127320 + }, + { + "epoch": 0.484649406606122, + "grad_norm": 0.1281515210866928, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 127330 + }, + { + "epoch": 0.48468746907424465, + "grad_norm": 0.15110932290554047, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 127340 + }, + { + "epoch": 0.48472553154236736, + "grad_norm": 0.12391675263643265, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 127350 + }, + { + "epoch": 0.48476359401049, + "grad_norm": 0.13406608998775482, + "learning_rate": 0.0005, + "loss": 2.1367, + "step": 127360 + }, + { + "epoch": 0.4848016564786127, + "grad_norm": 0.11705932766199112, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 127370 + }, + { + "epoch": 0.4848397189467354, + "grad_norm": 0.1269107609987259, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 127380 + }, + { + "epoch": 0.48487778141485804, + "grad_norm": 0.11953911930322647, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 127390 + }, + { + "epoch": 0.48491584388298076, + "grad_norm": 0.12773080170154572, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 127400 + }, + { + "epoch": 0.4849539063511034, + "grad_norm": 0.12607593834400177, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 127410 + }, + { + "epoch": 0.4849919688192261, + "grad_norm": 0.1383269876241684, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 127420 + }, + { + "epoch": 0.4850300312873488, + "grad_norm": 0.13885319232940674, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 127430 + }, + { + "epoch": 0.4850680937554715, + "grad_norm": 0.12093721330165863, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 127440 + }, + { + "epoch": 0.48510615622359415, + "grad_norm": 0.11831363290548325, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 127450 + }, + { + "epoch": 0.48514421869171687, + "grad_norm": 0.11542569100856781, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 127460 + }, + { + "epoch": 0.4851822811598395, + "grad_norm": 0.12869244813919067, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 127470 + }, + { + "epoch": 0.48522034362796224, + "grad_norm": 0.1293071061372757, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 127480 + }, + { + "epoch": 0.4852584060960849, + "grad_norm": 0.13619841635227203, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 127490 + }, + { + "epoch": 0.48529646856420755, + "grad_norm": 0.12476862967014313, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 127500 + }, + { + "epoch": 0.48533453103233026, + "grad_norm": 0.12852947413921356, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 127510 + }, + { + "epoch": 0.4853725935004529, + "grad_norm": 0.11751388758420944, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 127520 + }, + { + "epoch": 0.48541065596857563, + "grad_norm": 0.12054309993982315, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 127530 + }, + { + "epoch": 0.4854487184366983, + "grad_norm": 0.13147751986980438, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 127540 + }, + { + "epoch": 0.485486780904821, + "grad_norm": 0.13924241065979004, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 127550 + }, + { + "epoch": 0.48552484337294366, + "grad_norm": 0.12083495408296585, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 127560 + }, + { + "epoch": 0.4855629058410664, + "grad_norm": 0.12078743427991867, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 127570 + }, + { + "epoch": 0.48560096830918903, + "grad_norm": 0.13089805841445923, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 127580 + }, + { + "epoch": 0.48563903077731174, + "grad_norm": 0.14108452200889587, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 127590 + }, + { + "epoch": 0.4856770932454344, + "grad_norm": 0.1272251456975937, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 127600 + }, + { + "epoch": 0.4857151557135571, + "grad_norm": 0.12551584839820862, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 127610 + }, + { + "epoch": 0.48575321818167977, + "grad_norm": 0.12183475494384766, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 127620 + }, + { + "epoch": 0.4857912806498025, + "grad_norm": 0.13344140350818634, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 127630 + }, + { + "epoch": 0.48582934311792514, + "grad_norm": 0.12324704229831696, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 127640 + }, + { + "epoch": 0.4858674055860478, + "grad_norm": 0.11987955123186111, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 127650 + }, + { + "epoch": 0.4859054680541705, + "grad_norm": 0.11302775144577026, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 127660 + }, + { + "epoch": 0.48594353052229317, + "grad_norm": 0.13847026228904724, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 127670 + }, + { + "epoch": 0.4859815929904159, + "grad_norm": 0.1276199072599411, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 127680 + }, + { + "epoch": 0.48601965545853854, + "grad_norm": 0.14437644183635712, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 127690 + }, + { + "epoch": 0.48605771792666125, + "grad_norm": 0.12203813344240189, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 127700 + }, + { + "epoch": 0.4860957803947839, + "grad_norm": 0.135308176279068, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 127710 + }, + { + "epoch": 0.4861338428629066, + "grad_norm": 0.11932894587516785, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 127720 + }, + { + "epoch": 0.4861719053310293, + "grad_norm": 0.12615463137626648, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 127730 + }, + { + "epoch": 0.486209967799152, + "grad_norm": 0.12288632243871689, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 127740 + }, + { + "epoch": 0.48624803026727464, + "grad_norm": 0.12118787318468094, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 127750 + }, + { + "epoch": 0.48628609273539736, + "grad_norm": 0.12299742549657822, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 127760 + }, + { + "epoch": 0.48632415520352, + "grad_norm": 0.11897924542427063, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 127770 + }, + { + "epoch": 0.4863622176716427, + "grad_norm": 0.1315123736858368, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 127780 + }, + { + "epoch": 0.4864002801397654, + "grad_norm": 0.1309213489294052, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 127790 + }, + { + "epoch": 0.48643834260788804, + "grad_norm": 0.13837243616580963, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 127800 + }, + { + "epoch": 0.48647640507601075, + "grad_norm": 0.13022461533546448, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 127810 + }, + { + "epoch": 0.4865144675441334, + "grad_norm": 0.1236427053809166, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 127820 + }, + { + "epoch": 0.4865525300122561, + "grad_norm": 0.11672331392765045, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 127830 + }, + { + "epoch": 0.4865905924803788, + "grad_norm": 0.13600587844848633, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 127840 + }, + { + "epoch": 0.4866286549485015, + "grad_norm": 0.13283072412014008, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 127850 + }, + { + "epoch": 0.48666671741662415, + "grad_norm": 0.12012767791748047, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 127860 + }, + { + "epoch": 0.48670477988474686, + "grad_norm": 0.12542325258255005, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 127870 + }, + { + "epoch": 0.4867428423528695, + "grad_norm": 0.13662007451057434, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 127880 + }, + { + "epoch": 0.48678090482099223, + "grad_norm": 0.12781007587909698, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 127890 + }, + { + "epoch": 0.4868189672891149, + "grad_norm": 0.14764262735843658, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 127900 + }, + { + "epoch": 0.4868570297572376, + "grad_norm": 0.11196591705083847, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 127910 + }, + { + "epoch": 0.48689509222536026, + "grad_norm": 0.11736451089382172, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 127920 + }, + { + "epoch": 0.4869331546934829, + "grad_norm": 0.1281765252351761, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 127930 + }, + { + "epoch": 0.48697121716160563, + "grad_norm": 0.12472829222679138, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 127940 + }, + { + "epoch": 0.4870092796297283, + "grad_norm": 0.12662485241889954, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 127950 + }, + { + "epoch": 0.487047342097851, + "grad_norm": 0.1277189701795578, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 127960 + }, + { + "epoch": 0.48708540456597366, + "grad_norm": 0.14531460404396057, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 127970 + }, + { + "epoch": 0.48712346703409637, + "grad_norm": 0.15048803389072418, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 127980 + }, + { + "epoch": 0.487161529502219, + "grad_norm": 0.12176571786403656, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 127990 + }, + { + "epoch": 0.48719959197034174, + "grad_norm": 0.12032541632652283, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 128000 + }, + { + "epoch": 0.4872376544384644, + "grad_norm": 0.1318209022283554, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 128010 + }, + { + "epoch": 0.4872757169065871, + "grad_norm": 0.13124960660934448, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 128020 + }, + { + "epoch": 0.48731377937470977, + "grad_norm": 0.12284164130687714, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 128030 + }, + { + "epoch": 0.4873518418428325, + "grad_norm": 0.12062957137823105, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 128040 + }, + { + "epoch": 0.48738990431095514, + "grad_norm": 0.1256282776594162, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 128050 + }, + { + "epoch": 0.48742796677907785, + "grad_norm": 0.12683241069316864, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 128060 + }, + { + "epoch": 0.4874660292472005, + "grad_norm": 0.11736001819372177, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 128070 + }, + { + "epoch": 0.48750409171532316, + "grad_norm": 0.11980387568473816, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 128080 + }, + { + "epoch": 0.4875421541834459, + "grad_norm": 0.13046181201934814, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 128090 + }, + { + "epoch": 0.48758021665156853, + "grad_norm": 0.1274123191833496, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 128100 + }, + { + "epoch": 0.48761827911969124, + "grad_norm": 0.11976911127567291, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 128110 + }, + { + "epoch": 0.4876563415878139, + "grad_norm": 0.12375515699386597, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 128120 + }, + { + "epoch": 0.4876944040559366, + "grad_norm": 0.12907162308692932, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 128130 + }, + { + "epoch": 0.48773246652405927, + "grad_norm": 0.13195452094078064, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 128140 + }, + { + "epoch": 0.487770528992182, + "grad_norm": 0.13561616837978363, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 128150 + }, + { + "epoch": 0.48780859146030464, + "grad_norm": 0.12271276116371155, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 128160 + }, + { + "epoch": 0.48784665392842735, + "grad_norm": 0.13250857591629028, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 128170 + }, + { + "epoch": 0.48788471639655, + "grad_norm": 0.1365223228931427, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 128180 + }, + { + "epoch": 0.4879227788646727, + "grad_norm": 0.127644881606102, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 128190 + }, + { + "epoch": 0.4879608413327954, + "grad_norm": 0.11915410310029984, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 128200 + }, + { + "epoch": 0.4879989038009181, + "grad_norm": 0.12355806678533554, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 128210 + }, + { + "epoch": 0.48803696626904075, + "grad_norm": 0.15161676704883575, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 128220 + }, + { + "epoch": 0.4880750287371634, + "grad_norm": 0.12143923342227936, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 128230 + }, + { + "epoch": 0.4881130912052861, + "grad_norm": 0.13301809132099152, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 128240 + }, + { + "epoch": 0.4881511536734088, + "grad_norm": 0.12631773948669434, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 128250 + }, + { + "epoch": 0.4881892161415315, + "grad_norm": 0.1465943306684494, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 128260 + }, + { + "epoch": 0.48822727860965415, + "grad_norm": 0.12848037481307983, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 128270 + }, + { + "epoch": 0.48826534107777686, + "grad_norm": 0.11996082216501236, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 128280 + }, + { + "epoch": 0.4883034035458995, + "grad_norm": 0.1305745393037796, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 128290 + }, + { + "epoch": 0.48834146601402223, + "grad_norm": 0.12072660028934479, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 128300 + }, + { + "epoch": 0.4883795284821449, + "grad_norm": 0.11993353813886642, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 128310 + }, + { + "epoch": 0.4884175909502676, + "grad_norm": 0.12914374470710754, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 128320 + }, + { + "epoch": 0.48845565341839026, + "grad_norm": 0.12362520396709442, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 128330 + }, + { + "epoch": 0.48849371588651297, + "grad_norm": 0.12493337690830231, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 128340 + }, + { + "epoch": 0.4885317783546356, + "grad_norm": 0.12441351264715195, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 128350 + }, + { + "epoch": 0.4885698408227583, + "grad_norm": 0.12130783498287201, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 128360 + }, + { + "epoch": 0.488607903290881, + "grad_norm": 0.11701652407646179, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 128370 + }, + { + "epoch": 0.48864596575900365, + "grad_norm": 0.11885587126016617, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 128380 + }, + { + "epoch": 0.48868402822712637, + "grad_norm": 0.14552637934684753, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 128390 + }, + { + "epoch": 0.488722090695249, + "grad_norm": 0.12319986522197723, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 128400 + }, + { + "epoch": 0.48876015316337174, + "grad_norm": 0.12676241993904114, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 128410 + }, + { + "epoch": 0.4887982156314944, + "grad_norm": 0.12602174282073975, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 128420 + }, + { + "epoch": 0.4888362780996171, + "grad_norm": 0.12732820212841034, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 128430 + }, + { + "epoch": 0.48887434056773976, + "grad_norm": 0.13058944046497345, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 128440 + }, + { + "epoch": 0.4889124030358625, + "grad_norm": 0.12957490980625153, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 128450 + }, + { + "epoch": 0.48895046550398513, + "grad_norm": 0.1237308606505394, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 128460 + }, + { + "epoch": 0.48898852797210784, + "grad_norm": 0.13554799556732178, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 128470 + }, + { + "epoch": 0.4890265904402305, + "grad_norm": 0.11221189796924591, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 128480 + }, + { + "epoch": 0.4890646529083532, + "grad_norm": 0.1298692524433136, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 128490 + }, + { + "epoch": 0.48910271537647587, + "grad_norm": 0.12430565059185028, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 128500 + }, + { + "epoch": 0.48914077784459853, + "grad_norm": 0.15557394921779633, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 128510 + }, + { + "epoch": 0.48917884031272124, + "grad_norm": 0.11919406801462173, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 128520 + }, + { + "epoch": 0.4892169027808439, + "grad_norm": 0.1355024129152298, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 128530 + }, + { + "epoch": 0.4892549652489666, + "grad_norm": 0.11714503169059753, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 128540 + }, + { + "epoch": 0.48929302771708927, + "grad_norm": 0.13108661770820618, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 128550 + }, + { + "epoch": 0.489331090185212, + "grad_norm": 0.13541683554649353, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 128560 + }, + { + "epoch": 0.48936915265333464, + "grad_norm": 0.12678135931491852, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 128570 + }, + { + "epoch": 0.48940721512145735, + "grad_norm": 0.1365925818681717, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 128580 + }, + { + "epoch": 0.48944527758958, + "grad_norm": 0.12762218713760376, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 128590 + }, + { + "epoch": 0.4894833400577027, + "grad_norm": 0.1352071613073349, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 128600 + }, + { + "epoch": 0.4895214025258254, + "grad_norm": 0.12248429656028748, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 128610 + }, + { + "epoch": 0.4895594649939481, + "grad_norm": 0.12541040778160095, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 128620 + }, + { + "epoch": 0.48959752746207075, + "grad_norm": 0.12017042189836502, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 128630 + }, + { + "epoch": 0.48963558993019346, + "grad_norm": 0.11116807907819748, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 128640 + }, + { + "epoch": 0.4896736523983161, + "grad_norm": 0.13902676105499268, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 128650 + }, + { + "epoch": 0.4897117148664388, + "grad_norm": 0.12121377885341644, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 128660 + }, + { + "epoch": 0.4897497773345615, + "grad_norm": 0.13029231131076813, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 128670 + }, + { + "epoch": 0.48978783980268414, + "grad_norm": 0.13663001358509064, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 128680 + }, + { + "epoch": 0.48982590227080686, + "grad_norm": 0.13909859955310822, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 128690 + }, + { + "epoch": 0.4898639647389295, + "grad_norm": 0.12316319346427917, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 128700 + }, + { + "epoch": 0.4899020272070522, + "grad_norm": 0.13498586416244507, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 128710 + }, + { + "epoch": 0.4899400896751749, + "grad_norm": 0.1561654806137085, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 128720 + }, + { + "epoch": 0.4899781521432976, + "grad_norm": 0.12787999212741852, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 128730 + }, + { + "epoch": 0.49001621461142025, + "grad_norm": 0.13014093041419983, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 128740 + }, + { + "epoch": 0.49005427707954297, + "grad_norm": 0.11951977759599686, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 128750 + }, + { + "epoch": 0.4900923395476656, + "grad_norm": 0.11888326704502106, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 128760 + }, + { + "epoch": 0.49013040201578834, + "grad_norm": 0.12961071729660034, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 128770 + }, + { + "epoch": 0.490168464483911, + "grad_norm": 0.13857637345790863, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 128780 + }, + { + "epoch": 0.4902065269520337, + "grad_norm": 0.1179923564195633, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 128790 + }, + { + "epoch": 0.49024458942015636, + "grad_norm": 0.12373822182416916, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 128800 + }, + { + "epoch": 0.490282651888279, + "grad_norm": 0.12144889682531357, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 128810 + }, + { + "epoch": 0.49032071435640173, + "grad_norm": 0.11663493514060974, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 128820 + }, + { + "epoch": 0.4903587768245244, + "grad_norm": 0.13361524045467377, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 128830 + }, + { + "epoch": 0.4903968392926471, + "grad_norm": 0.15043866634368896, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 128840 + }, + { + "epoch": 0.49043490176076976, + "grad_norm": 0.12512058019638062, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 128850 + }, + { + "epoch": 0.49047296422889247, + "grad_norm": 0.12376557290554047, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 128860 + }, + { + "epoch": 0.49051102669701513, + "grad_norm": 0.14460743963718414, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 128870 + }, + { + "epoch": 0.49054908916513784, + "grad_norm": 0.11526907235383987, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 128880 + }, + { + "epoch": 0.4905871516332605, + "grad_norm": 0.1400173157453537, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 128890 + }, + { + "epoch": 0.4906252141013832, + "grad_norm": 0.12534202635288239, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 128900 + }, + { + "epoch": 0.49066327656950587, + "grad_norm": 0.12229157984256744, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 128910 + }, + { + "epoch": 0.4907013390376286, + "grad_norm": 0.12299957126379013, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 128920 + }, + { + "epoch": 0.49073940150575124, + "grad_norm": 0.12647582590579987, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 128930 + }, + { + "epoch": 0.4907774639738739, + "grad_norm": 0.1152181401848793, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 128940 + }, + { + "epoch": 0.4908155264419966, + "grad_norm": 0.12402541190385818, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 128950 + }, + { + "epoch": 0.49085358891011927, + "grad_norm": 0.11969681084156036, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 128960 + }, + { + "epoch": 0.490891651378242, + "grad_norm": 0.12540394067764282, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 128970 + }, + { + "epoch": 0.49092971384636463, + "grad_norm": 0.12889902293682098, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 128980 + }, + { + "epoch": 0.49096777631448735, + "grad_norm": 0.13155750930309296, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 128990 + }, + { + "epoch": 0.49100583878261, + "grad_norm": 0.130380779504776, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 129000 + }, + { + "epoch": 0.4910439012507327, + "grad_norm": 0.12233448028564453, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 129010 + }, + { + "epoch": 0.4910819637188554, + "grad_norm": 0.13260172307491302, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 129020 + }, + { + "epoch": 0.4911200261869781, + "grad_norm": 0.13506576418876648, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 129030 + }, + { + "epoch": 0.49115808865510074, + "grad_norm": 0.12151715159416199, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 129040 + }, + { + "epoch": 0.49119615112322346, + "grad_norm": 0.13031908869743347, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 129050 + }, + { + "epoch": 0.4912342135913461, + "grad_norm": 0.14683033525943756, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 129060 + }, + { + "epoch": 0.4912722760594688, + "grad_norm": 0.1929064840078354, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 129070 + }, + { + "epoch": 0.4913103385275915, + "grad_norm": 0.14430083334445953, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 129080 + }, + { + "epoch": 0.49134840099571414, + "grad_norm": 0.12323372066020966, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 129090 + }, + { + "epoch": 0.49138646346383685, + "grad_norm": 0.13349588215351105, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 129100 + }, + { + "epoch": 0.4914245259319595, + "grad_norm": 0.12230733036994934, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 129110 + }, + { + "epoch": 0.4914625884000822, + "grad_norm": 0.13006240129470825, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 129120 + }, + { + "epoch": 0.4915006508682049, + "grad_norm": 0.1338682621717453, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 129130 + }, + { + "epoch": 0.4915387133363276, + "grad_norm": 0.1181456446647644, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 129140 + }, + { + "epoch": 0.49157677580445025, + "grad_norm": 0.12192094326019287, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 129150 + }, + { + "epoch": 0.49161483827257296, + "grad_norm": 0.12347403168678284, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 129160 + }, + { + "epoch": 0.4916529007406956, + "grad_norm": 0.11497198045253754, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 129170 + }, + { + "epoch": 0.49169096320881833, + "grad_norm": 0.12021651864051819, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 129180 + }, + { + "epoch": 0.491729025676941, + "grad_norm": 0.1412685215473175, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 129190 + }, + { + "epoch": 0.4917670881450637, + "grad_norm": 0.12241291999816895, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 129200 + }, + { + "epoch": 0.49180515061318636, + "grad_norm": 0.12812742590904236, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 129210 + }, + { + "epoch": 0.49184321308130907, + "grad_norm": 0.12787936627864838, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 129220 + }, + { + "epoch": 0.49188127554943173, + "grad_norm": 0.12003834545612335, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 129230 + }, + { + "epoch": 0.4919193380175544, + "grad_norm": 0.13243554532527924, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 129240 + }, + { + "epoch": 0.4919574004856771, + "grad_norm": 0.13243474066257477, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 129250 + }, + { + "epoch": 0.49199546295379976, + "grad_norm": 0.12219058722257614, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 129260 + }, + { + "epoch": 0.49203352542192247, + "grad_norm": 0.11902420222759247, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 129270 + }, + { + "epoch": 0.4920715878900451, + "grad_norm": 0.12722794711589813, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 129280 + }, + { + "epoch": 0.49210965035816784, + "grad_norm": 0.18392205238342285, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 129290 + }, + { + "epoch": 0.4921477128262905, + "grad_norm": 0.11841941624879837, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 129300 + }, + { + "epoch": 0.4921857752944132, + "grad_norm": 0.11622878164052963, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 129310 + }, + { + "epoch": 0.49222383776253587, + "grad_norm": 0.1269485205411911, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 129320 + }, + { + "epoch": 0.4922619002306586, + "grad_norm": 0.11990734934806824, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 129330 + }, + { + "epoch": 0.49229996269878123, + "grad_norm": 0.12024495750665665, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 129340 + }, + { + "epoch": 0.49233802516690395, + "grad_norm": 0.1395251601934433, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 129350 + }, + { + "epoch": 0.4923760876350266, + "grad_norm": 0.1414317935705185, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 129360 + }, + { + "epoch": 0.49241415010314926, + "grad_norm": 0.13144022226333618, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 129370 + }, + { + "epoch": 0.492452212571272, + "grad_norm": 0.1271926462650299, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 129380 + }, + { + "epoch": 0.49249027503939463, + "grad_norm": 0.13372786343097687, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 129390 + }, + { + "epoch": 0.49252833750751734, + "grad_norm": 0.12830625474452972, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 129400 + }, + { + "epoch": 0.49256639997564, + "grad_norm": 0.13309486210346222, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 129410 + }, + { + "epoch": 0.4926044624437627, + "grad_norm": 0.12286410480737686, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 129420 + }, + { + "epoch": 0.49264252491188537, + "grad_norm": 0.12522216141223907, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 129430 + }, + { + "epoch": 0.4926805873800081, + "grad_norm": 0.14131148159503937, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 129440 + }, + { + "epoch": 0.49271864984813074, + "grad_norm": 0.12129399180412292, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 129450 + }, + { + "epoch": 0.49275671231625345, + "grad_norm": 0.12903055548667908, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 129460 + }, + { + "epoch": 0.4927947747843761, + "grad_norm": 0.1280837506055832, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 129470 + }, + { + "epoch": 0.4928328372524988, + "grad_norm": 0.12391971051692963, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 129480 + }, + { + "epoch": 0.4928708997206215, + "grad_norm": 0.12764014303684235, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 129490 + }, + { + "epoch": 0.4929089621887442, + "grad_norm": 0.12799176573753357, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 129500 + }, + { + "epoch": 0.49294702465686685, + "grad_norm": 0.12443231046199799, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 129510 + }, + { + "epoch": 0.4929850871249895, + "grad_norm": 0.12480633705854416, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 129520 + }, + { + "epoch": 0.4930231495931122, + "grad_norm": 0.12381468713283539, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 129530 + }, + { + "epoch": 0.4930612120612349, + "grad_norm": 0.129247784614563, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 129540 + }, + { + "epoch": 0.4930992745293576, + "grad_norm": 0.12614686787128448, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 129550 + }, + { + "epoch": 0.49313733699748025, + "grad_norm": 0.12660275399684906, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 129560 + }, + { + "epoch": 0.49317539946560296, + "grad_norm": 0.1292925328016281, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 129570 + }, + { + "epoch": 0.4932134619337256, + "grad_norm": 0.12577460706233978, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 129580 + }, + { + "epoch": 0.49325152440184833, + "grad_norm": 0.12658214569091797, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 129590 + }, + { + "epoch": 0.493289586869971, + "grad_norm": 0.1259559541940689, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 129600 + }, + { + "epoch": 0.4933276493380937, + "grad_norm": 0.12474989891052246, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 129610 + }, + { + "epoch": 0.49336571180621636, + "grad_norm": 0.11340212821960449, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 129620 + }, + { + "epoch": 0.49340377427433907, + "grad_norm": 0.1202770248055458, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 129630 + }, + { + "epoch": 0.4934418367424617, + "grad_norm": 0.13771983981132507, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 129640 + }, + { + "epoch": 0.49347989921058444, + "grad_norm": 0.1280750185251236, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 129650 + }, + { + "epoch": 0.4935179616787071, + "grad_norm": 0.12560606002807617, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 129660 + }, + { + "epoch": 0.49355602414682975, + "grad_norm": 0.1361568570137024, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 129670 + }, + { + "epoch": 0.49359408661495247, + "grad_norm": 0.13186447322368622, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 129680 + }, + { + "epoch": 0.4936321490830751, + "grad_norm": 0.12305932492017746, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 129690 + }, + { + "epoch": 0.49367021155119784, + "grad_norm": 0.1293884664773941, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 129700 + }, + { + "epoch": 0.4937082740193205, + "grad_norm": 0.12605488300323486, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 129710 + }, + { + "epoch": 0.4937463364874432, + "grad_norm": 0.13056647777557373, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 129720 + }, + { + "epoch": 0.49378439895556586, + "grad_norm": 0.1266745626926422, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 129730 + }, + { + "epoch": 0.4938224614236886, + "grad_norm": 0.1208287924528122, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 129740 + }, + { + "epoch": 0.49386052389181123, + "grad_norm": 0.11373704671859741, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 129750 + }, + { + "epoch": 0.49389858635993394, + "grad_norm": 0.14107324182987213, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 129760 + }, + { + "epoch": 0.4939366488280566, + "grad_norm": 0.13310225307941437, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 129770 + }, + { + "epoch": 0.4939747112961793, + "grad_norm": 0.12170784175395966, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 129780 + }, + { + "epoch": 0.49401277376430197, + "grad_norm": 0.13277970254421234, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 129790 + }, + { + "epoch": 0.49405083623242463, + "grad_norm": 0.12409801036119461, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 129800 + }, + { + "epoch": 0.49408889870054734, + "grad_norm": 0.13743042945861816, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 129810 + }, + { + "epoch": 0.49412696116867, + "grad_norm": 0.16073283553123474, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 129820 + }, + { + "epoch": 0.4941650236367927, + "grad_norm": 0.12472742050886154, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 129830 + }, + { + "epoch": 0.49420308610491537, + "grad_norm": 0.13261789083480835, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 129840 + }, + { + "epoch": 0.4942411485730381, + "grad_norm": 0.11896239966154099, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 129850 + }, + { + "epoch": 0.49427921104116074, + "grad_norm": 0.13525590300559998, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 129860 + }, + { + "epoch": 0.49431727350928345, + "grad_norm": 0.11765822023153305, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 129870 + }, + { + "epoch": 0.4943553359774061, + "grad_norm": 0.12600429356098175, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 129880 + }, + { + "epoch": 0.4943933984455288, + "grad_norm": 0.14068011939525604, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 129890 + }, + { + "epoch": 0.4944314609136515, + "grad_norm": 0.12498706579208374, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 129900 + }, + { + "epoch": 0.4944695233817742, + "grad_norm": 0.1201263815164566, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 129910 + }, + { + "epoch": 0.49450758584989685, + "grad_norm": 0.13016465306282043, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 129920 + }, + { + "epoch": 0.49454564831801956, + "grad_norm": 0.11403704434633255, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 129930 + }, + { + "epoch": 0.4945837107861422, + "grad_norm": 0.12226550281047821, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 129940 + }, + { + "epoch": 0.4946217732542649, + "grad_norm": 0.11579559743404388, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 129950 + }, + { + "epoch": 0.4946598357223876, + "grad_norm": 0.12176090478897095, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 129960 + }, + { + "epoch": 0.49469789819051024, + "grad_norm": 0.11621265858411789, + "learning_rate": 0.0005, + "loss": 2.1376, + "step": 129970 + }, + { + "epoch": 0.49473596065863296, + "grad_norm": 0.12214352935552597, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 129980 + }, + { + "epoch": 0.4947740231267556, + "grad_norm": 0.12496017664670944, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 129990 + }, + { + "epoch": 0.4948120855948783, + "grad_norm": 0.11287581920623779, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 130000 + }, + { + "epoch": 0.494850148063001, + "grad_norm": 0.1303897351026535, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 130010 + }, + { + "epoch": 0.4948882105311237, + "grad_norm": 0.12756425142288208, + "learning_rate": 0.0005, + "loss": 2.1354, + "step": 130020 + }, + { + "epoch": 0.49492627299924635, + "grad_norm": 0.14151465892791748, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 130030 + }, + { + "epoch": 0.49496433546736907, + "grad_norm": 0.1192479282617569, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 130040 + }, + { + "epoch": 0.4950023979354917, + "grad_norm": 0.11875342577695847, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 130050 + }, + { + "epoch": 0.49504046040361444, + "grad_norm": 0.11185558885335922, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 130060 + }, + { + "epoch": 0.4950785228717371, + "grad_norm": 0.12418881058692932, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 130070 + }, + { + "epoch": 0.4951165853398598, + "grad_norm": 0.12762118875980377, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 130080 + }, + { + "epoch": 0.49515464780798246, + "grad_norm": 0.11265776306390762, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 130090 + }, + { + "epoch": 0.4951927102761051, + "grad_norm": 0.13423916697502136, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 130100 + }, + { + "epoch": 0.49523077274422783, + "grad_norm": 0.13270613551139832, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 130110 + }, + { + "epoch": 0.4952688352123505, + "grad_norm": 0.12004047632217407, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 130120 + }, + { + "epoch": 0.4953068976804732, + "grad_norm": 0.11681164801120758, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 130130 + }, + { + "epoch": 0.49534496014859586, + "grad_norm": 0.12493449449539185, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 130140 + }, + { + "epoch": 0.49538302261671857, + "grad_norm": 0.11177048832178116, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 130150 + }, + { + "epoch": 0.49542108508484123, + "grad_norm": 0.12243694812059402, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 130160 + }, + { + "epoch": 0.49545914755296394, + "grad_norm": 0.12640246748924255, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 130170 + }, + { + "epoch": 0.4954972100210866, + "grad_norm": 0.12343862652778625, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 130180 + }, + { + "epoch": 0.4955352724892093, + "grad_norm": 0.11960109323263168, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 130190 + }, + { + "epoch": 0.49557333495733197, + "grad_norm": 0.1165766566991806, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 130200 + }, + { + "epoch": 0.4956113974254547, + "grad_norm": 0.1352618932723999, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 130210 + }, + { + "epoch": 0.49564945989357734, + "grad_norm": 0.13529905676841736, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 130220 + }, + { + "epoch": 0.4956875223617, + "grad_norm": 0.11719474196434021, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 130230 + }, + { + "epoch": 0.4957255848298227, + "grad_norm": 0.13491681218147278, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 130240 + }, + { + "epoch": 0.49576364729794536, + "grad_norm": 0.1283651441335678, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 130250 + }, + { + "epoch": 0.4958017097660681, + "grad_norm": 0.1324135810136795, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 130260 + }, + { + "epoch": 0.49583977223419073, + "grad_norm": 0.11992873251438141, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 130270 + }, + { + "epoch": 0.49587783470231345, + "grad_norm": 0.11500484496355057, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 130280 + }, + { + "epoch": 0.4959158971704361, + "grad_norm": 0.11487402766942978, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 130290 + }, + { + "epoch": 0.4959539596385588, + "grad_norm": 0.12722621858119965, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 130300 + }, + { + "epoch": 0.4959920221066815, + "grad_norm": 0.14762382209300995, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 130310 + }, + { + "epoch": 0.4960300845748042, + "grad_norm": 0.11726965755224228, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 130320 + }, + { + "epoch": 0.49606814704292684, + "grad_norm": 0.13559289276599884, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 130330 + }, + { + "epoch": 0.49610620951104956, + "grad_norm": 0.12326718121767044, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 130340 + }, + { + "epoch": 0.4961442719791722, + "grad_norm": 0.1360858827829361, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 130350 + }, + { + "epoch": 0.4961823344472949, + "grad_norm": 0.14227698743343353, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 130360 + }, + { + "epoch": 0.4962203969154176, + "grad_norm": 0.12769527733325958, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 130370 + }, + { + "epoch": 0.49625845938354024, + "grad_norm": 0.11520993709564209, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 130380 + }, + { + "epoch": 0.49629652185166295, + "grad_norm": 0.12799958884716034, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 130390 + }, + { + "epoch": 0.4963345843197856, + "grad_norm": 0.12282870709896088, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 130400 + }, + { + "epoch": 0.4963726467879083, + "grad_norm": 0.12155033648014069, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 130410 + }, + { + "epoch": 0.496410709256031, + "grad_norm": 0.12830521166324615, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 130420 + }, + { + "epoch": 0.4964487717241537, + "grad_norm": 0.12929099798202515, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 130430 + }, + { + "epoch": 0.49648683419227635, + "grad_norm": 0.12785860896110535, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 130440 + }, + { + "epoch": 0.49652489666039906, + "grad_norm": 0.1235969290137291, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 130450 + }, + { + "epoch": 0.4965629591285217, + "grad_norm": 0.12364578247070312, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 130460 + }, + { + "epoch": 0.49660102159664443, + "grad_norm": 0.12840792536735535, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 130470 + }, + { + "epoch": 0.4966390840647671, + "grad_norm": 0.1260167807340622, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 130480 + }, + { + "epoch": 0.4966771465328898, + "grad_norm": 0.12089382857084274, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 130490 + }, + { + "epoch": 0.49671520900101246, + "grad_norm": 0.1287691444158554, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 130500 + }, + { + "epoch": 0.49675327146913517, + "grad_norm": 0.12943902611732483, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 130510 + }, + { + "epoch": 0.49679133393725783, + "grad_norm": 0.13859035074710846, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 130520 + }, + { + "epoch": 0.4968293964053805, + "grad_norm": 0.12068472802639008, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 130530 + }, + { + "epoch": 0.4968674588735032, + "grad_norm": 0.12954632937908173, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 130540 + }, + { + "epoch": 0.49690552134162586, + "grad_norm": 0.13704584538936615, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 130550 + }, + { + "epoch": 0.49694358380974857, + "grad_norm": 0.13277916610240936, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 130560 + }, + { + "epoch": 0.4969816462778712, + "grad_norm": 0.12848743796348572, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 130570 + }, + { + "epoch": 0.49701970874599394, + "grad_norm": 0.12172497808933258, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 130580 + }, + { + "epoch": 0.4970577712141166, + "grad_norm": 0.12590010464191437, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 130590 + }, + { + "epoch": 0.4970958336822393, + "grad_norm": 0.1207374557852745, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 130600 + }, + { + "epoch": 0.49713389615036196, + "grad_norm": 0.12318812310695648, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 130610 + }, + { + "epoch": 0.4971719586184847, + "grad_norm": 0.1470927745103836, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 130620 + }, + { + "epoch": 0.49721002108660733, + "grad_norm": 0.13342763483524323, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 130630 + }, + { + "epoch": 0.49724808355473005, + "grad_norm": 0.12590563297271729, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 130640 + }, + { + "epoch": 0.4972861460228527, + "grad_norm": 0.11755761504173279, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 130650 + }, + { + "epoch": 0.49732420849097536, + "grad_norm": 0.12532316148281097, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 130660 + }, + { + "epoch": 0.4973622709590981, + "grad_norm": 0.12779484689235687, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 130670 + }, + { + "epoch": 0.49740033342722073, + "grad_norm": 0.11526070535182953, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 130680 + }, + { + "epoch": 0.49743839589534344, + "grad_norm": 0.12624244391918182, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 130690 + }, + { + "epoch": 0.4974764583634661, + "grad_norm": 0.12625885009765625, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 130700 + }, + { + "epoch": 0.4975145208315888, + "grad_norm": 0.11956362426280975, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 130710 + }, + { + "epoch": 0.49755258329971147, + "grad_norm": 0.12251365184783936, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 130720 + }, + { + "epoch": 0.4975906457678342, + "grad_norm": 0.11382036656141281, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 130730 + }, + { + "epoch": 0.49762870823595684, + "grad_norm": 0.11894240230321884, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 130740 + }, + { + "epoch": 0.49766677070407955, + "grad_norm": 0.1151915043592453, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 130750 + }, + { + "epoch": 0.4977048331722022, + "grad_norm": 0.11738212406635284, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 130760 + }, + { + "epoch": 0.4977428956403249, + "grad_norm": 0.12464101612567902, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 130770 + }, + { + "epoch": 0.4977809581084476, + "grad_norm": 0.12291174381971359, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 130780 + }, + { + "epoch": 0.4978190205765703, + "grad_norm": 0.12099691480398178, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 130790 + }, + { + "epoch": 0.49785708304469295, + "grad_norm": 0.11977177113294601, + "learning_rate": 0.0005, + "loss": 2.1392, + "step": 130800 + }, + { + "epoch": 0.4978951455128156, + "grad_norm": 0.12582093477249146, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 130810 + }, + { + "epoch": 0.4979332079809383, + "grad_norm": 0.14268380403518677, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 130820 + }, + { + "epoch": 0.497971270449061, + "grad_norm": 0.11546272039413452, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 130830 + }, + { + "epoch": 0.4980093329171837, + "grad_norm": 0.1425076276063919, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 130840 + }, + { + "epoch": 0.49804739538530635, + "grad_norm": 0.13655667006969452, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 130850 + }, + { + "epoch": 0.49808545785342906, + "grad_norm": 0.1245507001876831, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 130860 + }, + { + "epoch": 0.4981235203215517, + "grad_norm": 0.13207346200942993, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 130870 + }, + { + "epoch": 0.49816158278967443, + "grad_norm": 0.11879435181617737, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 130880 + }, + { + "epoch": 0.4981996452577971, + "grad_norm": 0.1184876337647438, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 130890 + }, + { + "epoch": 0.4982377077259198, + "grad_norm": 0.12992629408836365, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 130900 + }, + { + "epoch": 0.49827577019404246, + "grad_norm": 0.14854702353477478, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 130910 + }, + { + "epoch": 0.49831383266216517, + "grad_norm": 0.1280069351196289, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 130920 + }, + { + "epoch": 0.4983518951302878, + "grad_norm": 0.11797192692756653, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 130930 + }, + { + "epoch": 0.49838995759841054, + "grad_norm": 0.1346352994441986, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 130940 + }, + { + "epoch": 0.4984280200665332, + "grad_norm": 0.1282471865415573, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 130950 + }, + { + "epoch": 0.49846608253465585, + "grad_norm": 0.12099988013505936, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 130960 + }, + { + "epoch": 0.49850414500277856, + "grad_norm": 1.4301072359085083, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 130970 + }, + { + "epoch": 0.4985422074709012, + "grad_norm": 0.12891265749931335, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 130980 + }, + { + "epoch": 0.49858026993902393, + "grad_norm": 0.12442260980606079, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 130990 + }, + { + "epoch": 0.4986183324071466, + "grad_norm": 0.14176233112812042, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 131000 + }, + { + "epoch": 0.4986563948752693, + "grad_norm": 0.12994049489498138, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 131010 + }, + { + "epoch": 0.49869445734339196, + "grad_norm": 0.12481575459241867, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 131020 + }, + { + "epoch": 0.4987325198115147, + "grad_norm": 0.12183275818824768, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 131030 + }, + { + "epoch": 0.49877058227963733, + "grad_norm": 0.1301172971725464, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 131040 + }, + { + "epoch": 0.49880864474776004, + "grad_norm": 0.12625858187675476, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 131050 + }, + { + "epoch": 0.4988467072158827, + "grad_norm": 0.13612774014472961, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 131060 + }, + { + "epoch": 0.4988847696840054, + "grad_norm": 0.12587963044643402, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 131070 + }, + { + "epoch": 0.49892283215212807, + "grad_norm": 0.12649382650852203, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 131080 + }, + { + "epoch": 0.49896089462025073, + "grad_norm": 0.135384663939476, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 131090 + }, + { + "epoch": 0.49899895708837344, + "grad_norm": 0.15188583731651306, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 131100 + }, + { + "epoch": 0.4990370195564961, + "grad_norm": 0.12795251607894897, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 131110 + }, + { + "epoch": 0.4990750820246188, + "grad_norm": 0.12417992204427719, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 131120 + }, + { + "epoch": 0.49911314449274147, + "grad_norm": 0.1358865350484848, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 131130 + }, + { + "epoch": 0.4991512069608642, + "grad_norm": 0.1385025829076767, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 131140 + }, + { + "epoch": 0.49918926942898684, + "grad_norm": 0.11305361241102219, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 131150 + }, + { + "epoch": 0.49922733189710955, + "grad_norm": 0.1257256418466568, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 131160 + }, + { + "epoch": 0.4992653943652322, + "grad_norm": 0.12424245476722717, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 131170 + }, + { + "epoch": 0.4993034568333549, + "grad_norm": 0.12357328087091446, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 131180 + }, + { + "epoch": 0.4993415193014776, + "grad_norm": 0.1326143890619278, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 131190 + }, + { + "epoch": 0.4993795817696003, + "grad_norm": 0.12293647229671478, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 131200 + }, + { + "epoch": 0.49941764423772295, + "grad_norm": 0.11853579431772232, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 131210 + }, + { + "epoch": 0.49945570670584566, + "grad_norm": 0.12029995769262314, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 131220 + }, + { + "epoch": 0.4994937691739683, + "grad_norm": 0.13621987402439117, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 131230 + }, + { + "epoch": 0.499531831642091, + "grad_norm": 0.1239994466304779, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 131240 + }, + { + "epoch": 0.4995698941102137, + "grad_norm": 0.12169505655765533, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 131250 + }, + { + "epoch": 0.49960795657833634, + "grad_norm": 0.12012088298797607, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 131260 + }, + { + "epoch": 0.49964601904645906, + "grad_norm": 0.11918789893388748, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 131270 + }, + { + "epoch": 0.4996840815145817, + "grad_norm": 0.12536664307117462, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 131280 + }, + { + "epoch": 0.4997221439827044, + "grad_norm": 0.13789057731628418, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 131290 + }, + { + "epoch": 0.4997602064508271, + "grad_norm": 0.11794903874397278, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 131300 + }, + { + "epoch": 0.4997982689189498, + "grad_norm": 0.12749889492988586, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 131310 + }, + { + "epoch": 0.49983633138707245, + "grad_norm": 0.1235434040427208, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 131320 + }, + { + "epoch": 0.49987439385519516, + "grad_norm": 0.1203431487083435, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 131330 + }, + { + "epoch": 0.4999124563233178, + "grad_norm": 0.12585608661174774, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 131340 + }, + { + "epoch": 0.49995051879144053, + "grad_norm": 0.13304553925991058, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 131350 + }, + { + "epoch": 0.4999885812595632, + "grad_norm": 0.12096429616212845, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 131360 + }, + { + "epoch": 0.5000266437276859, + "grad_norm": 0.12587635219097137, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 131370 + }, + { + "epoch": 0.5000647061958086, + "grad_norm": 0.13106253743171692, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 131380 + }, + { + "epoch": 0.5001027686639312, + "grad_norm": 0.12039899080991745, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 131390 + }, + { + "epoch": 0.5001408311320539, + "grad_norm": 0.14034612476825714, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 131400 + }, + { + "epoch": 0.5001788936001766, + "grad_norm": 0.11676795780658722, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 131410 + }, + { + "epoch": 0.5002169560682993, + "grad_norm": 0.12730121612548828, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 131420 + }, + { + "epoch": 0.500255018536422, + "grad_norm": 0.1298544555902481, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 131430 + }, + { + "epoch": 0.5002930810045446, + "grad_norm": 0.12161430716514587, + "learning_rate": 0.0005, + "loss": 2.1335, + "step": 131440 + }, + { + "epoch": 0.5003311434726674, + "grad_norm": 0.12329776585102081, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 131450 + }, + { + "epoch": 0.50036920594079, + "grad_norm": 0.13645300269126892, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 131460 + }, + { + "epoch": 0.5004072684089127, + "grad_norm": 0.11726679652929306, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 131470 + }, + { + "epoch": 0.5004453308770354, + "grad_norm": 0.12503685057163239, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 131480 + }, + { + "epoch": 0.5004833933451581, + "grad_norm": 0.13997359573841095, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 131490 + }, + { + "epoch": 0.5005214558132808, + "grad_norm": 0.1292068064212799, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 131500 + }, + { + "epoch": 0.5005595182814034, + "grad_norm": 0.11975622922182083, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 131510 + }, + { + "epoch": 0.5005975807495261, + "grad_norm": 0.12581771612167358, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 131520 + }, + { + "epoch": 0.5006356432176488, + "grad_norm": 0.1165451779961586, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 131530 + }, + { + "epoch": 0.5006737056857715, + "grad_norm": 0.1284739375114441, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 131540 + }, + { + "epoch": 0.5007117681538942, + "grad_norm": 0.13096266984939575, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 131550 + }, + { + "epoch": 0.5007498306220168, + "grad_norm": 0.11530668288469315, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 131560 + }, + { + "epoch": 0.5007878930901395, + "grad_norm": 0.12410982698202133, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 131570 + }, + { + "epoch": 0.5008259555582623, + "grad_norm": 0.1452266126871109, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 131580 + }, + { + "epoch": 0.5008640180263849, + "grad_norm": 0.12477368116378784, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 131590 + }, + { + "epoch": 0.5009020804945076, + "grad_norm": 0.138809472322464, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 131600 + }, + { + "epoch": 0.5009401429626302, + "grad_norm": 0.11599473655223846, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 131610 + }, + { + "epoch": 0.500978205430753, + "grad_norm": 0.12185689061880112, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 131620 + }, + { + "epoch": 0.5010162678988757, + "grad_norm": 0.14375880360603333, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 131630 + }, + { + "epoch": 0.5010543303669983, + "grad_norm": 0.12877488136291504, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 131640 + }, + { + "epoch": 0.501092392835121, + "grad_norm": 0.1386006623506546, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 131650 + }, + { + "epoch": 0.5011304553032437, + "grad_norm": 0.12304980307817459, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 131660 + }, + { + "epoch": 0.5011685177713664, + "grad_norm": 0.13089512288570404, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 131670 + }, + { + "epoch": 0.501206580239489, + "grad_norm": 0.1259504109621048, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 131680 + }, + { + "epoch": 0.5012446427076117, + "grad_norm": 0.1313033401966095, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 131690 + }, + { + "epoch": 0.5012827051757344, + "grad_norm": 0.12506762146949768, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 131700 + }, + { + "epoch": 0.5013207676438571, + "grad_norm": 0.11512114107608795, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 131710 + }, + { + "epoch": 0.5013588301119798, + "grad_norm": 0.1273897886276245, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 131720 + }, + { + "epoch": 0.5013968925801024, + "grad_norm": 0.12503871321678162, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 131730 + }, + { + "epoch": 0.5014349550482251, + "grad_norm": 0.11183293163776398, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 131740 + }, + { + "epoch": 0.5014730175163479, + "grad_norm": 0.13775190711021423, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 131750 + }, + { + "epoch": 0.5015110799844705, + "grad_norm": 0.12885314226150513, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 131760 + }, + { + "epoch": 0.5015491424525932, + "grad_norm": 0.12374498695135117, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 131770 + }, + { + "epoch": 0.5015872049207158, + "grad_norm": 0.12108965963125229, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 131780 + }, + { + "epoch": 0.5016252673888386, + "grad_norm": 0.13662739098072052, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 131790 + }, + { + "epoch": 0.5016633298569613, + "grad_norm": 0.12604866921901703, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 131800 + }, + { + "epoch": 0.5017013923250839, + "grad_norm": 0.1323482245206833, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 131810 + }, + { + "epoch": 0.5017394547932066, + "grad_norm": 0.13354459404945374, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 131820 + }, + { + "epoch": 0.5017775172613292, + "grad_norm": 0.13745664060115814, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 131830 + }, + { + "epoch": 0.501815579729452, + "grad_norm": 0.13215488195419312, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 131840 + }, + { + "epoch": 0.5018536421975747, + "grad_norm": 0.13737165927886963, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 131850 + }, + { + "epoch": 0.5018917046656973, + "grad_norm": 0.12160696089267731, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 131860 + }, + { + "epoch": 0.50192976713382, + "grad_norm": 0.127287358045578, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 131870 + }, + { + "epoch": 0.5019678296019427, + "grad_norm": 0.11619853228330612, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 131880 + }, + { + "epoch": 0.5020058920700654, + "grad_norm": 0.12385293841362, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 131890 + }, + { + "epoch": 0.5020439545381881, + "grad_norm": 0.12530963122844696, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 131900 + }, + { + "epoch": 0.5020820170063107, + "grad_norm": 0.11971908062696457, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 131910 + }, + { + "epoch": 0.5021200794744335, + "grad_norm": 0.12599734961986542, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 131920 + }, + { + "epoch": 0.5021581419425561, + "grad_norm": 0.13007313013076782, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 131930 + }, + { + "epoch": 0.5021962044106788, + "grad_norm": 0.12488700449466705, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 131940 + }, + { + "epoch": 0.5022342668788015, + "grad_norm": 0.12737232446670532, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 131950 + }, + { + "epoch": 0.5022723293469241, + "grad_norm": 0.138985738158226, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 131960 + }, + { + "epoch": 0.5023103918150469, + "grad_norm": 0.11265484243631363, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 131970 + }, + { + "epoch": 0.5023484542831695, + "grad_norm": 0.128614142537117, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 131980 + }, + { + "epoch": 0.5023865167512922, + "grad_norm": 0.13249129056930542, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 131990 + }, + { + "epoch": 0.5024245792194149, + "grad_norm": 0.11810528486967087, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 132000 + }, + { + "epoch": 0.5024626416875376, + "grad_norm": 0.11904824525117874, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 132010 + }, + { + "epoch": 0.5025007041556603, + "grad_norm": 0.11354199051856995, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 132020 + }, + { + "epoch": 0.5025387666237829, + "grad_norm": 0.13872075080871582, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 132030 + }, + { + "epoch": 0.5025768290919056, + "grad_norm": 0.1317596286535263, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 132040 + }, + { + "epoch": 0.5026148915600284, + "grad_norm": 0.14089347422122955, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 132050 + }, + { + "epoch": 0.502652954028151, + "grad_norm": 0.36503109335899353, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 132060 + }, + { + "epoch": 0.5026910164962737, + "grad_norm": 0.130210742354393, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 132070 + }, + { + "epoch": 0.5027290789643963, + "grad_norm": 0.11821793019771576, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 132080 + }, + { + "epoch": 0.5027671414325191, + "grad_norm": 0.1177087128162384, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 132090 + }, + { + "epoch": 0.5028052039006418, + "grad_norm": 0.1179640144109726, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 132100 + }, + { + "epoch": 0.5028432663687644, + "grad_norm": 0.12133560329675674, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 132110 + }, + { + "epoch": 0.5028813288368871, + "grad_norm": 0.14076071977615356, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 132120 + }, + { + "epoch": 0.5029193913050097, + "grad_norm": 0.1257144659757614, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 132130 + }, + { + "epoch": 0.5029574537731325, + "grad_norm": 0.12998227775096893, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 132140 + }, + { + "epoch": 0.5029955162412552, + "grad_norm": 0.12388579547405243, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 132150 + }, + { + "epoch": 0.5030335787093778, + "grad_norm": 0.1183706670999527, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 132160 + }, + { + "epoch": 0.5030716411775005, + "grad_norm": 0.12758280336856842, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 132170 + }, + { + "epoch": 0.5031097036456232, + "grad_norm": 0.1403193324804306, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 132180 + }, + { + "epoch": 0.5031477661137459, + "grad_norm": 0.1350007951259613, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 132190 + }, + { + "epoch": 0.5031858285818686, + "grad_norm": 0.12097181379795074, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 132200 + }, + { + "epoch": 0.5032238910499912, + "grad_norm": 0.12329886853694916, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 132210 + }, + { + "epoch": 0.503261953518114, + "grad_norm": 0.11826958507299423, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 132220 + }, + { + "epoch": 0.5033000159862366, + "grad_norm": 0.13061341643333435, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 132230 + }, + { + "epoch": 0.5033380784543593, + "grad_norm": 0.13656175136566162, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 132240 + }, + { + "epoch": 0.503376140922482, + "grad_norm": 0.14762873947620392, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 132250 + }, + { + "epoch": 0.5034142033906046, + "grad_norm": 0.1220160499215126, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 132260 + }, + { + "epoch": 0.5034522658587274, + "grad_norm": 0.128128781914711, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 132270 + }, + { + "epoch": 0.50349032832685, + "grad_norm": 0.14482936263084412, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 132280 + }, + { + "epoch": 0.5035283907949727, + "grad_norm": 0.12684223055839539, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 132290 + }, + { + "epoch": 0.5035664532630953, + "grad_norm": 0.11803826689720154, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 132300 + }, + { + "epoch": 0.5036045157312181, + "grad_norm": 0.12412810325622559, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 132310 + }, + { + "epoch": 0.5036425781993408, + "grad_norm": 0.11887123435735703, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 132320 + }, + { + "epoch": 0.5036806406674634, + "grad_norm": 0.1269877701997757, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 132330 + }, + { + "epoch": 0.5037187031355861, + "grad_norm": 0.13143859803676605, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 132340 + }, + { + "epoch": 0.5037567656037089, + "grad_norm": 0.12457022070884705, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 132350 + }, + { + "epoch": 0.5037948280718315, + "grad_norm": 0.129766583442688, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 132360 + }, + { + "epoch": 0.5038328905399542, + "grad_norm": 0.1242799237370491, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 132370 + }, + { + "epoch": 0.5038709530080768, + "grad_norm": 0.15328699350357056, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 132380 + }, + { + "epoch": 0.5039090154761995, + "grad_norm": 0.12018805742263794, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 132390 + }, + { + "epoch": 0.5039470779443223, + "grad_norm": 0.12956398725509644, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 132400 + }, + { + "epoch": 0.5039851404124449, + "grad_norm": 0.12832574546337128, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 132410 + }, + { + "epoch": 0.5040232028805676, + "grad_norm": 0.12742365896701813, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 132420 + }, + { + "epoch": 0.5040612653486902, + "grad_norm": 0.12595300376415253, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 132430 + }, + { + "epoch": 0.504099327816813, + "grad_norm": 0.11913701146841049, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 132440 + }, + { + "epoch": 0.5041373902849356, + "grad_norm": 0.11661577969789505, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 132450 + }, + { + "epoch": 0.5041754527530583, + "grad_norm": 0.14057959616184235, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 132460 + }, + { + "epoch": 0.504213515221181, + "grad_norm": 0.127021923661232, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 132470 + }, + { + "epoch": 0.5042515776893037, + "grad_norm": 0.121463842689991, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 132480 + }, + { + "epoch": 0.5042896401574264, + "grad_norm": 0.13129813969135284, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 132490 + }, + { + "epoch": 0.504327702625549, + "grad_norm": 0.14562547206878662, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 132500 + }, + { + "epoch": 0.5043657650936717, + "grad_norm": 0.12791889905929565, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 132510 + }, + { + "epoch": 0.5044038275617945, + "grad_norm": 0.12117471545934677, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 132520 + }, + { + "epoch": 0.5044418900299171, + "grad_norm": 0.11436469852924347, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 132530 + }, + { + "epoch": 0.5044799524980398, + "grad_norm": 0.12663054466247559, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 132540 + }, + { + "epoch": 0.5045180149661624, + "grad_norm": 0.17653295397758484, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 132550 + }, + { + "epoch": 0.5045560774342851, + "grad_norm": 0.12252450734376907, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 132560 + }, + { + "epoch": 0.5045941399024079, + "grad_norm": 0.11884088069200516, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 132570 + }, + { + "epoch": 0.5046322023705305, + "grad_norm": 0.1152932196855545, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 132580 + }, + { + "epoch": 0.5046702648386532, + "grad_norm": 0.13256339728832245, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 132590 + }, + { + "epoch": 0.5047083273067758, + "grad_norm": 0.12635567784309387, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 132600 + }, + { + "epoch": 0.5047463897748986, + "grad_norm": 0.1187937930226326, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 132610 + }, + { + "epoch": 0.5047844522430213, + "grad_norm": 0.1216667890548706, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 132620 + }, + { + "epoch": 0.5048225147111439, + "grad_norm": 0.11755828559398651, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 132630 + }, + { + "epoch": 0.5048605771792666, + "grad_norm": 0.133527010679245, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 132640 + }, + { + "epoch": 0.5048986396473893, + "grad_norm": 0.11593339592218399, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 132650 + }, + { + "epoch": 0.504936702115512, + "grad_norm": 0.18402504920959473, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 132660 + }, + { + "epoch": 0.5049747645836347, + "grad_norm": 0.13471607863903046, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 132670 + }, + { + "epoch": 0.5050128270517573, + "grad_norm": 0.11921326071023941, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 132680 + }, + { + "epoch": 0.50505088951988, + "grad_norm": 0.12205012887716293, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 132690 + }, + { + "epoch": 0.5050889519880027, + "grad_norm": 0.11981106549501419, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 132700 + }, + { + "epoch": 0.5051270144561254, + "grad_norm": 0.12213804572820663, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 132710 + }, + { + "epoch": 0.5051650769242481, + "grad_norm": 0.13242408633232117, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 132720 + }, + { + "epoch": 0.5052031393923707, + "grad_norm": 0.11383634060621262, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 132730 + }, + { + "epoch": 0.5052412018604935, + "grad_norm": 0.11707602441310883, + "learning_rate": 0.0005, + "loss": 2.1347, + "step": 132740 + }, + { + "epoch": 0.5052792643286161, + "grad_norm": 0.11705445498228073, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 132750 + }, + { + "epoch": 0.5053173267967388, + "grad_norm": 0.11471796780824661, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 132760 + }, + { + "epoch": 0.5053553892648615, + "grad_norm": 0.12790432572364807, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 132770 + }, + { + "epoch": 0.5053934517329842, + "grad_norm": 0.14037707448005676, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 132780 + }, + { + "epoch": 0.5054315142011069, + "grad_norm": 0.14267417788505554, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 132790 + }, + { + "epoch": 0.5054695766692295, + "grad_norm": 0.1200094223022461, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 132800 + }, + { + "epoch": 0.5055076391373522, + "grad_norm": 0.1262931376695633, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 132810 + }, + { + "epoch": 0.5055457016054749, + "grad_norm": 0.11991778761148453, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 132820 + }, + { + "epoch": 0.5055837640735976, + "grad_norm": 0.1474510282278061, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 132830 + }, + { + "epoch": 0.5056218265417203, + "grad_norm": 0.9946924448013306, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 132840 + }, + { + "epoch": 0.5056598890098429, + "grad_norm": 0.1304653137922287, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 132850 + }, + { + "epoch": 0.5056979514779656, + "grad_norm": 0.1402243971824646, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 132860 + }, + { + "epoch": 0.5057360139460884, + "grad_norm": 0.13506390154361725, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 132870 + }, + { + "epoch": 0.505774076414211, + "grad_norm": 0.1145101860165596, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 132880 + }, + { + "epoch": 0.5058121388823337, + "grad_norm": 0.11101609468460083, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 132890 + }, + { + "epoch": 0.5058502013504563, + "grad_norm": 0.12633520364761353, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 132900 + }, + { + "epoch": 0.5058882638185791, + "grad_norm": 0.13389374315738678, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 132910 + }, + { + "epoch": 0.5059263262867018, + "grad_norm": 0.12300463765859604, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 132920 + }, + { + "epoch": 0.5059643887548244, + "grad_norm": 0.1362847238779068, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 132930 + }, + { + "epoch": 0.5060024512229471, + "grad_norm": 0.12361106276512146, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 132940 + }, + { + "epoch": 0.5060405136910698, + "grad_norm": 0.12673906981945038, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 132950 + }, + { + "epoch": 0.5060785761591925, + "grad_norm": 0.11915009468793869, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 132960 + }, + { + "epoch": 0.5061166386273152, + "grad_norm": 0.1288875937461853, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 132970 + }, + { + "epoch": 0.5061547010954378, + "grad_norm": 0.11823371052742004, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 132980 + }, + { + "epoch": 0.5061927635635605, + "grad_norm": 0.12016794830560684, + "learning_rate": 0.0005, + "loss": 2.132, + "step": 132990 + }, + { + "epoch": 0.5062308260316832, + "grad_norm": 0.13051475584506989, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 133000 + }, + { + "epoch": 0.5062688884998059, + "grad_norm": 0.13054126501083374, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 133010 + }, + { + "epoch": 0.5063069509679285, + "grad_norm": 0.12361478805541992, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 133020 + }, + { + "epoch": 0.5063450134360512, + "grad_norm": 0.1327802538871765, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 133030 + }, + { + "epoch": 0.506383075904174, + "grad_norm": 0.12492018938064575, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 133040 + }, + { + "epoch": 0.5064211383722966, + "grad_norm": 0.12457766383886337, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 133050 + }, + { + "epoch": 0.5064592008404193, + "grad_norm": 0.12348872423171997, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 133060 + }, + { + "epoch": 0.506497263308542, + "grad_norm": 0.14808453619480133, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 133070 + }, + { + "epoch": 0.5065353257766647, + "grad_norm": 0.13085603713989258, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 133080 + }, + { + "epoch": 0.5065733882447874, + "grad_norm": 0.12417221069335938, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 133090 + }, + { + "epoch": 0.50661145071291, + "grad_norm": 0.12539374828338623, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 133100 + }, + { + "epoch": 0.5066495131810327, + "grad_norm": 0.13433487713336945, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 133110 + }, + { + "epoch": 0.5066875756491553, + "grad_norm": 0.12523174285888672, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 133120 + }, + { + "epoch": 0.5067256381172781, + "grad_norm": 0.13964121043682098, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 133130 + }, + { + "epoch": 0.5067637005854008, + "grad_norm": 0.12529048323631287, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 133140 + }, + { + "epoch": 0.5068017630535234, + "grad_norm": 0.12352463603019714, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 133150 + }, + { + "epoch": 0.5068398255216461, + "grad_norm": 0.13642734289169312, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 133160 + }, + { + "epoch": 0.5068778879897688, + "grad_norm": 0.13066624104976654, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 133170 + }, + { + "epoch": 0.5069159504578915, + "grad_norm": 0.11709106713533401, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 133180 + }, + { + "epoch": 0.5069540129260142, + "grad_norm": 0.12521210312843323, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 133190 + }, + { + "epoch": 0.5069920753941368, + "grad_norm": 0.1172792986035347, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 133200 + }, + { + "epoch": 0.5070301378622596, + "grad_norm": 0.12813249230384827, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 133210 + }, + { + "epoch": 0.5070682003303822, + "grad_norm": 0.1418289691209793, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 133220 + }, + { + "epoch": 0.5071062627985049, + "grad_norm": 0.12482786178588867, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 133230 + }, + { + "epoch": 0.5071443252666276, + "grad_norm": 0.12194843590259552, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 133240 + }, + { + "epoch": 0.5071823877347502, + "grad_norm": 0.12992645800113678, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 133250 + }, + { + "epoch": 0.507220450202873, + "grad_norm": 0.1264665126800537, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 133260 + }, + { + "epoch": 0.5072585126709956, + "grad_norm": 0.12155953794717789, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 133270 + }, + { + "epoch": 0.5072965751391183, + "grad_norm": 0.13857276737689972, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 133280 + }, + { + "epoch": 0.507334637607241, + "grad_norm": 0.11679156124591827, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 133290 + }, + { + "epoch": 0.5073727000753637, + "grad_norm": 0.12049750983715057, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 133300 + }, + { + "epoch": 0.5074107625434864, + "grad_norm": 0.11648929864168167, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 133310 + }, + { + "epoch": 0.507448825011609, + "grad_norm": 0.1900986135005951, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 133320 + }, + { + "epoch": 0.5074868874797317, + "grad_norm": 0.11860639601945877, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 133330 + }, + { + "epoch": 0.5075249499478545, + "grad_norm": 0.1396195888519287, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 133340 + }, + { + "epoch": 0.5075630124159771, + "grad_norm": 0.13535426557064056, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 133350 + }, + { + "epoch": 0.5076010748840998, + "grad_norm": 0.13391265273094177, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 133360 + }, + { + "epoch": 0.5076391373522224, + "grad_norm": 0.11973792314529419, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 133370 + }, + { + "epoch": 0.5076771998203452, + "grad_norm": 0.12155459821224213, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 133380 + }, + { + "epoch": 0.5077152622884679, + "grad_norm": 0.1285034418106079, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 133390 + }, + { + "epoch": 0.5077533247565905, + "grad_norm": 0.12448261678218842, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 133400 + }, + { + "epoch": 0.5077913872247132, + "grad_norm": 0.13152651488780975, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 133410 + }, + { + "epoch": 0.5078294496928358, + "grad_norm": 0.12238927185535431, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 133420 + }, + { + "epoch": 0.5078675121609586, + "grad_norm": 0.12980888783931732, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 133430 + }, + { + "epoch": 0.5079055746290813, + "grad_norm": 0.1216517984867096, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 133440 + }, + { + "epoch": 0.5079436370972039, + "grad_norm": 0.14697448909282684, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 133450 + }, + { + "epoch": 0.5079816995653266, + "grad_norm": 0.13025575876235962, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 133460 + }, + { + "epoch": 0.5080197620334493, + "grad_norm": 0.12157538533210754, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 133470 + }, + { + "epoch": 0.508057824501572, + "grad_norm": 0.13055738806724548, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 133480 + }, + { + "epoch": 0.5080958869696947, + "grad_norm": 0.12716266512870789, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 133490 + }, + { + "epoch": 0.5081339494378173, + "grad_norm": 0.12884607911109924, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 133500 + }, + { + "epoch": 0.5081720119059401, + "grad_norm": 0.12093187868595123, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 133510 + }, + { + "epoch": 0.5082100743740627, + "grad_norm": 0.11768296360969543, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 133520 + }, + { + "epoch": 0.5082481368421854, + "grad_norm": 0.11855961382389069, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 133530 + }, + { + "epoch": 0.508286199310308, + "grad_norm": 0.12154291570186615, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 133540 + }, + { + "epoch": 0.5083242617784307, + "grad_norm": 0.11715900152921677, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 133550 + }, + { + "epoch": 0.5083623242465535, + "grad_norm": 0.13073576986789703, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 133560 + }, + { + "epoch": 0.5084003867146761, + "grad_norm": 0.12054118514060974, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 133570 + }, + { + "epoch": 0.5084384491827988, + "grad_norm": 0.11386524140834808, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 133580 + }, + { + "epoch": 0.5084765116509214, + "grad_norm": 0.13187789916992188, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 133590 + }, + { + "epoch": 0.5085145741190442, + "grad_norm": 0.13285937905311584, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 133600 + }, + { + "epoch": 0.5085526365871669, + "grad_norm": 0.12216323614120483, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 133610 + }, + { + "epoch": 0.5085906990552895, + "grad_norm": 0.13401517271995544, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 133620 + }, + { + "epoch": 0.5086287615234122, + "grad_norm": 0.13121497631072998, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 133630 + }, + { + "epoch": 0.508666823991535, + "grad_norm": 0.13181401789188385, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 133640 + }, + { + "epoch": 0.5087048864596576, + "grad_norm": 0.12712612748146057, + "learning_rate": 0.0005, + "loss": 2.1329, + "step": 133650 + }, + { + "epoch": 0.5087429489277803, + "grad_norm": 0.12323405593633652, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 133660 + }, + { + "epoch": 0.5087810113959029, + "grad_norm": 0.13902431726455688, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 133670 + }, + { + "epoch": 0.5088190738640256, + "grad_norm": 0.12429852038621902, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 133680 + }, + { + "epoch": 0.5088571363321484, + "grad_norm": 0.14107947051525116, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 133690 + }, + { + "epoch": 0.508895198800271, + "grad_norm": 0.13658325374126434, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 133700 + }, + { + "epoch": 0.5089332612683937, + "grad_norm": 0.12138927727937698, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 133710 + }, + { + "epoch": 0.5089713237365163, + "grad_norm": 0.13314686715602875, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 133720 + }, + { + "epoch": 0.5090093862046391, + "grad_norm": 0.12922067940235138, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 133730 + }, + { + "epoch": 0.5090474486727617, + "grad_norm": 0.1288197636604309, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 133740 + }, + { + "epoch": 0.5090855111408844, + "grad_norm": 0.12378251552581787, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 133750 + }, + { + "epoch": 0.5091235736090071, + "grad_norm": 0.13623934984207153, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 133760 + }, + { + "epoch": 0.5091616360771298, + "grad_norm": 0.11521940678358078, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 133770 + }, + { + "epoch": 0.5091996985452525, + "grad_norm": 0.13184532523155212, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 133780 + }, + { + "epoch": 0.5092377610133751, + "grad_norm": 0.12468823045492172, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 133790 + }, + { + "epoch": 0.5092758234814978, + "grad_norm": 0.13141892850399017, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 133800 + }, + { + "epoch": 0.5093138859496206, + "grad_norm": 0.12401145696640015, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 133810 + }, + { + "epoch": 0.5093519484177432, + "grad_norm": 0.13101080060005188, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 133820 + }, + { + "epoch": 0.5093900108858659, + "grad_norm": 0.12074775993824005, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 133830 + }, + { + "epoch": 0.5094280733539885, + "grad_norm": 0.13417311012744904, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 133840 + }, + { + "epoch": 0.5094661358221112, + "grad_norm": 0.11586230248212814, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 133850 + }, + { + "epoch": 0.509504198290234, + "grad_norm": 0.12526260316371918, + "learning_rate": 0.0005, + "loss": 2.1384, + "step": 133860 + }, + { + "epoch": 0.5095422607583566, + "grad_norm": 0.13197200000286102, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 133870 + }, + { + "epoch": 0.5095803232264793, + "grad_norm": 0.1136879101395607, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 133880 + }, + { + "epoch": 0.5096183856946019, + "grad_norm": 0.11466697603464127, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 133890 + }, + { + "epoch": 0.5096564481627247, + "grad_norm": 0.12406015396118164, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 133900 + }, + { + "epoch": 0.5096945106308474, + "grad_norm": 0.1297469139099121, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 133910 + }, + { + "epoch": 0.50973257309897, + "grad_norm": 0.12095653265714645, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 133920 + }, + { + "epoch": 0.5097706355670927, + "grad_norm": 0.1300520896911621, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 133930 + }, + { + "epoch": 0.5098086980352154, + "grad_norm": 0.14236518740653992, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 133940 + }, + { + "epoch": 0.5098467605033381, + "grad_norm": 0.12811344861984253, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 133950 + }, + { + "epoch": 0.5098848229714608, + "grad_norm": 0.13053585588932037, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 133960 + }, + { + "epoch": 0.5099228854395834, + "grad_norm": 0.11776957660913467, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 133970 + }, + { + "epoch": 0.5099609479077061, + "grad_norm": 0.1257060021162033, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 133980 + }, + { + "epoch": 0.5099990103758288, + "grad_norm": 0.10968641936779022, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 133990 + }, + { + "epoch": 0.5100370728439515, + "grad_norm": 0.11820013076066971, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 134000 + }, + { + "epoch": 0.5100751353120742, + "grad_norm": 0.1360628604888916, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 134010 + }, + { + "epoch": 0.5101131977801968, + "grad_norm": 0.1447390764951706, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 134020 + }, + { + "epoch": 0.5101512602483196, + "grad_norm": 0.12482602894306183, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 134030 + }, + { + "epoch": 0.5101893227164422, + "grad_norm": 0.14182770252227783, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 134040 + }, + { + "epoch": 0.5102273851845649, + "grad_norm": 0.11726133525371552, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 134050 + }, + { + "epoch": 0.5102654476526876, + "grad_norm": 0.11940544843673706, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 134060 + }, + { + "epoch": 0.5103035101208103, + "grad_norm": 0.13671204447746277, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 134070 + }, + { + "epoch": 0.510341572588933, + "grad_norm": 0.13532045483589172, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 134080 + }, + { + "epoch": 0.5103796350570556, + "grad_norm": 0.1301160454750061, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 134090 + }, + { + "epoch": 0.5104176975251783, + "grad_norm": 0.11771665513515472, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 134100 + }, + { + "epoch": 0.510455759993301, + "grad_norm": 0.12012787908315659, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 134110 + }, + { + "epoch": 0.5104938224614237, + "grad_norm": 0.12731090188026428, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 134120 + }, + { + "epoch": 0.5105318849295464, + "grad_norm": 0.12644848227500916, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 134130 + }, + { + "epoch": 0.510569947397669, + "grad_norm": 0.12338414788246155, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 134140 + }, + { + "epoch": 0.5106080098657917, + "grad_norm": 0.12706167995929718, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 134150 + }, + { + "epoch": 0.5106460723339145, + "grad_norm": 0.1354827731847763, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 134160 + }, + { + "epoch": 0.5106841348020371, + "grad_norm": 0.12414322048425674, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 134170 + }, + { + "epoch": 0.5107221972701598, + "grad_norm": 0.11386504769325256, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 134180 + }, + { + "epoch": 0.5107602597382824, + "grad_norm": 0.11852803826332092, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 134190 + }, + { + "epoch": 0.5107983222064052, + "grad_norm": 0.13569186627864838, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 134200 + }, + { + "epoch": 0.5108363846745279, + "grad_norm": 0.11802306771278381, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 134210 + }, + { + "epoch": 0.5108744471426505, + "grad_norm": 0.13798528909683228, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 134220 + }, + { + "epoch": 0.5109125096107732, + "grad_norm": 0.12030106037855148, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 134230 + }, + { + "epoch": 0.5109505720788959, + "grad_norm": 0.13408568501472473, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 134240 + }, + { + "epoch": 0.5109886345470186, + "grad_norm": 0.14743518829345703, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 134250 + }, + { + "epoch": 0.5110266970151413, + "grad_norm": 0.1478193700313568, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 134260 + }, + { + "epoch": 0.5110647594832639, + "grad_norm": 0.12086143344640732, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 134270 + }, + { + "epoch": 0.5111028219513866, + "grad_norm": 0.126709446310997, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 134280 + }, + { + "epoch": 0.5111408844195093, + "grad_norm": 0.12228815257549286, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 134290 + }, + { + "epoch": 0.511178946887632, + "grad_norm": 0.11191216856241226, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 134300 + }, + { + "epoch": 0.5112170093557546, + "grad_norm": 0.12554562091827393, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 134310 + }, + { + "epoch": 0.5112550718238773, + "grad_norm": 0.12003730237483978, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 134320 + }, + { + "epoch": 0.5112931342920001, + "grad_norm": 0.130716934800148, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 134330 + }, + { + "epoch": 0.5113311967601227, + "grad_norm": 0.12484478950500488, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 134340 + }, + { + "epoch": 0.5113692592282454, + "grad_norm": 0.1287389099597931, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 134350 + }, + { + "epoch": 0.511407321696368, + "grad_norm": 0.121913380920887, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 134360 + }, + { + "epoch": 0.5114453841644908, + "grad_norm": 0.12113969773054123, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 134370 + }, + { + "epoch": 0.5114834466326135, + "grad_norm": 0.11920718103647232, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 134380 + }, + { + "epoch": 0.5115215091007361, + "grad_norm": 0.1182803213596344, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 134390 + }, + { + "epoch": 0.5115595715688588, + "grad_norm": 0.11739551275968552, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 134400 + }, + { + "epoch": 0.5115976340369814, + "grad_norm": 0.12292132526636124, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 134410 + }, + { + "epoch": 0.5116356965051042, + "grad_norm": 0.13265924155712128, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 134420 + }, + { + "epoch": 0.5116737589732269, + "grad_norm": 0.12459909170866013, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 134430 + }, + { + "epoch": 0.5117118214413495, + "grad_norm": 0.11901148408651352, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 134440 + }, + { + "epoch": 0.5117498839094722, + "grad_norm": 0.12968461215496063, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 134450 + }, + { + "epoch": 0.511787946377595, + "grad_norm": 0.13257619738578796, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 134460 + }, + { + "epoch": 0.5118260088457176, + "grad_norm": 0.11957278102636337, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 134470 + }, + { + "epoch": 0.5118640713138403, + "grad_norm": 0.1330718696117401, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 134480 + }, + { + "epoch": 0.5119021337819629, + "grad_norm": 0.12618844211101532, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 134490 + }, + { + "epoch": 0.5119401962500857, + "grad_norm": 0.1173240914940834, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 134500 + }, + { + "epoch": 0.5119782587182083, + "grad_norm": 0.11414072662591934, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 134510 + }, + { + "epoch": 0.512016321186331, + "grad_norm": 0.12122435122728348, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 134520 + }, + { + "epoch": 0.5120543836544537, + "grad_norm": 0.12351789325475693, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 134530 + }, + { + "epoch": 0.5120924461225763, + "grad_norm": 0.1277690827846527, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 134540 + }, + { + "epoch": 0.5121305085906991, + "grad_norm": 0.12026087939739227, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 134550 + }, + { + "epoch": 0.5121685710588217, + "grad_norm": 0.12273011356592178, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 134560 + }, + { + "epoch": 0.5122066335269444, + "grad_norm": 0.13297882676124573, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 134570 + }, + { + "epoch": 0.5122446959950671, + "grad_norm": 0.1275002360343933, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 134580 + }, + { + "epoch": 0.5122827584631898, + "grad_norm": 0.13620533049106598, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 134590 + }, + { + "epoch": 0.5123208209313125, + "grad_norm": 0.14599129557609558, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 134600 + }, + { + "epoch": 0.5123588833994351, + "grad_norm": 0.11535637825727463, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 134610 + }, + { + "epoch": 0.5123969458675578, + "grad_norm": 0.12820352613925934, + "learning_rate": 0.0005, + "loss": 2.0817, + "step": 134620 + }, + { + "epoch": 0.5124350083356806, + "grad_norm": 0.12669673562049866, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 134630 + }, + { + "epoch": 0.5124730708038032, + "grad_norm": 0.13037653267383575, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 134640 + }, + { + "epoch": 0.5125111332719259, + "grad_norm": 0.12768852710723877, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 134650 + }, + { + "epoch": 0.5125491957400485, + "grad_norm": 0.1387774795293808, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 134660 + }, + { + "epoch": 0.5125872582081713, + "grad_norm": 0.13335128128528595, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 134670 + }, + { + "epoch": 0.512625320676294, + "grad_norm": 0.14019380509853363, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 134680 + }, + { + "epoch": 0.5126633831444166, + "grad_norm": 0.13378937542438507, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 134690 + }, + { + "epoch": 0.5127014456125393, + "grad_norm": 0.11569223552942276, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 134700 + }, + { + "epoch": 0.5127395080806619, + "grad_norm": 0.12776020169258118, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 134710 + }, + { + "epoch": 0.5127775705487847, + "grad_norm": 0.1253037303686142, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 134720 + }, + { + "epoch": 0.5128156330169074, + "grad_norm": 0.1115291565656662, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 134730 + }, + { + "epoch": 0.51285369548503, + "grad_norm": 0.1221696212887764, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 134740 + }, + { + "epoch": 0.5128917579531527, + "grad_norm": 0.12305761128664017, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 134750 + }, + { + "epoch": 0.5129298204212754, + "grad_norm": 0.13608863949775696, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 134760 + }, + { + "epoch": 0.5129678828893981, + "grad_norm": 0.13254103064537048, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 134770 + }, + { + "epoch": 0.5130059453575208, + "grad_norm": 0.12625229358673096, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 134780 + }, + { + "epoch": 0.5130440078256434, + "grad_norm": 0.12322506308555603, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 134790 + }, + { + "epoch": 0.5130820702937662, + "grad_norm": 0.12616781890392303, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 134800 + }, + { + "epoch": 0.5131201327618888, + "grad_norm": 0.11625972390174866, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 134810 + }, + { + "epoch": 0.5131581952300115, + "grad_norm": 0.13064908981323242, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 134820 + }, + { + "epoch": 0.5131962576981342, + "grad_norm": 0.13486990332603455, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 134830 + }, + { + "epoch": 0.5132343201662568, + "grad_norm": 0.1420249342918396, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 134840 + }, + { + "epoch": 0.5132723826343796, + "grad_norm": 0.11938636749982834, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 134850 + }, + { + "epoch": 0.5133104451025022, + "grad_norm": 0.12307589501142502, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 134860 + }, + { + "epoch": 0.5133485075706249, + "grad_norm": 0.21486619114875793, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 134870 + }, + { + "epoch": 0.5133865700387475, + "grad_norm": 0.13930252194404602, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 134880 + }, + { + "epoch": 0.5134246325068703, + "grad_norm": 0.1287679225206375, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 134890 + }, + { + "epoch": 0.513462694974993, + "grad_norm": 0.12399435043334961, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 134900 + }, + { + "epoch": 0.5135007574431156, + "grad_norm": 0.1333046853542328, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 134910 + }, + { + "epoch": 0.5135388199112383, + "grad_norm": 0.12432458996772766, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 134920 + }, + { + "epoch": 0.513576882379361, + "grad_norm": 0.12399007380008698, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 134930 + }, + { + "epoch": 0.5136149448474837, + "grad_norm": 0.13366355001926422, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 134940 + }, + { + "epoch": 0.5136530073156064, + "grad_norm": 0.13235674798488617, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 134950 + }, + { + "epoch": 0.513691069783729, + "grad_norm": 0.12507614493370056, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 134960 + }, + { + "epoch": 0.5137291322518518, + "grad_norm": 0.11385700106620789, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 134970 + }, + { + "epoch": 0.5137671947199745, + "grad_norm": 0.12334541231393814, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 134980 + }, + { + "epoch": 0.5138052571880971, + "grad_norm": 0.11451227217912674, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 134990 + }, + { + "epoch": 0.5138433196562198, + "grad_norm": 0.10950082540512085, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 135000 + }, + { + "epoch": 0.5138813821243424, + "grad_norm": 0.11528286337852478, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 135010 + }, + { + "epoch": 0.5139194445924652, + "grad_norm": 0.12344331294298172, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 135020 + }, + { + "epoch": 0.5139575070605878, + "grad_norm": 0.12382146716117859, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 135030 + }, + { + "epoch": 0.5139955695287105, + "grad_norm": 0.1312481313943863, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 135040 + }, + { + "epoch": 0.5140336319968332, + "grad_norm": 0.11916206777095795, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 135050 + }, + { + "epoch": 0.5140716944649559, + "grad_norm": 0.11896311491727829, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 135060 + }, + { + "epoch": 0.5141097569330786, + "grad_norm": 0.11607718467712402, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 135070 + }, + { + "epoch": 0.5141478194012012, + "grad_norm": 0.12421748042106628, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 135080 + }, + { + "epoch": 0.5141858818693239, + "grad_norm": 0.11468444019556046, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 135090 + }, + { + "epoch": 0.5142239443374467, + "grad_norm": 0.12319518625736237, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 135100 + }, + { + "epoch": 0.5142620068055693, + "grad_norm": 0.12367475032806396, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 135110 + }, + { + "epoch": 0.514300069273692, + "grad_norm": 0.12323194742202759, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 135120 + }, + { + "epoch": 0.5143381317418146, + "grad_norm": 0.13855737447738647, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 135130 + }, + { + "epoch": 0.5143761942099373, + "grad_norm": 0.12522666156291962, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 135140 + }, + { + "epoch": 0.5144142566780601, + "grad_norm": 0.12855994701385498, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 135150 + }, + { + "epoch": 0.5144523191461827, + "grad_norm": 0.13384929299354553, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 135160 + }, + { + "epoch": 0.5144903816143054, + "grad_norm": 0.14026358723640442, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 135170 + }, + { + "epoch": 0.514528444082428, + "grad_norm": 0.1325213462114334, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 135180 + }, + { + "epoch": 0.5145665065505508, + "grad_norm": 0.12667711079120636, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 135190 + }, + { + "epoch": 0.5146045690186735, + "grad_norm": 0.1194978654384613, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 135200 + }, + { + "epoch": 0.5146426314867961, + "grad_norm": 0.12435804307460785, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 135210 + }, + { + "epoch": 0.5146806939549188, + "grad_norm": 0.12173999845981598, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 135220 + }, + { + "epoch": 0.5147187564230415, + "grad_norm": 0.12142159789800644, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 135230 + }, + { + "epoch": 0.5147568188911642, + "grad_norm": 0.12548331916332245, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 135240 + }, + { + "epoch": 0.5147948813592869, + "grad_norm": 0.13339272141456604, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 135250 + }, + { + "epoch": 0.5148329438274095, + "grad_norm": 0.13147401809692383, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 135260 + }, + { + "epoch": 0.5148710062955322, + "grad_norm": 0.132236048579216, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 135270 + }, + { + "epoch": 0.5149090687636549, + "grad_norm": 0.1356712132692337, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 135280 + }, + { + "epoch": 0.5149471312317776, + "grad_norm": 0.1267666220664978, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 135290 + }, + { + "epoch": 0.5149851936999003, + "grad_norm": 0.12242377549409866, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 135300 + }, + { + "epoch": 0.5150232561680229, + "grad_norm": 0.11466419696807861, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 135310 + }, + { + "epoch": 0.5150613186361457, + "grad_norm": 0.12359566986560822, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 135320 + }, + { + "epoch": 0.5150993811042683, + "grad_norm": 0.12927506864070892, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 135330 + }, + { + "epoch": 0.515137443572391, + "grad_norm": 0.12114574760198593, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 135340 + }, + { + "epoch": 0.5151755060405137, + "grad_norm": 0.1556452363729477, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 135350 + }, + { + "epoch": 0.5152135685086364, + "grad_norm": 0.13409800827503204, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 135360 + }, + { + "epoch": 0.5152516309767591, + "grad_norm": 0.1229565292596817, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 135370 + }, + { + "epoch": 0.5152896934448817, + "grad_norm": 0.12051396816968918, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 135380 + }, + { + "epoch": 0.5153277559130044, + "grad_norm": 0.12776722013950348, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 135390 + }, + { + "epoch": 0.5153658183811272, + "grad_norm": 0.12183408439159393, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 135400 + }, + { + "epoch": 0.5154038808492498, + "grad_norm": 0.12558986246585846, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 135410 + }, + { + "epoch": 0.5154419433173725, + "grad_norm": 0.12178440392017365, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 135420 + }, + { + "epoch": 0.5154800057854951, + "grad_norm": 0.11090958118438721, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 135430 + }, + { + "epoch": 0.5155180682536178, + "grad_norm": 0.12164914608001709, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 135440 + }, + { + "epoch": 0.5155561307217406, + "grad_norm": 0.12471088767051697, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 135450 + }, + { + "epoch": 0.5155941931898632, + "grad_norm": 0.11975961923599243, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 135460 + }, + { + "epoch": 0.5156322556579859, + "grad_norm": 0.1201275885105133, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 135470 + }, + { + "epoch": 0.5156703181261085, + "grad_norm": 0.1290825456380844, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 135480 + }, + { + "epoch": 0.5157083805942313, + "grad_norm": 0.12944602966308594, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 135490 + }, + { + "epoch": 0.515746443062354, + "grad_norm": 0.11592540889978409, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 135500 + }, + { + "epoch": 0.5157845055304766, + "grad_norm": 0.12791410088539124, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 135510 + }, + { + "epoch": 0.5158225679985993, + "grad_norm": 0.12224525213241577, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 135520 + }, + { + "epoch": 0.515860630466722, + "grad_norm": 0.11816330254077911, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 135530 + }, + { + "epoch": 0.5158986929348447, + "grad_norm": 0.11541763693094254, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 135540 + }, + { + "epoch": 0.5159367554029674, + "grad_norm": 0.13468272984027863, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 135550 + }, + { + "epoch": 0.51597481787109, + "grad_norm": 0.11575797945261002, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 135560 + }, + { + "epoch": 0.5160128803392127, + "grad_norm": 0.11992021650075912, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 135570 + }, + { + "epoch": 0.5160509428073354, + "grad_norm": 0.12182078510522842, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 135580 + }, + { + "epoch": 0.5160890052754581, + "grad_norm": 0.12293105572462082, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 135590 + }, + { + "epoch": 0.5161270677435807, + "grad_norm": 0.13838279247283936, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 135600 + }, + { + "epoch": 0.5161651302117034, + "grad_norm": 0.11866675317287445, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 135610 + }, + { + "epoch": 0.5162031926798262, + "grad_norm": 0.13188183307647705, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 135620 + }, + { + "epoch": 0.5162412551479488, + "grad_norm": 0.12662546336650848, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 135630 + }, + { + "epoch": 0.5162793176160715, + "grad_norm": 0.12657001614570618, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 135640 + }, + { + "epoch": 0.5163173800841941, + "grad_norm": 0.146830752491951, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 135650 + }, + { + "epoch": 0.5163554425523169, + "grad_norm": 0.12488903850317001, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 135660 + }, + { + "epoch": 0.5163935050204396, + "grad_norm": 0.12733852863311768, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 135670 + }, + { + "epoch": 0.5164315674885622, + "grad_norm": 0.12367139756679535, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 135680 + }, + { + "epoch": 0.5164696299566849, + "grad_norm": 0.1233339011669159, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 135690 + }, + { + "epoch": 0.5165076924248075, + "grad_norm": 0.12250369787216187, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 135700 + }, + { + "epoch": 0.5165457548929303, + "grad_norm": 0.127598375082016, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 135710 + }, + { + "epoch": 0.516583817361053, + "grad_norm": 0.12334663420915604, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 135720 + }, + { + "epoch": 0.5166218798291756, + "grad_norm": 0.12220164388418198, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 135730 + }, + { + "epoch": 0.5166599422972983, + "grad_norm": 0.11895354092121124, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 135740 + }, + { + "epoch": 0.516698004765421, + "grad_norm": 0.11887135356664658, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 135750 + }, + { + "epoch": 0.5167360672335437, + "grad_norm": 0.13403892517089844, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 135760 + }, + { + "epoch": 0.5167741297016664, + "grad_norm": 0.13431161642074585, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 135770 + }, + { + "epoch": 0.516812192169789, + "grad_norm": 0.1349843144416809, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 135780 + }, + { + "epoch": 0.5168502546379118, + "grad_norm": 0.26945772767066956, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 135790 + }, + { + "epoch": 0.5168883171060344, + "grad_norm": 0.11636281758546829, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 135800 + }, + { + "epoch": 0.5169263795741571, + "grad_norm": 0.11911586672067642, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 135810 + }, + { + "epoch": 0.5169644420422798, + "grad_norm": 0.1230466291308403, + "learning_rate": 0.0005, + "loss": 2.1302, + "step": 135820 + }, + { + "epoch": 0.5170025045104025, + "grad_norm": 0.11399305611848831, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 135830 + }, + { + "epoch": 0.5170405669785252, + "grad_norm": 0.12976102530956268, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 135840 + }, + { + "epoch": 0.5170786294466478, + "grad_norm": 0.122156523168087, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 135850 + }, + { + "epoch": 0.5171166919147705, + "grad_norm": 0.11330889910459518, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 135860 + }, + { + "epoch": 0.5171547543828932, + "grad_norm": 0.13504265248775482, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 135870 + }, + { + "epoch": 0.5171928168510159, + "grad_norm": 0.1345272660255432, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 135880 + }, + { + "epoch": 0.5172308793191386, + "grad_norm": 0.15253782272338867, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 135890 + }, + { + "epoch": 0.5172689417872612, + "grad_norm": 0.11856013536453247, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 135900 + }, + { + "epoch": 0.5173070042553839, + "grad_norm": 0.11281318217515945, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 135910 + }, + { + "epoch": 0.5173450667235067, + "grad_norm": 0.12678377330303192, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 135920 + }, + { + "epoch": 0.5173831291916293, + "grad_norm": 0.11883621662855148, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 135930 + }, + { + "epoch": 0.517421191659752, + "grad_norm": 0.13795502483844757, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 135940 + }, + { + "epoch": 0.5174592541278746, + "grad_norm": 0.12521414458751678, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 135950 + }, + { + "epoch": 0.5174973165959974, + "grad_norm": 0.1392013430595398, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 135960 + }, + { + "epoch": 0.5175353790641201, + "grad_norm": 0.13591915369033813, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 135970 + }, + { + "epoch": 0.5175734415322427, + "grad_norm": 0.13453233242034912, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 135980 + }, + { + "epoch": 0.5176115040003654, + "grad_norm": 0.12628062069416046, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 135990 + }, + { + "epoch": 0.517649566468488, + "grad_norm": 0.12293115258216858, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 136000 + }, + { + "epoch": 0.5176876289366108, + "grad_norm": 0.1296631395816803, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 136010 + }, + { + "epoch": 0.5177256914047335, + "grad_norm": 0.1257656365633011, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 136020 + }, + { + "epoch": 0.5177637538728561, + "grad_norm": 0.12673147022724152, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 136030 + }, + { + "epoch": 0.5178018163409788, + "grad_norm": 0.13222235441207886, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 136040 + }, + { + "epoch": 0.5178398788091015, + "grad_norm": 0.11780260503292084, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 136050 + }, + { + "epoch": 0.5178779412772242, + "grad_norm": 0.1233372688293457, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 136060 + }, + { + "epoch": 0.5179160037453469, + "grad_norm": 0.11666984856128693, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 136070 + }, + { + "epoch": 0.5179540662134695, + "grad_norm": 0.11872230470180511, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 136080 + }, + { + "epoch": 0.5179921286815923, + "grad_norm": 0.14317534863948822, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 136090 + }, + { + "epoch": 0.5180301911497149, + "grad_norm": 0.11910288780927658, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 136100 + }, + { + "epoch": 0.5180682536178376, + "grad_norm": 0.13141492009162903, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 136110 + }, + { + "epoch": 0.5181063160859603, + "grad_norm": 0.11873306334018707, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 136120 + }, + { + "epoch": 0.5181443785540829, + "grad_norm": 0.12319959700107574, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 136130 + }, + { + "epoch": 0.5181824410222057, + "grad_norm": 0.12672430276870728, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 136140 + }, + { + "epoch": 0.5182205034903283, + "grad_norm": 0.12143968045711517, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 136150 + }, + { + "epoch": 0.518258565958451, + "grad_norm": 0.1358950138092041, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 136160 + }, + { + "epoch": 0.5182966284265736, + "grad_norm": 0.11966950446367264, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 136170 + }, + { + "epoch": 0.5183346908946964, + "grad_norm": 0.13092869520187378, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 136180 + }, + { + "epoch": 0.5183727533628191, + "grad_norm": 0.13360098004341125, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 136190 + }, + { + "epoch": 0.5184108158309417, + "grad_norm": 0.1463603377342224, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 136200 + }, + { + "epoch": 0.5184488782990644, + "grad_norm": 0.12061507254838943, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 136210 + }, + { + "epoch": 0.5184869407671872, + "grad_norm": 0.12189993262290955, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 136220 + }, + { + "epoch": 0.5185250032353098, + "grad_norm": 0.11584679037332535, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 136230 + }, + { + "epoch": 0.5185630657034325, + "grad_norm": 0.12555517256259918, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 136240 + }, + { + "epoch": 0.5186011281715551, + "grad_norm": 0.12317892163991928, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 136250 + }, + { + "epoch": 0.5186391906396779, + "grad_norm": 0.12790797650814056, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 136260 + }, + { + "epoch": 0.5186772531078006, + "grad_norm": 0.13499782979488373, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 136270 + }, + { + "epoch": 0.5187153155759232, + "grad_norm": 0.13588197529315948, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 136280 + }, + { + "epoch": 0.5187533780440459, + "grad_norm": 0.12234912067651749, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 136290 + }, + { + "epoch": 0.5187914405121685, + "grad_norm": 0.1208055168390274, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 136300 + }, + { + "epoch": 0.5188295029802913, + "grad_norm": 0.12459038197994232, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 136310 + }, + { + "epoch": 0.518867565448414, + "grad_norm": 0.13378497958183289, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 136320 + }, + { + "epoch": 0.5189056279165366, + "grad_norm": 0.13146470487117767, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 136330 + }, + { + "epoch": 0.5189436903846593, + "grad_norm": 0.12453589588403702, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 136340 + }, + { + "epoch": 0.518981752852782, + "grad_norm": 0.13514123857021332, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 136350 + }, + { + "epoch": 0.5190198153209047, + "grad_norm": 0.12778690457344055, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 136360 + }, + { + "epoch": 0.5190578777890273, + "grad_norm": 0.32647815346717834, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 136370 + }, + { + "epoch": 0.51909594025715, + "grad_norm": 0.11649586260318756, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 136380 + }, + { + "epoch": 0.5191340027252728, + "grad_norm": 0.11900041252374649, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 136390 + }, + { + "epoch": 0.5191720651933954, + "grad_norm": 0.1257382035255432, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 136400 + }, + { + "epoch": 0.5192101276615181, + "grad_norm": 0.12999407947063446, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 136410 + }, + { + "epoch": 0.5192481901296407, + "grad_norm": 0.12291140109300613, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 136420 + }, + { + "epoch": 0.5192862525977634, + "grad_norm": 0.12455683946609497, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 136430 + }, + { + "epoch": 0.5193243150658862, + "grad_norm": 0.1276245266199112, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 136440 + }, + { + "epoch": 0.5193623775340088, + "grad_norm": 0.11303102225065231, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 136450 + }, + { + "epoch": 0.5194004400021315, + "grad_norm": 0.11451926827430725, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 136460 + }, + { + "epoch": 0.5194385024702541, + "grad_norm": 0.12597942352294922, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 136470 + }, + { + "epoch": 0.5194765649383769, + "grad_norm": 0.1587076187133789, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 136480 + }, + { + "epoch": 0.5195146274064996, + "grad_norm": 0.12485882639884949, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 136490 + }, + { + "epoch": 0.5195526898746222, + "grad_norm": 0.12216902524232864, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 136500 + }, + { + "epoch": 0.5195907523427449, + "grad_norm": 0.14184367656707764, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 136510 + }, + { + "epoch": 0.5196288148108676, + "grad_norm": 0.13335593044757843, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 136520 + }, + { + "epoch": 0.5196668772789903, + "grad_norm": 0.12494330108165741, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 136530 + }, + { + "epoch": 0.519704939747113, + "grad_norm": 0.1236383244395256, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 136540 + }, + { + "epoch": 0.5197430022152356, + "grad_norm": 0.12758508324623108, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 136550 + }, + { + "epoch": 0.5197810646833583, + "grad_norm": 0.13176749646663666, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 136560 + }, + { + "epoch": 0.519819127151481, + "grad_norm": 0.1244225725531578, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 136570 + }, + { + "epoch": 0.5198571896196037, + "grad_norm": 0.1265987902879715, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 136580 + }, + { + "epoch": 0.5198952520877264, + "grad_norm": 0.11397150158882141, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 136590 + }, + { + "epoch": 0.519933314555849, + "grad_norm": 0.11553878337144852, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 136600 + }, + { + "epoch": 0.5199713770239718, + "grad_norm": 0.11481893807649612, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 136610 + }, + { + "epoch": 0.5200094394920944, + "grad_norm": 0.12730897963047028, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 136620 + }, + { + "epoch": 0.5200475019602171, + "grad_norm": 0.1259535700082779, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 136630 + }, + { + "epoch": 0.5200855644283398, + "grad_norm": 0.12053762376308441, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 136640 + }, + { + "epoch": 0.5201236268964625, + "grad_norm": 0.13139915466308594, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 136650 + }, + { + "epoch": 0.5201616893645852, + "grad_norm": 0.11750254034996033, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 136660 + }, + { + "epoch": 0.5201997518327078, + "grad_norm": 0.12227505445480347, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 136670 + }, + { + "epoch": 0.5202378143008305, + "grad_norm": 0.1355392038822174, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 136680 + }, + { + "epoch": 0.5202758767689533, + "grad_norm": 0.13740362226963043, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 136690 + }, + { + "epoch": 0.5203139392370759, + "grad_norm": 0.1517869085073471, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 136700 + }, + { + "epoch": 0.5203520017051986, + "grad_norm": 0.12694501876831055, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 136710 + }, + { + "epoch": 0.5203900641733212, + "grad_norm": 0.11573896557092667, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 136720 + }, + { + "epoch": 0.5204281266414439, + "grad_norm": 0.12164445966482162, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 136730 + }, + { + "epoch": 0.5204661891095667, + "grad_norm": 0.12111037224531174, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 136740 + }, + { + "epoch": 0.5205042515776893, + "grad_norm": 0.11075384169816971, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 136750 + }, + { + "epoch": 0.520542314045812, + "grad_norm": 0.13149183988571167, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 136760 + }, + { + "epoch": 0.5205803765139346, + "grad_norm": 0.1264672875404358, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 136770 + }, + { + "epoch": 0.5206184389820574, + "grad_norm": 0.13758137822151184, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 136780 + }, + { + "epoch": 0.52065650145018, + "grad_norm": 0.11698931455612183, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 136790 + }, + { + "epoch": 0.5206945639183027, + "grad_norm": 0.131836399435997, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 136800 + }, + { + "epoch": 0.5207326263864254, + "grad_norm": 0.12469332665205002, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 136810 + }, + { + "epoch": 0.5207706888545481, + "grad_norm": 0.13206984102725983, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 136820 + }, + { + "epoch": 0.5208087513226708, + "grad_norm": 0.12733477354049683, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 136830 + }, + { + "epoch": 0.5208468137907935, + "grad_norm": 0.12322621047496796, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 136840 + }, + { + "epoch": 0.5208848762589161, + "grad_norm": 0.13477306067943573, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 136850 + }, + { + "epoch": 0.5209229387270388, + "grad_norm": 0.11726487427949905, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 136860 + }, + { + "epoch": 0.5209610011951615, + "grad_norm": 0.13089509308338165, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 136870 + }, + { + "epoch": 0.5209990636632842, + "grad_norm": 0.13881553709506989, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 136880 + }, + { + "epoch": 0.5210371261314068, + "grad_norm": 0.1210898607969284, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 136890 + }, + { + "epoch": 0.5210751885995295, + "grad_norm": 0.12279921770095825, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 136900 + }, + { + "epoch": 0.5211132510676523, + "grad_norm": 0.12662425637245178, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 136910 + }, + { + "epoch": 0.5211513135357749, + "grad_norm": 0.11807191371917725, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 136920 + }, + { + "epoch": 0.5211893760038976, + "grad_norm": 0.12020532786846161, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 136930 + }, + { + "epoch": 0.5212274384720202, + "grad_norm": 0.12616896629333496, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 136940 + }, + { + "epoch": 0.521265500940143, + "grad_norm": 0.13148759305477142, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 136950 + }, + { + "epoch": 0.5213035634082657, + "grad_norm": 0.1483425796031952, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 136960 + }, + { + "epoch": 0.5213416258763883, + "grad_norm": 0.1538805514574051, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 136970 + }, + { + "epoch": 0.521379688344511, + "grad_norm": 0.11839305609464645, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 136980 + }, + { + "epoch": 0.5214177508126336, + "grad_norm": 0.12727127969264984, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 136990 + }, + { + "epoch": 0.5214558132807564, + "grad_norm": 0.12757115066051483, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 137000 + }, + { + "epoch": 0.5214938757488791, + "grad_norm": 0.1204015463590622, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 137010 + }, + { + "epoch": 0.5215319382170017, + "grad_norm": 0.14261414110660553, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 137020 + }, + { + "epoch": 0.5215700006851244, + "grad_norm": 0.14065109193325043, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 137030 + }, + { + "epoch": 0.5216080631532471, + "grad_norm": 0.13176940381526947, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 137040 + }, + { + "epoch": 0.5216461256213698, + "grad_norm": 0.14611373841762543, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 137050 + }, + { + "epoch": 0.5216841880894925, + "grad_norm": 0.1353708803653717, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 137060 + }, + { + "epoch": 0.5217222505576151, + "grad_norm": 0.12236816436052322, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 137070 + }, + { + "epoch": 0.5217603130257379, + "grad_norm": 0.12845854461193085, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 137080 + }, + { + "epoch": 0.5217983754938605, + "grad_norm": 0.13327331840991974, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 137090 + }, + { + "epoch": 0.5218364379619832, + "grad_norm": 0.1174556240439415, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 137100 + }, + { + "epoch": 0.5218745004301059, + "grad_norm": 0.11884109675884247, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 137110 + }, + { + "epoch": 0.5219125628982286, + "grad_norm": 0.13140694797039032, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 137120 + }, + { + "epoch": 0.5219506253663513, + "grad_norm": 0.1556568294763565, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 137130 + }, + { + "epoch": 0.5219886878344739, + "grad_norm": 0.14347811043262482, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 137140 + }, + { + "epoch": 0.5220267503025966, + "grad_norm": 0.1311759650707245, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 137150 + }, + { + "epoch": 0.5220648127707193, + "grad_norm": 0.12397903949022293, + "learning_rate": 0.0005, + "loss": 2.1478, + "step": 137160 + }, + { + "epoch": 0.522102875238842, + "grad_norm": 0.12823760509490967, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 137170 + }, + { + "epoch": 0.5221409377069647, + "grad_norm": 0.12383680045604706, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 137180 + }, + { + "epoch": 0.5221790001750873, + "grad_norm": 0.12665867805480957, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 137190 + }, + { + "epoch": 0.52221706264321, + "grad_norm": 0.11916995048522949, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 137200 + }, + { + "epoch": 0.5222551251113328, + "grad_norm": 0.12199389934539795, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 137210 + }, + { + "epoch": 0.5222931875794554, + "grad_norm": 0.13059760630130768, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 137220 + }, + { + "epoch": 0.5223312500475781, + "grad_norm": 0.12276890873908997, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 137230 + }, + { + "epoch": 0.5223693125157007, + "grad_norm": 0.12334048002958298, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 137240 + }, + { + "epoch": 0.5224073749838235, + "grad_norm": 0.13500618934631348, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 137250 + }, + { + "epoch": 0.5224454374519462, + "grad_norm": 0.13345398008823395, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 137260 + }, + { + "epoch": 0.5224834999200688, + "grad_norm": 0.12334723025560379, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 137270 + }, + { + "epoch": 0.5225215623881915, + "grad_norm": 0.13009530305862427, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 137280 + }, + { + "epoch": 0.5225596248563141, + "grad_norm": 0.13310536742210388, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 137290 + }, + { + "epoch": 0.5225976873244369, + "grad_norm": 0.11736752837896347, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 137300 + }, + { + "epoch": 0.5226357497925596, + "grad_norm": 0.12839005887508392, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 137310 + }, + { + "epoch": 0.5226738122606822, + "grad_norm": 0.11895774304866791, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 137320 + }, + { + "epoch": 0.5227118747288049, + "grad_norm": 0.14122259616851807, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 137330 + }, + { + "epoch": 0.5227499371969276, + "grad_norm": 0.12385997921228409, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 137340 + }, + { + "epoch": 0.5227879996650503, + "grad_norm": 0.12733817100524902, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 137350 + }, + { + "epoch": 0.522826062133173, + "grad_norm": 0.12660536170005798, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 137360 + }, + { + "epoch": 0.5228641246012956, + "grad_norm": 0.12909385561943054, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 137370 + }, + { + "epoch": 0.5229021870694184, + "grad_norm": 0.12075277417898178, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 137380 + }, + { + "epoch": 0.522940249537541, + "grad_norm": 0.1270180195569992, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 137390 + }, + { + "epoch": 0.5229783120056637, + "grad_norm": 0.13144104182720184, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 137400 + }, + { + "epoch": 0.5230163744737863, + "grad_norm": 0.12336520850658417, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 137410 + }, + { + "epoch": 0.523054436941909, + "grad_norm": 0.135188028216362, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 137420 + }, + { + "epoch": 0.5230924994100318, + "grad_norm": 0.13050761818885803, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 137430 + }, + { + "epoch": 0.5231305618781544, + "grad_norm": 0.12770605087280273, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 137440 + }, + { + "epoch": 0.5231686243462771, + "grad_norm": 0.15316686034202576, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 137450 + }, + { + "epoch": 0.5232066868143997, + "grad_norm": 0.13354888558387756, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 137460 + }, + { + "epoch": 0.5232447492825225, + "grad_norm": 0.13278037309646606, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 137470 + }, + { + "epoch": 0.5232828117506452, + "grad_norm": 0.1274474710226059, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 137480 + }, + { + "epoch": 0.5233208742187678, + "grad_norm": 0.12818734347820282, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 137490 + }, + { + "epoch": 0.5233589366868905, + "grad_norm": 0.12248261272907257, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 137500 + }, + { + "epoch": 0.5233969991550133, + "grad_norm": 0.12802907824516296, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 137510 + }, + { + "epoch": 0.5234350616231359, + "grad_norm": 0.1282433569431305, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 137520 + }, + { + "epoch": 0.5234731240912586, + "grad_norm": 0.12959887087345123, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 137530 + }, + { + "epoch": 0.5235111865593812, + "grad_norm": 0.12287507951259613, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 137540 + }, + { + "epoch": 0.523549249027504, + "grad_norm": 0.1263885498046875, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 137550 + }, + { + "epoch": 0.5235873114956267, + "grad_norm": 0.13547179102897644, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 137560 + }, + { + "epoch": 0.5236253739637493, + "grad_norm": 0.11977437883615494, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 137570 + }, + { + "epoch": 0.523663436431872, + "grad_norm": 0.12466085702180862, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 137580 + }, + { + "epoch": 0.5237014988999946, + "grad_norm": 0.12097606062889099, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 137590 + }, + { + "epoch": 0.5237395613681174, + "grad_norm": 0.12236282974481583, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 137600 + }, + { + "epoch": 0.52377762383624, + "grad_norm": 0.1352507621049881, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 137610 + }, + { + "epoch": 0.5238156863043627, + "grad_norm": 0.13352009654045105, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 137620 + }, + { + "epoch": 0.5238537487724854, + "grad_norm": 0.1264696568250656, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 137630 + }, + { + "epoch": 0.5238918112406081, + "grad_norm": 0.11878474056720734, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 137640 + }, + { + "epoch": 0.5239298737087308, + "grad_norm": 0.11161357909440994, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 137650 + }, + { + "epoch": 0.5239679361768534, + "grad_norm": 0.12533272802829742, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 137660 + }, + { + "epoch": 0.5240059986449761, + "grad_norm": 0.13127167522907257, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 137670 + }, + { + "epoch": 0.5240440611130989, + "grad_norm": 0.15212424099445343, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 137680 + }, + { + "epoch": 0.5240821235812215, + "grad_norm": 0.14096777141094208, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 137690 + }, + { + "epoch": 0.5241201860493442, + "grad_norm": 0.13191179931163788, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 137700 + }, + { + "epoch": 0.5241582485174668, + "grad_norm": 0.1232769638299942, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 137710 + }, + { + "epoch": 0.5241963109855895, + "grad_norm": 0.13931381702423096, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 137720 + }, + { + "epoch": 0.5242343734537123, + "grad_norm": 0.12155848741531372, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 137730 + }, + { + "epoch": 0.5242724359218349, + "grad_norm": 0.12789209187030792, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 137740 + }, + { + "epoch": 0.5243104983899576, + "grad_norm": 0.12094086408615112, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 137750 + }, + { + "epoch": 0.5243485608580802, + "grad_norm": 0.13228756189346313, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 137760 + }, + { + "epoch": 0.524386623326203, + "grad_norm": 0.12024088203907013, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 137770 + }, + { + "epoch": 0.5244246857943257, + "grad_norm": 0.1733071357011795, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 137780 + }, + { + "epoch": 0.5244627482624483, + "grad_norm": 0.22339996695518494, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 137790 + }, + { + "epoch": 0.524500810730571, + "grad_norm": 0.12036958336830139, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 137800 + }, + { + "epoch": 0.5245388731986937, + "grad_norm": 0.12528793513774872, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 137810 + }, + { + "epoch": 0.5245769356668164, + "grad_norm": 0.11740902066230774, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 137820 + }, + { + "epoch": 0.5246149981349391, + "grad_norm": 0.13410566747188568, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 137830 + }, + { + "epoch": 0.5246530606030617, + "grad_norm": 0.11765021085739136, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 137840 + }, + { + "epoch": 0.5246911230711844, + "grad_norm": 0.11986323446035385, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 137850 + }, + { + "epoch": 0.5247291855393071, + "grad_norm": 0.1241866871714592, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 137860 + }, + { + "epoch": 0.5247672480074298, + "grad_norm": 0.13713204860687256, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 137870 + }, + { + "epoch": 0.5248053104755525, + "grad_norm": 0.127635657787323, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 137880 + }, + { + "epoch": 0.5248433729436751, + "grad_norm": 0.11746551096439362, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 137890 + }, + { + "epoch": 0.5248814354117979, + "grad_norm": 0.1233421340584755, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 137900 + }, + { + "epoch": 0.5249194978799205, + "grad_norm": 0.12225458770990372, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 137910 + }, + { + "epoch": 0.5249575603480432, + "grad_norm": 0.14724929630756378, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 137920 + }, + { + "epoch": 0.5249956228161659, + "grad_norm": 0.1190890446305275, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 137930 + }, + { + "epoch": 0.5250336852842886, + "grad_norm": 0.12628717720508575, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 137940 + }, + { + "epoch": 0.5250717477524113, + "grad_norm": 0.11910507082939148, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 137950 + }, + { + "epoch": 0.5251098102205339, + "grad_norm": 0.11877349019050598, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 137960 + }, + { + "epoch": 0.5251478726886566, + "grad_norm": 0.1267973929643631, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 137970 + }, + { + "epoch": 0.5251859351567794, + "grad_norm": 0.1362115740776062, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 137980 + }, + { + "epoch": 0.525223997624902, + "grad_norm": 0.12277247756719589, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 137990 + }, + { + "epoch": 0.5252620600930247, + "grad_norm": 0.12753120064735413, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 138000 + }, + { + "epoch": 0.5253001225611473, + "grad_norm": 0.12616737186908722, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 138010 + }, + { + "epoch": 0.52533818502927, + "grad_norm": 0.11854333430528641, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 138020 + }, + { + "epoch": 0.5253762474973928, + "grad_norm": 0.12479977309703827, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 138030 + }, + { + "epoch": 0.5254143099655154, + "grad_norm": 0.13225720822811127, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 138040 + }, + { + "epoch": 0.5254523724336381, + "grad_norm": 0.13469231128692627, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 138050 + }, + { + "epoch": 0.5254904349017607, + "grad_norm": 0.1489909440279007, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 138060 + }, + { + "epoch": 0.5255284973698835, + "grad_norm": 0.12749110162258148, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 138070 + }, + { + "epoch": 0.5255665598380062, + "grad_norm": 0.13680586218833923, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 138080 + }, + { + "epoch": 0.5256046223061288, + "grad_norm": 0.12244150042533875, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 138090 + }, + { + "epoch": 0.5256426847742515, + "grad_norm": 0.12733541429042816, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 138100 + }, + { + "epoch": 0.5256807472423742, + "grad_norm": 0.11963535845279694, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 138110 + }, + { + "epoch": 0.5257188097104969, + "grad_norm": 0.12353172153234482, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 138120 + }, + { + "epoch": 0.5257568721786195, + "grad_norm": 0.11347753554582596, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 138130 + }, + { + "epoch": 0.5257949346467422, + "grad_norm": 0.12878623604774475, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 138140 + }, + { + "epoch": 0.5258329971148649, + "grad_norm": 0.14339739084243774, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 138150 + }, + { + "epoch": 0.5258710595829876, + "grad_norm": 0.12231696397066116, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 138160 + }, + { + "epoch": 0.5259091220511103, + "grad_norm": 0.12762510776519775, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 138170 + }, + { + "epoch": 0.525947184519233, + "grad_norm": 0.11546134203672409, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 138180 + }, + { + "epoch": 0.5259852469873556, + "grad_norm": 0.11776210367679596, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 138190 + }, + { + "epoch": 0.5260233094554784, + "grad_norm": 0.13029833137989044, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 138200 + }, + { + "epoch": 0.526061371923601, + "grad_norm": 0.12698499858379364, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 138210 + }, + { + "epoch": 0.5260994343917237, + "grad_norm": 0.12059946358203888, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 138220 + }, + { + "epoch": 0.5261374968598463, + "grad_norm": 0.13680903613567352, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 138230 + }, + { + "epoch": 0.5261755593279691, + "grad_norm": 0.12311997264623642, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 138240 + }, + { + "epoch": 0.5262136217960918, + "grad_norm": 0.13110458850860596, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 138250 + }, + { + "epoch": 0.5262516842642144, + "grad_norm": 0.12085889279842377, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 138260 + }, + { + "epoch": 0.5262897467323371, + "grad_norm": 0.1268099844455719, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 138270 + }, + { + "epoch": 0.5263278092004597, + "grad_norm": 0.1265743523836136, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 138280 + }, + { + "epoch": 0.5263658716685825, + "grad_norm": 0.12535488605499268, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 138290 + }, + { + "epoch": 0.5264039341367052, + "grad_norm": 0.12558478116989136, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 138300 + }, + { + "epoch": 0.5264419966048278, + "grad_norm": 0.13227242231369019, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 138310 + }, + { + "epoch": 0.5264800590729505, + "grad_norm": 0.12063269317150116, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 138320 + }, + { + "epoch": 0.5265181215410732, + "grad_norm": 0.13169559836387634, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 138330 + }, + { + "epoch": 0.5265561840091959, + "grad_norm": 0.1179480329155922, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 138340 + }, + { + "epoch": 0.5265942464773186, + "grad_norm": 0.12810704112052917, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 138350 + }, + { + "epoch": 0.5266323089454412, + "grad_norm": 0.13122254610061646, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 138360 + }, + { + "epoch": 0.526670371413564, + "grad_norm": 0.13543862104415894, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 138370 + }, + { + "epoch": 0.5267084338816866, + "grad_norm": 0.12366582453250885, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 138380 + }, + { + "epoch": 0.5267464963498093, + "grad_norm": 0.13607007265090942, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 138390 + }, + { + "epoch": 0.526784558817932, + "grad_norm": 0.1335546374320984, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 138400 + }, + { + "epoch": 0.5268226212860547, + "grad_norm": 0.12716715037822723, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 138410 + }, + { + "epoch": 0.5268606837541774, + "grad_norm": 0.1355588138103485, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 138420 + }, + { + "epoch": 0.5268987462223, + "grad_norm": 0.11754409968852997, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 138430 + }, + { + "epoch": 0.5269368086904227, + "grad_norm": 0.1135902926325798, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 138440 + }, + { + "epoch": 0.5269748711585454, + "grad_norm": 0.12047741562128067, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 138450 + }, + { + "epoch": 0.5270129336266681, + "grad_norm": 0.12416040897369385, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 138460 + }, + { + "epoch": 0.5270509960947908, + "grad_norm": 0.15139645338058472, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 138470 + }, + { + "epoch": 0.5270890585629134, + "grad_norm": 0.12424907088279724, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 138480 + }, + { + "epoch": 0.5271271210310361, + "grad_norm": 0.1323985457420349, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 138490 + }, + { + "epoch": 0.5271651834991589, + "grad_norm": 0.12402381002902985, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 138500 + }, + { + "epoch": 0.5272032459672815, + "grad_norm": 0.11723242700099945, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 138510 + }, + { + "epoch": 0.5272413084354042, + "grad_norm": 0.12278267741203308, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 138520 + }, + { + "epoch": 0.5272793709035268, + "grad_norm": 0.13405431807041168, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 138530 + }, + { + "epoch": 0.5273174333716496, + "grad_norm": 0.12649041414260864, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 138540 + }, + { + "epoch": 0.5273554958397723, + "grad_norm": 0.13687849044799805, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 138550 + }, + { + "epoch": 0.5273935583078949, + "grad_norm": 0.12157398462295532, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 138560 + }, + { + "epoch": 0.5274316207760176, + "grad_norm": 0.1232946366071701, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 138570 + }, + { + "epoch": 0.5274696832441402, + "grad_norm": 0.12800607085227966, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 138580 + }, + { + "epoch": 0.527507745712263, + "grad_norm": 0.14093177020549774, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 138590 + }, + { + "epoch": 0.5275458081803857, + "grad_norm": 0.13657447695732117, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 138600 + }, + { + "epoch": 0.5275838706485083, + "grad_norm": 0.12304209172725677, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 138610 + }, + { + "epoch": 0.527621933116631, + "grad_norm": 0.11970432847738266, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 138620 + }, + { + "epoch": 0.5276599955847537, + "grad_norm": 0.12269636243581772, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 138630 + }, + { + "epoch": 0.5276980580528764, + "grad_norm": 0.1293053925037384, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 138640 + }, + { + "epoch": 0.527736120520999, + "grad_norm": 0.11836563050746918, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 138650 + }, + { + "epoch": 0.5277741829891217, + "grad_norm": 0.12414873391389847, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 138660 + }, + { + "epoch": 0.5278122454572445, + "grad_norm": 0.14319206774234772, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 138670 + }, + { + "epoch": 0.5278503079253671, + "grad_norm": 0.13531246781349182, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 138680 + }, + { + "epoch": 0.5278883703934898, + "grad_norm": 0.13123445212841034, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 138690 + }, + { + "epoch": 0.5279264328616124, + "grad_norm": 0.11836890876293182, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 138700 + }, + { + "epoch": 0.5279644953297351, + "grad_norm": 0.11703275889158249, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 138710 + }, + { + "epoch": 0.5280025577978579, + "grad_norm": 0.12264472991228104, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 138720 + }, + { + "epoch": 0.5280406202659805, + "grad_norm": 0.12100914865732193, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 138730 + }, + { + "epoch": 0.5280786827341032, + "grad_norm": 0.12464544177055359, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 138740 + }, + { + "epoch": 0.5281167452022258, + "grad_norm": 0.13402172923088074, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 138750 + }, + { + "epoch": 0.5281548076703486, + "grad_norm": 0.13545958697795868, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 138760 + }, + { + "epoch": 0.5281928701384713, + "grad_norm": 0.11684264242649078, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 138770 + }, + { + "epoch": 0.5282309326065939, + "grad_norm": 0.15134216845035553, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 138780 + }, + { + "epoch": 0.5282689950747166, + "grad_norm": 0.12242384999990463, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 138790 + }, + { + "epoch": 0.5283070575428394, + "grad_norm": 0.12735465168952942, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 138800 + }, + { + "epoch": 0.528345120010962, + "grad_norm": 0.12246635556221008, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 138810 + }, + { + "epoch": 0.5283831824790847, + "grad_norm": 0.1216377466917038, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 138820 + }, + { + "epoch": 0.5284212449472073, + "grad_norm": 0.1245758906006813, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 138830 + }, + { + "epoch": 0.5284593074153301, + "grad_norm": 0.13611829280853271, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 138840 + }, + { + "epoch": 0.5284973698834528, + "grad_norm": 0.12166754901409149, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 138850 + }, + { + "epoch": 0.5285354323515754, + "grad_norm": 0.12426643818616867, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 138860 + }, + { + "epoch": 0.5285734948196981, + "grad_norm": 0.15440742671489716, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 138870 + }, + { + "epoch": 0.5286115572878207, + "grad_norm": 0.11902732402086258, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 138880 + }, + { + "epoch": 0.5286496197559435, + "grad_norm": 0.1373627632856369, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 138890 + }, + { + "epoch": 0.5286876822240661, + "grad_norm": 0.1300365924835205, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 138900 + }, + { + "epoch": 0.5287257446921888, + "grad_norm": 0.1294863224029541, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 138910 + }, + { + "epoch": 0.5287638071603115, + "grad_norm": 0.12608158588409424, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 138920 + }, + { + "epoch": 0.5288018696284342, + "grad_norm": 0.12034744024276733, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 138930 + }, + { + "epoch": 0.5288399320965569, + "grad_norm": 0.11805101484060287, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 138940 + }, + { + "epoch": 0.5288779945646795, + "grad_norm": 0.12345664203166962, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 138950 + }, + { + "epoch": 0.5289160570328022, + "grad_norm": 0.1263914555311203, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 138960 + }, + { + "epoch": 0.528954119500925, + "grad_norm": 0.11450544744729996, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 138970 + }, + { + "epoch": 0.5289921819690476, + "grad_norm": 0.125608429312706, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 138980 + }, + { + "epoch": 0.5290302444371703, + "grad_norm": 0.12446246296167374, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 138990 + }, + { + "epoch": 0.5290683069052929, + "grad_norm": 0.13462740182876587, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 139000 + }, + { + "epoch": 0.5291063693734156, + "grad_norm": 0.14255592226982117, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 139010 + }, + { + "epoch": 0.5291444318415384, + "grad_norm": 0.11985404044389725, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 139020 + }, + { + "epoch": 0.529182494309661, + "grad_norm": 0.12564119696617126, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 139030 + }, + { + "epoch": 0.5292205567777837, + "grad_norm": 0.13089881837368011, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 139040 + }, + { + "epoch": 0.5292586192459063, + "grad_norm": 0.11932288110256195, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 139050 + }, + { + "epoch": 0.5292966817140291, + "grad_norm": 0.1306738406419754, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 139060 + }, + { + "epoch": 0.5293347441821518, + "grad_norm": 0.11912896484136581, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 139070 + }, + { + "epoch": 0.5293728066502744, + "grad_norm": 0.12531140446662903, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 139080 + }, + { + "epoch": 0.5294108691183971, + "grad_norm": 0.12904711067676544, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 139090 + }, + { + "epoch": 0.5294489315865198, + "grad_norm": 0.12961961328983307, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 139100 + }, + { + "epoch": 0.5294869940546425, + "grad_norm": 0.1254771202802658, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 139110 + }, + { + "epoch": 0.5295250565227652, + "grad_norm": 0.1275731474161148, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 139120 + }, + { + "epoch": 0.5295631189908878, + "grad_norm": 0.1152644231915474, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 139130 + }, + { + "epoch": 0.5296011814590105, + "grad_norm": 0.12683624029159546, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 139140 + }, + { + "epoch": 0.5296392439271332, + "grad_norm": 0.13341467082500458, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 139150 + }, + { + "epoch": 0.5296773063952559, + "grad_norm": 0.1260858178138733, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 139160 + }, + { + "epoch": 0.5297153688633786, + "grad_norm": 0.13160032033920288, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 139170 + }, + { + "epoch": 0.5297534313315012, + "grad_norm": 0.12412998825311661, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 139180 + }, + { + "epoch": 0.529791493799624, + "grad_norm": 0.12666431069374084, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 139190 + }, + { + "epoch": 0.5298295562677466, + "grad_norm": 0.13844729959964752, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 139200 + }, + { + "epoch": 0.5298676187358693, + "grad_norm": 0.1178123876452446, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 139210 + }, + { + "epoch": 0.529905681203992, + "grad_norm": 0.1174953356385231, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 139220 + }, + { + "epoch": 0.5299437436721147, + "grad_norm": 0.12705713510513306, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 139230 + }, + { + "epoch": 0.5299818061402374, + "grad_norm": 0.13195791840553284, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 139240 + }, + { + "epoch": 0.53001986860836, + "grad_norm": 0.12599407136440277, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 139250 + }, + { + "epoch": 0.5300579310764827, + "grad_norm": 0.1358048915863037, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 139260 + }, + { + "epoch": 0.5300959935446055, + "grad_norm": 0.12501151859760284, + "learning_rate": 0.0005, + "loss": 2.1429, + "step": 139270 + }, + { + "epoch": 0.5301340560127281, + "grad_norm": 0.1353365033864975, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 139280 + }, + { + "epoch": 0.5301721184808508, + "grad_norm": 0.1324470341205597, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 139290 + }, + { + "epoch": 0.5302101809489734, + "grad_norm": 0.12147196382284164, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 139300 + }, + { + "epoch": 0.5302482434170961, + "grad_norm": 0.12834204733371735, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 139310 + }, + { + "epoch": 0.5302863058852189, + "grad_norm": 0.13552463054656982, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 139320 + }, + { + "epoch": 0.5303243683533415, + "grad_norm": 0.13295625150203705, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 139330 + }, + { + "epoch": 0.5303624308214642, + "grad_norm": 0.13045884668827057, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 139340 + }, + { + "epoch": 0.5304004932895868, + "grad_norm": 0.12231920659542084, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 139350 + }, + { + "epoch": 0.5304385557577096, + "grad_norm": 0.12363464385271072, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 139360 + }, + { + "epoch": 0.5304766182258323, + "grad_norm": 0.15936076641082764, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 139370 + }, + { + "epoch": 0.5305146806939549, + "grad_norm": 0.13969101011753082, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 139380 + }, + { + "epoch": 0.5305527431620776, + "grad_norm": 0.11162112653255463, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 139390 + }, + { + "epoch": 0.5305908056302003, + "grad_norm": 0.12614837288856506, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 139400 + }, + { + "epoch": 0.530628868098323, + "grad_norm": 0.13985757529735565, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 139410 + }, + { + "epoch": 0.5306669305664456, + "grad_norm": 0.13297826051712036, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 139420 + }, + { + "epoch": 0.5307049930345683, + "grad_norm": 0.11873283982276917, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 139430 + }, + { + "epoch": 0.530743055502691, + "grad_norm": 0.13034650683403015, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 139440 + }, + { + "epoch": 0.5307811179708137, + "grad_norm": 0.1307368278503418, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 139450 + }, + { + "epoch": 0.5308191804389364, + "grad_norm": 0.12589477002620697, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 139460 + }, + { + "epoch": 0.530857242907059, + "grad_norm": 0.1286430060863495, + "learning_rate": 0.0005, + "loss": 2.1324, + "step": 139470 + }, + { + "epoch": 0.5308953053751817, + "grad_norm": 0.1236337199807167, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 139480 + }, + { + "epoch": 0.5309333678433045, + "grad_norm": 0.11303827166557312, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 139490 + }, + { + "epoch": 0.5309714303114271, + "grad_norm": 0.13597261905670166, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 139500 + }, + { + "epoch": 0.5310094927795498, + "grad_norm": 0.12540364265441895, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 139510 + }, + { + "epoch": 0.5310475552476724, + "grad_norm": 0.13377848267555237, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 139520 + }, + { + "epoch": 0.5310856177157952, + "grad_norm": 0.13392139971256256, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 139530 + }, + { + "epoch": 0.5311236801839179, + "grad_norm": 0.12368487566709518, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 139540 + }, + { + "epoch": 0.5311617426520405, + "grad_norm": 0.1207844540476799, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 139550 + }, + { + "epoch": 0.5311998051201632, + "grad_norm": 0.12541265785694122, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 139560 + }, + { + "epoch": 0.531237867588286, + "grad_norm": 0.12517641484737396, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 139570 + }, + { + "epoch": 0.5312759300564086, + "grad_norm": 0.12465142458677292, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 139580 + }, + { + "epoch": 0.5313139925245313, + "grad_norm": 0.13334906101226807, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 139590 + }, + { + "epoch": 0.5313520549926539, + "grad_norm": 0.13244402408599854, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 139600 + }, + { + "epoch": 0.5313901174607766, + "grad_norm": 0.1352616250514984, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 139610 + }, + { + "epoch": 0.5314281799288993, + "grad_norm": 0.13310536742210388, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 139620 + }, + { + "epoch": 0.531466242397022, + "grad_norm": 0.13594430685043335, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 139630 + }, + { + "epoch": 0.5315043048651447, + "grad_norm": 0.12137679010629654, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 139640 + }, + { + "epoch": 0.5315423673332673, + "grad_norm": 0.12929204106330872, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 139650 + }, + { + "epoch": 0.5315804298013901, + "grad_norm": 0.13630947470664978, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 139660 + }, + { + "epoch": 0.5316184922695127, + "grad_norm": 0.12881417572498322, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 139670 + }, + { + "epoch": 0.5316565547376354, + "grad_norm": 0.1301601678133011, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 139680 + }, + { + "epoch": 0.5316946172057581, + "grad_norm": 0.1197756677865982, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 139690 + }, + { + "epoch": 0.5317326796738808, + "grad_norm": 0.13916751742362976, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 139700 + }, + { + "epoch": 0.5317707421420035, + "grad_norm": 0.1245514303445816, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 139710 + }, + { + "epoch": 0.5318088046101261, + "grad_norm": 0.12214001268148422, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 139720 + }, + { + "epoch": 0.5318468670782488, + "grad_norm": 0.126341313123703, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 139730 + }, + { + "epoch": 0.5318849295463715, + "grad_norm": 0.13778264820575714, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 139740 + }, + { + "epoch": 0.5319229920144942, + "grad_norm": 0.12353502213954926, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 139750 + }, + { + "epoch": 0.5319610544826169, + "grad_norm": 0.12548722326755524, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 139760 + }, + { + "epoch": 0.5319991169507395, + "grad_norm": 0.12514159083366394, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 139770 + }, + { + "epoch": 0.5320371794188622, + "grad_norm": 0.13197045028209686, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 139780 + }, + { + "epoch": 0.532075241886985, + "grad_norm": 0.14428088068962097, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 139790 + }, + { + "epoch": 0.5321133043551076, + "grad_norm": 0.12338276207447052, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 139800 + }, + { + "epoch": 0.5321513668232303, + "grad_norm": 0.121820829808712, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 139810 + }, + { + "epoch": 0.5321894292913529, + "grad_norm": 0.12613916397094727, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 139820 + }, + { + "epoch": 0.5322274917594757, + "grad_norm": 0.1097206324338913, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 139830 + }, + { + "epoch": 0.5322655542275984, + "grad_norm": 0.12880319356918335, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 139840 + }, + { + "epoch": 0.532303616695721, + "grad_norm": 0.1289016604423523, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 139850 + }, + { + "epoch": 0.5323416791638437, + "grad_norm": 0.11930489540100098, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 139860 + }, + { + "epoch": 0.5323797416319663, + "grad_norm": 0.12602676451206207, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 139870 + }, + { + "epoch": 0.5324178041000891, + "grad_norm": 0.14378313720226288, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 139880 + }, + { + "epoch": 0.5324558665682118, + "grad_norm": 0.12042778730392456, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 139890 + }, + { + "epoch": 0.5324939290363344, + "grad_norm": 0.12165861576795578, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 139900 + }, + { + "epoch": 0.5325319915044571, + "grad_norm": 0.1306481957435608, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 139910 + }, + { + "epoch": 0.5325700539725798, + "grad_norm": 0.12598676979541779, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 139920 + }, + { + "epoch": 0.5326081164407025, + "grad_norm": 0.13994857668876648, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 139930 + }, + { + "epoch": 0.5326461789088252, + "grad_norm": 0.12887181341648102, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 139940 + }, + { + "epoch": 0.5326842413769478, + "grad_norm": 0.12587593495845795, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 139950 + }, + { + "epoch": 0.5327223038450706, + "grad_norm": 0.13747109472751617, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 139960 + }, + { + "epoch": 0.5327603663131932, + "grad_norm": 0.12219313532114029, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 139970 + }, + { + "epoch": 0.5327984287813159, + "grad_norm": 0.12008965760469437, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 139980 + }, + { + "epoch": 0.5328364912494385, + "grad_norm": 0.14939166605472565, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 139990 + }, + { + "epoch": 0.5328745537175613, + "grad_norm": 0.11491386592388153, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 140000 + }, + { + "epoch": 0.532912616185684, + "grad_norm": 0.1381085067987442, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 140010 + }, + { + "epoch": 0.5329506786538066, + "grad_norm": 0.13951145112514496, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 140020 + }, + { + "epoch": 0.5329887411219293, + "grad_norm": 0.1354573667049408, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 140030 + }, + { + "epoch": 0.533026803590052, + "grad_norm": 0.13436217606067657, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 140040 + }, + { + "epoch": 0.5330648660581747, + "grad_norm": 0.13453523814678192, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 140050 + }, + { + "epoch": 0.5331029285262974, + "grad_norm": 0.1241929680109024, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 140060 + }, + { + "epoch": 0.53314099099442, + "grad_norm": 0.12845444679260254, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 140070 + }, + { + "epoch": 0.5331790534625427, + "grad_norm": 0.12883029878139496, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 140080 + }, + { + "epoch": 0.5332171159306655, + "grad_norm": 0.1188458651304245, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 140090 + }, + { + "epoch": 0.5332551783987881, + "grad_norm": 0.11591944843530655, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 140100 + }, + { + "epoch": 0.5332932408669108, + "grad_norm": 0.12100506573915482, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 140110 + }, + { + "epoch": 0.5333313033350334, + "grad_norm": 0.1290658563375473, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 140120 + }, + { + "epoch": 0.5333693658031562, + "grad_norm": 0.11808887869119644, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 140130 + }, + { + "epoch": 0.5334074282712788, + "grad_norm": 0.11424297094345093, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 140140 + }, + { + "epoch": 0.5334454907394015, + "grad_norm": 0.11808013170957565, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 140150 + }, + { + "epoch": 0.5334835532075242, + "grad_norm": 0.118148572742939, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 140160 + }, + { + "epoch": 0.5335216156756468, + "grad_norm": 0.1298682987689972, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 140170 + }, + { + "epoch": 0.5335596781437696, + "grad_norm": 0.12634819746017456, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 140180 + }, + { + "epoch": 0.5335977406118922, + "grad_norm": 0.12795348465442657, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 140190 + }, + { + "epoch": 0.5336358030800149, + "grad_norm": 0.12447839230298996, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 140200 + }, + { + "epoch": 0.5336738655481376, + "grad_norm": 0.11496274173259735, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 140210 + }, + { + "epoch": 0.5337119280162603, + "grad_norm": 0.13818976283073425, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 140220 + }, + { + "epoch": 0.533749990484383, + "grad_norm": 0.12268754094839096, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 140230 + }, + { + "epoch": 0.5337880529525056, + "grad_norm": 0.11690956354141235, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 140240 + }, + { + "epoch": 0.5338261154206283, + "grad_norm": 0.12202656269073486, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 140250 + }, + { + "epoch": 0.5338641778887511, + "grad_norm": 0.12960444390773773, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 140260 + }, + { + "epoch": 0.5339022403568737, + "grad_norm": 0.1281118392944336, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 140270 + }, + { + "epoch": 0.5339403028249964, + "grad_norm": 0.12352045625448227, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 140280 + }, + { + "epoch": 0.533978365293119, + "grad_norm": 0.13139109313488007, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 140290 + }, + { + "epoch": 0.5340164277612417, + "grad_norm": 0.13089576363563538, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 140300 + }, + { + "epoch": 0.5340544902293645, + "grad_norm": 0.12949113547801971, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 140310 + }, + { + "epoch": 0.5340925526974871, + "grad_norm": 0.1362101286649704, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 140320 + }, + { + "epoch": 0.5341306151656098, + "grad_norm": 0.12745702266693115, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 140330 + }, + { + "epoch": 0.5341686776337324, + "grad_norm": 0.12172038108110428, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 140340 + }, + { + "epoch": 0.5342067401018552, + "grad_norm": 0.11687658727169037, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 140350 + }, + { + "epoch": 0.5342448025699779, + "grad_norm": 0.13451483845710754, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 140360 + }, + { + "epoch": 0.5342828650381005, + "grad_norm": 0.12194819748401642, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 140370 + }, + { + "epoch": 0.5343209275062232, + "grad_norm": 0.12656249105930328, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 140380 + }, + { + "epoch": 0.5343589899743459, + "grad_norm": 0.12181355804204941, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 140390 + }, + { + "epoch": 0.5343970524424686, + "grad_norm": 0.11588042974472046, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 140400 + }, + { + "epoch": 0.5344351149105913, + "grad_norm": 0.12667755782604218, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 140410 + }, + { + "epoch": 0.5344731773787139, + "grad_norm": 0.12891434133052826, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 140420 + }, + { + "epoch": 0.5345112398468367, + "grad_norm": 0.12638193368911743, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 140430 + }, + { + "epoch": 0.5345493023149593, + "grad_norm": 0.12351429462432861, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 140440 + }, + { + "epoch": 0.534587364783082, + "grad_norm": 0.12079423666000366, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 140450 + }, + { + "epoch": 0.5346254272512047, + "grad_norm": 0.1397024393081665, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 140460 + }, + { + "epoch": 0.5346634897193273, + "grad_norm": 0.13598190248012543, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 140470 + }, + { + "epoch": 0.5347015521874501, + "grad_norm": 0.13075561821460724, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 140480 + }, + { + "epoch": 0.5347396146555727, + "grad_norm": 0.13019627332687378, + "learning_rate": 0.0005, + "loss": 2.1378, + "step": 140490 + }, + { + "epoch": 0.5347776771236954, + "grad_norm": 0.1375575214624405, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 140500 + }, + { + "epoch": 0.534815739591818, + "grad_norm": 0.11885309964418411, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 140510 + }, + { + "epoch": 0.5348538020599408, + "grad_norm": 0.12946408987045288, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 140520 + }, + { + "epoch": 0.5348918645280635, + "grad_norm": 0.12864582240581512, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 140530 + }, + { + "epoch": 0.5349299269961861, + "grad_norm": 0.129059836268425, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 140540 + }, + { + "epoch": 0.5349679894643088, + "grad_norm": 0.1326047033071518, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 140550 + }, + { + "epoch": 0.5350060519324316, + "grad_norm": 0.12458567321300507, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 140560 + }, + { + "epoch": 0.5350441144005542, + "grad_norm": 0.1340087354183197, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 140570 + }, + { + "epoch": 0.5350821768686769, + "grad_norm": 0.12168935686349869, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 140580 + }, + { + "epoch": 0.5351202393367995, + "grad_norm": 0.1205526664853096, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 140590 + }, + { + "epoch": 0.5351583018049222, + "grad_norm": 0.1175910159945488, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 140600 + }, + { + "epoch": 0.535196364273045, + "grad_norm": 0.11866150796413422, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 140610 + }, + { + "epoch": 0.5352344267411676, + "grad_norm": 0.11844488233327866, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 140620 + }, + { + "epoch": 0.5352724892092903, + "grad_norm": 0.1390773504972458, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 140630 + }, + { + "epoch": 0.5353105516774129, + "grad_norm": 0.12005293369293213, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 140640 + }, + { + "epoch": 0.5353486141455357, + "grad_norm": 0.12795551121234894, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 140650 + }, + { + "epoch": 0.5353866766136584, + "grad_norm": 0.11821883171796799, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 140660 + }, + { + "epoch": 0.535424739081781, + "grad_norm": 0.12025831639766693, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 140670 + }, + { + "epoch": 0.5354628015499037, + "grad_norm": 0.13171258568763733, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 140680 + }, + { + "epoch": 0.5355008640180264, + "grad_norm": 0.12721110880374908, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 140690 + }, + { + "epoch": 0.5355389264861491, + "grad_norm": 0.127712681889534, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 140700 + }, + { + "epoch": 0.5355769889542717, + "grad_norm": 0.15471547842025757, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 140710 + }, + { + "epoch": 0.5356150514223944, + "grad_norm": 0.12533149123191833, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 140720 + }, + { + "epoch": 0.5356531138905171, + "grad_norm": 0.13401567935943604, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 140730 + }, + { + "epoch": 0.5356911763586398, + "grad_norm": 0.12059596925973892, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 140740 + }, + { + "epoch": 0.5357292388267625, + "grad_norm": 0.12798458337783813, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 140750 + }, + { + "epoch": 0.5357673012948851, + "grad_norm": 0.11938324570655823, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 140760 + }, + { + "epoch": 0.5358053637630078, + "grad_norm": 0.1102285087108612, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 140770 + }, + { + "epoch": 0.5358434262311306, + "grad_norm": 0.1263628453016281, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 140780 + }, + { + "epoch": 0.5358814886992532, + "grad_norm": 0.12735337018966675, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 140790 + }, + { + "epoch": 0.5359195511673759, + "grad_norm": 0.14012378454208374, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 140800 + }, + { + "epoch": 0.5359576136354985, + "grad_norm": 0.1268850862979889, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 140810 + }, + { + "epoch": 0.5359956761036213, + "grad_norm": 0.13749361038208008, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 140820 + }, + { + "epoch": 0.536033738571744, + "grad_norm": 0.11975759267807007, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 140830 + }, + { + "epoch": 0.5360718010398666, + "grad_norm": 0.12557609379291534, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 140840 + }, + { + "epoch": 0.5361098635079893, + "grad_norm": 0.11973942816257477, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 140850 + }, + { + "epoch": 0.536147925976112, + "grad_norm": 0.1194792166352272, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 140860 + }, + { + "epoch": 0.5361859884442347, + "grad_norm": 0.12609535455703735, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 140870 + }, + { + "epoch": 0.5362240509123574, + "grad_norm": 0.11945301294326782, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 140880 + }, + { + "epoch": 0.53626211338048, + "grad_norm": 0.126478374004364, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 140890 + }, + { + "epoch": 0.5363001758486027, + "grad_norm": 0.1255444586277008, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 140900 + }, + { + "epoch": 0.5363382383167254, + "grad_norm": 0.12098710983991623, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 140910 + }, + { + "epoch": 0.5363763007848481, + "grad_norm": 0.12323322147130966, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 140920 + }, + { + "epoch": 0.5364143632529708, + "grad_norm": 0.13023199141025543, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 140930 + }, + { + "epoch": 0.5364524257210934, + "grad_norm": 0.12331162393093109, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 140940 + }, + { + "epoch": 0.5364904881892162, + "grad_norm": 0.1288183182477951, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 140950 + }, + { + "epoch": 0.5365285506573388, + "grad_norm": 0.12472337484359741, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 140960 + }, + { + "epoch": 0.5365666131254615, + "grad_norm": 0.138634592294693, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 140970 + }, + { + "epoch": 0.5366046755935842, + "grad_norm": 0.12713125348091125, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 140980 + }, + { + "epoch": 0.5366427380617069, + "grad_norm": 0.12022874504327774, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 140990 + }, + { + "epoch": 0.5366808005298296, + "grad_norm": 0.13861408829689026, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 141000 + }, + { + "epoch": 0.5367188629979522, + "grad_norm": 0.13212622702121735, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 141010 + }, + { + "epoch": 0.5367569254660749, + "grad_norm": 0.12079843878746033, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 141020 + }, + { + "epoch": 0.5367949879341976, + "grad_norm": 0.11715279519557953, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 141030 + }, + { + "epoch": 0.5368330504023203, + "grad_norm": 0.1280633807182312, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 141040 + }, + { + "epoch": 0.536871112870443, + "grad_norm": 0.1294240951538086, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 141050 + }, + { + "epoch": 0.5369091753385656, + "grad_norm": 0.12403983622789383, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 141060 + }, + { + "epoch": 0.5369472378066883, + "grad_norm": 0.12071875482797623, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 141070 + }, + { + "epoch": 0.5369853002748111, + "grad_norm": 0.12532472610473633, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 141080 + }, + { + "epoch": 0.5370233627429337, + "grad_norm": 0.1143755242228508, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 141090 + }, + { + "epoch": 0.5370614252110564, + "grad_norm": 0.11905299127101898, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 141100 + }, + { + "epoch": 0.537099487679179, + "grad_norm": 0.12017108500003815, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 141110 + }, + { + "epoch": 0.5371375501473018, + "grad_norm": 0.12630660831928253, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 141120 + }, + { + "epoch": 0.5371756126154245, + "grad_norm": 0.11764616519212723, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 141130 + }, + { + "epoch": 0.5372136750835471, + "grad_norm": 0.11926303058862686, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 141140 + }, + { + "epoch": 0.5372517375516698, + "grad_norm": 0.12672436237335205, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 141150 + }, + { + "epoch": 0.5372898000197924, + "grad_norm": 0.11773819476366043, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 141160 + }, + { + "epoch": 0.5373278624879152, + "grad_norm": 0.12111266702413559, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 141170 + }, + { + "epoch": 0.5373659249560379, + "grad_norm": 0.1228339672088623, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 141180 + }, + { + "epoch": 0.5374039874241605, + "grad_norm": 0.13635188341140747, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 141190 + }, + { + "epoch": 0.5374420498922832, + "grad_norm": 0.1257738471031189, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 141200 + }, + { + "epoch": 0.5374801123604059, + "grad_norm": 0.12580062448978424, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 141210 + }, + { + "epoch": 0.5375181748285286, + "grad_norm": 0.1258445829153061, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 141220 + }, + { + "epoch": 0.5375562372966513, + "grad_norm": 0.12222804129123688, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 141230 + }, + { + "epoch": 0.5375942997647739, + "grad_norm": 0.11764946579933167, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 141240 + }, + { + "epoch": 0.5376323622328967, + "grad_norm": 0.12180469930171967, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 141250 + }, + { + "epoch": 0.5376704247010193, + "grad_norm": 0.1325564682483673, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 141260 + }, + { + "epoch": 0.537708487169142, + "grad_norm": 0.13016091287136078, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 141270 + }, + { + "epoch": 0.5377465496372646, + "grad_norm": 0.11561047285795212, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 141280 + }, + { + "epoch": 0.5377846121053874, + "grad_norm": 0.12065380811691284, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 141290 + }, + { + "epoch": 0.5378226745735101, + "grad_norm": 0.12640777230262756, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 141300 + }, + { + "epoch": 0.5378607370416327, + "grad_norm": 0.11981063336133957, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 141310 + }, + { + "epoch": 0.5378987995097554, + "grad_norm": 0.11782816797494888, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 141320 + }, + { + "epoch": 0.537936861977878, + "grad_norm": 0.13507914543151855, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 141330 + }, + { + "epoch": 0.5379749244460008, + "grad_norm": 0.1195915937423706, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 141340 + }, + { + "epoch": 0.5380129869141235, + "grad_norm": 0.12683017551898956, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 141350 + }, + { + "epoch": 0.5380510493822461, + "grad_norm": 0.11754672974348068, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 141360 + }, + { + "epoch": 0.5380891118503688, + "grad_norm": 0.1195908859372139, + "learning_rate": 0.0005, + "loss": 2.1374, + "step": 141370 + }, + { + "epoch": 0.5381271743184916, + "grad_norm": 0.1198858767747879, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 141380 + }, + { + "epoch": 0.5381652367866142, + "grad_norm": 0.12950566411018372, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 141390 + }, + { + "epoch": 0.5382032992547369, + "grad_norm": 0.12549936771392822, + "learning_rate": 0.0005, + "loss": 2.1351, + "step": 141400 + }, + { + "epoch": 0.5382413617228595, + "grad_norm": 0.17845281958580017, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 141410 + }, + { + "epoch": 0.5382794241909823, + "grad_norm": 0.12847577035427094, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 141420 + }, + { + "epoch": 0.538317486659105, + "grad_norm": 0.12952706217765808, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 141430 + }, + { + "epoch": 0.5383555491272276, + "grad_norm": 0.1426181048154831, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 141440 + }, + { + "epoch": 0.5383936115953503, + "grad_norm": 0.11868982762098312, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 141450 + }, + { + "epoch": 0.5384316740634729, + "grad_norm": 0.12122055143117905, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 141460 + }, + { + "epoch": 0.5384697365315957, + "grad_norm": 0.11714940518140793, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 141470 + }, + { + "epoch": 0.5385077989997183, + "grad_norm": 0.12399179488420486, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 141480 + }, + { + "epoch": 0.538545861467841, + "grad_norm": 0.1398458331823349, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 141490 + }, + { + "epoch": 0.5385839239359637, + "grad_norm": 0.1362568438053131, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 141500 + }, + { + "epoch": 0.5386219864040864, + "grad_norm": 0.11364667862653732, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 141510 + }, + { + "epoch": 0.5386600488722091, + "grad_norm": 0.12641511857509613, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 141520 + }, + { + "epoch": 0.5386981113403317, + "grad_norm": 0.12470997869968414, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 141530 + }, + { + "epoch": 0.5387361738084544, + "grad_norm": 0.13600875437259674, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 141540 + }, + { + "epoch": 0.5387742362765772, + "grad_norm": 0.1253259927034378, + "learning_rate": 0.0005, + "loss": 2.0862, + "step": 141550 + }, + { + "epoch": 0.5388122987446998, + "grad_norm": 0.12575694918632507, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 141560 + }, + { + "epoch": 0.5388503612128225, + "grad_norm": 0.13145343959331512, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 141570 + }, + { + "epoch": 0.5388884236809451, + "grad_norm": 0.12116347253322601, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 141580 + }, + { + "epoch": 0.5389264861490678, + "grad_norm": 0.1289222240447998, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 141590 + }, + { + "epoch": 0.5389645486171906, + "grad_norm": 0.13402898609638214, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 141600 + }, + { + "epoch": 0.5390026110853132, + "grad_norm": 0.13756172358989716, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 141610 + }, + { + "epoch": 0.5390406735534359, + "grad_norm": 0.11667651683092117, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 141620 + }, + { + "epoch": 0.5390787360215585, + "grad_norm": 0.11143524944782257, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 141630 + }, + { + "epoch": 0.5391167984896813, + "grad_norm": 0.1342800408601761, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 141640 + }, + { + "epoch": 0.539154860957804, + "grad_norm": 0.1318407654762268, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 141650 + }, + { + "epoch": 0.5391929234259266, + "grad_norm": 0.12428022176027298, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 141660 + }, + { + "epoch": 0.5392309858940493, + "grad_norm": 0.1295686662197113, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 141670 + }, + { + "epoch": 0.539269048362172, + "grad_norm": 0.12853935360908508, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 141680 + }, + { + "epoch": 0.5393071108302947, + "grad_norm": 0.1273634135723114, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 141690 + }, + { + "epoch": 0.5393451732984174, + "grad_norm": 0.12686489522457123, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 141700 + }, + { + "epoch": 0.53938323576654, + "grad_norm": 0.14804643392562866, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 141710 + }, + { + "epoch": 0.5394212982346628, + "grad_norm": 0.12041910737752914, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 141720 + }, + { + "epoch": 0.5394593607027854, + "grad_norm": 0.12201380729675293, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 141730 + }, + { + "epoch": 0.5394974231709081, + "grad_norm": 0.13108478486537933, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 141740 + }, + { + "epoch": 0.5395354856390308, + "grad_norm": 0.12690266966819763, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 141750 + }, + { + "epoch": 0.5395735481071534, + "grad_norm": 0.11823736876249313, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 141760 + }, + { + "epoch": 0.5396116105752762, + "grad_norm": 0.1334260106086731, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 141770 + }, + { + "epoch": 0.5396496730433988, + "grad_norm": 0.1342170089483261, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 141780 + }, + { + "epoch": 0.5396877355115215, + "grad_norm": 0.1342397779226303, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 141790 + }, + { + "epoch": 0.5397257979796442, + "grad_norm": 0.14033067226409912, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 141800 + }, + { + "epoch": 0.5397638604477669, + "grad_norm": 0.12297873944044113, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 141810 + }, + { + "epoch": 0.5398019229158896, + "grad_norm": 0.11781862378120422, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 141820 + }, + { + "epoch": 0.5398399853840122, + "grad_norm": 0.11869970709085464, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 141830 + }, + { + "epoch": 0.5398780478521349, + "grad_norm": 0.1195460706949234, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 141840 + }, + { + "epoch": 0.5399161103202577, + "grad_norm": 0.13290493190288544, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 141850 + }, + { + "epoch": 0.5399541727883803, + "grad_norm": 0.13338586688041687, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 141860 + }, + { + "epoch": 0.539992235256503, + "grad_norm": 0.1205737292766571, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 141870 + }, + { + "epoch": 0.5400302977246256, + "grad_norm": 0.12896011769771576, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 141880 + }, + { + "epoch": 0.5400683601927483, + "grad_norm": 0.12277299165725708, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 141890 + }, + { + "epoch": 0.540106422660871, + "grad_norm": 0.1205860897898674, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 141900 + }, + { + "epoch": 0.5401444851289937, + "grad_norm": 0.12812530994415283, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 141910 + }, + { + "epoch": 0.5401825475971164, + "grad_norm": 0.12242037057876587, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 141920 + }, + { + "epoch": 0.540220610065239, + "grad_norm": 0.12503241002559662, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 141930 + }, + { + "epoch": 0.5402586725333618, + "grad_norm": 0.13053427636623383, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 141940 + }, + { + "epoch": 0.5402967350014845, + "grad_norm": 0.12927544116973877, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 141950 + }, + { + "epoch": 0.5403347974696071, + "grad_norm": 0.1277080774307251, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 141960 + }, + { + "epoch": 0.5403728599377298, + "grad_norm": 0.11748964339494705, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 141970 + }, + { + "epoch": 0.5404109224058525, + "grad_norm": 0.11776341497898102, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 141980 + }, + { + "epoch": 0.5404489848739752, + "grad_norm": 0.12624762952327728, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 141990 + }, + { + "epoch": 0.5404870473420978, + "grad_norm": 0.12573248147964478, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 142000 + }, + { + "epoch": 0.5405251098102205, + "grad_norm": 0.1226499155163765, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 142010 + }, + { + "epoch": 0.5405631722783432, + "grad_norm": 0.11763577908277512, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 142020 + }, + { + "epoch": 0.5406012347464659, + "grad_norm": 0.12168823182582855, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 142030 + }, + { + "epoch": 0.5406392972145886, + "grad_norm": 0.12195418775081635, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 142040 + }, + { + "epoch": 0.5406773596827112, + "grad_norm": 0.11766864359378815, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 142050 + }, + { + "epoch": 0.5407154221508339, + "grad_norm": 0.14633165299892426, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 142060 + }, + { + "epoch": 0.5407534846189567, + "grad_norm": 0.12138644605875015, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 142070 + }, + { + "epoch": 0.5407915470870793, + "grad_norm": 0.11825988441705704, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 142080 + }, + { + "epoch": 0.540829609555202, + "grad_norm": 0.11417256295681, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 142090 + }, + { + "epoch": 0.5408676720233246, + "grad_norm": 0.12141603231430054, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 142100 + }, + { + "epoch": 0.5409057344914474, + "grad_norm": 0.1170593649148941, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 142110 + }, + { + "epoch": 0.5409437969595701, + "grad_norm": 0.12261858582496643, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 142120 + }, + { + "epoch": 0.5409818594276927, + "grad_norm": 0.12808889150619507, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 142130 + }, + { + "epoch": 0.5410199218958154, + "grad_norm": 0.12382281571626663, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 142140 + }, + { + "epoch": 0.5410579843639381, + "grad_norm": 0.11859180778265, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 142150 + }, + { + "epoch": 0.5410960468320608, + "grad_norm": 0.13315057754516602, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 142160 + }, + { + "epoch": 0.5411341093001835, + "grad_norm": 0.1338150054216385, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 142170 + }, + { + "epoch": 0.5411721717683061, + "grad_norm": 0.13176853954792023, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 142180 + }, + { + "epoch": 0.5412102342364288, + "grad_norm": 0.1379355937242508, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 142190 + }, + { + "epoch": 0.5412482967045515, + "grad_norm": 0.12809717655181885, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 142200 + }, + { + "epoch": 0.5412863591726742, + "grad_norm": 0.1322251707315445, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 142210 + }, + { + "epoch": 0.5413244216407969, + "grad_norm": 0.13161404430866241, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 142220 + }, + { + "epoch": 0.5413624841089195, + "grad_norm": 0.13390658795833588, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 142230 + }, + { + "epoch": 0.5414005465770423, + "grad_norm": 0.12221341580152512, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 142240 + }, + { + "epoch": 0.5414386090451649, + "grad_norm": 0.12755192816257477, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 142250 + }, + { + "epoch": 0.5414766715132876, + "grad_norm": 0.11980967968702316, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 142260 + }, + { + "epoch": 0.5415147339814103, + "grad_norm": 0.1292906403541565, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 142270 + }, + { + "epoch": 0.541552796449533, + "grad_norm": 0.11971554160118103, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 142280 + }, + { + "epoch": 0.5415908589176557, + "grad_norm": 0.11435715854167938, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 142290 + }, + { + "epoch": 0.5416289213857783, + "grad_norm": 0.12895023822784424, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 142300 + }, + { + "epoch": 0.541666983853901, + "grad_norm": 0.11909782886505127, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 142310 + }, + { + "epoch": 0.5417050463220237, + "grad_norm": 0.12879256904125214, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 142320 + }, + { + "epoch": 0.5417431087901464, + "grad_norm": 0.12347320467233658, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 142330 + }, + { + "epoch": 0.5417811712582691, + "grad_norm": 0.12600278854370117, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 142340 + }, + { + "epoch": 0.5418192337263917, + "grad_norm": 0.14141617715358734, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 142350 + }, + { + "epoch": 0.5418572961945144, + "grad_norm": 0.13818462193012238, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 142360 + }, + { + "epoch": 0.5418953586626372, + "grad_norm": 0.1478867083787918, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 142370 + }, + { + "epoch": 0.5419334211307598, + "grad_norm": 0.12242863327264786, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 142380 + }, + { + "epoch": 0.5419714835988825, + "grad_norm": 0.128449484705925, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 142390 + }, + { + "epoch": 0.5420095460670051, + "grad_norm": 0.13392701745033264, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 142400 + }, + { + "epoch": 0.5420476085351279, + "grad_norm": 0.12477075308561325, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 142410 + }, + { + "epoch": 0.5420856710032506, + "grad_norm": 0.12193284183740616, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 142420 + }, + { + "epoch": 0.5421237334713732, + "grad_norm": 0.1314396858215332, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 142430 + }, + { + "epoch": 0.5421617959394959, + "grad_norm": 0.12364444881677628, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 142440 + }, + { + "epoch": 0.5421998584076185, + "grad_norm": 0.12623411417007446, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 142450 + }, + { + "epoch": 0.5422379208757413, + "grad_norm": 0.1251022219657898, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 142460 + }, + { + "epoch": 0.542275983343864, + "grad_norm": 0.12845852971076965, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 142470 + }, + { + "epoch": 0.5423140458119866, + "grad_norm": 0.11740121245384216, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 142480 + }, + { + "epoch": 0.5423521082801093, + "grad_norm": 0.1352996677160263, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 142490 + }, + { + "epoch": 0.542390170748232, + "grad_norm": 0.11552150547504425, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 142500 + }, + { + "epoch": 0.5424282332163547, + "grad_norm": 0.12196964770555496, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 142510 + }, + { + "epoch": 0.5424662956844774, + "grad_norm": 0.1349833607673645, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 142520 + }, + { + "epoch": 0.5425043581526, + "grad_norm": 0.11843696981668472, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 142530 + }, + { + "epoch": 0.5425424206207228, + "grad_norm": 0.12222271412611008, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 142540 + }, + { + "epoch": 0.5425804830888454, + "grad_norm": 0.12116561830043793, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 142550 + }, + { + "epoch": 0.5426185455569681, + "grad_norm": 0.13041633367538452, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 142560 + }, + { + "epoch": 0.5426566080250907, + "grad_norm": 0.12237266451120377, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 142570 + }, + { + "epoch": 0.5426946704932135, + "grad_norm": 0.12065397948026657, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 142580 + }, + { + "epoch": 0.5427327329613362, + "grad_norm": 0.12925831973552704, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 142590 + }, + { + "epoch": 0.5427707954294588, + "grad_norm": 0.12130697816610336, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 142600 + }, + { + "epoch": 0.5428088578975815, + "grad_norm": 0.1194259449839592, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 142610 + }, + { + "epoch": 0.5428469203657041, + "grad_norm": 0.13163216412067413, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 142620 + }, + { + "epoch": 0.5428849828338269, + "grad_norm": 0.12867335975170135, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 142630 + }, + { + "epoch": 0.5429230453019496, + "grad_norm": 0.11896099895238876, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 142640 + }, + { + "epoch": 0.5429611077700722, + "grad_norm": 0.12593241035938263, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 142650 + }, + { + "epoch": 0.5429991702381949, + "grad_norm": 0.14316798746585846, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 142660 + }, + { + "epoch": 0.5430372327063177, + "grad_norm": 0.1276978999376297, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 142670 + }, + { + "epoch": 0.5430752951744403, + "grad_norm": 0.1307539939880371, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 142680 + }, + { + "epoch": 0.543113357642563, + "grad_norm": 0.12179253995418549, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 142690 + }, + { + "epoch": 0.5431514201106856, + "grad_norm": 0.12673813104629517, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 142700 + }, + { + "epoch": 0.5431894825788084, + "grad_norm": 0.12629275023937225, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 142710 + }, + { + "epoch": 0.543227545046931, + "grad_norm": 0.11426787823438644, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 142720 + }, + { + "epoch": 0.5432656075150537, + "grad_norm": 0.11684422940015793, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 142730 + }, + { + "epoch": 0.5433036699831764, + "grad_norm": 0.1389496624469757, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 142740 + }, + { + "epoch": 0.543341732451299, + "grad_norm": 0.1356937736272812, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 142750 + }, + { + "epoch": 0.5433797949194218, + "grad_norm": 0.12385162711143494, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 142760 + }, + { + "epoch": 0.5434178573875444, + "grad_norm": 0.12453263252973557, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 142770 + }, + { + "epoch": 0.5434559198556671, + "grad_norm": 0.12733960151672363, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 142780 + }, + { + "epoch": 0.5434939823237898, + "grad_norm": 0.1254967898130417, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 142790 + }, + { + "epoch": 0.5435320447919125, + "grad_norm": 0.11459669470787048, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 142800 + }, + { + "epoch": 0.5435701072600352, + "grad_norm": 0.11403346061706543, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 142810 + }, + { + "epoch": 0.5436081697281578, + "grad_norm": 0.1326649934053421, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 142820 + }, + { + "epoch": 0.5436462321962805, + "grad_norm": 0.12152086943387985, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 142830 + }, + { + "epoch": 0.5436842946644033, + "grad_norm": 0.11927484720945358, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 142840 + }, + { + "epoch": 0.5437223571325259, + "grad_norm": 0.11907021701335907, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 142850 + }, + { + "epoch": 0.5437604196006486, + "grad_norm": 0.11528225243091583, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 142860 + }, + { + "epoch": 0.5437984820687712, + "grad_norm": 0.12464199960231781, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 142870 + }, + { + "epoch": 0.5438365445368939, + "grad_norm": 0.125784769654274, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 142880 + }, + { + "epoch": 0.5438746070050167, + "grad_norm": 0.11493180692195892, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 142890 + }, + { + "epoch": 0.5439126694731393, + "grad_norm": 0.11990799009799957, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 142900 + }, + { + "epoch": 0.543950731941262, + "grad_norm": 0.1233905702829361, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 142910 + }, + { + "epoch": 0.5439887944093846, + "grad_norm": 0.11480487883090973, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 142920 + }, + { + "epoch": 0.5440268568775074, + "grad_norm": 0.13271062076091766, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 142930 + }, + { + "epoch": 0.5440649193456301, + "grad_norm": 0.12418513745069504, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 142940 + }, + { + "epoch": 0.5441029818137527, + "grad_norm": 0.11403152346611023, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 142950 + }, + { + "epoch": 0.5441410442818754, + "grad_norm": 0.11532585322856903, + "learning_rate": 0.0005, + "loss": 2.1346, + "step": 142960 + }, + { + "epoch": 0.5441791067499981, + "grad_norm": 0.11966732144355774, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 142970 + }, + { + "epoch": 0.5442171692181208, + "grad_norm": 0.1239163726568222, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 142980 + }, + { + "epoch": 0.5442552316862435, + "grad_norm": 0.12124045938253403, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 142990 + }, + { + "epoch": 0.5442932941543661, + "grad_norm": 0.12653355300426483, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 143000 + }, + { + "epoch": 0.5443313566224889, + "grad_norm": 0.12473955005407333, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 143010 + }, + { + "epoch": 0.5443694190906115, + "grad_norm": 0.12466265261173248, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 143020 + }, + { + "epoch": 0.5444074815587342, + "grad_norm": 0.12356074899435043, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 143030 + }, + { + "epoch": 0.5444455440268569, + "grad_norm": 0.1347591131925583, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 143040 + }, + { + "epoch": 0.5444836064949795, + "grad_norm": 0.1383460909128189, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 143050 + }, + { + "epoch": 0.5445216689631023, + "grad_norm": 0.12867282330989838, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 143060 + }, + { + "epoch": 0.5445597314312249, + "grad_norm": 0.12898746132850647, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 143070 + }, + { + "epoch": 0.5445977938993476, + "grad_norm": 0.13376469910144806, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 143080 + }, + { + "epoch": 0.5446358563674703, + "grad_norm": 0.12445175647735596, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 143090 + }, + { + "epoch": 0.544673918835593, + "grad_norm": 0.1341526210308075, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 143100 + }, + { + "epoch": 0.5447119813037157, + "grad_norm": 0.13039885461330414, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 143110 + }, + { + "epoch": 0.5447500437718383, + "grad_norm": 0.11942508816719055, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 143120 + }, + { + "epoch": 0.544788106239961, + "grad_norm": 0.1156260222196579, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 143130 + }, + { + "epoch": 0.5448261687080838, + "grad_norm": 0.12988288700580597, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 143140 + }, + { + "epoch": 0.5448642311762064, + "grad_norm": 0.13214020431041718, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 143150 + }, + { + "epoch": 0.5449022936443291, + "grad_norm": 0.12015897035598755, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 143160 + }, + { + "epoch": 0.5449403561124517, + "grad_norm": 0.12021349370479584, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 143170 + }, + { + "epoch": 0.5449784185805744, + "grad_norm": 0.1231183409690857, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 143180 + }, + { + "epoch": 0.5450164810486972, + "grad_norm": 0.13053947687149048, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 143190 + }, + { + "epoch": 0.5450545435168198, + "grad_norm": 0.1145208552479744, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 143200 + }, + { + "epoch": 0.5450926059849425, + "grad_norm": 0.12140737473964691, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 143210 + }, + { + "epoch": 0.5451306684530651, + "grad_norm": 0.14588138461112976, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 143220 + }, + { + "epoch": 0.5451687309211879, + "grad_norm": 0.1178545281291008, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 143230 + }, + { + "epoch": 0.5452067933893106, + "grad_norm": 0.11969448626041412, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 143240 + }, + { + "epoch": 0.5452448558574332, + "grad_norm": 0.12450990080833435, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 143250 + }, + { + "epoch": 0.5452829183255559, + "grad_norm": 0.12544794380664825, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 143260 + }, + { + "epoch": 0.5453209807936786, + "grad_norm": 0.11299612373113632, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 143270 + }, + { + "epoch": 0.5453590432618013, + "grad_norm": 0.11318644136190414, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 143280 + }, + { + "epoch": 0.545397105729924, + "grad_norm": 0.12822823226451874, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 143290 + }, + { + "epoch": 0.5454351681980466, + "grad_norm": 0.11486469954252243, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 143300 + }, + { + "epoch": 0.5454732306661693, + "grad_norm": 0.14914393424987793, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 143310 + }, + { + "epoch": 0.545511293134292, + "grad_norm": 0.15870623290538788, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 143320 + }, + { + "epoch": 0.5455493556024147, + "grad_norm": 0.15740184485912323, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 143330 + }, + { + "epoch": 0.5455874180705373, + "grad_norm": 0.1271182894706726, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 143340 + }, + { + "epoch": 0.54562548053866, + "grad_norm": 0.12603406608104706, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 143350 + }, + { + "epoch": 0.5456635430067828, + "grad_norm": 0.11953301727771759, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 143360 + }, + { + "epoch": 0.5457016054749054, + "grad_norm": 0.1294298619031906, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 143370 + }, + { + "epoch": 0.5457396679430281, + "grad_norm": 0.14407844841480255, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 143380 + }, + { + "epoch": 0.5457777304111507, + "grad_norm": 0.6056898236274719, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 143390 + }, + { + "epoch": 0.5458157928792735, + "grad_norm": 0.11539902538061142, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 143400 + }, + { + "epoch": 0.5458538553473962, + "grad_norm": 0.11941809207201004, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 143410 + }, + { + "epoch": 0.5458919178155188, + "grad_norm": 0.12149988114833832, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 143420 + }, + { + "epoch": 0.5459299802836415, + "grad_norm": 0.11598517745733261, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 143430 + }, + { + "epoch": 0.5459680427517642, + "grad_norm": 0.12989123165607452, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 143440 + }, + { + "epoch": 0.5460061052198869, + "grad_norm": 0.13632255792617798, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 143450 + }, + { + "epoch": 0.5460441676880096, + "grad_norm": 0.12819725275039673, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 143460 + }, + { + "epoch": 0.5460822301561322, + "grad_norm": 0.1302596926689148, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 143470 + }, + { + "epoch": 0.5461202926242549, + "grad_norm": 0.11973568797111511, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 143480 + }, + { + "epoch": 0.5461583550923776, + "grad_norm": 0.1287134885787964, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 143490 + }, + { + "epoch": 0.5461964175605003, + "grad_norm": 0.12302163988351822, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 143500 + }, + { + "epoch": 0.546234480028623, + "grad_norm": 0.14660359919071198, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 143510 + }, + { + "epoch": 0.5462725424967456, + "grad_norm": 0.13259245455265045, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 143520 + }, + { + "epoch": 0.5463106049648684, + "grad_norm": 0.1288929283618927, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 143530 + }, + { + "epoch": 0.546348667432991, + "grad_norm": 0.1310984343290329, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 143540 + }, + { + "epoch": 0.5463867299011137, + "grad_norm": 0.121677465736866, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 143550 + }, + { + "epoch": 0.5464247923692364, + "grad_norm": 0.12330687791109085, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 143560 + }, + { + "epoch": 0.5464628548373591, + "grad_norm": 0.1153598204255104, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 143570 + }, + { + "epoch": 0.5465009173054818, + "grad_norm": 0.12227758020162582, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 143580 + }, + { + "epoch": 0.5465389797736044, + "grad_norm": 0.13273516297340393, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 143590 + }, + { + "epoch": 0.5465770422417271, + "grad_norm": 0.1361711323261261, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 143600 + }, + { + "epoch": 0.5466151047098498, + "grad_norm": 0.13460688292980194, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 143610 + }, + { + "epoch": 0.5466531671779725, + "grad_norm": 0.11942264437675476, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 143620 + }, + { + "epoch": 0.5466912296460952, + "grad_norm": 0.12050394713878632, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 143630 + }, + { + "epoch": 0.5467292921142178, + "grad_norm": 0.1352856457233429, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 143640 + }, + { + "epoch": 0.5467673545823405, + "grad_norm": 0.12913435697555542, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 143650 + }, + { + "epoch": 0.5468054170504633, + "grad_norm": 0.1403176337480545, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 143660 + }, + { + "epoch": 0.5468434795185859, + "grad_norm": 0.12202751636505127, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 143670 + }, + { + "epoch": 0.5468815419867086, + "grad_norm": 0.13158494234085083, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 143680 + }, + { + "epoch": 0.5469196044548312, + "grad_norm": 0.11883752048015594, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 143690 + }, + { + "epoch": 0.546957666922954, + "grad_norm": 0.12454967200756073, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 143700 + }, + { + "epoch": 0.5469957293910767, + "grad_norm": 0.13018742203712463, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 143710 + }, + { + "epoch": 0.5470337918591993, + "grad_norm": 0.13621671497821808, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 143720 + }, + { + "epoch": 0.547071854327322, + "grad_norm": 0.12384206801652908, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 143730 + }, + { + "epoch": 0.5471099167954446, + "grad_norm": 0.12141196429729462, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 143740 + }, + { + "epoch": 0.5471479792635674, + "grad_norm": 0.12233904749155045, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 143750 + }, + { + "epoch": 0.54718604173169, + "grad_norm": 0.13541485369205475, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 143760 + }, + { + "epoch": 0.5472241041998127, + "grad_norm": 0.12836025655269623, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 143770 + }, + { + "epoch": 0.5472621666679354, + "grad_norm": 0.12747050821781158, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 143780 + }, + { + "epoch": 0.5473002291360581, + "grad_norm": 0.12959854304790497, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 143790 + }, + { + "epoch": 0.5473382916041808, + "grad_norm": 0.12214165925979614, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 143800 + }, + { + "epoch": 0.5473763540723035, + "grad_norm": 0.13413529098033905, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 143810 + }, + { + "epoch": 0.5474144165404261, + "grad_norm": 0.1225849986076355, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 143820 + }, + { + "epoch": 0.5474524790085489, + "grad_norm": 0.13006827235221863, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 143830 + }, + { + "epoch": 0.5474905414766715, + "grad_norm": 0.1268097460269928, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 143840 + }, + { + "epoch": 0.5475286039447942, + "grad_norm": 0.1367637664079666, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 143850 + }, + { + "epoch": 0.5475666664129168, + "grad_norm": 0.12964405119419098, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 143860 + }, + { + "epoch": 0.5476047288810396, + "grad_norm": 0.14246685802936554, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 143870 + }, + { + "epoch": 0.5476427913491623, + "grad_norm": 0.14590322971343994, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 143880 + }, + { + "epoch": 0.5476808538172849, + "grad_norm": 0.13974499702453613, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 143890 + }, + { + "epoch": 0.5477189162854076, + "grad_norm": 0.12832729518413544, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 143900 + }, + { + "epoch": 0.5477569787535302, + "grad_norm": 0.1358684003353119, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 143910 + }, + { + "epoch": 0.547795041221653, + "grad_norm": 0.11089088767766953, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 143920 + }, + { + "epoch": 0.5478331036897757, + "grad_norm": 0.12461254000663757, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 143930 + }, + { + "epoch": 0.5478711661578983, + "grad_norm": 0.13170070946216583, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 143940 + }, + { + "epoch": 0.547909228626021, + "grad_norm": 0.14326369762420654, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 143950 + }, + { + "epoch": 0.5479472910941438, + "grad_norm": 0.13025934994220734, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 143960 + }, + { + "epoch": 0.5479853535622664, + "grad_norm": 0.13151970505714417, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 143970 + }, + { + "epoch": 0.5480234160303891, + "grad_norm": 0.13055887818336487, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 143980 + }, + { + "epoch": 0.5480614784985117, + "grad_norm": 0.11997563391923904, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 143990 + }, + { + "epoch": 0.5480995409666345, + "grad_norm": 0.13561861217021942, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 144000 + }, + { + "epoch": 0.5481376034347571, + "grad_norm": 0.11981268227100372, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 144010 + }, + { + "epoch": 0.5481756659028798, + "grad_norm": 0.13623487949371338, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 144020 + }, + { + "epoch": 0.5482137283710025, + "grad_norm": 0.13205698132514954, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 144030 + }, + { + "epoch": 0.5482517908391251, + "grad_norm": 0.12684205174446106, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 144040 + }, + { + "epoch": 0.5482898533072479, + "grad_norm": 0.1281534880399704, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 144050 + }, + { + "epoch": 0.5483279157753705, + "grad_norm": 0.14929701387882233, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 144060 + }, + { + "epoch": 0.5483659782434932, + "grad_norm": 0.13052833080291748, + "learning_rate": 0.0005, + "loss": 2.1411, + "step": 144070 + }, + { + "epoch": 0.5484040407116159, + "grad_norm": 0.11817128211259842, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 144080 + }, + { + "epoch": 0.5484421031797386, + "grad_norm": 0.12966929376125336, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 144090 + }, + { + "epoch": 0.5484801656478613, + "grad_norm": 0.12929899990558624, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 144100 + }, + { + "epoch": 0.5485182281159839, + "grad_norm": 0.11421916633844376, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 144110 + }, + { + "epoch": 0.5485562905841066, + "grad_norm": 0.12206301838159561, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 144120 + }, + { + "epoch": 0.5485943530522294, + "grad_norm": 0.11934498697519302, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 144130 + }, + { + "epoch": 0.548632415520352, + "grad_norm": 0.12408069521188736, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 144140 + }, + { + "epoch": 0.5486704779884747, + "grad_norm": 0.1259586066007614, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 144150 + }, + { + "epoch": 0.5487085404565973, + "grad_norm": 0.12521126866340637, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 144160 + }, + { + "epoch": 0.5487466029247201, + "grad_norm": 0.133035346865654, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 144170 + }, + { + "epoch": 0.5487846653928428, + "grad_norm": 0.1211579367518425, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 144180 + }, + { + "epoch": 0.5488227278609654, + "grad_norm": 0.12172611802816391, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 144190 + }, + { + "epoch": 0.5488607903290881, + "grad_norm": 0.10662341862916946, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 144200 + }, + { + "epoch": 0.5488988527972107, + "grad_norm": 0.12909914553165436, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 144210 + }, + { + "epoch": 0.5489369152653335, + "grad_norm": 0.1285155564546585, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 144220 + }, + { + "epoch": 0.5489749777334562, + "grad_norm": 0.12418292462825775, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 144230 + }, + { + "epoch": 0.5490130402015788, + "grad_norm": 0.12626810371875763, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 144240 + }, + { + "epoch": 0.5490511026697015, + "grad_norm": 0.1202656552195549, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 144250 + }, + { + "epoch": 0.5490891651378242, + "grad_norm": 0.12186427414417267, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 144260 + }, + { + "epoch": 0.5491272276059469, + "grad_norm": 0.11100109666585922, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 144270 + }, + { + "epoch": 0.5491652900740696, + "grad_norm": 0.13168954849243164, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 144280 + }, + { + "epoch": 0.5492033525421922, + "grad_norm": 0.129677876830101, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 144290 + }, + { + "epoch": 0.549241415010315, + "grad_norm": 0.12331412732601166, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 144300 + }, + { + "epoch": 0.5492794774784376, + "grad_norm": 0.12670284509658813, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 144310 + }, + { + "epoch": 0.5493175399465603, + "grad_norm": 0.1265244036912918, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 144320 + }, + { + "epoch": 0.549355602414683, + "grad_norm": 0.12023013830184937, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 144330 + }, + { + "epoch": 0.5493936648828056, + "grad_norm": 0.13818997144699097, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 144340 + }, + { + "epoch": 0.5494317273509284, + "grad_norm": 0.11964023113250732, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 144350 + }, + { + "epoch": 0.549469789819051, + "grad_norm": 0.1293233186006546, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 144360 + }, + { + "epoch": 0.5495078522871737, + "grad_norm": 0.12815843522548676, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 144370 + }, + { + "epoch": 0.5495459147552963, + "grad_norm": 0.12119992822408676, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 144380 + }, + { + "epoch": 0.5495839772234191, + "grad_norm": 0.12910519540309906, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 144390 + }, + { + "epoch": 0.5496220396915418, + "grad_norm": 0.12036329507827759, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 144400 + }, + { + "epoch": 0.5496601021596644, + "grad_norm": 0.11510950326919556, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 144410 + }, + { + "epoch": 0.5496981646277871, + "grad_norm": 0.12537814676761627, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 144420 + }, + { + "epoch": 0.5497362270959099, + "grad_norm": 0.12165727466344833, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 144430 + }, + { + "epoch": 0.5497742895640325, + "grad_norm": 0.12165447324514389, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 144440 + }, + { + "epoch": 0.5498123520321552, + "grad_norm": 0.12614521384239197, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 144450 + }, + { + "epoch": 0.5498504145002778, + "grad_norm": 0.11235406249761581, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 144460 + }, + { + "epoch": 0.5498884769684005, + "grad_norm": 0.12776117026805878, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 144470 + }, + { + "epoch": 0.5499265394365233, + "grad_norm": 0.3991162180900574, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 144480 + }, + { + "epoch": 0.5499646019046459, + "grad_norm": 0.12186644971370697, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 144490 + }, + { + "epoch": 0.5500026643727686, + "grad_norm": 0.12400142848491669, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 144500 + }, + { + "epoch": 0.5500407268408912, + "grad_norm": 0.12243487685918808, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 144510 + }, + { + "epoch": 0.550078789309014, + "grad_norm": 0.12870800495147705, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 144520 + }, + { + "epoch": 0.5501168517771367, + "grad_norm": 0.11797146499156952, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 144530 + }, + { + "epoch": 0.5501549142452593, + "grad_norm": 0.11497573554515839, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 144540 + }, + { + "epoch": 0.550192976713382, + "grad_norm": 0.12143855541944504, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 144550 + }, + { + "epoch": 0.5502310391815047, + "grad_norm": 0.11397191137075424, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 144560 + }, + { + "epoch": 0.5502691016496274, + "grad_norm": 0.13390995562076569, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 144570 + }, + { + "epoch": 0.55030716411775, + "grad_norm": 0.13767433166503906, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 144580 + }, + { + "epoch": 0.5503452265858727, + "grad_norm": 0.18040162324905396, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 144590 + }, + { + "epoch": 0.5503832890539955, + "grad_norm": 0.14773793518543243, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 144600 + }, + { + "epoch": 0.5504213515221181, + "grad_norm": 0.12226738035678864, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 144610 + }, + { + "epoch": 0.5504594139902408, + "grad_norm": 0.1189928650856018, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 144620 + }, + { + "epoch": 0.5504974764583634, + "grad_norm": 0.11632554978132248, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 144630 + }, + { + "epoch": 0.5505355389264861, + "grad_norm": 0.11748391389846802, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 144640 + }, + { + "epoch": 0.5505736013946089, + "grad_norm": 0.144392192363739, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 144650 + }, + { + "epoch": 0.5506116638627315, + "grad_norm": 0.15021365880966187, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 144660 + }, + { + "epoch": 0.5506497263308542, + "grad_norm": 0.14086973667144775, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 144670 + }, + { + "epoch": 0.5506877887989768, + "grad_norm": 0.1304117888212204, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 144680 + }, + { + "epoch": 0.5507258512670996, + "grad_norm": 0.12264396250247955, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 144690 + }, + { + "epoch": 0.5507639137352223, + "grad_norm": 0.1307545155286789, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 144700 + }, + { + "epoch": 0.5508019762033449, + "grad_norm": 0.12935005128383636, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 144710 + }, + { + "epoch": 0.5508400386714676, + "grad_norm": 0.13241122663021088, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 144720 + }, + { + "epoch": 0.5508781011395903, + "grad_norm": 0.11758055537939072, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 144730 + }, + { + "epoch": 0.550916163607713, + "grad_norm": 0.12281011044979095, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 144740 + }, + { + "epoch": 0.5509542260758357, + "grad_norm": 0.1272137612104416, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 144750 + }, + { + "epoch": 0.5509922885439583, + "grad_norm": 0.12607891857624054, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 144760 + }, + { + "epoch": 0.551030351012081, + "grad_norm": 0.12848597764968872, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 144770 + }, + { + "epoch": 0.5510684134802037, + "grad_norm": 0.13169367611408234, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 144780 + }, + { + "epoch": 0.5511064759483264, + "grad_norm": 0.11452927440404892, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 144790 + }, + { + "epoch": 0.5511445384164491, + "grad_norm": 0.12549881637096405, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 144800 + }, + { + "epoch": 0.5511826008845717, + "grad_norm": 0.12136809527873993, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 144810 + }, + { + "epoch": 0.5512206633526945, + "grad_norm": 0.12465189397335052, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 144820 + }, + { + "epoch": 0.5512587258208171, + "grad_norm": 0.12369824945926666, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 144830 + }, + { + "epoch": 0.5512967882889398, + "grad_norm": 0.11682531982660294, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 144840 + }, + { + "epoch": 0.5513348507570625, + "grad_norm": 0.13062603771686554, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 144850 + }, + { + "epoch": 0.5513729132251852, + "grad_norm": 0.11250148713588715, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 144860 + }, + { + "epoch": 0.5514109756933079, + "grad_norm": 0.1234857439994812, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 144870 + }, + { + "epoch": 0.5514490381614305, + "grad_norm": 0.12878267467021942, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 144880 + }, + { + "epoch": 0.5514871006295532, + "grad_norm": 0.11966361850500107, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 144890 + }, + { + "epoch": 0.5515251630976759, + "grad_norm": 0.11235833168029785, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 144900 + }, + { + "epoch": 0.5515632255657986, + "grad_norm": 0.1246689185500145, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 144910 + }, + { + "epoch": 0.5516012880339213, + "grad_norm": 0.12522974610328674, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 144920 + }, + { + "epoch": 0.5516393505020439, + "grad_norm": 0.139484241604805, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 144930 + }, + { + "epoch": 0.5516774129701666, + "grad_norm": 0.13401883840560913, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 144940 + }, + { + "epoch": 0.5517154754382894, + "grad_norm": 0.12002041935920715, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 144950 + }, + { + "epoch": 0.551753537906412, + "grad_norm": 0.11653312295675278, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 144960 + }, + { + "epoch": 0.5517916003745347, + "grad_norm": 0.12327057123184204, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 144970 + }, + { + "epoch": 0.5518296628426573, + "grad_norm": 0.1284448206424713, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 144980 + }, + { + "epoch": 0.5518677253107801, + "grad_norm": 0.15450535714626312, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 144990 + }, + { + "epoch": 0.5519057877789028, + "grad_norm": 0.12576664984226227, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 145000 + }, + { + "epoch": 0.5519438502470254, + "grad_norm": 0.11548066139221191, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 145010 + }, + { + "epoch": 0.5519819127151481, + "grad_norm": 0.12530051171779633, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 145020 + }, + { + "epoch": 0.5520199751832708, + "grad_norm": 0.12279170006513596, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 145030 + }, + { + "epoch": 0.5520580376513935, + "grad_norm": 0.12480257451534271, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 145040 + }, + { + "epoch": 0.5520961001195162, + "grad_norm": 0.2465040534734726, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 145050 + }, + { + "epoch": 0.5521341625876388, + "grad_norm": 0.12566375732421875, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 145060 + }, + { + "epoch": 0.5521722250557615, + "grad_norm": 0.12638990581035614, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 145070 + }, + { + "epoch": 0.5522102875238842, + "grad_norm": 0.12483334541320801, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 145080 + }, + { + "epoch": 0.5522483499920069, + "grad_norm": 0.13051484525203705, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 145090 + }, + { + "epoch": 0.5522864124601295, + "grad_norm": 0.13386259973049164, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 145100 + }, + { + "epoch": 0.5523244749282522, + "grad_norm": 0.12317392975091934, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 145110 + }, + { + "epoch": 0.552362537396375, + "grad_norm": 0.11211195588111877, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 145120 + }, + { + "epoch": 0.5524005998644976, + "grad_norm": 0.128127783536911, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 145130 + }, + { + "epoch": 0.5524386623326203, + "grad_norm": 0.1159677729010582, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 145140 + }, + { + "epoch": 0.552476724800743, + "grad_norm": 0.1285521537065506, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 145150 + }, + { + "epoch": 0.5525147872688657, + "grad_norm": 0.11685515195131302, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 145160 + }, + { + "epoch": 0.5525528497369884, + "grad_norm": 0.14790181815624237, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 145170 + }, + { + "epoch": 0.552590912205111, + "grad_norm": 0.14180521667003632, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 145180 + }, + { + "epoch": 0.5526289746732337, + "grad_norm": 0.12453516572713852, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 145190 + }, + { + "epoch": 0.5526670371413563, + "grad_norm": 0.1201147809624672, + "learning_rate": 0.0005, + "loss": 2.1368, + "step": 145200 + }, + { + "epoch": 0.5527050996094791, + "grad_norm": 0.12104005366563797, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 145210 + }, + { + "epoch": 0.5527431620776018, + "grad_norm": 0.11533419787883759, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 145220 + }, + { + "epoch": 0.5527812245457244, + "grad_norm": 0.12507320940494537, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 145230 + }, + { + "epoch": 0.5528192870138471, + "grad_norm": 0.12377454340457916, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 145240 + }, + { + "epoch": 0.5528573494819699, + "grad_norm": 0.13272665441036224, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 145250 + }, + { + "epoch": 0.5528954119500925, + "grad_norm": 0.12817463278770447, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 145260 + }, + { + "epoch": 0.5529334744182152, + "grad_norm": 0.11743728816509247, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 145270 + }, + { + "epoch": 0.5529715368863378, + "grad_norm": 0.13543175160884857, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 145280 + }, + { + "epoch": 0.5530095993544606, + "grad_norm": 0.13341118395328522, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 145290 + }, + { + "epoch": 0.5530476618225832, + "grad_norm": 0.1958242803812027, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 145300 + }, + { + "epoch": 0.5530857242907059, + "grad_norm": 0.12311431765556335, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 145310 + }, + { + "epoch": 0.5531237867588286, + "grad_norm": 0.11194714903831482, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 145320 + }, + { + "epoch": 0.5531618492269512, + "grad_norm": 0.1213444247841835, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 145330 + }, + { + "epoch": 0.553199911695074, + "grad_norm": 0.11784545332193375, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 145340 + }, + { + "epoch": 0.5532379741631966, + "grad_norm": 0.1280663013458252, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 145350 + }, + { + "epoch": 0.5532760366313193, + "grad_norm": 0.13075262308120728, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 145360 + }, + { + "epoch": 0.553314099099442, + "grad_norm": 0.14177240431308746, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 145370 + }, + { + "epoch": 0.5533521615675647, + "grad_norm": 0.17335639894008636, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 145380 + }, + { + "epoch": 0.5533902240356874, + "grad_norm": 0.12055764347314835, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 145390 + }, + { + "epoch": 0.55342828650381, + "grad_norm": 0.12622298300266266, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 145400 + }, + { + "epoch": 0.5534663489719327, + "grad_norm": 0.13050603866577148, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 145410 + }, + { + "epoch": 0.5535044114400555, + "grad_norm": 0.12804704904556274, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 145420 + }, + { + "epoch": 0.5535424739081781, + "grad_norm": 0.13104109466075897, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 145430 + }, + { + "epoch": 0.5535805363763008, + "grad_norm": 0.11165295541286469, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 145440 + }, + { + "epoch": 0.5536185988444234, + "grad_norm": 0.12978315353393555, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 145450 + }, + { + "epoch": 0.5536566613125462, + "grad_norm": 0.1414872109889984, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 145460 + }, + { + "epoch": 0.5536947237806689, + "grad_norm": 0.12297790497541428, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 145470 + }, + { + "epoch": 0.5537327862487915, + "grad_norm": 0.12759721279144287, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 145480 + }, + { + "epoch": 0.5537708487169142, + "grad_norm": 0.11224611848592758, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 145490 + }, + { + "epoch": 0.5538089111850368, + "grad_norm": 0.13177062571048737, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 145500 + }, + { + "epoch": 0.5538469736531596, + "grad_norm": 0.12307994067668915, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 145510 + }, + { + "epoch": 0.5538850361212823, + "grad_norm": 0.13711430132389069, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 145520 + }, + { + "epoch": 0.5539230985894049, + "grad_norm": 0.12840867042541504, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 145530 + }, + { + "epoch": 0.5539611610575276, + "grad_norm": 0.14088504016399384, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 145540 + }, + { + "epoch": 0.5539992235256503, + "grad_norm": 0.1343139261007309, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 145550 + }, + { + "epoch": 0.554037285993773, + "grad_norm": 0.11925197392702103, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 145560 + }, + { + "epoch": 0.5540753484618957, + "grad_norm": 0.12442201375961304, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 145570 + }, + { + "epoch": 0.5541134109300183, + "grad_norm": 0.11571851372718811, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 145580 + }, + { + "epoch": 0.5541514733981411, + "grad_norm": 0.12373834103345871, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 145590 + }, + { + "epoch": 0.5541895358662637, + "grad_norm": 0.1327885538339615, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 145600 + }, + { + "epoch": 0.5542275983343864, + "grad_norm": 0.13805131614208221, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 145610 + }, + { + "epoch": 0.554265660802509, + "grad_norm": 0.12871071696281433, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 145620 + }, + { + "epoch": 0.5543037232706317, + "grad_norm": 0.3121941387653351, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 145630 + }, + { + "epoch": 0.5543417857387545, + "grad_norm": 0.12596552073955536, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 145640 + }, + { + "epoch": 0.5543798482068771, + "grad_norm": 0.12760427594184875, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 145650 + }, + { + "epoch": 0.5544179106749998, + "grad_norm": 0.11807743459939957, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 145660 + }, + { + "epoch": 0.5544559731431224, + "grad_norm": 0.12287381291389465, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 145670 + }, + { + "epoch": 0.5544940356112452, + "grad_norm": 0.12048117816448212, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 145680 + }, + { + "epoch": 0.5545320980793679, + "grad_norm": 0.12428100407123566, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 145690 + }, + { + "epoch": 0.5545701605474905, + "grad_norm": 0.11523248255252838, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 145700 + }, + { + "epoch": 0.5546082230156132, + "grad_norm": 0.11993500590324402, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 145710 + }, + { + "epoch": 0.554646285483736, + "grad_norm": 0.12582233548164368, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 145720 + }, + { + "epoch": 0.5546843479518586, + "grad_norm": 0.13026317954063416, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 145730 + }, + { + "epoch": 0.5547224104199813, + "grad_norm": 0.13820697367191315, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 145740 + }, + { + "epoch": 0.5547604728881039, + "grad_norm": 0.12032381445169449, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 145750 + }, + { + "epoch": 0.5547985353562266, + "grad_norm": 0.12701494991779327, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 145760 + }, + { + "epoch": 0.5548365978243494, + "grad_norm": 0.12934359908103943, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 145770 + }, + { + "epoch": 0.554874660292472, + "grad_norm": 0.1251780092716217, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 145780 + }, + { + "epoch": 0.5549127227605947, + "grad_norm": 0.12064220756292343, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 145790 + }, + { + "epoch": 0.5549507852287173, + "grad_norm": 0.12095960229635239, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 145800 + }, + { + "epoch": 0.5549888476968401, + "grad_norm": 0.11686307191848755, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 145810 + }, + { + "epoch": 0.5550269101649628, + "grad_norm": 0.13717177510261536, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 145820 + }, + { + "epoch": 0.5550649726330854, + "grad_norm": 0.13584278523921967, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 145830 + }, + { + "epoch": 0.5551030351012081, + "grad_norm": 0.14092442393302917, + "learning_rate": 0.0005, + "loss": 2.1326, + "step": 145840 + }, + { + "epoch": 0.5551410975693308, + "grad_norm": 0.13244536519050598, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 145850 + }, + { + "epoch": 0.5551791600374535, + "grad_norm": 0.12420519441366196, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 145860 + }, + { + "epoch": 0.5552172225055761, + "grad_norm": 0.14705169200897217, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 145870 + }, + { + "epoch": 0.5552552849736988, + "grad_norm": 0.12188060581684113, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 145880 + }, + { + "epoch": 0.5552933474418216, + "grad_norm": 0.14049747586250305, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 145890 + }, + { + "epoch": 0.5553314099099442, + "grad_norm": 0.12535642087459564, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 145900 + }, + { + "epoch": 0.5553694723780669, + "grad_norm": 0.11171269416809082, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 145910 + }, + { + "epoch": 0.5554075348461895, + "grad_norm": 0.1396142989397049, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 145920 + }, + { + "epoch": 0.5554455973143122, + "grad_norm": 0.1270921230316162, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 145930 + }, + { + "epoch": 0.555483659782435, + "grad_norm": 0.11808069795370102, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 145940 + }, + { + "epoch": 0.5555217222505576, + "grad_norm": 0.12320335954427719, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 145950 + }, + { + "epoch": 0.5555597847186803, + "grad_norm": 0.12735216319561005, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 145960 + }, + { + "epoch": 0.5555978471868029, + "grad_norm": 0.1367887407541275, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 145970 + }, + { + "epoch": 0.5556359096549257, + "grad_norm": 0.1248806044459343, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 145980 + }, + { + "epoch": 0.5556739721230484, + "grad_norm": 0.13130532205104828, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 145990 + }, + { + "epoch": 0.555712034591171, + "grad_norm": 0.13192979991436005, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 146000 + }, + { + "epoch": 0.5557500970592937, + "grad_norm": 0.12497913092374802, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 146010 + }, + { + "epoch": 0.5557881595274164, + "grad_norm": 0.1344774216413498, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 146020 + }, + { + "epoch": 0.5558262219955391, + "grad_norm": 0.11685974895954132, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 146030 + }, + { + "epoch": 0.5558642844636618, + "grad_norm": 0.11300405114889145, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 146040 + }, + { + "epoch": 0.5559023469317844, + "grad_norm": 0.11893622577190399, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 146050 + }, + { + "epoch": 0.5559404093999071, + "grad_norm": 0.1360451728105545, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 146060 + }, + { + "epoch": 0.5559784718680298, + "grad_norm": 0.1295030117034912, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 146070 + }, + { + "epoch": 0.5560165343361525, + "grad_norm": 0.13259471952915192, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 146080 + }, + { + "epoch": 0.5560545968042752, + "grad_norm": 0.1290517896413803, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 146090 + }, + { + "epoch": 0.5560926592723978, + "grad_norm": 0.1371571272611618, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 146100 + }, + { + "epoch": 0.5561307217405206, + "grad_norm": 0.1228359118103981, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 146110 + }, + { + "epoch": 0.5561687842086432, + "grad_norm": 0.13620351254940033, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 146120 + }, + { + "epoch": 0.5562068466767659, + "grad_norm": 0.12080255150794983, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 146130 + }, + { + "epoch": 0.5562449091448886, + "grad_norm": 0.13528335094451904, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 146140 + }, + { + "epoch": 0.5562829716130113, + "grad_norm": 0.1401769369840622, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 146150 + }, + { + "epoch": 0.556321034081134, + "grad_norm": 0.14545312523841858, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 146160 + }, + { + "epoch": 0.5563590965492566, + "grad_norm": 0.12032944709062576, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 146170 + }, + { + "epoch": 0.5563971590173793, + "grad_norm": 0.13641439378261566, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 146180 + }, + { + "epoch": 0.556435221485502, + "grad_norm": 0.12659136950969696, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 146190 + }, + { + "epoch": 0.5564732839536247, + "grad_norm": 0.13044819235801697, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 146200 + }, + { + "epoch": 0.5565113464217474, + "grad_norm": 0.12508030235767365, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 146210 + }, + { + "epoch": 0.55654940888987, + "grad_norm": 0.12736937403678894, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 146220 + }, + { + "epoch": 0.5565874713579927, + "grad_norm": 0.11774712800979614, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 146230 + }, + { + "epoch": 0.5566255338261155, + "grad_norm": 0.1205480620265007, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 146240 + }, + { + "epoch": 0.5566635962942381, + "grad_norm": 0.13173139095306396, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 146250 + }, + { + "epoch": 0.5567016587623608, + "grad_norm": 0.11364980787038803, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 146260 + }, + { + "epoch": 0.5567397212304834, + "grad_norm": 0.12746919691562653, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 146270 + }, + { + "epoch": 0.5567777836986062, + "grad_norm": 0.11526639014482498, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 146280 + }, + { + "epoch": 0.5568158461667289, + "grad_norm": 0.12240004539489746, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 146290 + }, + { + "epoch": 0.5568539086348515, + "grad_norm": 0.17326214909553528, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 146300 + }, + { + "epoch": 0.5568919711029742, + "grad_norm": 0.12917761504650116, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 146310 + }, + { + "epoch": 0.5569300335710969, + "grad_norm": 0.13682439923286438, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 146320 + }, + { + "epoch": 0.5569680960392196, + "grad_norm": 0.12188564985990524, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 146330 + }, + { + "epoch": 0.5570061585073423, + "grad_norm": 0.14334385097026825, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 146340 + }, + { + "epoch": 0.5570442209754649, + "grad_norm": 0.12378527969121933, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 146350 + }, + { + "epoch": 0.5570822834435876, + "grad_norm": 0.11779329925775528, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 146360 + }, + { + "epoch": 0.5571203459117103, + "grad_norm": 0.12320224195718765, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 146370 + }, + { + "epoch": 0.557158408379833, + "grad_norm": 0.13029901683330536, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 146380 + }, + { + "epoch": 0.5571964708479556, + "grad_norm": 0.12013163417577744, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 146390 + }, + { + "epoch": 0.5572345333160783, + "grad_norm": 0.11994480341672897, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 146400 + }, + { + "epoch": 0.5572725957842011, + "grad_norm": 0.1187630295753479, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 146410 + }, + { + "epoch": 0.5573106582523237, + "grad_norm": 0.1293669044971466, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 146420 + }, + { + "epoch": 0.5573487207204464, + "grad_norm": 0.11685261875391006, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 146430 + }, + { + "epoch": 0.557386783188569, + "grad_norm": 0.12134901434183121, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 146440 + }, + { + "epoch": 0.5574248456566918, + "grad_norm": 0.13090312480926514, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 146450 + }, + { + "epoch": 0.5574629081248145, + "grad_norm": 0.12619557976722717, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 146460 + }, + { + "epoch": 0.5575009705929371, + "grad_norm": 0.13939683139324188, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 146470 + }, + { + "epoch": 0.5575390330610598, + "grad_norm": 0.1275961995124817, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 146480 + }, + { + "epoch": 0.5575770955291824, + "grad_norm": 0.13163341581821442, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 146490 + }, + { + "epoch": 0.5576151579973052, + "grad_norm": 0.1348702758550644, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 146500 + }, + { + "epoch": 0.5576532204654279, + "grad_norm": 0.12150271236896515, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 146510 + }, + { + "epoch": 0.5576912829335505, + "grad_norm": 0.12395389378070831, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 146520 + }, + { + "epoch": 0.5577293454016732, + "grad_norm": 0.1396559327840805, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 146530 + }, + { + "epoch": 0.557767407869796, + "grad_norm": 0.1329929679632187, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 146540 + }, + { + "epoch": 0.5578054703379186, + "grad_norm": 0.12346814572811127, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 146550 + }, + { + "epoch": 0.5578435328060413, + "grad_norm": 0.12914222478866577, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 146560 + }, + { + "epoch": 0.5578815952741639, + "grad_norm": 0.14826074242591858, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 146570 + }, + { + "epoch": 0.5579196577422867, + "grad_norm": 0.13070404529571533, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 146580 + }, + { + "epoch": 0.5579577202104093, + "grad_norm": 0.12242259830236435, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 146590 + }, + { + "epoch": 0.557995782678532, + "grad_norm": 0.12169618159532547, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 146600 + }, + { + "epoch": 0.5580338451466547, + "grad_norm": 0.1209753081202507, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 146610 + }, + { + "epoch": 0.5580719076147773, + "grad_norm": 0.12649326026439667, + "learning_rate": 0.0005, + "loss": 2.1332, + "step": 146620 + }, + { + "epoch": 0.5581099700829001, + "grad_norm": 0.12578131258487701, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 146630 + }, + { + "epoch": 0.5581480325510227, + "grad_norm": 0.12273520231246948, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 146640 + }, + { + "epoch": 0.5581860950191454, + "grad_norm": 0.12749134004116058, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 146650 + }, + { + "epoch": 0.5582241574872681, + "grad_norm": 0.11976244300603867, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 146660 + }, + { + "epoch": 0.5582622199553908, + "grad_norm": 0.12862452864646912, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 146670 + }, + { + "epoch": 0.5583002824235135, + "grad_norm": 0.13949202001094818, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 146680 + }, + { + "epoch": 0.5583383448916361, + "grad_norm": 0.13123486936092377, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 146690 + }, + { + "epoch": 0.5583764073597588, + "grad_norm": 0.1350037157535553, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 146700 + }, + { + "epoch": 0.5584144698278816, + "grad_norm": 0.1257314532995224, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 146710 + }, + { + "epoch": 0.5584525322960042, + "grad_norm": 0.12179521471261978, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 146720 + }, + { + "epoch": 0.5584905947641269, + "grad_norm": 0.11762476712465286, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 146730 + }, + { + "epoch": 0.5585286572322495, + "grad_norm": 0.13207073509693146, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 146740 + }, + { + "epoch": 0.5585667197003723, + "grad_norm": 0.12246715277433395, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 146750 + }, + { + "epoch": 0.558604782168495, + "grad_norm": 0.1236710399389267, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 146760 + }, + { + "epoch": 0.5586428446366176, + "grad_norm": 0.1190788596868515, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 146770 + }, + { + "epoch": 0.5586809071047403, + "grad_norm": 0.11813774704933167, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 146780 + }, + { + "epoch": 0.5587189695728629, + "grad_norm": 0.13377505540847778, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 146790 + }, + { + "epoch": 0.5587570320409857, + "grad_norm": 0.12410465627908707, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 146800 + }, + { + "epoch": 0.5587950945091084, + "grad_norm": 0.1365984082221985, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 146810 + }, + { + "epoch": 0.558833156977231, + "grad_norm": 0.12833617627620697, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 146820 + }, + { + "epoch": 0.5588712194453537, + "grad_norm": 0.12997734546661377, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 146830 + }, + { + "epoch": 0.5589092819134764, + "grad_norm": 0.11692407727241516, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 146840 + }, + { + "epoch": 0.5589473443815991, + "grad_norm": 0.1247381642460823, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 146850 + }, + { + "epoch": 0.5589854068497218, + "grad_norm": 0.12471222877502441, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 146860 + }, + { + "epoch": 0.5590234693178444, + "grad_norm": 0.1211012452840805, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 146870 + }, + { + "epoch": 0.5590615317859672, + "grad_norm": 0.13737298548221588, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 146880 + }, + { + "epoch": 0.5590995942540898, + "grad_norm": 0.13684974610805511, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 146890 + }, + { + "epoch": 0.5591376567222125, + "grad_norm": 0.12505057454109192, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 146900 + }, + { + "epoch": 0.5591757191903352, + "grad_norm": 0.1270734667778015, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 146910 + }, + { + "epoch": 0.5592137816584578, + "grad_norm": 0.13607655465602875, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 146920 + }, + { + "epoch": 0.5592518441265806, + "grad_norm": 0.13621363043785095, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 146930 + }, + { + "epoch": 0.5592899065947032, + "grad_norm": 0.13767369091510773, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 146940 + }, + { + "epoch": 0.5593279690628259, + "grad_norm": 0.12254630774259567, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 146950 + }, + { + "epoch": 0.5593660315309485, + "grad_norm": 0.1332453489303589, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 146960 + }, + { + "epoch": 0.5594040939990713, + "grad_norm": 0.1357196569442749, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 146970 + }, + { + "epoch": 0.559442156467194, + "grad_norm": 0.1272253692150116, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 146980 + }, + { + "epoch": 0.5594802189353166, + "grad_norm": 0.11661429703235626, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 146990 + }, + { + "epoch": 0.5595182814034393, + "grad_norm": 0.14878034591674805, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 147000 + }, + { + "epoch": 0.5595563438715621, + "grad_norm": 0.12362154573202133, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 147010 + }, + { + "epoch": 0.5595944063396847, + "grad_norm": 0.1282384991645813, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 147020 + }, + { + "epoch": 0.5596324688078074, + "grad_norm": 0.13063131272792816, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 147030 + }, + { + "epoch": 0.55967053127593, + "grad_norm": 0.1230950877070427, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 147040 + }, + { + "epoch": 0.5597085937440527, + "grad_norm": 0.12868686020374298, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 147050 + }, + { + "epoch": 0.5597466562121755, + "grad_norm": 0.11605887115001678, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 147060 + }, + { + "epoch": 0.5597847186802981, + "grad_norm": 0.12333370745182037, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 147070 + }, + { + "epoch": 0.5598227811484208, + "grad_norm": 0.13515153527259827, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 147080 + }, + { + "epoch": 0.5598608436165434, + "grad_norm": 0.12148472666740417, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 147090 + }, + { + "epoch": 0.5598989060846662, + "grad_norm": 0.13153775036334991, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 147100 + }, + { + "epoch": 0.5599369685527888, + "grad_norm": 0.2740803360939026, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 147110 + }, + { + "epoch": 0.5599750310209115, + "grad_norm": 0.13909249007701874, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 147120 + }, + { + "epoch": 0.5600130934890342, + "grad_norm": 0.12917034327983856, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 147130 + }, + { + "epoch": 0.5600511559571569, + "grad_norm": 0.12254467606544495, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 147140 + }, + { + "epoch": 0.5600892184252796, + "grad_norm": 0.12786157429218292, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 147150 + }, + { + "epoch": 0.5601272808934022, + "grad_norm": 0.12340975552797318, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 147160 + }, + { + "epoch": 0.5601653433615249, + "grad_norm": 0.12096070498228073, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 147170 + }, + { + "epoch": 0.5602034058296477, + "grad_norm": 0.11434274911880493, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 147180 + }, + { + "epoch": 0.5602414682977703, + "grad_norm": 0.1330031007528305, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 147190 + }, + { + "epoch": 0.560279530765893, + "grad_norm": 0.12533064186573029, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 147200 + }, + { + "epoch": 0.5603175932340156, + "grad_norm": 0.12945376336574554, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 147210 + }, + { + "epoch": 0.5603556557021383, + "grad_norm": 0.12761011719703674, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 147220 + }, + { + "epoch": 0.5603937181702611, + "grad_norm": 0.1295965164899826, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 147230 + }, + { + "epoch": 0.5604317806383837, + "grad_norm": 0.12540018558502197, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 147240 + }, + { + "epoch": 0.5604698431065064, + "grad_norm": 0.11819574236869812, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 147250 + }, + { + "epoch": 0.560507905574629, + "grad_norm": 0.12074295431375504, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 147260 + }, + { + "epoch": 0.5605459680427518, + "grad_norm": 0.1224203109741211, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 147270 + }, + { + "epoch": 0.5605840305108745, + "grad_norm": 0.11983948200941086, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 147280 + }, + { + "epoch": 0.5606220929789971, + "grad_norm": 0.12321861833333969, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 147290 + }, + { + "epoch": 0.5606601554471198, + "grad_norm": 0.1276361495256424, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 147300 + }, + { + "epoch": 0.5606982179152425, + "grad_norm": 0.1272909790277481, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 147310 + }, + { + "epoch": 0.5607362803833652, + "grad_norm": 0.1353839486837387, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 147320 + }, + { + "epoch": 0.5607743428514879, + "grad_norm": 0.11415134370326996, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 147330 + }, + { + "epoch": 0.5608124053196105, + "grad_norm": 0.13879957795143127, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 147340 + }, + { + "epoch": 0.5608504677877332, + "grad_norm": 0.11635793745517731, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 147350 + }, + { + "epoch": 0.5608885302558559, + "grad_norm": 0.12324373424053192, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 147360 + }, + { + "epoch": 0.5609265927239786, + "grad_norm": 0.12618057429790497, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 147370 + }, + { + "epoch": 0.5609646551921013, + "grad_norm": 0.13643816113471985, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 147380 + }, + { + "epoch": 0.5610027176602239, + "grad_norm": 0.11734917014837265, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 147390 + }, + { + "epoch": 0.5610407801283467, + "grad_norm": 0.12413015961647034, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 147400 + }, + { + "epoch": 0.5610788425964693, + "grad_norm": 0.11574844270944595, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 147410 + }, + { + "epoch": 0.561116905064592, + "grad_norm": 0.11728766560554504, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 147420 + }, + { + "epoch": 0.5611549675327147, + "grad_norm": 0.1308804303407669, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 147430 + }, + { + "epoch": 0.5611930300008374, + "grad_norm": 0.12309185415506363, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 147440 + }, + { + "epoch": 0.5612310924689601, + "grad_norm": 0.13514117896556854, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 147450 + }, + { + "epoch": 0.5612691549370827, + "grad_norm": 0.14108632504940033, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 147460 + }, + { + "epoch": 0.5613072174052054, + "grad_norm": 0.12130303680896759, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 147470 + }, + { + "epoch": 0.561345279873328, + "grad_norm": 0.124043770134449, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 147480 + }, + { + "epoch": 0.5613833423414508, + "grad_norm": 0.12163038551807404, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 147490 + }, + { + "epoch": 0.5614214048095735, + "grad_norm": 0.11444005370140076, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 147500 + }, + { + "epoch": 0.5614594672776961, + "grad_norm": 0.12023451179265976, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 147510 + }, + { + "epoch": 0.5614975297458188, + "grad_norm": 0.12952552735805511, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 147520 + }, + { + "epoch": 0.5615355922139416, + "grad_norm": 0.14416095614433289, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 147530 + }, + { + "epoch": 0.5615736546820642, + "grad_norm": 0.14359638094902039, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 147540 + }, + { + "epoch": 0.5616117171501869, + "grad_norm": 0.13709759712219238, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 147550 + }, + { + "epoch": 0.5616497796183095, + "grad_norm": 0.11415136605501175, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 147560 + }, + { + "epoch": 0.5616878420864323, + "grad_norm": 0.13422194123268127, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 147570 + }, + { + "epoch": 0.561725904554555, + "grad_norm": 0.12092873454093933, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 147580 + }, + { + "epoch": 0.5617639670226776, + "grad_norm": 0.12244142591953278, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 147590 + }, + { + "epoch": 0.5618020294908003, + "grad_norm": 0.12528356909751892, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 147600 + }, + { + "epoch": 0.561840091958923, + "grad_norm": 0.12519213557243347, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 147610 + }, + { + "epoch": 0.5618781544270457, + "grad_norm": 0.13673320412635803, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 147620 + }, + { + "epoch": 0.5619162168951684, + "grad_norm": 0.12346883863210678, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 147630 + }, + { + "epoch": 0.561954279363291, + "grad_norm": 0.1161339059472084, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 147640 + }, + { + "epoch": 0.5619923418314137, + "grad_norm": 0.11981845647096634, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 147650 + }, + { + "epoch": 0.5620304042995364, + "grad_norm": 0.12061472982168198, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 147660 + }, + { + "epoch": 0.5620684667676591, + "grad_norm": 0.12434431165456772, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 147670 + }, + { + "epoch": 0.5621065292357817, + "grad_norm": 0.11918957531452179, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 147680 + }, + { + "epoch": 0.5621445917039044, + "grad_norm": 0.12601889669895172, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 147690 + }, + { + "epoch": 0.5621826541720272, + "grad_norm": 0.12405750900506973, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 147700 + }, + { + "epoch": 0.5622207166401498, + "grad_norm": 0.14284063875675201, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 147710 + }, + { + "epoch": 0.5622587791082725, + "grad_norm": 0.11972171068191528, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 147720 + }, + { + "epoch": 0.5622968415763951, + "grad_norm": 0.1217430904507637, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 147730 + }, + { + "epoch": 0.5623349040445179, + "grad_norm": 0.11169090121984482, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 147740 + }, + { + "epoch": 0.5623729665126406, + "grad_norm": 0.1246361956000328, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 147750 + }, + { + "epoch": 0.5624110289807632, + "grad_norm": 0.11976980417966843, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 147760 + }, + { + "epoch": 0.5624490914488859, + "grad_norm": 0.14023859798908234, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 147770 + }, + { + "epoch": 0.5624871539170085, + "grad_norm": 0.12163596600294113, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 147780 + }, + { + "epoch": 0.5625252163851313, + "grad_norm": 0.11780829727649689, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 147790 + }, + { + "epoch": 0.562563278853254, + "grad_norm": 0.12326356023550034, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 147800 + }, + { + "epoch": 0.5626013413213766, + "grad_norm": 0.13496778905391693, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 147810 + }, + { + "epoch": 0.5626394037894993, + "grad_norm": 0.12595048546791077, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 147820 + }, + { + "epoch": 0.562677466257622, + "grad_norm": 0.13264413177967072, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 147830 + }, + { + "epoch": 0.5627155287257447, + "grad_norm": 0.13527928292751312, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 147840 + }, + { + "epoch": 0.5627535911938674, + "grad_norm": 0.12819808721542358, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 147850 + }, + { + "epoch": 0.56279165366199, + "grad_norm": 0.12501229345798492, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 147860 + }, + { + "epoch": 0.5628297161301128, + "grad_norm": 0.12441595643758774, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 147870 + }, + { + "epoch": 0.5628677785982354, + "grad_norm": 0.12710030376911163, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 147880 + }, + { + "epoch": 0.5629058410663581, + "grad_norm": 0.14186739921569824, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 147890 + }, + { + "epoch": 0.5629439035344808, + "grad_norm": 0.14794635772705078, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 147900 + }, + { + "epoch": 0.5629819660026034, + "grad_norm": 0.13675430417060852, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 147910 + }, + { + "epoch": 0.5630200284707262, + "grad_norm": 0.11546150594949722, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 147920 + }, + { + "epoch": 0.5630580909388488, + "grad_norm": 0.1285984367132187, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 147930 + }, + { + "epoch": 0.5630961534069715, + "grad_norm": 0.12132029980421066, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 147940 + }, + { + "epoch": 0.5631342158750942, + "grad_norm": 0.12189318239688873, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 147950 + }, + { + "epoch": 0.5631722783432169, + "grad_norm": 0.11721985042095184, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 147960 + }, + { + "epoch": 0.5632103408113396, + "grad_norm": 0.11647754162549973, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 147970 + }, + { + "epoch": 0.5632484032794622, + "grad_norm": 0.12019306421279907, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 147980 + }, + { + "epoch": 0.5632864657475849, + "grad_norm": 0.12273525446653366, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 147990 + }, + { + "epoch": 0.5633245282157077, + "grad_norm": 0.13190962374210358, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 148000 + }, + { + "epoch": 0.5633625906838303, + "grad_norm": 0.12351113557815552, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 148010 + }, + { + "epoch": 0.563400653151953, + "grad_norm": 0.1404343992471695, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 148020 + }, + { + "epoch": 0.5634387156200756, + "grad_norm": 0.11794932931661606, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 148030 + }, + { + "epoch": 0.5634767780881984, + "grad_norm": 0.13164691627025604, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 148040 + }, + { + "epoch": 0.5635148405563211, + "grad_norm": 0.10908240079879761, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 148050 + }, + { + "epoch": 0.5635529030244437, + "grad_norm": 0.12172287702560425, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 148060 + }, + { + "epoch": 0.5635909654925664, + "grad_norm": 0.12808212637901306, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 148070 + }, + { + "epoch": 0.563629027960689, + "grad_norm": 0.11926170438528061, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 148080 + }, + { + "epoch": 0.5636670904288118, + "grad_norm": 0.13597628474235535, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 148090 + }, + { + "epoch": 0.5637051528969345, + "grad_norm": 0.1296033412218094, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 148100 + }, + { + "epoch": 0.5637432153650571, + "grad_norm": 0.12396147847175598, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 148110 + }, + { + "epoch": 0.5637812778331798, + "grad_norm": 0.11653433740139008, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 148120 + }, + { + "epoch": 0.5638193403013025, + "grad_norm": 0.14345014095306396, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 148130 + }, + { + "epoch": 0.5638574027694252, + "grad_norm": 0.11987555772066116, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 148140 + }, + { + "epoch": 0.5638954652375479, + "grad_norm": 0.12629404664039612, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 148150 + }, + { + "epoch": 0.5639335277056705, + "grad_norm": 0.12913820147514343, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 148160 + }, + { + "epoch": 0.5639715901737933, + "grad_norm": 0.1157715693116188, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 148170 + }, + { + "epoch": 0.5640096526419159, + "grad_norm": 0.12015217542648315, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 148180 + }, + { + "epoch": 0.5640477151100386, + "grad_norm": 0.12450938671827316, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 148190 + }, + { + "epoch": 0.5640857775781613, + "grad_norm": 0.12859109044075012, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 148200 + }, + { + "epoch": 0.5641238400462839, + "grad_norm": 0.11546629667282104, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 148210 + }, + { + "epoch": 0.5641619025144067, + "grad_norm": 0.1237310841679573, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 148220 + }, + { + "epoch": 0.5641999649825293, + "grad_norm": 0.1317415088415146, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 148230 + }, + { + "epoch": 0.564238027450652, + "grad_norm": 0.12628450989723206, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 148240 + }, + { + "epoch": 0.5642760899187746, + "grad_norm": 0.12909241020679474, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 148250 + }, + { + "epoch": 0.5643141523868974, + "grad_norm": 0.13340359926223755, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 148260 + }, + { + "epoch": 0.5643522148550201, + "grad_norm": 0.14159567654132843, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 148270 + }, + { + "epoch": 0.5643902773231427, + "grad_norm": 0.13016550242900848, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 148280 + }, + { + "epoch": 0.5644283397912654, + "grad_norm": 0.11598911881446838, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 148290 + }, + { + "epoch": 0.5644664022593882, + "grad_norm": 0.12047997862100601, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 148300 + }, + { + "epoch": 0.5645044647275108, + "grad_norm": 0.12963908910751343, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 148310 + }, + { + "epoch": 0.5645425271956335, + "grad_norm": 0.12290691584348679, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 148320 + }, + { + "epoch": 0.5645805896637561, + "grad_norm": 0.11939645558595657, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 148330 + }, + { + "epoch": 0.5646186521318788, + "grad_norm": 0.11704593151807785, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 148340 + }, + { + "epoch": 0.5646567146000016, + "grad_norm": 0.12034741789102554, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 148350 + }, + { + "epoch": 0.5646947770681242, + "grad_norm": 0.12463753670454025, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 148360 + }, + { + "epoch": 0.5647328395362469, + "grad_norm": 0.12364073097705841, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 148370 + }, + { + "epoch": 0.5647709020043695, + "grad_norm": 0.11811131983995438, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 148380 + }, + { + "epoch": 0.5648089644724923, + "grad_norm": 0.12201013416051865, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 148390 + }, + { + "epoch": 0.564847026940615, + "grad_norm": 0.11526204645633698, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 148400 + }, + { + "epoch": 0.5648850894087376, + "grad_norm": 0.6108805537223816, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 148410 + }, + { + "epoch": 0.5649231518768603, + "grad_norm": 0.12102359533309937, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 148420 + }, + { + "epoch": 0.564961214344983, + "grad_norm": 0.12056120485067368, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 148430 + }, + { + "epoch": 0.5649992768131057, + "grad_norm": 0.1187172681093216, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 148440 + }, + { + "epoch": 0.5650373392812283, + "grad_norm": 0.11827126145362854, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 148450 + }, + { + "epoch": 0.565075401749351, + "grad_norm": 0.11755429953336716, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 148460 + }, + { + "epoch": 0.5651134642174738, + "grad_norm": 0.12420319765806198, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 148470 + }, + { + "epoch": 0.5651515266855964, + "grad_norm": 0.1221015527844429, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 148480 + }, + { + "epoch": 0.5651895891537191, + "grad_norm": 0.14070142805576324, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 148490 + }, + { + "epoch": 0.5652276516218417, + "grad_norm": 0.12251606583595276, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 148500 + }, + { + "epoch": 0.5652657140899644, + "grad_norm": 0.11810902506113052, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 148510 + }, + { + "epoch": 0.5653037765580872, + "grad_norm": 0.13781926035881042, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 148520 + }, + { + "epoch": 0.5653418390262098, + "grad_norm": 0.1113312691450119, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 148530 + }, + { + "epoch": 0.5653799014943325, + "grad_norm": 0.12936291098594666, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 148540 + }, + { + "epoch": 0.5654179639624551, + "grad_norm": 0.1260511428117752, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 148550 + }, + { + "epoch": 0.5654560264305779, + "grad_norm": 0.12644599378108978, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 148560 + }, + { + "epoch": 0.5654940888987006, + "grad_norm": 0.11645796149969101, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 148570 + }, + { + "epoch": 0.5655321513668232, + "grad_norm": 0.1230463981628418, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 148580 + }, + { + "epoch": 0.5655702138349459, + "grad_norm": 0.128582164645195, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 148590 + }, + { + "epoch": 0.5656082763030686, + "grad_norm": 0.11796011030673981, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 148600 + }, + { + "epoch": 0.5656463387711913, + "grad_norm": 0.12276263535022736, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 148610 + }, + { + "epoch": 0.565684401239314, + "grad_norm": 0.12811756134033203, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 148620 + }, + { + "epoch": 0.5657224637074366, + "grad_norm": 0.12802930176258087, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 148630 + }, + { + "epoch": 0.5657605261755593, + "grad_norm": 0.12110596895217896, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 148640 + }, + { + "epoch": 0.565798588643682, + "grad_norm": 0.12279154360294342, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 148650 + }, + { + "epoch": 0.5658366511118047, + "grad_norm": 0.1164965108036995, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 148660 + }, + { + "epoch": 0.5658747135799274, + "grad_norm": 0.12058350443840027, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 148670 + }, + { + "epoch": 0.56591277604805, + "grad_norm": 0.12220282107591629, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 148680 + }, + { + "epoch": 0.5659508385161728, + "grad_norm": 0.11955209821462631, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 148690 + }, + { + "epoch": 0.5659889009842954, + "grad_norm": 0.12738743424415588, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 148700 + }, + { + "epoch": 0.5660269634524181, + "grad_norm": 0.13051724433898926, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 148710 + }, + { + "epoch": 0.5660650259205408, + "grad_norm": 0.12039772421121597, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 148720 + }, + { + "epoch": 0.5661030883886635, + "grad_norm": 0.11901012808084488, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 148730 + }, + { + "epoch": 0.5661411508567862, + "grad_norm": 0.11077872663736343, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 148740 + }, + { + "epoch": 0.5661792133249088, + "grad_norm": 0.14071735739707947, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 148750 + }, + { + "epoch": 0.5662172757930315, + "grad_norm": 0.14127027988433838, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 148760 + }, + { + "epoch": 0.5662553382611542, + "grad_norm": 0.1209738478064537, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 148770 + }, + { + "epoch": 0.5662934007292769, + "grad_norm": 0.13998672366142273, + "learning_rate": 0.0005, + "loss": 2.1387, + "step": 148780 + }, + { + "epoch": 0.5663314631973996, + "grad_norm": 0.14784862101078033, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 148790 + }, + { + "epoch": 0.5663695256655222, + "grad_norm": 0.11674334108829498, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 148800 + }, + { + "epoch": 0.5664075881336449, + "grad_norm": 0.1463528871536255, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 148810 + }, + { + "epoch": 0.5664456506017677, + "grad_norm": 0.1308635026216507, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 148820 + }, + { + "epoch": 0.5664837130698903, + "grad_norm": 0.12974272668361664, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 148830 + }, + { + "epoch": 0.566521775538013, + "grad_norm": 0.1228189468383789, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 148840 + }, + { + "epoch": 0.5665598380061356, + "grad_norm": 0.1328430473804474, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 148850 + }, + { + "epoch": 0.5665979004742584, + "grad_norm": 0.12181210517883301, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 148860 + }, + { + "epoch": 0.566635962942381, + "grad_norm": 0.13458064198493958, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 148870 + }, + { + "epoch": 0.5666740254105037, + "grad_norm": 0.12336578220129013, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 148880 + }, + { + "epoch": 0.5667120878786264, + "grad_norm": 0.11937737464904785, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 148890 + }, + { + "epoch": 0.5667501503467491, + "grad_norm": 0.13529595732688904, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 148900 + }, + { + "epoch": 0.5667882128148718, + "grad_norm": 0.1208646222949028, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 148910 + }, + { + "epoch": 0.5668262752829945, + "grad_norm": 0.13353469967842102, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 148920 + }, + { + "epoch": 0.5668643377511171, + "grad_norm": 0.13542310893535614, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 148930 + }, + { + "epoch": 0.5669024002192398, + "grad_norm": 0.11974187195301056, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 148940 + }, + { + "epoch": 0.5669404626873625, + "grad_norm": 0.11809533834457397, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 148950 + }, + { + "epoch": 0.5669785251554852, + "grad_norm": 0.12082237005233765, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 148960 + }, + { + "epoch": 0.5670165876236078, + "grad_norm": 0.13240136206150055, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 148970 + }, + { + "epoch": 0.5670546500917305, + "grad_norm": 0.13582897186279297, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 148980 + }, + { + "epoch": 0.5670927125598533, + "grad_norm": 0.12459953874349594, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 148990 + }, + { + "epoch": 0.5671307750279759, + "grad_norm": 0.13695721328258514, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 149000 + }, + { + "epoch": 0.5671688374960986, + "grad_norm": 0.12011035531759262, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 149010 + }, + { + "epoch": 0.5672068999642212, + "grad_norm": 0.12467087060213089, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 149020 + }, + { + "epoch": 0.567244962432344, + "grad_norm": 0.1225501075387001, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 149030 + }, + { + "epoch": 0.5672830249004667, + "grad_norm": 0.1462046056985855, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 149040 + }, + { + "epoch": 0.5673210873685893, + "grad_norm": 0.1293679028749466, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 149050 + }, + { + "epoch": 0.567359149836712, + "grad_norm": 0.13014128804206848, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 149060 + }, + { + "epoch": 0.5673972123048346, + "grad_norm": 0.13542695343494415, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 149070 + }, + { + "epoch": 0.5674352747729574, + "grad_norm": 0.12484169751405716, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 149080 + }, + { + "epoch": 0.5674733372410801, + "grad_norm": 0.1358722299337387, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 149090 + }, + { + "epoch": 0.5675113997092027, + "grad_norm": 0.12429749220609665, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 149100 + }, + { + "epoch": 0.5675494621773254, + "grad_norm": 0.12064723670482635, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 149110 + }, + { + "epoch": 0.5675875246454481, + "grad_norm": 0.13320963084697723, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 149120 + }, + { + "epoch": 0.5676255871135708, + "grad_norm": 0.12744002044200897, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 149130 + }, + { + "epoch": 0.5676636495816935, + "grad_norm": 0.12755407392978668, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 149140 + }, + { + "epoch": 0.5677017120498161, + "grad_norm": 0.13250961899757385, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 149150 + }, + { + "epoch": 0.5677397745179389, + "grad_norm": 0.1300535649061203, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 149160 + }, + { + "epoch": 0.5677778369860615, + "grad_norm": 0.11809708178043365, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 149170 + }, + { + "epoch": 0.5678158994541842, + "grad_norm": 0.11867474019527435, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 149180 + }, + { + "epoch": 0.5678539619223069, + "grad_norm": 0.13715608417987823, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 149190 + }, + { + "epoch": 0.5678920243904296, + "grad_norm": 0.12556719779968262, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 149200 + }, + { + "epoch": 0.5679300868585523, + "grad_norm": 0.12606534361839294, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 149210 + }, + { + "epoch": 0.5679681493266749, + "grad_norm": 0.13479286432266235, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 149220 + }, + { + "epoch": 0.5680062117947976, + "grad_norm": 0.12721773982048035, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 149230 + }, + { + "epoch": 0.5680442742629203, + "grad_norm": 0.12476199865341187, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 149240 + }, + { + "epoch": 0.568082336731043, + "grad_norm": 0.13572070002555847, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 149250 + }, + { + "epoch": 0.5681203991991657, + "grad_norm": 0.11942504346370697, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 149260 + }, + { + "epoch": 0.5681584616672883, + "grad_norm": 0.1328095942735672, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 149270 + }, + { + "epoch": 0.568196524135411, + "grad_norm": 0.12975753843784332, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 149280 + }, + { + "epoch": 0.5682345866035338, + "grad_norm": 0.12333561480045319, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 149290 + }, + { + "epoch": 0.5682726490716564, + "grad_norm": 0.1283501535654068, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 149300 + }, + { + "epoch": 0.5683107115397791, + "grad_norm": 0.11721879988908768, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 149310 + }, + { + "epoch": 0.5683487740079017, + "grad_norm": 0.11698118597269058, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 149320 + }, + { + "epoch": 0.5683868364760245, + "grad_norm": 0.11460036784410477, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 149330 + }, + { + "epoch": 0.5684248989441472, + "grad_norm": 0.12026268243789673, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 149340 + }, + { + "epoch": 0.5684629614122698, + "grad_norm": 0.12232708930969238, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 149350 + }, + { + "epoch": 0.5685010238803925, + "grad_norm": 0.12763771414756775, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 149360 + }, + { + "epoch": 0.5685390863485151, + "grad_norm": 0.12021525949239731, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 149370 + }, + { + "epoch": 0.5685771488166379, + "grad_norm": 0.11759550124406815, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 149380 + }, + { + "epoch": 0.5686152112847606, + "grad_norm": 0.12076186388731003, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 149390 + }, + { + "epoch": 0.5686532737528832, + "grad_norm": 0.12155576795339584, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 149400 + }, + { + "epoch": 0.5686913362210059, + "grad_norm": 0.11553096771240234, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 149410 + }, + { + "epoch": 0.5687293986891286, + "grad_norm": 0.11827583611011505, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 149420 + }, + { + "epoch": 0.5687674611572513, + "grad_norm": 0.12528096139431, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 149430 + }, + { + "epoch": 0.568805523625374, + "grad_norm": 0.12339337915182114, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 149440 + }, + { + "epoch": 0.5688435860934966, + "grad_norm": 0.12378998845815659, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 149450 + }, + { + "epoch": 0.5688816485616194, + "grad_norm": 0.14907345175743103, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 149460 + }, + { + "epoch": 0.568919711029742, + "grad_norm": 0.11773096024990082, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 149470 + }, + { + "epoch": 0.5689577734978647, + "grad_norm": 0.12182561308145523, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 149480 + }, + { + "epoch": 0.5689958359659874, + "grad_norm": 0.11594950407743454, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 149490 + }, + { + "epoch": 0.56903389843411, + "grad_norm": 0.1339421570301056, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 149500 + }, + { + "epoch": 0.5690719609022328, + "grad_norm": 0.1360144019126892, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 149510 + }, + { + "epoch": 0.5691100233703554, + "grad_norm": 0.21229194104671478, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 149520 + }, + { + "epoch": 0.5691480858384781, + "grad_norm": 0.11902911961078644, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 149530 + }, + { + "epoch": 0.5691861483066007, + "grad_norm": 0.1262020617723465, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 149540 + }, + { + "epoch": 0.5692242107747235, + "grad_norm": 0.130680650472641, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 149550 + }, + { + "epoch": 0.5692622732428462, + "grad_norm": 0.1344127506017685, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 149560 + }, + { + "epoch": 0.5693003357109688, + "grad_norm": 0.1387372463941574, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 149570 + }, + { + "epoch": 0.5693383981790915, + "grad_norm": 0.12453319132328033, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 149580 + }, + { + "epoch": 0.5693764606472143, + "grad_norm": 0.11809080839157104, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 149590 + }, + { + "epoch": 0.5694145231153369, + "grad_norm": 0.11634735763072968, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 149600 + }, + { + "epoch": 0.5694525855834596, + "grad_norm": 0.13118545711040497, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 149610 + }, + { + "epoch": 0.5694906480515822, + "grad_norm": 0.13438338041305542, + "learning_rate": 0.0005, + "loss": 2.1361, + "step": 149620 + }, + { + "epoch": 0.569528710519705, + "grad_norm": 0.12148258090019226, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 149630 + }, + { + "epoch": 0.5695667729878277, + "grad_norm": 0.1469801664352417, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 149640 + }, + { + "epoch": 0.5696048354559503, + "grad_norm": 0.11927749961614609, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 149650 + }, + { + "epoch": 0.569642897924073, + "grad_norm": 0.11742527037858963, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 149660 + }, + { + "epoch": 0.5696809603921956, + "grad_norm": 0.1266251653432846, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 149670 + }, + { + "epoch": 0.5697190228603184, + "grad_norm": 0.13379418849945068, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 149680 + }, + { + "epoch": 0.569757085328441, + "grad_norm": 0.120334193110466, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 149690 + }, + { + "epoch": 0.5697951477965637, + "grad_norm": 0.1235523521900177, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 149700 + }, + { + "epoch": 0.5698332102646864, + "grad_norm": 0.12247676402330399, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 149710 + }, + { + "epoch": 0.5698712727328091, + "grad_norm": 0.12162892520427704, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 149720 + }, + { + "epoch": 0.5699093352009318, + "grad_norm": 0.12016452848911285, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 149730 + }, + { + "epoch": 0.5699473976690544, + "grad_norm": 0.1280735582113266, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 149740 + }, + { + "epoch": 0.5699854601371771, + "grad_norm": 0.1145968809723854, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 149750 + }, + { + "epoch": 0.5700235226052999, + "grad_norm": 0.12235980480909348, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 149760 + }, + { + "epoch": 0.5700615850734225, + "grad_norm": 0.13232506811618805, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 149770 + }, + { + "epoch": 0.5700996475415452, + "grad_norm": 0.1253451555967331, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 149780 + }, + { + "epoch": 0.5701377100096678, + "grad_norm": 0.12646843492984772, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 149790 + }, + { + "epoch": 0.5701757724777905, + "grad_norm": 0.13931933045387268, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 149800 + }, + { + "epoch": 0.5702138349459133, + "grad_norm": 0.11552035808563232, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 149810 + }, + { + "epoch": 0.5702518974140359, + "grad_norm": 0.12433992326259613, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 149820 + }, + { + "epoch": 0.5702899598821586, + "grad_norm": 0.13327881693840027, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 149830 + }, + { + "epoch": 0.5703280223502812, + "grad_norm": 0.13314664363861084, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 149840 + }, + { + "epoch": 0.570366084818404, + "grad_norm": 0.906789243221283, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 149850 + }, + { + "epoch": 0.5704041472865267, + "grad_norm": 0.11846122145652771, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 149860 + }, + { + "epoch": 0.5704422097546493, + "grad_norm": 0.12619948387145996, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 149870 + }, + { + "epoch": 0.570480272222772, + "grad_norm": 0.11806615442037582, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 149880 + }, + { + "epoch": 0.5705183346908947, + "grad_norm": 0.1296517252922058, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 149890 + }, + { + "epoch": 0.5705563971590174, + "grad_norm": 0.12680789828300476, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 149900 + }, + { + "epoch": 0.5705944596271401, + "grad_norm": 0.12104959785938263, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 149910 + }, + { + "epoch": 0.5706325220952627, + "grad_norm": 0.12184807658195496, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 149920 + }, + { + "epoch": 0.5706705845633854, + "grad_norm": 0.12121890485286713, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 149930 + }, + { + "epoch": 0.5707086470315081, + "grad_norm": 0.13234730064868927, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 149940 + }, + { + "epoch": 0.5707467094996308, + "grad_norm": 0.13042353093624115, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 149950 + }, + { + "epoch": 0.5707847719677535, + "grad_norm": 0.12504348158836365, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 149960 + }, + { + "epoch": 0.5708228344358761, + "grad_norm": 0.1169368252158165, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 149970 + }, + { + "epoch": 0.5708608969039989, + "grad_norm": 0.13686639070510864, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 149980 + }, + { + "epoch": 0.5708989593721215, + "grad_norm": 0.12890039384365082, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 149990 + }, + { + "epoch": 0.5709370218402442, + "grad_norm": 0.13867227733135223, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 150000 + }, + { + "epoch": 0.5709750843083669, + "grad_norm": 0.1201515644788742, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 150010 + }, + { + "epoch": 0.5710131467764896, + "grad_norm": 0.13026781380176544, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 150020 + }, + { + "epoch": 0.5710512092446123, + "grad_norm": 0.1175355464220047, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 150030 + }, + { + "epoch": 0.5710892717127349, + "grad_norm": 0.11978862434625626, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 150040 + }, + { + "epoch": 0.5711273341808576, + "grad_norm": 0.12562698125839233, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 150050 + }, + { + "epoch": 0.5711653966489804, + "grad_norm": 0.14393503963947296, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 150060 + }, + { + "epoch": 0.571203459117103, + "grad_norm": 0.11995064467191696, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 150070 + }, + { + "epoch": 0.5712415215852257, + "grad_norm": 0.15165968239307404, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 150080 + }, + { + "epoch": 0.5712795840533483, + "grad_norm": 0.1265028566122055, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 150090 + }, + { + "epoch": 0.571317646521471, + "grad_norm": 0.13700735569000244, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 150100 + }, + { + "epoch": 0.5713557089895938, + "grad_norm": 0.11874990165233612, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 150110 + }, + { + "epoch": 0.5713937714577164, + "grad_norm": 0.1361815482378006, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 150120 + }, + { + "epoch": 0.5714318339258391, + "grad_norm": 0.1455676257610321, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 150130 + }, + { + "epoch": 0.5714698963939617, + "grad_norm": 0.1240849643945694, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 150140 + }, + { + "epoch": 0.5715079588620845, + "grad_norm": 0.1253289133310318, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 150150 + }, + { + "epoch": 0.5715460213302072, + "grad_norm": 0.11646021902561188, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 150160 + }, + { + "epoch": 0.5715840837983298, + "grad_norm": 0.12506163120269775, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 150170 + }, + { + "epoch": 0.5716221462664525, + "grad_norm": 0.12780094146728516, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 150180 + }, + { + "epoch": 0.5716602087345752, + "grad_norm": 0.13137106597423553, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 150190 + }, + { + "epoch": 0.5716982712026979, + "grad_norm": 0.1429845094680786, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 150200 + }, + { + "epoch": 0.5717363336708206, + "grad_norm": 0.1283874809741974, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 150210 + }, + { + "epoch": 0.5717743961389432, + "grad_norm": 0.13848842680454254, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 150220 + }, + { + "epoch": 0.5718124586070659, + "grad_norm": 0.1264006644487381, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 150230 + }, + { + "epoch": 0.5718505210751886, + "grad_norm": 0.12379968911409378, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 150240 + }, + { + "epoch": 0.5718885835433113, + "grad_norm": 0.11110774427652359, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 150250 + }, + { + "epoch": 0.571926646011434, + "grad_norm": 0.12956595420837402, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 150260 + }, + { + "epoch": 0.5719647084795566, + "grad_norm": 0.1179521456360817, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 150270 + }, + { + "epoch": 0.5720027709476794, + "grad_norm": 0.13738635182380676, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 150280 + }, + { + "epoch": 0.572040833415802, + "grad_norm": 0.13342367112636566, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 150290 + }, + { + "epoch": 0.5720788958839247, + "grad_norm": 0.12412730604410172, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 150300 + }, + { + "epoch": 0.5721169583520473, + "grad_norm": 0.1309875100851059, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 150310 + }, + { + "epoch": 0.5721550208201701, + "grad_norm": 0.1166936457157135, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 150320 + }, + { + "epoch": 0.5721930832882928, + "grad_norm": 0.12807297706604004, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 150330 + }, + { + "epoch": 0.5722311457564154, + "grad_norm": 0.12080655992031097, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 150340 + }, + { + "epoch": 0.5722692082245381, + "grad_norm": 0.12636420130729675, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 150350 + }, + { + "epoch": 0.5723072706926607, + "grad_norm": 0.12918971478939056, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 150360 + }, + { + "epoch": 0.5723453331607835, + "grad_norm": 0.1290965974330902, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 150370 + }, + { + "epoch": 0.5723833956289062, + "grad_norm": 0.12524893879890442, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 150380 + }, + { + "epoch": 0.5724214580970288, + "grad_norm": 0.12515844404697418, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 150390 + }, + { + "epoch": 0.5724595205651515, + "grad_norm": 0.1358836144208908, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 150400 + }, + { + "epoch": 0.5724975830332742, + "grad_norm": 0.13036629557609558, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 150410 + }, + { + "epoch": 0.5725356455013969, + "grad_norm": 0.12266593426465988, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 150420 + }, + { + "epoch": 0.5725737079695196, + "grad_norm": 0.12452898919582367, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 150430 + }, + { + "epoch": 0.5726117704376422, + "grad_norm": 0.152368426322937, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 150440 + }, + { + "epoch": 0.572649832905765, + "grad_norm": 0.13339364528656006, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 150450 + }, + { + "epoch": 0.5726878953738876, + "grad_norm": 0.12512865662574768, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 150460 + }, + { + "epoch": 0.5727259578420103, + "grad_norm": 0.12616005539894104, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 150470 + }, + { + "epoch": 0.572764020310133, + "grad_norm": 0.12529578804969788, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 150480 + }, + { + "epoch": 0.5728020827782557, + "grad_norm": 0.11737383157014847, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 150490 + }, + { + "epoch": 0.5728401452463784, + "grad_norm": 0.1222836822271347, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 150500 + }, + { + "epoch": 0.572878207714501, + "grad_norm": 0.1139824390411377, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 150510 + }, + { + "epoch": 0.5729162701826237, + "grad_norm": 0.11608431488275528, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 150520 + }, + { + "epoch": 0.5729543326507464, + "grad_norm": 0.136452317237854, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 150530 + }, + { + "epoch": 0.5729923951188691, + "grad_norm": 0.12363389134407043, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 150540 + }, + { + "epoch": 0.5730304575869918, + "grad_norm": 0.13430823385715485, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 150550 + }, + { + "epoch": 0.5730685200551144, + "grad_norm": 0.12809452414512634, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 150560 + }, + { + "epoch": 0.5731065825232371, + "grad_norm": 0.1187937781214714, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 150570 + }, + { + "epoch": 0.5731446449913599, + "grad_norm": 0.12435567378997803, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 150580 + }, + { + "epoch": 0.5731827074594825, + "grad_norm": 0.13549238443374634, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 150590 + }, + { + "epoch": 0.5732207699276052, + "grad_norm": 0.13395282626152039, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 150600 + }, + { + "epoch": 0.5732588323957278, + "grad_norm": 0.1308007836341858, + "learning_rate": 0.0005, + "loss": 2.0858, + "step": 150610 + }, + { + "epoch": 0.5732968948638506, + "grad_norm": 0.13134954869747162, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 150620 + }, + { + "epoch": 0.5733349573319733, + "grad_norm": 0.11117885261774063, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 150630 + }, + { + "epoch": 0.5733730198000959, + "grad_norm": 0.13431069254875183, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 150640 + }, + { + "epoch": 0.5734110822682186, + "grad_norm": 0.1294468343257904, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 150650 + }, + { + "epoch": 0.5734491447363412, + "grad_norm": 0.107212133705616, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 150660 + }, + { + "epoch": 0.573487207204464, + "grad_norm": 0.1195271760225296, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 150670 + }, + { + "epoch": 0.5735252696725867, + "grad_norm": 0.12692059576511383, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 150680 + }, + { + "epoch": 0.5735633321407093, + "grad_norm": 0.1342695653438568, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 150690 + }, + { + "epoch": 0.573601394608832, + "grad_norm": 0.12712764739990234, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 150700 + }, + { + "epoch": 0.5736394570769547, + "grad_norm": 0.12305767834186554, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 150710 + }, + { + "epoch": 0.5736775195450774, + "grad_norm": 0.11675681918859482, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 150720 + }, + { + "epoch": 0.5737155820132, + "grad_norm": 0.12459848076105118, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 150730 + }, + { + "epoch": 0.5737536444813227, + "grad_norm": 0.12979640066623688, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 150740 + }, + { + "epoch": 0.5737917069494455, + "grad_norm": 0.11147118359804153, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 150750 + }, + { + "epoch": 0.5738297694175681, + "grad_norm": 0.1222359836101532, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 150760 + }, + { + "epoch": 0.5738678318856908, + "grad_norm": 0.11825738102197647, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 150770 + }, + { + "epoch": 0.5739058943538135, + "grad_norm": 0.1274542659521103, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 150780 + }, + { + "epoch": 0.5739439568219361, + "grad_norm": 0.12503868341445923, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 150790 + }, + { + "epoch": 0.5739820192900589, + "grad_norm": 0.12383761256933212, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 150800 + }, + { + "epoch": 0.5740200817581815, + "grad_norm": 0.17884153127670288, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 150810 + }, + { + "epoch": 0.5740581442263042, + "grad_norm": 0.13055235147476196, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 150820 + }, + { + "epoch": 0.5740962066944268, + "grad_norm": 0.11566507816314697, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 150830 + }, + { + "epoch": 0.5741342691625496, + "grad_norm": 0.11708749830722809, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 150840 + }, + { + "epoch": 0.5741723316306723, + "grad_norm": 0.1316896378993988, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 150850 + }, + { + "epoch": 0.5742103940987949, + "grad_norm": 0.12467554956674576, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 150860 + }, + { + "epoch": 0.5742484565669176, + "grad_norm": 0.13408036530017853, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 150870 + }, + { + "epoch": 0.5742865190350404, + "grad_norm": 0.123175248503685, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 150880 + }, + { + "epoch": 0.574324581503163, + "grad_norm": 0.14702090620994568, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 150890 + }, + { + "epoch": 0.5743626439712857, + "grad_norm": 0.13698521256446838, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 150900 + }, + { + "epoch": 0.5744007064394083, + "grad_norm": 0.12514759600162506, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 150910 + }, + { + "epoch": 0.5744387689075311, + "grad_norm": 0.1253451555967331, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 150920 + }, + { + "epoch": 0.5744768313756538, + "grad_norm": 0.1406048834323883, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 150930 + }, + { + "epoch": 0.5745148938437764, + "grad_norm": 0.11642050743103027, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 150940 + }, + { + "epoch": 0.5745529563118991, + "grad_norm": 0.1270553022623062, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 150950 + }, + { + "epoch": 0.5745910187800217, + "grad_norm": 0.11792125552892685, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 150960 + }, + { + "epoch": 0.5746290812481445, + "grad_norm": 0.10952485352754593, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 150970 + }, + { + "epoch": 0.5746671437162671, + "grad_norm": 0.12044594436883926, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 150980 + }, + { + "epoch": 0.5747052061843898, + "grad_norm": 0.13403725624084473, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 150990 + }, + { + "epoch": 0.5747432686525125, + "grad_norm": 0.1265270859003067, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 151000 + }, + { + "epoch": 0.5747813311206352, + "grad_norm": 0.12403400987386703, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 151010 + }, + { + "epoch": 0.5748193935887579, + "grad_norm": 0.14083850383758545, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 151020 + }, + { + "epoch": 0.5748574560568805, + "grad_norm": 0.12334656715393066, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 151030 + }, + { + "epoch": 0.5748955185250032, + "grad_norm": 0.1339842528104782, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 151040 + }, + { + "epoch": 0.574933580993126, + "grad_norm": 0.12079505622386932, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 151050 + }, + { + "epoch": 0.5749716434612486, + "grad_norm": 0.11383721977472305, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 151060 + }, + { + "epoch": 0.5750097059293713, + "grad_norm": 0.142286479473114, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 151070 + }, + { + "epoch": 0.5750477683974939, + "grad_norm": 0.12102207541465759, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 151080 + }, + { + "epoch": 0.5750858308656166, + "grad_norm": 0.12138934433460236, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 151090 + }, + { + "epoch": 0.5751238933337394, + "grad_norm": 0.11563186347484589, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 151100 + }, + { + "epoch": 0.575161955801862, + "grad_norm": 0.11683789640665054, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 151110 + }, + { + "epoch": 0.5752000182699847, + "grad_norm": 0.1332668960094452, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 151120 + }, + { + "epoch": 0.5752380807381073, + "grad_norm": 0.134083092212677, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 151130 + }, + { + "epoch": 0.5752761432062301, + "grad_norm": 0.12192686647176743, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 151140 + }, + { + "epoch": 0.5753142056743528, + "grad_norm": 0.11395518481731415, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 151150 + }, + { + "epoch": 0.5753522681424754, + "grad_norm": 0.126944437623024, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 151160 + }, + { + "epoch": 0.5753903306105981, + "grad_norm": 0.132183238863945, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 151170 + }, + { + "epoch": 0.5754283930787208, + "grad_norm": 0.12324900925159454, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 151180 + }, + { + "epoch": 0.5754664555468435, + "grad_norm": 0.1156749501824379, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 151190 + }, + { + "epoch": 0.5755045180149662, + "grad_norm": 0.12462375313043594, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 151200 + }, + { + "epoch": 0.5755425804830888, + "grad_norm": 0.12586110830307007, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 151210 + }, + { + "epoch": 0.5755806429512115, + "grad_norm": 0.13233725726604462, + "learning_rate": 0.0005, + "loss": 2.133, + "step": 151220 + }, + { + "epoch": 0.5756187054193342, + "grad_norm": 0.14042454957962036, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 151230 + }, + { + "epoch": 0.5756567678874569, + "grad_norm": 0.11982891708612442, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 151240 + }, + { + "epoch": 0.5756948303555796, + "grad_norm": 0.12568190693855286, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 151250 + }, + { + "epoch": 0.5757328928237022, + "grad_norm": 0.11588189005851746, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 151260 + }, + { + "epoch": 0.575770955291825, + "grad_norm": 0.1222311481833458, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 151270 + }, + { + "epoch": 0.5758090177599476, + "grad_norm": 0.12930968403816223, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 151280 + }, + { + "epoch": 0.5758470802280703, + "grad_norm": 0.12175658345222473, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 151290 + }, + { + "epoch": 0.575885142696193, + "grad_norm": 0.11445169150829315, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 151300 + }, + { + "epoch": 0.5759232051643157, + "grad_norm": 0.12847721576690674, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 151310 + }, + { + "epoch": 0.5759612676324384, + "grad_norm": 0.12533120810985565, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 151320 + }, + { + "epoch": 0.575999330100561, + "grad_norm": 0.12310317158699036, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 151330 + }, + { + "epoch": 0.5760373925686837, + "grad_norm": 0.12116267532110214, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 151340 + }, + { + "epoch": 0.5760754550368065, + "grad_norm": 0.12053951621055603, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 151350 + }, + { + "epoch": 0.5761135175049291, + "grad_norm": 0.11923123896121979, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 151360 + }, + { + "epoch": 0.5761515799730518, + "grad_norm": 0.13338451087474823, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 151370 + }, + { + "epoch": 0.5761896424411744, + "grad_norm": 0.12539881467819214, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 151380 + }, + { + "epoch": 0.5762277049092971, + "grad_norm": 0.15315894782543182, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 151390 + }, + { + "epoch": 0.5762657673774199, + "grad_norm": 0.1272190511226654, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 151400 + }, + { + "epoch": 0.5763038298455425, + "grad_norm": 0.12473262846469879, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 151410 + }, + { + "epoch": 0.5763418923136652, + "grad_norm": 0.11976677924394608, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 151420 + }, + { + "epoch": 0.5763799547817878, + "grad_norm": 0.12231112271547318, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 151430 + }, + { + "epoch": 0.5764180172499106, + "grad_norm": 0.12565696239471436, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 151440 + }, + { + "epoch": 0.5764560797180333, + "grad_norm": 0.11815255135297775, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 151450 + }, + { + "epoch": 0.5764941421861559, + "grad_norm": 0.14175409078598022, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 151460 + }, + { + "epoch": 0.5765322046542786, + "grad_norm": 0.11824507266283035, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 151470 + }, + { + "epoch": 0.5765702671224013, + "grad_norm": 0.11940937489271164, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 151480 + }, + { + "epoch": 0.576608329590524, + "grad_norm": 0.1218341588973999, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 151490 + }, + { + "epoch": 0.5766463920586467, + "grad_norm": 0.12804977595806122, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 151500 + }, + { + "epoch": 0.5766844545267693, + "grad_norm": 0.1206967905163765, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 151510 + }, + { + "epoch": 0.576722516994892, + "grad_norm": 0.13532279431819916, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 151520 + }, + { + "epoch": 0.5767605794630147, + "grad_norm": 0.12815798819065094, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 151530 + }, + { + "epoch": 0.5767986419311374, + "grad_norm": 0.12100932747125626, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 151540 + }, + { + "epoch": 0.57683670439926, + "grad_norm": 0.14143440127372742, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 151550 + }, + { + "epoch": 0.5768747668673827, + "grad_norm": 0.139065220952034, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 151560 + }, + { + "epoch": 0.5769128293355055, + "grad_norm": 0.1327260434627533, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 151570 + }, + { + "epoch": 0.5769508918036281, + "grad_norm": 0.1227652058005333, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 151580 + }, + { + "epoch": 0.5769889542717508, + "grad_norm": 0.1407083421945572, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 151590 + }, + { + "epoch": 0.5770270167398734, + "grad_norm": 0.11829644441604614, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 151600 + }, + { + "epoch": 0.5770650792079962, + "grad_norm": 0.12382587790489197, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 151610 + }, + { + "epoch": 0.5771031416761189, + "grad_norm": 0.13066115975379944, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 151620 + }, + { + "epoch": 0.5771412041442415, + "grad_norm": 0.12449169158935547, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 151630 + }, + { + "epoch": 0.5771792666123642, + "grad_norm": 0.12625348567962646, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 151640 + }, + { + "epoch": 0.5772173290804868, + "grad_norm": 0.11762882024049759, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 151650 + }, + { + "epoch": 0.5772553915486096, + "grad_norm": 0.13215084373950958, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 151660 + }, + { + "epoch": 0.5772934540167323, + "grad_norm": 0.11425194144248962, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 151670 + }, + { + "epoch": 0.5773315164848549, + "grad_norm": 0.1309116780757904, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 151680 + }, + { + "epoch": 0.5773695789529776, + "grad_norm": 0.12787970900535583, + "learning_rate": 0.0005, + "loss": 2.1331, + "step": 151690 + }, + { + "epoch": 0.5774076414211003, + "grad_norm": 0.1286764293909073, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 151700 + }, + { + "epoch": 0.577445703889223, + "grad_norm": 0.13170640170574188, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 151710 + }, + { + "epoch": 0.5774837663573457, + "grad_norm": 0.13535332679748535, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 151720 + }, + { + "epoch": 0.5775218288254683, + "grad_norm": 0.12930703163146973, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 151730 + }, + { + "epoch": 0.5775598912935911, + "grad_norm": 0.13405849039554596, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 151740 + }, + { + "epoch": 0.5775979537617137, + "grad_norm": 0.11642023921012878, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 151750 + }, + { + "epoch": 0.5776360162298364, + "grad_norm": 0.13674704730510712, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 151760 + }, + { + "epoch": 0.5776740786979591, + "grad_norm": 0.12092318385839462, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 151770 + }, + { + "epoch": 0.5777121411660818, + "grad_norm": 0.12388814240694046, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 151780 + }, + { + "epoch": 0.5777502036342045, + "grad_norm": 0.12979772686958313, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 151790 + }, + { + "epoch": 0.5777882661023271, + "grad_norm": 0.1481570303440094, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 151800 + }, + { + "epoch": 0.5778263285704498, + "grad_norm": 0.12397785484790802, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 151810 + }, + { + "epoch": 0.5778643910385725, + "grad_norm": 0.11837328225374222, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 151820 + }, + { + "epoch": 0.5779024535066952, + "grad_norm": 0.13646014034748077, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 151830 + }, + { + "epoch": 0.5779405159748179, + "grad_norm": 0.13520261645317078, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 151840 + }, + { + "epoch": 0.5779785784429405, + "grad_norm": 0.1331622302532196, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 151850 + }, + { + "epoch": 0.5780166409110632, + "grad_norm": 0.13102315366268158, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 151860 + }, + { + "epoch": 0.578054703379186, + "grad_norm": 0.12880952656269073, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 151870 + }, + { + "epoch": 0.5780927658473086, + "grad_norm": 0.12405695021152496, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 151880 + }, + { + "epoch": 0.5781308283154313, + "grad_norm": 0.12300620228052139, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 151890 + }, + { + "epoch": 0.5781688907835539, + "grad_norm": 0.13016889989376068, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 151900 + }, + { + "epoch": 0.5782069532516767, + "grad_norm": 0.11752153187990189, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 151910 + }, + { + "epoch": 0.5782450157197994, + "grad_norm": 0.12173010408878326, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 151920 + }, + { + "epoch": 0.578283078187922, + "grad_norm": 0.12354975193738937, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 151930 + }, + { + "epoch": 0.5783211406560447, + "grad_norm": 0.11892983317375183, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 151940 + }, + { + "epoch": 0.5783592031241673, + "grad_norm": 0.11993703246116638, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 151950 + }, + { + "epoch": 0.5783972655922901, + "grad_norm": 0.14302973449230194, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 151960 + }, + { + "epoch": 0.5784353280604128, + "grad_norm": 0.12831313908100128, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 151970 + }, + { + "epoch": 0.5784733905285354, + "grad_norm": 0.13994410634040833, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 151980 + }, + { + "epoch": 0.5785114529966581, + "grad_norm": 0.13094666600227356, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 151990 + }, + { + "epoch": 0.5785495154647808, + "grad_norm": 0.13194623589515686, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 152000 + }, + { + "epoch": 0.5785875779329035, + "grad_norm": 0.11673175543546677, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 152010 + }, + { + "epoch": 0.5786256404010262, + "grad_norm": 0.11440134048461914, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 152020 + }, + { + "epoch": 0.5786637028691488, + "grad_norm": 0.1264093518257141, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 152030 + }, + { + "epoch": 0.5787017653372716, + "grad_norm": 0.11688932776451111, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 152040 + }, + { + "epoch": 0.5787398278053942, + "grad_norm": 0.13633820414543152, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 152050 + }, + { + "epoch": 0.5787778902735169, + "grad_norm": 0.13115862011909485, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 152060 + }, + { + "epoch": 0.5788159527416396, + "grad_norm": 0.14492659270763397, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 152070 + }, + { + "epoch": 0.5788540152097622, + "grad_norm": 0.12465827912092209, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 152080 + }, + { + "epoch": 0.578892077677885, + "grad_norm": 0.11936473846435547, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 152090 + }, + { + "epoch": 0.5789301401460076, + "grad_norm": 0.12346568703651428, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 152100 + }, + { + "epoch": 0.5789682026141303, + "grad_norm": 0.12095118314027786, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 152110 + }, + { + "epoch": 0.579006265082253, + "grad_norm": 0.12287425994873047, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 152120 + }, + { + "epoch": 0.5790443275503757, + "grad_norm": 0.154531329870224, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 152130 + }, + { + "epoch": 0.5790823900184984, + "grad_norm": 0.1167474314570427, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 152140 + }, + { + "epoch": 0.579120452486621, + "grad_norm": 0.1318303495645523, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 152150 + }, + { + "epoch": 0.5791585149547437, + "grad_norm": 0.13696379959583282, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 152160 + }, + { + "epoch": 0.5791965774228665, + "grad_norm": 0.12334418296813965, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 152170 + }, + { + "epoch": 0.5792346398909891, + "grad_norm": 0.11851691454648972, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 152180 + }, + { + "epoch": 0.5792727023591118, + "grad_norm": 0.11992210894823074, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 152190 + }, + { + "epoch": 0.5793107648272344, + "grad_norm": 0.11987617611885071, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 152200 + }, + { + "epoch": 0.5793488272953572, + "grad_norm": 0.12984754145145416, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 152210 + }, + { + "epoch": 0.5793868897634799, + "grad_norm": 0.12692391872406006, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 152220 + }, + { + "epoch": 0.5794249522316025, + "grad_norm": 0.12092795968055725, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 152230 + }, + { + "epoch": 0.5794630146997252, + "grad_norm": 0.13350768387317657, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 152240 + }, + { + "epoch": 0.5795010771678478, + "grad_norm": 0.12127964943647385, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 152250 + }, + { + "epoch": 0.5795391396359706, + "grad_norm": 0.13222403824329376, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 152260 + }, + { + "epoch": 0.5795772021040932, + "grad_norm": 0.11993005126714706, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 152270 + }, + { + "epoch": 0.5796152645722159, + "grad_norm": 0.12395907193422318, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 152280 + }, + { + "epoch": 0.5796533270403386, + "grad_norm": 0.1194024533033371, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 152290 + }, + { + "epoch": 0.5796913895084613, + "grad_norm": 0.12345626950263977, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 152300 + }, + { + "epoch": 0.579729451976584, + "grad_norm": 0.123421810567379, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 152310 + }, + { + "epoch": 0.5797675144447066, + "grad_norm": 0.11834047734737396, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 152320 + }, + { + "epoch": 0.5798055769128293, + "grad_norm": 0.12674610316753387, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 152330 + }, + { + "epoch": 0.5798436393809521, + "grad_norm": 0.1250268816947937, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 152340 + }, + { + "epoch": 0.5798817018490747, + "grad_norm": 0.122011199593544, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 152350 + }, + { + "epoch": 0.5799197643171974, + "grad_norm": 0.11669352650642395, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 152360 + }, + { + "epoch": 0.57995782678532, + "grad_norm": 0.13394352793693542, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 152370 + }, + { + "epoch": 0.5799958892534427, + "grad_norm": 0.13123464584350586, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 152380 + }, + { + "epoch": 0.5800339517215655, + "grad_norm": 0.12727457284927368, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 152390 + }, + { + "epoch": 0.5800720141896881, + "grad_norm": 0.12583385407924652, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 152400 + }, + { + "epoch": 0.5801100766578108, + "grad_norm": 0.12469319999217987, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 152410 + }, + { + "epoch": 0.5801481391259334, + "grad_norm": 0.12815004587173462, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 152420 + }, + { + "epoch": 0.5801862015940562, + "grad_norm": 0.1281806230545044, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 152430 + }, + { + "epoch": 0.5802242640621789, + "grad_norm": 0.11810749024152756, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 152440 + }, + { + "epoch": 0.5802623265303015, + "grad_norm": 0.12065868079662323, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 152450 + }, + { + "epoch": 0.5803003889984242, + "grad_norm": 0.1311550885438919, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 152460 + }, + { + "epoch": 0.580338451466547, + "grad_norm": 0.12211163341999054, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 152470 + }, + { + "epoch": 0.5803765139346696, + "grad_norm": 0.13015727698802948, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 152480 + }, + { + "epoch": 0.5804145764027923, + "grad_norm": 0.12457870692014694, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 152490 + }, + { + "epoch": 0.5804526388709149, + "grad_norm": 0.12478785961866379, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 152500 + }, + { + "epoch": 0.5804907013390376, + "grad_norm": 0.12055505812168121, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 152510 + }, + { + "epoch": 0.5805287638071603, + "grad_norm": 0.13347235321998596, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 152520 + }, + { + "epoch": 0.580566826275283, + "grad_norm": 0.11684930324554443, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 152530 + }, + { + "epoch": 0.5806048887434057, + "grad_norm": 0.1279844045639038, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 152540 + }, + { + "epoch": 0.5806429512115283, + "grad_norm": 0.12912394106388092, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 152550 + }, + { + "epoch": 0.5806810136796511, + "grad_norm": 0.12052441388368607, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 152560 + }, + { + "epoch": 0.5807190761477737, + "grad_norm": 0.12511587142944336, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 152570 + }, + { + "epoch": 0.5807571386158964, + "grad_norm": 0.12099773436784744, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 152580 + }, + { + "epoch": 0.580795201084019, + "grad_norm": 0.1495269387960434, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 152590 + }, + { + "epoch": 0.5808332635521418, + "grad_norm": 0.1222032830119133, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 152600 + }, + { + "epoch": 0.5808713260202645, + "grad_norm": 0.11979856342077255, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 152610 + }, + { + "epoch": 0.5809093884883871, + "grad_norm": 0.13302768766880035, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 152620 + }, + { + "epoch": 0.5809474509565098, + "grad_norm": 0.14392118155956268, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 152630 + }, + { + "epoch": 0.5809855134246326, + "grad_norm": 0.11564590781927109, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 152640 + }, + { + "epoch": 0.5810235758927552, + "grad_norm": 0.13273461163043976, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 152650 + }, + { + "epoch": 0.5810616383608779, + "grad_norm": 0.1334458738565445, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 152660 + }, + { + "epoch": 0.5810997008290005, + "grad_norm": 0.13047702610492706, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 152670 + }, + { + "epoch": 0.5811377632971232, + "grad_norm": 0.12065224349498749, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 152680 + }, + { + "epoch": 0.581175825765246, + "grad_norm": 0.12787656486034393, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 152690 + }, + { + "epoch": 0.5812138882333686, + "grad_norm": 0.1257835030555725, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 152700 + }, + { + "epoch": 0.5812519507014913, + "grad_norm": 0.12944728136062622, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 152710 + }, + { + "epoch": 0.5812900131696139, + "grad_norm": 0.12401887774467468, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 152720 + }, + { + "epoch": 0.5813280756377367, + "grad_norm": 0.12678106129169464, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 152730 + }, + { + "epoch": 0.5813661381058594, + "grad_norm": 0.12950730323791504, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 152740 + }, + { + "epoch": 0.581404200573982, + "grad_norm": 0.1311284750699997, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 152750 + }, + { + "epoch": 0.5814422630421047, + "grad_norm": 0.11955790221691132, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 152760 + }, + { + "epoch": 0.5814803255102274, + "grad_norm": 0.13516071438789368, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 152770 + }, + { + "epoch": 0.5815183879783501, + "grad_norm": 0.12173061817884445, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 152780 + }, + { + "epoch": 0.5815564504464728, + "grad_norm": 0.11228878051042557, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 152790 + }, + { + "epoch": 0.5815945129145954, + "grad_norm": 0.11856763064861298, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 152800 + }, + { + "epoch": 0.5816325753827181, + "grad_norm": 0.1394842565059662, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 152810 + }, + { + "epoch": 0.5816706378508408, + "grad_norm": 0.11592503637075424, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 152820 + }, + { + "epoch": 0.5817087003189635, + "grad_norm": 0.1304943561553955, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 152830 + }, + { + "epoch": 0.5817467627870861, + "grad_norm": 0.12011189758777618, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 152840 + }, + { + "epoch": 0.5817848252552088, + "grad_norm": 0.12784723937511444, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 152850 + }, + { + "epoch": 0.5818228877233316, + "grad_norm": 0.13046471774578094, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 152860 + }, + { + "epoch": 0.5818609501914542, + "grad_norm": 0.13343758881092072, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 152870 + }, + { + "epoch": 0.5818990126595769, + "grad_norm": 0.11725534498691559, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 152880 + }, + { + "epoch": 0.5819370751276995, + "grad_norm": 0.12435596436262131, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 152890 + }, + { + "epoch": 0.5819751375958223, + "grad_norm": 0.12380015105009079, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 152900 + }, + { + "epoch": 0.582013200063945, + "grad_norm": 0.124103844165802, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 152910 + }, + { + "epoch": 0.5820512625320676, + "grad_norm": 0.137775719165802, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 152920 + }, + { + "epoch": 0.5820893250001903, + "grad_norm": 0.12004926800727844, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 152930 + }, + { + "epoch": 0.5821273874683129, + "grad_norm": 0.12947256863117218, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 152940 + }, + { + "epoch": 0.5821654499364357, + "grad_norm": 0.12591451406478882, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 152950 + }, + { + "epoch": 0.5822035124045584, + "grad_norm": 0.11985679715871811, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 152960 + }, + { + "epoch": 0.582241574872681, + "grad_norm": 0.1333392709493637, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 152970 + }, + { + "epoch": 0.5822796373408037, + "grad_norm": 0.125688835978508, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 152980 + }, + { + "epoch": 0.5823176998089264, + "grad_norm": 0.12332892417907715, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 152990 + }, + { + "epoch": 0.5823557622770491, + "grad_norm": 0.12826409935951233, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 153000 + }, + { + "epoch": 0.5823938247451718, + "grad_norm": 0.11257761716842651, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 153010 + }, + { + "epoch": 0.5824318872132944, + "grad_norm": 0.13841615617275238, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 153020 + }, + { + "epoch": 0.5824699496814172, + "grad_norm": 0.12583406269550323, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 153030 + }, + { + "epoch": 0.5825080121495398, + "grad_norm": 0.13003213703632355, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 153040 + }, + { + "epoch": 0.5825460746176625, + "grad_norm": 0.11432237178087234, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 153050 + }, + { + "epoch": 0.5825841370857852, + "grad_norm": 0.1389862596988678, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 153060 + }, + { + "epoch": 0.5826221995539079, + "grad_norm": 0.12699520587921143, + "learning_rate": 0.0005, + "loss": 2.0884, + "step": 153070 + }, + { + "epoch": 0.5826602620220306, + "grad_norm": 0.1173095852136612, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 153080 + }, + { + "epoch": 0.5826983244901532, + "grad_norm": 0.1409156620502472, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 153090 + }, + { + "epoch": 0.5827363869582759, + "grad_norm": 0.12233584374189377, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 153100 + }, + { + "epoch": 0.5827744494263986, + "grad_norm": 0.13279376924037933, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 153110 + }, + { + "epoch": 0.5828125118945213, + "grad_norm": 0.11767486482858658, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 153120 + }, + { + "epoch": 0.582850574362644, + "grad_norm": 0.12577876448631287, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 153130 + }, + { + "epoch": 0.5828886368307666, + "grad_norm": 0.12067107856273651, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 153140 + }, + { + "epoch": 0.5829266992988893, + "grad_norm": 0.12723997235298157, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 153150 + }, + { + "epoch": 0.5829647617670121, + "grad_norm": 0.1412838250398636, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 153160 + }, + { + "epoch": 0.5830028242351347, + "grad_norm": 0.1244267076253891, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 153170 + }, + { + "epoch": 0.5830408867032574, + "grad_norm": 0.1265404224395752, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 153180 + }, + { + "epoch": 0.58307894917138, + "grad_norm": 0.12896214425563812, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 153190 + }, + { + "epoch": 0.5831170116395028, + "grad_norm": 0.11987591534852982, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 153200 + }, + { + "epoch": 0.5831550741076255, + "grad_norm": 0.1190297082066536, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 153210 + }, + { + "epoch": 0.5831931365757481, + "grad_norm": 0.12007693946361542, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 153220 + }, + { + "epoch": 0.5832311990438708, + "grad_norm": 0.1179143488407135, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 153230 + }, + { + "epoch": 0.5832692615119934, + "grad_norm": 0.11957690119743347, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 153240 + }, + { + "epoch": 0.5833073239801162, + "grad_norm": 0.13799679279327393, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 153250 + }, + { + "epoch": 0.5833453864482389, + "grad_norm": 0.1221759244799614, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 153260 + }, + { + "epoch": 0.5833834489163615, + "grad_norm": 0.12267787009477615, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 153270 + }, + { + "epoch": 0.5834215113844842, + "grad_norm": 0.1274324506521225, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 153280 + }, + { + "epoch": 0.5834595738526069, + "grad_norm": 0.1125502660870552, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 153290 + }, + { + "epoch": 0.5834976363207296, + "grad_norm": 0.13437913358211517, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 153300 + }, + { + "epoch": 0.5835356987888523, + "grad_norm": 0.11928600072860718, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 153310 + }, + { + "epoch": 0.5835737612569749, + "grad_norm": 0.1146225705742836, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 153320 + }, + { + "epoch": 0.5836118237250977, + "grad_norm": 0.11559824645519257, + "learning_rate": 0.0005, + "loss": 2.0829, + "step": 153330 + }, + { + "epoch": 0.5836498861932203, + "grad_norm": 0.12081331014633179, + "learning_rate": 0.0005, + "loss": 2.1353, + "step": 153340 + }, + { + "epoch": 0.583687948661343, + "grad_norm": 0.1321122795343399, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 153350 + }, + { + "epoch": 0.5837260111294656, + "grad_norm": 0.12141875922679901, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 153360 + }, + { + "epoch": 0.5837640735975883, + "grad_norm": 0.13479618728160858, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 153370 + }, + { + "epoch": 0.5838021360657111, + "grad_norm": 0.11798585206270218, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 153380 + }, + { + "epoch": 0.5838401985338337, + "grad_norm": 0.14240063726902008, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 153390 + }, + { + "epoch": 0.5838782610019564, + "grad_norm": 0.12214536219835281, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 153400 + }, + { + "epoch": 0.583916323470079, + "grad_norm": 0.12596198916435242, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 153410 + }, + { + "epoch": 0.5839543859382018, + "grad_norm": 0.13761389255523682, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 153420 + }, + { + "epoch": 0.5839924484063245, + "grad_norm": 0.1374100297689438, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 153430 + }, + { + "epoch": 0.5840305108744471, + "grad_norm": 0.11680902540683746, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 153440 + }, + { + "epoch": 0.5840685733425698, + "grad_norm": 0.12362001091241837, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 153450 + }, + { + "epoch": 0.5841066358106926, + "grad_norm": 0.1341400444507599, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 153460 + }, + { + "epoch": 0.5841446982788152, + "grad_norm": 0.12792478501796722, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 153470 + }, + { + "epoch": 0.5841827607469379, + "grad_norm": 0.1239253506064415, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 153480 + }, + { + "epoch": 0.5842208232150605, + "grad_norm": 0.12595131993293762, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 153490 + }, + { + "epoch": 0.5842588856831833, + "grad_norm": 0.11936382949352264, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 153500 + }, + { + "epoch": 0.584296948151306, + "grad_norm": 0.11475072056055069, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 153510 + }, + { + "epoch": 0.5843350106194286, + "grad_norm": 0.12244194000959396, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 153520 + }, + { + "epoch": 0.5843730730875513, + "grad_norm": 0.13092495501041412, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 153530 + }, + { + "epoch": 0.5844111355556739, + "grad_norm": 0.126319020986557, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 153540 + }, + { + "epoch": 0.5844491980237967, + "grad_norm": 0.12426517903804779, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 153550 + }, + { + "epoch": 0.5844872604919193, + "grad_norm": 0.12947340309619904, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 153560 + }, + { + "epoch": 0.584525322960042, + "grad_norm": 0.11483649164438248, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 153570 + }, + { + "epoch": 0.5845633854281647, + "grad_norm": 0.12365952879190445, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 153580 + }, + { + "epoch": 0.5846014478962874, + "grad_norm": 0.1213454157114029, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 153590 + }, + { + "epoch": 0.5846395103644101, + "grad_norm": 0.14581279456615448, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 153600 + }, + { + "epoch": 0.5846775728325327, + "grad_norm": 0.13556072115898132, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 153610 + }, + { + "epoch": 0.5847156353006554, + "grad_norm": 0.11281615495681763, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 153620 + }, + { + "epoch": 0.5847536977687782, + "grad_norm": 0.13184192776679993, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 153630 + }, + { + "epoch": 0.5847917602369008, + "grad_norm": 0.1341467797756195, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 153640 + }, + { + "epoch": 0.5848298227050235, + "grad_norm": 0.12088645249605179, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 153650 + }, + { + "epoch": 0.5848678851731461, + "grad_norm": 0.13409413397312164, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 153660 + }, + { + "epoch": 0.5849059476412688, + "grad_norm": 0.1399022787809372, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 153670 + }, + { + "epoch": 0.5849440101093916, + "grad_norm": 0.12620849907398224, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 153680 + }, + { + "epoch": 0.5849820725775142, + "grad_norm": 0.12294651567935944, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 153690 + }, + { + "epoch": 0.5850201350456369, + "grad_norm": 0.14084553718566895, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 153700 + }, + { + "epoch": 0.5850581975137595, + "grad_norm": 0.11670193821191788, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 153710 + }, + { + "epoch": 0.5850962599818823, + "grad_norm": 0.14080242812633514, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 153720 + }, + { + "epoch": 0.585134322450005, + "grad_norm": 0.1202685683965683, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 153730 + }, + { + "epoch": 0.5851723849181276, + "grad_norm": 0.13900840282440186, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 153740 + }, + { + "epoch": 0.5852104473862503, + "grad_norm": 0.12486742436885834, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 153750 + }, + { + "epoch": 0.585248509854373, + "grad_norm": 0.11640941351652145, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 153760 + }, + { + "epoch": 0.5852865723224957, + "grad_norm": 0.12414953112602234, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 153770 + }, + { + "epoch": 0.5853246347906184, + "grad_norm": 0.12600675225257874, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 153780 + }, + { + "epoch": 0.585362697258741, + "grad_norm": 0.5305410623550415, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 153790 + }, + { + "epoch": 0.5854007597268638, + "grad_norm": 0.1254570037126541, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 153800 + }, + { + "epoch": 0.5854388221949864, + "grad_norm": 0.12492625415325165, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 153810 + }, + { + "epoch": 0.5854768846631091, + "grad_norm": 0.13412101566791534, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 153820 + }, + { + "epoch": 0.5855149471312318, + "grad_norm": 0.1295090913772583, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 153830 + }, + { + "epoch": 0.5855530095993544, + "grad_norm": 0.12706872820854187, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 153840 + }, + { + "epoch": 0.5855910720674772, + "grad_norm": 0.1238255500793457, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 153850 + }, + { + "epoch": 0.5856291345355998, + "grad_norm": 0.11371153593063354, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 153860 + }, + { + "epoch": 0.5856671970037225, + "grad_norm": 0.11320296674966812, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 153870 + }, + { + "epoch": 0.5857052594718452, + "grad_norm": 0.12613484263420105, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 153880 + }, + { + "epoch": 0.5857433219399679, + "grad_norm": 0.12188831716775894, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 153890 + }, + { + "epoch": 0.5857813844080906, + "grad_norm": 0.1488693207502365, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 153900 + }, + { + "epoch": 0.5858194468762132, + "grad_norm": 0.12705174088478088, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 153910 + }, + { + "epoch": 0.5858575093443359, + "grad_norm": 0.11664033681154251, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 153920 + }, + { + "epoch": 0.5858955718124587, + "grad_norm": 0.12219222635030746, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 153930 + }, + { + "epoch": 0.5859336342805813, + "grad_norm": 0.12206865847110748, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 153940 + }, + { + "epoch": 0.585971696748704, + "grad_norm": 0.12492918223142624, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 153950 + }, + { + "epoch": 0.5860097592168266, + "grad_norm": 0.12702591717243195, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 153960 + }, + { + "epoch": 0.5860478216849493, + "grad_norm": 0.11352322995662689, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 153970 + }, + { + "epoch": 0.5860858841530721, + "grad_norm": 0.14898304641246796, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 153980 + }, + { + "epoch": 0.5861239466211947, + "grad_norm": 0.12120857834815979, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 153990 + }, + { + "epoch": 0.5861620090893174, + "grad_norm": 0.12267789989709854, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 154000 + }, + { + "epoch": 0.58620007155744, + "grad_norm": 0.1255556046962738, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 154010 + }, + { + "epoch": 0.5862381340255628, + "grad_norm": 0.11012211441993713, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 154020 + }, + { + "epoch": 0.5862761964936855, + "grad_norm": 0.11161710321903229, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 154030 + }, + { + "epoch": 0.5863142589618081, + "grad_norm": 0.11873649060726166, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 154040 + }, + { + "epoch": 0.5863523214299308, + "grad_norm": 0.12429730594158173, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 154050 + }, + { + "epoch": 0.5863903838980535, + "grad_norm": 0.11891800910234451, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 154060 + }, + { + "epoch": 0.5864284463661762, + "grad_norm": 0.13123571872711182, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 154070 + }, + { + "epoch": 0.5864665088342988, + "grad_norm": 0.12377672642469406, + "learning_rate": 0.0005, + "loss": 2.0882, + "step": 154080 + }, + { + "epoch": 0.5865045713024215, + "grad_norm": 0.1367446929216385, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 154090 + }, + { + "epoch": 0.5865426337705442, + "grad_norm": 0.13042481243610382, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 154100 + }, + { + "epoch": 0.5865806962386669, + "grad_norm": 0.1151302307844162, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 154110 + }, + { + "epoch": 0.5866187587067896, + "grad_norm": 0.12772217392921448, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 154120 + }, + { + "epoch": 0.5866568211749122, + "grad_norm": 0.12427042424678802, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 154130 + }, + { + "epoch": 0.5866948836430349, + "grad_norm": 0.12787887454032898, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 154140 + }, + { + "epoch": 0.5867329461111577, + "grad_norm": 0.12783733010292053, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 154150 + }, + { + "epoch": 0.5867710085792803, + "grad_norm": 0.12757271528244019, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 154160 + }, + { + "epoch": 0.586809071047403, + "grad_norm": 0.12278042733669281, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 154170 + }, + { + "epoch": 0.5868471335155256, + "grad_norm": 0.12007766216993332, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 154180 + }, + { + "epoch": 0.5868851959836484, + "grad_norm": 0.11267594248056412, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 154190 + }, + { + "epoch": 0.5869232584517711, + "grad_norm": 0.13086989521980286, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 154200 + }, + { + "epoch": 0.5869613209198937, + "grad_norm": 0.13795329630374908, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 154210 + }, + { + "epoch": 0.5869993833880164, + "grad_norm": 0.1180015504360199, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 154220 + }, + { + "epoch": 0.5870374458561392, + "grad_norm": 0.13229142129421234, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 154230 + }, + { + "epoch": 0.5870755083242618, + "grad_norm": 0.11965411901473999, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 154240 + }, + { + "epoch": 0.5871135707923845, + "grad_norm": 0.11663006246089935, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 154250 + }, + { + "epoch": 0.5871516332605071, + "grad_norm": 0.12126167863607407, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 154260 + }, + { + "epoch": 0.5871896957286298, + "grad_norm": 0.12216583639383316, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 154270 + }, + { + "epoch": 0.5872277581967525, + "grad_norm": 0.11939745396375656, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 154280 + }, + { + "epoch": 0.5872658206648752, + "grad_norm": 0.11422568559646606, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 154290 + }, + { + "epoch": 0.5873038831329979, + "grad_norm": 0.12077447026968002, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 154300 + }, + { + "epoch": 0.5873419456011205, + "grad_norm": 0.11959082633256912, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 154310 + }, + { + "epoch": 0.5873800080692433, + "grad_norm": 0.12821198999881744, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 154320 + }, + { + "epoch": 0.5874180705373659, + "grad_norm": 0.12846586108207703, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 154330 + }, + { + "epoch": 0.5874561330054886, + "grad_norm": 0.12448491156101227, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 154340 + }, + { + "epoch": 0.5874941954736113, + "grad_norm": 0.12642216682434082, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 154350 + }, + { + "epoch": 0.587532257941734, + "grad_norm": 0.11989720165729523, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 154360 + }, + { + "epoch": 0.5875703204098567, + "grad_norm": 0.14378298819065094, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 154370 + }, + { + "epoch": 0.5876083828779793, + "grad_norm": 0.20334845781326294, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 154380 + }, + { + "epoch": 0.587646445346102, + "grad_norm": 0.12918692827224731, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 154390 + }, + { + "epoch": 0.5876845078142247, + "grad_norm": 0.12609420716762543, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 154400 + }, + { + "epoch": 0.5877225702823474, + "grad_norm": 0.13024812936782837, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 154410 + }, + { + "epoch": 0.5877606327504701, + "grad_norm": 0.13436540961265564, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 154420 + }, + { + "epoch": 0.5877986952185927, + "grad_norm": 0.1369498074054718, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 154430 + }, + { + "epoch": 0.5878367576867154, + "grad_norm": 0.12045515328645706, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 154440 + }, + { + "epoch": 0.5878748201548382, + "grad_norm": 0.12609994411468506, + "learning_rate": 0.0005, + "loss": 2.1333, + "step": 154450 + }, + { + "epoch": 0.5879128826229608, + "grad_norm": 0.1296570599079132, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 154460 + }, + { + "epoch": 0.5879509450910835, + "grad_norm": 0.13209514319896698, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 154470 + }, + { + "epoch": 0.5879890075592061, + "grad_norm": 0.12957362830638885, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 154480 + }, + { + "epoch": 0.5880270700273289, + "grad_norm": 0.888014554977417, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 154490 + }, + { + "epoch": 0.5880651324954516, + "grad_norm": 0.1320250779390335, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 154500 + }, + { + "epoch": 0.5881031949635742, + "grad_norm": 0.12680445611476898, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 154510 + }, + { + "epoch": 0.5881412574316969, + "grad_norm": 0.12884357571601868, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 154520 + }, + { + "epoch": 0.5881793198998195, + "grad_norm": 0.11659844219684601, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 154530 + }, + { + "epoch": 0.5882173823679423, + "grad_norm": 0.1260477602481842, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 154540 + }, + { + "epoch": 0.588255444836065, + "grad_norm": 0.1149340271949768, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 154550 + }, + { + "epoch": 0.5882935073041876, + "grad_norm": 0.12525539100170135, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 154560 + }, + { + "epoch": 0.5883315697723103, + "grad_norm": 0.12114496529102325, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 154570 + }, + { + "epoch": 0.588369632240433, + "grad_norm": 0.13838450610637665, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 154580 + }, + { + "epoch": 0.5884076947085557, + "grad_norm": 0.12621724605560303, + "learning_rate": 0.0005, + "loss": 2.1307, + "step": 154590 + }, + { + "epoch": 0.5884457571766784, + "grad_norm": 0.12438689172267914, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 154600 + }, + { + "epoch": 0.588483819644801, + "grad_norm": 0.15652117133140564, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 154610 + }, + { + "epoch": 0.5885218821129238, + "grad_norm": 0.12901882827281952, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 154620 + }, + { + "epoch": 0.5885599445810464, + "grad_norm": 0.12268058955669403, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 154630 + }, + { + "epoch": 0.5885980070491691, + "grad_norm": 0.13506019115447998, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 154640 + }, + { + "epoch": 0.5886360695172917, + "grad_norm": 0.12235263735055923, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 154650 + }, + { + "epoch": 0.5886741319854145, + "grad_norm": 0.11945250630378723, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 154660 + }, + { + "epoch": 0.5887121944535372, + "grad_norm": 0.12435851246118546, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 154670 + }, + { + "epoch": 0.5887502569216598, + "grad_norm": 0.12978994846343994, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 154680 + }, + { + "epoch": 0.5887883193897825, + "grad_norm": 0.12437500059604645, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 154690 + }, + { + "epoch": 0.5888263818579051, + "grad_norm": 0.1315934807062149, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 154700 + }, + { + "epoch": 0.5888644443260279, + "grad_norm": 0.1189320832490921, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 154710 + }, + { + "epoch": 0.5889025067941506, + "grad_norm": 0.13439911603927612, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 154720 + }, + { + "epoch": 0.5889405692622732, + "grad_norm": 0.12734562158584595, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 154730 + }, + { + "epoch": 0.5889786317303959, + "grad_norm": 0.11846303939819336, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 154740 + }, + { + "epoch": 0.5890166941985187, + "grad_norm": 0.11798400431871414, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 154750 + }, + { + "epoch": 0.5890547566666413, + "grad_norm": 0.13610303401947021, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 154760 + }, + { + "epoch": 0.589092819134764, + "grad_norm": 0.12588945031166077, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 154770 + }, + { + "epoch": 0.5891308816028866, + "grad_norm": 0.12449779361486435, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 154780 + }, + { + "epoch": 0.5891689440710094, + "grad_norm": 0.17011813819408417, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 154790 + }, + { + "epoch": 0.589207006539132, + "grad_norm": 0.12432485073804855, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 154800 + }, + { + "epoch": 0.5892450690072547, + "grad_norm": 0.10910047590732574, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 154810 + }, + { + "epoch": 0.5892831314753774, + "grad_norm": 0.11259827762842178, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 154820 + }, + { + "epoch": 0.5893211939435, + "grad_norm": 0.11866344511508942, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 154830 + }, + { + "epoch": 0.5893592564116228, + "grad_norm": 0.12497690320014954, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 154840 + }, + { + "epoch": 0.5893973188797454, + "grad_norm": 0.12016210705041885, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 154850 + }, + { + "epoch": 0.5894353813478681, + "grad_norm": 0.12100519984960556, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 154860 + }, + { + "epoch": 0.5894734438159908, + "grad_norm": 0.12728513777256012, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 154870 + }, + { + "epoch": 0.5895115062841135, + "grad_norm": 0.12269476801156998, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 154880 + }, + { + "epoch": 0.5895495687522362, + "grad_norm": 0.1447187215089798, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 154890 + }, + { + "epoch": 0.5895876312203588, + "grad_norm": 0.1288946568965912, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 154900 + }, + { + "epoch": 0.5896256936884815, + "grad_norm": 0.1268695890903473, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 154910 + }, + { + "epoch": 0.5896637561566043, + "grad_norm": 0.13210426270961761, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 154920 + }, + { + "epoch": 0.5897018186247269, + "grad_norm": 0.1955825239419937, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 154930 + }, + { + "epoch": 0.5897398810928496, + "grad_norm": 0.13660775125026703, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 154940 + }, + { + "epoch": 0.5897779435609722, + "grad_norm": 0.12983176112174988, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 154950 + }, + { + "epoch": 0.5898160060290949, + "grad_norm": 0.12133912742137909, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 154960 + }, + { + "epoch": 0.5898540684972177, + "grad_norm": 0.12246081233024597, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 154970 + }, + { + "epoch": 0.5898921309653403, + "grad_norm": 0.12103147059679031, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 154980 + }, + { + "epoch": 0.589930193433463, + "grad_norm": 0.12457051128149033, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 154990 + }, + { + "epoch": 0.5899682559015856, + "grad_norm": 0.1302313357591629, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 155000 + }, + { + "epoch": 0.5900063183697084, + "grad_norm": 0.1370365023612976, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 155010 + }, + { + "epoch": 0.5900443808378311, + "grad_norm": 0.11973226815462112, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 155020 + }, + { + "epoch": 0.5900824433059537, + "grad_norm": 0.1383923441171646, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 155030 + }, + { + "epoch": 0.5901205057740764, + "grad_norm": 0.11702127754688263, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 155040 + }, + { + "epoch": 0.5901585682421991, + "grad_norm": 0.12547564506530762, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 155050 + }, + { + "epoch": 0.5901966307103218, + "grad_norm": 0.11866992712020874, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 155060 + }, + { + "epoch": 0.5902346931784445, + "grad_norm": 0.1276715099811554, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 155070 + }, + { + "epoch": 0.5902727556465671, + "grad_norm": 0.12169170379638672, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 155080 + }, + { + "epoch": 0.5903108181146899, + "grad_norm": 0.13266567885875702, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 155090 + }, + { + "epoch": 0.5903488805828125, + "grad_norm": 0.12670375406742096, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 155100 + }, + { + "epoch": 0.5903869430509352, + "grad_norm": 0.1242232397198677, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 155110 + }, + { + "epoch": 0.5904250055190579, + "grad_norm": 0.13165102899074554, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 155120 + }, + { + "epoch": 0.5904630679871805, + "grad_norm": 0.11474651843309402, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 155130 + }, + { + "epoch": 0.5905011304553033, + "grad_norm": 0.1368701159954071, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 155140 + }, + { + "epoch": 0.5905391929234259, + "grad_norm": 0.1212804764509201, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 155150 + }, + { + "epoch": 0.5905772553915486, + "grad_norm": 0.11533062160015106, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 155160 + }, + { + "epoch": 0.5906153178596713, + "grad_norm": 0.12023679167032242, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 155170 + }, + { + "epoch": 0.590653380327794, + "grad_norm": 0.11425898969173431, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 155180 + }, + { + "epoch": 0.5906914427959167, + "grad_norm": 0.13516448438167572, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 155190 + }, + { + "epoch": 0.5907295052640393, + "grad_norm": 0.11822634190320969, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 155200 + }, + { + "epoch": 0.590767567732162, + "grad_norm": 0.12545858323574066, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 155210 + }, + { + "epoch": 0.5908056302002848, + "grad_norm": 0.1313958764076233, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 155220 + }, + { + "epoch": 0.5908436926684074, + "grad_norm": 0.13253335654735565, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 155230 + }, + { + "epoch": 0.5908817551365301, + "grad_norm": 0.11414457857608795, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 155240 + }, + { + "epoch": 0.5909198176046527, + "grad_norm": 0.12125179171562195, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 155250 + }, + { + "epoch": 0.5909578800727754, + "grad_norm": 0.1280829906463623, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 155260 + }, + { + "epoch": 0.5909959425408982, + "grad_norm": 0.12440607696771622, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 155270 + }, + { + "epoch": 0.5910340050090208, + "grad_norm": 0.13153356313705444, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 155280 + }, + { + "epoch": 0.5910720674771435, + "grad_norm": 0.1424436718225479, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 155290 + }, + { + "epoch": 0.5911101299452661, + "grad_norm": 0.12767179310321808, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 155300 + }, + { + "epoch": 0.5911481924133889, + "grad_norm": 0.1162237673997879, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 155310 + }, + { + "epoch": 0.5911862548815116, + "grad_norm": 0.13033299148082733, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 155320 + }, + { + "epoch": 0.5912243173496342, + "grad_norm": 0.11099372804164886, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 155330 + }, + { + "epoch": 0.5912623798177569, + "grad_norm": 0.12529684603214264, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 155340 + }, + { + "epoch": 0.5913004422858796, + "grad_norm": 0.12372294068336487, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 155350 + }, + { + "epoch": 0.5913385047540023, + "grad_norm": 0.13699477910995483, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 155360 + }, + { + "epoch": 0.591376567222125, + "grad_norm": 0.12353967875242233, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 155370 + }, + { + "epoch": 0.5914146296902476, + "grad_norm": 0.1406717151403427, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 155380 + }, + { + "epoch": 0.5914526921583703, + "grad_norm": 0.1308826506137848, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 155390 + }, + { + "epoch": 0.591490754626493, + "grad_norm": 0.12167200446128845, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 155400 + }, + { + "epoch": 0.5915288170946157, + "grad_norm": 0.12970460951328278, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 155410 + }, + { + "epoch": 0.5915668795627383, + "grad_norm": 0.13693182170391083, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 155420 + }, + { + "epoch": 0.591604942030861, + "grad_norm": 0.13417673110961914, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 155430 + }, + { + "epoch": 0.5916430044989838, + "grad_norm": 0.12016027420759201, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 155440 + }, + { + "epoch": 0.5916810669671064, + "grad_norm": 0.11692225933074951, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 155450 + }, + { + "epoch": 0.5917191294352291, + "grad_norm": 0.126420795917511, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 155460 + }, + { + "epoch": 0.5917571919033517, + "grad_norm": 0.11544563621282578, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 155470 + }, + { + "epoch": 0.5917952543714745, + "grad_norm": 0.11593781411647797, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 155480 + }, + { + "epoch": 0.5918333168395972, + "grad_norm": 0.11717119067907333, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 155490 + }, + { + "epoch": 0.5918713793077198, + "grad_norm": 0.1266764998435974, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 155500 + }, + { + "epoch": 0.5919094417758425, + "grad_norm": 0.12252969294786453, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 155510 + }, + { + "epoch": 0.5919475042439653, + "grad_norm": 0.12028811126947403, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 155520 + }, + { + "epoch": 0.5919855667120879, + "grad_norm": 0.1271073818206787, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 155530 + }, + { + "epoch": 0.5920236291802106, + "grad_norm": 0.12291359901428223, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 155540 + }, + { + "epoch": 0.5920616916483332, + "grad_norm": 0.12409337610006332, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 155550 + }, + { + "epoch": 0.5920997541164559, + "grad_norm": 0.11245585978031158, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 155560 + }, + { + "epoch": 0.5921378165845786, + "grad_norm": 0.115641288459301, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 155570 + }, + { + "epoch": 0.5921758790527013, + "grad_norm": 0.12326592206954956, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 155580 + }, + { + "epoch": 0.592213941520824, + "grad_norm": 0.11967624723911285, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 155590 + }, + { + "epoch": 0.5922520039889466, + "grad_norm": 0.1309567093849182, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 155600 + }, + { + "epoch": 0.5922900664570694, + "grad_norm": 0.12665249407291412, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 155610 + }, + { + "epoch": 0.592328128925192, + "grad_norm": 0.12019842863082886, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 155620 + }, + { + "epoch": 0.5923661913933147, + "grad_norm": 0.12676399946212769, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 155630 + }, + { + "epoch": 0.5924042538614374, + "grad_norm": 0.13679906725883484, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 155640 + }, + { + "epoch": 0.5924423163295601, + "grad_norm": 0.12272468209266663, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 155650 + }, + { + "epoch": 0.5924803787976828, + "grad_norm": 0.11839216947555542, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 155660 + }, + { + "epoch": 0.5925184412658054, + "grad_norm": 0.1262093186378479, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 155670 + }, + { + "epoch": 0.5925565037339281, + "grad_norm": 0.1315896213054657, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 155680 + }, + { + "epoch": 0.5925945662020508, + "grad_norm": 0.12056088447570801, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 155690 + }, + { + "epoch": 0.5926326286701735, + "grad_norm": 0.13733994960784912, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 155700 + }, + { + "epoch": 0.5926706911382962, + "grad_norm": 0.11890329420566559, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 155710 + }, + { + "epoch": 0.5927087536064188, + "grad_norm": 0.13701745867729187, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 155720 + }, + { + "epoch": 0.5927468160745415, + "grad_norm": 0.130995512008667, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 155730 + }, + { + "epoch": 0.5927848785426643, + "grad_norm": 0.12714290618896484, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 155740 + }, + { + "epoch": 0.5928229410107869, + "grad_norm": 0.12613579630851746, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 155750 + }, + { + "epoch": 0.5928610034789096, + "grad_norm": 0.12419883906841278, + "learning_rate": 0.0005, + "loss": 2.0857, + "step": 155760 + }, + { + "epoch": 0.5928990659470322, + "grad_norm": 0.13133420050144196, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 155770 + }, + { + "epoch": 0.592937128415155, + "grad_norm": 0.1103355661034584, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 155780 + }, + { + "epoch": 0.5929751908832777, + "grad_norm": 0.12126592546701431, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 155790 + }, + { + "epoch": 0.5930132533514003, + "grad_norm": 0.12847162783145905, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 155800 + }, + { + "epoch": 0.593051315819523, + "grad_norm": 0.12895318865776062, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 155810 + }, + { + "epoch": 0.5930893782876456, + "grad_norm": 0.12282812595367432, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 155820 + }, + { + "epoch": 0.5931274407557684, + "grad_norm": 0.11810677498579025, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 155830 + }, + { + "epoch": 0.593165503223891, + "grad_norm": 0.12165694683790207, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 155840 + }, + { + "epoch": 0.5932035656920137, + "grad_norm": 0.1188536211848259, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 155850 + }, + { + "epoch": 0.5932416281601364, + "grad_norm": 0.11744207888841629, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 155860 + }, + { + "epoch": 0.5932796906282591, + "grad_norm": 0.11881184577941895, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 155870 + }, + { + "epoch": 0.5933177530963818, + "grad_norm": 0.1358705759048462, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 155880 + }, + { + "epoch": 0.5933558155645045, + "grad_norm": 0.12554964423179626, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 155890 + }, + { + "epoch": 0.5933938780326271, + "grad_norm": 0.13144125044345856, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 155900 + }, + { + "epoch": 0.5934319405007499, + "grad_norm": 0.12426116317510605, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 155910 + }, + { + "epoch": 0.5934700029688725, + "grad_norm": 0.12069100141525269, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 155920 + }, + { + "epoch": 0.5935080654369952, + "grad_norm": 0.12368258088827133, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 155930 + }, + { + "epoch": 0.5935461279051178, + "grad_norm": 0.1293889284133911, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 155940 + }, + { + "epoch": 0.5935841903732406, + "grad_norm": 0.12078117579221725, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 155950 + }, + { + "epoch": 0.5936222528413633, + "grad_norm": 0.12822595238685608, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 155960 + }, + { + "epoch": 0.5936603153094859, + "grad_norm": 0.12021885067224503, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 155970 + }, + { + "epoch": 0.5936983777776086, + "grad_norm": 0.12757346034049988, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 155980 + }, + { + "epoch": 0.5937364402457312, + "grad_norm": 0.12365161627531052, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 155990 + }, + { + "epoch": 0.593774502713854, + "grad_norm": 0.13567115366458893, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 156000 + }, + { + "epoch": 0.5938125651819767, + "grad_norm": 0.13355757296085358, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 156010 + }, + { + "epoch": 0.5938506276500993, + "grad_norm": 0.1318034529685974, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 156020 + }, + { + "epoch": 0.593888690118222, + "grad_norm": 0.11863218992948532, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 156030 + }, + { + "epoch": 0.5939267525863448, + "grad_norm": 0.13734151422977448, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 156040 + }, + { + "epoch": 0.5939648150544674, + "grad_norm": 0.12175113707780838, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 156050 + }, + { + "epoch": 0.5940028775225901, + "grad_norm": 0.12426832318305969, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 156060 + }, + { + "epoch": 0.5940409399907127, + "grad_norm": 0.1340668648481369, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 156070 + }, + { + "epoch": 0.5940790024588355, + "grad_norm": 0.12402141094207764, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 156080 + }, + { + "epoch": 0.5941170649269581, + "grad_norm": 0.12372589111328125, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 156090 + }, + { + "epoch": 0.5941551273950808, + "grad_norm": 0.11944245547056198, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 156100 + }, + { + "epoch": 0.5941931898632035, + "grad_norm": 0.12028376013040543, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 156110 + }, + { + "epoch": 0.5942312523313261, + "grad_norm": 0.12873254716396332, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 156120 + }, + { + "epoch": 0.5942693147994489, + "grad_norm": 0.1547442525625229, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 156130 + }, + { + "epoch": 0.5943073772675715, + "grad_norm": 0.1363360434770584, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 156140 + }, + { + "epoch": 0.5943454397356942, + "grad_norm": 0.11733988672494888, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 156150 + }, + { + "epoch": 0.5943835022038169, + "grad_norm": 0.11870251595973969, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 156160 + }, + { + "epoch": 0.5944215646719396, + "grad_norm": 0.12539437413215637, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 156170 + }, + { + "epoch": 0.5944596271400623, + "grad_norm": 0.11921115219593048, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 156180 + }, + { + "epoch": 0.5944976896081849, + "grad_norm": 0.12341469526290894, + "learning_rate": 0.0005, + "loss": 2.1276, + "step": 156190 + }, + { + "epoch": 0.5945357520763076, + "grad_norm": 0.12102333456277847, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 156200 + }, + { + "epoch": 0.5945738145444304, + "grad_norm": 0.1152803897857666, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 156210 + }, + { + "epoch": 0.594611877012553, + "grad_norm": 0.13594534993171692, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 156220 + }, + { + "epoch": 0.5946499394806757, + "grad_norm": 0.11999261379241943, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 156230 + }, + { + "epoch": 0.5946880019487983, + "grad_norm": 0.1253129243850708, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 156240 + }, + { + "epoch": 0.594726064416921, + "grad_norm": 0.11831702291965485, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 156250 + }, + { + "epoch": 0.5947641268850438, + "grad_norm": 0.13602091372013092, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 156260 + }, + { + "epoch": 0.5948021893531664, + "grad_norm": 0.12433513253927231, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 156270 + }, + { + "epoch": 0.5948402518212891, + "grad_norm": 0.11340433359146118, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 156280 + }, + { + "epoch": 0.5948783142894117, + "grad_norm": 0.11828036606311798, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 156290 + }, + { + "epoch": 0.5949163767575345, + "grad_norm": 0.11860781162977219, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 156300 + }, + { + "epoch": 0.5949544392256572, + "grad_norm": 0.12515273690223694, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 156310 + }, + { + "epoch": 0.5949925016937798, + "grad_norm": 0.12391990423202515, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 156320 + }, + { + "epoch": 0.5950305641619025, + "grad_norm": 0.13401736319065094, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 156330 + }, + { + "epoch": 0.5950686266300252, + "grad_norm": 0.12803804874420166, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 156340 + }, + { + "epoch": 0.5951066890981479, + "grad_norm": 0.12634626030921936, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 156350 + }, + { + "epoch": 0.5951447515662706, + "grad_norm": 0.1226266622543335, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 156360 + }, + { + "epoch": 0.5951828140343932, + "grad_norm": 0.1310350000858307, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 156370 + }, + { + "epoch": 0.595220876502516, + "grad_norm": 0.11200859397649765, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 156380 + }, + { + "epoch": 0.5952589389706386, + "grad_norm": 0.1256367266178131, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 156390 + }, + { + "epoch": 0.5952970014387613, + "grad_norm": 0.11647086590528488, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 156400 + }, + { + "epoch": 0.595335063906884, + "grad_norm": 0.11876901984214783, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 156410 + }, + { + "epoch": 0.5953731263750066, + "grad_norm": 0.12916727364063263, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 156420 + }, + { + "epoch": 0.5954111888431294, + "grad_norm": 0.12823139131069183, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 156430 + }, + { + "epoch": 0.595449251311252, + "grad_norm": 0.13512112200260162, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 156440 + }, + { + "epoch": 0.5954873137793747, + "grad_norm": 0.12843908369541168, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 156450 + }, + { + "epoch": 0.5955253762474974, + "grad_norm": 0.12319228798151016, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 156460 + }, + { + "epoch": 0.5955634387156201, + "grad_norm": 0.13872624933719635, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 156470 + }, + { + "epoch": 0.5956015011837428, + "grad_norm": 0.12136664241552353, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 156480 + }, + { + "epoch": 0.5956395636518654, + "grad_norm": 0.12200096249580383, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 156490 + }, + { + "epoch": 0.5956776261199881, + "grad_norm": 0.12467196583747864, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 156500 + }, + { + "epoch": 0.5957156885881109, + "grad_norm": 0.11812784522771835, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 156510 + }, + { + "epoch": 0.5957537510562335, + "grad_norm": 0.12433940917253494, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 156520 + }, + { + "epoch": 0.5957918135243562, + "grad_norm": 0.11930729448795319, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 156530 + }, + { + "epoch": 0.5958298759924788, + "grad_norm": 0.12910746037960052, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 156540 + }, + { + "epoch": 0.5958679384606015, + "grad_norm": 0.12703043222427368, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 156550 + }, + { + "epoch": 0.5959060009287243, + "grad_norm": 0.12971876561641693, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 156560 + }, + { + "epoch": 0.5959440633968469, + "grad_norm": 0.12978804111480713, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 156570 + }, + { + "epoch": 0.5959821258649696, + "grad_norm": 0.1250307708978653, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 156580 + }, + { + "epoch": 0.5960201883330922, + "grad_norm": 0.12306489050388336, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 156590 + }, + { + "epoch": 0.596058250801215, + "grad_norm": 0.12678273022174835, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 156600 + }, + { + "epoch": 0.5960963132693377, + "grad_norm": 0.1361202746629715, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 156610 + }, + { + "epoch": 0.5961343757374603, + "grad_norm": 0.1354992687702179, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 156620 + }, + { + "epoch": 0.596172438205583, + "grad_norm": 0.11919796466827393, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 156630 + }, + { + "epoch": 0.5962105006737057, + "grad_norm": 0.12351883947849274, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 156640 + }, + { + "epoch": 0.5962485631418284, + "grad_norm": 0.1216733455657959, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 156650 + }, + { + "epoch": 0.596286625609951, + "grad_norm": 0.11714150756597519, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 156660 + }, + { + "epoch": 0.5963246880780737, + "grad_norm": 0.11788475513458252, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 156670 + }, + { + "epoch": 0.5963627505461964, + "grad_norm": 0.1324845850467682, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 156680 + }, + { + "epoch": 0.5964008130143191, + "grad_norm": 0.16658127307891846, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 156690 + }, + { + "epoch": 0.5964388754824418, + "grad_norm": 0.1240057572722435, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 156700 + }, + { + "epoch": 0.5964769379505644, + "grad_norm": 0.12588045001029968, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 156710 + }, + { + "epoch": 0.5965150004186871, + "grad_norm": 0.12813587486743927, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 156720 + }, + { + "epoch": 0.5965530628868099, + "grad_norm": 0.12198911607265472, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 156730 + }, + { + "epoch": 0.5965911253549325, + "grad_norm": 0.13085797429084778, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 156740 + }, + { + "epoch": 0.5966291878230552, + "grad_norm": 0.12170829623937607, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 156750 + }, + { + "epoch": 0.5966672502911778, + "grad_norm": 0.1213245689868927, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 156760 + }, + { + "epoch": 0.5967053127593006, + "grad_norm": 0.11355091631412506, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 156770 + }, + { + "epoch": 0.5967433752274233, + "grad_norm": 0.12887951731681824, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 156780 + }, + { + "epoch": 0.5967814376955459, + "grad_norm": 0.12898492813110352, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 156790 + }, + { + "epoch": 0.5968195001636686, + "grad_norm": 0.1291269212961197, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 156800 + }, + { + "epoch": 0.5968575626317913, + "grad_norm": 0.11913888901472092, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 156810 + }, + { + "epoch": 0.596895625099914, + "grad_norm": 0.11804598569869995, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 156820 + }, + { + "epoch": 0.5969336875680367, + "grad_norm": 0.11910227686166763, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 156830 + }, + { + "epoch": 0.5969717500361593, + "grad_norm": 0.12902553379535675, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 156840 + }, + { + "epoch": 0.597009812504282, + "grad_norm": 0.13280050456523895, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 156850 + }, + { + "epoch": 0.5970478749724047, + "grad_norm": 0.14002548158168793, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 156860 + }, + { + "epoch": 0.5970859374405274, + "grad_norm": 0.11478378623723984, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 156870 + }, + { + "epoch": 0.5971239999086501, + "grad_norm": 0.129014253616333, + "learning_rate": 0.0005, + "loss": 2.1396, + "step": 156880 + }, + { + "epoch": 0.5971620623767727, + "grad_norm": 0.12656255066394806, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 156890 + }, + { + "epoch": 0.5972001248448955, + "grad_norm": 0.14026038348674774, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 156900 + }, + { + "epoch": 0.5972381873130181, + "grad_norm": 0.11731848120689392, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 156910 + }, + { + "epoch": 0.5972762497811408, + "grad_norm": 0.12975703179836273, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 156920 + }, + { + "epoch": 0.5973143122492635, + "grad_norm": 0.1149299144744873, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 156930 + }, + { + "epoch": 0.5973523747173862, + "grad_norm": 0.12866534292697906, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 156940 + }, + { + "epoch": 0.5973904371855089, + "grad_norm": 0.12251448631286621, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 156950 + }, + { + "epoch": 0.5974284996536315, + "grad_norm": 0.13120411336421967, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 156960 + }, + { + "epoch": 0.5974665621217542, + "grad_norm": 0.1309138685464859, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 156970 + }, + { + "epoch": 0.5975046245898769, + "grad_norm": 0.12745153903961182, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 156980 + }, + { + "epoch": 0.5975426870579996, + "grad_norm": 0.1209530383348465, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 156990 + }, + { + "epoch": 0.5975807495261223, + "grad_norm": 0.13832753896713257, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 157000 + }, + { + "epoch": 0.5976188119942449, + "grad_norm": 0.12662719190120697, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 157010 + }, + { + "epoch": 0.5976568744623676, + "grad_norm": 0.1310693472623825, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 157020 + }, + { + "epoch": 0.5976949369304904, + "grad_norm": 0.12905949354171753, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 157030 + }, + { + "epoch": 0.597732999398613, + "grad_norm": 0.12887629866600037, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 157040 + }, + { + "epoch": 0.5977710618667357, + "grad_norm": 0.14822103083133698, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 157050 + }, + { + "epoch": 0.5978091243348583, + "grad_norm": 0.142548605799675, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 157060 + }, + { + "epoch": 0.5978471868029811, + "grad_norm": 0.13723008334636688, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 157070 + }, + { + "epoch": 0.5978852492711038, + "grad_norm": 0.11937706917524338, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 157080 + }, + { + "epoch": 0.5979233117392264, + "grad_norm": 0.12686049938201904, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 157090 + }, + { + "epoch": 0.5979613742073491, + "grad_norm": 0.12511029839515686, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 157100 + }, + { + "epoch": 0.5979994366754717, + "grad_norm": 0.13167688250541687, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 157110 + }, + { + "epoch": 0.5980374991435945, + "grad_norm": 0.11056148260831833, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 157120 + }, + { + "epoch": 0.5980755616117172, + "grad_norm": 0.13581658899784088, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 157130 + }, + { + "epoch": 0.5981136240798398, + "grad_norm": 0.11735519021749496, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 157140 + }, + { + "epoch": 0.5981516865479625, + "grad_norm": 0.11120060831308365, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 157150 + }, + { + "epoch": 0.5981897490160852, + "grad_norm": 0.122090183198452, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 157160 + }, + { + "epoch": 0.5982278114842079, + "grad_norm": 0.1253940910100937, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 157170 + }, + { + "epoch": 0.5982658739523306, + "grad_norm": 0.13477633893489838, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 157180 + }, + { + "epoch": 0.5983039364204532, + "grad_norm": 0.1272205114364624, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 157190 + }, + { + "epoch": 0.598341998888576, + "grad_norm": 0.12778523564338684, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 157200 + }, + { + "epoch": 0.5983800613566986, + "grad_norm": 0.1212928295135498, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 157210 + }, + { + "epoch": 0.5984181238248213, + "grad_norm": 0.12366236001253128, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 157220 + }, + { + "epoch": 0.598456186292944, + "grad_norm": 0.12961052358150482, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 157230 + }, + { + "epoch": 0.5984942487610667, + "grad_norm": 0.12433735281229019, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 157240 + }, + { + "epoch": 0.5985323112291894, + "grad_norm": 0.12253043800592422, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 157250 + }, + { + "epoch": 0.598570373697312, + "grad_norm": 0.11693271994590759, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 157260 + }, + { + "epoch": 0.5986084361654347, + "grad_norm": 0.11834903806447983, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 157270 + }, + { + "epoch": 0.5986464986335573, + "grad_norm": 0.12358506768941879, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 157280 + }, + { + "epoch": 0.5986845611016801, + "grad_norm": 0.12689338624477386, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 157290 + }, + { + "epoch": 0.5987226235698028, + "grad_norm": 0.12762139737606049, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 157300 + }, + { + "epoch": 0.5987606860379254, + "grad_norm": 0.12027955055236816, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 157310 + }, + { + "epoch": 0.5987987485060481, + "grad_norm": 0.13357017934322357, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 157320 + }, + { + "epoch": 0.5988368109741709, + "grad_norm": 0.13205654919147491, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 157330 + }, + { + "epoch": 0.5988748734422935, + "grad_norm": 0.12938706576824188, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 157340 + }, + { + "epoch": 0.5989129359104162, + "grad_norm": 0.11992862075567245, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 157350 + }, + { + "epoch": 0.5989509983785388, + "grad_norm": 0.12154851108789444, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 157360 + }, + { + "epoch": 0.5989890608466616, + "grad_norm": 0.1233595684170723, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 157370 + }, + { + "epoch": 0.5990271233147842, + "grad_norm": 0.12630879878997803, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 157380 + }, + { + "epoch": 0.5990651857829069, + "grad_norm": 0.12620669603347778, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 157390 + }, + { + "epoch": 0.5991032482510296, + "grad_norm": 0.1237010583281517, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 157400 + }, + { + "epoch": 0.5991413107191522, + "grad_norm": 0.12899191677570343, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 157410 + }, + { + "epoch": 0.599179373187275, + "grad_norm": 0.13100019097328186, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 157420 + }, + { + "epoch": 0.5992174356553976, + "grad_norm": 0.12148187309503555, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 157430 + }, + { + "epoch": 0.5992554981235203, + "grad_norm": 0.12283346056938171, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 157440 + }, + { + "epoch": 0.599293560591643, + "grad_norm": 0.1465480476617813, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 157450 + }, + { + "epoch": 0.5993316230597657, + "grad_norm": 0.11719803512096405, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 157460 + }, + { + "epoch": 0.5993696855278884, + "grad_norm": 0.12916360795497894, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 157470 + }, + { + "epoch": 0.599407747996011, + "grad_norm": 0.1229061484336853, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 157480 + }, + { + "epoch": 0.5994458104641337, + "grad_norm": 0.1390194296836853, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 157490 + }, + { + "epoch": 0.5994838729322565, + "grad_norm": 0.12508492171764374, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 157500 + }, + { + "epoch": 0.5995219354003791, + "grad_norm": 0.11759859323501587, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 157510 + }, + { + "epoch": 0.5995599978685018, + "grad_norm": 0.12426968663930893, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 157520 + }, + { + "epoch": 0.5995980603366244, + "grad_norm": 0.11593806743621826, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 157530 + }, + { + "epoch": 0.5996361228047471, + "grad_norm": 0.12494784593582153, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 157540 + }, + { + "epoch": 0.5996741852728699, + "grad_norm": 0.12596601247787476, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 157550 + }, + { + "epoch": 0.5997122477409925, + "grad_norm": 0.12842027842998505, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 157560 + }, + { + "epoch": 0.5997503102091152, + "grad_norm": 0.13903789222240448, + "learning_rate": 0.0005, + "loss": 2.1352, + "step": 157570 + }, + { + "epoch": 0.5997883726772378, + "grad_norm": 0.11448253691196442, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 157580 + }, + { + "epoch": 0.5998264351453606, + "grad_norm": 0.12419616430997849, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 157590 + }, + { + "epoch": 0.5998644976134833, + "grad_norm": 0.12554436922073364, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 157600 + }, + { + "epoch": 0.5999025600816059, + "grad_norm": 0.12357697635889053, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 157610 + }, + { + "epoch": 0.5999406225497286, + "grad_norm": 0.12808255851268768, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 157620 + }, + { + "epoch": 0.5999786850178513, + "grad_norm": 0.13314425945281982, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 157630 + }, + { + "epoch": 0.600016747485974, + "grad_norm": 0.13815081119537354, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 157640 + }, + { + "epoch": 0.6000548099540967, + "grad_norm": 0.14097647368907928, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 157650 + }, + { + "epoch": 0.6000928724222193, + "grad_norm": 0.12406017631292343, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 157660 + }, + { + "epoch": 0.6001309348903421, + "grad_norm": 0.1178424134850502, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 157670 + }, + { + "epoch": 0.6001689973584647, + "grad_norm": 0.12351218611001968, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 157680 + }, + { + "epoch": 0.6002070598265874, + "grad_norm": 0.12011931836605072, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 157690 + }, + { + "epoch": 0.60024512229471, + "grad_norm": 0.1278388351202011, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 157700 + }, + { + "epoch": 0.6002831847628327, + "grad_norm": 0.12576743960380554, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 157710 + }, + { + "epoch": 0.6003212472309555, + "grad_norm": 0.14748530089855194, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 157720 + }, + { + "epoch": 0.6003593096990781, + "grad_norm": 0.12966130673885345, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 157730 + }, + { + "epoch": 0.6003973721672008, + "grad_norm": 0.1358342468738556, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 157740 + }, + { + "epoch": 0.6004354346353235, + "grad_norm": 0.11560594290494919, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 157750 + }, + { + "epoch": 0.6004734971034462, + "grad_norm": 0.1258758008480072, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 157760 + }, + { + "epoch": 0.6005115595715689, + "grad_norm": 0.12026939541101456, + "learning_rate": 0.0005, + "loss": 2.1319, + "step": 157770 + }, + { + "epoch": 0.6005496220396915, + "grad_norm": 0.12306264787912369, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 157780 + }, + { + "epoch": 0.6005876845078142, + "grad_norm": 0.13696016371250153, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 157790 + }, + { + "epoch": 0.600625746975937, + "grad_norm": 0.12249837815761566, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 157800 + }, + { + "epoch": 0.6006638094440596, + "grad_norm": 0.13254183530807495, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 157810 + }, + { + "epoch": 0.6007018719121823, + "grad_norm": 0.1298949271440506, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 157820 + }, + { + "epoch": 0.6007399343803049, + "grad_norm": 0.1328222155570984, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 157830 + }, + { + "epoch": 0.6007779968484276, + "grad_norm": 0.12573933601379395, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 157840 + }, + { + "epoch": 0.6008160593165504, + "grad_norm": 0.14559721946716309, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 157850 + }, + { + "epoch": 0.600854121784673, + "grad_norm": 0.13562941551208496, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 157860 + }, + { + "epoch": 0.6008921842527957, + "grad_norm": 0.12265808880329132, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 157870 + }, + { + "epoch": 0.6009302467209183, + "grad_norm": 0.1416488140821457, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 157880 + }, + { + "epoch": 0.6009683091890411, + "grad_norm": 0.126577690243721, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 157890 + }, + { + "epoch": 0.6010063716571638, + "grad_norm": 0.13580593466758728, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 157900 + }, + { + "epoch": 0.6010444341252864, + "grad_norm": 0.1211516261100769, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 157910 + }, + { + "epoch": 0.6010824965934091, + "grad_norm": 0.13828988373279572, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 157920 + }, + { + "epoch": 0.6011205590615318, + "grad_norm": 0.11835107952356339, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 157930 + }, + { + "epoch": 0.6011586215296545, + "grad_norm": 0.11946625262498856, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 157940 + }, + { + "epoch": 0.6011966839977771, + "grad_norm": 0.12866944074630737, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 157950 + }, + { + "epoch": 0.6012347464658998, + "grad_norm": 0.12162616103887558, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 157960 + }, + { + "epoch": 0.6012728089340225, + "grad_norm": 0.1159362643957138, + "learning_rate": 0.0005, + "loss": 2.1358, + "step": 157970 + }, + { + "epoch": 0.6013108714021452, + "grad_norm": 0.11833919584751129, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 157980 + }, + { + "epoch": 0.6013489338702679, + "grad_norm": 0.18633560836315155, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 157990 + }, + { + "epoch": 0.6013869963383905, + "grad_norm": 0.12414807081222534, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 158000 + }, + { + "epoch": 0.6014250588065132, + "grad_norm": 0.12252053618431091, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 158010 + }, + { + "epoch": 0.601463121274636, + "grad_norm": 0.11328023672103882, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 158020 + }, + { + "epoch": 0.6015011837427586, + "grad_norm": 0.12160588800907135, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 158030 + }, + { + "epoch": 0.6015392462108813, + "grad_norm": 0.15667229890823364, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 158040 + }, + { + "epoch": 0.6015773086790039, + "grad_norm": 0.12987153232097626, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 158050 + }, + { + "epoch": 0.6016153711471267, + "grad_norm": 0.13119001686573029, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 158060 + }, + { + "epoch": 0.6016534336152494, + "grad_norm": 0.12040984630584717, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 158070 + }, + { + "epoch": 0.601691496083372, + "grad_norm": 0.11469858884811401, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 158080 + }, + { + "epoch": 0.6017295585514947, + "grad_norm": 0.1116182953119278, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 158090 + }, + { + "epoch": 0.6017676210196174, + "grad_norm": 0.1325918287038803, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 158100 + }, + { + "epoch": 0.6018056834877401, + "grad_norm": 0.11638123542070389, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 158110 + }, + { + "epoch": 0.6018437459558628, + "grad_norm": 0.12299565225839615, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 158120 + }, + { + "epoch": 0.6018818084239854, + "grad_norm": 0.1290263831615448, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 158130 + }, + { + "epoch": 0.6019198708921081, + "grad_norm": 0.1211891621351242, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 158140 + }, + { + "epoch": 0.6019579333602308, + "grad_norm": 0.11648281663656235, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 158150 + }, + { + "epoch": 0.6019959958283535, + "grad_norm": 0.10998690128326416, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 158160 + }, + { + "epoch": 0.6020340582964762, + "grad_norm": 0.12193288654088974, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 158170 + }, + { + "epoch": 0.6020721207645988, + "grad_norm": 0.11943355947732925, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 158180 + }, + { + "epoch": 0.6021101832327216, + "grad_norm": 0.11397210508584976, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 158190 + }, + { + "epoch": 0.6021482457008442, + "grad_norm": 0.5890202522277832, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 158200 + }, + { + "epoch": 0.6021863081689669, + "grad_norm": 0.12778812646865845, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 158210 + }, + { + "epoch": 0.6022243706370896, + "grad_norm": 0.1296212375164032, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 158220 + }, + { + "epoch": 0.6022624331052123, + "grad_norm": 0.12705478072166443, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 158230 + }, + { + "epoch": 0.602300495573335, + "grad_norm": 0.12249784171581268, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 158240 + }, + { + "epoch": 0.6023385580414576, + "grad_norm": 0.13054530322551727, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 158250 + }, + { + "epoch": 0.6023766205095803, + "grad_norm": 0.12559649348258972, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 158260 + }, + { + "epoch": 0.602414682977703, + "grad_norm": 0.1225946918129921, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 158270 + }, + { + "epoch": 0.6024527454458257, + "grad_norm": 0.132630854845047, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 158280 + }, + { + "epoch": 0.6024908079139484, + "grad_norm": 0.12257161736488342, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 158290 + }, + { + "epoch": 0.602528870382071, + "grad_norm": 0.11760008335113525, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 158300 + }, + { + "epoch": 0.6025669328501937, + "grad_norm": 0.12999369204044342, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 158310 + }, + { + "epoch": 0.6026049953183165, + "grad_norm": 0.15574951469898224, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 158320 + }, + { + "epoch": 0.6026430577864391, + "grad_norm": 0.10984206199645996, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 158330 + }, + { + "epoch": 0.6026811202545618, + "grad_norm": 0.1276576966047287, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 158340 + }, + { + "epoch": 0.6027191827226844, + "grad_norm": 0.12136907875537872, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 158350 + }, + { + "epoch": 0.6027572451908072, + "grad_norm": 0.1264997273683548, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 158360 + }, + { + "epoch": 0.6027953076589299, + "grad_norm": 0.1296354979276657, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 158370 + }, + { + "epoch": 0.6028333701270525, + "grad_norm": 0.14940467476844788, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 158380 + }, + { + "epoch": 0.6028714325951752, + "grad_norm": 0.12434712797403336, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 158390 + }, + { + "epoch": 0.6029094950632979, + "grad_norm": 0.12737199664115906, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 158400 + }, + { + "epoch": 0.6029475575314206, + "grad_norm": 0.1796177625656128, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 158410 + }, + { + "epoch": 0.6029856199995433, + "grad_norm": 0.11623994261026382, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 158420 + }, + { + "epoch": 0.6030236824676659, + "grad_norm": 0.11934838443994522, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 158430 + }, + { + "epoch": 0.6030617449357886, + "grad_norm": 0.11687199771404266, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 158440 + }, + { + "epoch": 0.6030998074039113, + "grad_norm": 0.11579285562038422, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 158450 + }, + { + "epoch": 0.603137869872034, + "grad_norm": 0.12085693329572678, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 158460 + }, + { + "epoch": 0.6031759323401567, + "grad_norm": 0.11823724955320358, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 158470 + }, + { + "epoch": 0.6032139948082793, + "grad_norm": 0.11455874890089035, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 158480 + }, + { + "epoch": 0.6032520572764021, + "grad_norm": 0.11831489950418472, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 158490 + }, + { + "epoch": 0.6032901197445247, + "grad_norm": 0.1299872249364853, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 158500 + }, + { + "epoch": 0.6033281822126474, + "grad_norm": 0.13752396404743195, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 158510 + }, + { + "epoch": 0.60336624468077, + "grad_norm": 0.135402113199234, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 158520 + }, + { + "epoch": 0.6034043071488928, + "grad_norm": 0.12532898783683777, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 158530 + }, + { + "epoch": 0.6034423696170155, + "grad_norm": 0.13254982233047485, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 158540 + }, + { + "epoch": 0.6034804320851381, + "grad_norm": 0.12623955309391022, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 158550 + }, + { + "epoch": 0.6035184945532608, + "grad_norm": 0.1269414871931076, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 158560 + }, + { + "epoch": 0.6035565570213834, + "grad_norm": 0.12244527786970139, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 158570 + }, + { + "epoch": 0.6035946194895062, + "grad_norm": 0.11707145720720291, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 158580 + }, + { + "epoch": 0.6036326819576289, + "grad_norm": 0.124131940305233, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 158590 + }, + { + "epoch": 0.6036707444257515, + "grad_norm": 0.11714545637369156, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 158600 + }, + { + "epoch": 0.6037088068938742, + "grad_norm": 0.11764192581176758, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 158610 + }, + { + "epoch": 0.603746869361997, + "grad_norm": 0.12883175909519196, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 158620 + }, + { + "epoch": 0.6037849318301196, + "grad_norm": 0.12830160558223724, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 158630 + }, + { + "epoch": 0.6038229942982423, + "grad_norm": 0.13523967564105988, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 158640 + }, + { + "epoch": 0.6038610567663649, + "grad_norm": 0.29038044810295105, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 158650 + }, + { + "epoch": 0.6038991192344877, + "grad_norm": 0.1264004111289978, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 158660 + }, + { + "epoch": 0.6039371817026103, + "grad_norm": 0.12499309331178665, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 158670 + }, + { + "epoch": 0.603975244170733, + "grad_norm": 0.12598103284835815, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 158680 + }, + { + "epoch": 0.6040133066388557, + "grad_norm": 0.13107284903526306, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 158690 + }, + { + "epoch": 0.6040513691069783, + "grad_norm": 0.11517848819494247, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 158700 + }, + { + "epoch": 0.6040894315751011, + "grad_norm": 0.12531235814094543, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 158710 + }, + { + "epoch": 0.6041274940432237, + "grad_norm": 0.12719525396823883, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 158720 + }, + { + "epoch": 0.6041655565113464, + "grad_norm": 0.12175626307725906, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 158730 + }, + { + "epoch": 0.6042036189794691, + "grad_norm": 0.11426258087158203, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 158740 + }, + { + "epoch": 0.6042416814475918, + "grad_norm": 0.12400322407484055, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 158750 + }, + { + "epoch": 0.6042797439157145, + "grad_norm": 0.13072001934051514, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 158760 + }, + { + "epoch": 0.6043178063838371, + "grad_norm": 0.12369487434625626, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 158770 + }, + { + "epoch": 0.6043558688519598, + "grad_norm": 0.12297425419092178, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 158780 + }, + { + "epoch": 0.6043939313200826, + "grad_norm": 0.11754658073186874, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 158790 + }, + { + "epoch": 0.6044319937882052, + "grad_norm": 0.12254299223423004, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 158800 + }, + { + "epoch": 0.6044700562563279, + "grad_norm": 0.12980803847312927, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 158810 + }, + { + "epoch": 0.6045081187244505, + "grad_norm": 0.13752233982086182, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 158820 + }, + { + "epoch": 0.6045461811925733, + "grad_norm": 0.12054204940795898, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 158830 + }, + { + "epoch": 0.604584243660696, + "grad_norm": 0.13067400455474854, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 158840 + }, + { + "epoch": 0.6046223061288186, + "grad_norm": 0.1433134377002716, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 158850 + }, + { + "epoch": 0.6046603685969413, + "grad_norm": 0.14245948195457458, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 158860 + }, + { + "epoch": 0.6046984310650639, + "grad_norm": 0.1247478574514389, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 158870 + }, + { + "epoch": 0.6047364935331867, + "grad_norm": 0.12523062527179718, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 158880 + }, + { + "epoch": 0.6047745560013094, + "grad_norm": 0.11393284797668457, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 158890 + }, + { + "epoch": 0.604812618469432, + "grad_norm": 0.1180889904499054, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 158900 + }, + { + "epoch": 0.6048506809375547, + "grad_norm": 0.13296665251255035, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 158910 + }, + { + "epoch": 0.6048887434056774, + "grad_norm": 0.13823893666267395, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 158920 + }, + { + "epoch": 0.6049268058738001, + "grad_norm": 0.13230063021183014, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 158930 + }, + { + "epoch": 0.6049648683419228, + "grad_norm": 0.11703740805387497, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 158940 + }, + { + "epoch": 0.6050029308100454, + "grad_norm": 0.1351865530014038, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 158950 + }, + { + "epoch": 0.6050409932781682, + "grad_norm": 0.12466300278902054, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 158960 + }, + { + "epoch": 0.6050790557462908, + "grad_norm": 0.11597589403390884, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 158970 + }, + { + "epoch": 0.6051171182144135, + "grad_norm": 0.12419840693473816, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 158980 + }, + { + "epoch": 0.6051551806825362, + "grad_norm": 0.12489227950572968, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 158990 + }, + { + "epoch": 0.6051932431506588, + "grad_norm": 0.1234791949391365, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 159000 + }, + { + "epoch": 0.6052313056187816, + "grad_norm": 0.11526373773813248, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 159010 + }, + { + "epoch": 0.6052693680869042, + "grad_norm": 0.11705972254276276, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 159020 + }, + { + "epoch": 0.6053074305550269, + "grad_norm": 0.1232641190290451, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 159030 + }, + { + "epoch": 0.6053454930231496, + "grad_norm": 0.13102813065052032, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 159040 + }, + { + "epoch": 0.6053835554912723, + "grad_norm": 0.11702539771795273, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 159050 + }, + { + "epoch": 0.605421617959395, + "grad_norm": 0.12806493043899536, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 159060 + }, + { + "epoch": 0.6054596804275176, + "grad_norm": 0.12140469253063202, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 159070 + }, + { + "epoch": 0.6054977428956403, + "grad_norm": 0.13960334658622742, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 159080 + }, + { + "epoch": 0.6055358053637631, + "grad_norm": 0.11300751566886902, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 159090 + }, + { + "epoch": 0.6055738678318857, + "grad_norm": 0.1150921955704689, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 159100 + }, + { + "epoch": 0.6056119303000084, + "grad_norm": 0.12943331897258759, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 159110 + }, + { + "epoch": 0.605649992768131, + "grad_norm": 0.12138635665178299, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 159120 + }, + { + "epoch": 0.6056880552362537, + "grad_norm": 0.12762968242168427, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 159130 + }, + { + "epoch": 0.6057261177043765, + "grad_norm": 0.11474503576755524, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 159140 + }, + { + "epoch": 0.6057641801724991, + "grad_norm": 0.12041950970888138, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 159150 + }, + { + "epoch": 0.6058022426406218, + "grad_norm": 0.11588443070650101, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 159160 + }, + { + "epoch": 0.6058403051087444, + "grad_norm": 0.12442999333143234, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 159170 + }, + { + "epoch": 0.6058783675768672, + "grad_norm": 0.1322493851184845, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 159180 + }, + { + "epoch": 0.6059164300449899, + "grad_norm": 0.12426673620939255, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 159190 + }, + { + "epoch": 0.6059544925131125, + "grad_norm": 0.11464572697877884, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 159200 + }, + { + "epoch": 0.6059925549812352, + "grad_norm": 0.11597344279289246, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 159210 + }, + { + "epoch": 0.6060306174493579, + "grad_norm": 0.12046016752719879, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 159220 + }, + { + "epoch": 0.6060686799174806, + "grad_norm": 0.12151051312685013, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 159230 + }, + { + "epoch": 0.6061067423856032, + "grad_norm": 0.11330360174179077, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 159240 + }, + { + "epoch": 0.6061448048537259, + "grad_norm": 0.12379861623048782, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 159250 + }, + { + "epoch": 0.6061828673218487, + "grad_norm": 0.11828950047492981, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 159260 + }, + { + "epoch": 0.6062209297899713, + "grad_norm": 0.12113313376903534, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 159270 + }, + { + "epoch": 0.606258992258094, + "grad_norm": 0.12315396964550018, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 159280 + }, + { + "epoch": 0.6062970547262166, + "grad_norm": 0.12765733897686005, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 159290 + }, + { + "epoch": 0.6063351171943393, + "grad_norm": 0.12009833753108978, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 159300 + }, + { + "epoch": 0.6063731796624621, + "grad_norm": 0.13260215520858765, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 159310 + }, + { + "epoch": 0.6064112421305847, + "grad_norm": 0.12929363548755646, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 159320 + }, + { + "epoch": 0.6064493045987074, + "grad_norm": 0.13376052677631378, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 159330 + }, + { + "epoch": 0.60648736706683, + "grad_norm": 0.14937913417816162, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 159340 + }, + { + "epoch": 0.6065254295349528, + "grad_norm": 0.13000261783599854, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 159350 + }, + { + "epoch": 0.6065634920030755, + "grad_norm": 0.12199776619672775, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 159360 + }, + { + "epoch": 0.6066015544711981, + "grad_norm": 0.13417278230190277, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 159370 + }, + { + "epoch": 0.6066396169393208, + "grad_norm": 0.11662577837705612, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 159380 + }, + { + "epoch": 0.6066776794074435, + "grad_norm": 0.12676531076431274, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 159390 + }, + { + "epoch": 0.6067157418755662, + "grad_norm": 0.11627112329006195, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 159400 + }, + { + "epoch": 0.6067538043436889, + "grad_norm": 0.1277991533279419, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 159410 + }, + { + "epoch": 0.6067918668118115, + "grad_norm": 0.12240882217884064, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 159420 + }, + { + "epoch": 0.6068299292799342, + "grad_norm": 0.1373898833990097, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 159430 + }, + { + "epoch": 0.606867991748057, + "grad_norm": 0.1356273591518402, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 159440 + }, + { + "epoch": 0.6069060542161796, + "grad_norm": 0.1183418333530426, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 159450 + }, + { + "epoch": 0.6069441166843023, + "grad_norm": 0.12304482609033585, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 159460 + }, + { + "epoch": 0.6069821791524249, + "grad_norm": 0.1182306781411171, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 159470 + }, + { + "epoch": 0.6070202416205477, + "grad_norm": 0.11697513610124588, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 159480 + }, + { + "epoch": 0.6070583040886703, + "grad_norm": 0.11313638091087341, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 159490 + }, + { + "epoch": 0.607096366556793, + "grad_norm": 0.11949624121189117, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 159500 + }, + { + "epoch": 0.6071344290249157, + "grad_norm": 0.12727008759975433, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 159510 + }, + { + "epoch": 0.6071724914930384, + "grad_norm": 0.13288576900959015, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 159520 + }, + { + "epoch": 0.6072105539611611, + "grad_norm": 0.13605280220508575, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 159530 + }, + { + "epoch": 0.6072486164292837, + "grad_norm": 0.11556291580200195, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 159540 + }, + { + "epoch": 0.6072866788974064, + "grad_norm": 0.13261030614376068, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 159550 + }, + { + "epoch": 0.607324741365529, + "grad_norm": 0.11572163552045822, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 159560 + }, + { + "epoch": 0.6073628038336518, + "grad_norm": 0.1369166076183319, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 159570 + }, + { + "epoch": 0.6074008663017745, + "grad_norm": 0.11626111716032028, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 159580 + }, + { + "epoch": 0.6074389287698971, + "grad_norm": 0.12466323375701904, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 159590 + }, + { + "epoch": 0.6074769912380198, + "grad_norm": 0.12724941968917847, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 159600 + }, + { + "epoch": 0.6075150537061426, + "grad_norm": 0.11998318135738373, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 159610 + }, + { + "epoch": 0.6075531161742652, + "grad_norm": 0.11803542822599411, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 159620 + }, + { + "epoch": 0.6075911786423879, + "grad_norm": 0.24420522153377533, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 159630 + }, + { + "epoch": 0.6076292411105105, + "grad_norm": 0.12755897641181946, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 159640 + }, + { + "epoch": 0.6076673035786333, + "grad_norm": 0.12958073616027832, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 159650 + }, + { + "epoch": 0.607705366046756, + "grad_norm": 0.12851276993751526, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 159660 + }, + { + "epoch": 0.6077434285148786, + "grad_norm": 0.13393059372901917, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 159670 + }, + { + "epoch": 0.6077814909830013, + "grad_norm": 0.11613164842128754, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 159680 + }, + { + "epoch": 0.607819553451124, + "grad_norm": 0.12504012882709503, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 159690 + }, + { + "epoch": 0.6078576159192467, + "grad_norm": 0.12086059153079987, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 159700 + }, + { + "epoch": 0.6078956783873694, + "grad_norm": 0.12351615726947784, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 159710 + }, + { + "epoch": 0.607933740855492, + "grad_norm": 0.13252276182174683, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 159720 + }, + { + "epoch": 0.6079718033236147, + "grad_norm": 0.12680232524871826, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 159730 + }, + { + "epoch": 0.6080098657917374, + "grad_norm": 0.44076067209243774, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 159740 + }, + { + "epoch": 0.6080479282598601, + "grad_norm": 0.3102264404296875, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 159750 + }, + { + "epoch": 0.6080859907279828, + "grad_norm": 0.11716543883085251, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 159760 + }, + { + "epoch": 0.6081240531961054, + "grad_norm": 0.12028875946998596, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 159770 + }, + { + "epoch": 0.6081621156642282, + "grad_norm": 0.12898167967796326, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 159780 + }, + { + "epoch": 0.6082001781323508, + "grad_norm": 0.12819376587867737, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 159790 + }, + { + "epoch": 0.6082382406004735, + "grad_norm": 0.10979683697223663, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 159800 + }, + { + "epoch": 0.6082763030685961, + "grad_norm": 0.11153441667556763, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 159810 + }, + { + "epoch": 0.6083143655367189, + "grad_norm": 0.14920252561569214, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 159820 + }, + { + "epoch": 0.6083524280048416, + "grad_norm": 0.1329171061515808, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 159830 + }, + { + "epoch": 0.6083904904729642, + "grad_norm": 0.12048627436161041, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 159840 + }, + { + "epoch": 0.6084285529410869, + "grad_norm": 0.1240534782409668, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 159850 + }, + { + "epoch": 0.6084666154092095, + "grad_norm": 0.12022659927606583, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 159860 + }, + { + "epoch": 0.6085046778773323, + "grad_norm": 0.12578216195106506, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 159870 + }, + { + "epoch": 0.608542740345455, + "grad_norm": 0.12415976822376251, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 159880 + }, + { + "epoch": 0.6085808028135776, + "grad_norm": 0.14356285333633423, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 159890 + }, + { + "epoch": 0.6086188652817003, + "grad_norm": 0.1228196918964386, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 159900 + }, + { + "epoch": 0.608656927749823, + "grad_norm": 0.12486620992422104, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 159910 + }, + { + "epoch": 0.6086949902179457, + "grad_norm": 0.13456842303276062, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 159920 + }, + { + "epoch": 0.6087330526860684, + "grad_norm": 0.13543975353240967, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 159930 + }, + { + "epoch": 0.608771115154191, + "grad_norm": 0.13487686216831207, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 159940 + }, + { + "epoch": 0.6088091776223138, + "grad_norm": 0.13343895971775055, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 159950 + }, + { + "epoch": 0.6088472400904364, + "grad_norm": 0.12179489433765411, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 159960 + }, + { + "epoch": 0.6088853025585591, + "grad_norm": 0.13544785976409912, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 159970 + }, + { + "epoch": 0.6089233650266818, + "grad_norm": 0.11898493021726608, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 159980 + }, + { + "epoch": 0.6089614274948044, + "grad_norm": 0.1387069821357727, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 159990 + }, + { + "epoch": 0.6089994899629272, + "grad_norm": 0.1274258941411972, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 160000 + }, + { + "epoch": 0.6090375524310498, + "grad_norm": 0.14660042524337769, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 160010 + }, + { + "epoch": 0.6090756148991725, + "grad_norm": 0.12244995683431625, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 160020 + }, + { + "epoch": 0.6091136773672952, + "grad_norm": 0.1266985833644867, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 160030 + }, + { + "epoch": 0.6091517398354179, + "grad_norm": 0.1258295476436615, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 160040 + }, + { + "epoch": 0.6091898023035406, + "grad_norm": 0.11685820668935776, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 160050 + }, + { + "epoch": 0.6092278647716632, + "grad_norm": 0.125748410820961, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 160060 + }, + { + "epoch": 0.6092659272397859, + "grad_norm": 0.120501309633255, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 160070 + }, + { + "epoch": 0.6093039897079087, + "grad_norm": 0.1285618394613266, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 160080 + }, + { + "epoch": 0.6093420521760313, + "grad_norm": 0.12627874314785004, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 160090 + }, + { + "epoch": 0.609380114644154, + "grad_norm": 0.11696766316890717, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 160100 + }, + { + "epoch": 0.6094181771122766, + "grad_norm": 0.13108812272548676, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 160110 + }, + { + "epoch": 0.6094562395803994, + "grad_norm": 0.11752744764089584, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 160120 + }, + { + "epoch": 0.6094943020485221, + "grad_norm": 0.12401732057332993, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 160130 + }, + { + "epoch": 0.6095323645166447, + "grad_norm": 0.11538513004779816, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 160140 + }, + { + "epoch": 0.6095704269847674, + "grad_norm": 0.12397050857543945, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 160150 + }, + { + "epoch": 0.60960848945289, + "grad_norm": 0.12734395265579224, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 160160 + }, + { + "epoch": 0.6096465519210128, + "grad_norm": 0.11543060094118118, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 160170 + }, + { + "epoch": 0.6096846143891355, + "grad_norm": 0.13912728428840637, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 160180 + }, + { + "epoch": 0.6097226768572581, + "grad_norm": 0.12097612023353577, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 160190 + }, + { + "epoch": 0.6097607393253808, + "grad_norm": 0.11677367985248566, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 160200 + }, + { + "epoch": 0.6097988017935035, + "grad_norm": 0.11732392013072968, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 160210 + }, + { + "epoch": 0.6098368642616262, + "grad_norm": 0.11748608201742172, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 160220 + }, + { + "epoch": 0.6098749267297489, + "grad_norm": 0.12275837361812592, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 160230 + }, + { + "epoch": 0.6099129891978715, + "grad_norm": 0.11576947569847107, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 160240 + }, + { + "epoch": 0.6099510516659943, + "grad_norm": 0.12737642228603363, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 160250 + }, + { + "epoch": 0.6099891141341169, + "grad_norm": 0.12359835207462311, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 160260 + }, + { + "epoch": 0.6100271766022396, + "grad_norm": 0.12053797394037247, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 160270 + }, + { + "epoch": 0.6100652390703623, + "grad_norm": 0.11976463347673416, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 160280 + }, + { + "epoch": 0.6101033015384849, + "grad_norm": 0.13453908264636993, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 160290 + }, + { + "epoch": 0.6101413640066077, + "grad_norm": 0.14670896530151367, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 160300 + }, + { + "epoch": 0.6101794264747303, + "grad_norm": 0.12755566835403442, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 160310 + }, + { + "epoch": 0.610217488942853, + "grad_norm": 0.13645780086517334, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 160320 + }, + { + "epoch": 0.6102555514109756, + "grad_norm": 0.13180497288703918, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 160330 + }, + { + "epoch": 0.6102936138790984, + "grad_norm": 0.12781399488449097, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 160340 + }, + { + "epoch": 0.6103316763472211, + "grad_norm": 0.13411003351211548, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 160350 + }, + { + "epoch": 0.6103697388153437, + "grad_norm": 0.11541711539030075, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 160360 + }, + { + "epoch": 0.6104078012834664, + "grad_norm": 0.13555757701396942, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 160370 + }, + { + "epoch": 0.6104458637515892, + "grad_norm": 0.12073865532875061, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 160380 + }, + { + "epoch": 0.6104839262197118, + "grad_norm": 0.13339164853096008, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 160390 + }, + { + "epoch": 0.6105219886878345, + "grad_norm": 0.13692334294319153, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 160400 + }, + { + "epoch": 0.6105600511559571, + "grad_norm": 0.12026344984769821, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 160410 + }, + { + "epoch": 0.6105981136240798, + "grad_norm": 0.1249057799577713, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 160420 + }, + { + "epoch": 0.6106361760922026, + "grad_norm": 0.12343106418848038, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 160430 + }, + { + "epoch": 0.6106742385603252, + "grad_norm": 0.12635646760463715, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 160440 + }, + { + "epoch": 0.6107123010284479, + "grad_norm": 0.12143664807081223, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 160450 + }, + { + "epoch": 0.6107503634965705, + "grad_norm": 0.1384011059999466, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 160460 + }, + { + "epoch": 0.6107884259646933, + "grad_norm": 0.11583954840898514, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 160470 + }, + { + "epoch": 0.610826488432816, + "grad_norm": 0.13183361291885376, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 160480 + }, + { + "epoch": 0.6108645509009386, + "grad_norm": 0.12459275871515274, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 160490 + }, + { + "epoch": 0.6109026133690613, + "grad_norm": 0.1303837150335312, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 160500 + }, + { + "epoch": 0.610940675837184, + "grad_norm": 0.12999789416790009, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 160510 + }, + { + "epoch": 0.6109787383053067, + "grad_norm": 0.11867684870958328, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 160520 + }, + { + "epoch": 0.6110168007734293, + "grad_norm": 0.133700430393219, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 160530 + }, + { + "epoch": 0.611054863241552, + "grad_norm": 0.1394004225730896, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 160540 + }, + { + "epoch": 0.6110929257096748, + "grad_norm": 0.12464521080255508, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 160550 + }, + { + "epoch": 0.6111309881777974, + "grad_norm": 0.129132479429245, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 160560 + }, + { + "epoch": 0.6111690506459201, + "grad_norm": 0.13225175440311432, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 160570 + }, + { + "epoch": 0.6112071131140427, + "grad_norm": 0.11520901322364807, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 160580 + }, + { + "epoch": 0.6112451755821654, + "grad_norm": 0.11503398418426514, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 160590 + }, + { + "epoch": 0.6112832380502882, + "grad_norm": 0.13115368783473969, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 160600 + }, + { + "epoch": 0.6113213005184108, + "grad_norm": 0.14434094727039337, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 160610 + }, + { + "epoch": 0.6113593629865335, + "grad_norm": 0.11493008583784103, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 160620 + }, + { + "epoch": 0.6113974254546561, + "grad_norm": 0.1224646270275116, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 160630 + }, + { + "epoch": 0.6114354879227789, + "grad_norm": 0.12974074482917786, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 160640 + }, + { + "epoch": 0.6114735503909016, + "grad_norm": 0.12477756291627884, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 160650 + }, + { + "epoch": 0.6115116128590242, + "grad_norm": 0.11827721446752548, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 160660 + }, + { + "epoch": 0.6115496753271469, + "grad_norm": 0.12852802872657776, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 160670 + }, + { + "epoch": 0.6115877377952696, + "grad_norm": 0.11004035919904709, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 160680 + }, + { + "epoch": 0.6116258002633923, + "grad_norm": 0.12859857082366943, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 160690 + }, + { + "epoch": 0.611663862731515, + "grad_norm": 0.1396910548210144, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 160700 + }, + { + "epoch": 0.6117019251996376, + "grad_norm": 0.13013729453086853, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 160710 + }, + { + "epoch": 0.6117399876677603, + "grad_norm": 0.11966916918754578, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 160720 + }, + { + "epoch": 0.611778050135883, + "grad_norm": 0.1219642385840416, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 160730 + }, + { + "epoch": 0.6118161126040057, + "grad_norm": 0.11183346807956696, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 160740 + }, + { + "epoch": 0.6118541750721284, + "grad_norm": 0.12620967626571655, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 160750 + }, + { + "epoch": 0.611892237540251, + "grad_norm": 0.12379848212003708, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 160760 + }, + { + "epoch": 0.6119303000083738, + "grad_norm": 0.1288774013519287, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 160770 + }, + { + "epoch": 0.6119683624764964, + "grad_norm": 0.12595853209495544, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 160780 + }, + { + "epoch": 0.6120064249446191, + "grad_norm": 0.1244102343916893, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 160790 + }, + { + "epoch": 0.6120444874127418, + "grad_norm": 0.12475734204053879, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 160800 + }, + { + "epoch": 0.6120825498808645, + "grad_norm": 0.142828568816185, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 160810 + }, + { + "epoch": 0.6121206123489872, + "grad_norm": 0.1296069175004959, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 160820 + }, + { + "epoch": 0.6121586748171098, + "grad_norm": 0.130901500582695, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 160830 + }, + { + "epoch": 0.6121967372852325, + "grad_norm": 0.12161926180124283, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 160840 + }, + { + "epoch": 0.6122347997533552, + "grad_norm": 0.1284041553735733, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 160850 + }, + { + "epoch": 0.6122728622214779, + "grad_norm": 0.1331159770488739, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 160860 + }, + { + "epoch": 0.6123109246896006, + "grad_norm": 0.12174385040998459, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 160870 + }, + { + "epoch": 0.6123489871577232, + "grad_norm": 0.1177845224738121, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 160880 + }, + { + "epoch": 0.6123870496258459, + "grad_norm": 0.11481917649507523, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 160890 + }, + { + "epoch": 0.6124251120939687, + "grad_norm": 0.11830814182758331, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 160900 + }, + { + "epoch": 0.6124631745620913, + "grad_norm": 0.12196170538663864, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 160910 + }, + { + "epoch": 0.612501237030214, + "grad_norm": 0.1265617161989212, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 160920 + }, + { + "epoch": 0.6125392994983366, + "grad_norm": 0.13557623326778412, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 160930 + }, + { + "epoch": 0.6125773619664594, + "grad_norm": 0.13241899013519287, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 160940 + }, + { + "epoch": 0.6126154244345821, + "grad_norm": 0.11941409111022949, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 160950 + }, + { + "epoch": 0.6126534869027047, + "grad_norm": 0.11615356057882309, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 160960 + }, + { + "epoch": 0.6126915493708274, + "grad_norm": 0.11697541177272797, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 160970 + }, + { + "epoch": 0.6127296118389501, + "grad_norm": 0.11659201234579086, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 160980 + }, + { + "epoch": 0.6127676743070728, + "grad_norm": 0.13227131962776184, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 160990 + }, + { + "epoch": 0.6128057367751955, + "grad_norm": 0.12896333634853363, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 161000 + }, + { + "epoch": 0.6128437992433181, + "grad_norm": 0.13977555930614471, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 161010 + }, + { + "epoch": 0.6128818617114408, + "grad_norm": 0.13483978807926178, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 161020 + }, + { + "epoch": 0.6129199241795635, + "grad_norm": 0.145457461476326, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 161030 + }, + { + "epoch": 0.6129579866476862, + "grad_norm": 0.13186997175216675, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 161040 + }, + { + "epoch": 0.6129960491158088, + "grad_norm": 0.1312837451696396, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 161050 + }, + { + "epoch": 0.6130341115839315, + "grad_norm": 0.10970824211835861, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 161060 + }, + { + "epoch": 0.6130721740520543, + "grad_norm": 0.12316413968801498, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 161070 + }, + { + "epoch": 0.6131102365201769, + "grad_norm": 0.12216629087924957, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 161080 + }, + { + "epoch": 0.6131482989882996, + "grad_norm": 0.12921833992004395, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 161090 + }, + { + "epoch": 0.6131863614564222, + "grad_norm": 0.1225934848189354, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 161100 + }, + { + "epoch": 0.613224423924545, + "grad_norm": 0.13255460560321808, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 161110 + }, + { + "epoch": 0.6132624863926677, + "grad_norm": 0.11727086454629898, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 161120 + }, + { + "epoch": 0.6133005488607903, + "grad_norm": 0.13540498912334442, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 161130 + }, + { + "epoch": 0.613338611328913, + "grad_norm": 0.11803429573774338, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 161140 + }, + { + "epoch": 0.6133766737970356, + "grad_norm": 0.12163504213094711, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 161150 + }, + { + "epoch": 0.6134147362651584, + "grad_norm": 0.10948128253221512, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 161160 + }, + { + "epoch": 0.6134527987332811, + "grad_norm": 0.1163942962884903, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 161170 + }, + { + "epoch": 0.6134908612014037, + "grad_norm": 0.12510649859905243, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 161180 + }, + { + "epoch": 0.6135289236695264, + "grad_norm": 0.11732012778520584, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 161190 + }, + { + "epoch": 0.6135669861376492, + "grad_norm": 0.12112889438867569, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 161200 + }, + { + "epoch": 0.6136050486057718, + "grad_norm": 0.131588414311409, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 161210 + }, + { + "epoch": 0.6136431110738945, + "grad_norm": 0.12222830951213837, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 161220 + }, + { + "epoch": 0.6136811735420171, + "grad_norm": 0.12943825125694275, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 161230 + }, + { + "epoch": 0.6137192360101399, + "grad_norm": 0.11008955538272858, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 161240 + }, + { + "epoch": 0.6137572984782625, + "grad_norm": 0.11975482106208801, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 161250 + }, + { + "epoch": 0.6137953609463852, + "grad_norm": 0.11721187084913254, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 161260 + }, + { + "epoch": 0.6138334234145079, + "grad_norm": 0.13013498485088348, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 161270 + }, + { + "epoch": 0.6138714858826305, + "grad_norm": 0.1277289241552353, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 161280 + }, + { + "epoch": 0.6139095483507533, + "grad_norm": 0.12991102039813995, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 161290 + }, + { + "epoch": 0.6139476108188759, + "grad_norm": 0.11442521959543228, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 161300 + }, + { + "epoch": 0.6139856732869986, + "grad_norm": 0.11590270698070526, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 161310 + }, + { + "epoch": 0.6140237357551213, + "grad_norm": 0.12811385095119476, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 161320 + }, + { + "epoch": 0.614061798223244, + "grad_norm": 0.1309431940317154, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 161330 + }, + { + "epoch": 0.6140998606913667, + "grad_norm": 0.11549009382724762, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 161340 + }, + { + "epoch": 0.6141379231594893, + "grad_norm": 0.1286330223083496, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 161350 + }, + { + "epoch": 0.614175985627612, + "grad_norm": 0.12446651607751846, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 161360 + }, + { + "epoch": 0.6142140480957348, + "grad_norm": 0.12710335850715637, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 161370 + }, + { + "epoch": 0.6142521105638574, + "grad_norm": 0.13139621913433075, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 161380 + }, + { + "epoch": 0.6142901730319801, + "grad_norm": 0.1189979761838913, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 161390 + }, + { + "epoch": 0.6143282355001027, + "grad_norm": 0.12037677317857742, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 161400 + }, + { + "epoch": 0.6143662979682255, + "grad_norm": 0.11824595183134079, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 161410 + }, + { + "epoch": 0.6144043604363482, + "grad_norm": 0.13079175353050232, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 161420 + }, + { + "epoch": 0.6144424229044708, + "grad_norm": 0.12407071888446808, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 161430 + }, + { + "epoch": 0.6144804853725935, + "grad_norm": 0.13047856092453003, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 161440 + }, + { + "epoch": 0.6145185478407161, + "grad_norm": 0.11712726205587387, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 161450 + }, + { + "epoch": 0.6145566103088389, + "grad_norm": 0.12459481507539749, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 161460 + }, + { + "epoch": 0.6145946727769616, + "grad_norm": 0.11205767095088959, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 161470 + }, + { + "epoch": 0.6146327352450842, + "grad_norm": 0.13772039115428925, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 161480 + }, + { + "epoch": 0.6146707977132069, + "grad_norm": 0.13500213623046875, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 161490 + }, + { + "epoch": 0.6147088601813296, + "grad_norm": 0.12265978753566742, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 161500 + }, + { + "epoch": 0.6147469226494523, + "grad_norm": 0.11898373067378998, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 161510 + }, + { + "epoch": 0.614784985117575, + "grad_norm": 0.12593039870262146, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 161520 + }, + { + "epoch": 0.6148230475856976, + "grad_norm": 0.12925824522972107, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 161530 + }, + { + "epoch": 0.6148611100538204, + "grad_norm": 0.1286146491765976, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 161540 + }, + { + "epoch": 0.614899172521943, + "grad_norm": 0.13130497932434082, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 161550 + }, + { + "epoch": 0.6149372349900657, + "grad_norm": 0.13628366589546204, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 161560 + }, + { + "epoch": 0.6149752974581884, + "grad_norm": 0.12789924442768097, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 161570 + }, + { + "epoch": 0.615013359926311, + "grad_norm": 0.126933291554451, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 161580 + }, + { + "epoch": 0.6150514223944338, + "grad_norm": 0.12284158915281296, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 161590 + }, + { + "epoch": 0.6150894848625564, + "grad_norm": 0.13748809695243835, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 161600 + }, + { + "epoch": 0.6151275473306791, + "grad_norm": 0.12846975028514862, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 161610 + }, + { + "epoch": 0.6151656097988017, + "grad_norm": 0.1149773970246315, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 161620 + }, + { + "epoch": 0.6152036722669245, + "grad_norm": 0.1303306370973587, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 161630 + }, + { + "epoch": 0.6152417347350472, + "grad_norm": 0.1264232099056244, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 161640 + }, + { + "epoch": 0.6152797972031698, + "grad_norm": 0.11736659705638885, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 161650 + }, + { + "epoch": 0.6153178596712925, + "grad_norm": 0.13512319326400757, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 161660 + }, + { + "epoch": 0.6153559221394153, + "grad_norm": 0.10682762414216995, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 161670 + }, + { + "epoch": 0.6153939846075379, + "grad_norm": 0.10843072831630707, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 161680 + }, + { + "epoch": 0.6154320470756606, + "grad_norm": 0.1175176352262497, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 161690 + }, + { + "epoch": 0.6154701095437832, + "grad_norm": 0.11914224177598953, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 161700 + }, + { + "epoch": 0.6155081720119059, + "grad_norm": 0.11765717715024948, + "learning_rate": 0.0005, + "loss": 2.1327, + "step": 161710 + }, + { + "epoch": 0.6155462344800287, + "grad_norm": 0.11849711090326309, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 161720 + }, + { + "epoch": 0.6155842969481513, + "grad_norm": 0.13567321002483368, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 161730 + }, + { + "epoch": 0.615622359416274, + "grad_norm": 0.13653963804244995, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 161740 + }, + { + "epoch": 0.6156604218843966, + "grad_norm": 0.11373693495988846, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 161750 + }, + { + "epoch": 0.6156984843525194, + "grad_norm": 0.1299883872270584, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 161760 + }, + { + "epoch": 0.615736546820642, + "grad_norm": 0.11805561184883118, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 161770 + }, + { + "epoch": 0.6157746092887647, + "grad_norm": 0.12167860567569733, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 161780 + }, + { + "epoch": 0.6158126717568874, + "grad_norm": 0.12831658124923706, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 161790 + }, + { + "epoch": 0.6158507342250101, + "grad_norm": 0.12048164010047913, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 161800 + }, + { + "epoch": 0.6158887966931328, + "grad_norm": 0.12541131675243378, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 161810 + }, + { + "epoch": 0.6159268591612554, + "grad_norm": 0.12292610853910446, + "learning_rate": 0.0005, + "loss": 2.1274, + "step": 161820 + }, + { + "epoch": 0.6159649216293781, + "grad_norm": 0.11723213642835617, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 161830 + }, + { + "epoch": 0.6160029840975009, + "grad_norm": 0.1285085827112198, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 161840 + }, + { + "epoch": 0.6160410465656235, + "grad_norm": 0.12634149193763733, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 161850 + }, + { + "epoch": 0.6160791090337462, + "grad_norm": 0.12730668485164642, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 161860 + }, + { + "epoch": 0.6161171715018688, + "grad_norm": 0.12409865856170654, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 161870 + }, + { + "epoch": 0.6161552339699915, + "grad_norm": 0.4979267716407776, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 161880 + }, + { + "epoch": 0.6161932964381143, + "grad_norm": 0.1151154488325119, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 161890 + }, + { + "epoch": 0.6162313589062369, + "grad_norm": 0.12512674927711487, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 161900 + }, + { + "epoch": 0.6162694213743596, + "grad_norm": 0.12398797273635864, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 161910 + }, + { + "epoch": 0.6163074838424822, + "grad_norm": 0.13985444605350494, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 161920 + }, + { + "epoch": 0.616345546310605, + "grad_norm": 0.1274701952934265, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 161930 + }, + { + "epoch": 0.6163836087787277, + "grad_norm": 0.11744947731494904, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 161940 + }, + { + "epoch": 0.6164216712468503, + "grad_norm": 0.11642337590456009, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 161950 + }, + { + "epoch": 0.616459733714973, + "grad_norm": 0.12301789969205856, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 161960 + }, + { + "epoch": 0.6164977961830957, + "grad_norm": 0.11755318194627762, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 161970 + }, + { + "epoch": 0.6165358586512184, + "grad_norm": 0.12033776938915253, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 161980 + }, + { + "epoch": 0.6165739211193411, + "grad_norm": 0.1159583255648613, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 161990 + }, + { + "epoch": 0.6166119835874637, + "grad_norm": 0.1193026751279831, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 162000 + }, + { + "epoch": 0.6166500460555864, + "grad_norm": 0.13349869847297668, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 162010 + }, + { + "epoch": 0.6166881085237091, + "grad_norm": 0.12275967001914978, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 162020 + }, + { + "epoch": 0.6167261709918318, + "grad_norm": 0.11640046536922455, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 162030 + }, + { + "epoch": 0.6167642334599545, + "grad_norm": 0.12108620256185532, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 162040 + }, + { + "epoch": 0.6168022959280771, + "grad_norm": 0.11861438304185867, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 162050 + }, + { + "epoch": 0.6168403583961999, + "grad_norm": 0.12845297157764435, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 162060 + }, + { + "epoch": 0.6168784208643225, + "grad_norm": 0.12152732163667679, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 162070 + }, + { + "epoch": 0.6169164833324452, + "grad_norm": 0.11804987490177155, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 162080 + }, + { + "epoch": 0.6169545458005679, + "grad_norm": 0.12480165809392929, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 162090 + }, + { + "epoch": 0.6169926082686906, + "grad_norm": 0.12539172172546387, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 162100 + }, + { + "epoch": 0.6170306707368133, + "grad_norm": 0.11880171298980713, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 162110 + }, + { + "epoch": 0.6170687332049359, + "grad_norm": 0.11587626487016678, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 162120 + }, + { + "epoch": 0.6171067956730586, + "grad_norm": 0.6760389804840088, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 162130 + }, + { + "epoch": 0.6171448581411813, + "grad_norm": 0.10932061821222305, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 162140 + }, + { + "epoch": 0.617182920609304, + "grad_norm": 0.11558043211698532, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 162150 + }, + { + "epoch": 0.6172209830774267, + "grad_norm": 0.12326429784297943, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 162160 + }, + { + "epoch": 0.6172590455455493, + "grad_norm": 0.11882788687944412, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 162170 + }, + { + "epoch": 0.617297108013672, + "grad_norm": 0.11889998614788055, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 162180 + }, + { + "epoch": 0.6173351704817948, + "grad_norm": 0.11966200917959213, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 162190 + }, + { + "epoch": 0.6173732329499174, + "grad_norm": 0.12400029599666595, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 162200 + }, + { + "epoch": 0.6174112954180401, + "grad_norm": 0.13062159717082977, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 162210 + }, + { + "epoch": 0.6174493578861627, + "grad_norm": 0.12579452991485596, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 162220 + }, + { + "epoch": 0.6174874203542855, + "grad_norm": 0.12085752189159393, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 162230 + }, + { + "epoch": 0.6175254828224082, + "grad_norm": 0.1321951448917389, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 162240 + }, + { + "epoch": 0.6175635452905308, + "grad_norm": 0.11965358257293701, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 162250 + }, + { + "epoch": 0.6176016077586535, + "grad_norm": 0.13514268398284912, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 162260 + }, + { + "epoch": 0.6176396702267762, + "grad_norm": 0.12487059831619263, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 162270 + }, + { + "epoch": 0.6176777326948989, + "grad_norm": 0.13903431594371796, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 162280 + }, + { + "epoch": 0.6177157951630216, + "grad_norm": 0.11895901709794998, + "learning_rate": 0.0005, + "loss": 2.1341, + "step": 162290 + }, + { + "epoch": 0.6177538576311442, + "grad_norm": 0.1182001382112503, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 162300 + }, + { + "epoch": 0.6177919200992669, + "grad_norm": 0.13222895562648773, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 162310 + }, + { + "epoch": 0.6178299825673896, + "grad_norm": 0.13359028100967407, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 162320 + }, + { + "epoch": 0.6178680450355123, + "grad_norm": 0.12804371118545532, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 162330 + }, + { + "epoch": 0.617906107503635, + "grad_norm": 0.11614733189344406, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 162340 + }, + { + "epoch": 0.6179441699717576, + "grad_norm": 0.13432751595973969, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 162350 + }, + { + "epoch": 0.6179822324398804, + "grad_norm": 0.1297280639410019, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 162360 + }, + { + "epoch": 0.618020294908003, + "grad_norm": 0.13260680437088013, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 162370 + }, + { + "epoch": 0.6180583573761257, + "grad_norm": 0.12126235663890839, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 162380 + }, + { + "epoch": 0.6180964198442483, + "grad_norm": 0.11476351320743561, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 162390 + }, + { + "epoch": 0.6181344823123711, + "grad_norm": 0.11820220202207565, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 162400 + }, + { + "epoch": 0.6181725447804938, + "grad_norm": 0.11576348543167114, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 162410 + }, + { + "epoch": 0.6182106072486164, + "grad_norm": 0.12855997681617737, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 162420 + }, + { + "epoch": 0.6182486697167391, + "grad_norm": 0.1176319420337677, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 162430 + }, + { + "epoch": 0.6182867321848617, + "grad_norm": 0.12622740864753723, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 162440 + }, + { + "epoch": 0.6183247946529845, + "grad_norm": 0.11863669008016586, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 162450 + }, + { + "epoch": 0.6183628571211072, + "grad_norm": 0.1218191608786583, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 162460 + }, + { + "epoch": 0.6184009195892298, + "grad_norm": 0.13541346788406372, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 162470 + }, + { + "epoch": 0.6184389820573525, + "grad_norm": 0.11973481625318527, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 162480 + }, + { + "epoch": 0.6184770445254753, + "grad_norm": 0.12786909937858582, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 162490 + }, + { + "epoch": 0.6185151069935979, + "grad_norm": 0.12716612219810486, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 162500 + }, + { + "epoch": 0.6185531694617206, + "grad_norm": 0.12912815809249878, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 162510 + }, + { + "epoch": 0.6185912319298432, + "grad_norm": 0.14242874085903168, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 162520 + }, + { + "epoch": 0.618629294397966, + "grad_norm": 0.12945939600467682, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 162530 + }, + { + "epoch": 0.6186673568660886, + "grad_norm": 0.1209016814827919, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 162540 + }, + { + "epoch": 0.6187054193342113, + "grad_norm": 0.1292838752269745, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 162550 + }, + { + "epoch": 0.618743481802334, + "grad_norm": 0.12523207068443298, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 162560 + }, + { + "epoch": 0.6187815442704566, + "grad_norm": 0.12495558708906174, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 162570 + }, + { + "epoch": 0.6188196067385794, + "grad_norm": 0.11377590894699097, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 162580 + }, + { + "epoch": 0.618857669206702, + "grad_norm": 0.12689173221588135, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 162590 + }, + { + "epoch": 0.6188957316748247, + "grad_norm": 0.12863075733184814, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 162600 + }, + { + "epoch": 0.6189337941429474, + "grad_norm": 0.129258930683136, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 162610 + }, + { + "epoch": 0.6189718566110701, + "grad_norm": 0.12370667606592178, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 162620 + }, + { + "epoch": 0.6190099190791928, + "grad_norm": 0.13053497672080994, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 162630 + }, + { + "epoch": 0.6190479815473154, + "grad_norm": 0.11634115129709244, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 162640 + }, + { + "epoch": 0.6190860440154381, + "grad_norm": 0.11391100287437439, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 162650 + }, + { + "epoch": 0.6191241064835609, + "grad_norm": 0.11741019040346146, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 162660 + }, + { + "epoch": 0.6191621689516835, + "grad_norm": 0.11574865132570267, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 162670 + }, + { + "epoch": 0.6192002314198062, + "grad_norm": 0.12010656297206879, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 162680 + }, + { + "epoch": 0.6192382938879288, + "grad_norm": 0.1286909133195877, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 162690 + }, + { + "epoch": 0.6192763563560516, + "grad_norm": 0.11293233931064606, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 162700 + }, + { + "epoch": 0.6193144188241743, + "grad_norm": 0.12448955327272415, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 162710 + }, + { + "epoch": 0.6193524812922969, + "grad_norm": 0.12170399725437164, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 162720 + }, + { + "epoch": 0.6193905437604196, + "grad_norm": 0.11960456520318985, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 162730 + }, + { + "epoch": 0.6194286062285422, + "grad_norm": 0.12298374623060226, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 162740 + }, + { + "epoch": 0.619466668696665, + "grad_norm": 0.12716816365718842, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 162750 + }, + { + "epoch": 0.6195047311647877, + "grad_norm": 0.12332828342914581, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 162760 + }, + { + "epoch": 0.6195427936329103, + "grad_norm": 0.1455685794353485, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 162770 + }, + { + "epoch": 0.619580856101033, + "grad_norm": 0.11270184069871902, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 162780 + }, + { + "epoch": 0.6196189185691557, + "grad_norm": 0.12589813768863678, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 162790 + }, + { + "epoch": 0.6196569810372784, + "grad_norm": 0.11809215694665909, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 162800 + }, + { + "epoch": 0.619695043505401, + "grad_norm": 0.1226392388343811, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 162810 + }, + { + "epoch": 0.6197331059735237, + "grad_norm": 0.12393829971551895, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 162820 + }, + { + "epoch": 0.6197711684416465, + "grad_norm": 0.11937522888183594, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 162830 + }, + { + "epoch": 0.6198092309097691, + "grad_norm": 0.11727029085159302, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 162840 + }, + { + "epoch": 0.6198472933778918, + "grad_norm": 0.1266438364982605, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 162850 + }, + { + "epoch": 0.6198853558460145, + "grad_norm": 0.14234566688537598, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 162860 + }, + { + "epoch": 0.6199234183141371, + "grad_norm": 0.12679512798786163, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 162870 + }, + { + "epoch": 0.6199614807822599, + "grad_norm": 0.136153444647789, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 162880 + }, + { + "epoch": 0.6199995432503825, + "grad_norm": 0.12381961196660995, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 162890 + }, + { + "epoch": 0.6200376057185052, + "grad_norm": 0.12256696820259094, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 162900 + }, + { + "epoch": 0.6200756681866278, + "grad_norm": 0.11147227138280869, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 162910 + }, + { + "epoch": 0.6201137306547506, + "grad_norm": 0.11578986793756485, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 162920 + }, + { + "epoch": 0.6201517931228733, + "grad_norm": 0.12968918681144714, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 162930 + }, + { + "epoch": 0.6201898555909959, + "grad_norm": 0.11465263366699219, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 162940 + }, + { + "epoch": 0.6202279180591186, + "grad_norm": 0.14826619625091553, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 162950 + }, + { + "epoch": 0.6202659805272414, + "grad_norm": 0.13926471769809723, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 162960 + }, + { + "epoch": 0.620304042995364, + "grad_norm": 0.1253323256969452, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 162970 + }, + { + "epoch": 0.6203421054634867, + "grad_norm": 0.12329831719398499, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 162980 + }, + { + "epoch": 0.6203801679316093, + "grad_norm": 0.12464464455842972, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 162990 + }, + { + "epoch": 0.620418230399732, + "grad_norm": 0.11613859236240387, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 163000 + }, + { + "epoch": 0.6204562928678548, + "grad_norm": 0.13070306181907654, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 163010 + }, + { + "epoch": 0.6204943553359774, + "grad_norm": 0.12880989909172058, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 163020 + }, + { + "epoch": 0.6205324178041001, + "grad_norm": 0.1222710907459259, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 163030 + }, + { + "epoch": 0.6205704802722227, + "grad_norm": 0.1197257861495018, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 163040 + }, + { + "epoch": 0.6206085427403455, + "grad_norm": 0.12303168326616287, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 163050 + }, + { + "epoch": 0.6206466052084681, + "grad_norm": 0.11282678693532944, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 163060 + }, + { + "epoch": 0.6206846676765908, + "grad_norm": 0.11966310441493988, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 163070 + }, + { + "epoch": 0.6207227301447135, + "grad_norm": 0.13249096274375916, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 163080 + }, + { + "epoch": 0.6207607926128362, + "grad_norm": 0.1273467242717743, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 163090 + }, + { + "epoch": 0.6207988550809589, + "grad_norm": 0.14432646334171295, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 163100 + }, + { + "epoch": 0.6208369175490815, + "grad_norm": 0.1329868584871292, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 163110 + }, + { + "epoch": 0.6208749800172042, + "grad_norm": 0.13333286345005035, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 163120 + }, + { + "epoch": 0.620913042485327, + "grad_norm": 0.15608209371566772, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 163130 + }, + { + "epoch": 0.6209511049534496, + "grad_norm": 0.11932935565710068, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 163140 + }, + { + "epoch": 0.6209891674215723, + "grad_norm": 0.12438883632421494, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 163150 + }, + { + "epoch": 0.6210272298896949, + "grad_norm": 0.12361640483140945, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 163160 + }, + { + "epoch": 0.6210652923578176, + "grad_norm": 0.11473851650953293, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 163170 + }, + { + "epoch": 0.6211033548259404, + "grad_norm": 0.12637248635292053, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 163180 + }, + { + "epoch": 0.621141417294063, + "grad_norm": 0.13711248338222504, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 163190 + }, + { + "epoch": 0.6211794797621857, + "grad_norm": 0.12292785942554474, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 163200 + }, + { + "epoch": 0.6212175422303083, + "grad_norm": 0.13030977547168732, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 163210 + }, + { + "epoch": 0.6212556046984311, + "grad_norm": 0.130666121840477, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 163220 + }, + { + "epoch": 0.6212936671665538, + "grad_norm": 0.12032100558280945, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 163230 + }, + { + "epoch": 0.6213317296346764, + "grad_norm": 0.11286701261997223, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 163240 + }, + { + "epoch": 0.6213697921027991, + "grad_norm": 0.1288653463125229, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 163250 + }, + { + "epoch": 0.6214078545709218, + "grad_norm": 0.11776786297559738, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 163260 + }, + { + "epoch": 0.6214459170390445, + "grad_norm": 0.12605293095111847, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 163270 + }, + { + "epoch": 0.6214839795071672, + "grad_norm": 0.11133257299661636, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 163280 + }, + { + "epoch": 0.6215220419752898, + "grad_norm": 0.1279788464307785, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 163290 + }, + { + "epoch": 0.6215601044434125, + "grad_norm": 0.13132244348526, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 163300 + }, + { + "epoch": 0.6215981669115352, + "grad_norm": 0.12739558517932892, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 163310 + }, + { + "epoch": 0.6216362293796579, + "grad_norm": 0.1202457919716835, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 163320 + }, + { + "epoch": 0.6216742918477806, + "grad_norm": 0.12516264617443085, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 163330 + }, + { + "epoch": 0.6217123543159032, + "grad_norm": 0.11494667083024979, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 163340 + }, + { + "epoch": 0.621750416784026, + "grad_norm": 0.12559252977371216, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 163350 + }, + { + "epoch": 0.6217884792521486, + "grad_norm": 0.12517182528972626, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 163360 + }, + { + "epoch": 0.6218265417202713, + "grad_norm": 0.1260019838809967, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 163370 + }, + { + "epoch": 0.621864604188394, + "grad_norm": 0.11948941648006439, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 163380 + }, + { + "epoch": 0.6219026666565167, + "grad_norm": 0.12022241204977036, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 163390 + }, + { + "epoch": 0.6219407291246394, + "grad_norm": 0.11221758276224136, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 163400 + }, + { + "epoch": 0.621978791592762, + "grad_norm": 0.12240494042634964, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 163410 + }, + { + "epoch": 0.6220168540608847, + "grad_norm": 0.12656979262828827, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 163420 + }, + { + "epoch": 0.6220549165290075, + "grad_norm": 0.12262611091136932, + "learning_rate": 0.0005, + "loss": 2.1334, + "step": 163430 + }, + { + "epoch": 0.6220929789971301, + "grad_norm": 0.12745757400989532, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 163440 + }, + { + "epoch": 0.6221310414652528, + "grad_norm": 0.12522058188915253, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 163450 + }, + { + "epoch": 0.6221691039333754, + "grad_norm": 0.1248687207698822, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 163460 + }, + { + "epoch": 0.6222071664014981, + "grad_norm": 0.13304556906223297, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 163470 + }, + { + "epoch": 0.6222452288696209, + "grad_norm": 0.12558689713478088, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 163480 + }, + { + "epoch": 0.6222832913377435, + "grad_norm": 0.11721085011959076, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 163490 + }, + { + "epoch": 0.6223213538058662, + "grad_norm": 0.11873149126768112, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 163500 + }, + { + "epoch": 0.6223594162739888, + "grad_norm": 0.1278821974992752, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 163510 + }, + { + "epoch": 0.6223974787421116, + "grad_norm": 0.13227632641792297, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 163520 + }, + { + "epoch": 0.6224355412102343, + "grad_norm": 0.12484121322631836, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 163530 + }, + { + "epoch": 0.6224736036783569, + "grad_norm": 0.1308179348707199, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 163540 + }, + { + "epoch": 0.6225116661464796, + "grad_norm": 0.12707605957984924, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 163550 + }, + { + "epoch": 0.6225497286146023, + "grad_norm": 0.12090172618627548, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 163560 + }, + { + "epoch": 0.622587791082725, + "grad_norm": 0.16413117945194244, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 163570 + }, + { + "epoch": 0.6226258535508477, + "grad_norm": 0.13108348846435547, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 163580 + }, + { + "epoch": 0.6226639160189703, + "grad_norm": 0.11516100913286209, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 163590 + }, + { + "epoch": 0.622701978487093, + "grad_norm": 0.12196829169988632, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 163600 + }, + { + "epoch": 0.6227400409552157, + "grad_norm": 0.11693855375051498, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 163610 + }, + { + "epoch": 0.6227781034233384, + "grad_norm": 0.12894059717655182, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 163620 + }, + { + "epoch": 0.622816165891461, + "grad_norm": 0.12001071870326996, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 163630 + }, + { + "epoch": 0.6228542283595837, + "grad_norm": 0.11836356669664383, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 163640 + }, + { + "epoch": 0.6228922908277065, + "grad_norm": 0.11794774234294891, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 163650 + }, + { + "epoch": 0.6229303532958291, + "grad_norm": 0.13065387308597565, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 163660 + }, + { + "epoch": 0.6229684157639518, + "grad_norm": 0.12129966914653778, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 163670 + }, + { + "epoch": 0.6230064782320744, + "grad_norm": 0.11834661662578583, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 163680 + }, + { + "epoch": 0.6230445407001972, + "grad_norm": 0.12051723152399063, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 163690 + }, + { + "epoch": 0.6230826031683199, + "grad_norm": 0.12891320884227753, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 163700 + }, + { + "epoch": 0.6231206656364425, + "grad_norm": 0.11517681926488876, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 163710 + }, + { + "epoch": 0.6231587281045652, + "grad_norm": 0.12965747714042664, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 163720 + }, + { + "epoch": 0.6231967905726878, + "grad_norm": 0.11559164524078369, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 163730 + }, + { + "epoch": 0.6232348530408106, + "grad_norm": 0.1317571997642517, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 163740 + }, + { + "epoch": 0.6232729155089333, + "grad_norm": 0.11871747672557831, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 163750 + }, + { + "epoch": 0.6233109779770559, + "grad_norm": 0.12817269563674927, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 163760 + }, + { + "epoch": 0.6233490404451786, + "grad_norm": 0.11896772682666779, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 163770 + }, + { + "epoch": 0.6233871029133013, + "grad_norm": 0.13990911841392517, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 163780 + }, + { + "epoch": 0.623425165381424, + "grad_norm": 0.12218964099884033, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 163790 + }, + { + "epoch": 0.6234632278495467, + "grad_norm": 0.1254860758781433, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 163800 + }, + { + "epoch": 0.6235012903176693, + "grad_norm": 0.12563014030456543, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 163810 + }, + { + "epoch": 0.6235393527857921, + "grad_norm": 0.12120439857244492, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 163820 + }, + { + "epoch": 0.6235774152539147, + "grad_norm": 0.1312512308359146, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 163830 + }, + { + "epoch": 0.6236154777220374, + "grad_norm": 0.15499520301818848, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 163840 + }, + { + "epoch": 0.6236535401901601, + "grad_norm": 0.126829132437706, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 163850 + }, + { + "epoch": 0.6236916026582828, + "grad_norm": 0.1292577087879181, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 163860 + }, + { + "epoch": 0.6237296651264055, + "grad_norm": 0.12449748814105988, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 163870 + }, + { + "epoch": 0.6237677275945281, + "grad_norm": 0.12957262992858887, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 163880 + }, + { + "epoch": 0.6238057900626508, + "grad_norm": 0.12353445589542389, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 163890 + }, + { + "epoch": 0.6238438525307735, + "grad_norm": 0.12337320297956467, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 163900 + }, + { + "epoch": 0.6238819149988962, + "grad_norm": 0.12145992368459702, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 163910 + }, + { + "epoch": 0.6239199774670189, + "grad_norm": 0.11696632206439972, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 163920 + }, + { + "epoch": 0.6239580399351415, + "grad_norm": 0.12449745833873749, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 163930 + }, + { + "epoch": 0.6239961024032642, + "grad_norm": 0.11991845816373825, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 163940 + }, + { + "epoch": 0.624034164871387, + "grad_norm": 0.11270780861377716, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 163950 + }, + { + "epoch": 0.6240722273395096, + "grad_norm": 0.11561381071805954, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 163960 + }, + { + "epoch": 0.6241102898076323, + "grad_norm": 0.12485513836145401, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 163970 + }, + { + "epoch": 0.6241483522757549, + "grad_norm": 0.12313344329595566, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 163980 + }, + { + "epoch": 0.6241864147438777, + "grad_norm": 0.13852645456790924, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 163990 + }, + { + "epoch": 0.6242244772120004, + "grad_norm": 0.13669776916503906, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 164000 + }, + { + "epoch": 0.624262539680123, + "grad_norm": 0.12754948437213898, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 164010 + }, + { + "epoch": 0.6243006021482457, + "grad_norm": 0.12182336300611496, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 164020 + }, + { + "epoch": 0.6243386646163683, + "grad_norm": 0.13123556971549988, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 164030 + }, + { + "epoch": 0.6243767270844911, + "grad_norm": 0.13097603619098663, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 164040 + }, + { + "epoch": 0.6244147895526138, + "grad_norm": 0.12931232154369354, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 164050 + }, + { + "epoch": 0.6244528520207364, + "grad_norm": 0.12148045003414154, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 164060 + }, + { + "epoch": 0.6244909144888591, + "grad_norm": 0.11986979097127914, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 164070 + }, + { + "epoch": 0.6245289769569818, + "grad_norm": 0.12932519614696503, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 164080 + }, + { + "epoch": 0.6245670394251045, + "grad_norm": 0.11838708072900772, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 164090 + }, + { + "epoch": 0.6246051018932272, + "grad_norm": 0.13119414448738098, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 164100 + }, + { + "epoch": 0.6246431643613498, + "grad_norm": 0.1321251541376114, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 164110 + }, + { + "epoch": 0.6246812268294726, + "grad_norm": 0.11741336435079575, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 164120 + }, + { + "epoch": 0.6247192892975952, + "grad_norm": 0.12370091676712036, + "learning_rate": 0.0005, + "loss": 2.1381, + "step": 164130 + }, + { + "epoch": 0.6247573517657179, + "grad_norm": 0.12821203470230103, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 164140 + }, + { + "epoch": 0.6247954142338406, + "grad_norm": 0.1400570273399353, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 164150 + }, + { + "epoch": 0.6248334767019632, + "grad_norm": 0.12167726457118988, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 164160 + }, + { + "epoch": 0.624871539170086, + "grad_norm": 0.11930079758167267, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 164170 + }, + { + "epoch": 0.6249096016382086, + "grad_norm": 0.14793331921100616, + "learning_rate": 0.0005, + "loss": 2.1357, + "step": 164180 + }, + { + "epoch": 0.6249476641063313, + "grad_norm": 0.13453777134418488, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 164190 + }, + { + "epoch": 0.624985726574454, + "grad_norm": 0.13323485851287842, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 164200 + }, + { + "epoch": 0.6250237890425767, + "grad_norm": 0.12917537987232208, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 164210 + }, + { + "epoch": 0.6250618515106994, + "grad_norm": 0.1289193481206894, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 164220 + }, + { + "epoch": 0.625099913978822, + "grad_norm": 0.1279052197933197, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 164230 + }, + { + "epoch": 0.6251379764469447, + "grad_norm": 0.13927242159843445, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 164240 + }, + { + "epoch": 0.6251760389150675, + "grad_norm": 0.1160060465335846, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 164250 + }, + { + "epoch": 0.6252141013831901, + "grad_norm": 0.1169615238904953, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 164260 + }, + { + "epoch": 0.6252521638513128, + "grad_norm": 0.13549870252609253, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 164270 + }, + { + "epoch": 0.6252902263194354, + "grad_norm": 0.13748615980148315, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 164280 + }, + { + "epoch": 0.6253282887875582, + "grad_norm": 0.11843782663345337, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 164290 + }, + { + "epoch": 0.6253663512556809, + "grad_norm": 0.12487849593162537, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 164300 + }, + { + "epoch": 0.6254044137238035, + "grad_norm": 0.1206541433930397, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 164310 + }, + { + "epoch": 0.6254424761919262, + "grad_norm": 0.12133076786994934, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 164320 + }, + { + "epoch": 0.6254805386600488, + "grad_norm": 0.12979364395141602, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 164330 + }, + { + "epoch": 0.6255186011281716, + "grad_norm": 0.11954282969236374, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 164340 + }, + { + "epoch": 0.6255566635962942, + "grad_norm": 0.11928258091211319, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 164350 + }, + { + "epoch": 0.6255947260644169, + "grad_norm": 0.11903034895658493, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 164360 + }, + { + "epoch": 0.6256327885325396, + "grad_norm": 0.1315557211637497, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 164370 + }, + { + "epoch": 0.6256708510006623, + "grad_norm": 0.12095145136117935, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 164380 + }, + { + "epoch": 0.625708913468785, + "grad_norm": 0.12724807858467102, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 164390 + }, + { + "epoch": 0.6257469759369076, + "grad_norm": 0.1404864639043808, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 164400 + }, + { + "epoch": 0.6257850384050303, + "grad_norm": 0.13892942667007446, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 164410 + }, + { + "epoch": 0.6258231008731531, + "grad_norm": 0.133827805519104, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 164420 + }, + { + "epoch": 0.6258611633412757, + "grad_norm": 0.11841180175542831, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 164430 + }, + { + "epoch": 0.6258992258093984, + "grad_norm": 0.1325208693742752, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 164440 + }, + { + "epoch": 0.625937288277521, + "grad_norm": 0.1317589282989502, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 164450 + }, + { + "epoch": 0.6259753507456437, + "grad_norm": 0.126037135720253, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 164460 + }, + { + "epoch": 0.6260134132137665, + "grad_norm": 0.13882552087306976, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 164470 + }, + { + "epoch": 0.6260514756818891, + "grad_norm": 0.1219392642378807, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 164480 + }, + { + "epoch": 0.6260895381500118, + "grad_norm": 0.12018023431301117, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 164490 + }, + { + "epoch": 0.6261276006181344, + "grad_norm": 0.11514944583177567, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 164500 + }, + { + "epoch": 0.6261656630862572, + "grad_norm": 0.1256149411201477, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 164510 + }, + { + "epoch": 0.6262037255543799, + "grad_norm": 0.1267387866973877, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 164520 + }, + { + "epoch": 0.6262417880225025, + "grad_norm": 0.12900102138519287, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 164530 + }, + { + "epoch": 0.6262798504906252, + "grad_norm": 0.1457938253879547, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 164540 + }, + { + "epoch": 0.626317912958748, + "grad_norm": 0.12686443328857422, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 164550 + }, + { + "epoch": 0.6263559754268706, + "grad_norm": 0.14455707371234894, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 164560 + }, + { + "epoch": 0.6263940378949933, + "grad_norm": 0.1281202733516693, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 164570 + }, + { + "epoch": 0.6264321003631159, + "grad_norm": 0.1168116107583046, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 164580 + }, + { + "epoch": 0.6264701628312386, + "grad_norm": 0.12170332670211792, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 164590 + }, + { + "epoch": 0.6265082252993613, + "grad_norm": 0.11665555834770203, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 164600 + }, + { + "epoch": 0.626546287767484, + "grad_norm": 0.12107517570257187, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 164610 + }, + { + "epoch": 0.6265843502356067, + "grad_norm": 0.1253291517496109, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 164620 + }, + { + "epoch": 0.6266224127037293, + "grad_norm": 0.12508216500282288, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 164630 + }, + { + "epoch": 0.6266604751718521, + "grad_norm": 0.1261623650789261, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 164640 + }, + { + "epoch": 0.6266985376399747, + "grad_norm": 0.13363027572631836, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 164650 + }, + { + "epoch": 0.6267366001080974, + "grad_norm": 0.11288081854581833, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 164660 + }, + { + "epoch": 0.62677466257622, + "grad_norm": 0.12379693239927292, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 164670 + }, + { + "epoch": 0.6268127250443428, + "grad_norm": 0.11657523363828659, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 164680 + }, + { + "epoch": 0.6268507875124655, + "grad_norm": 0.11529957503080368, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 164690 + }, + { + "epoch": 0.6268888499805881, + "grad_norm": 0.19511523842811584, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 164700 + }, + { + "epoch": 0.6269269124487108, + "grad_norm": 0.13535825908184052, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 164710 + }, + { + "epoch": 0.6269649749168336, + "grad_norm": 0.12043019384145737, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 164720 + }, + { + "epoch": 0.6270030373849562, + "grad_norm": 0.11477592587471008, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 164730 + }, + { + "epoch": 0.6270410998530789, + "grad_norm": 0.11728215217590332, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 164740 + }, + { + "epoch": 0.6270791623212015, + "grad_norm": 0.12050362676382065, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 164750 + }, + { + "epoch": 0.6271172247893242, + "grad_norm": 0.13324016332626343, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 164760 + }, + { + "epoch": 0.627155287257447, + "grad_norm": 0.1299990713596344, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 164770 + }, + { + "epoch": 0.6271933497255696, + "grad_norm": 0.12177756428718567, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 164780 + }, + { + "epoch": 0.6272314121936923, + "grad_norm": 0.12219898402690887, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 164790 + }, + { + "epoch": 0.6272694746618149, + "grad_norm": 0.1313663274049759, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 164800 + }, + { + "epoch": 0.6273075371299377, + "grad_norm": 0.12362924218177795, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 164810 + }, + { + "epoch": 0.6273455995980604, + "grad_norm": 0.1263677179813385, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 164820 + }, + { + "epoch": 0.627383662066183, + "grad_norm": 0.12366585433483124, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 164830 + }, + { + "epoch": 0.6274217245343057, + "grad_norm": 0.18121863901615143, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 164840 + }, + { + "epoch": 0.6274597870024284, + "grad_norm": 0.12810751795768738, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 164850 + }, + { + "epoch": 0.6274978494705511, + "grad_norm": 0.11524245142936707, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 164860 + }, + { + "epoch": 0.6275359119386738, + "grad_norm": 0.1260351538658142, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 164870 + }, + { + "epoch": 0.6275739744067964, + "grad_norm": 0.1230417788028717, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 164880 + }, + { + "epoch": 0.6276120368749191, + "grad_norm": 0.13061945140361786, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 164890 + }, + { + "epoch": 0.6276500993430418, + "grad_norm": 0.13039201498031616, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 164900 + }, + { + "epoch": 0.6276881618111645, + "grad_norm": 0.12625457346439362, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 164910 + }, + { + "epoch": 0.6277262242792871, + "grad_norm": 0.11405977606773376, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 164920 + }, + { + "epoch": 0.6277642867474098, + "grad_norm": 0.12500326335430145, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 164930 + }, + { + "epoch": 0.6278023492155326, + "grad_norm": 0.12264113873243332, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 164940 + }, + { + "epoch": 0.6278404116836552, + "grad_norm": 0.1172245517373085, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 164950 + }, + { + "epoch": 0.6278784741517779, + "grad_norm": 0.13103266060352325, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 164960 + }, + { + "epoch": 0.6279165366199005, + "grad_norm": 0.12313253432512283, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 164970 + }, + { + "epoch": 0.6279545990880233, + "grad_norm": 0.1311279535293579, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 164980 + }, + { + "epoch": 0.627992661556146, + "grad_norm": 0.11780136823654175, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 164990 + }, + { + "epoch": 0.6280307240242686, + "grad_norm": 0.127878338098526, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 165000 + }, + { + "epoch": 0.6280687864923913, + "grad_norm": 0.10879893600940704, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 165010 + }, + { + "epoch": 0.6281068489605139, + "grad_norm": 0.12845277786254883, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 165020 + }, + { + "epoch": 0.6281449114286367, + "grad_norm": 0.1244046613574028, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 165030 + }, + { + "epoch": 0.6281829738967594, + "grad_norm": 0.1285630613565445, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 165040 + }, + { + "epoch": 0.628221036364882, + "grad_norm": 0.13964420557022095, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 165050 + }, + { + "epoch": 0.6282590988330047, + "grad_norm": 0.13453397154808044, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 165060 + }, + { + "epoch": 0.6282971613011274, + "grad_norm": 0.1252523958683014, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 165070 + }, + { + "epoch": 0.6283352237692501, + "grad_norm": 0.12090957909822464, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 165080 + }, + { + "epoch": 0.6283732862373728, + "grad_norm": 0.11995670199394226, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 165090 + }, + { + "epoch": 0.6284113487054954, + "grad_norm": 0.12850458920001984, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 165100 + }, + { + "epoch": 0.6284494111736182, + "grad_norm": 0.1120951920747757, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 165110 + }, + { + "epoch": 0.6284874736417408, + "grad_norm": 0.11967871338129044, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 165120 + }, + { + "epoch": 0.6285255361098635, + "grad_norm": 0.14278464019298553, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 165130 + }, + { + "epoch": 0.6285635985779862, + "grad_norm": 0.12599928677082062, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 165140 + }, + { + "epoch": 0.6286016610461089, + "grad_norm": 0.12482509016990662, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 165150 + }, + { + "epoch": 0.6286397235142316, + "grad_norm": 0.12580351531505585, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 165160 + }, + { + "epoch": 0.6286777859823542, + "grad_norm": 0.13060778379440308, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 165170 + }, + { + "epoch": 0.6287158484504769, + "grad_norm": 0.11947949230670929, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 165180 + }, + { + "epoch": 0.6287539109185996, + "grad_norm": 0.11321130394935608, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 165190 + }, + { + "epoch": 0.6287919733867223, + "grad_norm": 0.12642672657966614, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 165200 + }, + { + "epoch": 0.628830035854845, + "grad_norm": 0.1435793936252594, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 165210 + }, + { + "epoch": 0.6288680983229676, + "grad_norm": 0.1154685840010643, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 165220 + }, + { + "epoch": 0.6289061607910903, + "grad_norm": 0.11487980931997299, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 165230 + }, + { + "epoch": 0.6289442232592131, + "grad_norm": 0.11569257825613022, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 165240 + }, + { + "epoch": 0.6289822857273357, + "grad_norm": 0.11872398853302002, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 165250 + }, + { + "epoch": 0.6290203481954584, + "grad_norm": 0.13011687994003296, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 165260 + }, + { + "epoch": 0.629058410663581, + "grad_norm": 0.129885733127594, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 165270 + }, + { + "epoch": 0.6290964731317038, + "grad_norm": 0.12594501674175262, + "learning_rate": 0.0005, + "loss": 2.0879, + "step": 165280 + }, + { + "epoch": 0.6291345355998265, + "grad_norm": 0.12376669049263, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 165290 + }, + { + "epoch": 0.6291725980679491, + "grad_norm": 0.12542007863521576, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 165300 + }, + { + "epoch": 0.6292106605360718, + "grad_norm": 0.12012087553739548, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 165310 + }, + { + "epoch": 0.6292487230041944, + "grad_norm": 0.1414281576871872, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 165320 + }, + { + "epoch": 0.6292867854723172, + "grad_norm": 0.13443519175052643, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 165330 + }, + { + "epoch": 0.6293248479404399, + "grad_norm": 0.11794332414865494, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 165340 + }, + { + "epoch": 0.6293629104085625, + "grad_norm": 0.1224747747182846, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 165350 + }, + { + "epoch": 0.6294009728766852, + "grad_norm": 0.149954691529274, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 165360 + }, + { + "epoch": 0.6294390353448079, + "grad_norm": 0.11514601856470108, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 165370 + }, + { + "epoch": 0.6294770978129306, + "grad_norm": 0.12775687873363495, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 165380 + }, + { + "epoch": 0.6295151602810533, + "grad_norm": 0.13284911215305328, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 165390 + }, + { + "epoch": 0.6295532227491759, + "grad_norm": 0.13676607608795166, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 165400 + }, + { + "epoch": 0.6295912852172987, + "grad_norm": 0.12092647701501846, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 165410 + }, + { + "epoch": 0.6296293476854213, + "grad_norm": 0.1766921877861023, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 165420 + }, + { + "epoch": 0.629667410153544, + "grad_norm": 0.12163124233484268, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 165430 + }, + { + "epoch": 0.6297054726216667, + "grad_norm": 0.1240711435675621, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 165440 + }, + { + "epoch": 0.6297435350897893, + "grad_norm": 0.1204158365726471, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 165450 + }, + { + "epoch": 0.6297815975579121, + "grad_norm": 0.12393902242183685, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 165460 + }, + { + "epoch": 0.6298196600260347, + "grad_norm": 0.12610715627670288, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 165470 + }, + { + "epoch": 0.6298577224941574, + "grad_norm": 0.1293548047542572, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 165480 + }, + { + "epoch": 0.62989578496228, + "grad_norm": 0.13520577549934387, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 165490 + }, + { + "epoch": 0.6299338474304028, + "grad_norm": 0.12101728469133377, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 165500 + }, + { + "epoch": 0.6299719098985255, + "grad_norm": 0.11872179061174393, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 165510 + }, + { + "epoch": 0.6300099723666481, + "grad_norm": 0.12255527824163437, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 165520 + }, + { + "epoch": 0.6300480348347708, + "grad_norm": 0.12082166969776154, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 165530 + }, + { + "epoch": 0.6300860973028936, + "grad_norm": 0.12696723639965057, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 165540 + }, + { + "epoch": 0.6301241597710162, + "grad_norm": 0.12669730186462402, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 165550 + }, + { + "epoch": 0.6301622222391389, + "grad_norm": 0.12998205423355103, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 165560 + }, + { + "epoch": 0.6302002847072615, + "grad_norm": 0.11483128368854523, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 165570 + }, + { + "epoch": 0.6302383471753843, + "grad_norm": 0.11890500783920288, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 165580 + }, + { + "epoch": 0.630276409643507, + "grad_norm": 0.12761257588863373, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 165590 + }, + { + "epoch": 0.6303144721116296, + "grad_norm": 0.12069553881883621, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 165600 + }, + { + "epoch": 0.6303525345797523, + "grad_norm": 0.11751195788383484, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 165610 + }, + { + "epoch": 0.6303905970478749, + "grad_norm": 0.1265881359577179, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 165620 + }, + { + "epoch": 0.6304286595159977, + "grad_norm": 0.12955141067504883, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 165630 + }, + { + "epoch": 0.6304667219841203, + "grad_norm": 0.12171577662229538, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 165640 + }, + { + "epoch": 0.630504784452243, + "grad_norm": 0.12080515176057816, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 165650 + }, + { + "epoch": 0.6305428469203657, + "grad_norm": 0.12178708612918854, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 165660 + }, + { + "epoch": 0.6305809093884884, + "grad_norm": 0.12464504688978195, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 165670 + }, + { + "epoch": 0.6306189718566111, + "grad_norm": 0.11466963589191437, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 165680 + }, + { + "epoch": 0.6306570343247337, + "grad_norm": 0.11494333297014236, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 165690 + }, + { + "epoch": 0.6306950967928564, + "grad_norm": 0.11840105056762695, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 165700 + }, + { + "epoch": 0.6307331592609792, + "grad_norm": 0.12538498640060425, + "learning_rate": 0.0005, + "loss": 2.1312, + "step": 165710 + }, + { + "epoch": 0.6307712217291018, + "grad_norm": 0.14788444340229034, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 165720 + }, + { + "epoch": 0.6308092841972245, + "grad_norm": 0.1195816695690155, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 165730 + }, + { + "epoch": 0.6308473466653471, + "grad_norm": 0.1341714709997177, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 165740 + }, + { + "epoch": 0.6308854091334698, + "grad_norm": 0.13644200563430786, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 165750 + }, + { + "epoch": 0.6309234716015926, + "grad_norm": 0.1300642192363739, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 165760 + }, + { + "epoch": 0.6309615340697152, + "grad_norm": 0.13034403324127197, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 165770 + }, + { + "epoch": 0.6309995965378379, + "grad_norm": 0.14354290068149567, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 165780 + }, + { + "epoch": 0.6310376590059605, + "grad_norm": 0.12148837745189667, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 165790 + }, + { + "epoch": 0.6310757214740833, + "grad_norm": 0.12006913870573044, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 165800 + }, + { + "epoch": 0.631113783942206, + "grad_norm": 0.12500861287117004, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 165810 + }, + { + "epoch": 0.6311518464103286, + "grad_norm": 0.11222272366285324, + "learning_rate": 0.0005, + "loss": 2.1315, + "step": 165820 + }, + { + "epoch": 0.6311899088784513, + "grad_norm": 0.1350722461938858, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 165830 + }, + { + "epoch": 0.631227971346574, + "grad_norm": 0.12127304822206497, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 165840 + }, + { + "epoch": 0.6312660338146967, + "grad_norm": 0.1220502182841301, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 165850 + }, + { + "epoch": 0.6313040962828194, + "grad_norm": 0.1291094273328781, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 165860 + }, + { + "epoch": 0.631342158750942, + "grad_norm": 0.11507304757833481, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 165870 + }, + { + "epoch": 0.6313802212190647, + "grad_norm": 0.1210707575082779, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 165880 + }, + { + "epoch": 0.6314182836871874, + "grad_norm": 0.1463552713394165, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 165890 + }, + { + "epoch": 0.6314563461553101, + "grad_norm": 0.1260424256324768, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 165900 + }, + { + "epoch": 0.6314944086234328, + "grad_norm": 0.14042676985263824, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 165910 + }, + { + "epoch": 0.6315324710915554, + "grad_norm": 0.12042347341775894, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 165920 + }, + { + "epoch": 0.6315705335596782, + "grad_norm": 0.12311989068984985, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 165930 + }, + { + "epoch": 0.6316085960278008, + "grad_norm": 0.124355249106884, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 165940 + }, + { + "epoch": 0.6316466584959235, + "grad_norm": 0.13882076740264893, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 165950 + }, + { + "epoch": 0.6316847209640462, + "grad_norm": 0.13310198485851288, + "learning_rate": 0.0005, + "loss": 2.1318, + "step": 165960 + }, + { + "epoch": 0.6317227834321689, + "grad_norm": 0.11607959866523743, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 165970 + }, + { + "epoch": 0.6317608459002916, + "grad_norm": 0.15148335695266724, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 165980 + }, + { + "epoch": 0.6317989083684142, + "grad_norm": 0.13620083034038544, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 165990 + }, + { + "epoch": 0.6318369708365369, + "grad_norm": 0.12425015866756439, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 166000 + }, + { + "epoch": 0.6318750333046597, + "grad_norm": 0.13320602476596832, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 166010 + }, + { + "epoch": 0.6319130957727823, + "grad_norm": 0.12140514701604843, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 166020 + }, + { + "epoch": 0.631951158240905, + "grad_norm": 0.11672339588403702, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 166030 + }, + { + "epoch": 0.6319892207090276, + "grad_norm": 0.1160837858915329, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 166040 + }, + { + "epoch": 0.6320272831771503, + "grad_norm": 0.1330985575914383, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 166050 + }, + { + "epoch": 0.6320653456452731, + "grad_norm": 0.13461607694625854, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 166060 + }, + { + "epoch": 0.6321034081133957, + "grad_norm": 0.12698423862457275, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 166070 + }, + { + "epoch": 0.6321414705815184, + "grad_norm": 0.12149754166603088, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 166080 + }, + { + "epoch": 0.632179533049641, + "grad_norm": 0.12259469926357269, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 166090 + }, + { + "epoch": 0.6322175955177638, + "grad_norm": 0.11817657947540283, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 166100 + }, + { + "epoch": 0.6322556579858865, + "grad_norm": 0.13104508817195892, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 166110 + }, + { + "epoch": 0.6322937204540091, + "grad_norm": 0.1280125230550766, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 166120 + }, + { + "epoch": 0.6323317829221318, + "grad_norm": 0.132890984416008, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 166130 + }, + { + "epoch": 0.6323698453902545, + "grad_norm": 0.1223866418004036, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 166140 + }, + { + "epoch": 0.6324079078583772, + "grad_norm": 0.11968637257814407, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 166150 + }, + { + "epoch": 0.6324459703264999, + "grad_norm": 0.12027929723262787, + "learning_rate": 0.0005, + "loss": 2.0807, + "step": 166160 + }, + { + "epoch": 0.6324840327946225, + "grad_norm": 0.11879881471395493, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 166170 + }, + { + "epoch": 0.6325220952627452, + "grad_norm": 0.13272880017757416, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 166180 + }, + { + "epoch": 0.6325601577308679, + "grad_norm": 0.12231330573558807, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 166190 + }, + { + "epoch": 0.6325982201989906, + "grad_norm": 0.13635841012001038, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 166200 + }, + { + "epoch": 0.6326362826671132, + "grad_norm": 0.12453529238700867, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 166210 + }, + { + "epoch": 0.6326743451352359, + "grad_norm": 0.12656286358833313, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 166220 + }, + { + "epoch": 0.6327124076033587, + "grad_norm": 0.1307314932346344, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 166230 + }, + { + "epoch": 0.6327504700714813, + "grad_norm": 0.12419044226408005, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 166240 + }, + { + "epoch": 0.632788532539604, + "grad_norm": 0.11610493808984756, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 166250 + }, + { + "epoch": 0.6328265950077266, + "grad_norm": 0.12077783793210983, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 166260 + }, + { + "epoch": 0.6328646574758494, + "grad_norm": 0.13767452538013458, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 166270 + }, + { + "epoch": 0.6329027199439721, + "grad_norm": 0.11943716555833817, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 166280 + }, + { + "epoch": 0.6329407824120947, + "grad_norm": 0.12304921448230743, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 166290 + }, + { + "epoch": 0.6329788448802174, + "grad_norm": 0.12921641767024994, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 166300 + }, + { + "epoch": 0.63301690734834, + "grad_norm": 0.12375220656394958, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 166310 + }, + { + "epoch": 0.6330549698164628, + "grad_norm": 0.12074637413024902, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 166320 + }, + { + "epoch": 0.6330930322845855, + "grad_norm": 0.11634485423564911, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 166330 + }, + { + "epoch": 0.6331310947527081, + "grad_norm": 0.1356639862060547, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 166340 + }, + { + "epoch": 0.6331691572208308, + "grad_norm": 0.12911456823349, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 166350 + }, + { + "epoch": 0.6332072196889535, + "grad_norm": 0.11508312076330185, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 166360 + }, + { + "epoch": 0.6332452821570762, + "grad_norm": 0.11945507675409317, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 166370 + }, + { + "epoch": 0.6332833446251989, + "grad_norm": 0.11480210721492767, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 166380 + }, + { + "epoch": 0.6333214070933215, + "grad_norm": 0.11745503544807434, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 166390 + }, + { + "epoch": 0.6333594695614443, + "grad_norm": 0.12687115371227264, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 166400 + }, + { + "epoch": 0.633397532029567, + "grad_norm": 0.12765026092529297, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 166410 + }, + { + "epoch": 0.6334355944976896, + "grad_norm": 0.13198712468147278, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 166420 + }, + { + "epoch": 0.6334736569658123, + "grad_norm": 0.12231364101171494, + "learning_rate": 0.0005, + "loss": 2.1306, + "step": 166430 + }, + { + "epoch": 0.633511719433935, + "grad_norm": 0.1289694607257843, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 166440 + }, + { + "epoch": 0.6335497819020577, + "grad_norm": 0.1258060783147812, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 166450 + }, + { + "epoch": 0.6335878443701803, + "grad_norm": 0.12296685576438904, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 166460 + }, + { + "epoch": 0.633625906838303, + "grad_norm": 0.11561420559883118, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 166470 + }, + { + "epoch": 0.6336639693064257, + "grad_norm": 0.12371906638145447, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 166480 + }, + { + "epoch": 0.6337020317745484, + "grad_norm": 0.13448134064674377, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 166490 + }, + { + "epoch": 0.6337400942426711, + "grad_norm": 0.139088973402977, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 166500 + }, + { + "epoch": 0.6337781567107937, + "grad_norm": 0.116954505443573, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 166510 + }, + { + "epoch": 0.6338162191789164, + "grad_norm": 0.12373578548431396, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 166520 + }, + { + "epoch": 0.6338542816470392, + "grad_norm": 0.1236272007226944, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 166530 + }, + { + "epoch": 0.6338923441151618, + "grad_norm": 0.11625051498413086, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 166540 + }, + { + "epoch": 0.6339304065832845, + "grad_norm": 0.13029366731643677, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 166550 + }, + { + "epoch": 0.6339684690514071, + "grad_norm": 0.12038466334342957, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 166560 + }, + { + "epoch": 0.6340065315195299, + "grad_norm": 0.12874627113342285, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 166570 + }, + { + "epoch": 0.6340445939876526, + "grad_norm": 0.11089406162500381, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 166580 + }, + { + "epoch": 0.6340826564557752, + "grad_norm": 0.12499355524778366, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 166590 + }, + { + "epoch": 0.6341207189238979, + "grad_norm": 0.13173672556877136, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 166600 + }, + { + "epoch": 0.6341587813920205, + "grad_norm": 0.1347840279340744, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 166610 + }, + { + "epoch": 0.6341968438601433, + "grad_norm": 0.1311391443014145, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 166620 + }, + { + "epoch": 0.634234906328266, + "grad_norm": 0.12326283007860184, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 166630 + }, + { + "epoch": 0.6342729687963886, + "grad_norm": 0.1423521339893341, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 166640 + }, + { + "epoch": 0.6343110312645113, + "grad_norm": 0.1224626898765564, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 166650 + }, + { + "epoch": 0.634349093732634, + "grad_norm": 0.11854662746191025, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 166660 + }, + { + "epoch": 0.6343871562007567, + "grad_norm": 0.11189959943294525, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 166670 + }, + { + "epoch": 0.6344252186688794, + "grad_norm": 0.9422677755355835, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 166680 + }, + { + "epoch": 0.634463281137002, + "grad_norm": 0.1277889758348465, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 166690 + }, + { + "epoch": 0.6345013436051248, + "grad_norm": 0.12071488797664642, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 166700 + }, + { + "epoch": 0.6345394060732474, + "grad_norm": 0.11715003848075867, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 166710 + }, + { + "epoch": 0.6345774685413701, + "grad_norm": 0.12750089168548584, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 166720 + }, + { + "epoch": 0.6346155310094928, + "grad_norm": 0.12331834435462952, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 166730 + }, + { + "epoch": 0.6346535934776154, + "grad_norm": 0.1349371075630188, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 166740 + }, + { + "epoch": 0.6346916559457382, + "grad_norm": 0.12631113827228546, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 166750 + }, + { + "epoch": 0.6347297184138608, + "grad_norm": 0.1224856749176979, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 166760 + }, + { + "epoch": 0.6347677808819835, + "grad_norm": 0.11092979460954666, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 166770 + }, + { + "epoch": 0.6348058433501061, + "grad_norm": 0.11893656104803085, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 166780 + }, + { + "epoch": 0.6348439058182289, + "grad_norm": 0.1263199895620346, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 166790 + }, + { + "epoch": 0.6348819682863516, + "grad_norm": 0.12650194764137268, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 166800 + }, + { + "epoch": 0.6349200307544742, + "grad_norm": 0.12290910631418228, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 166810 + }, + { + "epoch": 0.6349580932225969, + "grad_norm": 0.11825523525476456, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 166820 + }, + { + "epoch": 0.6349961556907197, + "grad_norm": 0.13113102316856384, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 166830 + }, + { + "epoch": 0.6350342181588423, + "grad_norm": 0.12884521484375, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 166840 + }, + { + "epoch": 0.635072280626965, + "grad_norm": 0.11758764088153839, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 166850 + }, + { + "epoch": 0.6351103430950876, + "grad_norm": 0.11539915204048157, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 166860 + }, + { + "epoch": 0.6351484055632104, + "grad_norm": 0.12013541162014008, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 166870 + }, + { + "epoch": 0.635186468031333, + "grad_norm": 0.13431543111801147, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 166880 + }, + { + "epoch": 0.6352245304994557, + "grad_norm": 0.1163390502333641, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 166890 + }, + { + "epoch": 0.6352625929675784, + "grad_norm": 0.12375345826148987, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 166900 + }, + { + "epoch": 0.635300655435701, + "grad_norm": 0.12268900126218796, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 166910 + }, + { + "epoch": 0.6353387179038238, + "grad_norm": 0.12074095755815506, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 166920 + }, + { + "epoch": 0.6353767803719464, + "grad_norm": 0.1236800029873848, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 166930 + }, + { + "epoch": 0.6354148428400691, + "grad_norm": 0.12753638625144958, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 166940 + }, + { + "epoch": 0.6354529053081918, + "grad_norm": 0.12092048674821854, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 166950 + }, + { + "epoch": 0.6354909677763145, + "grad_norm": 0.11950389295816422, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 166960 + }, + { + "epoch": 0.6355290302444372, + "grad_norm": 0.12420322746038437, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 166970 + }, + { + "epoch": 0.6355670927125598, + "grad_norm": 0.12446198612451553, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 166980 + }, + { + "epoch": 0.6356051551806825, + "grad_norm": 0.12489917129278183, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 166990 + }, + { + "epoch": 0.6356432176488053, + "grad_norm": 0.13189025223255157, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 167000 + }, + { + "epoch": 0.6356812801169279, + "grad_norm": 0.11896958202123642, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 167010 + }, + { + "epoch": 0.6357193425850506, + "grad_norm": 0.12319599837064743, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 167020 + }, + { + "epoch": 0.6357574050531732, + "grad_norm": 0.12853579223155975, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 167030 + }, + { + "epoch": 0.6357954675212959, + "grad_norm": 0.12506678700447083, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 167040 + }, + { + "epoch": 0.6358335299894187, + "grad_norm": 0.12734413146972656, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 167050 + }, + { + "epoch": 0.6358715924575413, + "grad_norm": 0.1209283098578453, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 167060 + }, + { + "epoch": 0.635909654925664, + "grad_norm": 0.12446945905685425, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 167070 + }, + { + "epoch": 0.6359477173937866, + "grad_norm": 0.12285857647657394, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 167080 + }, + { + "epoch": 0.6359857798619094, + "grad_norm": 0.12403672933578491, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 167090 + }, + { + "epoch": 0.6360238423300321, + "grad_norm": 0.12527821958065033, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 167100 + }, + { + "epoch": 0.6360619047981547, + "grad_norm": 0.13380716741085052, + "learning_rate": 0.0005, + "loss": 2.1296, + "step": 167110 + }, + { + "epoch": 0.6360999672662774, + "grad_norm": 0.12255584448575974, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 167120 + }, + { + "epoch": 0.6361380297344001, + "grad_norm": 0.12181650102138519, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 167130 + }, + { + "epoch": 0.6361760922025228, + "grad_norm": 0.11952023953199387, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 167140 + }, + { + "epoch": 0.6362141546706455, + "grad_norm": 0.13203804194927216, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 167150 + }, + { + "epoch": 0.6362522171387681, + "grad_norm": 0.13042880594730377, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 167160 + }, + { + "epoch": 0.6362902796068908, + "grad_norm": 0.12648409605026245, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 167170 + }, + { + "epoch": 0.6363283420750135, + "grad_norm": 0.11189969629049301, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 167180 + }, + { + "epoch": 0.6363664045431362, + "grad_norm": 0.12212081998586655, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 167190 + }, + { + "epoch": 0.6364044670112589, + "grad_norm": 0.11594847589731216, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 167200 + }, + { + "epoch": 0.6364425294793815, + "grad_norm": 0.126133993268013, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 167210 + }, + { + "epoch": 0.6364805919475043, + "grad_norm": 0.1368551105260849, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 167220 + }, + { + "epoch": 0.6365186544156269, + "grad_norm": 0.1192522794008255, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 167230 + }, + { + "epoch": 0.6365567168837496, + "grad_norm": 0.1303003877401352, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 167240 + }, + { + "epoch": 0.6365947793518723, + "grad_norm": 0.1174224391579628, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 167250 + }, + { + "epoch": 0.636632841819995, + "grad_norm": 0.14051686227321625, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 167260 + }, + { + "epoch": 0.6366709042881177, + "grad_norm": 0.1523093581199646, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 167270 + }, + { + "epoch": 0.6367089667562403, + "grad_norm": 0.11615092307329178, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 167280 + }, + { + "epoch": 0.636747029224363, + "grad_norm": 0.1295924186706543, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 167290 + }, + { + "epoch": 0.6367850916924858, + "grad_norm": 0.11747189611196518, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 167300 + }, + { + "epoch": 0.6368231541606084, + "grad_norm": 0.1178893968462944, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 167310 + }, + { + "epoch": 0.6368612166287311, + "grad_norm": 0.12379597872495651, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 167320 + }, + { + "epoch": 0.6368992790968537, + "grad_norm": 0.13348908722400665, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 167330 + }, + { + "epoch": 0.6369373415649764, + "grad_norm": 0.12309478968381882, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 167340 + }, + { + "epoch": 0.6369754040330992, + "grad_norm": 0.1236153319478035, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 167350 + }, + { + "epoch": 0.6370134665012218, + "grad_norm": 0.11330470442771912, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 167360 + }, + { + "epoch": 0.6370515289693445, + "grad_norm": 0.11641678214073181, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 167370 + }, + { + "epoch": 0.6370895914374671, + "grad_norm": 0.1299334019422531, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 167380 + }, + { + "epoch": 0.6371276539055899, + "grad_norm": 0.12439191341400146, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 167390 + }, + { + "epoch": 0.6371657163737126, + "grad_norm": 0.11246515810489655, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 167400 + }, + { + "epoch": 0.6372037788418352, + "grad_norm": 0.12425049394369125, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 167410 + }, + { + "epoch": 0.6372418413099579, + "grad_norm": 0.11815284937620163, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 167420 + }, + { + "epoch": 0.6372799037780806, + "grad_norm": 0.13013459742069244, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 167430 + }, + { + "epoch": 0.6373179662462033, + "grad_norm": 0.11978629976511002, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 167440 + }, + { + "epoch": 0.637356028714326, + "grad_norm": 0.11710581183433533, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 167450 + }, + { + "epoch": 0.6373940911824486, + "grad_norm": 0.11174238473176956, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 167460 + }, + { + "epoch": 0.6374321536505713, + "grad_norm": 0.13608428835868835, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 167470 + }, + { + "epoch": 0.637470216118694, + "grad_norm": 0.12688113749027252, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 167480 + }, + { + "epoch": 0.6375082785868167, + "grad_norm": 0.12677867710590363, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 167490 + }, + { + "epoch": 0.6375463410549393, + "grad_norm": 0.12160094827413559, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 167500 + }, + { + "epoch": 0.637584403523062, + "grad_norm": 0.12581761181354523, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 167510 + }, + { + "epoch": 0.6376224659911848, + "grad_norm": 0.13145707547664642, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 167520 + }, + { + "epoch": 0.6376605284593074, + "grad_norm": 0.12493417412042618, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 167530 + }, + { + "epoch": 0.6376985909274301, + "grad_norm": 0.1158856451511383, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 167540 + }, + { + "epoch": 0.6377366533955527, + "grad_norm": 0.11572497338056564, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 167550 + }, + { + "epoch": 0.6377747158636755, + "grad_norm": 0.12406877428293228, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 167560 + }, + { + "epoch": 0.6378127783317982, + "grad_norm": 0.12328975647687912, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 167570 + }, + { + "epoch": 0.6378508407999208, + "grad_norm": 0.11333654075860977, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 167580 + }, + { + "epoch": 0.6378889032680435, + "grad_norm": 0.12689661979675293, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 167590 + }, + { + "epoch": 0.6379269657361661, + "grad_norm": 0.11475737392902374, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 167600 + }, + { + "epoch": 0.6379650282042889, + "grad_norm": 0.1381559669971466, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 167610 + }, + { + "epoch": 0.6380030906724116, + "grad_norm": 0.12458667904138565, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 167620 + }, + { + "epoch": 0.6380411531405342, + "grad_norm": 0.1247856616973877, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 167630 + }, + { + "epoch": 0.6380792156086569, + "grad_norm": 0.12368963658809662, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 167640 + }, + { + "epoch": 0.6381172780767796, + "grad_norm": 0.11723518371582031, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 167650 + }, + { + "epoch": 0.6381553405449023, + "grad_norm": 0.12084148079156876, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 167660 + }, + { + "epoch": 0.638193403013025, + "grad_norm": 0.12505698204040527, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 167670 + }, + { + "epoch": 0.6382314654811476, + "grad_norm": 0.12336069345474243, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 167680 + }, + { + "epoch": 0.6382695279492704, + "grad_norm": 0.12355723232030869, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 167690 + }, + { + "epoch": 0.638307590417393, + "grad_norm": 0.12749448418617249, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 167700 + }, + { + "epoch": 0.6383456528855157, + "grad_norm": 0.11271673440933228, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 167710 + }, + { + "epoch": 0.6383837153536384, + "grad_norm": 0.12633991241455078, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 167720 + }, + { + "epoch": 0.6384217778217611, + "grad_norm": 0.12690429389476776, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 167730 + }, + { + "epoch": 0.6384598402898838, + "grad_norm": 0.12385845184326172, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 167740 + }, + { + "epoch": 0.6384979027580064, + "grad_norm": 0.11217658221721649, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 167750 + }, + { + "epoch": 0.6385359652261291, + "grad_norm": 0.13572321832180023, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 167760 + }, + { + "epoch": 0.6385740276942518, + "grad_norm": 0.11481758952140808, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 167770 + }, + { + "epoch": 0.6386120901623745, + "grad_norm": 0.12268579751253128, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 167780 + }, + { + "epoch": 0.6386501526304972, + "grad_norm": 0.1343347132205963, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 167790 + }, + { + "epoch": 0.6386882150986198, + "grad_norm": 0.12613916397094727, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 167800 + }, + { + "epoch": 0.6387262775667425, + "grad_norm": 0.13326533138751984, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 167810 + }, + { + "epoch": 0.6387643400348653, + "grad_norm": 0.11773496121168137, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 167820 + }, + { + "epoch": 0.6388024025029879, + "grad_norm": 0.11977910995483398, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 167830 + }, + { + "epoch": 0.6388404649711106, + "grad_norm": 0.224679633975029, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 167840 + }, + { + "epoch": 0.6388785274392332, + "grad_norm": 0.12830469012260437, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 167850 + }, + { + "epoch": 0.638916589907356, + "grad_norm": 0.13360096514225006, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 167860 + }, + { + "epoch": 0.6389546523754787, + "grad_norm": 0.11928125470876694, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 167870 + }, + { + "epoch": 0.6389927148436013, + "grad_norm": 0.11468581110239029, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 167880 + }, + { + "epoch": 0.639030777311724, + "grad_norm": 0.1287572979927063, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 167890 + }, + { + "epoch": 0.6390688397798466, + "grad_norm": 0.12990231812000275, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 167900 + }, + { + "epoch": 0.6391069022479694, + "grad_norm": 0.137055903673172, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 167910 + }, + { + "epoch": 0.6391449647160921, + "grad_norm": 0.13186711072921753, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 167920 + }, + { + "epoch": 0.6391830271842147, + "grad_norm": 0.12623269855976105, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 167930 + }, + { + "epoch": 0.6392210896523374, + "grad_norm": 0.12275701761245728, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 167940 + }, + { + "epoch": 0.6392591521204601, + "grad_norm": 0.12814989686012268, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 167950 + }, + { + "epoch": 0.6392972145885828, + "grad_norm": 0.13093677163124084, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 167960 + }, + { + "epoch": 0.6393352770567055, + "grad_norm": 0.12273939698934555, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 167970 + }, + { + "epoch": 0.6393733395248281, + "grad_norm": 0.1292058527469635, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 167980 + }, + { + "epoch": 0.6394114019929509, + "grad_norm": 0.13064897060394287, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 167990 + }, + { + "epoch": 0.6394494644610735, + "grad_norm": 0.12779417634010315, + "learning_rate": 0.0005, + "loss": 2.075, + "step": 168000 + }, + { + "epoch": 0.6394875269291962, + "grad_norm": 0.11623568087816238, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 168010 + }, + { + "epoch": 0.6395255893973188, + "grad_norm": 0.12801700830459595, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 168020 + }, + { + "epoch": 0.6395636518654416, + "grad_norm": 0.12461545318365097, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 168030 + }, + { + "epoch": 0.6396017143335643, + "grad_norm": 0.11495253443717957, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 168040 + }, + { + "epoch": 0.6396397768016869, + "grad_norm": 0.1169470027089119, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 168050 + }, + { + "epoch": 0.6396778392698096, + "grad_norm": 0.1238497868180275, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 168060 + }, + { + "epoch": 0.6397159017379322, + "grad_norm": 0.12846608459949493, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 168070 + }, + { + "epoch": 0.639753964206055, + "grad_norm": 0.14013221859931946, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 168080 + }, + { + "epoch": 0.6397920266741777, + "grad_norm": 0.11937173455953598, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 168090 + }, + { + "epoch": 0.6398300891423003, + "grad_norm": 0.12790927290916443, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 168100 + }, + { + "epoch": 0.639868151610423, + "grad_norm": 0.11358696222305298, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 168110 + }, + { + "epoch": 0.6399062140785458, + "grad_norm": 0.12121156603097916, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 168120 + }, + { + "epoch": 0.6399442765466684, + "grad_norm": 0.13541017472743988, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 168130 + }, + { + "epoch": 0.6399823390147911, + "grad_norm": 0.12673501670360565, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 168140 + }, + { + "epoch": 0.6400204014829137, + "grad_norm": 0.1475529968738556, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 168150 + }, + { + "epoch": 0.6400584639510365, + "grad_norm": 0.11461649090051651, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 168160 + }, + { + "epoch": 0.6400965264191592, + "grad_norm": 0.12911100685596466, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 168170 + }, + { + "epoch": 0.6401345888872818, + "grad_norm": 0.10865405201911926, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 168180 + }, + { + "epoch": 0.6401726513554045, + "grad_norm": 0.13911312818527222, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 168190 + }, + { + "epoch": 0.6402107138235271, + "grad_norm": 0.12169904261827469, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 168200 + }, + { + "epoch": 0.6402487762916499, + "grad_norm": 0.13228438794612885, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 168210 + }, + { + "epoch": 0.6402868387597725, + "grad_norm": 0.1235433891415596, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 168220 + }, + { + "epoch": 0.6403249012278952, + "grad_norm": 0.13303908705711365, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 168230 + }, + { + "epoch": 0.6403629636960179, + "grad_norm": 0.11614194512367249, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 168240 + }, + { + "epoch": 0.6404010261641406, + "grad_norm": 0.12898878753185272, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 168250 + }, + { + "epoch": 0.6404390886322633, + "grad_norm": 0.1185198649764061, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 168260 + }, + { + "epoch": 0.6404771511003859, + "grad_norm": 0.11852993816137314, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 168270 + }, + { + "epoch": 0.6405152135685086, + "grad_norm": 0.1243414580821991, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 168280 + }, + { + "epoch": 0.6405532760366314, + "grad_norm": 0.12120532989501953, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 168290 + }, + { + "epoch": 0.640591338504754, + "grad_norm": 0.1326688826084137, + "learning_rate": 0.0005, + "loss": 2.0858, + "step": 168300 + }, + { + "epoch": 0.6406294009728767, + "grad_norm": 0.11883825808763504, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 168310 + }, + { + "epoch": 0.6406674634409993, + "grad_norm": 0.11887935549020767, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 168320 + }, + { + "epoch": 0.640705525909122, + "grad_norm": 0.11938267201185226, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 168330 + }, + { + "epoch": 0.6407435883772448, + "grad_norm": 0.13582032918930054, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 168340 + }, + { + "epoch": 0.6407816508453674, + "grad_norm": 0.12318161129951477, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 168350 + }, + { + "epoch": 0.6408197133134901, + "grad_norm": 0.11993315070867538, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 168360 + }, + { + "epoch": 0.6408577757816127, + "grad_norm": 0.12338174134492874, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 168370 + }, + { + "epoch": 0.6408958382497355, + "grad_norm": 0.11832733452320099, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 168380 + }, + { + "epoch": 0.6409339007178582, + "grad_norm": 0.12378036230802536, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 168390 + }, + { + "epoch": 0.6409719631859808, + "grad_norm": 0.13132528960704803, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 168400 + }, + { + "epoch": 0.6410100256541035, + "grad_norm": 0.11649808287620544, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 168410 + }, + { + "epoch": 0.6410480881222262, + "grad_norm": 0.12604773044586182, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 168420 + }, + { + "epoch": 0.6410861505903489, + "grad_norm": 0.12101250886917114, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 168430 + }, + { + "epoch": 0.6411242130584716, + "grad_norm": 0.12148529291152954, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 168440 + }, + { + "epoch": 0.6411622755265942, + "grad_norm": 0.12714166939258575, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 168450 + }, + { + "epoch": 0.641200337994717, + "grad_norm": 0.11625314503908157, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 168460 + }, + { + "epoch": 0.6412384004628396, + "grad_norm": 0.1165272668004036, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 168470 + }, + { + "epoch": 0.6412764629309623, + "grad_norm": 0.12469862401485443, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 168480 + }, + { + "epoch": 0.641314525399085, + "grad_norm": 0.12034633010625839, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 168490 + }, + { + "epoch": 0.6413525878672076, + "grad_norm": 0.1089642271399498, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 168500 + }, + { + "epoch": 0.6413906503353304, + "grad_norm": 0.12195317447185516, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 168510 + }, + { + "epoch": 0.641428712803453, + "grad_norm": 0.13394524157047272, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 168520 + }, + { + "epoch": 0.6414667752715757, + "grad_norm": 0.11951465904712677, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 168530 + }, + { + "epoch": 0.6415048377396984, + "grad_norm": 0.13562636077404022, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 168540 + }, + { + "epoch": 0.6415429002078211, + "grad_norm": 0.12247107923030853, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 168550 + }, + { + "epoch": 0.6415809626759438, + "grad_norm": 0.12110818922519684, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 168560 + }, + { + "epoch": 0.6416190251440664, + "grad_norm": 0.12524105608463287, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 168570 + }, + { + "epoch": 0.6416570876121891, + "grad_norm": 0.1352654993534088, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 168580 + }, + { + "epoch": 0.6416951500803119, + "grad_norm": 0.1185445487499237, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 168590 + }, + { + "epoch": 0.6417332125484345, + "grad_norm": 0.13428305089473724, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 168600 + }, + { + "epoch": 0.6417712750165572, + "grad_norm": 0.13018237054347992, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 168610 + }, + { + "epoch": 0.6418093374846798, + "grad_norm": 0.13127325475215912, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 168620 + }, + { + "epoch": 0.6418473999528025, + "grad_norm": 0.12687428295612335, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 168630 + }, + { + "epoch": 0.6418854624209253, + "grad_norm": 0.14191210269927979, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 168640 + }, + { + "epoch": 0.6419235248890479, + "grad_norm": 0.12420354038476944, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 168650 + }, + { + "epoch": 0.6419615873571706, + "grad_norm": 0.13143634796142578, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 168660 + }, + { + "epoch": 0.6419996498252932, + "grad_norm": 0.1170438900589943, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 168670 + }, + { + "epoch": 0.642037712293416, + "grad_norm": 0.14256148040294647, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 168680 + }, + { + "epoch": 0.6420757747615387, + "grad_norm": 0.12867961823940277, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 168690 + }, + { + "epoch": 0.6421138372296613, + "grad_norm": 0.12932276725769043, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 168700 + }, + { + "epoch": 0.642151899697784, + "grad_norm": 0.14709927141666412, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 168710 + }, + { + "epoch": 0.6421899621659067, + "grad_norm": 0.12742719054222107, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 168720 + }, + { + "epoch": 0.6422280246340294, + "grad_norm": 0.12468601018190384, + "learning_rate": 0.0005, + "loss": 2.1277, + "step": 168730 + }, + { + "epoch": 0.642266087102152, + "grad_norm": 0.12178006023168564, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 168740 + }, + { + "epoch": 0.6423041495702747, + "grad_norm": 0.12032576650381088, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 168750 + }, + { + "epoch": 0.6423422120383974, + "grad_norm": 0.11936362087726593, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 168760 + }, + { + "epoch": 0.6423802745065201, + "grad_norm": 0.11243180185556412, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 168770 + }, + { + "epoch": 0.6424183369746428, + "grad_norm": 0.12037627398967743, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 168780 + }, + { + "epoch": 0.6424563994427654, + "grad_norm": 0.1435946524143219, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 168790 + }, + { + "epoch": 0.6424944619108881, + "grad_norm": 0.128703311085701, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 168800 + }, + { + "epoch": 0.6425325243790109, + "grad_norm": 0.12024714052677155, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 168810 + }, + { + "epoch": 0.6425705868471335, + "grad_norm": 0.12797857820987701, + "learning_rate": 0.0005, + "loss": 2.088, + "step": 168820 + }, + { + "epoch": 0.6426086493152562, + "grad_norm": 0.12992393970489502, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 168830 + }, + { + "epoch": 0.6426467117833788, + "grad_norm": 0.1158052310347557, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 168840 + }, + { + "epoch": 0.6426847742515016, + "grad_norm": 0.12053052335977554, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 168850 + }, + { + "epoch": 0.6427228367196243, + "grad_norm": 0.13656136393547058, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 168860 + }, + { + "epoch": 0.6427608991877469, + "grad_norm": 0.11971770226955414, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 168870 + }, + { + "epoch": 0.6427989616558696, + "grad_norm": 0.11107343435287476, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 168880 + }, + { + "epoch": 0.6428370241239924, + "grad_norm": 0.12417291104793549, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 168890 + }, + { + "epoch": 0.642875086592115, + "grad_norm": 0.12001986801624298, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 168900 + }, + { + "epoch": 0.6429131490602377, + "grad_norm": 0.13117973506450653, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 168910 + }, + { + "epoch": 0.6429512115283603, + "grad_norm": 0.13548439741134644, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 168920 + }, + { + "epoch": 0.642989273996483, + "grad_norm": 0.11892268061637878, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 168930 + }, + { + "epoch": 0.6430273364646057, + "grad_norm": 0.11337054520845413, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 168940 + }, + { + "epoch": 0.6430653989327284, + "grad_norm": 0.12810419499874115, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 168950 + }, + { + "epoch": 0.6431034614008511, + "grad_norm": 0.12267080694437027, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 168960 + }, + { + "epoch": 0.6431415238689737, + "grad_norm": 0.141947939991951, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 168970 + }, + { + "epoch": 0.6431795863370965, + "grad_norm": 0.12055546045303345, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 168980 + }, + { + "epoch": 0.6432176488052191, + "grad_norm": 0.13173562288284302, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 168990 + }, + { + "epoch": 0.6432557112733418, + "grad_norm": 0.11383207887411118, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 169000 + }, + { + "epoch": 0.6432937737414645, + "grad_norm": 0.11952648311853409, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 169010 + }, + { + "epoch": 0.6433318362095872, + "grad_norm": 0.12814606726169586, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 169020 + }, + { + "epoch": 0.6433698986777099, + "grad_norm": 0.11744929850101471, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 169030 + }, + { + "epoch": 0.6434079611458325, + "grad_norm": 0.12779894471168518, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 169040 + }, + { + "epoch": 0.6434460236139552, + "grad_norm": 0.119678795337677, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 169050 + }, + { + "epoch": 0.6434840860820779, + "grad_norm": 0.1276785284280777, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 169060 + }, + { + "epoch": 0.6435221485502006, + "grad_norm": 0.12187217175960541, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 169070 + }, + { + "epoch": 0.6435602110183233, + "grad_norm": 0.11958392709493637, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 169080 + }, + { + "epoch": 0.6435982734864459, + "grad_norm": 0.12024261802434921, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 169090 + }, + { + "epoch": 0.6436363359545686, + "grad_norm": 0.12304984778165817, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 169100 + }, + { + "epoch": 0.6436743984226914, + "grad_norm": 0.12955528497695923, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 169110 + }, + { + "epoch": 0.643712460890814, + "grad_norm": 0.11826508492231369, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 169120 + }, + { + "epoch": 0.6437505233589367, + "grad_norm": 0.1322208046913147, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 169130 + }, + { + "epoch": 0.6437885858270593, + "grad_norm": 0.1285995990037918, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 169140 + }, + { + "epoch": 0.6438266482951821, + "grad_norm": 0.1301390677690506, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 169150 + }, + { + "epoch": 0.6438647107633048, + "grad_norm": 0.11197373270988464, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 169160 + }, + { + "epoch": 0.6439027732314274, + "grad_norm": 0.12456469982862473, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 169170 + }, + { + "epoch": 0.6439408356995501, + "grad_norm": 0.17039534449577332, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 169180 + }, + { + "epoch": 0.6439788981676727, + "grad_norm": 0.11926776170730591, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 169190 + }, + { + "epoch": 0.6440169606357955, + "grad_norm": 0.12898243963718414, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 169200 + }, + { + "epoch": 0.6440550231039182, + "grad_norm": 0.12652383744716644, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 169210 + }, + { + "epoch": 0.6440930855720408, + "grad_norm": 0.12866918742656708, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 169220 + }, + { + "epoch": 0.6441311480401635, + "grad_norm": 0.12797291576862335, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 169230 + }, + { + "epoch": 0.6441692105082862, + "grad_norm": 0.12362707406282425, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 169240 + }, + { + "epoch": 0.6442072729764089, + "grad_norm": 0.11655452847480774, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 169250 + }, + { + "epoch": 0.6442453354445316, + "grad_norm": 0.13020145893096924, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 169260 + }, + { + "epoch": 0.6442833979126542, + "grad_norm": 0.11631694436073303, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 169270 + }, + { + "epoch": 0.644321460380777, + "grad_norm": 0.12558521330356598, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 169280 + }, + { + "epoch": 0.6443595228488996, + "grad_norm": 0.11559248715639114, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 169290 + }, + { + "epoch": 0.6443975853170223, + "grad_norm": 0.11725137382745743, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 169300 + }, + { + "epoch": 0.644435647785145, + "grad_norm": 0.12991683185100555, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 169310 + }, + { + "epoch": 0.6444737102532677, + "grad_norm": 0.1312405914068222, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 169320 + }, + { + "epoch": 0.6445117727213904, + "grad_norm": 0.12225445359945297, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 169330 + }, + { + "epoch": 0.644549835189513, + "grad_norm": 0.12185630202293396, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 169340 + }, + { + "epoch": 0.6445878976576357, + "grad_norm": 0.12531758844852448, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 169350 + }, + { + "epoch": 0.6446259601257583, + "grad_norm": 0.12139049172401428, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 169360 + }, + { + "epoch": 0.6446640225938811, + "grad_norm": 0.11470047384500504, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 169370 + }, + { + "epoch": 0.6447020850620038, + "grad_norm": 0.12508058547973633, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 169380 + }, + { + "epoch": 0.6447401475301264, + "grad_norm": 0.11169907450675964, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 169390 + }, + { + "epoch": 0.6447782099982491, + "grad_norm": 0.1254015862941742, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 169400 + }, + { + "epoch": 0.6448162724663719, + "grad_norm": 0.1406705379486084, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 169410 + }, + { + "epoch": 0.6448543349344945, + "grad_norm": 0.14721862971782684, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 169420 + }, + { + "epoch": 0.6448923974026172, + "grad_norm": 0.13507451117038727, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 169430 + }, + { + "epoch": 0.6449304598707398, + "grad_norm": 0.11473426967859268, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 169440 + }, + { + "epoch": 0.6449685223388626, + "grad_norm": 0.1197873130440712, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 169450 + }, + { + "epoch": 0.6450065848069853, + "grad_norm": 0.11934750527143478, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 169460 + }, + { + "epoch": 0.6450446472751079, + "grad_norm": 0.12757621705532074, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 169470 + }, + { + "epoch": 0.6450827097432306, + "grad_norm": 0.1281720995903015, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 169480 + }, + { + "epoch": 0.6451207722113532, + "grad_norm": 0.15214742720127106, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 169490 + }, + { + "epoch": 0.645158834679476, + "grad_norm": 0.14202375710010529, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 169500 + }, + { + "epoch": 0.6451968971475986, + "grad_norm": 0.1248500794172287, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 169510 + }, + { + "epoch": 0.6452349596157213, + "grad_norm": 0.13348935544490814, + "learning_rate": 0.0005, + "loss": 2.1343, + "step": 169520 + }, + { + "epoch": 0.645273022083844, + "grad_norm": 0.11995500326156616, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 169530 + }, + { + "epoch": 0.6453110845519667, + "grad_norm": 0.11561872065067291, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 169540 + }, + { + "epoch": 0.6453491470200894, + "grad_norm": 0.11225131154060364, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 169550 + }, + { + "epoch": 0.645387209488212, + "grad_norm": 0.12102378159761429, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 169560 + }, + { + "epoch": 0.6454252719563347, + "grad_norm": 0.12393181771039963, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 169570 + }, + { + "epoch": 0.6454633344244575, + "grad_norm": 0.12839457392692566, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 169580 + }, + { + "epoch": 0.6455013968925801, + "grad_norm": 0.11687374114990234, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 169590 + }, + { + "epoch": 0.6455394593607028, + "grad_norm": 0.13319134712219238, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 169600 + }, + { + "epoch": 0.6455775218288254, + "grad_norm": 0.11894795298576355, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 169610 + }, + { + "epoch": 0.6456155842969481, + "grad_norm": 0.1231246292591095, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 169620 + }, + { + "epoch": 0.6456536467650709, + "grad_norm": 0.12345793843269348, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 169630 + }, + { + "epoch": 0.6456917092331935, + "grad_norm": 0.13803525269031525, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 169640 + }, + { + "epoch": 0.6457297717013162, + "grad_norm": 0.14578431844711304, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 169650 + }, + { + "epoch": 0.6457678341694388, + "grad_norm": 0.12052863836288452, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 169660 + }, + { + "epoch": 0.6458058966375616, + "grad_norm": 0.1255030333995819, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 169670 + }, + { + "epoch": 0.6458439591056843, + "grad_norm": 0.1355823576450348, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 169680 + }, + { + "epoch": 0.6458820215738069, + "grad_norm": 0.12269733846187592, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 169690 + }, + { + "epoch": 0.6459200840419296, + "grad_norm": 0.12395049631595612, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 169700 + }, + { + "epoch": 0.6459581465100523, + "grad_norm": 0.12342299520969391, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 169710 + }, + { + "epoch": 0.645996208978175, + "grad_norm": 0.12586908042430878, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 169720 + }, + { + "epoch": 0.6460342714462977, + "grad_norm": 0.1380547434091568, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 169730 + }, + { + "epoch": 0.6460723339144203, + "grad_norm": 0.12601609528064728, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 169740 + }, + { + "epoch": 0.6461103963825431, + "grad_norm": 0.1351451873779297, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 169750 + }, + { + "epoch": 0.6461484588506657, + "grad_norm": 0.11888038367033005, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 169760 + }, + { + "epoch": 0.6461865213187884, + "grad_norm": 0.12636907398700714, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 169770 + }, + { + "epoch": 0.646224583786911, + "grad_norm": 0.12032492458820343, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 169780 + }, + { + "epoch": 0.6462626462550337, + "grad_norm": 0.12223067134618759, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 169790 + }, + { + "epoch": 0.6463007087231565, + "grad_norm": 0.12942957878112793, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 169800 + }, + { + "epoch": 0.6463387711912791, + "grad_norm": 0.1295655220746994, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 169810 + }, + { + "epoch": 0.6463768336594018, + "grad_norm": 0.12919200956821442, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 169820 + }, + { + "epoch": 0.6464148961275245, + "grad_norm": 0.12342769652605057, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 169830 + }, + { + "epoch": 0.6464529585956472, + "grad_norm": 0.13746656477451324, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 169840 + }, + { + "epoch": 0.6464910210637699, + "grad_norm": 0.1307177096605301, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 169850 + }, + { + "epoch": 0.6465290835318925, + "grad_norm": 0.12498698383569717, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 169860 + }, + { + "epoch": 0.6465671460000152, + "grad_norm": 0.12511016428470612, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 169870 + }, + { + "epoch": 0.646605208468138, + "grad_norm": 0.1254969835281372, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 169880 + }, + { + "epoch": 0.6466432709362606, + "grad_norm": 0.11852476000785828, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 169890 + }, + { + "epoch": 0.6466813334043833, + "grad_norm": 0.7008293271064758, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 169900 + }, + { + "epoch": 0.6467193958725059, + "grad_norm": 0.1426885724067688, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 169910 + }, + { + "epoch": 0.6467574583406286, + "grad_norm": 0.12015167623758316, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 169920 + }, + { + "epoch": 0.6467955208087514, + "grad_norm": 0.12605907022953033, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 169930 + }, + { + "epoch": 0.646833583276874, + "grad_norm": 0.1266433447599411, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 169940 + }, + { + "epoch": 0.6468716457449967, + "grad_norm": 0.1276005059480667, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 169950 + }, + { + "epoch": 0.6469097082131193, + "grad_norm": 0.11537665873765945, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 169960 + }, + { + "epoch": 0.6469477706812421, + "grad_norm": 0.14385004341602325, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 169970 + }, + { + "epoch": 0.6469858331493648, + "grad_norm": 0.12152372300624847, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 169980 + }, + { + "epoch": 0.6470238956174874, + "grad_norm": 0.12667691707611084, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 169990 + }, + { + "epoch": 0.6470619580856101, + "grad_norm": 0.1236126646399498, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 170000 + }, + { + "epoch": 0.6471000205537328, + "grad_norm": 0.13405680656433105, + "learning_rate": 0.0005, + "loss": 2.1422, + "step": 170010 + }, + { + "epoch": 0.6471380830218555, + "grad_norm": 0.13311605155467987, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 170020 + }, + { + "epoch": 0.6471761454899781, + "grad_norm": 0.12090720236301422, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 170030 + }, + { + "epoch": 0.6472142079581008, + "grad_norm": 0.12787151336669922, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 170040 + }, + { + "epoch": 0.6472522704262235, + "grad_norm": 0.12532693147659302, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 170050 + }, + { + "epoch": 0.6472903328943462, + "grad_norm": 0.12059324234724045, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 170060 + }, + { + "epoch": 0.6473283953624689, + "grad_norm": 0.13537880778312683, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 170070 + }, + { + "epoch": 0.6473664578305915, + "grad_norm": 0.12789538502693176, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 170080 + }, + { + "epoch": 0.6474045202987142, + "grad_norm": 0.11624449491500854, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 170090 + }, + { + "epoch": 0.647442582766837, + "grad_norm": 0.14559069275856018, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 170100 + }, + { + "epoch": 0.6474806452349596, + "grad_norm": 0.12032388895750046, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 170110 + }, + { + "epoch": 0.6475187077030823, + "grad_norm": 0.12226753681898117, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 170120 + }, + { + "epoch": 0.6475567701712049, + "grad_norm": 0.12323250621557236, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 170130 + }, + { + "epoch": 0.6475948326393277, + "grad_norm": 0.12948326766490936, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 170140 + }, + { + "epoch": 0.6476328951074504, + "grad_norm": 0.1208270862698555, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 170150 + }, + { + "epoch": 0.647670957575573, + "grad_norm": 0.1250876635313034, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 170160 + }, + { + "epoch": 0.6477090200436957, + "grad_norm": 0.1304120272397995, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 170170 + }, + { + "epoch": 0.6477470825118185, + "grad_norm": 0.1235019862651825, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 170180 + }, + { + "epoch": 0.6477851449799411, + "grad_norm": 0.1243685707449913, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 170190 + }, + { + "epoch": 0.6478232074480638, + "grad_norm": 0.120770163834095, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 170200 + }, + { + "epoch": 0.6478612699161864, + "grad_norm": 0.11926085501909256, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 170210 + }, + { + "epoch": 0.6478993323843091, + "grad_norm": 0.11803922057151794, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 170220 + }, + { + "epoch": 0.6479373948524318, + "grad_norm": 0.12076990306377411, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 170230 + }, + { + "epoch": 0.6479754573205545, + "grad_norm": 0.12457573413848877, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 170240 + }, + { + "epoch": 0.6480135197886772, + "grad_norm": 0.13837842643260956, + "learning_rate": 0.0005, + "loss": 2.1328, + "step": 170250 + }, + { + "epoch": 0.6480515822567998, + "grad_norm": 0.12683911621570587, + "learning_rate": 0.0005, + "loss": 2.0827, + "step": 170260 + }, + { + "epoch": 0.6480896447249226, + "grad_norm": 0.11728407442569733, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 170270 + }, + { + "epoch": 0.6481277071930452, + "grad_norm": 0.11798442155122757, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 170280 + }, + { + "epoch": 0.6481657696611679, + "grad_norm": 0.12231793254613876, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 170290 + }, + { + "epoch": 0.6482038321292906, + "grad_norm": 0.12606288492679596, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 170300 + }, + { + "epoch": 0.6482418945974133, + "grad_norm": 0.1174217239022255, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 170310 + }, + { + "epoch": 0.648279957065536, + "grad_norm": 0.12881214916706085, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 170320 + }, + { + "epoch": 0.6483180195336586, + "grad_norm": 0.12366227805614471, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 170330 + }, + { + "epoch": 0.6483560820017813, + "grad_norm": 0.11601834744215012, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 170340 + }, + { + "epoch": 0.648394144469904, + "grad_norm": 0.13191868364810944, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 170350 + }, + { + "epoch": 0.6484322069380267, + "grad_norm": 0.13391204178333282, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 170360 + }, + { + "epoch": 0.6484702694061494, + "grad_norm": 0.12104548513889313, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 170370 + }, + { + "epoch": 0.648508331874272, + "grad_norm": 0.12440919131040573, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 170380 + }, + { + "epoch": 0.6485463943423947, + "grad_norm": 0.1321982890367508, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 170390 + }, + { + "epoch": 0.6485844568105175, + "grad_norm": 0.13380879163742065, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 170400 + }, + { + "epoch": 0.6486225192786401, + "grad_norm": 0.1223221942782402, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 170410 + }, + { + "epoch": 0.6486605817467628, + "grad_norm": 0.12063425034284592, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 170420 + }, + { + "epoch": 0.6486986442148854, + "grad_norm": 0.12745217978954315, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 170430 + }, + { + "epoch": 0.6487367066830082, + "grad_norm": 0.11377807706594467, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 170440 + }, + { + "epoch": 0.6487747691511309, + "grad_norm": 0.12785880267620087, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 170450 + }, + { + "epoch": 0.6488128316192535, + "grad_norm": 0.12305945158004761, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 170460 + }, + { + "epoch": 0.6488508940873762, + "grad_norm": 0.11621784418821335, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 170470 + }, + { + "epoch": 0.6488889565554988, + "grad_norm": 0.11889064311981201, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 170480 + }, + { + "epoch": 0.6489270190236216, + "grad_norm": 0.1268673688173294, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 170490 + }, + { + "epoch": 0.6489650814917443, + "grad_norm": 0.13656675815582275, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 170500 + }, + { + "epoch": 0.6490031439598669, + "grad_norm": 0.12452402710914612, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 170510 + }, + { + "epoch": 0.6490412064279896, + "grad_norm": 0.12301170825958252, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 170520 + }, + { + "epoch": 0.6490792688961123, + "grad_norm": 0.12130080908536911, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 170530 + }, + { + "epoch": 0.649117331364235, + "grad_norm": 0.12391924858093262, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 170540 + }, + { + "epoch": 0.6491553938323577, + "grad_norm": 0.11749352514743805, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 170550 + }, + { + "epoch": 0.6491934563004803, + "grad_norm": 0.11723422259092331, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 170560 + }, + { + "epoch": 0.6492315187686031, + "grad_norm": 0.1299586147069931, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 170570 + }, + { + "epoch": 0.6492695812367257, + "grad_norm": 0.12474636733531952, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 170580 + }, + { + "epoch": 0.6493076437048484, + "grad_norm": 0.11184942722320557, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 170590 + }, + { + "epoch": 0.649345706172971, + "grad_norm": 0.1352064609527588, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 170600 + }, + { + "epoch": 0.6493837686410938, + "grad_norm": 0.12824389338493347, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 170610 + }, + { + "epoch": 0.6494218311092165, + "grad_norm": 0.1299542784690857, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 170620 + }, + { + "epoch": 0.6494598935773391, + "grad_norm": 0.12447993457317352, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 170630 + }, + { + "epoch": 0.6494979560454618, + "grad_norm": 0.13263040781021118, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 170640 + }, + { + "epoch": 0.6495360185135844, + "grad_norm": 0.12033121287822723, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 170650 + }, + { + "epoch": 0.6495740809817072, + "grad_norm": 0.1425001323223114, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 170660 + }, + { + "epoch": 0.6496121434498299, + "grad_norm": 0.12598814070224762, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 170670 + }, + { + "epoch": 0.6496502059179525, + "grad_norm": 0.1267545521259308, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 170680 + }, + { + "epoch": 0.6496882683860752, + "grad_norm": 0.12284674495458603, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 170690 + }, + { + "epoch": 0.649726330854198, + "grad_norm": 0.12898008525371552, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 170700 + }, + { + "epoch": 0.6497643933223206, + "grad_norm": 0.12302932143211365, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 170710 + }, + { + "epoch": 0.6498024557904433, + "grad_norm": 0.12430860847234726, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 170720 + }, + { + "epoch": 0.6498405182585659, + "grad_norm": 0.12097886204719543, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 170730 + }, + { + "epoch": 0.6498785807266887, + "grad_norm": 0.11994773149490356, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 170740 + }, + { + "epoch": 0.6499166431948113, + "grad_norm": 0.11827217042446136, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 170750 + }, + { + "epoch": 0.649954705662934, + "grad_norm": 0.12851224839687347, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 170760 + }, + { + "epoch": 0.6499927681310567, + "grad_norm": 0.11409155279397964, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 170770 + }, + { + "epoch": 0.6500308305991793, + "grad_norm": 0.12571172416210175, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 170780 + }, + { + "epoch": 0.6500688930673021, + "grad_norm": 0.13297589123249054, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 170790 + }, + { + "epoch": 0.6501069555354247, + "grad_norm": 0.1160137876868248, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 170800 + }, + { + "epoch": 0.6501450180035474, + "grad_norm": 0.11829045414924622, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 170810 + }, + { + "epoch": 0.6501830804716701, + "grad_norm": 0.13427163660526276, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 170820 + }, + { + "epoch": 0.6502211429397928, + "grad_norm": 0.13045386970043182, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 170830 + }, + { + "epoch": 0.6502592054079155, + "grad_norm": 0.12804299592971802, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 170840 + }, + { + "epoch": 0.6502972678760381, + "grad_norm": 0.12977930903434753, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 170850 + }, + { + "epoch": 0.6503353303441608, + "grad_norm": 0.12637241184711456, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 170860 + }, + { + "epoch": 0.6503733928122836, + "grad_norm": 0.12315292656421661, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 170870 + }, + { + "epoch": 0.6504114552804062, + "grad_norm": 0.12074718624353409, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 170880 + }, + { + "epoch": 0.6504495177485289, + "grad_norm": 0.11984572559595108, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 170890 + }, + { + "epoch": 0.6504875802166515, + "grad_norm": 0.12621347606182098, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 170900 + }, + { + "epoch": 0.6505256426847742, + "grad_norm": 0.1321704387664795, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 170910 + }, + { + "epoch": 0.650563705152897, + "grad_norm": 0.1266314685344696, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 170920 + }, + { + "epoch": 0.6506017676210196, + "grad_norm": 0.1264774650335312, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 170930 + }, + { + "epoch": 0.6506398300891423, + "grad_norm": 0.13591910898685455, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 170940 + }, + { + "epoch": 0.6506778925572649, + "grad_norm": 0.13695189356803894, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 170950 + }, + { + "epoch": 0.6507159550253877, + "grad_norm": 0.12579090893268585, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 170960 + }, + { + "epoch": 0.6507540174935104, + "grad_norm": 0.1222383975982666, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 170970 + }, + { + "epoch": 0.650792079961633, + "grad_norm": 0.11862023919820786, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 170980 + }, + { + "epoch": 0.6508301424297557, + "grad_norm": 0.12775588035583496, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 170990 + }, + { + "epoch": 0.6508682048978784, + "grad_norm": 0.1321268528699875, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 171000 + }, + { + "epoch": 0.6509062673660011, + "grad_norm": 0.11592495441436768, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 171010 + }, + { + "epoch": 0.6509443298341238, + "grad_norm": 0.12910479307174683, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 171020 + }, + { + "epoch": 0.6509823923022464, + "grad_norm": 0.13574986159801483, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 171030 + }, + { + "epoch": 0.6510204547703692, + "grad_norm": 0.12799003720283508, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 171040 + }, + { + "epoch": 0.6510585172384918, + "grad_norm": 0.11744488775730133, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 171050 + }, + { + "epoch": 0.6510965797066145, + "grad_norm": 0.1317628026008606, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 171060 + }, + { + "epoch": 0.6511346421747372, + "grad_norm": 0.12473851442337036, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 171070 + }, + { + "epoch": 0.6511727046428598, + "grad_norm": 0.12542445957660675, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 171080 + }, + { + "epoch": 0.6512107671109826, + "grad_norm": 0.16549521684646606, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 171090 + }, + { + "epoch": 0.6512488295791052, + "grad_norm": 0.12249605357646942, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 171100 + }, + { + "epoch": 0.6512868920472279, + "grad_norm": 0.12234670668840408, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 171110 + }, + { + "epoch": 0.6513249545153506, + "grad_norm": 0.11434841901063919, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 171120 + }, + { + "epoch": 0.6513630169834733, + "grad_norm": 0.12157260626554489, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 171130 + }, + { + "epoch": 0.651401079451596, + "grad_norm": 0.12111295759677887, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 171140 + }, + { + "epoch": 0.6514391419197186, + "grad_norm": 0.1188582181930542, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 171150 + }, + { + "epoch": 0.6514772043878413, + "grad_norm": 0.12025199085474014, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 171160 + }, + { + "epoch": 0.6515152668559641, + "grad_norm": 0.14027582108974457, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 171170 + }, + { + "epoch": 0.6515533293240867, + "grad_norm": 0.13581660389900208, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 171180 + }, + { + "epoch": 0.6515913917922094, + "grad_norm": 0.11547748744487762, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 171190 + }, + { + "epoch": 0.651629454260332, + "grad_norm": 0.12545788288116455, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 171200 + }, + { + "epoch": 0.6516675167284547, + "grad_norm": 0.11708662658929825, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 171210 + }, + { + "epoch": 0.6517055791965775, + "grad_norm": 0.11985498666763306, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 171220 + }, + { + "epoch": 0.6517436416647001, + "grad_norm": 0.12114997953176498, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 171230 + }, + { + "epoch": 0.6517817041328228, + "grad_norm": 0.11515387892723083, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 171240 + }, + { + "epoch": 0.6518197666009454, + "grad_norm": 0.13565857708454132, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 171250 + }, + { + "epoch": 0.6518578290690682, + "grad_norm": 0.1242651715874672, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 171260 + }, + { + "epoch": 0.6518958915371909, + "grad_norm": 0.12579038739204407, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 171270 + }, + { + "epoch": 0.6519339540053135, + "grad_norm": 0.12866291403770447, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 171280 + }, + { + "epoch": 0.6519720164734362, + "grad_norm": 0.12336907535791397, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 171290 + }, + { + "epoch": 0.6520100789415589, + "grad_norm": 0.1254909634590149, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 171300 + }, + { + "epoch": 0.6520481414096816, + "grad_norm": 0.12830188870429993, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 171310 + }, + { + "epoch": 0.6520862038778042, + "grad_norm": 0.14979737997055054, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 171320 + }, + { + "epoch": 0.6521242663459269, + "grad_norm": 0.12184132635593414, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 171330 + }, + { + "epoch": 0.6521623288140496, + "grad_norm": 0.1344393640756607, + "learning_rate": 0.0005, + "loss": 2.0814, + "step": 171340 + }, + { + "epoch": 0.6522003912821723, + "grad_norm": 0.14398813247680664, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 171350 + }, + { + "epoch": 0.652238453750295, + "grad_norm": 0.13092774152755737, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 171360 + }, + { + "epoch": 0.6522765162184176, + "grad_norm": 0.1261938065290451, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 171370 + }, + { + "epoch": 0.6523145786865403, + "grad_norm": 0.1248980313539505, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 171380 + }, + { + "epoch": 0.6523526411546631, + "grad_norm": 0.127181276679039, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 171390 + }, + { + "epoch": 0.6523907036227857, + "grad_norm": 0.12705473601818085, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 171400 + }, + { + "epoch": 0.6524287660909084, + "grad_norm": 0.13760901987552643, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 171410 + }, + { + "epoch": 0.652466828559031, + "grad_norm": 0.13452285528182983, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 171420 + }, + { + "epoch": 0.6525048910271538, + "grad_norm": 0.11699914187192917, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 171430 + }, + { + "epoch": 0.6525429534952765, + "grad_norm": 0.12803451716899872, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 171440 + }, + { + "epoch": 0.6525810159633991, + "grad_norm": 0.12715654075145721, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 171450 + }, + { + "epoch": 0.6526190784315218, + "grad_norm": 0.1353614181280136, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 171460 + }, + { + "epoch": 0.6526571408996446, + "grad_norm": 0.1400173157453537, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 171470 + }, + { + "epoch": 0.6526952033677672, + "grad_norm": 0.11517337709665298, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 171480 + }, + { + "epoch": 0.6527332658358899, + "grad_norm": 0.1215863898396492, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 171490 + }, + { + "epoch": 0.6527713283040125, + "grad_norm": 0.11860140413045883, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 171500 + }, + { + "epoch": 0.6528093907721352, + "grad_norm": 0.1134290024638176, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 171510 + }, + { + "epoch": 0.652847453240258, + "grad_norm": 0.12387970834970474, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 171520 + }, + { + "epoch": 0.6528855157083806, + "grad_norm": 0.13404084742069244, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 171530 + }, + { + "epoch": 0.6529235781765033, + "grad_norm": 0.1261999011039734, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 171540 + }, + { + "epoch": 0.6529616406446259, + "grad_norm": 0.12469957768917084, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 171550 + }, + { + "epoch": 0.6529997031127487, + "grad_norm": 0.1313467025756836, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 171560 + }, + { + "epoch": 0.6530377655808713, + "grad_norm": 0.13312993943691254, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 171570 + }, + { + "epoch": 0.653075828048994, + "grad_norm": 0.1249455064535141, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 171580 + }, + { + "epoch": 0.6531138905171167, + "grad_norm": 0.1262136846780777, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 171590 + }, + { + "epoch": 0.6531519529852394, + "grad_norm": 0.12008432298898697, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 171600 + }, + { + "epoch": 0.6531900154533621, + "grad_norm": 0.12458830326795578, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 171610 + }, + { + "epoch": 0.6532280779214847, + "grad_norm": 0.1217493787407875, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 171620 + }, + { + "epoch": 0.6532661403896074, + "grad_norm": 0.12446153908967972, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 171630 + }, + { + "epoch": 0.65330420285773, + "grad_norm": 0.13291476666927338, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 171640 + }, + { + "epoch": 0.6533422653258528, + "grad_norm": 0.11692068725824356, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 171650 + }, + { + "epoch": 0.6533803277939755, + "grad_norm": 0.12520849704742432, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 171660 + }, + { + "epoch": 0.6534183902620981, + "grad_norm": 0.1229141503572464, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 171670 + }, + { + "epoch": 0.6534564527302208, + "grad_norm": 0.12377126514911652, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 171680 + }, + { + "epoch": 0.6534945151983436, + "grad_norm": 0.12698081135749817, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 171690 + }, + { + "epoch": 0.6535325776664662, + "grad_norm": 0.12466852366924286, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 171700 + }, + { + "epoch": 0.6535706401345889, + "grad_norm": 0.1278296560049057, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 171710 + }, + { + "epoch": 0.6536087026027115, + "grad_norm": 0.1254255324602127, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 171720 + }, + { + "epoch": 0.6536467650708343, + "grad_norm": 0.12170296162366867, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 171730 + }, + { + "epoch": 0.653684827538957, + "grad_norm": 0.12227907031774521, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 171740 + }, + { + "epoch": 0.6537228900070796, + "grad_norm": 0.14366976916790009, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 171750 + }, + { + "epoch": 0.6537609524752023, + "grad_norm": 0.12183597683906555, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 171760 + }, + { + "epoch": 0.6537990149433249, + "grad_norm": 0.12633612751960754, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 171770 + }, + { + "epoch": 0.6538370774114477, + "grad_norm": 0.12033534049987793, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 171780 + }, + { + "epoch": 0.6538751398795704, + "grad_norm": 0.11813070625066757, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 171790 + }, + { + "epoch": 0.653913202347693, + "grad_norm": 0.14022228121757507, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 171800 + }, + { + "epoch": 0.6539512648158157, + "grad_norm": 0.1259172260761261, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 171810 + }, + { + "epoch": 0.6539893272839384, + "grad_norm": 0.13032588362693787, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 171820 + }, + { + "epoch": 0.6540273897520611, + "grad_norm": 0.12416426092386246, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 171830 + }, + { + "epoch": 0.6540654522201838, + "grad_norm": 0.1398206502199173, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 171840 + }, + { + "epoch": 0.6541035146883064, + "grad_norm": 0.11684484779834747, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 171850 + }, + { + "epoch": 0.6541415771564292, + "grad_norm": 0.12237782776355743, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 171860 + }, + { + "epoch": 0.6541796396245518, + "grad_norm": 0.1241404116153717, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 171870 + }, + { + "epoch": 0.6542177020926745, + "grad_norm": 0.12162047624588013, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 171880 + }, + { + "epoch": 0.6542557645607971, + "grad_norm": 0.12076914310455322, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 171890 + }, + { + "epoch": 0.6542938270289199, + "grad_norm": 0.14396323263645172, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 171900 + }, + { + "epoch": 0.6543318894970426, + "grad_norm": 0.1287434846162796, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 171910 + }, + { + "epoch": 0.6543699519651652, + "grad_norm": 0.13110186159610748, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 171920 + }, + { + "epoch": 0.6544080144332879, + "grad_norm": 0.12009651213884354, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 171930 + }, + { + "epoch": 0.6544460769014105, + "grad_norm": 0.11705582588911057, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 171940 + }, + { + "epoch": 0.6544841393695333, + "grad_norm": 0.12955498695373535, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 171950 + }, + { + "epoch": 0.654522201837656, + "grad_norm": 0.11973123997449875, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 171960 + }, + { + "epoch": 0.6545602643057786, + "grad_norm": 0.13571450114250183, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 171970 + }, + { + "epoch": 0.6545983267739013, + "grad_norm": 0.1409195512533188, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 171980 + }, + { + "epoch": 0.654636389242024, + "grad_norm": 0.13034707307815552, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 171990 + }, + { + "epoch": 0.6546744517101467, + "grad_norm": 0.1197667196393013, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 172000 + }, + { + "epoch": 0.6547125141782694, + "grad_norm": 0.12414491921663284, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 172010 + }, + { + "epoch": 0.654750576646392, + "grad_norm": 0.12749461829662323, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 172020 + }, + { + "epoch": 0.6547886391145148, + "grad_norm": 0.12330741435289383, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 172030 + }, + { + "epoch": 0.6548267015826374, + "grad_norm": 0.12864083051681519, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 172040 + }, + { + "epoch": 0.6548647640507601, + "grad_norm": 0.12212818115949631, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 172050 + }, + { + "epoch": 0.6549028265188828, + "grad_norm": 0.12514182925224304, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 172060 + }, + { + "epoch": 0.6549408889870054, + "grad_norm": 0.11073462665081024, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 172070 + }, + { + "epoch": 0.6549789514551282, + "grad_norm": 0.11813299357891083, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 172080 + }, + { + "epoch": 0.6550170139232508, + "grad_norm": 0.12076397985219955, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 172090 + }, + { + "epoch": 0.6550550763913735, + "grad_norm": 0.12106800079345703, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 172100 + }, + { + "epoch": 0.6550931388594962, + "grad_norm": 0.23853981494903564, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 172110 + }, + { + "epoch": 0.6551312013276189, + "grad_norm": 0.12523460388183594, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 172120 + }, + { + "epoch": 0.6551692637957416, + "grad_norm": 0.12649975717067719, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 172130 + }, + { + "epoch": 0.6552073262638642, + "grad_norm": 0.11596899479627609, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 172140 + }, + { + "epoch": 0.6552453887319869, + "grad_norm": 0.12488400936126709, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 172150 + }, + { + "epoch": 0.6552834512001097, + "grad_norm": 0.13245391845703125, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 172160 + }, + { + "epoch": 0.6553215136682323, + "grad_norm": 0.1204514130949974, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 172170 + }, + { + "epoch": 0.655359576136355, + "grad_norm": 0.1243198812007904, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 172180 + }, + { + "epoch": 0.6553976386044776, + "grad_norm": 0.1361195594072342, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 172190 + }, + { + "epoch": 0.6554357010726003, + "grad_norm": 0.13096532225608826, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 172200 + }, + { + "epoch": 0.6554737635407231, + "grad_norm": 0.11856479197740555, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 172210 + }, + { + "epoch": 0.6555118260088457, + "grad_norm": 0.1288810521364212, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 172220 + }, + { + "epoch": 0.6555498884769684, + "grad_norm": 0.12508545815944672, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 172230 + }, + { + "epoch": 0.655587950945091, + "grad_norm": 0.13051195442676544, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 172240 + }, + { + "epoch": 0.6556260134132138, + "grad_norm": 0.1230180487036705, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 172250 + }, + { + "epoch": 0.6556640758813365, + "grad_norm": 0.11832129955291748, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 172260 + }, + { + "epoch": 0.6557021383494591, + "grad_norm": 0.13429515063762665, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 172270 + }, + { + "epoch": 0.6557402008175818, + "grad_norm": 0.1276344656944275, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 172280 + }, + { + "epoch": 0.6557782632857045, + "grad_norm": 0.11800602823495865, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 172290 + }, + { + "epoch": 0.6558163257538272, + "grad_norm": 0.12008919566869736, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 172300 + }, + { + "epoch": 0.6558543882219499, + "grad_norm": 0.12468130886554718, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 172310 + }, + { + "epoch": 0.6558924506900725, + "grad_norm": 0.12327641993761063, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 172320 + }, + { + "epoch": 0.6559305131581953, + "grad_norm": 0.13303974270820618, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 172330 + }, + { + "epoch": 0.6559685756263179, + "grad_norm": 0.12919138371944427, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 172340 + }, + { + "epoch": 0.6560066380944406, + "grad_norm": 0.11894699186086655, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 172350 + }, + { + "epoch": 0.6560447005625633, + "grad_norm": 0.13580971956253052, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 172360 + }, + { + "epoch": 0.6560827630306859, + "grad_norm": 0.13319773972034454, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 172370 + }, + { + "epoch": 0.6561208254988087, + "grad_norm": 0.1443890482187271, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 172380 + }, + { + "epoch": 0.6561588879669313, + "grad_norm": 0.1339932531118393, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 172390 + }, + { + "epoch": 0.656196950435054, + "grad_norm": 0.1335560381412506, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 172400 + }, + { + "epoch": 0.6562350129031767, + "grad_norm": 0.13227705657482147, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 172410 + }, + { + "epoch": 0.6562730753712994, + "grad_norm": 0.12547624111175537, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 172420 + }, + { + "epoch": 0.6563111378394221, + "grad_norm": 0.11724413186311722, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 172430 + }, + { + "epoch": 0.6563492003075447, + "grad_norm": 0.1592647284269333, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 172440 + }, + { + "epoch": 0.6563872627756674, + "grad_norm": 0.13125862181186676, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 172450 + }, + { + "epoch": 0.6564253252437902, + "grad_norm": 0.12079490721225739, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 172460 + }, + { + "epoch": 0.6564633877119128, + "grad_norm": 0.13198938965797424, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 172470 + }, + { + "epoch": 0.6565014501800355, + "grad_norm": 0.1353512406349182, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 172480 + }, + { + "epoch": 0.6565395126481581, + "grad_norm": 0.1280246526002884, + "learning_rate": 0.0005, + "loss": 2.0884, + "step": 172490 + }, + { + "epoch": 0.6565775751162808, + "grad_norm": 0.11841772496700287, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 172500 + }, + { + "epoch": 0.6566156375844036, + "grad_norm": 0.12437329441308975, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 172510 + }, + { + "epoch": 0.6566537000525262, + "grad_norm": 0.12280859798192978, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 172520 + }, + { + "epoch": 0.6566917625206489, + "grad_norm": 0.13854913413524628, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 172530 + }, + { + "epoch": 0.6567298249887715, + "grad_norm": 0.1186220794916153, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 172540 + }, + { + "epoch": 0.6567678874568943, + "grad_norm": 0.13003933429718018, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 172550 + }, + { + "epoch": 0.656805949925017, + "grad_norm": 0.12726987898349762, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 172560 + }, + { + "epoch": 0.6568440123931396, + "grad_norm": 0.1203669011592865, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 172570 + }, + { + "epoch": 0.6568820748612623, + "grad_norm": 0.1172369197010994, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 172580 + }, + { + "epoch": 0.656920137329385, + "grad_norm": 0.11729346215724945, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 172590 + }, + { + "epoch": 0.6569581997975077, + "grad_norm": 0.11557607352733612, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 172600 + }, + { + "epoch": 0.6569962622656303, + "grad_norm": 0.11810766160488129, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 172610 + }, + { + "epoch": 0.657034324733753, + "grad_norm": 0.12994596362113953, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 172620 + }, + { + "epoch": 0.6570723872018758, + "grad_norm": 0.12548710405826569, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 172630 + }, + { + "epoch": 0.6571104496699984, + "grad_norm": 0.12391366064548492, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 172640 + }, + { + "epoch": 0.6571485121381211, + "grad_norm": 0.11640439182519913, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 172650 + }, + { + "epoch": 0.6571865746062437, + "grad_norm": 0.1241813525557518, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 172660 + }, + { + "epoch": 0.6572246370743664, + "grad_norm": 0.12873999774456024, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 172670 + }, + { + "epoch": 0.6572626995424892, + "grad_norm": 0.11288134753704071, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 172680 + }, + { + "epoch": 0.6573007620106118, + "grad_norm": 0.11984800547361374, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 172690 + }, + { + "epoch": 0.6573388244787345, + "grad_norm": 0.12152191251516342, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 172700 + }, + { + "epoch": 0.6573768869468571, + "grad_norm": 0.11136773973703384, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 172710 + }, + { + "epoch": 0.6574149494149799, + "grad_norm": 0.12120914459228516, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 172720 + }, + { + "epoch": 0.6574530118831026, + "grad_norm": 0.12348100543022156, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 172730 + }, + { + "epoch": 0.6574910743512252, + "grad_norm": 0.11988083273172379, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 172740 + }, + { + "epoch": 0.6575291368193479, + "grad_norm": 0.1392463594675064, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 172750 + }, + { + "epoch": 0.6575671992874706, + "grad_norm": 0.11580455303192139, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 172760 + }, + { + "epoch": 0.6576052617555933, + "grad_norm": 0.12519559264183044, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 172770 + }, + { + "epoch": 0.657643324223716, + "grad_norm": 0.11238522827625275, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 172780 + }, + { + "epoch": 0.6576813866918386, + "grad_norm": 0.11753600835800171, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 172790 + }, + { + "epoch": 0.6577194491599613, + "grad_norm": 0.1268373727798462, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 172800 + }, + { + "epoch": 0.657757511628084, + "grad_norm": 0.11925873160362244, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 172810 + }, + { + "epoch": 0.6577955740962067, + "grad_norm": 0.1523437798023224, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 172820 + }, + { + "epoch": 0.6578336365643294, + "grad_norm": 0.12391243875026703, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 172830 + }, + { + "epoch": 0.657871699032452, + "grad_norm": 0.1167418509721756, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 172840 + }, + { + "epoch": 0.6579097615005748, + "grad_norm": 0.1174694374203682, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 172850 + }, + { + "epoch": 0.6579478239686974, + "grad_norm": 0.1145000234246254, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 172860 + }, + { + "epoch": 0.6579858864368201, + "grad_norm": 0.12418124824762344, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 172870 + }, + { + "epoch": 0.6580239489049428, + "grad_norm": 0.12741424143314362, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 172880 + }, + { + "epoch": 0.6580620113730655, + "grad_norm": 0.11886553466320038, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 172890 + }, + { + "epoch": 0.6581000738411882, + "grad_norm": 0.12890352308750153, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 172900 + }, + { + "epoch": 0.6581381363093108, + "grad_norm": 0.13043367862701416, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 172910 + }, + { + "epoch": 0.6581761987774335, + "grad_norm": 0.14133760333061218, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 172920 + }, + { + "epoch": 0.6582142612455562, + "grad_norm": 0.13013096153736115, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 172930 + }, + { + "epoch": 0.6582523237136789, + "grad_norm": 0.12168702483177185, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 172940 + }, + { + "epoch": 0.6582903861818016, + "grad_norm": 0.11807643622159958, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 172950 + }, + { + "epoch": 0.6583284486499242, + "grad_norm": 0.1260489970445633, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 172960 + }, + { + "epoch": 0.6583665111180469, + "grad_norm": 0.12451190501451492, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 172970 + }, + { + "epoch": 0.6584045735861697, + "grad_norm": 0.12765002250671387, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 172980 + }, + { + "epoch": 0.6584426360542923, + "grad_norm": 0.12903667986392975, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 172990 + }, + { + "epoch": 0.658480698522415, + "grad_norm": 0.12422466278076172, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 173000 + }, + { + "epoch": 0.6585187609905376, + "grad_norm": 0.1248549371957779, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 173010 + }, + { + "epoch": 0.6585568234586604, + "grad_norm": 0.12342771142721176, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 173020 + }, + { + "epoch": 0.6585948859267831, + "grad_norm": 0.136073037981987, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 173030 + }, + { + "epoch": 0.6586329483949057, + "grad_norm": 0.12524139881134033, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 173040 + }, + { + "epoch": 0.6586710108630284, + "grad_norm": 0.13240216672420502, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 173050 + }, + { + "epoch": 0.6587090733311511, + "grad_norm": 0.12460118532180786, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 173060 + }, + { + "epoch": 0.6587471357992738, + "grad_norm": 0.1181638091802597, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 173070 + }, + { + "epoch": 0.6587851982673965, + "grad_norm": 0.11996463686227798, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 173080 + }, + { + "epoch": 0.6588232607355191, + "grad_norm": 0.1309623420238495, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 173090 + }, + { + "epoch": 0.6588613232036418, + "grad_norm": 0.1276891827583313, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 173100 + }, + { + "epoch": 0.6588993856717645, + "grad_norm": 0.13824163377285004, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 173110 + }, + { + "epoch": 0.6589374481398872, + "grad_norm": 0.11203496158123016, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 173120 + }, + { + "epoch": 0.6589755106080099, + "grad_norm": 0.12256965786218643, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 173130 + }, + { + "epoch": 0.6590135730761325, + "grad_norm": 0.1368960738182068, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 173140 + }, + { + "epoch": 0.6590516355442553, + "grad_norm": 0.12021365761756897, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 173150 + }, + { + "epoch": 0.6590896980123779, + "grad_norm": 0.1191093698143959, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 173160 + }, + { + "epoch": 0.6591277604805006, + "grad_norm": 0.12375283986330032, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 173170 + }, + { + "epoch": 0.6591658229486232, + "grad_norm": 0.12380228191614151, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 173180 + }, + { + "epoch": 0.659203885416746, + "grad_norm": 0.12744921445846558, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 173190 + }, + { + "epoch": 0.6592419478848687, + "grad_norm": 0.12835119664669037, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 173200 + }, + { + "epoch": 0.6592800103529913, + "grad_norm": 0.13462497293949127, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 173210 + }, + { + "epoch": 0.659318072821114, + "grad_norm": 0.14495985209941864, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 173220 + }, + { + "epoch": 0.6593561352892366, + "grad_norm": 0.11991511285305023, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 173230 + }, + { + "epoch": 0.6593941977573594, + "grad_norm": 0.14063900709152222, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 173240 + }, + { + "epoch": 0.6594322602254821, + "grad_norm": 0.12147396802902222, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 173250 + }, + { + "epoch": 0.6594703226936047, + "grad_norm": 0.12060917913913727, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 173260 + }, + { + "epoch": 0.6595083851617274, + "grad_norm": 0.12398088723421097, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 173270 + }, + { + "epoch": 0.6595464476298502, + "grad_norm": 0.12243727594614029, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 173280 + }, + { + "epoch": 0.6595845100979728, + "grad_norm": 0.1569921225309372, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 173290 + }, + { + "epoch": 0.6596225725660955, + "grad_norm": 0.1303899735212326, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 173300 + }, + { + "epoch": 0.6596606350342181, + "grad_norm": 0.12057130038738251, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 173310 + }, + { + "epoch": 0.6596986975023409, + "grad_norm": 0.13421890139579773, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 173320 + }, + { + "epoch": 0.6597367599704635, + "grad_norm": 0.12553319334983826, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 173330 + }, + { + "epoch": 0.6597748224385862, + "grad_norm": 0.12315362691879272, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 173340 + }, + { + "epoch": 0.6598128849067089, + "grad_norm": 0.11498898267745972, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 173350 + }, + { + "epoch": 0.6598509473748315, + "grad_norm": 0.12755295634269714, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 173360 + }, + { + "epoch": 0.6598890098429543, + "grad_norm": 0.12583117187023163, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 173370 + }, + { + "epoch": 0.659927072311077, + "grad_norm": 0.12035936117172241, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 173380 + }, + { + "epoch": 0.6599651347791996, + "grad_norm": 0.13051927089691162, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 173390 + }, + { + "epoch": 0.6600031972473223, + "grad_norm": 0.13374698162078857, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 173400 + }, + { + "epoch": 0.660041259715445, + "grad_norm": 0.11594285815954208, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 173410 + }, + { + "epoch": 0.6600793221835677, + "grad_norm": 0.11974643170833588, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 173420 + }, + { + "epoch": 0.6601173846516903, + "grad_norm": 0.12264318764209747, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 173430 + }, + { + "epoch": 0.660155447119813, + "grad_norm": 0.13385246694087982, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 173440 + }, + { + "epoch": 0.6601935095879358, + "grad_norm": 0.13724219799041748, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 173450 + }, + { + "epoch": 0.6602315720560584, + "grad_norm": 0.1183377206325531, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 173460 + }, + { + "epoch": 0.6602696345241811, + "grad_norm": 0.11857128888368607, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 173470 + }, + { + "epoch": 0.6603076969923037, + "grad_norm": 0.11552588641643524, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 173480 + }, + { + "epoch": 0.6603457594604265, + "grad_norm": 0.12726780772209167, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 173490 + }, + { + "epoch": 0.6603838219285492, + "grad_norm": 0.12919867038726807, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 173500 + }, + { + "epoch": 0.6604218843966718, + "grad_norm": 0.12737435102462769, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 173510 + }, + { + "epoch": 0.6604599468647945, + "grad_norm": 0.12323548644781113, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 173520 + }, + { + "epoch": 0.6604980093329171, + "grad_norm": 0.12535181641578674, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 173530 + }, + { + "epoch": 0.6605360718010399, + "grad_norm": 0.1355310082435608, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 173540 + }, + { + "epoch": 0.6605741342691626, + "grad_norm": 0.1210881695151329, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 173550 + }, + { + "epoch": 0.6606121967372852, + "grad_norm": 0.12017525732517242, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 173560 + }, + { + "epoch": 0.6606502592054079, + "grad_norm": 0.12086072564125061, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 173570 + }, + { + "epoch": 0.6606883216735306, + "grad_norm": 0.12978361546993256, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 173580 + }, + { + "epoch": 0.6607263841416533, + "grad_norm": 0.1455615758895874, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 173590 + }, + { + "epoch": 0.660764446609776, + "grad_norm": 0.1265958696603775, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 173600 + }, + { + "epoch": 0.6608025090778986, + "grad_norm": 0.12732741236686707, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 173610 + }, + { + "epoch": 0.6608405715460214, + "grad_norm": 0.12790100276470184, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 173620 + }, + { + "epoch": 0.660878634014144, + "grad_norm": 0.13768728077411652, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 173630 + }, + { + "epoch": 0.6609166964822667, + "grad_norm": 0.1262575089931488, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 173640 + }, + { + "epoch": 0.6609547589503894, + "grad_norm": 0.11182989180088043, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 173650 + }, + { + "epoch": 0.660992821418512, + "grad_norm": 0.11728381365537643, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 173660 + }, + { + "epoch": 0.6610308838866348, + "grad_norm": 0.12345705926418304, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 173670 + }, + { + "epoch": 0.6610689463547574, + "grad_norm": 0.13155001401901245, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 173680 + }, + { + "epoch": 0.6611070088228801, + "grad_norm": 0.11630726605653763, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 173690 + }, + { + "epoch": 0.6611450712910028, + "grad_norm": 0.11791592091321945, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 173700 + }, + { + "epoch": 0.6611831337591255, + "grad_norm": 0.35240083932876587, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 173710 + }, + { + "epoch": 0.6612211962272482, + "grad_norm": 0.1190033033490181, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 173720 + }, + { + "epoch": 0.6612592586953708, + "grad_norm": 0.124099962413311, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 173730 + }, + { + "epoch": 0.6612973211634935, + "grad_norm": 0.13553419709205627, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 173740 + }, + { + "epoch": 0.6613353836316163, + "grad_norm": 0.12280543893575668, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 173750 + }, + { + "epoch": 0.6613734460997389, + "grad_norm": 0.1327347308397293, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 173760 + }, + { + "epoch": 0.6614115085678616, + "grad_norm": 0.13505050539970398, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 173770 + }, + { + "epoch": 0.6614495710359842, + "grad_norm": 0.135938823223114, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 173780 + }, + { + "epoch": 0.6614876335041069, + "grad_norm": 0.14170552790164948, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 173790 + }, + { + "epoch": 0.6615256959722297, + "grad_norm": 0.1215779259800911, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 173800 + }, + { + "epoch": 0.6615637584403523, + "grad_norm": 0.12040174007415771, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 173810 + }, + { + "epoch": 0.661601820908475, + "grad_norm": 0.11516829580068588, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 173820 + }, + { + "epoch": 0.6616398833765976, + "grad_norm": 0.1317477524280548, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 173830 + }, + { + "epoch": 0.6616779458447204, + "grad_norm": 0.12069617956876755, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 173840 + }, + { + "epoch": 0.661716008312843, + "grad_norm": 0.13325481116771698, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 173850 + }, + { + "epoch": 0.6617540707809657, + "grad_norm": 0.14849242568016052, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 173860 + }, + { + "epoch": 0.6617921332490884, + "grad_norm": 0.13232289254665375, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 173870 + }, + { + "epoch": 0.6618301957172111, + "grad_norm": 0.12283754348754883, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 173880 + }, + { + "epoch": 0.6618682581853338, + "grad_norm": 0.1293504387140274, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 173890 + }, + { + "epoch": 0.6619063206534564, + "grad_norm": 0.11680664122104645, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 173900 + }, + { + "epoch": 0.6619443831215791, + "grad_norm": 0.11964026093482971, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 173910 + }, + { + "epoch": 0.6619824455897019, + "grad_norm": 0.11271769553422928, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 173920 + }, + { + "epoch": 0.6620205080578245, + "grad_norm": 0.12464918941259384, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 173930 + }, + { + "epoch": 0.6620585705259472, + "grad_norm": 0.15808416903018951, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 173940 + }, + { + "epoch": 0.6620966329940698, + "grad_norm": 0.12016769498586655, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 173950 + }, + { + "epoch": 0.6621346954621925, + "grad_norm": 0.14003989100456238, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 173960 + }, + { + "epoch": 0.6621727579303153, + "grad_norm": 0.11997903883457184, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 173970 + }, + { + "epoch": 0.6622108203984379, + "grad_norm": 0.11551333963871002, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 173980 + }, + { + "epoch": 0.6622488828665606, + "grad_norm": 0.11689604818820953, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 173990 + }, + { + "epoch": 0.6622869453346832, + "grad_norm": 0.1281474381685257, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 174000 + }, + { + "epoch": 0.662325007802806, + "grad_norm": 0.11026114225387573, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 174010 + }, + { + "epoch": 0.6623630702709287, + "grad_norm": 0.11417057365179062, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 174020 + }, + { + "epoch": 0.6624011327390513, + "grad_norm": 0.11177896708250046, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 174030 + }, + { + "epoch": 0.662439195207174, + "grad_norm": 0.12440768629312515, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 174040 + }, + { + "epoch": 0.6624772576752967, + "grad_norm": 0.11963693797588348, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 174050 + }, + { + "epoch": 0.6625153201434194, + "grad_norm": 0.1275544911623001, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 174060 + }, + { + "epoch": 0.6625533826115421, + "grad_norm": 0.2038632482290268, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 174070 + }, + { + "epoch": 0.6625914450796647, + "grad_norm": 0.1305026113986969, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 174080 + }, + { + "epoch": 0.6626295075477874, + "grad_norm": 0.12949995696544647, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 174090 + }, + { + "epoch": 0.6626675700159101, + "grad_norm": 0.13585160672664642, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 174100 + }, + { + "epoch": 0.6627056324840328, + "grad_norm": 0.12164440751075745, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 174110 + }, + { + "epoch": 0.6627436949521555, + "grad_norm": 0.13014480471611023, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 174120 + }, + { + "epoch": 0.6627817574202781, + "grad_norm": 0.12387395650148392, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 174130 + }, + { + "epoch": 0.6628198198884009, + "grad_norm": 0.12452687323093414, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 174140 + }, + { + "epoch": 0.6628578823565235, + "grad_norm": 0.14395101368427277, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 174150 + }, + { + "epoch": 0.6628959448246462, + "grad_norm": 0.1168159618973732, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 174160 + }, + { + "epoch": 0.6629340072927689, + "grad_norm": 0.134388267993927, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 174170 + }, + { + "epoch": 0.6629720697608916, + "grad_norm": 0.1289610117673874, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 174180 + }, + { + "epoch": 0.6630101322290143, + "grad_norm": 0.12149546295404434, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 174190 + }, + { + "epoch": 0.6630481946971369, + "grad_norm": 0.11487753689289093, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 174200 + }, + { + "epoch": 0.6630862571652596, + "grad_norm": 0.11515262722969055, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 174210 + }, + { + "epoch": 0.6631243196333823, + "grad_norm": 0.12902528047561646, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 174220 + }, + { + "epoch": 0.663162382101505, + "grad_norm": 0.12115098536014557, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 174230 + }, + { + "epoch": 0.6632004445696277, + "grad_norm": 0.12723582983016968, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 174240 + }, + { + "epoch": 0.6632385070377503, + "grad_norm": 0.12236615270376205, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 174250 + }, + { + "epoch": 0.663276569505873, + "grad_norm": 0.135558620095253, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 174260 + }, + { + "epoch": 0.6633146319739958, + "grad_norm": 0.11501994729042053, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 174270 + }, + { + "epoch": 0.6633526944421184, + "grad_norm": 0.13081204891204834, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 174280 + }, + { + "epoch": 0.6633907569102411, + "grad_norm": 0.13843820989131927, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 174290 + }, + { + "epoch": 0.6634288193783637, + "grad_norm": 0.13076812028884888, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 174300 + }, + { + "epoch": 0.6634668818464865, + "grad_norm": 0.12675538659095764, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 174310 + }, + { + "epoch": 0.6635049443146092, + "grad_norm": 0.12187536060810089, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 174320 + }, + { + "epoch": 0.6635430067827318, + "grad_norm": 0.11600963771343231, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 174330 + }, + { + "epoch": 0.6635810692508545, + "grad_norm": 0.12720301747322083, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 174340 + }, + { + "epoch": 0.6636191317189772, + "grad_norm": 0.1272958666086197, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 174350 + }, + { + "epoch": 0.6636571941870999, + "grad_norm": 0.11979950964450836, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 174360 + }, + { + "epoch": 0.6636952566552226, + "grad_norm": 0.132895827293396, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 174370 + }, + { + "epoch": 0.6637333191233452, + "grad_norm": 0.139450341463089, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 174380 + }, + { + "epoch": 0.6637713815914679, + "grad_norm": 0.12335019558668137, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 174390 + }, + { + "epoch": 0.6638094440595906, + "grad_norm": 0.11418977379798889, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 174400 + }, + { + "epoch": 0.6638475065277133, + "grad_norm": 0.1271081268787384, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 174410 + }, + { + "epoch": 0.663885568995836, + "grad_norm": 0.13359275460243225, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 174420 + }, + { + "epoch": 0.6639236314639586, + "grad_norm": 0.12090206891298294, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 174430 + }, + { + "epoch": 0.6639616939320814, + "grad_norm": 0.11663159728050232, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 174440 + }, + { + "epoch": 0.663999756400204, + "grad_norm": 0.11631562560796738, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 174450 + }, + { + "epoch": 0.6640378188683267, + "grad_norm": 0.12493157386779785, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 174460 + }, + { + "epoch": 0.6640758813364493, + "grad_norm": 0.1239357739686966, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 174470 + }, + { + "epoch": 0.6641139438045721, + "grad_norm": 0.1262454092502594, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 174480 + }, + { + "epoch": 0.6641520062726948, + "grad_norm": 0.12332665920257568, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 174490 + }, + { + "epoch": 0.6641900687408174, + "grad_norm": 0.13212019205093384, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 174500 + }, + { + "epoch": 0.6642281312089401, + "grad_norm": 0.13737590610980988, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 174510 + }, + { + "epoch": 0.6642661936770627, + "grad_norm": 0.12693600356578827, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 174520 + }, + { + "epoch": 0.6643042561451855, + "grad_norm": 0.14551861584186554, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 174530 + }, + { + "epoch": 0.6643423186133082, + "grad_norm": 0.11993174999952316, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 174540 + }, + { + "epoch": 0.6643803810814308, + "grad_norm": 0.11988063901662827, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 174550 + }, + { + "epoch": 0.6644184435495535, + "grad_norm": 0.11576548963785172, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 174560 + }, + { + "epoch": 0.6644565060176763, + "grad_norm": 0.1163567528128624, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 174570 + }, + { + "epoch": 0.6644945684857989, + "grad_norm": 0.11626116186380386, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 174580 + }, + { + "epoch": 0.6645326309539216, + "grad_norm": 0.13025660812854767, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 174590 + }, + { + "epoch": 0.6645706934220442, + "grad_norm": 0.1261567324399948, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 174600 + }, + { + "epoch": 0.664608755890167, + "grad_norm": 0.1174742728471756, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 174610 + }, + { + "epoch": 0.6646468183582896, + "grad_norm": 0.12801344692707062, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 174620 + }, + { + "epoch": 0.6646848808264123, + "grad_norm": 0.11963155120611191, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 174630 + }, + { + "epoch": 0.664722943294535, + "grad_norm": 0.14293743669986725, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 174640 + }, + { + "epoch": 0.6647610057626576, + "grad_norm": 0.12802202999591827, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 174650 + }, + { + "epoch": 0.6647990682307804, + "grad_norm": 0.13224215805530548, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 174660 + }, + { + "epoch": 0.664837130698903, + "grad_norm": 0.1264164298772812, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 174670 + }, + { + "epoch": 0.6648751931670257, + "grad_norm": 0.12990862131118774, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 174680 + }, + { + "epoch": 0.6649132556351484, + "grad_norm": 0.11904025822877884, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 174690 + }, + { + "epoch": 0.6649513181032711, + "grad_norm": 0.1135101467370987, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 174700 + }, + { + "epoch": 0.6649893805713938, + "grad_norm": 0.12086432427167892, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 174710 + }, + { + "epoch": 0.6650274430395164, + "grad_norm": 0.13415375351905823, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 174720 + }, + { + "epoch": 0.6650655055076391, + "grad_norm": 0.29723674058914185, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 174730 + }, + { + "epoch": 0.6651035679757619, + "grad_norm": 0.12538795173168182, + "learning_rate": 0.0005, + "loss": 2.136, + "step": 174740 + }, + { + "epoch": 0.6651416304438845, + "grad_norm": 0.11672871559858322, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 174750 + }, + { + "epoch": 0.6651796929120072, + "grad_norm": 0.12080489099025726, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 174760 + }, + { + "epoch": 0.6652177553801298, + "grad_norm": 0.12584912776947021, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 174770 + }, + { + "epoch": 0.6652558178482526, + "grad_norm": 0.12897630035877228, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 174780 + }, + { + "epoch": 0.6652938803163753, + "grad_norm": 0.11987827718257904, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 174790 + }, + { + "epoch": 0.6653319427844979, + "grad_norm": 0.12624815106391907, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 174800 + }, + { + "epoch": 0.6653700052526206, + "grad_norm": 0.11816033720970154, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 174810 + }, + { + "epoch": 0.6654080677207432, + "grad_norm": 0.12806962430477142, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 174820 + }, + { + "epoch": 0.665446130188866, + "grad_norm": 0.13350574672222137, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 174830 + }, + { + "epoch": 0.6654841926569887, + "grad_norm": 0.12472176551818848, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 174840 + }, + { + "epoch": 0.6655222551251113, + "grad_norm": 0.11982429772615433, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 174850 + }, + { + "epoch": 0.665560317593234, + "grad_norm": 0.11519755423069, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 174860 + }, + { + "epoch": 0.6655983800613567, + "grad_norm": 0.12131005525588989, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 174870 + }, + { + "epoch": 0.6656364425294794, + "grad_norm": 0.13292309641838074, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 174880 + }, + { + "epoch": 0.6656745049976021, + "grad_norm": 0.13719289004802704, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 174890 + }, + { + "epoch": 0.6657125674657247, + "grad_norm": 0.1265268474817276, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 174900 + }, + { + "epoch": 0.6657506299338475, + "grad_norm": 0.11568540334701538, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 174910 + }, + { + "epoch": 0.6657886924019701, + "grad_norm": 0.12374504655599594, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 174920 + }, + { + "epoch": 0.6658267548700928, + "grad_norm": 0.1308293640613556, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 174930 + }, + { + "epoch": 0.6658648173382155, + "grad_norm": 0.12477356940507889, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 174940 + }, + { + "epoch": 0.6659028798063381, + "grad_norm": 0.1219056025147438, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 174950 + }, + { + "epoch": 0.6659409422744609, + "grad_norm": 0.11442694067955017, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 174960 + }, + { + "epoch": 0.6659790047425835, + "grad_norm": 0.11663993448019028, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 174970 + }, + { + "epoch": 0.6660170672107062, + "grad_norm": 0.11952158063650131, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 174980 + }, + { + "epoch": 0.6660551296788289, + "grad_norm": 0.12595412135124207, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 174990 + }, + { + "epoch": 0.6660931921469516, + "grad_norm": 0.12811562418937683, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 175000 + }, + { + "epoch": 0.6661312546150743, + "grad_norm": 0.11032678186893463, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 175010 + }, + { + "epoch": 0.6661693170831969, + "grad_norm": 0.13032512366771698, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 175020 + }, + { + "epoch": 0.6662073795513196, + "grad_norm": 0.1279965043067932, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 175030 + }, + { + "epoch": 0.6662454420194424, + "grad_norm": 0.12894755601882935, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 175040 + }, + { + "epoch": 0.666283504487565, + "grad_norm": 0.13770896196365356, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 175050 + }, + { + "epoch": 0.6663215669556877, + "grad_norm": 0.12088262289762497, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 175060 + }, + { + "epoch": 0.6663596294238103, + "grad_norm": 0.13149189949035645, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 175070 + }, + { + "epoch": 0.666397691891933, + "grad_norm": 0.11546579003334045, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 175080 + }, + { + "epoch": 0.6664357543600558, + "grad_norm": 0.12365476787090302, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 175090 + }, + { + "epoch": 0.6664738168281784, + "grad_norm": 0.12172803282737732, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 175100 + }, + { + "epoch": 0.6665118792963011, + "grad_norm": 0.1241191178560257, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 175110 + }, + { + "epoch": 0.6665499417644237, + "grad_norm": 0.13243572413921356, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 175120 + }, + { + "epoch": 0.6665880042325465, + "grad_norm": 0.11853521317243576, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 175130 + }, + { + "epoch": 0.6666260667006692, + "grad_norm": 0.12455012649297714, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 175140 + }, + { + "epoch": 0.6666641291687918, + "grad_norm": 0.1241413950920105, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 175150 + }, + { + "epoch": 0.6667021916369145, + "grad_norm": 0.1175464391708374, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 175160 + }, + { + "epoch": 0.6667402541050372, + "grad_norm": 0.12016557157039642, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 175170 + }, + { + "epoch": 0.6667783165731599, + "grad_norm": 0.11607896536588669, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 175180 + }, + { + "epoch": 0.6668163790412825, + "grad_norm": 0.11710387468338013, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 175190 + }, + { + "epoch": 0.6668544415094052, + "grad_norm": 0.13466401398181915, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 175200 + }, + { + "epoch": 0.666892503977528, + "grad_norm": 0.13053859770298004, + "learning_rate": 0.0005, + "loss": 2.1292, + "step": 175210 + }, + { + "epoch": 0.6669305664456506, + "grad_norm": 0.13781042397022247, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 175220 + }, + { + "epoch": 0.6669686289137733, + "grad_norm": 0.11386517435312271, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 175230 + }, + { + "epoch": 0.6670066913818959, + "grad_norm": 0.12546059489250183, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 175240 + }, + { + "epoch": 0.6670447538500186, + "grad_norm": 0.11651482433080673, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 175250 + }, + { + "epoch": 0.6670828163181414, + "grad_norm": 0.1311427354812622, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 175260 + }, + { + "epoch": 0.667120878786264, + "grad_norm": 0.1285676211118698, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 175270 + }, + { + "epoch": 0.6671589412543867, + "grad_norm": 0.11666958779096603, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 175280 + }, + { + "epoch": 0.6671970037225093, + "grad_norm": 0.12320812791585922, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 175290 + }, + { + "epoch": 0.6672350661906321, + "grad_norm": 0.14052678644657135, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 175300 + }, + { + "epoch": 0.6672731286587548, + "grad_norm": 0.12256249040365219, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 175310 + }, + { + "epoch": 0.6673111911268774, + "grad_norm": 0.13572126626968384, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 175320 + }, + { + "epoch": 0.6673492535950001, + "grad_norm": 0.13958248496055603, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 175330 + }, + { + "epoch": 0.6673873160631228, + "grad_norm": 0.12266869097948074, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 175340 + }, + { + "epoch": 0.6674253785312455, + "grad_norm": 0.11703468859195709, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 175350 + }, + { + "epoch": 0.6674634409993682, + "grad_norm": 0.12484674155712128, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 175360 + }, + { + "epoch": 0.6675015034674908, + "grad_norm": 0.20197418332099915, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 175370 + }, + { + "epoch": 0.6675395659356135, + "grad_norm": 0.11558258533477783, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 175380 + }, + { + "epoch": 0.6675776284037362, + "grad_norm": 0.11437583714723587, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 175390 + }, + { + "epoch": 0.6676156908718589, + "grad_norm": 0.11593539267778397, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 175400 + }, + { + "epoch": 0.6676537533399816, + "grad_norm": 0.13235677778720856, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 175410 + }, + { + "epoch": 0.6676918158081042, + "grad_norm": 0.13780459761619568, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 175420 + }, + { + "epoch": 0.667729878276227, + "grad_norm": 0.1119593009352684, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 175430 + }, + { + "epoch": 0.6677679407443496, + "grad_norm": 0.11851577460765839, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 175440 + }, + { + "epoch": 0.6678060032124723, + "grad_norm": 0.12241306155920029, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 175450 + }, + { + "epoch": 0.667844065680595, + "grad_norm": 0.11685778200626373, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 175460 + }, + { + "epoch": 0.6678821281487177, + "grad_norm": 0.1231299638748169, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 175470 + }, + { + "epoch": 0.6679201906168404, + "grad_norm": 0.1288386881351471, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 175480 + }, + { + "epoch": 0.667958253084963, + "grad_norm": 0.14113663136959076, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 175490 + }, + { + "epoch": 0.6679963155530857, + "grad_norm": 0.12154048681259155, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 175500 + }, + { + "epoch": 0.6680343780212084, + "grad_norm": 0.11648967117071152, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 175510 + }, + { + "epoch": 0.6680724404893311, + "grad_norm": 0.1199147179722786, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 175520 + }, + { + "epoch": 0.6681105029574538, + "grad_norm": 0.1248493567109108, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 175530 + }, + { + "epoch": 0.6681485654255764, + "grad_norm": 0.12896087765693665, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 175540 + }, + { + "epoch": 0.6681866278936991, + "grad_norm": 0.11197390407323837, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 175550 + }, + { + "epoch": 0.6682246903618219, + "grad_norm": 0.1222924143075943, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 175560 + }, + { + "epoch": 0.6682627528299445, + "grad_norm": 0.12692193686962128, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 175570 + }, + { + "epoch": 0.6683008152980672, + "grad_norm": 0.12094826251268387, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 175580 + }, + { + "epoch": 0.6683388777661898, + "grad_norm": 0.12729638814926147, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 175590 + }, + { + "epoch": 0.6683769402343126, + "grad_norm": 0.12060044705867767, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 175600 + }, + { + "epoch": 0.6684150027024353, + "grad_norm": 0.11878161132335663, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 175610 + }, + { + "epoch": 0.6684530651705579, + "grad_norm": 0.12017234414815903, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 175620 + }, + { + "epoch": 0.6684911276386806, + "grad_norm": 0.13166853785514832, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 175630 + }, + { + "epoch": 0.6685291901068033, + "grad_norm": 0.1585860550403595, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 175640 + }, + { + "epoch": 0.668567252574926, + "grad_norm": 0.12804687023162842, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 175650 + }, + { + "epoch": 0.6686053150430487, + "grad_norm": 0.121377132833004, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 175660 + }, + { + "epoch": 0.6686433775111713, + "grad_norm": 0.11736253648996353, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 175670 + }, + { + "epoch": 0.668681439979294, + "grad_norm": 0.11880221962928772, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 175680 + }, + { + "epoch": 0.6687195024474167, + "grad_norm": 0.29473286867141724, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 175690 + }, + { + "epoch": 0.6687575649155394, + "grad_norm": 0.120498426258564, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 175700 + }, + { + "epoch": 0.668795627383662, + "grad_norm": 0.13548476994037628, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 175710 + }, + { + "epoch": 0.6688336898517847, + "grad_norm": 0.12871968746185303, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 175720 + }, + { + "epoch": 0.6688717523199075, + "grad_norm": 0.1298273652791977, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 175730 + }, + { + "epoch": 0.6689098147880301, + "grad_norm": 0.14113155007362366, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 175740 + }, + { + "epoch": 0.6689478772561528, + "grad_norm": 0.11712589859962463, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 175750 + }, + { + "epoch": 0.6689859397242754, + "grad_norm": 0.12501053512096405, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 175760 + }, + { + "epoch": 0.6690240021923982, + "grad_norm": 0.13220301270484924, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 175770 + }, + { + "epoch": 0.6690620646605209, + "grad_norm": 0.13822099566459656, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 175780 + }, + { + "epoch": 0.6691001271286435, + "grad_norm": 0.11559640616178513, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 175790 + }, + { + "epoch": 0.6691381895967662, + "grad_norm": 0.12654858827590942, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 175800 + }, + { + "epoch": 0.6691762520648888, + "grad_norm": 0.12716756761074066, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 175810 + }, + { + "epoch": 0.6692143145330116, + "grad_norm": 0.1146780326962471, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 175820 + }, + { + "epoch": 0.6692523770011343, + "grad_norm": 0.13051363825798035, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 175830 + }, + { + "epoch": 0.6692904394692569, + "grad_norm": 0.4045941233634949, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 175840 + }, + { + "epoch": 0.6693285019373796, + "grad_norm": 0.12739895284175873, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 175850 + }, + { + "epoch": 0.6693665644055024, + "grad_norm": 0.12500180304050446, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 175860 + }, + { + "epoch": 0.669404626873625, + "grad_norm": 0.12794901430606842, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 175870 + }, + { + "epoch": 0.6694426893417477, + "grad_norm": 0.12256273627281189, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 175880 + }, + { + "epoch": 0.6694807518098703, + "grad_norm": 0.1283980906009674, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 175890 + }, + { + "epoch": 0.6695188142779931, + "grad_norm": 0.12353162467479706, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 175900 + }, + { + "epoch": 0.6695568767461157, + "grad_norm": 0.1328476071357727, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 175910 + }, + { + "epoch": 0.6695949392142384, + "grad_norm": 0.13343828916549683, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 175920 + }, + { + "epoch": 0.6696330016823611, + "grad_norm": 0.1255650371313095, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 175930 + }, + { + "epoch": 0.6696710641504837, + "grad_norm": 0.13649240136146545, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 175940 + }, + { + "epoch": 0.6697091266186065, + "grad_norm": 0.12300781905651093, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 175950 + }, + { + "epoch": 0.6697471890867291, + "grad_norm": 0.1329295039176941, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 175960 + }, + { + "epoch": 0.6697852515548518, + "grad_norm": 0.12757021188735962, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 175970 + }, + { + "epoch": 0.6698233140229745, + "grad_norm": 0.11467017978429794, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 175980 + }, + { + "epoch": 0.6698613764910972, + "grad_norm": 0.1320362240076065, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 175990 + }, + { + "epoch": 0.6698994389592199, + "grad_norm": 0.11636713147163391, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 176000 + }, + { + "epoch": 0.6699375014273425, + "grad_norm": 0.11335117369890213, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 176010 + }, + { + "epoch": 0.6699755638954652, + "grad_norm": 0.12051542848348618, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 176020 + }, + { + "epoch": 0.670013626363588, + "grad_norm": 0.11330889165401459, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 176030 + }, + { + "epoch": 0.6700516888317106, + "grad_norm": 0.11903239041566849, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 176040 + }, + { + "epoch": 0.6700897512998333, + "grad_norm": 0.12131945043802261, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 176050 + }, + { + "epoch": 0.6701278137679559, + "grad_norm": 0.1258198618888855, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 176060 + }, + { + "epoch": 0.6701658762360787, + "grad_norm": 0.12380845844745636, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 176070 + }, + { + "epoch": 0.6702039387042014, + "grad_norm": 0.11659497767686844, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 176080 + }, + { + "epoch": 0.670242001172324, + "grad_norm": 0.12323492020368576, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 176090 + }, + { + "epoch": 0.6702800636404467, + "grad_norm": 0.1283530592918396, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 176100 + }, + { + "epoch": 0.6703181261085693, + "grad_norm": 0.12053488940000534, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 176110 + }, + { + "epoch": 0.6703561885766921, + "grad_norm": 0.12956029176712036, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 176120 + }, + { + "epoch": 0.6703942510448148, + "grad_norm": 0.13247571885585785, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 176130 + }, + { + "epoch": 0.6704323135129374, + "grad_norm": 0.11778081953525543, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 176140 + }, + { + "epoch": 0.6704703759810601, + "grad_norm": 0.11774902790784836, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 176150 + }, + { + "epoch": 0.6705084384491828, + "grad_norm": 0.12733864784240723, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 176160 + }, + { + "epoch": 0.6705465009173055, + "grad_norm": 0.12119722366333008, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 176170 + }, + { + "epoch": 0.6705845633854282, + "grad_norm": 0.1252257525920868, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 176180 + }, + { + "epoch": 0.6706226258535508, + "grad_norm": 0.1208445280790329, + "learning_rate": 0.0005, + "loss": 2.1275, + "step": 176190 + }, + { + "epoch": 0.6706606883216736, + "grad_norm": 0.14327453076839447, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 176200 + }, + { + "epoch": 0.6706987507897962, + "grad_norm": 0.1284928023815155, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 176210 + }, + { + "epoch": 0.6707368132579189, + "grad_norm": 0.1296626329421997, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 176220 + }, + { + "epoch": 0.6707748757260416, + "grad_norm": 0.14004792273044586, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 176230 + }, + { + "epoch": 0.6708129381941642, + "grad_norm": 0.1218409538269043, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 176240 + }, + { + "epoch": 0.670851000662287, + "grad_norm": 0.12494488805532455, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 176250 + }, + { + "epoch": 0.6708890631304096, + "grad_norm": 0.1340954452753067, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 176260 + }, + { + "epoch": 0.6709271255985323, + "grad_norm": 0.13070081174373627, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 176270 + }, + { + "epoch": 0.670965188066655, + "grad_norm": 0.1265803575515747, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 176280 + }, + { + "epoch": 0.6710032505347777, + "grad_norm": 0.1216389462351799, + "learning_rate": 0.0005, + "loss": 2.1369, + "step": 176290 + }, + { + "epoch": 0.6710413130029004, + "grad_norm": 0.12888644635677338, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 176300 + }, + { + "epoch": 0.671079375471023, + "grad_norm": 0.12916290760040283, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 176310 + }, + { + "epoch": 0.6711174379391457, + "grad_norm": 0.12547896802425385, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 176320 + }, + { + "epoch": 0.6711555004072685, + "grad_norm": 0.122887521982193, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 176330 + }, + { + "epoch": 0.6711935628753911, + "grad_norm": 0.13145895302295685, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 176340 + }, + { + "epoch": 0.6712316253435138, + "grad_norm": 0.11573679000139236, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 176350 + }, + { + "epoch": 0.6712696878116364, + "grad_norm": 0.12110036611557007, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 176360 + }, + { + "epoch": 0.6713077502797591, + "grad_norm": 0.1120925024151802, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 176370 + }, + { + "epoch": 0.6713458127478819, + "grad_norm": 0.12430943548679352, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 176380 + }, + { + "epoch": 0.6713838752160045, + "grad_norm": 0.11478005349636078, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 176390 + }, + { + "epoch": 0.6714219376841272, + "grad_norm": 0.12330004572868347, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 176400 + }, + { + "epoch": 0.6714600001522498, + "grad_norm": 0.13458773493766785, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 176410 + }, + { + "epoch": 0.6714980626203726, + "grad_norm": 0.12437058240175247, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 176420 + }, + { + "epoch": 0.6715361250884953, + "grad_norm": 0.1219138354063034, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 176430 + }, + { + "epoch": 0.6715741875566179, + "grad_norm": 0.12024988234043121, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 176440 + }, + { + "epoch": 0.6716122500247406, + "grad_norm": 0.12701474130153656, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 176450 + }, + { + "epoch": 0.6716503124928633, + "grad_norm": 0.13415849208831787, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 176460 + }, + { + "epoch": 0.671688374960986, + "grad_norm": 0.23941706120967865, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 176470 + }, + { + "epoch": 0.6717264374291086, + "grad_norm": 0.12130671739578247, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 176480 + }, + { + "epoch": 0.6717644998972313, + "grad_norm": 0.12588223814964294, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 176490 + }, + { + "epoch": 0.6718025623653541, + "grad_norm": 0.12544754147529602, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 176500 + }, + { + "epoch": 0.6718406248334767, + "grad_norm": 0.13205012679100037, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 176510 + }, + { + "epoch": 0.6718786873015994, + "grad_norm": 0.12384752929210663, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 176520 + }, + { + "epoch": 0.671916749769722, + "grad_norm": 0.12456027418375015, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 176530 + }, + { + "epoch": 0.6719548122378447, + "grad_norm": 0.12547491490840912, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 176540 + }, + { + "epoch": 0.6719928747059675, + "grad_norm": 0.12561234831809998, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 176550 + }, + { + "epoch": 0.6720309371740901, + "grad_norm": 0.11695345491170883, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 176560 + }, + { + "epoch": 0.6720689996422128, + "grad_norm": 0.11866967380046844, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 176570 + }, + { + "epoch": 0.6721070621103354, + "grad_norm": 0.12426942586898804, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 176580 + }, + { + "epoch": 0.6721451245784582, + "grad_norm": 0.12986068427562714, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 176590 + }, + { + "epoch": 0.6721831870465809, + "grad_norm": 0.13200722634792328, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 176600 + }, + { + "epoch": 0.6722212495147035, + "grad_norm": 0.12788031995296478, + "learning_rate": 0.0005, + "loss": 2.0862, + "step": 176610 + }, + { + "epoch": 0.6722593119828262, + "grad_norm": 0.12382833659648895, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 176620 + }, + { + "epoch": 0.672297374450949, + "grad_norm": 0.12163611501455307, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 176630 + }, + { + "epoch": 0.6723354369190716, + "grad_norm": 0.13965623080730438, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 176640 + }, + { + "epoch": 0.6723734993871943, + "grad_norm": 0.11982572823762894, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 176650 + }, + { + "epoch": 0.6724115618553169, + "grad_norm": 0.12545208632946014, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 176660 + }, + { + "epoch": 0.6724496243234396, + "grad_norm": 0.11891097575426102, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 176670 + }, + { + "epoch": 0.6724876867915623, + "grad_norm": 0.11695777624845505, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 176680 + }, + { + "epoch": 0.672525749259685, + "grad_norm": 0.12013732641935349, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 176690 + }, + { + "epoch": 0.6725638117278077, + "grad_norm": 0.12866584956645966, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 176700 + }, + { + "epoch": 0.6726018741959303, + "grad_norm": 0.11633819341659546, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 176710 + }, + { + "epoch": 0.6726399366640531, + "grad_norm": 0.12568669021129608, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 176720 + }, + { + "epoch": 0.6726779991321757, + "grad_norm": 0.12346602976322174, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 176730 + }, + { + "epoch": 0.6727160616002984, + "grad_norm": 0.1353200525045395, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 176740 + }, + { + "epoch": 0.672754124068421, + "grad_norm": 0.1326470673084259, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 176750 + }, + { + "epoch": 0.6727921865365438, + "grad_norm": 0.12330491840839386, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 176760 + }, + { + "epoch": 0.6728302490046665, + "grad_norm": 0.11250167340040207, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 176770 + }, + { + "epoch": 0.6728683114727891, + "grad_norm": 0.13047778606414795, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 176780 + }, + { + "epoch": 0.6729063739409118, + "grad_norm": 0.12224044650793076, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 176790 + }, + { + "epoch": 0.6729444364090345, + "grad_norm": 0.11687453091144562, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 176800 + }, + { + "epoch": 0.6729824988771572, + "grad_norm": 0.12105058878660202, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 176810 + }, + { + "epoch": 0.6730205613452799, + "grad_norm": 0.13791713118553162, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 176820 + }, + { + "epoch": 0.6730586238134025, + "grad_norm": 0.14121802151203156, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 176830 + }, + { + "epoch": 0.6730966862815252, + "grad_norm": 0.11477218568325043, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 176840 + }, + { + "epoch": 0.673134748749648, + "grad_norm": 0.11514034867286682, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 176850 + }, + { + "epoch": 0.6731728112177706, + "grad_norm": 0.13139691948890686, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 176860 + }, + { + "epoch": 0.6732108736858933, + "grad_norm": 0.13228735327720642, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 176870 + }, + { + "epoch": 0.6732489361540159, + "grad_norm": 0.1252131164073944, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 176880 + }, + { + "epoch": 0.6732869986221387, + "grad_norm": 0.11720944195985794, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 176890 + }, + { + "epoch": 0.6733250610902614, + "grad_norm": 0.13159362971782684, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 176900 + }, + { + "epoch": 0.673363123558384, + "grad_norm": 0.13194140791893005, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 176910 + }, + { + "epoch": 0.6734011860265067, + "grad_norm": 0.125912606716156, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 176920 + }, + { + "epoch": 0.6734392484946294, + "grad_norm": 0.12334372103214264, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 176930 + }, + { + "epoch": 0.6734773109627521, + "grad_norm": 0.11493158340454102, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 176940 + }, + { + "epoch": 0.6735153734308748, + "grad_norm": 0.11860795319080353, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 176950 + }, + { + "epoch": 0.6735534358989974, + "grad_norm": 0.1266835480928421, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 176960 + }, + { + "epoch": 0.6735914983671201, + "grad_norm": 0.1444019377231598, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 176970 + }, + { + "epoch": 0.6736295608352428, + "grad_norm": 0.12035851180553436, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 176980 + }, + { + "epoch": 0.6736676233033655, + "grad_norm": 0.11523926258087158, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 176990 + }, + { + "epoch": 0.6737056857714881, + "grad_norm": 0.12259594351053238, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 177000 + }, + { + "epoch": 0.6737437482396108, + "grad_norm": 0.1452437937259674, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 177010 + }, + { + "epoch": 0.6737818107077336, + "grad_norm": 0.12240692973136902, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 177020 + }, + { + "epoch": 0.6738198731758562, + "grad_norm": 0.11802651733160019, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 177030 + }, + { + "epoch": 0.6738579356439789, + "grad_norm": 0.13415051996707916, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 177040 + }, + { + "epoch": 0.6738959981121015, + "grad_norm": 0.11771440505981445, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 177050 + }, + { + "epoch": 0.6739340605802243, + "grad_norm": 0.11489039659500122, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 177060 + }, + { + "epoch": 0.673972123048347, + "grad_norm": 0.1397712081670761, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 177070 + }, + { + "epoch": 0.6740101855164696, + "grad_norm": 0.12139667570590973, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 177080 + }, + { + "epoch": 0.6740482479845923, + "grad_norm": 0.12221745401620865, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 177090 + }, + { + "epoch": 0.6740863104527149, + "grad_norm": 0.12911903858184814, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 177100 + }, + { + "epoch": 0.6741243729208377, + "grad_norm": 0.123677559196949, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 177110 + }, + { + "epoch": 0.6741624353889604, + "grad_norm": 0.12174209207296371, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 177120 + }, + { + "epoch": 0.674200497857083, + "grad_norm": 0.11167836934328079, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 177130 + }, + { + "epoch": 0.6742385603252057, + "grad_norm": 0.12730008363723755, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 177140 + }, + { + "epoch": 0.6742766227933285, + "grad_norm": 0.11940035223960876, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 177150 + }, + { + "epoch": 0.6743146852614511, + "grad_norm": 0.11655905097723007, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 177160 + }, + { + "epoch": 0.6743527477295738, + "grad_norm": 0.12851646542549133, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 177170 + }, + { + "epoch": 0.6743908101976964, + "grad_norm": 0.11784732341766357, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 177180 + }, + { + "epoch": 0.6744288726658192, + "grad_norm": 0.14028826355934143, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 177190 + }, + { + "epoch": 0.6744669351339418, + "grad_norm": 0.131543830037117, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 177200 + }, + { + "epoch": 0.6745049976020645, + "grad_norm": 0.16741855442523956, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 177210 + }, + { + "epoch": 0.6745430600701872, + "grad_norm": 0.14125947654247284, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 177220 + }, + { + "epoch": 0.6745811225383099, + "grad_norm": 0.1236991360783577, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 177230 + }, + { + "epoch": 0.6746191850064326, + "grad_norm": 0.11668677628040314, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 177240 + }, + { + "epoch": 0.6746572474745552, + "grad_norm": 0.1334819495677948, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 177250 + }, + { + "epoch": 0.6746953099426779, + "grad_norm": 0.12094828486442566, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 177260 + }, + { + "epoch": 0.6747333724108006, + "grad_norm": 0.14832857251167297, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 177270 + }, + { + "epoch": 0.6747714348789233, + "grad_norm": 0.15081220865249634, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 177280 + }, + { + "epoch": 0.674809497347046, + "grad_norm": 0.12344935536384583, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 177290 + }, + { + "epoch": 0.6748475598151686, + "grad_norm": 0.13707692921161652, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 177300 + }, + { + "epoch": 0.6748856222832913, + "grad_norm": 0.11331111192703247, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 177310 + }, + { + "epoch": 0.6749236847514141, + "grad_norm": 0.13219937682151794, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 177320 + }, + { + "epoch": 0.6749617472195367, + "grad_norm": 0.12428843975067139, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 177330 + }, + { + "epoch": 0.6749998096876594, + "grad_norm": 0.1201782301068306, + "learning_rate": 0.0005, + "loss": 2.0849, + "step": 177340 + }, + { + "epoch": 0.675037872155782, + "grad_norm": 0.13102507591247559, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 177350 + }, + { + "epoch": 0.6750759346239048, + "grad_norm": 0.12345290929079056, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 177360 + }, + { + "epoch": 0.6751139970920275, + "grad_norm": 0.11714234948158264, + "learning_rate": 0.0005, + "loss": 2.0884, + "step": 177370 + }, + { + "epoch": 0.6751520595601501, + "grad_norm": 0.11517336964607239, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 177380 + }, + { + "epoch": 0.6751901220282728, + "grad_norm": 0.11558566987514496, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 177390 + }, + { + "epoch": 0.6752281844963954, + "grad_norm": 0.12705738842487335, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 177400 + }, + { + "epoch": 0.6752662469645182, + "grad_norm": 0.12877146899700165, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 177410 + }, + { + "epoch": 0.6753043094326409, + "grad_norm": 0.12679651379585266, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 177420 + }, + { + "epoch": 0.6753423719007635, + "grad_norm": 0.1234024167060852, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 177430 + }, + { + "epoch": 0.6753804343688862, + "grad_norm": 0.12910224497318268, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 177440 + }, + { + "epoch": 0.6754184968370089, + "grad_norm": 0.12047523260116577, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 177450 + }, + { + "epoch": 0.6754565593051316, + "grad_norm": 0.13041920959949493, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 177460 + }, + { + "epoch": 0.6754946217732543, + "grad_norm": 0.1203002780675888, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 177470 + }, + { + "epoch": 0.6755326842413769, + "grad_norm": 0.114230215549469, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 177480 + }, + { + "epoch": 0.6755707467094997, + "grad_norm": 0.12507209181785583, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 177490 + }, + { + "epoch": 0.6756088091776223, + "grad_norm": 0.13496264815330505, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 177500 + }, + { + "epoch": 0.675646871645745, + "grad_norm": 0.14469777047634125, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 177510 + }, + { + "epoch": 0.6756849341138677, + "grad_norm": 0.12933559715747833, + "learning_rate": 0.0005, + "loss": 2.1349, + "step": 177520 + }, + { + "epoch": 0.6757229965819903, + "grad_norm": 0.1213747188448906, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 177530 + }, + { + "epoch": 0.6757610590501131, + "grad_norm": 0.1251504123210907, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 177540 + }, + { + "epoch": 0.6757991215182357, + "grad_norm": 0.13115982711315155, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 177550 + }, + { + "epoch": 0.6758371839863584, + "grad_norm": 0.12193156778812408, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 177560 + }, + { + "epoch": 0.675875246454481, + "grad_norm": 0.11421877890825272, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 177570 + }, + { + "epoch": 0.6759133089226038, + "grad_norm": 0.11429056525230408, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 177580 + }, + { + "epoch": 0.6759513713907265, + "grad_norm": 0.11777948588132858, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 177590 + }, + { + "epoch": 0.6759894338588491, + "grad_norm": 0.1239769235253334, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 177600 + }, + { + "epoch": 0.6760274963269718, + "grad_norm": 0.11587876081466675, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 177610 + }, + { + "epoch": 0.6760655587950946, + "grad_norm": 0.11251964420080185, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 177620 + }, + { + "epoch": 0.6761036212632172, + "grad_norm": 0.13177922368049622, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 177630 + }, + { + "epoch": 0.6761416837313399, + "grad_norm": 0.12955419719219208, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 177640 + }, + { + "epoch": 0.6761797461994625, + "grad_norm": 0.12349336594343185, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 177650 + }, + { + "epoch": 0.6762178086675853, + "grad_norm": 0.1187133714556694, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 177660 + }, + { + "epoch": 0.676255871135708, + "grad_norm": 0.11961386352777481, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 177670 + }, + { + "epoch": 0.6762939336038306, + "grad_norm": 0.12203500419855118, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 177680 + }, + { + "epoch": 0.6763319960719533, + "grad_norm": 0.10937876999378204, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 177690 + }, + { + "epoch": 0.6763700585400759, + "grad_norm": 0.12402454763650894, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 177700 + }, + { + "epoch": 0.6764081210081987, + "grad_norm": 0.12096264958381653, + "learning_rate": 0.0005, + "loss": 2.1309, + "step": 177710 + }, + { + "epoch": 0.6764461834763213, + "grad_norm": 0.11508809030056, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 177720 + }, + { + "epoch": 0.676484245944444, + "grad_norm": 0.120554119348526, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 177730 + }, + { + "epoch": 0.6765223084125667, + "grad_norm": 0.14930492639541626, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 177740 + }, + { + "epoch": 0.6765603708806894, + "grad_norm": 0.12888970971107483, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 177750 + }, + { + "epoch": 0.6765984333488121, + "grad_norm": 0.11661666631698608, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 177760 + }, + { + "epoch": 0.6766364958169347, + "grad_norm": 0.1314840316772461, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 177770 + }, + { + "epoch": 0.6766745582850574, + "grad_norm": 0.1238323450088501, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 177780 + }, + { + "epoch": 0.6767126207531802, + "grad_norm": 0.12378830462694168, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 177790 + }, + { + "epoch": 0.6767506832213028, + "grad_norm": 0.12708117067813873, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 177800 + }, + { + "epoch": 0.6767887456894255, + "grad_norm": 0.1207418218255043, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 177810 + }, + { + "epoch": 0.6768268081575481, + "grad_norm": 0.12296409159898758, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 177820 + }, + { + "epoch": 0.6768648706256708, + "grad_norm": 0.1191890686750412, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 177830 + }, + { + "epoch": 0.6769029330937936, + "grad_norm": 0.1247776448726654, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 177840 + }, + { + "epoch": 0.6769409955619162, + "grad_norm": 0.11682870239019394, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 177850 + }, + { + "epoch": 0.6769790580300389, + "grad_norm": 0.11881222575902939, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 177860 + }, + { + "epoch": 0.6770171204981615, + "grad_norm": 0.1070871651172638, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 177870 + }, + { + "epoch": 0.6770551829662843, + "grad_norm": 0.12376029789447784, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 177880 + }, + { + "epoch": 0.677093245434407, + "grad_norm": 0.12242919206619263, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 177890 + }, + { + "epoch": 0.6771313079025296, + "grad_norm": 0.12933039665222168, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 177900 + }, + { + "epoch": 0.6771693703706523, + "grad_norm": 0.1156318187713623, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 177910 + }, + { + "epoch": 0.677207432838775, + "grad_norm": 0.11402034759521484, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 177920 + }, + { + "epoch": 0.6772454953068977, + "grad_norm": 0.12201918661594391, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 177930 + }, + { + "epoch": 0.6772835577750204, + "grad_norm": 0.12645022571086884, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 177940 + }, + { + "epoch": 0.677321620243143, + "grad_norm": 0.12442043423652649, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 177950 + }, + { + "epoch": 0.6773596827112657, + "grad_norm": 0.1325632780790329, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 177960 + }, + { + "epoch": 0.6773977451793884, + "grad_norm": 0.1344526708126068, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 177970 + }, + { + "epoch": 0.6774358076475111, + "grad_norm": 0.13039939105510712, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 177980 + }, + { + "epoch": 0.6774738701156338, + "grad_norm": 0.13478504121303558, + "learning_rate": 0.0005, + "loss": 2.0844, + "step": 177990 + }, + { + "epoch": 0.6775119325837564, + "grad_norm": 0.11849367618560791, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 178000 + }, + { + "epoch": 0.6775499950518792, + "grad_norm": 0.13681115210056305, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 178010 + }, + { + "epoch": 0.6775880575200018, + "grad_norm": 0.12495207786560059, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 178020 + }, + { + "epoch": 0.6776261199881245, + "grad_norm": 0.11878683418035507, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 178030 + }, + { + "epoch": 0.6776641824562472, + "grad_norm": 0.11636865884065628, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 178040 + }, + { + "epoch": 0.6777022449243699, + "grad_norm": 0.11892195045948029, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 178050 + }, + { + "epoch": 0.6777403073924926, + "grad_norm": 0.11692816019058228, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 178060 + }, + { + "epoch": 0.6777783698606152, + "grad_norm": 0.13492970168590546, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 178070 + }, + { + "epoch": 0.6778164323287379, + "grad_norm": 0.13853606581687927, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 178080 + }, + { + "epoch": 0.6778544947968607, + "grad_norm": 0.127889484167099, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 178090 + }, + { + "epoch": 0.6778925572649833, + "grad_norm": 0.12552450597286224, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 178100 + }, + { + "epoch": 0.677930619733106, + "grad_norm": 0.13038252294063568, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 178110 + }, + { + "epoch": 0.6779686822012286, + "grad_norm": 0.12474310398101807, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 178120 + }, + { + "epoch": 0.6780067446693513, + "grad_norm": 0.1274314820766449, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 178130 + }, + { + "epoch": 0.6780448071374741, + "grad_norm": 0.12522082030773163, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 178140 + }, + { + "epoch": 0.6780828696055967, + "grad_norm": 0.11199326813220978, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 178150 + }, + { + "epoch": 0.6781209320737194, + "grad_norm": 0.11414346843957901, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 178160 + }, + { + "epoch": 0.678158994541842, + "grad_norm": 0.1230161264538765, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 178170 + }, + { + "epoch": 0.6781970570099648, + "grad_norm": 0.11851771920919418, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 178180 + }, + { + "epoch": 0.6782351194780875, + "grad_norm": 0.12614396214485168, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 178190 + }, + { + "epoch": 0.6782731819462101, + "grad_norm": 0.11604657024145126, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 178200 + }, + { + "epoch": 0.6783112444143328, + "grad_norm": 0.11043967306613922, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 178210 + }, + { + "epoch": 0.6783493068824555, + "grad_norm": 0.13001075387001038, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 178220 + }, + { + "epoch": 0.6783873693505782, + "grad_norm": 0.1245742067694664, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 178230 + }, + { + "epoch": 0.6784254318187009, + "grad_norm": 0.1329800933599472, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 178240 + }, + { + "epoch": 0.6784634942868235, + "grad_norm": 0.12265625596046448, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 178250 + }, + { + "epoch": 0.6785015567549462, + "grad_norm": 0.1359381377696991, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 178260 + }, + { + "epoch": 0.6785396192230689, + "grad_norm": 0.13058646023273468, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 178270 + }, + { + "epoch": 0.6785776816911916, + "grad_norm": 0.12857820093631744, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 178280 + }, + { + "epoch": 0.6786157441593142, + "grad_norm": 0.12578581273555756, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 178290 + }, + { + "epoch": 0.6786538066274369, + "grad_norm": 0.136888325214386, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 178300 + }, + { + "epoch": 0.6786918690955597, + "grad_norm": 0.12314784526824951, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 178310 + }, + { + "epoch": 0.6787299315636823, + "grad_norm": 0.11737371981143951, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 178320 + }, + { + "epoch": 0.678767994031805, + "grad_norm": 0.1219782680273056, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 178330 + }, + { + "epoch": 0.6788060564999276, + "grad_norm": 0.12800832092761993, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 178340 + }, + { + "epoch": 0.6788441189680504, + "grad_norm": 0.12492241710424423, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 178350 + }, + { + "epoch": 0.6788821814361731, + "grad_norm": 0.1212736964225769, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 178360 + }, + { + "epoch": 0.6789202439042957, + "grad_norm": 0.12892039120197296, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 178370 + }, + { + "epoch": 0.6789583063724184, + "grad_norm": 0.13535402715206146, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 178380 + }, + { + "epoch": 0.678996368840541, + "grad_norm": 0.12345527857542038, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 178390 + }, + { + "epoch": 0.6790344313086638, + "grad_norm": 0.12075518816709518, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 178400 + }, + { + "epoch": 0.6790724937767865, + "grad_norm": 0.11288055032491684, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 178410 + }, + { + "epoch": 0.6791105562449091, + "grad_norm": 0.12587016820907593, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 178420 + }, + { + "epoch": 0.6791486187130318, + "grad_norm": 0.13511881232261658, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 178430 + }, + { + "epoch": 0.6791866811811546, + "grad_norm": 0.12678687274456024, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 178440 + }, + { + "epoch": 0.6792247436492772, + "grad_norm": 0.11422378569841385, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 178450 + }, + { + "epoch": 0.6792628061173999, + "grad_norm": 0.1303092986345291, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 178460 + }, + { + "epoch": 0.6793008685855225, + "grad_norm": 0.11457906663417816, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 178470 + }, + { + "epoch": 0.6793389310536453, + "grad_norm": 0.12561039626598358, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 178480 + }, + { + "epoch": 0.679376993521768, + "grad_norm": 0.11544699966907501, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 178490 + }, + { + "epoch": 0.6794150559898906, + "grad_norm": 0.13715773820877075, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 178500 + }, + { + "epoch": 0.6794531184580133, + "grad_norm": 0.1227487102150917, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 178510 + }, + { + "epoch": 0.679491180926136, + "grad_norm": 0.11637184768915176, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 178520 + }, + { + "epoch": 0.6795292433942587, + "grad_norm": 0.11330291628837585, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 178530 + }, + { + "epoch": 0.6795673058623813, + "grad_norm": 0.12525829672813416, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 178540 + }, + { + "epoch": 0.679605368330504, + "grad_norm": 0.11803308129310608, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 178550 + }, + { + "epoch": 0.6796434307986267, + "grad_norm": 0.12072555720806122, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 178560 + }, + { + "epoch": 0.6796814932667494, + "grad_norm": 0.12317853420972824, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 178570 + }, + { + "epoch": 0.6797195557348721, + "grad_norm": 0.1288745105266571, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 178580 + }, + { + "epoch": 0.6797576182029947, + "grad_norm": 0.13706010580062866, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 178590 + }, + { + "epoch": 0.6797956806711174, + "grad_norm": 0.13190880417823792, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 178600 + }, + { + "epoch": 0.6798337431392402, + "grad_norm": 0.13624754548072815, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 178610 + }, + { + "epoch": 0.6798718056073628, + "grad_norm": 0.14656874537467957, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 178620 + }, + { + "epoch": 0.6799098680754855, + "grad_norm": 0.11314431577920914, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 178630 + }, + { + "epoch": 0.6799479305436081, + "grad_norm": 0.11898855865001678, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 178640 + }, + { + "epoch": 0.6799859930117309, + "grad_norm": 0.11646313220262527, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 178650 + }, + { + "epoch": 0.6800240554798536, + "grad_norm": 0.12358859926462173, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 178660 + }, + { + "epoch": 0.6800621179479762, + "grad_norm": 0.12150691449642181, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 178670 + }, + { + "epoch": 0.6801001804160989, + "grad_norm": 0.15535567700862885, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 178680 + }, + { + "epoch": 0.6801382428842215, + "grad_norm": 0.13010378181934357, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 178690 + }, + { + "epoch": 0.6801763053523443, + "grad_norm": 0.1207583099603653, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 178700 + }, + { + "epoch": 0.680214367820467, + "grad_norm": 0.12940700352191925, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 178710 + }, + { + "epoch": 0.6802524302885896, + "grad_norm": 0.11882823705673218, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 178720 + }, + { + "epoch": 0.6802904927567123, + "grad_norm": 0.12433478981256485, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 178730 + }, + { + "epoch": 0.680328555224835, + "grad_norm": 0.12496747821569443, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 178740 + }, + { + "epoch": 0.6803666176929577, + "grad_norm": 0.12532317638397217, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 178750 + }, + { + "epoch": 0.6804046801610804, + "grad_norm": 0.11943234503269196, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 178760 + }, + { + "epoch": 0.680442742629203, + "grad_norm": 0.1301838904619217, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 178770 + }, + { + "epoch": 0.6804808050973258, + "grad_norm": 0.11897633224725723, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 178780 + }, + { + "epoch": 0.6805188675654484, + "grad_norm": 0.1269768923521042, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 178790 + }, + { + "epoch": 0.6805569300335711, + "grad_norm": 0.12773433327674866, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 178800 + }, + { + "epoch": 0.6805949925016938, + "grad_norm": 0.1303984671831131, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 178810 + }, + { + "epoch": 0.6806330549698164, + "grad_norm": 0.11902695149183273, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 178820 + }, + { + "epoch": 0.6806711174379392, + "grad_norm": 0.12053973972797394, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 178830 + }, + { + "epoch": 0.6807091799060618, + "grad_norm": 0.12673945724964142, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 178840 + }, + { + "epoch": 0.6807472423741845, + "grad_norm": 0.127527117729187, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 178850 + }, + { + "epoch": 0.6807853048423071, + "grad_norm": 0.12394998222589493, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 178860 + }, + { + "epoch": 0.6808233673104299, + "grad_norm": 0.1310361623764038, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 178870 + }, + { + "epoch": 0.6808614297785526, + "grad_norm": 0.12271669507026672, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 178880 + }, + { + "epoch": 0.6808994922466752, + "grad_norm": 0.11129756271839142, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 178890 + }, + { + "epoch": 0.6809375547147979, + "grad_norm": 0.12341821938753128, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 178900 + }, + { + "epoch": 0.6809756171829207, + "grad_norm": 0.11532428115606308, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 178910 + }, + { + "epoch": 0.6810136796510433, + "grad_norm": 0.12454450130462646, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 178920 + }, + { + "epoch": 0.681051742119166, + "grad_norm": 0.1244184821844101, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 178930 + }, + { + "epoch": 0.6810898045872886, + "grad_norm": 0.11621390283107758, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 178940 + }, + { + "epoch": 0.6811278670554114, + "grad_norm": 0.1157296672463417, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 178950 + }, + { + "epoch": 0.681165929523534, + "grad_norm": 0.11904320865869522, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 178960 + }, + { + "epoch": 0.6812039919916567, + "grad_norm": 0.1200244352221489, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 178970 + }, + { + "epoch": 0.6812420544597794, + "grad_norm": 0.1374550759792328, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 178980 + }, + { + "epoch": 0.681280116927902, + "grad_norm": 0.1327461302280426, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 178990 + }, + { + "epoch": 0.6813181793960248, + "grad_norm": 0.12066030502319336, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 179000 + }, + { + "epoch": 0.6813562418641474, + "grad_norm": 0.11714760959148407, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 179010 + }, + { + "epoch": 0.6813943043322701, + "grad_norm": 0.1339813768863678, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 179020 + }, + { + "epoch": 0.6814323668003928, + "grad_norm": 0.1168723851442337, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 179030 + }, + { + "epoch": 0.6814704292685155, + "grad_norm": 0.12488270550966263, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 179040 + }, + { + "epoch": 0.6815084917366382, + "grad_norm": 0.1275583803653717, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 179050 + }, + { + "epoch": 0.6815465542047608, + "grad_norm": 0.11426876485347748, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 179060 + }, + { + "epoch": 0.6815846166728835, + "grad_norm": 0.12208835780620575, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 179070 + }, + { + "epoch": 0.6816226791410063, + "grad_norm": 0.12177817523479462, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 179080 + }, + { + "epoch": 0.6816607416091289, + "grad_norm": 0.1255825310945511, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 179090 + }, + { + "epoch": 0.6816988040772516, + "grad_norm": 0.14138972759246826, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 179100 + }, + { + "epoch": 0.6817368665453742, + "grad_norm": 0.12741991877555847, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 179110 + }, + { + "epoch": 0.6817749290134969, + "grad_norm": 0.1334943175315857, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 179120 + }, + { + "epoch": 0.6818129914816197, + "grad_norm": 0.12393660098314285, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 179130 + }, + { + "epoch": 0.6818510539497423, + "grad_norm": 0.11239521205425262, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 179140 + }, + { + "epoch": 0.681889116417865, + "grad_norm": 0.11759033799171448, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 179150 + }, + { + "epoch": 0.6819271788859876, + "grad_norm": 0.11990151554346085, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 179160 + }, + { + "epoch": 0.6819652413541104, + "grad_norm": 0.11784869432449341, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 179170 + }, + { + "epoch": 0.6820033038222331, + "grad_norm": 0.12301430851221085, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 179180 + }, + { + "epoch": 0.6820413662903557, + "grad_norm": 0.12320394814014435, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 179190 + }, + { + "epoch": 0.6820794287584784, + "grad_norm": 0.13769547641277313, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 179200 + }, + { + "epoch": 0.6821174912266011, + "grad_norm": 0.13465788960456848, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 179210 + }, + { + "epoch": 0.6821555536947238, + "grad_norm": 0.1108383983373642, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 179220 + }, + { + "epoch": 0.6821936161628465, + "grad_norm": 0.12412822991609573, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 179230 + }, + { + "epoch": 0.6822316786309691, + "grad_norm": 0.12183883041143417, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 179240 + }, + { + "epoch": 0.6822697410990918, + "grad_norm": 0.12129288166761398, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 179250 + }, + { + "epoch": 0.6823078035672145, + "grad_norm": 0.12597280740737915, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 179260 + }, + { + "epoch": 0.6823458660353372, + "grad_norm": 0.11562073230743408, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 179270 + }, + { + "epoch": 0.6823839285034599, + "grad_norm": 0.1106618121266365, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 179280 + }, + { + "epoch": 0.6824219909715825, + "grad_norm": 0.128122478723526, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 179290 + }, + { + "epoch": 0.6824600534397053, + "grad_norm": 0.14943507313728333, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 179300 + }, + { + "epoch": 0.6824981159078279, + "grad_norm": 0.11820433288812637, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 179310 + }, + { + "epoch": 0.6825361783759506, + "grad_norm": 0.11343543976545334, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 179320 + }, + { + "epoch": 0.6825742408440733, + "grad_norm": 0.9048527479171753, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 179330 + }, + { + "epoch": 0.682612303312196, + "grad_norm": 0.12243762612342834, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 179340 + }, + { + "epoch": 0.6826503657803187, + "grad_norm": 0.11322756856679916, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 179350 + }, + { + "epoch": 0.6826884282484413, + "grad_norm": 0.11842131614685059, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 179360 + }, + { + "epoch": 0.682726490716564, + "grad_norm": 0.11658118665218353, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 179370 + }, + { + "epoch": 0.6827645531846868, + "grad_norm": 0.11913055926561356, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 179380 + }, + { + "epoch": 0.6828026156528094, + "grad_norm": 0.11381657421588898, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 179390 + }, + { + "epoch": 0.6828406781209321, + "grad_norm": 0.11231932044029236, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 179400 + }, + { + "epoch": 0.6828787405890547, + "grad_norm": 0.12924441695213318, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 179410 + }, + { + "epoch": 0.6829168030571774, + "grad_norm": 0.11670733988285065, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 179420 + }, + { + "epoch": 0.6829548655253002, + "grad_norm": 0.11556648463010788, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 179430 + }, + { + "epoch": 0.6829929279934228, + "grad_norm": 0.1284991353750229, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 179440 + }, + { + "epoch": 0.6830309904615455, + "grad_norm": 0.12964291870594025, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 179450 + }, + { + "epoch": 0.6830690529296681, + "grad_norm": 0.1216454803943634, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 179460 + }, + { + "epoch": 0.6831071153977909, + "grad_norm": 0.14056138694286346, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 179470 + }, + { + "epoch": 0.6831451778659136, + "grad_norm": 0.1397508829832077, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 179480 + }, + { + "epoch": 0.6831832403340362, + "grad_norm": 0.12131225317716599, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 179490 + }, + { + "epoch": 0.6832213028021589, + "grad_norm": 0.11590532958507538, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 179500 + }, + { + "epoch": 0.6832593652702816, + "grad_norm": 0.12147286534309387, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 179510 + }, + { + "epoch": 0.6832974277384043, + "grad_norm": 0.1389768123626709, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 179520 + }, + { + "epoch": 0.683335490206527, + "grad_norm": 0.11397550255060196, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 179530 + }, + { + "epoch": 0.6833735526746496, + "grad_norm": 0.1191280260682106, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 179540 + }, + { + "epoch": 0.6834116151427723, + "grad_norm": 0.1180817037820816, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 179550 + }, + { + "epoch": 0.683449677610895, + "grad_norm": 0.12457938492298126, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 179560 + }, + { + "epoch": 0.6834877400790177, + "grad_norm": 0.1202085092663765, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 179570 + }, + { + "epoch": 0.6835258025471403, + "grad_norm": 0.1307467371225357, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 179580 + }, + { + "epoch": 0.683563865015263, + "grad_norm": 0.13553386926651, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 179590 + }, + { + "epoch": 0.6836019274833858, + "grad_norm": 0.131942480802536, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 179600 + }, + { + "epoch": 0.6836399899515084, + "grad_norm": 0.11515048891305923, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 179610 + }, + { + "epoch": 0.6836780524196311, + "grad_norm": 0.13275325298309326, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 179620 + }, + { + "epoch": 0.6837161148877537, + "grad_norm": 0.11690928786993027, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 179630 + }, + { + "epoch": 0.6837541773558765, + "grad_norm": 0.14062003791332245, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 179640 + }, + { + "epoch": 0.6837922398239992, + "grad_norm": 0.11849799007177353, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 179650 + }, + { + "epoch": 0.6838303022921218, + "grad_norm": 0.12318059802055359, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 179660 + }, + { + "epoch": 0.6838683647602445, + "grad_norm": 0.12393590807914734, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 179670 + }, + { + "epoch": 0.6839064272283671, + "grad_norm": 0.12598690390586853, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 179680 + }, + { + "epoch": 0.6839444896964899, + "grad_norm": 0.11892339587211609, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 179690 + }, + { + "epoch": 0.6839825521646126, + "grad_norm": 0.12454485148191452, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 179700 + }, + { + "epoch": 0.6840206146327352, + "grad_norm": 0.13515712320804596, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 179710 + }, + { + "epoch": 0.6840586771008579, + "grad_norm": 0.12502405047416687, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 179720 + }, + { + "epoch": 0.6840967395689806, + "grad_norm": 0.11308914422988892, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 179730 + }, + { + "epoch": 0.6841348020371033, + "grad_norm": 0.13700667023658752, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 179740 + }, + { + "epoch": 0.684172864505226, + "grad_norm": 0.13408491015434265, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 179750 + }, + { + "epoch": 0.6842109269733486, + "grad_norm": 0.13319963216781616, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 179760 + }, + { + "epoch": 0.6842489894414714, + "grad_norm": 0.12456398457288742, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 179770 + }, + { + "epoch": 0.684287051909594, + "grad_norm": 0.13097211718559265, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 179780 + }, + { + "epoch": 0.6843251143777167, + "grad_norm": 0.11321916431188583, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 179790 + }, + { + "epoch": 0.6843631768458394, + "grad_norm": 0.1175604984164238, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 179800 + }, + { + "epoch": 0.6844012393139621, + "grad_norm": 0.12084577232599258, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 179810 + }, + { + "epoch": 0.6844393017820848, + "grad_norm": 0.11561520397663116, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 179820 + }, + { + "epoch": 0.6844773642502074, + "grad_norm": 0.11477413773536682, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 179830 + }, + { + "epoch": 0.6845154267183301, + "grad_norm": 0.1235690787434578, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 179840 + }, + { + "epoch": 0.6845534891864528, + "grad_norm": 0.12032691389322281, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 179850 + }, + { + "epoch": 0.6845915516545755, + "grad_norm": 0.12753982841968536, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 179860 + }, + { + "epoch": 0.6846296141226982, + "grad_norm": 0.1180667132139206, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 179870 + }, + { + "epoch": 0.6846676765908208, + "grad_norm": 0.12818437814712524, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 179880 + }, + { + "epoch": 0.6847057390589435, + "grad_norm": 0.12024499475955963, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 179890 + }, + { + "epoch": 0.6847438015270663, + "grad_norm": 0.11732951551675797, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 179900 + }, + { + "epoch": 0.6847818639951889, + "grad_norm": 0.1257062554359436, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 179910 + }, + { + "epoch": 0.6848199264633116, + "grad_norm": 0.13282521069049835, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 179920 + }, + { + "epoch": 0.6848579889314342, + "grad_norm": 0.12256745249032974, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 179930 + }, + { + "epoch": 0.684896051399557, + "grad_norm": 0.1206756979227066, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 179940 + }, + { + "epoch": 0.6849341138676797, + "grad_norm": 0.12020318955183029, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 179950 + }, + { + "epoch": 0.6849721763358023, + "grad_norm": 0.13199806213378906, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 179960 + }, + { + "epoch": 0.685010238803925, + "grad_norm": 0.12754608690738678, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 179970 + }, + { + "epoch": 0.6850483012720476, + "grad_norm": 0.1329394429922104, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 179980 + }, + { + "epoch": 0.6850863637401704, + "grad_norm": 0.1296091377735138, + "learning_rate": 0.0005, + "loss": 2.129, + "step": 179990 + }, + { + "epoch": 0.6851244262082931, + "grad_norm": 0.1283130943775177, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 180000 + }, + { + "epoch": 0.6851624886764157, + "grad_norm": 0.12421011179685593, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 180010 + }, + { + "epoch": 0.6852005511445384, + "grad_norm": 0.13211941719055176, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 180020 + }, + { + "epoch": 0.6852386136126611, + "grad_norm": 0.11720091104507446, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 180030 + }, + { + "epoch": 0.6852766760807838, + "grad_norm": 0.1294281780719757, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 180040 + }, + { + "epoch": 0.6853147385489065, + "grad_norm": 0.12521712481975555, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 180050 + }, + { + "epoch": 0.6853528010170291, + "grad_norm": 0.12627719342708588, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 180060 + }, + { + "epoch": 0.6853908634851519, + "grad_norm": 0.13020716607570648, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 180070 + }, + { + "epoch": 0.6854289259532745, + "grad_norm": 0.11846989393234253, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 180080 + }, + { + "epoch": 0.6854669884213972, + "grad_norm": 0.11798037588596344, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 180090 + }, + { + "epoch": 0.6855050508895199, + "grad_norm": 0.12672331929206848, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 180100 + }, + { + "epoch": 0.6855431133576425, + "grad_norm": 0.12962199747562408, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 180110 + }, + { + "epoch": 0.6855811758257653, + "grad_norm": 0.11662869900465012, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 180120 + }, + { + "epoch": 0.6856192382938879, + "grad_norm": 0.1326063722372055, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 180130 + }, + { + "epoch": 0.6856573007620106, + "grad_norm": 0.12435778975486755, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 180140 + }, + { + "epoch": 0.6856953632301332, + "grad_norm": 0.12299150973558426, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 180150 + }, + { + "epoch": 0.685733425698256, + "grad_norm": 0.13320887088775635, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 180160 + }, + { + "epoch": 0.6857714881663787, + "grad_norm": 0.13181836903095245, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 180170 + }, + { + "epoch": 0.6858095506345013, + "grad_norm": 0.12428215146064758, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 180180 + }, + { + "epoch": 0.685847613102624, + "grad_norm": 0.12043698132038116, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 180190 + }, + { + "epoch": 0.6858856755707468, + "grad_norm": 0.12302254885435104, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 180200 + }, + { + "epoch": 0.6859237380388694, + "grad_norm": 0.24158915877342224, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 180210 + }, + { + "epoch": 0.6859618005069921, + "grad_norm": 0.12980495393276215, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 180220 + }, + { + "epoch": 0.6859998629751147, + "grad_norm": 0.12906219065189362, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 180230 + }, + { + "epoch": 0.6860379254432375, + "grad_norm": 0.1250433474779129, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 180240 + }, + { + "epoch": 0.6860759879113602, + "grad_norm": 0.13827890157699585, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 180250 + }, + { + "epoch": 0.6861140503794828, + "grad_norm": 0.12229827791452408, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 180260 + }, + { + "epoch": 0.6861521128476055, + "grad_norm": 0.11730405688285828, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 180270 + }, + { + "epoch": 0.6861901753157281, + "grad_norm": 0.13208967447280884, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 180280 + }, + { + "epoch": 0.6862282377838509, + "grad_norm": 0.12246529757976532, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 180290 + }, + { + "epoch": 0.6862663002519735, + "grad_norm": 0.12544351816177368, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 180300 + }, + { + "epoch": 0.6863043627200962, + "grad_norm": 0.1244068592786789, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 180310 + }, + { + "epoch": 0.6863424251882189, + "grad_norm": 0.12453640252351761, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 180320 + }, + { + "epoch": 0.6863804876563416, + "grad_norm": 0.11731000244617462, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 180330 + }, + { + "epoch": 0.6864185501244643, + "grad_norm": 0.12416515499353409, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 180340 + }, + { + "epoch": 0.686456612592587, + "grad_norm": 0.13353504240512848, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 180350 + }, + { + "epoch": 0.6864946750607096, + "grad_norm": 0.11940930038690567, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 180360 + }, + { + "epoch": 0.6865327375288324, + "grad_norm": 0.2674172818660736, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 180370 + }, + { + "epoch": 0.686570799996955, + "grad_norm": 0.1152404174208641, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 180380 + }, + { + "epoch": 0.6866088624650777, + "grad_norm": 0.15712182223796844, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 180390 + }, + { + "epoch": 0.6866469249332003, + "grad_norm": 0.13125132024288177, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 180400 + }, + { + "epoch": 0.686684987401323, + "grad_norm": 0.12223441153764725, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 180410 + }, + { + "epoch": 0.6867230498694458, + "grad_norm": 0.12567825615406036, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 180420 + }, + { + "epoch": 0.6867611123375684, + "grad_norm": 0.11764916777610779, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 180430 + }, + { + "epoch": 0.6867991748056911, + "grad_norm": 0.1293945163488388, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 180440 + }, + { + "epoch": 0.6868372372738137, + "grad_norm": 0.1217617318034172, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 180450 + }, + { + "epoch": 0.6868752997419365, + "grad_norm": 0.12653307616710663, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 180460 + }, + { + "epoch": 0.6869133622100592, + "grad_norm": 0.13232189416885376, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 180470 + }, + { + "epoch": 0.6869514246781818, + "grad_norm": 0.12960053980350494, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 180480 + }, + { + "epoch": 0.6869894871463045, + "grad_norm": 0.12225568294525146, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 180490 + }, + { + "epoch": 0.6870275496144272, + "grad_norm": 0.1188991367816925, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 180500 + }, + { + "epoch": 0.6870656120825499, + "grad_norm": 0.12143424898386002, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 180510 + }, + { + "epoch": 0.6871036745506726, + "grad_norm": 0.13485486805438995, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 180520 + }, + { + "epoch": 0.6871417370187952, + "grad_norm": 0.12453248351812363, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 180530 + }, + { + "epoch": 0.6871797994869179, + "grad_norm": 0.13146209716796875, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 180540 + }, + { + "epoch": 0.6872178619550406, + "grad_norm": 0.1289307028055191, + "learning_rate": 0.0005, + "loss": 2.0844, + "step": 180550 + }, + { + "epoch": 0.6872559244231633, + "grad_norm": 0.1286131888628006, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 180560 + }, + { + "epoch": 0.687293986891286, + "grad_norm": 0.29164138436317444, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 180570 + }, + { + "epoch": 0.6873320493594086, + "grad_norm": 0.13932517170906067, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 180580 + }, + { + "epoch": 0.6873701118275314, + "grad_norm": 0.12999944388866425, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 180590 + }, + { + "epoch": 0.687408174295654, + "grad_norm": 0.12977859377861023, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 180600 + }, + { + "epoch": 0.6874462367637767, + "grad_norm": 0.12538054585456848, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 180610 + }, + { + "epoch": 0.6874842992318994, + "grad_norm": 0.1271113157272339, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 180620 + }, + { + "epoch": 0.6875223617000221, + "grad_norm": 0.11441612988710403, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 180630 + }, + { + "epoch": 0.6875604241681448, + "grad_norm": 0.1191798597574234, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 180640 + }, + { + "epoch": 0.6875984866362674, + "grad_norm": 0.12940026819705963, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 180650 + }, + { + "epoch": 0.6876365491043901, + "grad_norm": 0.12178556621074677, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 180660 + }, + { + "epoch": 0.6876746115725129, + "grad_norm": 0.1176435723900795, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 180670 + }, + { + "epoch": 0.6877126740406355, + "grad_norm": 0.12224700301885605, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 180680 + }, + { + "epoch": 0.6877507365087582, + "grad_norm": 0.11887124925851822, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 180690 + }, + { + "epoch": 0.6877887989768808, + "grad_norm": 0.13752269744873047, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 180700 + }, + { + "epoch": 0.6878268614450035, + "grad_norm": 0.11800897121429443, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 180710 + }, + { + "epoch": 0.6878649239131263, + "grad_norm": 0.12719684839248657, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 180720 + }, + { + "epoch": 0.6879029863812489, + "grad_norm": 0.1225336343050003, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 180730 + }, + { + "epoch": 0.6879410488493716, + "grad_norm": 0.11735167354345322, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 180740 + }, + { + "epoch": 0.6879791113174942, + "grad_norm": 0.1273951232433319, + "learning_rate": 0.0005, + "loss": 2.1317, + "step": 180750 + }, + { + "epoch": 0.688017173785617, + "grad_norm": 0.1332283616065979, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 180760 + }, + { + "epoch": 0.6880552362537397, + "grad_norm": 0.13235154747962952, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 180770 + }, + { + "epoch": 0.6880932987218623, + "grad_norm": 0.12404754012823105, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 180780 + }, + { + "epoch": 0.688131361189985, + "grad_norm": 0.12362740188837051, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 180790 + }, + { + "epoch": 0.6881694236581077, + "grad_norm": 0.1286434531211853, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 180800 + }, + { + "epoch": 0.6882074861262304, + "grad_norm": 0.12793101370334625, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 180810 + }, + { + "epoch": 0.688245548594353, + "grad_norm": 0.11994019150733948, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 180820 + }, + { + "epoch": 0.6882836110624757, + "grad_norm": 0.11454705148935318, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 180830 + }, + { + "epoch": 0.6883216735305984, + "grad_norm": 0.11727307736873627, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 180840 + }, + { + "epoch": 0.6883597359987211, + "grad_norm": 0.13075630366802216, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 180850 + }, + { + "epoch": 0.6883977984668438, + "grad_norm": 0.13252484798431396, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 180860 + }, + { + "epoch": 0.6884358609349664, + "grad_norm": 0.11895839869976044, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 180870 + }, + { + "epoch": 0.6884739234030891, + "grad_norm": 0.13343650102615356, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 180880 + }, + { + "epoch": 0.6885119858712119, + "grad_norm": 0.12214325368404388, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 180890 + }, + { + "epoch": 0.6885500483393345, + "grad_norm": 0.13322407007217407, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 180900 + }, + { + "epoch": 0.6885881108074572, + "grad_norm": 0.11768539994955063, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 180910 + }, + { + "epoch": 0.6886261732755798, + "grad_norm": 0.1263519674539566, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 180920 + }, + { + "epoch": 0.6886642357437026, + "grad_norm": 0.129350483417511, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 180930 + }, + { + "epoch": 0.6887022982118253, + "grad_norm": 0.1228652149438858, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 180940 + }, + { + "epoch": 0.6887403606799479, + "grad_norm": 0.11624860763549805, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 180950 + }, + { + "epoch": 0.6887784231480706, + "grad_norm": 0.12083599716424942, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 180960 + }, + { + "epoch": 0.6888164856161932, + "grad_norm": 0.12017053365707397, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 180970 + }, + { + "epoch": 0.688854548084316, + "grad_norm": 0.12007162719964981, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 180980 + }, + { + "epoch": 0.6888926105524387, + "grad_norm": 0.12415078282356262, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 180990 + }, + { + "epoch": 0.6889306730205613, + "grad_norm": 0.11761855334043503, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 181000 + }, + { + "epoch": 0.688968735488684, + "grad_norm": 0.13418373465538025, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 181010 + }, + { + "epoch": 0.6890067979568067, + "grad_norm": 0.13213621079921722, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 181020 + }, + { + "epoch": 0.6890448604249294, + "grad_norm": 0.1185673251748085, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 181030 + }, + { + "epoch": 0.6890829228930521, + "grad_norm": 0.12293388694524765, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 181040 + }, + { + "epoch": 0.6891209853611747, + "grad_norm": 0.1463843137025833, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 181050 + }, + { + "epoch": 0.6891590478292975, + "grad_norm": 0.1176767498254776, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 181060 + }, + { + "epoch": 0.6891971102974201, + "grad_norm": 0.1293659210205078, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 181070 + }, + { + "epoch": 0.6892351727655428, + "grad_norm": 0.12415621429681778, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 181080 + }, + { + "epoch": 0.6892732352336655, + "grad_norm": 0.11896419525146484, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 181090 + }, + { + "epoch": 0.6893112977017882, + "grad_norm": 0.13892972469329834, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 181100 + }, + { + "epoch": 0.6893493601699109, + "grad_norm": 0.11878636479377747, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 181110 + }, + { + "epoch": 0.6893874226380335, + "grad_norm": 0.12854772806167603, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 181120 + }, + { + "epoch": 0.6894254851061562, + "grad_norm": 0.12110681086778641, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 181130 + }, + { + "epoch": 0.6894635475742789, + "grad_norm": 0.12103510648012161, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 181140 + }, + { + "epoch": 0.6895016100424016, + "grad_norm": 0.12146977335214615, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 181150 + }, + { + "epoch": 0.6895396725105243, + "grad_norm": 0.13748902082443237, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 181160 + }, + { + "epoch": 0.6895777349786469, + "grad_norm": 0.13430270552635193, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 181170 + }, + { + "epoch": 0.6896157974467696, + "grad_norm": 0.12388263642787933, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 181180 + }, + { + "epoch": 0.6896538599148924, + "grad_norm": 0.11775881797075272, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 181190 + }, + { + "epoch": 0.689691922383015, + "grad_norm": 0.12764546275138855, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 181200 + }, + { + "epoch": 0.6897299848511377, + "grad_norm": 0.11993419378995895, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 181210 + }, + { + "epoch": 0.6897680473192603, + "grad_norm": 0.12366458773612976, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 181220 + }, + { + "epoch": 0.6898061097873831, + "grad_norm": 0.12651975452899933, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 181230 + }, + { + "epoch": 0.6898441722555058, + "grad_norm": 0.12930312752723694, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 181240 + }, + { + "epoch": 0.6898822347236284, + "grad_norm": 0.11947444826364517, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 181250 + }, + { + "epoch": 0.6899202971917511, + "grad_norm": 0.20199982821941376, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 181260 + }, + { + "epoch": 0.6899583596598737, + "grad_norm": 0.11784765869379044, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 181270 + }, + { + "epoch": 0.6899964221279965, + "grad_norm": 0.1350754201412201, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 181280 + }, + { + "epoch": 0.6900344845961192, + "grad_norm": 0.13489319384098053, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 181290 + }, + { + "epoch": 0.6900725470642418, + "grad_norm": 0.12121303379535675, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 181300 + }, + { + "epoch": 0.6901106095323645, + "grad_norm": 0.12036201357841492, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 181310 + }, + { + "epoch": 0.6901486720004872, + "grad_norm": 0.12152925133705139, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 181320 + }, + { + "epoch": 0.6901867344686099, + "grad_norm": 0.13613882660865784, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 181330 + }, + { + "epoch": 0.6902247969367326, + "grad_norm": 0.1253340244293213, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 181340 + }, + { + "epoch": 0.6902628594048552, + "grad_norm": 0.12414707988500595, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 181350 + }, + { + "epoch": 0.690300921872978, + "grad_norm": 0.13022355735301971, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 181360 + }, + { + "epoch": 0.6903389843411006, + "grad_norm": 0.12295828759670258, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 181370 + }, + { + "epoch": 0.6903770468092233, + "grad_norm": 0.13136707246303558, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 181380 + }, + { + "epoch": 0.690415109277346, + "grad_norm": 0.12850356101989746, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 181390 + }, + { + "epoch": 0.6904531717454686, + "grad_norm": 0.12073330581188202, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 181400 + }, + { + "epoch": 0.6904912342135914, + "grad_norm": 0.11530125886201859, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 181410 + }, + { + "epoch": 0.690529296681714, + "grad_norm": 0.12165053933858871, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 181420 + }, + { + "epoch": 0.6905673591498367, + "grad_norm": 0.11687213182449341, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 181430 + }, + { + "epoch": 0.6906054216179593, + "grad_norm": 0.12140630930662155, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 181440 + }, + { + "epoch": 0.6906434840860821, + "grad_norm": 0.12299919128417969, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 181450 + }, + { + "epoch": 0.6906815465542048, + "grad_norm": 0.11274578422307968, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 181460 + }, + { + "epoch": 0.6907196090223274, + "grad_norm": 0.11769766360521317, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 181470 + }, + { + "epoch": 0.6907576714904501, + "grad_norm": 0.11614657938480377, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 181480 + }, + { + "epoch": 0.6907957339585729, + "grad_norm": 0.13985876739025116, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 181490 + }, + { + "epoch": 0.6908337964266955, + "grad_norm": 0.1485404074192047, + "learning_rate": 0.0005, + "loss": 2.0888, + "step": 181500 + }, + { + "epoch": 0.6908718588948182, + "grad_norm": 0.12811745703220367, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 181510 + }, + { + "epoch": 0.6909099213629408, + "grad_norm": 0.119151271879673, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 181520 + }, + { + "epoch": 0.6909479838310636, + "grad_norm": 0.12118816375732422, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 181530 + }, + { + "epoch": 0.6909860462991863, + "grad_norm": 0.11851918697357178, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 181540 + }, + { + "epoch": 0.6910241087673089, + "grad_norm": 0.12571333348751068, + "learning_rate": 0.0005, + "loss": 2.0872, + "step": 181550 + }, + { + "epoch": 0.6910621712354316, + "grad_norm": 0.1240975484251976, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 181560 + }, + { + "epoch": 0.6911002337035542, + "grad_norm": 0.11628197133541107, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 181570 + }, + { + "epoch": 0.691138296171677, + "grad_norm": 0.11772032827138901, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 181580 + }, + { + "epoch": 0.6911763586397996, + "grad_norm": 0.12051352113485336, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 181590 + }, + { + "epoch": 0.6912144211079223, + "grad_norm": 0.13565829396247864, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 181600 + }, + { + "epoch": 0.691252483576045, + "grad_norm": 0.1340634673833847, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 181610 + }, + { + "epoch": 0.6912905460441677, + "grad_norm": 0.11844746023416519, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 181620 + }, + { + "epoch": 0.6913286085122904, + "grad_norm": 0.13710397481918335, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 181630 + }, + { + "epoch": 0.691366670980413, + "grad_norm": 0.11849800497293472, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 181640 + }, + { + "epoch": 0.6914047334485357, + "grad_norm": 0.12317143380641937, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 181650 + }, + { + "epoch": 0.6914427959166585, + "grad_norm": 0.1267412304878235, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 181660 + }, + { + "epoch": 0.6914808583847811, + "grad_norm": 0.12529774010181427, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 181670 + }, + { + "epoch": 0.6915189208529038, + "grad_norm": 0.130142942070961, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 181680 + }, + { + "epoch": 0.6915569833210264, + "grad_norm": 0.12222882360219955, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 181690 + }, + { + "epoch": 0.6915950457891491, + "grad_norm": 0.11752060800790787, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 181700 + }, + { + "epoch": 0.6916331082572719, + "grad_norm": 0.13059893250465393, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 181710 + }, + { + "epoch": 0.6916711707253945, + "grad_norm": 0.13170769810676575, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 181720 + }, + { + "epoch": 0.6917092331935172, + "grad_norm": 0.11935019493103027, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 181730 + }, + { + "epoch": 0.6917472956616398, + "grad_norm": 0.1218312606215477, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 181740 + }, + { + "epoch": 0.6917853581297626, + "grad_norm": 0.11730033159255981, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 181750 + }, + { + "epoch": 0.6918234205978853, + "grad_norm": 0.1214584931731224, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 181760 + }, + { + "epoch": 0.6918614830660079, + "grad_norm": 0.13505078852176666, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 181770 + }, + { + "epoch": 0.6918995455341306, + "grad_norm": 0.13040438294410706, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 181780 + }, + { + "epoch": 0.6919376080022533, + "grad_norm": 0.12045073509216309, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 181790 + }, + { + "epoch": 0.691975670470376, + "grad_norm": 0.12367752939462662, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 181800 + }, + { + "epoch": 0.6920137329384987, + "grad_norm": 0.13401754200458527, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 181810 + }, + { + "epoch": 0.6920517954066213, + "grad_norm": 0.12990473210811615, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 181820 + }, + { + "epoch": 0.692089857874744, + "grad_norm": 0.11714720726013184, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 181830 + }, + { + "epoch": 0.6921279203428667, + "grad_norm": 0.12829618155956268, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 181840 + }, + { + "epoch": 0.6921659828109894, + "grad_norm": 0.12493853271007538, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 181850 + }, + { + "epoch": 0.6922040452791121, + "grad_norm": 0.12608863413333893, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 181860 + }, + { + "epoch": 0.6922421077472347, + "grad_norm": 0.13609422743320465, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 181870 + }, + { + "epoch": 0.6922801702153575, + "grad_norm": 0.14166349172592163, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 181880 + }, + { + "epoch": 0.6923182326834801, + "grad_norm": 0.13140493631362915, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 181890 + }, + { + "epoch": 0.6923562951516028, + "grad_norm": 0.12775324285030365, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 181900 + }, + { + "epoch": 0.6923943576197255, + "grad_norm": 0.1270611435174942, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 181910 + }, + { + "epoch": 0.6924324200878482, + "grad_norm": 0.13412043452262878, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 181920 + }, + { + "epoch": 0.6924704825559709, + "grad_norm": 0.12404550611972809, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 181930 + }, + { + "epoch": 0.6925085450240935, + "grad_norm": 0.1306399554014206, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 181940 + }, + { + "epoch": 0.6925466074922162, + "grad_norm": 0.1323079913854599, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 181950 + }, + { + "epoch": 0.692584669960339, + "grad_norm": 0.12677091360092163, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 181960 + }, + { + "epoch": 0.6926227324284616, + "grad_norm": 0.12194015830755234, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 181970 + }, + { + "epoch": 0.6926607948965843, + "grad_norm": 0.12330708652734756, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 181980 + }, + { + "epoch": 0.6926988573647069, + "grad_norm": 0.12560437619686127, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 181990 + }, + { + "epoch": 0.6927369198328296, + "grad_norm": 0.13029389083385468, + "learning_rate": 0.0005, + "loss": 2.085, + "step": 182000 + }, + { + "epoch": 0.6927749823009524, + "grad_norm": 0.12004786729812622, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 182010 + }, + { + "epoch": 0.692813044769075, + "grad_norm": 0.1352328658103943, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 182020 + }, + { + "epoch": 0.6928511072371977, + "grad_norm": 0.1298789083957672, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 182030 + }, + { + "epoch": 0.6928891697053203, + "grad_norm": 0.12174946814775467, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 182040 + }, + { + "epoch": 0.6929272321734431, + "grad_norm": 0.12317008525133133, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 182050 + }, + { + "epoch": 0.6929652946415658, + "grad_norm": 0.1217416450381279, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 182060 + }, + { + "epoch": 0.6930033571096884, + "grad_norm": 0.13692374527454376, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 182070 + }, + { + "epoch": 0.6930414195778111, + "grad_norm": 0.1314442753791809, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 182080 + }, + { + "epoch": 0.6930794820459338, + "grad_norm": 0.12198949605226517, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 182090 + }, + { + "epoch": 0.6931175445140565, + "grad_norm": 0.1149495393037796, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 182100 + }, + { + "epoch": 0.6931556069821792, + "grad_norm": 0.12052424252033234, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 182110 + }, + { + "epoch": 0.6931936694503018, + "grad_norm": 0.11988913267850876, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 182120 + }, + { + "epoch": 0.6932317319184245, + "grad_norm": 0.11893389374017715, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 182130 + }, + { + "epoch": 0.6932697943865472, + "grad_norm": 0.12340518832206726, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 182140 + }, + { + "epoch": 0.6933078568546699, + "grad_norm": 0.12666188180446625, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 182150 + }, + { + "epoch": 0.6933459193227925, + "grad_norm": 0.11834313720464706, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 182160 + }, + { + "epoch": 0.6933839817909152, + "grad_norm": 0.13510651886463165, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 182170 + }, + { + "epoch": 0.693422044259038, + "grad_norm": 0.12976713478565216, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 182180 + }, + { + "epoch": 0.6934601067271606, + "grad_norm": 0.12408886104822159, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 182190 + }, + { + "epoch": 0.6934981691952833, + "grad_norm": 0.13030008971691132, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 182200 + }, + { + "epoch": 0.6935362316634059, + "grad_norm": 0.13181696832180023, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 182210 + }, + { + "epoch": 0.6935742941315287, + "grad_norm": 0.12257903814315796, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 182220 + }, + { + "epoch": 0.6936123565996514, + "grad_norm": 0.11532396078109741, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 182230 + }, + { + "epoch": 0.693650419067774, + "grad_norm": 0.12889619171619415, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 182240 + }, + { + "epoch": 0.6936884815358967, + "grad_norm": 0.12298876792192459, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 182250 + }, + { + "epoch": 0.6937265440040195, + "grad_norm": 0.1157648116350174, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 182260 + }, + { + "epoch": 0.6937646064721421, + "grad_norm": 0.14459484815597534, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 182270 + }, + { + "epoch": 0.6938026689402648, + "grad_norm": 0.13314592838287354, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 182280 + }, + { + "epoch": 0.6938407314083874, + "grad_norm": 0.1288757175207138, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 182290 + }, + { + "epoch": 0.6938787938765101, + "grad_norm": 0.12310304492712021, + "learning_rate": 0.0005, + "loss": 2.1375, + "step": 182300 + }, + { + "epoch": 0.6939168563446328, + "grad_norm": 0.1330040991306305, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 182310 + }, + { + "epoch": 0.6939549188127555, + "grad_norm": 0.1187869980931282, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 182320 + }, + { + "epoch": 0.6939929812808782, + "grad_norm": 0.11585734784603119, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 182330 + }, + { + "epoch": 0.6940310437490008, + "grad_norm": 0.1329014152288437, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 182340 + }, + { + "epoch": 0.6940691062171236, + "grad_norm": 0.13332955539226532, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 182350 + }, + { + "epoch": 0.6941071686852462, + "grad_norm": 0.1157715693116188, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 182360 + }, + { + "epoch": 0.6941452311533689, + "grad_norm": 0.11897668242454529, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 182370 + }, + { + "epoch": 0.6941832936214916, + "grad_norm": 0.12142232060432434, + "learning_rate": 0.0005, + "loss": 2.0807, + "step": 182380 + }, + { + "epoch": 0.6942213560896143, + "grad_norm": 0.1747933030128479, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 182390 + }, + { + "epoch": 0.694259418557737, + "grad_norm": 0.1174633651971817, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 182400 + }, + { + "epoch": 0.6942974810258596, + "grad_norm": 0.12898437678813934, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 182410 + }, + { + "epoch": 0.6943355434939823, + "grad_norm": 0.12418783456087112, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 182420 + }, + { + "epoch": 0.694373605962105, + "grad_norm": 0.12284686416387558, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 182430 + }, + { + "epoch": 0.6944116684302277, + "grad_norm": 0.12123236805200577, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 182440 + }, + { + "epoch": 0.6944497308983504, + "grad_norm": 0.11644534766674042, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 182450 + }, + { + "epoch": 0.694487793366473, + "grad_norm": 0.12517879903316498, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 182460 + }, + { + "epoch": 0.6945258558345957, + "grad_norm": 0.11970049887895584, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 182470 + }, + { + "epoch": 0.6945639183027185, + "grad_norm": 0.12386345863342285, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 182480 + }, + { + "epoch": 0.6946019807708411, + "grad_norm": 0.12449586391448975, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 182490 + }, + { + "epoch": 0.6946400432389638, + "grad_norm": 0.12283190339803696, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 182500 + }, + { + "epoch": 0.6946781057070864, + "grad_norm": 0.12632504105567932, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 182510 + }, + { + "epoch": 0.6947161681752092, + "grad_norm": 0.12029515206813812, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 182520 + }, + { + "epoch": 0.6947542306433319, + "grad_norm": 0.1184384822845459, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 182530 + }, + { + "epoch": 0.6947922931114545, + "grad_norm": 0.11830782145261765, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 182540 + }, + { + "epoch": 0.6948303555795772, + "grad_norm": 0.1292402148246765, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 182550 + }, + { + "epoch": 0.6948684180476998, + "grad_norm": 0.13244417309761047, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 182560 + }, + { + "epoch": 0.6949064805158226, + "grad_norm": 0.13277667760849, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 182570 + }, + { + "epoch": 0.6949445429839453, + "grad_norm": 0.1484387069940567, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 182580 + }, + { + "epoch": 0.6949826054520679, + "grad_norm": 0.11615041643381119, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 182590 + }, + { + "epoch": 0.6950206679201906, + "grad_norm": 0.12297375500202179, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 182600 + }, + { + "epoch": 0.6950587303883133, + "grad_norm": 0.12898223102092743, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 182610 + }, + { + "epoch": 0.695096792856436, + "grad_norm": 0.12044510990381241, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 182620 + }, + { + "epoch": 0.6951348553245587, + "grad_norm": 0.13075432181358337, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 182630 + }, + { + "epoch": 0.6951729177926813, + "grad_norm": 0.129336416721344, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 182640 + }, + { + "epoch": 0.6952109802608041, + "grad_norm": 0.13075217604637146, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 182650 + }, + { + "epoch": 0.6952490427289267, + "grad_norm": 0.12655818462371826, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 182660 + }, + { + "epoch": 0.6952871051970494, + "grad_norm": 0.11474862694740295, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 182670 + }, + { + "epoch": 0.695325167665172, + "grad_norm": 0.17931579053401947, + "learning_rate": 0.0005, + "loss": 2.0888, + "step": 182680 + }, + { + "epoch": 0.6953632301332948, + "grad_norm": 0.13210253417491913, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 182690 + }, + { + "epoch": 0.6954012926014175, + "grad_norm": 0.12530747056007385, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 182700 + }, + { + "epoch": 0.6954393550695401, + "grad_norm": 0.11945699900388718, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 182710 + }, + { + "epoch": 0.6954774175376628, + "grad_norm": 0.11607809364795685, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 182720 + }, + { + "epoch": 0.6955154800057854, + "grad_norm": 0.13010531663894653, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 182730 + }, + { + "epoch": 0.6955535424739082, + "grad_norm": 0.1291985809803009, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 182740 + }, + { + "epoch": 0.6955916049420309, + "grad_norm": 0.11874186247587204, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 182750 + }, + { + "epoch": 0.6956296674101535, + "grad_norm": 0.13069820404052734, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 182760 + }, + { + "epoch": 0.6956677298782762, + "grad_norm": 0.12255138903856277, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 182770 + }, + { + "epoch": 0.695705792346399, + "grad_norm": 0.12217668443918228, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 182780 + }, + { + "epoch": 0.6957438548145216, + "grad_norm": 0.11657962948083878, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 182790 + }, + { + "epoch": 0.6957819172826443, + "grad_norm": 0.11431416869163513, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 182800 + }, + { + "epoch": 0.6958199797507669, + "grad_norm": 0.12846305966377258, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 182810 + }, + { + "epoch": 0.6958580422188897, + "grad_norm": 0.12828603386878967, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 182820 + }, + { + "epoch": 0.6958961046870124, + "grad_norm": 0.13457556068897247, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 182830 + }, + { + "epoch": 0.695934167155135, + "grad_norm": 0.12460450083017349, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 182840 + }, + { + "epoch": 0.6959722296232577, + "grad_norm": 0.13973352313041687, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 182850 + }, + { + "epoch": 0.6960102920913803, + "grad_norm": 0.1241278126835823, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 182860 + }, + { + "epoch": 0.6960483545595031, + "grad_norm": 0.11662398278713226, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 182870 + }, + { + "epoch": 0.6960864170276257, + "grad_norm": 0.12885436415672302, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 182880 + }, + { + "epoch": 0.6961244794957484, + "grad_norm": 0.11967397481203079, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 182890 + }, + { + "epoch": 0.6961625419638711, + "grad_norm": 0.11201397329568863, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 182900 + }, + { + "epoch": 0.6962006044319938, + "grad_norm": 0.11858703196048737, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 182910 + }, + { + "epoch": 0.6962386669001165, + "grad_norm": 0.13523299992084503, + "learning_rate": 0.0005, + "loss": 2.131, + "step": 182920 + }, + { + "epoch": 0.6962767293682391, + "grad_norm": 0.12153832614421844, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 182930 + }, + { + "epoch": 0.6963147918363618, + "grad_norm": 0.12720975279808044, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 182940 + }, + { + "epoch": 0.6963528543044846, + "grad_norm": 0.12351260334253311, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 182950 + }, + { + "epoch": 0.6963909167726072, + "grad_norm": 0.12928800284862518, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 182960 + }, + { + "epoch": 0.6964289792407299, + "grad_norm": 0.13228236138820648, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 182970 + }, + { + "epoch": 0.6964670417088525, + "grad_norm": 0.11482942849397659, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 182980 + }, + { + "epoch": 0.6965051041769752, + "grad_norm": 0.14297699928283691, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 182990 + }, + { + "epoch": 0.696543166645098, + "grad_norm": 0.12494589388370514, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 183000 + }, + { + "epoch": 0.6965812291132206, + "grad_norm": 0.11370964348316193, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 183010 + }, + { + "epoch": 0.6966192915813433, + "grad_norm": 0.14288152754306793, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 183020 + }, + { + "epoch": 0.6966573540494659, + "grad_norm": 0.12203198671340942, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 183030 + }, + { + "epoch": 0.6966954165175887, + "grad_norm": 0.12365303933620453, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 183040 + }, + { + "epoch": 0.6967334789857114, + "grad_norm": 0.12327518314123154, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 183050 + }, + { + "epoch": 0.696771541453834, + "grad_norm": 0.13546282052993774, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 183060 + }, + { + "epoch": 0.6968096039219567, + "grad_norm": 0.1293046772480011, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 183070 + }, + { + "epoch": 0.6968476663900794, + "grad_norm": 0.12784546613693237, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 183080 + }, + { + "epoch": 0.6968857288582021, + "grad_norm": 0.13157708942890167, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 183090 + }, + { + "epoch": 0.6969237913263248, + "grad_norm": 0.11840756237506866, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 183100 + }, + { + "epoch": 0.6969618537944474, + "grad_norm": 0.12368209660053253, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 183110 + }, + { + "epoch": 0.6969999162625702, + "grad_norm": 0.12720529735088348, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 183120 + }, + { + "epoch": 0.6970379787306928, + "grad_norm": 0.14050424098968506, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 183130 + }, + { + "epoch": 0.6970760411988155, + "grad_norm": 0.12502345442771912, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 183140 + }, + { + "epoch": 0.6971141036669382, + "grad_norm": 0.12937872111797333, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 183150 + }, + { + "epoch": 0.6971521661350608, + "grad_norm": 0.1396130472421646, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 183160 + }, + { + "epoch": 0.6971902286031836, + "grad_norm": 0.12844252586364746, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 183170 + }, + { + "epoch": 0.6972282910713062, + "grad_norm": 0.12445782124996185, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 183180 + }, + { + "epoch": 0.6972663535394289, + "grad_norm": 0.11656850576400757, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 183190 + }, + { + "epoch": 0.6973044160075516, + "grad_norm": 0.11850359290838242, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 183200 + }, + { + "epoch": 0.6973424784756743, + "grad_norm": 0.11828984320163727, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 183210 + }, + { + "epoch": 0.697380540943797, + "grad_norm": 0.12916888296604156, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 183220 + }, + { + "epoch": 0.6974186034119196, + "grad_norm": 0.13033920526504517, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 183230 + }, + { + "epoch": 0.6974566658800423, + "grad_norm": 0.12288140505552292, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 183240 + }, + { + "epoch": 0.6974947283481651, + "grad_norm": 0.12239166349172592, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 183250 + }, + { + "epoch": 0.6975327908162877, + "grad_norm": 0.1316891759634018, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 183260 + }, + { + "epoch": 0.6975708532844104, + "grad_norm": 0.1223904937505722, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 183270 + }, + { + "epoch": 0.697608915752533, + "grad_norm": 0.13343580067157745, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 183280 + }, + { + "epoch": 0.6976469782206557, + "grad_norm": 0.11461261659860611, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 183290 + }, + { + "epoch": 0.6976850406887785, + "grad_norm": 0.1275361180305481, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 183300 + }, + { + "epoch": 0.6977231031569011, + "grad_norm": 0.11810991168022156, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 183310 + }, + { + "epoch": 0.6977611656250238, + "grad_norm": 0.12467637658119202, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 183320 + }, + { + "epoch": 0.6977992280931464, + "grad_norm": 0.13004258275032043, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 183330 + }, + { + "epoch": 0.6978372905612692, + "grad_norm": 0.14000114798545837, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 183340 + }, + { + "epoch": 0.6978753530293919, + "grad_norm": 0.12561576068401337, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 183350 + }, + { + "epoch": 0.6979134154975145, + "grad_norm": 0.13117700815200806, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 183360 + }, + { + "epoch": 0.6979514779656372, + "grad_norm": 0.13505379855632782, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 183370 + }, + { + "epoch": 0.6979895404337599, + "grad_norm": 0.13054326176643372, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 183380 + }, + { + "epoch": 0.6980276029018826, + "grad_norm": 0.13579963147640228, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 183390 + }, + { + "epoch": 0.6980656653700053, + "grad_norm": 0.13260318338871002, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 183400 + }, + { + "epoch": 0.6981037278381279, + "grad_norm": 0.12206083536148071, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 183410 + }, + { + "epoch": 0.6981417903062506, + "grad_norm": 0.12467200309038162, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 183420 + }, + { + "epoch": 0.6981798527743733, + "grad_norm": 0.12051711976528168, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 183430 + }, + { + "epoch": 0.698217915242496, + "grad_norm": 0.1251114457845688, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 183440 + }, + { + "epoch": 0.6982559777106186, + "grad_norm": 0.12093877792358398, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 183450 + }, + { + "epoch": 0.6982940401787413, + "grad_norm": 0.12126054614782333, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 183460 + }, + { + "epoch": 0.6983321026468641, + "grad_norm": 0.13498644530773163, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 183470 + }, + { + "epoch": 0.6983701651149867, + "grad_norm": 0.12364812940359116, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 183480 + }, + { + "epoch": 0.6984082275831094, + "grad_norm": 0.119442880153656, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 183490 + }, + { + "epoch": 0.698446290051232, + "grad_norm": 0.11596567928791046, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 183500 + }, + { + "epoch": 0.6984843525193548, + "grad_norm": 0.14025621116161346, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 183510 + }, + { + "epoch": 0.6985224149874775, + "grad_norm": 0.1185678243637085, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 183520 + }, + { + "epoch": 0.6985604774556001, + "grad_norm": 0.3852109909057617, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 183530 + }, + { + "epoch": 0.6985985399237228, + "grad_norm": 0.14016196131706238, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 183540 + }, + { + "epoch": 0.6986366023918456, + "grad_norm": 0.12113747000694275, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 183550 + }, + { + "epoch": 0.6986746648599682, + "grad_norm": 0.11765763908624649, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 183560 + }, + { + "epoch": 0.6987127273280909, + "grad_norm": 0.1222396045923233, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 183570 + }, + { + "epoch": 0.6987507897962135, + "grad_norm": 0.11592081934213638, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 183580 + }, + { + "epoch": 0.6987888522643362, + "grad_norm": 0.1372542381286621, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 183590 + }, + { + "epoch": 0.698826914732459, + "grad_norm": 0.12375343590974808, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 183600 + }, + { + "epoch": 0.6988649772005816, + "grad_norm": 0.13878773152828217, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 183610 + }, + { + "epoch": 0.6989030396687043, + "grad_norm": 0.1390574872493744, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 183620 + }, + { + "epoch": 0.6989411021368269, + "grad_norm": 0.131068155169487, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 183630 + }, + { + "epoch": 0.6989791646049497, + "grad_norm": 0.11932392418384552, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 183640 + }, + { + "epoch": 0.6990172270730723, + "grad_norm": 0.11907713860273361, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 183650 + }, + { + "epoch": 0.699055289541195, + "grad_norm": 0.11536707729101181, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 183660 + }, + { + "epoch": 0.6990933520093177, + "grad_norm": 0.11935848742723465, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 183670 + }, + { + "epoch": 0.6991314144774404, + "grad_norm": 0.12416180223226547, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 183680 + }, + { + "epoch": 0.6991694769455631, + "grad_norm": 0.1163659617304802, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 183690 + }, + { + "epoch": 0.6992075394136857, + "grad_norm": 0.13339783251285553, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 183700 + }, + { + "epoch": 0.6992456018818084, + "grad_norm": 0.12422681599855423, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 183710 + }, + { + "epoch": 0.6992836643499311, + "grad_norm": 0.13619937002658844, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 183720 + }, + { + "epoch": 0.6993217268180538, + "grad_norm": 0.12961874902248383, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 183730 + }, + { + "epoch": 0.6993597892861765, + "grad_norm": 0.13386094570159912, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 183740 + }, + { + "epoch": 0.6993978517542991, + "grad_norm": 0.1292843520641327, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 183750 + }, + { + "epoch": 0.6994359142224218, + "grad_norm": 0.12814553081989288, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 183760 + }, + { + "epoch": 0.6994739766905446, + "grad_norm": 0.11046000570058823, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 183770 + }, + { + "epoch": 0.6995120391586672, + "grad_norm": 0.1339779794216156, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 183780 + }, + { + "epoch": 0.6995501016267899, + "grad_norm": 0.12307266891002655, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 183790 + }, + { + "epoch": 0.6995881640949125, + "grad_norm": 0.12837962806224823, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 183800 + }, + { + "epoch": 0.6996262265630353, + "grad_norm": 0.11693299561738968, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 183810 + }, + { + "epoch": 0.699664289031158, + "grad_norm": 0.1293923407793045, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 183820 + }, + { + "epoch": 0.6997023514992806, + "grad_norm": 0.12665726244449615, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 183830 + }, + { + "epoch": 0.6997404139674033, + "grad_norm": 0.12343131005764008, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 183840 + }, + { + "epoch": 0.6997784764355259, + "grad_norm": 0.12809494137763977, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 183850 + }, + { + "epoch": 0.6998165389036487, + "grad_norm": 0.1415323168039322, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 183860 + }, + { + "epoch": 0.6998546013717714, + "grad_norm": 0.13057930767536163, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 183870 + }, + { + "epoch": 0.699892663839894, + "grad_norm": 0.12605410814285278, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 183880 + }, + { + "epoch": 0.6999307263080167, + "grad_norm": 0.12751656770706177, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 183890 + }, + { + "epoch": 0.6999687887761394, + "grad_norm": 0.12429352104663849, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 183900 + }, + { + "epoch": 0.7000068512442621, + "grad_norm": 0.11484511941671371, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 183910 + }, + { + "epoch": 0.7000449137123848, + "grad_norm": 0.12453338503837585, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 183920 + }, + { + "epoch": 0.7000829761805074, + "grad_norm": 0.12155555188655853, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 183930 + }, + { + "epoch": 0.7001210386486302, + "grad_norm": 0.12803073227405548, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 183940 + }, + { + "epoch": 0.7001591011167528, + "grad_norm": 0.14072082936763763, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 183950 + }, + { + "epoch": 0.7001971635848755, + "grad_norm": 0.11939458549022675, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 183960 + }, + { + "epoch": 0.7002352260529981, + "grad_norm": 0.12321234494447708, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 183970 + }, + { + "epoch": 0.7002732885211209, + "grad_norm": 0.11370708048343658, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 183980 + }, + { + "epoch": 0.7003113509892436, + "grad_norm": 0.13109511137008667, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 183990 + }, + { + "epoch": 0.7003494134573662, + "grad_norm": 0.11811353266239166, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 184000 + }, + { + "epoch": 0.7003874759254889, + "grad_norm": 0.12360669672489166, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 184010 + }, + { + "epoch": 0.7004255383936115, + "grad_norm": 0.15090149641036987, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 184020 + }, + { + "epoch": 0.7004636008617343, + "grad_norm": 0.12501868605613708, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 184030 + }, + { + "epoch": 0.700501663329857, + "grad_norm": 0.12880779802799225, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 184040 + }, + { + "epoch": 0.7005397257979796, + "grad_norm": 0.11294928938150406, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 184050 + }, + { + "epoch": 0.7005777882661023, + "grad_norm": 0.12811915576457977, + "learning_rate": 0.0005, + "loss": 2.1273, + "step": 184060 + }, + { + "epoch": 0.700615850734225, + "grad_norm": 0.12252122163772583, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 184070 + }, + { + "epoch": 0.7006539132023477, + "grad_norm": 0.12165088206529617, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 184080 + }, + { + "epoch": 0.7006919756704704, + "grad_norm": 0.1230388954281807, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 184090 + }, + { + "epoch": 0.700730038138593, + "grad_norm": 0.13142719864845276, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 184100 + }, + { + "epoch": 0.7007681006067158, + "grad_norm": 0.12288113683462143, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 184110 + }, + { + "epoch": 0.7008061630748385, + "grad_norm": 0.13205543160438538, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 184120 + }, + { + "epoch": 0.7008442255429611, + "grad_norm": 0.13189668953418732, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 184130 + }, + { + "epoch": 0.7008822880110838, + "grad_norm": 0.10998144000768661, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 184140 + }, + { + "epoch": 0.7009203504792064, + "grad_norm": 0.12300020456314087, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 184150 + }, + { + "epoch": 0.7009584129473292, + "grad_norm": 0.11949418485164642, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 184160 + }, + { + "epoch": 0.7009964754154518, + "grad_norm": 0.1179998368024826, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 184170 + }, + { + "epoch": 0.7010345378835745, + "grad_norm": 0.11798691749572754, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 184180 + }, + { + "epoch": 0.7010726003516972, + "grad_norm": 0.13212557137012482, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 184190 + }, + { + "epoch": 0.7011106628198199, + "grad_norm": 0.1313924491405487, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 184200 + }, + { + "epoch": 0.7011487252879426, + "grad_norm": 0.12167239934206009, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 184210 + }, + { + "epoch": 0.7011867877560652, + "grad_norm": 0.11549846827983856, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 184220 + }, + { + "epoch": 0.7012248502241879, + "grad_norm": 0.11892012506723404, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 184230 + }, + { + "epoch": 0.7012629126923107, + "grad_norm": 0.11951800435781479, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 184240 + }, + { + "epoch": 0.7013009751604333, + "grad_norm": 0.12169340997934341, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 184250 + }, + { + "epoch": 0.701339037628556, + "grad_norm": 0.1290595978498459, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 184260 + }, + { + "epoch": 0.7013771000966786, + "grad_norm": 0.13304656744003296, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 184270 + }, + { + "epoch": 0.7014151625648013, + "grad_norm": 0.12380509823560715, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 184280 + }, + { + "epoch": 0.7014532250329241, + "grad_norm": 0.12488371133804321, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 184290 + }, + { + "epoch": 0.7014912875010467, + "grad_norm": 0.11350318789482117, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 184300 + }, + { + "epoch": 0.7015293499691694, + "grad_norm": 0.12473493069410324, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 184310 + }, + { + "epoch": 0.701567412437292, + "grad_norm": 0.1370915323495865, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 184320 + }, + { + "epoch": 0.7016054749054148, + "grad_norm": 0.11229779571294785, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 184330 + }, + { + "epoch": 0.7016435373735375, + "grad_norm": 0.1268649399280548, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 184340 + }, + { + "epoch": 0.7016815998416601, + "grad_norm": 0.12467587739229202, + "learning_rate": 0.0005, + "loss": 2.0866, + "step": 184350 + }, + { + "epoch": 0.7017196623097828, + "grad_norm": 0.13000306487083435, + "learning_rate": 0.0005, + "loss": 2.0873, + "step": 184360 + }, + { + "epoch": 0.7017577247779055, + "grad_norm": 0.11177881062030792, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 184370 + }, + { + "epoch": 0.7017957872460282, + "grad_norm": 0.12280798703432083, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 184380 + }, + { + "epoch": 0.7018338497141509, + "grad_norm": 0.13626615703105927, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 184390 + }, + { + "epoch": 0.7018719121822735, + "grad_norm": 0.12476800382137299, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 184400 + }, + { + "epoch": 0.7019099746503963, + "grad_norm": 0.12631286680698395, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 184410 + }, + { + "epoch": 0.7019480371185189, + "grad_norm": 0.12857434153556824, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 184420 + }, + { + "epoch": 0.7019860995866416, + "grad_norm": 0.11980973184108734, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 184430 + }, + { + "epoch": 0.7020241620547643, + "grad_norm": 0.1299014389514923, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 184440 + }, + { + "epoch": 0.7020622245228869, + "grad_norm": 0.11516579985618591, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 184450 + }, + { + "epoch": 0.7021002869910097, + "grad_norm": 0.13994434475898743, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 184460 + }, + { + "epoch": 0.7021383494591323, + "grad_norm": 0.1350252479314804, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 184470 + }, + { + "epoch": 0.702176411927255, + "grad_norm": 0.12053694576025009, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 184480 + }, + { + "epoch": 0.7022144743953777, + "grad_norm": 0.1263304352760315, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 184490 + }, + { + "epoch": 0.7022525368635004, + "grad_norm": 0.12201272696256638, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 184500 + }, + { + "epoch": 0.7022905993316231, + "grad_norm": 0.1335000991821289, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 184510 + }, + { + "epoch": 0.7023286617997457, + "grad_norm": 0.11891698092222214, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 184520 + }, + { + "epoch": 0.7023667242678684, + "grad_norm": 0.11630988866090775, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 184530 + }, + { + "epoch": 0.7024047867359912, + "grad_norm": 0.11909238249063492, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 184540 + }, + { + "epoch": 0.7024428492041138, + "grad_norm": 0.11476001888513565, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 184550 + }, + { + "epoch": 0.7024809116722365, + "grad_norm": 0.1404361128807068, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 184560 + }, + { + "epoch": 0.7025189741403591, + "grad_norm": 0.12600143253803253, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 184570 + }, + { + "epoch": 0.7025570366084818, + "grad_norm": 0.15968388319015503, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 184580 + }, + { + "epoch": 0.7025950990766046, + "grad_norm": 0.13121315836906433, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 184590 + }, + { + "epoch": 0.7026331615447272, + "grad_norm": 0.12117509543895721, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 184600 + }, + { + "epoch": 0.7026712240128499, + "grad_norm": 0.1206469014286995, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 184610 + }, + { + "epoch": 0.7027092864809725, + "grad_norm": 0.1204044297337532, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 184620 + }, + { + "epoch": 0.7027473489490953, + "grad_norm": 0.11982487142086029, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 184630 + }, + { + "epoch": 0.702785411417218, + "grad_norm": 0.1303478628396988, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 184640 + }, + { + "epoch": 0.7028234738853406, + "grad_norm": 0.12789596617221832, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 184650 + }, + { + "epoch": 0.7028615363534633, + "grad_norm": 0.12563934922218323, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 184660 + }, + { + "epoch": 0.702899598821586, + "grad_norm": 0.12067053467035294, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 184670 + }, + { + "epoch": 0.7029376612897087, + "grad_norm": 0.12684741616249084, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 184680 + }, + { + "epoch": 0.7029757237578314, + "grad_norm": 0.12234799563884735, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 184690 + }, + { + "epoch": 0.703013786225954, + "grad_norm": 0.1267378032207489, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 184700 + }, + { + "epoch": 0.7030518486940767, + "grad_norm": 0.12101038545370102, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 184710 + }, + { + "epoch": 0.7030899111621994, + "grad_norm": 0.1352284848690033, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 184720 + }, + { + "epoch": 0.7031279736303221, + "grad_norm": 0.11621130257844925, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 184730 + }, + { + "epoch": 0.7031660360984447, + "grad_norm": 0.11834180355072021, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 184740 + }, + { + "epoch": 0.7032040985665674, + "grad_norm": 0.12049825489521027, + "learning_rate": 0.0005, + "loss": 2.1285, + "step": 184750 + }, + { + "epoch": 0.7032421610346902, + "grad_norm": 0.12099867314100266, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 184760 + }, + { + "epoch": 0.7032802235028128, + "grad_norm": 0.12538106739521027, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 184770 + }, + { + "epoch": 0.7033182859709355, + "grad_norm": 0.1355898529291153, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 184780 + }, + { + "epoch": 0.7033563484390581, + "grad_norm": 0.12255386263132095, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 184790 + }, + { + "epoch": 0.7033944109071809, + "grad_norm": 0.11250531673431396, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 184800 + }, + { + "epoch": 0.7034324733753036, + "grad_norm": 0.11034489423036575, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 184810 + }, + { + "epoch": 0.7034705358434262, + "grad_norm": 0.12313147634267807, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 184820 + }, + { + "epoch": 0.7035085983115489, + "grad_norm": 0.13078320026397705, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 184830 + }, + { + "epoch": 0.7035466607796717, + "grad_norm": 0.12762703001499176, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 184840 + }, + { + "epoch": 0.7035847232477943, + "grad_norm": 0.13302679359912872, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 184850 + }, + { + "epoch": 0.703622785715917, + "grad_norm": 0.12136547267436981, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 184860 + }, + { + "epoch": 0.7036608481840396, + "grad_norm": 0.17401854693889618, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 184870 + }, + { + "epoch": 0.7036989106521623, + "grad_norm": 0.12410420179367065, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 184880 + }, + { + "epoch": 0.703736973120285, + "grad_norm": 0.11731221526861191, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 184890 + }, + { + "epoch": 0.7037750355884077, + "grad_norm": 0.11649060994386673, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 184900 + }, + { + "epoch": 0.7038130980565304, + "grad_norm": 0.11642348766326904, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 184910 + }, + { + "epoch": 0.703851160524653, + "grad_norm": 0.11932951956987381, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 184920 + }, + { + "epoch": 0.7038892229927758, + "grad_norm": 0.11806853115558624, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 184930 + }, + { + "epoch": 0.7039272854608984, + "grad_norm": 0.12727023661136627, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 184940 + }, + { + "epoch": 0.7039653479290211, + "grad_norm": 0.1231415644288063, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 184950 + }, + { + "epoch": 0.7040034103971438, + "grad_norm": 0.12632031738758087, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 184960 + }, + { + "epoch": 0.7040414728652665, + "grad_norm": 0.1354237049818039, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 184970 + }, + { + "epoch": 0.7040795353333892, + "grad_norm": 0.11664775758981705, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 184980 + }, + { + "epoch": 0.7041175978015118, + "grad_norm": 0.11538016051054001, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 184990 + }, + { + "epoch": 0.7041556602696345, + "grad_norm": 0.11794717609882355, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 185000 + }, + { + "epoch": 0.7041937227377572, + "grad_norm": 0.12959380447864532, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 185010 + }, + { + "epoch": 0.7042317852058799, + "grad_norm": 0.12816959619522095, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 185020 + }, + { + "epoch": 0.7042698476740026, + "grad_norm": 0.1298322081565857, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 185030 + }, + { + "epoch": 0.7043079101421252, + "grad_norm": 0.12537501752376556, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 185040 + }, + { + "epoch": 0.7043459726102479, + "grad_norm": 0.14048700034618378, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 185050 + }, + { + "epoch": 0.7043840350783707, + "grad_norm": 0.12887980043888092, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 185060 + }, + { + "epoch": 0.7044220975464933, + "grad_norm": 0.15135058760643005, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 185070 + }, + { + "epoch": 0.704460160014616, + "grad_norm": 0.12217877805233002, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 185080 + }, + { + "epoch": 0.7044982224827386, + "grad_norm": 0.11798762530088425, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 185090 + }, + { + "epoch": 0.7045362849508614, + "grad_norm": 0.12822438776493073, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 185100 + }, + { + "epoch": 0.7045743474189841, + "grad_norm": 0.11703921109437943, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 185110 + }, + { + "epoch": 0.7046124098871067, + "grad_norm": 0.1345566064119339, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 185120 + }, + { + "epoch": 0.7046504723552294, + "grad_norm": 0.13655199110507965, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 185130 + }, + { + "epoch": 0.704688534823352, + "grad_norm": 0.12039312720298767, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 185140 + }, + { + "epoch": 0.7047265972914748, + "grad_norm": 0.12556329369544983, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 185150 + }, + { + "epoch": 0.7047646597595975, + "grad_norm": 0.12472794204950333, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 185160 + }, + { + "epoch": 0.7048027222277201, + "grad_norm": 0.1353958398103714, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 185170 + }, + { + "epoch": 0.7048407846958428, + "grad_norm": 0.1271139532327652, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 185180 + }, + { + "epoch": 0.7048788471639655, + "grad_norm": 0.1349204033613205, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 185190 + }, + { + "epoch": 0.7049169096320882, + "grad_norm": 0.13143670558929443, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 185200 + }, + { + "epoch": 0.7049549721002109, + "grad_norm": 0.11317627131938934, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 185210 + }, + { + "epoch": 0.7049930345683335, + "grad_norm": 0.14271529018878937, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 185220 + }, + { + "epoch": 0.7050310970364563, + "grad_norm": 0.12950241565704346, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 185230 + }, + { + "epoch": 0.7050691595045789, + "grad_norm": 0.12387185543775558, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 185240 + }, + { + "epoch": 0.7051072219727016, + "grad_norm": 0.14337509870529175, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 185250 + }, + { + "epoch": 0.7051452844408242, + "grad_norm": 0.12338105589151382, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 185260 + }, + { + "epoch": 0.705183346908947, + "grad_norm": 0.1267063170671463, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 185270 + }, + { + "epoch": 0.7052214093770697, + "grad_norm": 0.12914665043354034, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 185280 + }, + { + "epoch": 0.7052594718451923, + "grad_norm": 0.1194671168923378, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 185290 + }, + { + "epoch": 0.705297534313315, + "grad_norm": 0.12188367545604706, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 185300 + }, + { + "epoch": 0.7053355967814376, + "grad_norm": 0.12353203445672989, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 185310 + }, + { + "epoch": 0.7053736592495604, + "grad_norm": 0.11360117048025131, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 185320 + }, + { + "epoch": 0.7054117217176831, + "grad_norm": 0.11561376601457596, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 185330 + }, + { + "epoch": 0.7054497841858057, + "grad_norm": 0.1353146731853485, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 185340 + }, + { + "epoch": 0.7054878466539284, + "grad_norm": 0.13493327796459198, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 185350 + }, + { + "epoch": 0.7055259091220512, + "grad_norm": 0.1353655904531479, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 185360 + }, + { + "epoch": 0.7055639715901738, + "grad_norm": 0.12144365161657333, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 185370 + }, + { + "epoch": 0.7056020340582965, + "grad_norm": 0.15383411943912506, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 185380 + }, + { + "epoch": 0.7056400965264191, + "grad_norm": 0.12681280076503754, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 185390 + }, + { + "epoch": 0.7056781589945419, + "grad_norm": 0.11766770482063293, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 185400 + }, + { + "epoch": 0.7057162214626646, + "grad_norm": 0.11990108340978622, + "learning_rate": 0.0005, + "loss": 2.0879, + "step": 185410 + }, + { + "epoch": 0.7057542839307872, + "grad_norm": 0.11549576371908188, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 185420 + }, + { + "epoch": 0.7057923463989099, + "grad_norm": 0.1381005048751831, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 185430 + }, + { + "epoch": 0.7058304088670325, + "grad_norm": 0.1284915655851364, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 185440 + }, + { + "epoch": 0.7058684713351553, + "grad_norm": 0.12681995332241058, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 185450 + }, + { + "epoch": 0.705906533803278, + "grad_norm": 0.12076663970947266, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 185460 + }, + { + "epoch": 0.7059445962714006, + "grad_norm": 0.12343986332416534, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 185470 + }, + { + "epoch": 0.7059826587395233, + "grad_norm": 0.13354164361953735, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 185480 + }, + { + "epoch": 0.706020721207646, + "grad_norm": 0.12271565198898315, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 185490 + }, + { + "epoch": 0.7060587836757687, + "grad_norm": 0.12300974130630493, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 185500 + }, + { + "epoch": 0.7060968461438913, + "grad_norm": 0.11424795538187027, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 185510 + }, + { + "epoch": 0.706134908612014, + "grad_norm": 0.11418292671442032, + "learning_rate": 0.0005, + "loss": 2.0861, + "step": 185520 + }, + { + "epoch": 0.7061729710801368, + "grad_norm": 0.12428473681211472, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 185530 + }, + { + "epoch": 0.7062110335482594, + "grad_norm": 0.11890202015638351, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 185540 + }, + { + "epoch": 0.7062490960163821, + "grad_norm": 0.12819798290729523, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 185550 + }, + { + "epoch": 0.7062871584845047, + "grad_norm": 0.12449389696121216, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 185560 + }, + { + "epoch": 0.7063252209526274, + "grad_norm": 0.1176564022898674, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 185570 + }, + { + "epoch": 0.7063632834207502, + "grad_norm": 0.1193840354681015, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 185580 + }, + { + "epoch": 0.7064013458888728, + "grad_norm": 0.11895597726106644, + "learning_rate": 0.0005, + "loss": 2.0847, + "step": 185590 + }, + { + "epoch": 0.7064394083569955, + "grad_norm": 0.11255518347024918, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 185600 + }, + { + "epoch": 0.7064774708251181, + "grad_norm": 0.11871010065078735, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 185610 + }, + { + "epoch": 0.7065155332932409, + "grad_norm": 0.11335593461990356, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 185620 + }, + { + "epoch": 0.7065535957613636, + "grad_norm": 0.12085136771202087, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 185630 + }, + { + "epoch": 0.7065916582294862, + "grad_norm": 0.12406984716653824, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 185640 + }, + { + "epoch": 0.7066297206976089, + "grad_norm": 0.13687089085578918, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 185650 + }, + { + "epoch": 0.7066677831657316, + "grad_norm": 0.1142050176858902, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 185660 + }, + { + "epoch": 0.7067058456338543, + "grad_norm": 0.11903434246778488, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 185670 + }, + { + "epoch": 0.706743908101977, + "grad_norm": 0.12342733144760132, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 185680 + }, + { + "epoch": 0.7067819705700996, + "grad_norm": 0.11694970726966858, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 185690 + }, + { + "epoch": 0.7068200330382224, + "grad_norm": 0.12412623316049576, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 185700 + }, + { + "epoch": 0.706858095506345, + "grad_norm": 0.13319043815135956, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 185710 + }, + { + "epoch": 0.7068961579744677, + "grad_norm": 0.13207711279392242, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 185720 + }, + { + "epoch": 0.7069342204425904, + "grad_norm": 0.4719063639640808, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 185730 + }, + { + "epoch": 0.706972282910713, + "grad_norm": 0.1343603879213333, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 185740 + }, + { + "epoch": 0.7070103453788358, + "grad_norm": 0.11707521229982376, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 185750 + }, + { + "epoch": 0.7070484078469584, + "grad_norm": 0.11734578758478165, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 185760 + }, + { + "epoch": 0.7070864703150811, + "grad_norm": 0.12925496697425842, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 185770 + }, + { + "epoch": 0.7071245327832038, + "grad_norm": 0.11399725079536438, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 185780 + }, + { + "epoch": 0.7071625952513265, + "grad_norm": 0.12147218734025955, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 185790 + }, + { + "epoch": 0.7072006577194492, + "grad_norm": 0.11649216711521149, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 185800 + }, + { + "epoch": 0.7072387201875718, + "grad_norm": 0.12788520753383636, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 185810 + }, + { + "epoch": 0.7072767826556945, + "grad_norm": 0.12423229217529297, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 185820 + }, + { + "epoch": 0.7073148451238173, + "grad_norm": 0.12157479673624039, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 185830 + }, + { + "epoch": 0.7073529075919399, + "grad_norm": 0.13278579711914062, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 185840 + }, + { + "epoch": 0.7073909700600626, + "grad_norm": 0.13672243058681488, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 185850 + }, + { + "epoch": 0.7074290325281852, + "grad_norm": 0.11652804166078568, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 185860 + }, + { + "epoch": 0.7074670949963079, + "grad_norm": 0.11444343626499176, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 185870 + }, + { + "epoch": 0.7075051574644307, + "grad_norm": 0.12347528338432312, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 185880 + }, + { + "epoch": 0.7075432199325533, + "grad_norm": 0.10334424674510956, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 185890 + }, + { + "epoch": 0.707581282400676, + "grad_norm": 0.17353834211826324, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 185900 + }, + { + "epoch": 0.7076193448687986, + "grad_norm": 0.12602335214614868, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 185910 + }, + { + "epoch": 0.7076574073369214, + "grad_norm": 0.12529000639915466, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 185920 + }, + { + "epoch": 0.707695469805044, + "grad_norm": 0.1246362179517746, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 185930 + }, + { + "epoch": 0.7077335322731667, + "grad_norm": 0.125050887465477, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 185940 + }, + { + "epoch": 0.7077715947412894, + "grad_norm": 0.12313340604305267, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 185950 + }, + { + "epoch": 0.7078096572094121, + "grad_norm": 0.12455099076032639, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 185960 + }, + { + "epoch": 0.7078477196775348, + "grad_norm": 0.11638380587100983, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 185970 + }, + { + "epoch": 0.7078857821456574, + "grad_norm": 0.12521560490131378, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 185980 + }, + { + "epoch": 0.7079238446137801, + "grad_norm": 0.11603990197181702, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 185990 + }, + { + "epoch": 0.7079619070819028, + "grad_norm": 0.11898902803659439, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 186000 + }, + { + "epoch": 0.7079999695500255, + "grad_norm": 0.13573262095451355, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 186010 + }, + { + "epoch": 0.7080380320181482, + "grad_norm": 0.13460132479667664, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 186020 + }, + { + "epoch": 0.7080760944862708, + "grad_norm": 0.12528298795223236, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 186030 + }, + { + "epoch": 0.7081141569543935, + "grad_norm": 0.12539632618427277, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 186040 + }, + { + "epoch": 0.7081522194225163, + "grad_norm": 0.13004456460475922, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 186050 + }, + { + "epoch": 0.7081902818906389, + "grad_norm": 0.11649925261735916, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 186060 + }, + { + "epoch": 0.7082283443587616, + "grad_norm": 0.12443257868289948, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 186070 + }, + { + "epoch": 0.7082664068268842, + "grad_norm": 0.11880318075418472, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 186080 + }, + { + "epoch": 0.708304469295007, + "grad_norm": 0.12145640701055527, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 186090 + }, + { + "epoch": 0.7083425317631297, + "grad_norm": 0.11560018360614777, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 186100 + }, + { + "epoch": 0.7083805942312523, + "grad_norm": 0.12145461142063141, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 186110 + }, + { + "epoch": 0.708418656699375, + "grad_norm": 0.126446932554245, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 186120 + }, + { + "epoch": 0.7084567191674978, + "grad_norm": 0.10951827466487885, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 186130 + }, + { + "epoch": 0.7084947816356204, + "grad_norm": 0.12064436078071594, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 186140 + }, + { + "epoch": 0.7085328441037431, + "grad_norm": 0.12185485661029816, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 186150 + }, + { + "epoch": 0.7085709065718657, + "grad_norm": 0.12862376868724823, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 186160 + }, + { + "epoch": 0.7086089690399884, + "grad_norm": 0.12142283469438553, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 186170 + }, + { + "epoch": 0.7086470315081111, + "grad_norm": 0.12142530828714371, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 186180 + }, + { + "epoch": 0.7086850939762338, + "grad_norm": 0.12393418699502945, + "learning_rate": 0.0005, + "loss": 2.0839, + "step": 186190 + }, + { + "epoch": 0.7087231564443565, + "grad_norm": 0.11970370262861252, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 186200 + }, + { + "epoch": 0.7087612189124791, + "grad_norm": 0.1194053664803505, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 186210 + }, + { + "epoch": 0.7087992813806019, + "grad_norm": 0.13109378516674042, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 186220 + }, + { + "epoch": 0.7088373438487245, + "grad_norm": 0.14906781911849976, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 186230 + }, + { + "epoch": 0.7088754063168472, + "grad_norm": 0.12335547059774399, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 186240 + }, + { + "epoch": 0.7089134687849699, + "grad_norm": 0.12918975949287415, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 186250 + }, + { + "epoch": 0.7089515312530926, + "grad_norm": 0.11700023710727692, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 186260 + }, + { + "epoch": 0.7089895937212153, + "grad_norm": 0.12088143825531006, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 186270 + }, + { + "epoch": 0.7090276561893379, + "grad_norm": 0.14303064346313477, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 186280 + }, + { + "epoch": 0.7090657186574606, + "grad_norm": 0.12583290040493011, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 186290 + }, + { + "epoch": 0.7091037811255833, + "grad_norm": 0.13083286583423615, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 186300 + }, + { + "epoch": 0.709141843593706, + "grad_norm": 0.11844097822904587, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 186310 + }, + { + "epoch": 0.7091799060618287, + "grad_norm": 0.1358223557472229, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 186320 + }, + { + "epoch": 0.7092179685299513, + "grad_norm": 0.12576615810394287, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 186330 + }, + { + "epoch": 0.709256030998074, + "grad_norm": 0.1307367980480194, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 186340 + }, + { + "epoch": 0.7092940934661968, + "grad_norm": 0.12038248777389526, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 186350 + }, + { + "epoch": 0.7093321559343194, + "grad_norm": 0.12328119575977325, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 186360 + }, + { + "epoch": 0.7093702184024421, + "grad_norm": 0.12589795887470245, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 186370 + }, + { + "epoch": 0.7094082808705647, + "grad_norm": 0.11715169250965118, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 186380 + }, + { + "epoch": 0.7094463433386875, + "grad_norm": 0.12813791632652283, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 186390 + }, + { + "epoch": 0.7094844058068102, + "grad_norm": 0.17527632415294647, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 186400 + }, + { + "epoch": 0.7095224682749328, + "grad_norm": 0.1268410086631775, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 186410 + }, + { + "epoch": 0.7095605307430555, + "grad_norm": 0.13726426661014557, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 186420 + }, + { + "epoch": 0.7095985932111781, + "grad_norm": 0.1140046939253807, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 186430 + }, + { + "epoch": 0.7096366556793009, + "grad_norm": 0.11753685027360916, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 186440 + }, + { + "epoch": 0.7096747181474236, + "grad_norm": 0.12714707851409912, + "learning_rate": 0.0005, + "loss": 2.0882, + "step": 186450 + }, + { + "epoch": 0.7097127806155462, + "grad_norm": 0.12958048284053802, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 186460 + }, + { + "epoch": 0.7097508430836689, + "grad_norm": 0.13289125263690948, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 186470 + }, + { + "epoch": 0.7097889055517916, + "grad_norm": 0.1325511336326599, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 186480 + }, + { + "epoch": 0.7098269680199143, + "grad_norm": 0.11752146482467651, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 186490 + }, + { + "epoch": 0.709865030488037, + "grad_norm": 0.11792637407779694, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 186500 + }, + { + "epoch": 0.7099030929561596, + "grad_norm": 0.12621088325977325, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 186510 + }, + { + "epoch": 0.7099411554242824, + "grad_norm": 0.12432099878787994, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 186520 + }, + { + "epoch": 0.709979217892405, + "grad_norm": 0.13783694803714752, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 186530 + }, + { + "epoch": 0.7100172803605277, + "grad_norm": 0.12313029170036316, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 186540 + }, + { + "epoch": 0.7100553428286503, + "grad_norm": 0.15528278052806854, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 186550 + }, + { + "epoch": 0.7100934052967731, + "grad_norm": 0.1190701350569725, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 186560 + }, + { + "epoch": 0.7101314677648958, + "grad_norm": 0.12318447232246399, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 186570 + }, + { + "epoch": 0.7101695302330184, + "grad_norm": 0.12099643796682358, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 186580 + }, + { + "epoch": 0.7102075927011411, + "grad_norm": 0.12311971932649612, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 186590 + }, + { + "epoch": 0.7102456551692637, + "grad_norm": 0.12411779165267944, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 186600 + }, + { + "epoch": 0.7102837176373865, + "grad_norm": 0.12775003910064697, + "learning_rate": 0.0005, + "loss": 2.1279, + "step": 186610 + }, + { + "epoch": 0.7103217801055092, + "grad_norm": 0.11959369480609894, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 186620 + }, + { + "epoch": 0.7103598425736318, + "grad_norm": 0.13204076886177063, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 186630 + }, + { + "epoch": 0.7103979050417545, + "grad_norm": 0.12158135324716568, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 186640 + }, + { + "epoch": 0.7104359675098773, + "grad_norm": 0.11938668042421341, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 186650 + }, + { + "epoch": 0.7104740299779999, + "grad_norm": 0.11448933929204941, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 186660 + }, + { + "epoch": 0.7105120924461226, + "grad_norm": 0.11393057554960251, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 186670 + }, + { + "epoch": 0.7105501549142452, + "grad_norm": 0.12316378206014633, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 186680 + }, + { + "epoch": 0.710588217382368, + "grad_norm": 0.11819449812173843, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 186690 + }, + { + "epoch": 0.7106262798504906, + "grad_norm": 0.12064626067876816, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 186700 + }, + { + "epoch": 0.7106643423186133, + "grad_norm": 0.1182859018445015, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 186710 + }, + { + "epoch": 0.710702404786736, + "grad_norm": 0.12516747415065765, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 186720 + }, + { + "epoch": 0.7107404672548586, + "grad_norm": 0.13327202200889587, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 186730 + }, + { + "epoch": 0.7107785297229814, + "grad_norm": 0.12220057845115662, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 186740 + }, + { + "epoch": 0.710816592191104, + "grad_norm": 0.12841612100601196, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 186750 + }, + { + "epoch": 0.7108546546592267, + "grad_norm": 0.11224877089262009, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 186760 + }, + { + "epoch": 0.7108927171273494, + "grad_norm": 0.12116874754428864, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 186770 + }, + { + "epoch": 0.7109307795954721, + "grad_norm": 0.1130293607711792, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 186780 + }, + { + "epoch": 0.7109688420635948, + "grad_norm": 0.12867222726345062, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 186790 + }, + { + "epoch": 0.7110069045317174, + "grad_norm": 0.12083183974027634, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 186800 + }, + { + "epoch": 0.7110449669998401, + "grad_norm": 0.11976433545351028, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 186810 + }, + { + "epoch": 0.7110830294679629, + "grad_norm": 0.1157735213637352, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 186820 + }, + { + "epoch": 0.7111210919360855, + "grad_norm": 0.12680009007453918, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 186830 + }, + { + "epoch": 0.7111591544042082, + "grad_norm": 0.13108272850513458, + "learning_rate": 0.0005, + "loss": 2.0836, + "step": 186840 + }, + { + "epoch": 0.7111972168723308, + "grad_norm": 0.11942317336797714, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 186850 + }, + { + "epoch": 0.7112352793404536, + "grad_norm": 0.12562249600887299, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 186860 + }, + { + "epoch": 0.7112733418085763, + "grad_norm": 0.13094764947891235, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 186870 + }, + { + "epoch": 0.7113114042766989, + "grad_norm": 0.14853011071681976, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 186880 + }, + { + "epoch": 0.7113494667448216, + "grad_norm": 0.21083134412765503, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 186890 + }, + { + "epoch": 0.7113875292129442, + "grad_norm": 0.12516815960407257, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 186900 + }, + { + "epoch": 0.711425591681067, + "grad_norm": 0.11651566624641418, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 186910 + }, + { + "epoch": 0.7114636541491897, + "grad_norm": 0.11329130083322525, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 186920 + }, + { + "epoch": 0.7115017166173123, + "grad_norm": 0.12068556994199753, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 186930 + }, + { + "epoch": 0.711539779085435, + "grad_norm": 0.12598134577274323, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 186940 + }, + { + "epoch": 0.7115778415535577, + "grad_norm": 0.11477117985486984, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 186950 + }, + { + "epoch": 0.7116159040216804, + "grad_norm": 0.13626538217067719, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 186960 + }, + { + "epoch": 0.7116539664898031, + "grad_norm": 0.13596713542938232, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 186970 + }, + { + "epoch": 0.7116920289579257, + "grad_norm": 0.12076553702354431, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 186980 + }, + { + "epoch": 0.7117300914260485, + "grad_norm": 0.12414637953042984, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 186990 + }, + { + "epoch": 0.7117681538941711, + "grad_norm": 0.11907530575990677, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 187000 + }, + { + "epoch": 0.7118062163622938, + "grad_norm": 0.11883730441331863, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 187010 + }, + { + "epoch": 0.7118442788304165, + "grad_norm": 0.11856499314308167, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 187020 + }, + { + "epoch": 0.7118823412985391, + "grad_norm": 0.1154351681470871, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 187030 + }, + { + "epoch": 0.7119204037666619, + "grad_norm": 0.11441994458436966, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 187040 + }, + { + "epoch": 0.7119584662347845, + "grad_norm": 0.12784643471240997, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 187050 + }, + { + "epoch": 0.7119965287029072, + "grad_norm": 0.138327956199646, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 187060 + }, + { + "epoch": 0.7120345911710299, + "grad_norm": 0.13243740797042847, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 187070 + }, + { + "epoch": 0.7120726536391526, + "grad_norm": 0.11836738139390945, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 187080 + }, + { + "epoch": 0.7121107161072753, + "grad_norm": 0.12716001272201538, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 187090 + }, + { + "epoch": 0.7121487785753979, + "grad_norm": 0.14492060244083405, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 187100 + }, + { + "epoch": 0.7121868410435206, + "grad_norm": 0.1620989590883255, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 187110 + }, + { + "epoch": 0.7122249035116434, + "grad_norm": 0.11282233148813248, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 187120 + }, + { + "epoch": 0.712262965979766, + "grad_norm": 0.1469227373600006, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 187130 + }, + { + "epoch": 0.7123010284478887, + "grad_norm": 0.13612151145935059, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 187140 + }, + { + "epoch": 0.7123390909160113, + "grad_norm": 0.12967805564403534, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 187150 + }, + { + "epoch": 0.712377153384134, + "grad_norm": 0.14889055490493774, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 187160 + }, + { + "epoch": 0.7124152158522568, + "grad_norm": 0.12575414776802063, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 187170 + }, + { + "epoch": 0.7124532783203794, + "grad_norm": 0.12359946966171265, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 187180 + }, + { + "epoch": 0.7124913407885021, + "grad_norm": 0.12194182723760605, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 187190 + }, + { + "epoch": 0.7125294032566247, + "grad_norm": 0.12410972267389297, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 187200 + }, + { + "epoch": 0.7125674657247475, + "grad_norm": 0.11400230973958969, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 187210 + }, + { + "epoch": 0.7126055281928702, + "grad_norm": 0.11606621742248535, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 187220 + }, + { + "epoch": 0.7126435906609928, + "grad_norm": 0.12606237828731537, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 187230 + }, + { + "epoch": 0.7126816531291155, + "grad_norm": 0.1324600726366043, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 187240 + }, + { + "epoch": 0.7127197155972382, + "grad_norm": 0.7168033719062805, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 187250 + }, + { + "epoch": 0.7127577780653609, + "grad_norm": 0.11666694283485413, + "learning_rate": 0.0005, + "loss": 2.0817, + "step": 187260 + }, + { + "epoch": 0.7127958405334835, + "grad_norm": 0.1246514767408371, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 187270 + }, + { + "epoch": 0.7128339030016062, + "grad_norm": 0.11888870596885681, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 187280 + }, + { + "epoch": 0.712871965469729, + "grad_norm": 0.13312821090221405, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 187290 + }, + { + "epoch": 0.7129100279378516, + "grad_norm": 0.12242951989173889, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 187300 + }, + { + "epoch": 0.7129480904059743, + "grad_norm": 0.1219000369310379, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 187310 + }, + { + "epoch": 0.712986152874097, + "grad_norm": 0.1179623156785965, + "learning_rate": 0.0005, + "loss": 2.1363, + "step": 187320 + }, + { + "epoch": 0.7130242153422196, + "grad_norm": 0.12434601038694382, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 187330 + }, + { + "epoch": 0.7130622778103424, + "grad_norm": 0.11475856602191925, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 187340 + }, + { + "epoch": 0.713100340278465, + "grad_norm": 0.11931030452251434, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 187350 + }, + { + "epoch": 0.7131384027465877, + "grad_norm": 0.13195136189460754, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 187360 + }, + { + "epoch": 0.7131764652147103, + "grad_norm": 0.11491978168487549, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 187370 + }, + { + "epoch": 0.7132145276828331, + "grad_norm": 0.13336238265037537, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 187380 + }, + { + "epoch": 0.7132525901509558, + "grad_norm": 0.11675713211297989, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 187390 + }, + { + "epoch": 0.7132906526190784, + "grad_norm": 0.11502215266227722, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 187400 + }, + { + "epoch": 0.7133287150872011, + "grad_norm": 0.12776829302310944, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 187410 + }, + { + "epoch": 0.7133667775553238, + "grad_norm": 0.12221261858940125, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 187420 + }, + { + "epoch": 0.7134048400234465, + "grad_norm": 0.12141124904155731, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 187430 + }, + { + "epoch": 0.7134429024915692, + "grad_norm": 0.12151667475700378, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 187440 + }, + { + "epoch": 0.7134809649596918, + "grad_norm": 0.12741494178771973, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 187450 + }, + { + "epoch": 0.7135190274278145, + "grad_norm": 0.13373515009880066, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 187460 + }, + { + "epoch": 0.7135570898959372, + "grad_norm": 0.12541289627552032, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 187470 + }, + { + "epoch": 0.7135951523640599, + "grad_norm": 0.13134725391864777, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 187480 + }, + { + "epoch": 0.7136332148321826, + "grad_norm": 0.12061105668544769, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 187490 + }, + { + "epoch": 0.7136712773003052, + "grad_norm": 0.12639255821704865, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 187500 + }, + { + "epoch": 0.713709339768428, + "grad_norm": 0.12902987003326416, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 187510 + }, + { + "epoch": 0.7137474022365506, + "grad_norm": 0.12578243017196655, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 187520 + }, + { + "epoch": 0.7137854647046733, + "grad_norm": 0.11344828456640244, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 187530 + }, + { + "epoch": 0.713823527172796, + "grad_norm": 0.13546188175678253, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 187540 + }, + { + "epoch": 0.7138615896409187, + "grad_norm": 0.12519478797912598, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 187550 + }, + { + "epoch": 0.7138996521090414, + "grad_norm": 0.12113405764102936, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 187560 + }, + { + "epoch": 0.713937714577164, + "grad_norm": 0.13216333091259003, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 187570 + }, + { + "epoch": 0.7139757770452867, + "grad_norm": 0.13803592324256897, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 187580 + }, + { + "epoch": 0.7140138395134094, + "grad_norm": 0.12262871116399765, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 187590 + }, + { + "epoch": 0.7140519019815321, + "grad_norm": 0.12475186586380005, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 187600 + }, + { + "epoch": 0.7140899644496548, + "grad_norm": 0.1186951994895935, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 187610 + }, + { + "epoch": 0.7141280269177774, + "grad_norm": 0.12116733193397522, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 187620 + }, + { + "epoch": 0.7141660893859001, + "grad_norm": 0.12256205081939697, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 187630 + }, + { + "epoch": 0.7142041518540229, + "grad_norm": 0.122563935816288, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 187640 + }, + { + "epoch": 0.7142422143221455, + "grad_norm": 0.1201736181974411, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 187650 + }, + { + "epoch": 0.7142802767902682, + "grad_norm": 0.11947773396968842, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 187660 + }, + { + "epoch": 0.7143183392583908, + "grad_norm": 0.11251599341630936, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 187670 + }, + { + "epoch": 0.7143564017265136, + "grad_norm": 0.11396802216768265, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 187680 + }, + { + "epoch": 0.7143944641946363, + "grad_norm": 0.1160028800368309, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 187690 + }, + { + "epoch": 0.7144325266627589, + "grad_norm": 0.11407670378684998, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 187700 + }, + { + "epoch": 0.7144705891308816, + "grad_norm": 0.11758105456829071, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 187710 + }, + { + "epoch": 0.7145086515990043, + "grad_norm": 0.12069651484489441, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 187720 + }, + { + "epoch": 0.714546714067127, + "grad_norm": 0.12910427153110504, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 187730 + }, + { + "epoch": 0.7145847765352497, + "grad_norm": 0.12105337530374527, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 187740 + }, + { + "epoch": 0.7146228390033723, + "grad_norm": 0.1282321959733963, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 187750 + }, + { + "epoch": 0.714660901471495, + "grad_norm": 0.1256188601255417, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 187760 + }, + { + "epoch": 0.7146989639396177, + "grad_norm": 0.12741422653198242, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 187770 + }, + { + "epoch": 0.7147370264077404, + "grad_norm": 0.12263234704732895, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 187780 + }, + { + "epoch": 0.714775088875863, + "grad_norm": 0.12101583927869797, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 187790 + }, + { + "epoch": 0.7148131513439857, + "grad_norm": 0.13574911653995514, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 187800 + }, + { + "epoch": 0.7148512138121085, + "grad_norm": 0.12082146853208542, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 187810 + }, + { + "epoch": 0.7148892762802311, + "grad_norm": 0.12578290700912476, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 187820 + }, + { + "epoch": 0.7149273387483538, + "grad_norm": 0.12050847709178925, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 187830 + }, + { + "epoch": 0.7149654012164764, + "grad_norm": 0.122773677110672, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 187840 + }, + { + "epoch": 0.7150034636845992, + "grad_norm": 0.11656002700328827, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 187850 + }, + { + "epoch": 0.7150415261527219, + "grad_norm": 0.1179434210062027, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 187860 + }, + { + "epoch": 0.7150795886208445, + "grad_norm": 0.11811520904302597, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 187870 + }, + { + "epoch": 0.7151176510889672, + "grad_norm": 0.12294347584247589, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 187880 + }, + { + "epoch": 0.7151557135570898, + "grad_norm": 0.13415418565273285, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 187890 + }, + { + "epoch": 0.7151937760252126, + "grad_norm": 0.11592666804790497, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 187900 + }, + { + "epoch": 0.7152318384933353, + "grad_norm": 0.1220850721001625, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 187910 + }, + { + "epoch": 0.7152699009614579, + "grad_norm": 0.1307125687599182, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 187920 + }, + { + "epoch": 0.7153079634295806, + "grad_norm": 0.13731436431407928, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 187930 + }, + { + "epoch": 0.7153460258977034, + "grad_norm": 0.1312839537858963, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 187940 + }, + { + "epoch": 0.715384088365826, + "grad_norm": 0.13992613554000854, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 187950 + }, + { + "epoch": 0.7154221508339487, + "grad_norm": 0.11659088730812073, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 187960 + }, + { + "epoch": 0.7154602133020713, + "grad_norm": 0.12068942189216614, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 187970 + }, + { + "epoch": 0.7154982757701941, + "grad_norm": 0.1227799654006958, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 187980 + }, + { + "epoch": 0.7155363382383167, + "grad_norm": 0.12143576890230179, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 187990 + }, + { + "epoch": 0.7155744007064394, + "grad_norm": 0.11767726391553879, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 188000 + }, + { + "epoch": 0.7156124631745621, + "grad_norm": 0.12173023074865341, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 188010 + }, + { + "epoch": 0.7156505256426847, + "grad_norm": 0.1280447244644165, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 188020 + }, + { + "epoch": 0.7156885881108075, + "grad_norm": 0.13191145658493042, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 188030 + }, + { + "epoch": 0.7157266505789301, + "grad_norm": 0.13750119507312775, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 188040 + }, + { + "epoch": 0.7157647130470528, + "grad_norm": 0.12749455869197845, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 188050 + }, + { + "epoch": 0.7158027755151755, + "grad_norm": 0.1899019479751587, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 188060 + }, + { + "epoch": 0.7158408379832982, + "grad_norm": 0.1267443299293518, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 188070 + }, + { + "epoch": 0.7158789004514209, + "grad_norm": 0.12289052456617355, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 188080 + }, + { + "epoch": 0.7159169629195435, + "grad_norm": 0.1474115550518036, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 188090 + }, + { + "epoch": 0.7159550253876662, + "grad_norm": 0.12440325319766998, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 188100 + }, + { + "epoch": 0.715993087855789, + "grad_norm": 0.13077470660209656, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 188110 + }, + { + "epoch": 0.7160311503239116, + "grad_norm": 0.11480049043893814, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 188120 + }, + { + "epoch": 0.7160692127920343, + "grad_norm": 0.11923867464065552, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 188130 + }, + { + "epoch": 0.7161072752601569, + "grad_norm": 0.1369302123785019, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 188140 + }, + { + "epoch": 0.7161453377282797, + "grad_norm": 0.13814698159694672, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 188150 + }, + { + "epoch": 0.7161834001964024, + "grad_norm": 0.13006384670734406, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 188160 + }, + { + "epoch": 0.716221462664525, + "grad_norm": 0.12163983285427094, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 188170 + }, + { + "epoch": 0.7162595251326477, + "grad_norm": 0.12530353665351868, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 188180 + }, + { + "epoch": 0.7162975876007703, + "grad_norm": 0.12386982887983322, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 188190 + }, + { + "epoch": 0.7163356500688931, + "grad_norm": 0.12685070931911469, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 188200 + }, + { + "epoch": 0.7163737125370158, + "grad_norm": 0.4660695791244507, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 188210 + }, + { + "epoch": 0.7164117750051384, + "grad_norm": 0.1478555053472519, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 188220 + }, + { + "epoch": 0.7164498374732611, + "grad_norm": 0.12359131127595901, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 188230 + }, + { + "epoch": 0.7164878999413838, + "grad_norm": 0.12482161074876785, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 188240 + }, + { + "epoch": 0.7165259624095065, + "grad_norm": 0.14276772737503052, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 188250 + }, + { + "epoch": 0.7165640248776292, + "grad_norm": 0.12220495939254761, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 188260 + }, + { + "epoch": 0.7166020873457518, + "grad_norm": 0.12394607067108154, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 188270 + }, + { + "epoch": 0.7166401498138746, + "grad_norm": 0.13645556569099426, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 188280 + }, + { + "epoch": 0.7166782122819972, + "grad_norm": 0.12253500521183014, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 188290 + }, + { + "epoch": 0.7167162747501199, + "grad_norm": 0.12891119718551636, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 188300 + }, + { + "epoch": 0.7167543372182426, + "grad_norm": 0.1140003502368927, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 188310 + }, + { + "epoch": 0.7167923996863652, + "grad_norm": 0.11547480523586273, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 188320 + }, + { + "epoch": 0.716830462154488, + "grad_norm": 0.10986243188381195, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 188330 + }, + { + "epoch": 0.7168685246226106, + "grad_norm": 0.14526185393333435, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 188340 + }, + { + "epoch": 0.7169065870907333, + "grad_norm": 0.23593851923942566, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 188350 + }, + { + "epoch": 0.716944649558856, + "grad_norm": 0.1532217115163803, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 188360 + }, + { + "epoch": 0.7169827120269787, + "grad_norm": 0.12344641238451004, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 188370 + }, + { + "epoch": 0.7170207744951014, + "grad_norm": 0.12098667770624161, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 188380 + }, + { + "epoch": 0.717058836963224, + "grad_norm": 0.1281360387802124, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 188390 + }, + { + "epoch": 0.7170968994313467, + "grad_norm": 0.10796024650335312, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 188400 + }, + { + "epoch": 0.7171349618994695, + "grad_norm": 0.12044819444417953, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 188410 + }, + { + "epoch": 0.7171730243675921, + "grad_norm": 0.11701665073633194, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 188420 + }, + { + "epoch": 0.7172110868357148, + "grad_norm": 0.12460478395223618, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 188430 + }, + { + "epoch": 0.7172491493038374, + "grad_norm": 0.13054977357387543, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 188440 + }, + { + "epoch": 0.7172872117719601, + "grad_norm": 0.11797840893268585, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 188450 + }, + { + "epoch": 0.7173252742400829, + "grad_norm": 0.1306247115135193, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 188460 + }, + { + "epoch": 0.7173633367082055, + "grad_norm": 0.13063837587833405, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 188470 + }, + { + "epoch": 0.7174013991763282, + "grad_norm": 0.12942704558372498, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 188480 + }, + { + "epoch": 0.7174394616444508, + "grad_norm": 0.11970430612564087, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 188490 + }, + { + "epoch": 0.7174775241125736, + "grad_norm": 0.13363367319107056, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 188500 + }, + { + "epoch": 0.7175155865806963, + "grad_norm": 0.13914057612419128, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 188510 + }, + { + "epoch": 0.7175536490488189, + "grad_norm": 0.12554246187210083, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 188520 + }, + { + "epoch": 0.7175917115169416, + "grad_norm": 0.12300821393728256, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 188530 + }, + { + "epoch": 0.7176297739850643, + "grad_norm": 0.11722347140312195, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 188540 + }, + { + "epoch": 0.717667836453187, + "grad_norm": 0.11022867262363434, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 188550 + }, + { + "epoch": 0.7177058989213096, + "grad_norm": 0.13012823462486267, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 188560 + }, + { + "epoch": 0.7177439613894323, + "grad_norm": 0.12405557930469513, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 188570 + }, + { + "epoch": 0.7177820238575551, + "grad_norm": 0.11842475086450577, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 188580 + }, + { + "epoch": 0.7178200863256777, + "grad_norm": 0.11545991897583008, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 188590 + }, + { + "epoch": 0.7178581487938004, + "grad_norm": 0.11615921556949615, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 188600 + }, + { + "epoch": 0.717896211261923, + "grad_norm": 0.11555454134941101, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 188610 + }, + { + "epoch": 0.7179342737300457, + "grad_norm": 0.13142219185829163, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 188620 + }, + { + "epoch": 0.7179723361981685, + "grad_norm": 0.13296760618686676, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 188630 + }, + { + "epoch": 0.7180103986662911, + "grad_norm": 0.12703141570091248, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 188640 + }, + { + "epoch": 0.7180484611344138, + "grad_norm": 0.12670379877090454, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 188650 + }, + { + "epoch": 0.7180865236025364, + "grad_norm": 0.12041231989860535, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 188660 + }, + { + "epoch": 0.7181245860706592, + "grad_norm": 0.13363684713840485, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 188670 + }, + { + "epoch": 0.7181626485387819, + "grad_norm": 0.12533117830753326, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 188680 + }, + { + "epoch": 0.7182007110069045, + "grad_norm": 0.12696968019008636, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 188690 + }, + { + "epoch": 0.7182387734750272, + "grad_norm": 0.11952722817659378, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 188700 + }, + { + "epoch": 0.71827683594315, + "grad_norm": 0.11573243141174316, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 188710 + }, + { + "epoch": 0.7183148984112726, + "grad_norm": 0.11628570407629013, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 188720 + }, + { + "epoch": 0.7183529608793953, + "grad_norm": 0.12123718857765198, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 188730 + }, + { + "epoch": 0.7183910233475179, + "grad_norm": 0.14781494438648224, + "learning_rate": 0.0005, + "loss": 2.0823, + "step": 188740 + }, + { + "epoch": 0.7184290858156406, + "grad_norm": 0.1224733367562294, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 188750 + }, + { + "epoch": 0.7184671482837633, + "grad_norm": 0.13327038288116455, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 188760 + }, + { + "epoch": 0.718505210751886, + "grad_norm": 0.13257130980491638, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 188770 + }, + { + "epoch": 0.7185432732200087, + "grad_norm": 0.11904368549585342, + "learning_rate": 0.0005, + "loss": 2.0859, + "step": 188780 + }, + { + "epoch": 0.7185813356881313, + "grad_norm": 0.1520932912826538, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 188790 + }, + { + "epoch": 0.7186193981562541, + "grad_norm": 0.11274637281894684, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 188800 + }, + { + "epoch": 0.7186574606243767, + "grad_norm": 0.11699338257312775, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 188810 + }, + { + "epoch": 0.7186955230924994, + "grad_norm": 0.1301192194223404, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 188820 + }, + { + "epoch": 0.7187335855606221, + "grad_norm": 0.12862426042556763, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 188830 + }, + { + "epoch": 0.7187716480287448, + "grad_norm": 0.1355823129415512, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 188840 + }, + { + "epoch": 0.7188097104968675, + "grad_norm": 0.1313876509666443, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 188850 + }, + { + "epoch": 0.7188477729649901, + "grad_norm": 0.14267843961715698, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 188860 + }, + { + "epoch": 0.7188858354331128, + "grad_norm": 0.12998761236667633, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 188870 + }, + { + "epoch": 0.7189238979012355, + "grad_norm": 0.1305166333913803, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 188880 + }, + { + "epoch": 0.7189619603693582, + "grad_norm": 0.11812318861484528, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 188890 + }, + { + "epoch": 0.7190000228374809, + "grad_norm": 0.13159221410751343, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 188900 + }, + { + "epoch": 0.7190380853056035, + "grad_norm": 0.11888105422258377, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 188910 + }, + { + "epoch": 0.7190761477737262, + "grad_norm": 0.12733519077301025, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 188920 + }, + { + "epoch": 0.719114210241849, + "grad_norm": 0.13240741193294525, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 188930 + }, + { + "epoch": 0.7191522727099716, + "grad_norm": 0.11451985687017441, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 188940 + }, + { + "epoch": 0.7191903351780943, + "grad_norm": 0.12244334071874619, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 188950 + }, + { + "epoch": 0.7192283976462169, + "grad_norm": 0.12746724486351013, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 188960 + }, + { + "epoch": 0.7192664601143397, + "grad_norm": 0.1367591917514801, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 188970 + }, + { + "epoch": 0.7193045225824624, + "grad_norm": 0.12524889409542084, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 188980 + }, + { + "epoch": 0.719342585050585, + "grad_norm": 0.12255341559648514, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 188990 + }, + { + "epoch": 0.7193806475187077, + "grad_norm": 0.12615957856178284, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 189000 + }, + { + "epoch": 0.7194187099868304, + "grad_norm": 0.13030847907066345, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 189010 + }, + { + "epoch": 0.7194567724549531, + "grad_norm": 0.12409580498933792, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 189020 + }, + { + "epoch": 0.7194948349230758, + "grad_norm": 0.12422270327806473, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 189030 + }, + { + "epoch": 0.7195328973911984, + "grad_norm": 0.11482467502355576, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 189040 + }, + { + "epoch": 0.7195709598593211, + "grad_norm": 0.1306055337190628, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 189050 + }, + { + "epoch": 0.7196090223274438, + "grad_norm": 0.11800495535135269, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 189060 + }, + { + "epoch": 0.7196470847955665, + "grad_norm": 0.12688249349594116, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 189070 + }, + { + "epoch": 0.7196851472636892, + "grad_norm": 0.1269036829471588, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 189080 + }, + { + "epoch": 0.7197232097318118, + "grad_norm": 0.12588876485824585, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 189090 + }, + { + "epoch": 0.7197612721999346, + "grad_norm": 0.12352360039949417, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 189100 + }, + { + "epoch": 0.7197993346680572, + "grad_norm": 0.11827840656042099, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 189110 + }, + { + "epoch": 0.7198373971361799, + "grad_norm": 0.11889263242483139, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 189120 + }, + { + "epoch": 0.7198754596043025, + "grad_norm": 0.11715391278266907, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 189130 + }, + { + "epoch": 0.7199135220724253, + "grad_norm": 0.11334909498691559, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 189140 + }, + { + "epoch": 0.719951584540548, + "grad_norm": 0.12664197385311127, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 189150 + }, + { + "epoch": 0.7199896470086706, + "grad_norm": 0.11824624985456467, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 189160 + }, + { + "epoch": 0.7200277094767933, + "grad_norm": 0.12083036452531815, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 189170 + }, + { + "epoch": 0.7200657719449159, + "grad_norm": 0.13151566684246063, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 189180 + }, + { + "epoch": 0.7201038344130387, + "grad_norm": 0.12442060559988022, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 189190 + }, + { + "epoch": 0.7201418968811614, + "grad_norm": 0.1193741112947464, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 189200 + }, + { + "epoch": 0.720179959349284, + "grad_norm": 0.1237751692533493, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 189210 + }, + { + "epoch": 0.7202180218174067, + "grad_norm": 0.1314356029033661, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 189220 + }, + { + "epoch": 0.7202560842855295, + "grad_norm": 0.1298057585954666, + "learning_rate": 0.0005, + "loss": 2.1294, + "step": 189230 + }, + { + "epoch": 0.7202941467536521, + "grad_norm": 0.13816192746162415, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 189240 + }, + { + "epoch": 0.7203322092217748, + "grad_norm": 0.11258254200220108, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 189250 + }, + { + "epoch": 0.7203702716898974, + "grad_norm": 0.12321577966213226, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 189260 + }, + { + "epoch": 0.7204083341580202, + "grad_norm": 0.12666542828083038, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 189270 + }, + { + "epoch": 0.7204463966261428, + "grad_norm": 0.12397710978984833, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 189280 + }, + { + "epoch": 0.7204844590942655, + "grad_norm": 0.12449897825717926, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 189290 + }, + { + "epoch": 0.7205225215623882, + "grad_norm": 0.11226168274879456, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 189300 + }, + { + "epoch": 0.7205605840305108, + "grad_norm": 0.12319658696651459, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 189310 + }, + { + "epoch": 0.7205986464986336, + "grad_norm": 0.12847982347011566, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 189320 + }, + { + "epoch": 0.7206367089667562, + "grad_norm": 0.11298822611570358, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 189330 + }, + { + "epoch": 0.7206747714348789, + "grad_norm": 0.12321088463068008, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 189340 + }, + { + "epoch": 0.7207128339030016, + "grad_norm": 0.12383893132209778, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 189350 + }, + { + "epoch": 0.7207508963711243, + "grad_norm": 0.12227057665586472, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 189360 + }, + { + "epoch": 0.720788958839247, + "grad_norm": 0.12617835402488708, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 189370 + }, + { + "epoch": 0.7208270213073696, + "grad_norm": 0.1312699317932129, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 189380 + }, + { + "epoch": 0.7208650837754923, + "grad_norm": 0.12388839572668076, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 189390 + }, + { + "epoch": 0.7209031462436151, + "grad_norm": 0.14507871866226196, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 189400 + }, + { + "epoch": 0.7209412087117377, + "grad_norm": 0.11744916439056396, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 189410 + }, + { + "epoch": 0.7209792711798604, + "grad_norm": 0.12581637501716614, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 189420 + }, + { + "epoch": 0.721017333647983, + "grad_norm": 0.1203906461596489, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 189430 + }, + { + "epoch": 0.7210553961161058, + "grad_norm": 0.12365449219942093, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 189440 + }, + { + "epoch": 0.7210934585842285, + "grad_norm": 0.11525920778512955, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 189450 + }, + { + "epoch": 0.7211315210523511, + "grad_norm": 0.11574247479438782, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 189460 + }, + { + "epoch": 0.7211695835204738, + "grad_norm": 0.1328197568655014, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 189470 + }, + { + "epoch": 0.7212076459885964, + "grad_norm": 0.14724650979042053, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 189480 + }, + { + "epoch": 0.7212457084567192, + "grad_norm": 0.12958109378814697, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 189490 + }, + { + "epoch": 0.7212837709248419, + "grad_norm": 0.13805188238620758, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 189500 + }, + { + "epoch": 0.7213218333929645, + "grad_norm": 0.1344052255153656, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 189510 + }, + { + "epoch": 0.7213598958610872, + "grad_norm": 0.12882310152053833, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 189520 + }, + { + "epoch": 0.7213979583292099, + "grad_norm": 0.13333013653755188, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 189530 + }, + { + "epoch": 0.7214360207973326, + "grad_norm": 0.11692256480455399, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 189540 + }, + { + "epoch": 0.7214740832654553, + "grad_norm": 0.12592947483062744, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 189550 + }, + { + "epoch": 0.7215121457335779, + "grad_norm": 0.13801392912864685, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 189560 + }, + { + "epoch": 0.7215502082017007, + "grad_norm": 0.12887591123580933, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 189570 + }, + { + "epoch": 0.7215882706698233, + "grad_norm": 0.14320510625839233, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 189580 + }, + { + "epoch": 0.721626333137946, + "grad_norm": 0.12209837138652802, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 189590 + }, + { + "epoch": 0.7216643956060687, + "grad_norm": 0.12583830952644348, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 189600 + }, + { + "epoch": 0.7217024580741913, + "grad_norm": 0.12144199758768082, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 189610 + }, + { + "epoch": 0.7217405205423141, + "grad_norm": 0.11988362669944763, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 189620 + }, + { + "epoch": 0.7217785830104367, + "grad_norm": 0.14672856032848358, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 189630 + }, + { + "epoch": 0.7218166454785594, + "grad_norm": 0.12737524509429932, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 189640 + }, + { + "epoch": 0.721854707946682, + "grad_norm": 0.11321786791086197, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 189650 + }, + { + "epoch": 0.7218927704148048, + "grad_norm": 0.11814183741807938, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 189660 + }, + { + "epoch": 0.7219308328829275, + "grad_norm": 0.11926648765802383, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 189670 + }, + { + "epoch": 0.7219688953510501, + "grad_norm": 0.1263626515865326, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 189680 + }, + { + "epoch": 0.7220069578191728, + "grad_norm": 0.12267031520605087, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 189690 + }, + { + "epoch": 0.7220450202872956, + "grad_norm": 0.13086408376693726, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 189700 + }, + { + "epoch": 0.7220830827554182, + "grad_norm": 0.13127847015857697, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 189710 + }, + { + "epoch": 0.7221211452235409, + "grad_norm": 0.10789002478122711, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 189720 + }, + { + "epoch": 0.7221592076916635, + "grad_norm": 0.12119221687316895, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 189730 + }, + { + "epoch": 0.7221972701597862, + "grad_norm": 0.12627781927585602, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 189740 + }, + { + "epoch": 0.722235332627909, + "grad_norm": 0.1297757923603058, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 189750 + }, + { + "epoch": 0.7222733950960316, + "grad_norm": 0.12928855419158936, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 189760 + }, + { + "epoch": 0.7223114575641543, + "grad_norm": 0.12331338226795197, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 189770 + }, + { + "epoch": 0.7223495200322769, + "grad_norm": 0.13063183426856995, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 189780 + }, + { + "epoch": 0.7223875825003997, + "grad_norm": 0.1343780905008316, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 189790 + }, + { + "epoch": 0.7224256449685224, + "grad_norm": 0.13402073085308075, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 189800 + }, + { + "epoch": 0.722463707436645, + "grad_norm": 0.1176358014345169, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 189810 + }, + { + "epoch": 0.7225017699047677, + "grad_norm": 0.11849499493837357, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 189820 + }, + { + "epoch": 0.7225398323728904, + "grad_norm": 0.12364055961370468, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 189830 + }, + { + "epoch": 0.7225778948410131, + "grad_norm": 0.11501021683216095, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 189840 + }, + { + "epoch": 0.7226159573091357, + "grad_norm": 0.13123121857643127, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 189850 + }, + { + "epoch": 0.7226540197772584, + "grad_norm": 0.11838650703430176, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 189860 + }, + { + "epoch": 0.7226920822453812, + "grad_norm": 0.11834557354450226, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 189870 + }, + { + "epoch": 0.7227301447135038, + "grad_norm": 0.1256466507911682, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 189880 + }, + { + "epoch": 0.7227682071816265, + "grad_norm": 0.12792137265205383, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 189890 + }, + { + "epoch": 0.7228062696497491, + "grad_norm": 0.12776750326156616, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 189900 + }, + { + "epoch": 0.7228443321178718, + "grad_norm": 0.12447094172239304, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 189910 + }, + { + "epoch": 0.7228823945859946, + "grad_norm": 0.12594708800315857, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 189920 + }, + { + "epoch": 0.7229204570541172, + "grad_norm": 0.1258392184972763, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 189930 + }, + { + "epoch": 0.7229585195222399, + "grad_norm": 0.12574885785579681, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 189940 + }, + { + "epoch": 0.7229965819903625, + "grad_norm": 0.12300106137990952, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 189950 + }, + { + "epoch": 0.7230346444584853, + "grad_norm": 0.12460119277238846, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 189960 + }, + { + "epoch": 0.723072706926608, + "grad_norm": 0.11818580329418182, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 189970 + }, + { + "epoch": 0.7231107693947306, + "grad_norm": 0.13395582139492035, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 189980 + }, + { + "epoch": 0.7231488318628533, + "grad_norm": 0.1125182956457138, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 189990 + }, + { + "epoch": 0.723186894330976, + "grad_norm": 0.12543977797031403, + "learning_rate": 0.0005, + "loss": 2.1253, + "step": 190000 + }, + { + "epoch": 0.7232249567990987, + "grad_norm": 0.1250716894865036, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 190010 + }, + { + "epoch": 0.7232630192672214, + "grad_norm": 0.12153498083353043, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 190020 + }, + { + "epoch": 0.723301081735344, + "grad_norm": 0.11792191863059998, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 190030 + }, + { + "epoch": 0.7233391442034667, + "grad_norm": 0.12454144656658173, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 190040 + }, + { + "epoch": 0.7233772066715894, + "grad_norm": 0.12839899957180023, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 190050 + }, + { + "epoch": 0.7234152691397121, + "grad_norm": 0.11735977232456207, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 190060 + }, + { + "epoch": 0.7234533316078348, + "grad_norm": 0.11648841947317123, + "learning_rate": 0.0005, + "loss": 2.127, + "step": 190070 + }, + { + "epoch": 0.7234913940759574, + "grad_norm": 0.12635114789009094, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 190080 + }, + { + "epoch": 0.7235294565440802, + "grad_norm": 0.12741799652576447, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 190090 + }, + { + "epoch": 0.7235675190122028, + "grad_norm": 0.12407921254634857, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 190100 + }, + { + "epoch": 0.7236055814803255, + "grad_norm": 0.12719793617725372, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 190110 + }, + { + "epoch": 0.7236436439484482, + "grad_norm": 0.1287856101989746, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 190120 + }, + { + "epoch": 0.7236817064165709, + "grad_norm": 0.1346472203731537, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 190130 + }, + { + "epoch": 0.7237197688846936, + "grad_norm": 0.12088294327259064, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 190140 + }, + { + "epoch": 0.7237578313528162, + "grad_norm": 0.11873350292444229, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 190150 + }, + { + "epoch": 0.7237958938209389, + "grad_norm": 0.12106628715991974, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 190160 + }, + { + "epoch": 0.7238339562890616, + "grad_norm": 0.12756995856761932, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 190170 + }, + { + "epoch": 0.7238720187571843, + "grad_norm": 0.11779794096946716, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 190180 + }, + { + "epoch": 0.723910081225307, + "grad_norm": 0.11814567446708679, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 190190 + }, + { + "epoch": 0.7239481436934296, + "grad_norm": 0.12297229468822479, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 190200 + }, + { + "epoch": 0.7239862061615523, + "grad_norm": 0.1186053529381752, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 190210 + }, + { + "epoch": 0.7240242686296751, + "grad_norm": 0.12631520628929138, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 190220 + }, + { + "epoch": 0.7240623310977977, + "grad_norm": 0.12042009085416794, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 190230 + }, + { + "epoch": 0.7241003935659204, + "grad_norm": 0.12243197858333588, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 190240 + }, + { + "epoch": 0.724138456034043, + "grad_norm": 0.11951422691345215, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 190250 + }, + { + "epoch": 0.7241765185021658, + "grad_norm": 0.12606891989707947, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 190260 + }, + { + "epoch": 0.7242145809702885, + "grad_norm": 0.11739018559455872, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 190270 + }, + { + "epoch": 0.7242526434384111, + "grad_norm": 0.14321762323379517, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 190280 + }, + { + "epoch": 0.7242907059065338, + "grad_norm": 0.11680032312870026, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 190290 + }, + { + "epoch": 0.7243287683746565, + "grad_norm": 0.13240210711956024, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 190300 + }, + { + "epoch": 0.7243668308427792, + "grad_norm": 0.12868797779083252, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 190310 + }, + { + "epoch": 0.7244048933109019, + "grad_norm": 0.1447475254535675, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 190320 + }, + { + "epoch": 0.7244429557790245, + "grad_norm": 0.13167577981948853, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 190330 + }, + { + "epoch": 0.7244810182471472, + "grad_norm": 0.1302758753299713, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 190340 + }, + { + "epoch": 0.7245190807152699, + "grad_norm": 0.12184086441993713, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 190350 + }, + { + "epoch": 0.7245571431833926, + "grad_norm": 0.12463497370481491, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 190360 + }, + { + "epoch": 0.7245952056515153, + "grad_norm": 0.10786112397909164, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 190370 + }, + { + "epoch": 0.7246332681196379, + "grad_norm": 0.1271791160106659, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 190380 + }, + { + "epoch": 0.7246713305877607, + "grad_norm": 0.11974359303712845, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 190390 + }, + { + "epoch": 0.7247093930558833, + "grad_norm": 0.1264609694480896, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 190400 + }, + { + "epoch": 0.724747455524006, + "grad_norm": 0.1227140799164772, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 190410 + }, + { + "epoch": 0.7247855179921286, + "grad_norm": 0.1285555362701416, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 190420 + }, + { + "epoch": 0.7248235804602514, + "grad_norm": 0.12815794348716736, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 190430 + }, + { + "epoch": 0.7248616429283741, + "grad_norm": 0.11281808465719223, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 190440 + }, + { + "epoch": 0.7248997053964967, + "grad_norm": 0.13443569839000702, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 190450 + }, + { + "epoch": 0.7249377678646194, + "grad_norm": 0.13241413235664368, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 190460 + }, + { + "epoch": 0.724975830332742, + "grad_norm": 0.12769834697246552, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 190470 + }, + { + "epoch": 0.7250138928008648, + "grad_norm": 0.12712432444095612, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 190480 + }, + { + "epoch": 0.7250519552689875, + "grad_norm": 0.1333046555519104, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 190490 + }, + { + "epoch": 0.7250900177371101, + "grad_norm": 0.12503378093242645, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 190500 + }, + { + "epoch": 0.7251280802052328, + "grad_norm": 0.12388863414525986, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 190510 + }, + { + "epoch": 0.7251661426733556, + "grad_norm": 0.11568185687065125, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 190520 + }, + { + "epoch": 0.7252042051414782, + "grad_norm": 0.1251184195280075, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 190530 + }, + { + "epoch": 0.7252422676096009, + "grad_norm": 0.1223842203617096, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 190540 + }, + { + "epoch": 0.7252803300777235, + "grad_norm": 0.11854085326194763, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 190550 + }, + { + "epoch": 0.7253183925458463, + "grad_norm": 0.12464988976716995, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 190560 + }, + { + "epoch": 0.725356455013969, + "grad_norm": 0.12290152162313461, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 190570 + }, + { + "epoch": 0.7253945174820916, + "grad_norm": 0.12548936903476715, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 190580 + }, + { + "epoch": 0.7254325799502143, + "grad_norm": 0.13220664858818054, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 190590 + }, + { + "epoch": 0.7254706424183369, + "grad_norm": 0.1262110024690628, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 190600 + }, + { + "epoch": 0.7255087048864597, + "grad_norm": 0.12420913577079773, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 190610 + }, + { + "epoch": 0.7255467673545823, + "grad_norm": 0.13860227167606354, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 190620 + }, + { + "epoch": 0.725584829822705, + "grad_norm": 0.14560693502426147, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 190630 + }, + { + "epoch": 0.7256228922908277, + "grad_norm": 0.13095788657665253, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 190640 + }, + { + "epoch": 0.7256609547589504, + "grad_norm": 0.125588521361351, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 190650 + }, + { + "epoch": 0.7256990172270731, + "grad_norm": 0.13196080923080444, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 190660 + }, + { + "epoch": 0.7257370796951957, + "grad_norm": 0.11772377043962479, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 190670 + }, + { + "epoch": 0.7257751421633184, + "grad_norm": 0.1185615062713623, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 190680 + }, + { + "epoch": 0.7258132046314412, + "grad_norm": 0.12486417591571808, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 190690 + }, + { + "epoch": 0.7258512670995638, + "grad_norm": 0.12780217826366425, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 190700 + }, + { + "epoch": 0.7258893295676865, + "grad_norm": 0.13506446778774261, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 190710 + }, + { + "epoch": 0.7259273920358091, + "grad_norm": 0.11652851849794388, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 190720 + }, + { + "epoch": 0.7259654545039319, + "grad_norm": 0.12922434508800507, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 190730 + }, + { + "epoch": 0.7260035169720546, + "grad_norm": 0.12396050244569778, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 190740 + }, + { + "epoch": 0.7260415794401772, + "grad_norm": 0.15156452357769012, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 190750 + }, + { + "epoch": 0.7260796419082999, + "grad_norm": 0.11920617520809174, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 190760 + }, + { + "epoch": 0.7261177043764225, + "grad_norm": 0.1289711445569992, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 190770 + }, + { + "epoch": 0.7261557668445453, + "grad_norm": 0.12466076016426086, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 190780 + }, + { + "epoch": 0.726193829312668, + "grad_norm": 0.11493317782878876, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 190790 + }, + { + "epoch": 0.7262318917807906, + "grad_norm": 0.1272706240415573, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 190800 + }, + { + "epoch": 0.7262699542489133, + "grad_norm": 0.13595803081989288, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 190810 + }, + { + "epoch": 0.726308016717036, + "grad_norm": 0.14387886226177216, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 190820 + }, + { + "epoch": 0.7263460791851587, + "grad_norm": 0.12329813838005066, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 190830 + }, + { + "epoch": 0.7263841416532814, + "grad_norm": 0.13892407715320587, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 190840 + }, + { + "epoch": 0.726422204121404, + "grad_norm": 0.1313343346118927, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 190850 + }, + { + "epoch": 0.7264602665895268, + "grad_norm": 0.12014450877904892, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 190860 + }, + { + "epoch": 0.7264983290576494, + "grad_norm": 0.11999203264713287, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 190870 + }, + { + "epoch": 0.7265363915257721, + "grad_norm": 0.13435359299182892, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 190880 + }, + { + "epoch": 0.7265744539938948, + "grad_norm": 0.13794434070587158, + "learning_rate": 0.0005, + "loss": 2.0845, + "step": 190890 + }, + { + "epoch": 0.7266125164620174, + "grad_norm": 0.12234117835760117, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 190900 + }, + { + "epoch": 0.7266505789301402, + "grad_norm": 0.13267767429351807, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 190910 + }, + { + "epoch": 0.7266886413982628, + "grad_norm": 0.11783602088689804, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 190920 + }, + { + "epoch": 0.7267267038663855, + "grad_norm": 0.13267308473587036, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 190930 + }, + { + "epoch": 0.7267647663345081, + "grad_norm": 0.12474412471055984, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 190940 + }, + { + "epoch": 0.7268028288026309, + "grad_norm": 0.13308046758174896, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 190950 + }, + { + "epoch": 0.7268408912707536, + "grad_norm": 0.12626679241657257, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 190960 + }, + { + "epoch": 0.7268789537388762, + "grad_norm": 0.12065853178501129, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 190970 + }, + { + "epoch": 0.7269170162069989, + "grad_norm": 0.11856792122125626, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 190980 + }, + { + "epoch": 0.7269550786751217, + "grad_norm": 0.14160200953483582, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 190990 + }, + { + "epoch": 0.7269931411432443, + "grad_norm": 0.13331879675388336, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 191000 + }, + { + "epoch": 0.727031203611367, + "grad_norm": 0.13861005008220673, + "learning_rate": 0.0005, + "loss": 2.0836, + "step": 191010 + }, + { + "epoch": 0.7270692660794896, + "grad_norm": 0.11692408472299576, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 191020 + }, + { + "epoch": 0.7271073285476123, + "grad_norm": 0.11825072765350342, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 191030 + }, + { + "epoch": 0.727145391015735, + "grad_norm": 0.12770481407642365, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 191040 + }, + { + "epoch": 0.7271834534838577, + "grad_norm": 0.1203337088227272, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 191050 + }, + { + "epoch": 0.7272215159519804, + "grad_norm": 0.12706756591796875, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 191060 + }, + { + "epoch": 0.727259578420103, + "grad_norm": 0.13932664692401886, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 191070 + }, + { + "epoch": 0.7272976408882258, + "grad_norm": 0.12633827328681946, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 191080 + }, + { + "epoch": 0.7273357033563485, + "grad_norm": 0.11820350587368011, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 191090 + }, + { + "epoch": 0.7273737658244711, + "grad_norm": 0.12790940701961517, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 191100 + }, + { + "epoch": 0.7274118282925938, + "grad_norm": 0.12736229598522186, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 191110 + }, + { + "epoch": 0.7274498907607165, + "grad_norm": 0.12914036214351654, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 191120 + }, + { + "epoch": 0.7274879532288392, + "grad_norm": 0.13645537197589874, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 191130 + }, + { + "epoch": 0.7275260156969618, + "grad_norm": 0.13665220141410828, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 191140 + }, + { + "epoch": 0.7275640781650845, + "grad_norm": 0.12808138132095337, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 191150 + }, + { + "epoch": 0.7276021406332073, + "grad_norm": 0.14374873042106628, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 191160 + }, + { + "epoch": 0.7276402031013299, + "grad_norm": 0.12461627274751663, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 191170 + }, + { + "epoch": 0.7276782655694526, + "grad_norm": 0.12864984571933746, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 191180 + }, + { + "epoch": 0.7277163280375752, + "grad_norm": 0.1112942323088646, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 191190 + }, + { + "epoch": 0.7277543905056979, + "grad_norm": 0.14084668457508087, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 191200 + }, + { + "epoch": 0.7277924529738207, + "grad_norm": 0.12258113920688629, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 191210 + }, + { + "epoch": 0.7278305154419433, + "grad_norm": 0.12442629039287567, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 191220 + }, + { + "epoch": 0.727868577910066, + "grad_norm": 0.12644663453102112, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 191230 + }, + { + "epoch": 0.7279066403781886, + "grad_norm": 0.11585810035467148, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 191240 + }, + { + "epoch": 0.7279447028463114, + "grad_norm": 0.14139024913311005, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 191250 + }, + { + "epoch": 0.7279827653144341, + "grad_norm": 0.1261819750070572, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 191260 + }, + { + "epoch": 0.7280208277825567, + "grad_norm": 0.12189679592847824, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 191270 + }, + { + "epoch": 0.7280588902506794, + "grad_norm": 0.12211482971906662, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 191280 + }, + { + "epoch": 0.7280969527188021, + "grad_norm": 0.12543800473213196, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 191290 + }, + { + "epoch": 0.7281350151869248, + "grad_norm": 0.13193655014038086, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 191300 + }, + { + "epoch": 0.7281730776550475, + "grad_norm": 0.1276022493839264, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 191310 + }, + { + "epoch": 0.7282111401231701, + "grad_norm": 0.1486976146697998, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 191320 + }, + { + "epoch": 0.7282492025912928, + "grad_norm": 0.1777261197566986, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 191330 + }, + { + "epoch": 0.7282872650594155, + "grad_norm": 0.1220930740237236, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 191340 + }, + { + "epoch": 0.7283253275275382, + "grad_norm": 0.12030477076768875, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 191350 + }, + { + "epoch": 0.7283633899956609, + "grad_norm": 0.14080104231834412, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 191360 + }, + { + "epoch": 0.7284014524637835, + "grad_norm": 0.12594959139823914, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 191370 + }, + { + "epoch": 0.7284395149319063, + "grad_norm": 0.13289976119995117, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 191380 + }, + { + "epoch": 0.7284775774000289, + "grad_norm": 0.11373937129974365, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 191390 + }, + { + "epoch": 0.7285156398681516, + "grad_norm": 0.13607096672058105, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 191400 + }, + { + "epoch": 0.7285537023362743, + "grad_norm": 0.11733315140008926, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 191410 + }, + { + "epoch": 0.728591764804397, + "grad_norm": 0.125960573554039, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 191420 + }, + { + "epoch": 0.7286298272725197, + "grad_norm": 0.13192327320575714, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 191430 + }, + { + "epoch": 0.7286678897406423, + "grad_norm": 0.1263810694217682, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 191440 + }, + { + "epoch": 0.728705952208765, + "grad_norm": 0.12355059385299683, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 191450 + }, + { + "epoch": 0.7287440146768878, + "grad_norm": 0.11213410645723343, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 191460 + }, + { + "epoch": 0.7287820771450104, + "grad_norm": 0.11975536495447159, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 191470 + }, + { + "epoch": 0.7288201396131331, + "grad_norm": 0.11797386407852173, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 191480 + }, + { + "epoch": 0.7288582020812557, + "grad_norm": 0.12237497419118881, + "learning_rate": 0.0005, + "loss": 2.1255, + "step": 191490 + }, + { + "epoch": 0.7288962645493784, + "grad_norm": 0.12381374090909958, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 191500 + }, + { + "epoch": 0.7289343270175012, + "grad_norm": 0.12289374321699142, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 191510 + }, + { + "epoch": 0.7289723894856238, + "grad_norm": 0.12725837528705597, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 191520 + }, + { + "epoch": 0.7290104519537465, + "grad_norm": 0.13120581209659576, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 191530 + }, + { + "epoch": 0.7290485144218691, + "grad_norm": 0.12344781309366226, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 191540 + }, + { + "epoch": 0.7290865768899919, + "grad_norm": 0.12666013836860657, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 191550 + }, + { + "epoch": 0.7291246393581146, + "grad_norm": 0.11229068040847778, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 191560 + }, + { + "epoch": 0.7291627018262372, + "grad_norm": 0.1354386806488037, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 191570 + }, + { + "epoch": 0.7292007642943599, + "grad_norm": 0.13093844056129456, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 191580 + }, + { + "epoch": 0.7292388267624826, + "grad_norm": 0.12772110104560852, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 191590 + }, + { + "epoch": 0.7292768892306053, + "grad_norm": 0.12142293900251389, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 191600 + }, + { + "epoch": 0.729314951698728, + "grad_norm": 0.1302458941936493, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 191610 + }, + { + "epoch": 0.7293530141668506, + "grad_norm": 0.12662212550640106, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 191620 + }, + { + "epoch": 0.7293910766349733, + "grad_norm": 0.1362551897764206, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 191630 + }, + { + "epoch": 0.729429139103096, + "grad_norm": 0.12042704224586487, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 191640 + }, + { + "epoch": 0.7294672015712187, + "grad_norm": 0.13758443295955658, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 191650 + }, + { + "epoch": 0.7295052640393414, + "grad_norm": 0.11594793200492859, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 191660 + }, + { + "epoch": 0.729543326507464, + "grad_norm": 0.11860723793506622, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 191670 + }, + { + "epoch": 0.7295813889755868, + "grad_norm": 0.12728294730186462, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 191680 + }, + { + "epoch": 0.7296194514437094, + "grad_norm": 0.12003385275602341, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 191690 + }, + { + "epoch": 0.7296575139118321, + "grad_norm": 0.12386397272348404, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 191700 + }, + { + "epoch": 0.7296955763799547, + "grad_norm": 0.13539811968803406, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 191710 + }, + { + "epoch": 0.7297336388480775, + "grad_norm": 0.12116830050945282, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 191720 + }, + { + "epoch": 0.7297717013162002, + "grad_norm": 0.12179238349199295, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 191730 + }, + { + "epoch": 0.7298097637843228, + "grad_norm": 0.1176956370472908, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 191740 + }, + { + "epoch": 0.7298478262524455, + "grad_norm": 0.12996090948581696, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 191750 + }, + { + "epoch": 0.7298858887205681, + "grad_norm": 0.1440000981092453, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 191760 + }, + { + "epoch": 0.7299239511886909, + "grad_norm": 0.11414307355880737, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 191770 + }, + { + "epoch": 0.7299620136568136, + "grad_norm": 0.13091455399990082, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 191780 + }, + { + "epoch": 0.7300000761249362, + "grad_norm": 0.1182379201054573, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 191790 + }, + { + "epoch": 0.7300381385930589, + "grad_norm": 0.13387130200862885, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 191800 + }, + { + "epoch": 0.7300762010611817, + "grad_norm": 0.16108554601669312, + "learning_rate": 0.0005, + "loss": 2.0841, + "step": 191810 + }, + { + "epoch": 0.7301142635293043, + "grad_norm": 0.11871182173490524, + "learning_rate": 0.0005, + "loss": 2.0839, + "step": 191820 + }, + { + "epoch": 0.730152325997427, + "grad_norm": 0.11011119186878204, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 191830 + }, + { + "epoch": 0.7301903884655496, + "grad_norm": 0.12743213772773743, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 191840 + }, + { + "epoch": 0.7302284509336724, + "grad_norm": 0.1217624694108963, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 191850 + }, + { + "epoch": 0.730266513401795, + "grad_norm": 0.1229822188615799, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 191860 + }, + { + "epoch": 0.7303045758699177, + "grad_norm": 0.11932218819856644, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 191870 + }, + { + "epoch": 0.7303426383380404, + "grad_norm": 0.13336646556854248, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 191880 + }, + { + "epoch": 0.7303807008061631, + "grad_norm": 0.13458584249019623, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 191890 + }, + { + "epoch": 0.7304187632742858, + "grad_norm": 0.1292772889137268, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 191900 + }, + { + "epoch": 0.7304568257424084, + "grad_norm": 0.12842907011508942, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 191910 + }, + { + "epoch": 0.7304948882105311, + "grad_norm": 0.11552907526493073, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 191920 + }, + { + "epoch": 0.7305329506786538, + "grad_norm": 0.12777958810329437, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 191930 + }, + { + "epoch": 0.7305710131467765, + "grad_norm": 0.1212586760520935, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 191940 + }, + { + "epoch": 0.7306090756148992, + "grad_norm": 0.13225427269935608, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 191950 + }, + { + "epoch": 0.7306471380830218, + "grad_norm": 0.12566696107387543, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 191960 + }, + { + "epoch": 0.7306852005511445, + "grad_norm": 0.12067980319261551, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 191970 + }, + { + "epoch": 0.7307232630192673, + "grad_norm": 0.11316678673028946, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 191980 + }, + { + "epoch": 0.7307613254873899, + "grad_norm": 0.12638939917087555, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 191990 + }, + { + "epoch": 0.7307993879555126, + "grad_norm": 0.12090083956718445, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 192000 + }, + { + "epoch": 0.7308374504236352, + "grad_norm": 0.1209336370229721, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 192010 + }, + { + "epoch": 0.730875512891758, + "grad_norm": 0.12320481240749359, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 192020 + }, + { + "epoch": 0.7309135753598807, + "grad_norm": 0.11362239718437195, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 192030 + }, + { + "epoch": 0.7309516378280033, + "grad_norm": 0.12169143557548523, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 192040 + }, + { + "epoch": 0.730989700296126, + "grad_norm": 0.12608258426189423, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 192050 + }, + { + "epoch": 0.7310277627642486, + "grad_norm": 0.1409205049276352, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 192060 + }, + { + "epoch": 0.7310658252323714, + "grad_norm": 0.11726506054401398, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 192070 + }, + { + "epoch": 0.7311038877004941, + "grad_norm": 0.12257499247789383, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 192080 + }, + { + "epoch": 0.7311419501686167, + "grad_norm": 0.1246478408575058, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 192090 + }, + { + "epoch": 0.7311800126367394, + "grad_norm": 0.11844100058078766, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 192100 + }, + { + "epoch": 0.7312180751048621, + "grad_norm": 0.12616798281669617, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 192110 + }, + { + "epoch": 0.7312561375729848, + "grad_norm": 0.11756548285484314, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 192120 + }, + { + "epoch": 0.7312942000411075, + "grad_norm": 0.15198977291584015, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 192130 + }, + { + "epoch": 0.7313322625092301, + "grad_norm": 0.1201084554195404, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 192140 + }, + { + "epoch": 0.7313703249773529, + "grad_norm": 0.13323737680912018, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 192150 + }, + { + "epoch": 0.7314083874454755, + "grad_norm": 0.12064183503389359, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 192160 + }, + { + "epoch": 0.7314464499135982, + "grad_norm": 0.12921644747257233, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 192170 + }, + { + "epoch": 0.7314845123817209, + "grad_norm": 0.11377564817667007, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 192180 + }, + { + "epoch": 0.7315225748498435, + "grad_norm": 0.11053812503814697, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 192190 + }, + { + "epoch": 0.7315606373179663, + "grad_norm": 0.1232326477766037, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 192200 + }, + { + "epoch": 0.7315986997860889, + "grad_norm": 0.122260183095932, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 192210 + }, + { + "epoch": 0.7316367622542116, + "grad_norm": 0.13045214116573334, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 192220 + }, + { + "epoch": 0.7316748247223342, + "grad_norm": 0.11762091517448425, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 192230 + }, + { + "epoch": 0.731712887190457, + "grad_norm": 0.11861058324575424, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 192240 + }, + { + "epoch": 0.7317509496585797, + "grad_norm": 0.12967616319656372, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 192250 + }, + { + "epoch": 0.7317890121267023, + "grad_norm": 0.11592860519886017, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 192260 + }, + { + "epoch": 0.731827074594825, + "grad_norm": 0.12144582718610764, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 192270 + }, + { + "epoch": 0.7318651370629478, + "grad_norm": 0.11628725379705429, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 192280 + }, + { + "epoch": 0.7319031995310704, + "grad_norm": 0.11305202543735504, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 192290 + }, + { + "epoch": 0.7319412619991931, + "grad_norm": 0.11752969026565552, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 192300 + }, + { + "epoch": 0.7319793244673157, + "grad_norm": 0.1308397501707077, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 192310 + }, + { + "epoch": 0.7320173869354385, + "grad_norm": 0.13620050251483917, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 192320 + }, + { + "epoch": 0.7320554494035612, + "grad_norm": 0.12767353653907776, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 192330 + }, + { + "epoch": 0.7320935118716838, + "grad_norm": 0.1389073133468628, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 192340 + }, + { + "epoch": 0.7321315743398065, + "grad_norm": 0.14815621078014374, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 192350 + }, + { + "epoch": 0.7321696368079291, + "grad_norm": 0.1339462399482727, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 192360 + }, + { + "epoch": 0.7322076992760519, + "grad_norm": 0.129747211933136, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 192370 + }, + { + "epoch": 0.7322457617441746, + "grad_norm": 0.13563303649425507, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 192380 + }, + { + "epoch": 0.7322838242122972, + "grad_norm": 0.12491580843925476, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 192390 + }, + { + "epoch": 0.7323218866804199, + "grad_norm": 0.12931863963603973, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 192400 + }, + { + "epoch": 0.7323599491485426, + "grad_norm": 0.12038969993591309, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 192410 + }, + { + "epoch": 0.7323980116166653, + "grad_norm": 0.12600909173488617, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 192420 + }, + { + "epoch": 0.732436074084788, + "grad_norm": 0.12822246551513672, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 192430 + }, + { + "epoch": 0.7324741365529106, + "grad_norm": 0.12231694906949997, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 192440 + }, + { + "epoch": 0.7325121990210334, + "grad_norm": 0.12413424998521805, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 192450 + }, + { + "epoch": 0.732550261489156, + "grad_norm": 0.12980525195598602, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 192460 + }, + { + "epoch": 0.7325883239572787, + "grad_norm": 0.11845885217189789, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 192470 + }, + { + "epoch": 0.7326263864254013, + "grad_norm": 0.13867278397083282, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 192480 + }, + { + "epoch": 0.732664448893524, + "grad_norm": 0.1332332342863083, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 192490 + }, + { + "epoch": 0.7327025113616468, + "grad_norm": 0.12007120251655579, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 192500 + }, + { + "epoch": 0.7327405738297694, + "grad_norm": 0.14139346778392792, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 192510 + }, + { + "epoch": 0.7327786362978921, + "grad_norm": 0.1176033467054367, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 192520 + }, + { + "epoch": 0.7328166987660147, + "grad_norm": 0.11958561837673187, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 192530 + }, + { + "epoch": 0.7328547612341375, + "grad_norm": 0.11961330473423004, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 192540 + }, + { + "epoch": 0.7328928237022602, + "grad_norm": 0.13144491612911224, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 192550 + }, + { + "epoch": 0.7329308861703828, + "grad_norm": 0.12117725610733032, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 192560 + }, + { + "epoch": 0.7329689486385055, + "grad_norm": 0.11846371740102768, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 192570 + }, + { + "epoch": 0.7330070111066282, + "grad_norm": 0.13184702396392822, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 192580 + }, + { + "epoch": 0.7330450735747509, + "grad_norm": 0.13365581631660461, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 192590 + }, + { + "epoch": 0.7330831360428736, + "grad_norm": 0.11207521706819534, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 192600 + }, + { + "epoch": 0.7331211985109962, + "grad_norm": 0.1342562884092331, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 192610 + }, + { + "epoch": 0.7331592609791189, + "grad_norm": 0.1263633370399475, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 192620 + }, + { + "epoch": 0.7331973234472416, + "grad_norm": 0.11910117417573929, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 192630 + }, + { + "epoch": 0.7332353859153643, + "grad_norm": 0.12975086271762848, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 192640 + }, + { + "epoch": 0.733273448383487, + "grad_norm": 0.14330074191093445, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 192650 + }, + { + "epoch": 0.7333115108516096, + "grad_norm": 0.1286696344614029, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 192660 + }, + { + "epoch": 0.7333495733197324, + "grad_norm": 0.11552855372428894, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 192670 + }, + { + "epoch": 0.733387635787855, + "grad_norm": 0.12719112634658813, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 192680 + }, + { + "epoch": 0.7334256982559777, + "grad_norm": 0.11824102699756622, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 192690 + }, + { + "epoch": 0.7334637607241004, + "grad_norm": 0.12147907167673111, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 192700 + }, + { + "epoch": 0.7335018231922231, + "grad_norm": 0.11929722875356674, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 192710 + }, + { + "epoch": 0.7335398856603458, + "grad_norm": 0.12859030067920685, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 192720 + }, + { + "epoch": 0.7335779481284684, + "grad_norm": 0.12841321527957916, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 192730 + }, + { + "epoch": 0.7336160105965911, + "grad_norm": 0.11988534033298492, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 192740 + }, + { + "epoch": 0.7336540730647139, + "grad_norm": 0.12679408490657806, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 192750 + }, + { + "epoch": 0.7336921355328365, + "grad_norm": 0.11900787800550461, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 192760 + }, + { + "epoch": 0.7337301980009592, + "grad_norm": 0.12465648353099823, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 192770 + }, + { + "epoch": 0.7337682604690818, + "grad_norm": 0.11600150167942047, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 192780 + }, + { + "epoch": 0.7338063229372045, + "grad_norm": 0.12035398185253143, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 192790 + }, + { + "epoch": 0.7338443854053273, + "grad_norm": 0.11685821413993835, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 192800 + }, + { + "epoch": 0.7338824478734499, + "grad_norm": 0.1247807964682579, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 192810 + }, + { + "epoch": 0.7339205103415726, + "grad_norm": 0.12759731709957123, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 192820 + }, + { + "epoch": 0.7339585728096952, + "grad_norm": 0.11301767081022263, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 192830 + }, + { + "epoch": 0.733996635277818, + "grad_norm": 0.11778232455253601, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 192840 + }, + { + "epoch": 0.7340346977459407, + "grad_norm": 0.12188120931386948, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 192850 + }, + { + "epoch": 0.7340727602140633, + "grad_norm": 0.13007090985774994, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 192860 + }, + { + "epoch": 0.734110822682186, + "grad_norm": 0.13528990745544434, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 192870 + }, + { + "epoch": 0.7341488851503087, + "grad_norm": 0.12280694395303726, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 192880 + }, + { + "epoch": 0.7341869476184314, + "grad_norm": 0.12887060642242432, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 192890 + }, + { + "epoch": 0.734225010086554, + "grad_norm": 0.11871284246444702, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 192900 + }, + { + "epoch": 0.7342630725546767, + "grad_norm": 0.111478291451931, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 192910 + }, + { + "epoch": 0.7343011350227994, + "grad_norm": 0.12480342388153076, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 192920 + }, + { + "epoch": 0.7343391974909221, + "grad_norm": 0.11666394770145416, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 192930 + }, + { + "epoch": 0.7343772599590448, + "grad_norm": 0.11283561587333679, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 192940 + }, + { + "epoch": 0.7344153224271674, + "grad_norm": 0.1266782432794571, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 192950 + }, + { + "epoch": 0.7344533848952901, + "grad_norm": 0.1207321286201477, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 192960 + }, + { + "epoch": 0.7344914473634129, + "grad_norm": 0.13289088010787964, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 192970 + }, + { + "epoch": 0.7345295098315355, + "grad_norm": 0.13234733045101166, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 192980 + }, + { + "epoch": 0.7345675722996582, + "grad_norm": 0.13487665355205536, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 192990 + }, + { + "epoch": 0.7346056347677808, + "grad_norm": 0.11529957503080368, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 193000 + }, + { + "epoch": 0.7346436972359036, + "grad_norm": 0.1310238540172577, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 193010 + }, + { + "epoch": 0.7346817597040263, + "grad_norm": 0.1475614756345749, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 193020 + }, + { + "epoch": 0.7347198221721489, + "grad_norm": 0.12483355402946472, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 193030 + }, + { + "epoch": 0.7347578846402716, + "grad_norm": 0.15248079597949982, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 193040 + }, + { + "epoch": 0.7347959471083942, + "grad_norm": 0.12657523155212402, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 193050 + }, + { + "epoch": 0.734834009576517, + "grad_norm": 0.13055965304374695, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 193060 + }, + { + "epoch": 0.7348720720446397, + "grad_norm": 0.12264831364154816, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 193070 + }, + { + "epoch": 0.7349101345127623, + "grad_norm": 0.12394838035106659, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 193080 + }, + { + "epoch": 0.734948196980885, + "grad_norm": 0.12351053953170776, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 193090 + }, + { + "epoch": 0.7349862594490078, + "grad_norm": 0.11562132835388184, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 193100 + }, + { + "epoch": 0.7350243219171304, + "grad_norm": 0.12288986146450043, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 193110 + }, + { + "epoch": 0.7350623843852531, + "grad_norm": 0.12032705545425415, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 193120 + }, + { + "epoch": 0.7351004468533757, + "grad_norm": 0.12123995274305344, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 193130 + }, + { + "epoch": 0.7351385093214985, + "grad_norm": 0.11481358855962753, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 193140 + }, + { + "epoch": 0.7351765717896211, + "grad_norm": 0.128712460398674, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 193150 + }, + { + "epoch": 0.7352146342577438, + "grad_norm": 0.12128186970949173, + "learning_rate": 0.0005, + "loss": 2.0784, + "step": 193160 + }, + { + "epoch": 0.7352526967258665, + "grad_norm": 0.12377264350652695, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 193170 + }, + { + "epoch": 0.7352907591939892, + "grad_norm": 0.14269936084747314, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 193180 + }, + { + "epoch": 0.7353288216621119, + "grad_norm": 0.12458387017250061, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 193190 + }, + { + "epoch": 0.7353668841302345, + "grad_norm": 0.11986847221851349, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 193200 + }, + { + "epoch": 0.7354049465983572, + "grad_norm": 0.12104091048240662, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 193210 + }, + { + "epoch": 0.7354430090664799, + "grad_norm": 0.13339339196681976, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 193220 + }, + { + "epoch": 0.7354810715346026, + "grad_norm": 0.1323716938495636, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 193230 + }, + { + "epoch": 0.7355191340027253, + "grad_norm": 0.11504301428794861, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 193240 + }, + { + "epoch": 0.7355571964708479, + "grad_norm": 0.11710140109062195, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 193250 + }, + { + "epoch": 0.7355952589389706, + "grad_norm": 0.12178219109773636, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 193260 + }, + { + "epoch": 0.7356333214070934, + "grad_norm": 0.11730635166168213, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 193270 + }, + { + "epoch": 0.735671383875216, + "grad_norm": 0.12166262418031693, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 193280 + }, + { + "epoch": 0.7357094463433387, + "grad_norm": 0.13376054167747498, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 193290 + }, + { + "epoch": 0.7357475088114613, + "grad_norm": 0.11151111125946045, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 193300 + }, + { + "epoch": 0.7357855712795841, + "grad_norm": 0.11726437509059906, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 193310 + }, + { + "epoch": 0.7358236337477068, + "grad_norm": 0.12291442602872849, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 193320 + }, + { + "epoch": 0.7358616962158294, + "grad_norm": 0.12281958758831024, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 193330 + }, + { + "epoch": 0.7358997586839521, + "grad_norm": 0.13164712488651276, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 193340 + }, + { + "epoch": 0.7359378211520747, + "grad_norm": 0.12434583902359009, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 193350 + }, + { + "epoch": 0.7359758836201975, + "grad_norm": 0.12332146614789963, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 193360 + }, + { + "epoch": 0.7360139460883202, + "grad_norm": 0.12583640217781067, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 193370 + }, + { + "epoch": 0.7360520085564428, + "grad_norm": 0.1278449147939682, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 193380 + }, + { + "epoch": 0.7360900710245655, + "grad_norm": 0.12696930766105652, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 193390 + }, + { + "epoch": 0.7361281334926882, + "grad_norm": 0.11661224067211151, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 193400 + }, + { + "epoch": 0.7361661959608109, + "grad_norm": 0.13193956017494202, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 193410 + }, + { + "epoch": 0.7362042584289336, + "grad_norm": 0.1422918438911438, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 193420 + }, + { + "epoch": 0.7362423208970562, + "grad_norm": 0.12093400210142136, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 193430 + }, + { + "epoch": 0.736280383365179, + "grad_norm": 0.11584337800741196, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 193440 + }, + { + "epoch": 0.7363184458333016, + "grad_norm": 0.1224224641919136, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 193450 + }, + { + "epoch": 0.7363565083014243, + "grad_norm": 0.1242123395204544, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 193460 + }, + { + "epoch": 0.736394570769547, + "grad_norm": 0.13345323503017426, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 193470 + }, + { + "epoch": 0.7364326332376696, + "grad_norm": 0.1371457725763321, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 193480 + }, + { + "epoch": 0.7364706957057924, + "grad_norm": 0.12954550981521606, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 193490 + }, + { + "epoch": 0.736508758173915, + "grad_norm": 0.12712004780769348, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 193500 + }, + { + "epoch": 0.7365468206420377, + "grad_norm": 0.12469438463449478, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 193510 + }, + { + "epoch": 0.7365848831101603, + "grad_norm": 0.12666510045528412, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 193520 + }, + { + "epoch": 0.7366229455782831, + "grad_norm": 0.12624983489513397, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 193530 + }, + { + "epoch": 0.7366610080464058, + "grad_norm": 0.12753058969974518, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 193540 + }, + { + "epoch": 0.7366990705145284, + "grad_norm": 0.1122048869729042, + "learning_rate": 0.0005, + "loss": 2.0855, + "step": 193550 + }, + { + "epoch": 0.7367371329826511, + "grad_norm": 0.12347835302352905, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 193560 + }, + { + "epoch": 0.7367751954507739, + "grad_norm": 0.12568634748458862, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 193570 + }, + { + "epoch": 0.7368132579188965, + "grad_norm": 0.11385814845561981, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 193580 + }, + { + "epoch": 0.7368513203870192, + "grad_norm": 0.13868993520736694, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 193590 + }, + { + "epoch": 0.7368893828551418, + "grad_norm": 0.1321054995059967, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 193600 + }, + { + "epoch": 0.7369274453232646, + "grad_norm": 0.12079103291034698, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 193610 + }, + { + "epoch": 0.7369655077913873, + "grad_norm": 0.11624564230442047, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 193620 + }, + { + "epoch": 0.7370035702595099, + "grad_norm": 0.13603192567825317, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 193630 + }, + { + "epoch": 0.7370416327276326, + "grad_norm": 0.1264621466398239, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 193640 + }, + { + "epoch": 0.7370796951957552, + "grad_norm": 0.11997373402118683, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 193650 + }, + { + "epoch": 0.737117757663878, + "grad_norm": 0.1271255761384964, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 193660 + }, + { + "epoch": 0.7371558201320006, + "grad_norm": 0.13099275529384613, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 193670 + }, + { + "epoch": 0.7371938826001233, + "grad_norm": 0.11465470492839813, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 193680 + }, + { + "epoch": 0.737231945068246, + "grad_norm": 0.13196159899234772, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 193690 + }, + { + "epoch": 0.7372700075363687, + "grad_norm": 0.11341453343629837, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 193700 + }, + { + "epoch": 0.7373080700044914, + "grad_norm": 0.12351559102535248, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 193710 + }, + { + "epoch": 0.737346132472614, + "grad_norm": 0.12498927861452103, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 193720 + }, + { + "epoch": 0.7373841949407367, + "grad_norm": 0.11956217139959335, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 193730 + }, + { + "epoch": 0.7374222574088595, + "grad_norm": 0.1368047147989273, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 193740 + }, + { + "epoch": 0.7374603198769821, + "grad_norm": 0.12249314039945602, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 193750 + }, + { + "epoch": 0.7374983823451048, + "grad_norm": 0.1328524500131607, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 193760 + }, + { + "epoch": 0.7375364448132274, + "grad_norm": 0.23825938999652863, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 193770 + }, + { + "epoch": 0.7375745072813501, + "grad_norm": 0.11991281062364578, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 193780 + }, + { + "epoch": 0.7376125697494729, + "grad_norm": 0.12263292819261551, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 193790 + }, + { + "epoch": 0.7376506322175955, + "grad_norm": 0.10938142985105515, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 193800 + }, + { + "epoch": 0.7376886946857182, + "grad_norm": 0.12292012572288513, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 193810 + }, + { + "epoch": 0.7377267571538408, + "grad_norm": 0.13027265667915344, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 193820 + }, + { + "epoch": 0.7377648196219636, + "grad_norm": 0.13232482969760895, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 193830 + }, + { + "epoch": 0.7378028820900863, + "grad_norm": 0.12443753331899643, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 193840 + }, + { + "epoch": 0.7378409445582089, + "grad_norm": 0.11491934210062027, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 193850 + }, + { + "epoch": 0.7378790070263316, + "grad_norm": 0.11515094339847565, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 193860 + }, + { + "epoch": 0.7379170694944543, + "grad_norm": 0.11938028037548065, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 193870 + }, + { + "epoch": 0.737955131962577, + "grad_norm": 0.13776546716690063, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 193880 + }, + { + "epoch": 0.7379931944306997, + "grad_norm": 0.12095969170331955, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 193890 + }, + { + "epoch": 0.7380312568988223, + "grad_norm": 0.1486758291721344, + "learning_rate": 0.0005, + "loss": 2.1257, + "step": 193900 + }, + { + "epoch": 0.738069319366945, + "grad_norm": 0.12475045770406723, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 193910 + }, + { + "epoch": 0.7381073818350677, + "grad_norm": 0.12161503732204437, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 193920 + }, + { + "epoch": 0.7381454443031904, + "grad_norm": 0.13839933276176453, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 193930 + }, + { + "epoch": 0.7381835067713131, + "grad_norm": 0.13922223448753357, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 193940 + }, + { + "epoch": 0.7382215692394357, + "grad_norm": 0.12621387839317322, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 193950 + }, + { + "epoch": 0.7382596317075585, + "grad_norm": 0.12859176099300385, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 193960 + }, + { + "epoch": 0.7382976941756811, + "grad_norm": 0.12133989483118057, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 193970 + }, + { + "epoch": 0.7383357566438038, + "grad_norm": 0.11736427992582321, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 193980 + }, + { + "epoch": 0.7383738191119265, + "grad_norm": 0.12446961551904678, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 193990 + }, + { + "epoch": 0.7384118815800492, + "grad_norm": 0.1348889321088791, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 194000 + }, + { + "epoch": 0.7384499440481719, + "grad_norm": 0.12804125249385834, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 194010 + }, + { + "epoch": 0.7384880065162945, + "grad_norm": 0.11841566115617752, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 194020 + }, + { + "epoch": 0.7385260689844172, + "grad_norm": 0.12618225812911987, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 194030 + }, + { + "epoch": 0.73856413145254, + "grad_norm": 0.14122790098190308, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 194040 + }, + { + "epoch": 0.7386021939206626, + "grad_norm": 0.12055061757564545, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 194050 + }, + { + "epoch": 0.7386402563887853, + "grad_norm": 0.11497969925403595, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 194060 + }, + { + "epoch": 0.7386783188569079, + "grad_norm": 0.11820774525403976, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 194070 + }, + { + "epoch": 0.7387163813250306, + "grad_norm": 0.14072273671627045, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 194080 + }, + { + "epoch": 0.7387544437931534, + "grad_norm": 0.12891046702861786, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 194090 + }, + { + "epoch": 0.738792506261276, + "grad_norm": 0.1222953349351883, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 194100 + }, + { + "epoch": 0.7388305687293987, + "grad_norm": 0.12340711802244186, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 194110 + }, + { + "epoch": 0.7388686311975213, + "grad_norm": 0.13871681690216064, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 194120 + }, + { + "epoch": 0.7389066936656441, + "grad_norm": 0.18659791350364685, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 194130 + }, + { + "epoch": 0.7389447561337668, + "grad_norm": 0.12714652717113495, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 194140 + }, + { + "epoch": 0.7389828186018894, + "grad_norm": 0.11733846366405487, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 194150 + }, + { + "epoch": 0.7390208810700121, + "grad_norm": 0.11640311032533646, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 194160 + }, + { + "epoch": 0.7390589435381348, + "grad_norm": 0.12715081870555878, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 194170 + }, + { + "epoch": 0.7390970060062575, + "grad_norm": 0.12195795029401779, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 194180 + }, + { + "epoch": 0.7391350684743802, + "grad_norm": 0.13097110390663147, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 194190 + }, + { + "epoch": 0.7391731309425028, + "grad_norm": 0.13842256367206573, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 194200 + }, + { + "epoch": 0.7392111934106255, + "grad_norm": 0.12363022565841675, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 194210 + }, + { + "epoch": 0.7392492558787482, + "grad_norm": 0.12613829970359802, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 194220 + }, + { + "epoch": 0.7392873183468709, + "grad_norm": 0.12938351929187775, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 194230 + }, + { + "epoch": 0.7393253808149935, + "grad_norm": 0.13193729519844055, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 194240 + }, + { + "epoch": 0.7393634432831162, + "grad_norm": 0.1291109174489975, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 194250 + }, + { + "epoch": 0.739401505751239, + "grad_norm": 0.13131606578826904, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 194260 + }, + { + "epoch": 0.7394395682193616, + "grad_norm": 0.12098632752895355, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 194270 + }, + { + "epoch": 0.7394776306874843, + "grad_norm": 0.121994249522686, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 194280 + }, + { + "epoch": 0.739515693155607, + "grad_norm": 0.12419252097606659, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 194290 + }, + { + "epoch": 0.7395537556237297, + "grad_norm": 0.1346055269241333, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 194300 + }, + { + "epoch": 0.7395918180918524, + "grad_norm": 0.128359854221344, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 194310 + }, + { + "epoch": 0.739629880559975, + "grad_norm": 0.14567787945270538, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 194320 + }, + { + "epoch": 0.7396679430280977, + "grad_norm": 0.12489699572324753, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 194330 + }, + { + "epoch": 0.7397060054962203, + "grad_norm": 0.1287219524383545, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 194340 + }, + { + "epoch": 0.7397440679643431, + "grad_norm": 0.13634465634822845, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 194350 + }, + { + "epoch": 0.7397821304324658, + "grad_norm": 0.11655566096305847, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 194360 + }, + { + "epoch": 0.7398201929005884, + "grad_norm": 0.11334828287363052, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 194370 + }, + { + "epoch": 0.7398582553687111, + "grad_norm": 0.12410476803779602, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 194380 + }, + { + "epoch": 0.7398963178368339, + "grad_norm": 0.12430357187986374, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 194390 + }, + { + "epoch": 0.7399343803049565, + "grad_norm": 0.1274576187133789, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 194400 + }, + { + "epoch": 0.7399724427730792, + "grad_norm": 0.12407605350017548, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 194410 + }, + { + "epoch": 0.7400105052412018, + "grad_norm": 0.13608965277671814, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 194420 + }, + { + "epoch": 0.7400485677093246, + "grad_norm": 0.1341804713010788, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 194430 + }, + { + "epoch": 0.7400866301774472, + "grad_norm": 0.1318841278553009, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 194440 + }, + { + "epoch": 0.7401246926455699, + "grad_norm": 0.11794579029083252, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 194450 + }, + { + "epoch": 0.7401627551136926, + "grad_norm": 0.1288701295852661, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 194460 + }, + { + "epoch": 0.7402008175818153, + "grad_norm": 0.11914876848459244, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 194470 + }, + { + "epoch": 0.740238880049938, + "grad_norm": 0.13467030227184296, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 194480 + }, + { + "epoch": 0.7402769425180606, + "grad_norm": 0.14675983786582947, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 194490 + }, + { + "epoch": 0.7403150049861833, + "grad_norm": 0.12259969860315323, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 194500 + }, + { + "epoch": 0.740353067454306, + "grad_norm": 0.12208813428878784, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 194510 + }, + { + "epoch": 0.7403911299224287, + "grad_norm": 0.12095604836940765, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 194520 + }, + { + "epoch": 0.7404291923905514, + "grad_norm": 0.12412271648645401, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 194530 + }, + { + "epoch": 0.740467254858674, + "grad_norm": 0.11599559336900711, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 194540 + }, + { + "epoch": 0.7405053173267967, + "grad_norm": 0.1328132301568985, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 194550 + }, + { + "epoch": 0.7405433797949195, + "grad_norm": 0.12443557381629944, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 194560 + }, + { + "epoch": 0.7405814422630421, + "grad_norm": 0.11573288589715958, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 194570 + }, + { + "epoch": 0.7406195047311648, + "grad_norm": 0.1223871111869812, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 194580 + }, + { + "epoch": 0.7406575671992874, + "grad_norm": 0.13990621268749237, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 194590 + }, + { + "epoch": 0.7406956296674102, + "grad_norm": 0.1179359182715416, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 194600 + }, + { + "epoch": 0.7407336921355329, + "grad_norm": 0.11926417052745819, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 194610 + }, + { + "epoch": 0.7407717546036555, + "grad_norm": 0.12110884487628937, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 194620 + }, + { + "epoch": 0.7408098170717782, + "grad_norm": 0.12658652663230896, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 194630 + }, + { + "epoch": 0.7408478795399008, + "grad_norm": 0.125624418258667, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 194640 + }, + { + "epoch": 0.7408859420080236, + "grad_norm": 0.1293461173772812, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 194650 + }, + { + "epoch": 0.7409240044761463, + "grad_norm": 0.1234101727604866, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 194660 + }, + { + "epoch": 0.7409620669442689, + "grad_norm": 0.1319395899772644, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 194670 + }, + { + "epoch": 0.7410001294123916, + "grad_norm": 0.12228941917419434, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 194680 + }, + { + "epoch": 0.7410381918805143, + "grad_norm": 0.13420431315898895, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 194690 + }, + { + "epoch": 0.741076254348637, + "grad_norm": 0.12183579057455063, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 194700 + }, + { + "epoch": 0.7411143168167597, + "grad_norm": 0.13289429247379303, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 194710 + }, + { + "epoch": 0.7411523792848823, + "grad_norm": 0.1252071112394333, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 194720 + }, + { + "epoch": 0.7411904417530051, + "grad_norm": 0.13454918563365936, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 194730 + }, + { + "epoch": 0.7412285042211277, + "grad_norm": 0.11970204859972, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 194740 + }, + { + "epoch": 0.7412665666892504, + "grad_norm": 0.1275210827589035, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 194750 + }, + { + "epoch": 0.741304629157373, + "grad_norm": 0.1155448704957962, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 194760 + }, + { + "epoch": 0.7413426916254957, + "grad_norm": 0.7521626949310303, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 194770 + }, + { + "epoch": 0.7413807540936185, + "grad_norm": 0.1261117309331894, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 194780 + }, + { + "epoch": 0.7414188165617411, + "grad_norm": 0.11743960529565811, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 194790 + }, + { + "epoch": 0.7414568790298638, + "grad_norm": 0.11156170070171356, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 194800 + }, + { + "epoch": 0.7414949414979864, + "grad_norm": 0.12148051708936691, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 194810 + }, + { + "epoch": 0.7415330039661092, + "grad_norm": 0.12265943735837936, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 194820 + }, + { + "epoch": 0.7415710664342319, + "grad_norm": 0.12968555092811584, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 194830 + }, + { + "epoch": 0.7416091289023545, + "grad_norm": 0.1303674876689911, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 194840 + }, + { + "epoch": 0.7416471913704772, + "grad_norm": 0.12468607723712921, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 194850 + }, + { + "epoch": 0.7416852538386, + "grad_norm": 0.1342761218547821, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 194860 + }, + { + "epoch": 0.7417233163067226, + "grad_norm": 0.12461252510547638, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 194870 + }, + { + "epoch": 0.7417613787748453, + "grad_norm": 0.13232353329658508, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 194880 + }, + { + "epoch": 0.7417994412429679, + "grad_norm": 0.10807273536920547, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 194890 + }, + { + "epoch": 0.7418375037110907, + "grad_norm": 0.12074726819992065, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 194900 + }, + { + "epoch": 0.7418755661792134, + "grad_norm": 0.13266825675964355, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 194910 + }, + { + "epoch": 0.741913628647336, + "grad_norm": 0.17095763981342316, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 194920 + }, + { + "epoch": 0.7419516911154587, + "grad_norm": 0.1295676976442337, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 194930 + }, + { + "epoch": 0.7419897535835813, + "grad_norm": 0.1498701125383377, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 194940 + }, + { + "epoch": 0.7420278160517041, + "grad_norm": 0.11569618433713913, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 194950 + }, + { + "epoch": 0.7420658785198267, + "grad_norm": 0.1568787395954132, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 194960 + }, + { + "epoch": 0.7421039409879494, + "grad_norm": 0.11845368146896362, + "learning_rate": 0.0005, + "loss": 2.0849, + "step": 194970 + }, + { + "epoch": 0.7421420034560721, + "grad_norm": 0.1119331493973732, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 194980 + }, + { + "epoch": 0.7421800659241948, + "grad_norm": 0.12699395418167114, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 194990 + }, + { + "epoch": 0.7422181283923175, + "grad_norm": 0.112799733877182, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 195000 + }, + { + "epoch": 0.7422561908604401, + "grad_norm": 0.12144352495670319, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 195010 + }, + { + "epoch": 0.7422942533285628, + "grad_norm": 0.131864532828331, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 195020 + }, + { + "epoch": 0.7423323157966856, + "grad_norm": 0.12389921396970749, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 195030 + }, + { + "epoch": 0.7423703782648082, + "grad_norm": 0.11010252684354782, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 195040 + }, + { + "epoch": 0.7424084407329309, + "grad_norm": 0.11864927411079407, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 195050 + }, + { + "epoch": 0.7424465032010535, + "grad_norm": 0.11730501055717468, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 195060 + }, + { + "epoch": 0.7424845656691762, + "grad_norm": 0.11623819172382355, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 195070 + }, + { + "epoch": 0.742522628137299, + "grad_norm": 0.12447119504213333, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 195080 + }, + { + "epoch": 0.7425606906054216, + "grad_norm": 0.11485525965690613, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 195090 + }, + { + "epoch": 0.7425987530735443, + "grad_norm": 0.12632080912590027, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 195100 + }, + { + "epoch": 0.7426368155416669, + "grad_norm": 0.12295328080654144, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 195110 + }, + { + "epoch": 0.7426748780097897, + "grad_norm": 0.14133335649967194, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 195120 + }, + { + "epoch": 0.7427129404779124, + "grad_norm": 0.1315918266773224, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 195130 + }, + { + "epoch": 0.742751002946035, + "grad_norm": 0.11096256226301193, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 195140 + }, + { + "epoch": 0.7427890654141577, + "grad_norm": 0.1371743530035019, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 195150 + }, + { + "epoch": 0.7428271278822804, + "grad_norm": 0.13684628903865814, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 195160 + }, + { + "epoch": 0.7428651903504031, + "grad_norm": 0.11548829823732376, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 195170 + }, + { + "epoch": 0.7429032528185258, + "grad_norm": 0.12377560883760452, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 195180 + }, + { + "epoch": 0.7429413152866484, + "grad_norm": 0.12421797215938568, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 195190 + }, + { + "epoch": 0.7429793777547711, + "grad_norm": 0.11980584263801575, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 195200 + }, + { + "epoch": 0.7430174402228938, + "grad_norm": 0.1434640884399414, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 195210 + }, + { + "epoch": 0.7430555026910165, + "grad_norm": 0.13627471029758453, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 195220 + }, + { + "epoch": 0.7430935651591392, + "grad_norm": 0.12290006130933762, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 195230 + }, + { + "epoch": 0.7431316276272618, + "grad_norm": 0.12723302841186523, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 195240 + }, + { + "epoch": 0.7431696900953846, + "grad_norm": 0.1370949000120163, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 195250 + }, + { + "epoch": 0.7432077525635072, + "grad_norm": 0.1215818002820015, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 195260 + }, + { + "epoch": 0.7432458150316299, + "grad_norm": 0.1530386060476303, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 195270 + }, + { + "epoch": 0.7432838774997526, + "grad_norm": 0.13338083028793335, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 195280 + }, + { + "epoch": 0.7433219399678753, + "grad_norm": 0.12526699900627136, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 195290 + }, + { + "epoch": 0.743360002435998, + "grad_norm": 0.11876381933689117, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 195300 + }, + { + "epoch": 0.7433980649041206, + "grad_norm": 0.1208324059844017, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 195310 + }, + { + "epoch": 0.7434361273722433, + "grad_norm": 0.1263258308172226, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 195320 + }, + { + "epoch": 0.7434741898403661, + "grad_norm": 0.12312033027410507, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 195330 + }, + { + "epoch": 0.7435122523084887, + "grad_norm": 0.11395483464002609, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 195340 + }, + { + "epoch": 0.7435503147766114, + "grad_norm": 0.13589192926883698, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 195350 + }, + { + "epoch": 0.743588377244734, + "grad_norm": 0.1306493729352951, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 195360 + }, + { + "epoch": 0.7436264397128567, + "grad_norm": 0.1288546621799469, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 195370 + }, + { + "epoch": 0.7436645021809795, + "grad_norm": 0.13180427253246307, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 195380 + }, + { + "epoch": 0.7437025646491021, + "grad_norm": 0.11989396065473557, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 195390 + }, + { + "epoch": 0.7437406271172248, + "grad_norm": 0.11841694265604019, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 195400 + }, + { + "epoch": 0.7437786895853474, + "grad_norm": 0.12703417241573334, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 195410 + }, + { + "epoch": 0.7438167520534702, + "grad_norm": 0.12518304586410522, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 195420 + }, + { + "epoch": 0.7438548145215929, + "grad_norm": 0.13396604359149933, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 195430 + }, + { + "epoch": 0.7438928769897155, + "grad_norm": 0.12902133166790009, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 195440 + }, + { + "epoch": 0.7439309394578382, + "grad_norm": 0.13435329496860504, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 195450 + }, + { + "epoch": 0.7439690019259609, + "grad_norm": 0.12080912292003632, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 195460 + }, + { + "epoch": 0.7440070643940836, + "grad_norm": 0.11378413438796997, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 195470 + }, + { + "epoch": 0.7440451268622063, + "grad_norm": 0.12693087756633759, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 195480 + }, + { + "epoch": 0.7440831893303289, + "grad_norm": 0.12436477839946747, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 195490 + }, + { + "epoch": 0.7441212517984516, + "grad_norm": 0.12074817717075348, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 195500 + }, + { + "epoch": 0.7441593142665743, + "grad_norm": 0.13063935935497284, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 195510 + }, + { + "epoch": 0.744197376734697, + "grad_norm": 0.12233955413103104, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 195520 + }, + { + "epoch": 0.7442354392028196, + "grad_norm": 0.12078586220741272, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 195530 + }, + { + "epoch": 0.7442735016709423, + "grad_norm": 0.12886860966682434, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 195540 + }, + { + "epoch": 0.7443115641390651, + "grad_norm": 0.13154838979244232, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 195550 + }, + { + "epoch": 0.7443496266071877, + "grad_norm": 0.12000450491905212, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 195560 + }, + { + "epoch": 0.7443876890753104, + "grad_norm": 0.12249580025672913, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 195570 + }, + { + "epoch": 0.744425751543433, + "grad_norm": 0.12777367234230042, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 195580 + }, + { + "epoch": 0.7444638140115558, + "grad_norm": 0.12537863850593567, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 195590 + }, + { + "epoch": 0.7445018764796785, + "grad_norm": 0.11748939752578735, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 195600 + }, + { + "epoch": 0.7445399389478011, + "grad_norm": 0.11325472593307495, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 195610 + }, + { + "epoch": 0.7445780014159238, + "grad_norm": 0.12836772203445435, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 195620 + }, + { + "epoch": 0.7446160638840464, + "grad_norm": 0.12222602963447571, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 195630 + }, + { + "epoch": 0.7446541263521692, + "grad_norm": 0.12648913264274597, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 195640 + }, + { + "epoch": 0.7446921888202919, + "grad_norm": 0.1333761066198349, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 195650 + }, + { + "epoch": 0.7447302512884145, + "grad_norm": 0.13610762357711792, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 195660 + }, + { + "epoch": 0.7447683137565372, + "grad_norm": 0.11557116359472275, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 195670 + }, + { + "epoch": 0.74480637622466, + "grad_norm": 0.136189267039299, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 195680 + }, + { + "epoch": 0.7448444386927826, + "grad_norm": 0.14569132030010223, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 195690 + }, + { + "epoch": 0.7448825011609053, + "grad_norm": 0.12963727116584778, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 195700 + }, + { + "epoch": 0.7449205636290279, + "grad_norm": 0.12556087970733643, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 195710 + }, + { + "epoch": 0.7449586260971507, + "grad_norm": 0.11867135018110275, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 195720 + }, + { + "epoch": 0.7449966885652733, + "grad_norm": 0.12275257706642151, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 195730 + }, + { + "epoch": 0.745034751033396, + "grad_norm": 0.11988690495491028, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 195740 + }, + { + "epoch": 0.7450728135015187, + "grad_norm": 0.13531182706356049, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 195750 + }, + { + "epoch": 0.7451108759696414, + "grad_norm": 0.1375477910041809, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 195760 + }, + { + "epoch": 0.7451489384377641, + "grad_norm": 0.13907170295715332, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 195770 + }, + { + "epoch": 0.7451870009058867, + "grad_norm": 0.1284158080816269, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 195780 + }, + { + "epoch": 0.7452250633740094, + "grad_norm": 0.11813719570636749, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 195790 + }, + { + "epoch": 0.7452631258421321, + "grad_norm": 0.13614222407341003, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 195800 + }, + { + "epoch": 0.7453011883102548, + "grad_norm": 0.1306437849998474, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 195810 + }, + { + "epoch": 0.7453392507783775, + "grad_norm": 0.13610365986824036, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 195820 + }, + { + "epoch": 0.7453773132465001, + "grad_norm": 0.12205871939659119, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 195830 + }, + { + "epoch": 0.7454153757146228, + "grad_norm": 0.1164771318435669, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 195840 + }, + { + "epoch": 0.7454534381827456, + "grad_norm": 0.12755510210990906, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 195850 + }, + { + "epoch": 0.7454915006508682, + "grad_norm": 0.11595284938812256, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 195860 + }, + { + "epoch": 0.7455295631189909, + "grad_norm": 0.1476670503616333, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 195870 + }, + { + "epoch": 0.7455676255871135, + "grad_norm": 0.12011052668094635, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 195880 + }, + { + "epoch": 0.7456056880552363, + "grad_norm": 0.13740184903144836, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 195890 + }, + { + "epoch": 0.745643750523359, + "grad_norm": 0.11663123965263367, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 195900 + }, + { + "epoch": 0.7456818129914816, + "grad_norm": 0.12527674436569214, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 195910 + }, + { + "epoch": 0.7457198754596043, + "grad_norm": 0.14667300879955292, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 195920 + }, + { + "epoch": 0.7457579379277269, + "grad_norm": 0.14290866255760193, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 195930 + }, + { + "epoch": 0.7457960003958497, + "grad_norm": 0.12882038950920105, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 195940 + }, + { + "epoch": 0.7458340628639724, + "grad_norm": 0.12286058813333511, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 195950 + }, + { + "epoch": 0.745872125332095, + "grad_norm": 0.12759080529212952, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 195960 + }, + { + "epoch": 0.7459101878002177, + "grad_norm": 0.12761107087135315, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 195970 + }, + { + "epoch": 0.7459482502683404, + "grad_norm": 0.11339493840932846, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 195980 + }, + { + "epoch": 0.7459863127364631, + "grad_norm": 0.1386970579624176, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 195990 + }, + { + "epoch": 0.7460243752045858, + "grad_norm": 0.12397254258394241, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 196000 + }, + { + "epoch": 0.7460624376727084, + "grad_norm": 0.12716779112815857, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 196010 + }, + { + "epoch": 0.7461005001408312, + "grad_norm": 0.13425159454345703, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 196020 + }, + { + "epoch": 0.7461385626089538, + "grad_norm": 0.1143328994512558, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 196030 + }, + { + "epoch": 0.7461766250770765, + "grad_norm": 0.13692298531532288, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 196040 + }, + { + "epoch": 0.7462146875451992, + "grad_norm": 0.12230276316404343, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 196050 + }, + { + "epoch": 0.7462527500133218, + "grad_norm": 0.1241021677851677, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 196060 + }, + { + "epoch": 0.7462908124814446, + "grad_norm": 0.11289073526859283, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 196070 + }, + { + "epoch": 0.7463288749495672, + "grad_norm": 0.12893123924732208, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 196080 + }, + { + "epoch": 0.7463669374176899, + "grad_norm": 0.11894325166940689, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 196090 + }, + { + "epoch": 0.7464049998858125, + "grad_norm": 0.11847478151321411, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 196100 + }, + { + "epoch": 0.7464430623539353, + "grad_norm": 0.15468305349349976, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 196110 + }, + { + "epoch": 0.746481124822058, + "grad_norm": 0.11917208880186081, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 196120 + }, + { + "epoch": 0.7465191872901806, + "grad_norm": 0.13077053427696228, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 196130 + }, + { + "epoch": 0.7465572497583033, + "grad_norm": 0.13156230747699738, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 196140 + }, + { + "epoch": 0.7465953122264261, + "grad_norm": 0.1107783317565918, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 196150 + }, + { + "epoch": 0.7466333746945487, + "grad_norm": 0.25660818815231323, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 196160 + }, + { + "epoch": 0.7466714371626714, + "grad_norm": 0.11806541681289673, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 196170 + }, + { + "epoch": 0.746709499630794, + "grad_norm": 0.11881528794765472, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 196180 + }, + { + "epoch": 0.7467475620989168, + "grad_norm": 0.11958711594343185, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 196190 + }, + { + "epoch": 0.7467856245670395, + "grad_norm": 0.12247668951749802, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 196200 + }, + { + "epoch": 0.7468236870351621, + "grad_norm": 0.14594995975494385, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 196210 + }, + { + "epoch": 0.7468617495032848, + "grad_norm": 0.14043888449668884, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 196220 + }, + { + "epoch": 0.7468998119714074, + "grad_norm": 0.12176655232906342, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 196230 + }, + { + "epoch": 0.7469378744395302, + "grad_norm": 0.12017684429883957, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 196240 + }, + { + "epoch": 0.7469759369076528, + "grad_norm": 0.12407419085502625, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 196250 + }, + { + "epoch": 0.7470139993757755, + "grad_norm": 0.12825097143650055, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 196260 + }, + { + "epoch": 0.7470520618438982, + "grad_norm": 0.1215038001537323, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 196270 + }, + { + "epoch": 0.7470901243120209, + "grad_norm": 0.12874773144721985, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 196280 + }, + { + "epoch": 0.7471281867801436, + "grad_norm": 0.12362653017044067, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 196290 + }, + { + "epoch": 0.7471662492482662, + "grad_norm": 0.12631510198116302, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 196300 + }, + { + "epoch": 0.7472043117163889, + "grad_norm": 0.12554360926151276, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 196310 + }, + { + "epoch": 0.7472423741845117, + "grad_norm": 0.12463687360286713, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 196320 + }, + { + "epoch": 0.7472804366526343, + "grad_norm": 0.12406588345766068, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 196330 + }, + { + "epoch": 0.747318499120757, + "grad_norm": 0.12036009132862091, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 196340 + }, + { + "epoch": 0.7473565615888796, + "grad_norm": 0.12196014076471329, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 196350 + }, + { + "epoch": 0.7473946240570023, + "grad_norm": 0.1232280284166336, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 196360 + }, + { + "epoch": 0.7474326865251251, + "grad_norm": 0.13089613616466522, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 196370 + }, + { + "epoch": 0.7474707489932477, + "grad_norm": 0.12335113435983658, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 196380 + }, + { + "epoch": 0.7475088114613704, + "grad_norm": 0.12216979265213013, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 196390 + }, + { + "epoch": 0.747546873929493, + "grad_norm": 0.15522664785385132, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 196400 + }, + { + "epoch": 0.7475849363976158, + "grad_norm": 0.13516436517238617, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 196410 + }, + { + "epoch": 0.7476229988657385, + "grad_norm": 0.12450914829969406, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 196420 + }, + { + "epoch": 0.7476610613338611, + "grad_norm": 0.12933525443077087, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 196430 + }, + { + "epoch": 0.7476991238019838, + "grad_norm": 0.1256740838289261, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 196440 + }, + { + "epoch": 0.7477371862701065, + "grad_norm": 0.12129776924848557, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 196450 + }, + { + "epoch": 0.7477752487382292, + "grad_norm": 0.12356984615325928, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 196460 + }, + { + "epoch": 0.7478133112063519, + "grad_norm": 0.12165206670761108, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 196470 + }, + { + "epoch": 0.7478513736744745, + "grad_norm": 0.1134643629193306, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 196480 + }, + { + "epoch": 0.7478894361425973, + "grad_norm": 0.12672019004821777, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 196490 + }, + { + "epoch": 0.7479274986107199, + "grad_norm": 0.11957664787769318, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 196500 + }, + { + "epoch": 0.7479655610788426, + "grad_norm": 0.1286424845457077, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 196510 + }, + { + "epoch": 0.7480036235469653, + "grad_norm": 0.1337771862745285, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 196520 + }, + { + "epoch": 0.7480416860150879, + "grad_norm": 0.1336856186389923, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 196530 + }, + { + "epoch": 0.7480797484832107, + "grad_norm": 0.1441793590784073, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 196540 + }, + { + "epoch": 0.7481178109513333, + "grad_norm": 0.12170455604791641, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 196550 + }, + { + "epoch": 0.748155873419456, + "grad_norm": 0.1308586448431015, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 196560 + }, + { + "epoch": 0.7481939358875787, + "grad_norm": 0.1299491673707962, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 196570 + }, + { + "epoch": 0.7482319983557014, + "grad_norm": 0.12823458015918732, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 196580 + }, + { + "epoch": 0.7482700608238241, + "grad_norm": 0.1290253847837448, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 196590 + }, + { + "epoch": 0.7483081232919467, + "grad_norm": 0.14299185574054718, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 196600 + }, + { + "epoch": 0.7483461857600694, + "grad_norm": 0.137240469455719, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 196610 + }, + { + "epoch": 0.7483842482281922, + "grad_norm": 0.14135576784610748, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 196620 + }, + { + "epoch": 0.7484223106963148, + "grad_norm": 0.12895070016384125, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 196630 + }, + { + "epoch": 0.7484603731644375, + "grad_norm": 0.12500692903995514, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 196640 + }, + { + "epoch": 0.7484984356325601, + "grad_norm": 0.12122353166341782, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 196650 + }, + { + "epoch": 0.7485364981006828, + "grad_norm": 0.12457163631916046, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 196660 + }, + { + "epoch": 0.7485745605688056, + "grad_norm": 0.11478997766971588, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 196670 + }, + { + "epoch": 0.7486126230369282, + "grad_norm": 0.12098927795886993, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 196680 + }, + { + "epoch": 0.7486506855050509, + "grad_norm": 0.137788787484169, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 196690 + }, + { + "epoch": 0.7486887479731735, + "grad_norm": 0.12748932838439941, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 196700 + }, + { + "epoch": 0.7487268104412963, + "grad_norm": 0.12803438305854797, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 196710 + }, + { + "epoch": 0.748764872909419, + "grad_norm": 0.12306778877973557, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 196720 + }, + { + "epoch": 0.7488029353775416, + "grad_norm": 0.1417481154203415, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 196730 + }, + { + "epoch": 0.7488409978456643, + "grad_norm": 0.1265956461429596, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 196740 + }, + { + "epoch": 0.748879060313787, + "grad_norm": 0.14312238991260529, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 196750 + }, + { + "epoch": 0.7489171227819097, + "grad_norm": 0.1359281837940216, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 196760 + }, + { + "epoch": 0.7489551852500324, + "grad_norm": 0.13459917902946472, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 196770 + }, + { + "epoch": 0.748993247718155, + "grad_norm": 0.12361203879117966, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 196780 + }, + { + "epoch": 0.7490313101862777, + "grad_norm": 0.1266535371541977, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 196790 + }, + { + "epoch": 0.7490693726544004, + "grad_norm": 0.13514138758182526, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 196800 + }, + { + "epoch": 0.7491074351225231, + "grad_norm": 0.11725779622793198, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 196810 + }, + { + "epoch": 0.7491454975906457, + "grad_norm": 0.12637847661972046, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 196820 + }, + { + "epoch": 0.7491835600587684, + "grad_norm": 0.12415625900030136, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 196830 + }, + { + "epoch": 0.7492216225268912, + "grad_norm": 0.1321948766708374, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 196840 + }, + { + "epoch": 0.7492596849950138, + "grad_norm": 0.13515117764472961, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 196850 + }, + { + "epoch": 0.7492977474631365, + "grad_norm": 0.12687243521213531, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 196860 + }, + { + "epoch": 0.7493358099312591, + "grad_norm": 0.12546047568321228, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 196870 + }, + { + "epoch": 0.7493738723993819, + "grad_norm": 0.13166911900043488, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 196880 + }, + { + "epoch": 0.7494119348675046, + "grad_norm": 0.13394483923912048, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 196890 + }, + { + "epoch": 0.7494499973356272, + "grad_norm": 0.12177181988954544, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 196900 + }, + { + "epoch": 0.7494880598037499, + "grad_norm": 0.12611910700798035, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 196910 + }, + { + "epoch": 0.7495261222718727, + "grad_norm": 0.11354668438434601, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 196920 + }, + { + "epoch": 0.7495641847399953, + "grad_norm": 0.134921133518219, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 196930 + }, + { + "epoch": 0.749602247208118, + "grad_norm": 0.14567500352859497, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 196940 + }, + { + "epoch": 0.7496403096762406, + "grad_norm": 0.12872666120529175, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 196950 + }, + { + "epoch": 0.7496783721443633, + "grad_norm": 0.11634253710508347, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 196960 + }, + { + "epoch": 0.749716434612486, + "grad_norm": 0.11523030698299408, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 196970 + }, + { + "epoch": 0.7497544970806087, + "grad_norm": 0.12753266096115112, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 196980 + }, + { + "epoch": 0.7497925595487314, + "grad_norm": 0.11636027693748474, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 196990 + }, + { + "epoch": 0.749830622016854, + "grad_norm": 0.12546052038669586, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 197000 + }, + { + "epoch": 0.7498686844849768, + "grad_norm": 0.12241706252098083, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 197010 + }, + { + "epoch": 0.7499067469530994, + "grad_norm": 0.12285008281469345, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 197020 + }, + { + "epoch": 0.7499448094212221, + "grad_norm": 0.11864815652370453, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 197030 + }, + { + "epoch": 0.7499828718893448, + "grad_norm": 0.12144052237272263, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 197040 + }, + { + "epoch": 0.7500209343574675, + "grad_norm": 0.12249705940485, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 197050 + }, + { + "epoch": 0.7500589968255902, + "grad_norm": 0.13299207389354706, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 197060 + }, + { + "epoch": 0.7500970592937128, + "grad_norm": 0.12375347316265106, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 197070 + }, + { + "epoch": 0.7501351217618355, + "grad_norm": 0.1229780837893486, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 197080 + }, + { + "epoch": 0.7501731842299582, + "grad_norm": 0.12229334563016891, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 197090 + }, + { + "epoch": 0.7502112466980809, + "grad_norm": 0.1299109309911728, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 197100 + }, + { + "epoch": 0.7502493091662036, + "grad_norm": 0.12679001688957214, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 197110 + }, + { + "epoch": 0.7502873716343262, + "grad_norm": 0.12795282900333405, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 197120 + }, + { + "epoch": 0.7503254341024489, + "grad_norm": 0.12865543365478516, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 197130 + }, + { + "epoch": 0.7503634965705717, + "grad_norm": 0.13436903059482574, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 197140 + }, + { + "epoch": 0.7504015590386943, + "grad_norm": 0.9106693267822266, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 197150 + }, + { + "epoch": 0.750439621506817, + "grad_norm": 0.1274736523628235, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 197160 + }, + { + "epoch": 0.7504776839749396, + "grad_norm": 0.12124037742614746, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 197170 + }, + { + "epoch": 0.7505157464430624, + "grad_norm": 0.1154475286602974, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 197180 + }, + { + "epoch": 0.7505538089111851, + "grad_norm": 0.1293182224035263, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 197190 + }, + { + "epoch": 0.7505918713793077, + "grad_norm": 0.14153964817523956, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 197200 + }, + { + "epoch": 0.7506299338474304, + "grad_norm": 0.14210331439971924, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 197210 + }, + { + "epoch": 0.750667996315553, + "grad_norm": 0.11955045163631439, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 197220 + }, + { + "epoch": 0.7507060587836758, + "grad_norm": 0.12392129749059677, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 197230 + }, + { + "epoch": 0.7507441212517985, + "grad_norm": 0.12962493300437927, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 197240 + }, + { + "epoch": 0.7507821837199211, + "grad_norm": 0.12972719967365265, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 197250 + }, + { + "epoch": 0.7508202461880438, + "grad_norm": 0.12515634298324585, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 197260 + }, + { + "epoch": 0.7508583086561665, + "grad_norm": 0.1352843940258026, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 197270 + }, + { + "epoch": 0.7508963711242892, + "grad_norm": 0.12352333217859268, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 197280 + }, + { + "epoch": 0.7509344335924119, + "grad_norm": 0.124315544962883, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 197290 + }, + { + "epoch": 0.7509724960605345, + "grad_norm": 0.11670838296413422, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 197300 + }, + { + "epoch": 0.7510105585286573, + "grad_norm": 0.12693895399570465, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 197310 + }, + { + "epoch": 0.7510486209967799, + "grad_norm": 0.11586874723434448, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 197320 + }, + { + "epoch": 0.7510866834649026, + "grad_norm": 0.11443132907152176, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 197330 + }, + { + "epoch": 0.7511247459330253, + "grad_norm": 0.11413166671991348, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 197340 + }, + { + "epoch": 0.751162808401148, + "grad_norm": 0.11946621537208557, + "learning_rate": 0.0005, + "loss": 2.0809, + "step": 197350 + }, + { + "epoch": 0.7512008708692707, + "grad_norm": 0.1346030980348587, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 197360 + }, + { + "epoch": 0.7512389333373933, + "grad_norm": 0.13821019232273102, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 197370 + }, + { + "epoch": 0.751276995805516, + "grad_norm": 0.12743425369262695, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 197380 + }, + { + "epoch": 0.7513150582736386, + "grad_norm": 0.14612269401550293, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 197390 + }, + { + "epoch": 0.7513531207417614, + "grad_norm": 0.11492089182138443, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 197400 + }, + { + "epoch": 0.7513911832098841, + "grad_norm": 0.11517133563756943, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 197410 + }, + { + "epoch": 0.7514292456780067, + "grad_norm": 0.12532839179039001, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 197420 + }, + { + "epoch": 0.7514673081461294, + "grad_norm": 0.13420192897319794, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 197430 + }, + { + "epoch": 0.7515053706142522, + "grad_norm": 0.12853312492370605, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 197440 + }, + { + "epoch": 0.7515434330823748, + "grad_norm": 0.12556856870651245, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 197450 + }, + { + "epoch": 0.7515814955504975, + "grad_norm": 0.12764570116996765, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 197460 + }, + { + "epoch": 0.7516195580186201, + "grad_norm": 0.12602615356445312, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 197470 + }, + { + "epoch": 0.7516576204867429, + "grad_norm": 0.11851579695940018, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 197480 + }, + { + "epoch": 0.7516956829548656, + "grad_norm": 0.13655637204647064, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 197490 + }, + { + "epoch": 0.7517337454229882, + "grad_norm": 0.12730269134044647, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 197500 + }, + { + "epoch": 0.7517718078911109, + "grad_norm": 0.12439996004104614, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 197510 + }, + { + "epoch": 0.7518098703592335, + "grad_norm": 0.11916308104991913, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 197520 + }, + { + "epoch": 0.7518479328273563, + "grad_norm": 0.2023012787103653, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 197530 + }, + { + "epoch": 0.751885995295479, + "grad_norm": 0.11802316457033157, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 197540 + }, + { + "epoch": 0.7519240577636016, + "grad_norm": 0.11546476930379868, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 197550 + }, + { + "epoch": 0.7519621202317243, + "grad_norm": 0.146450936794281, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 197560 + }, + { + "epoch": 0.752000182699847, + "grad_norm": 0.12375127524137497, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 197570 + }, + { + "epoch": 0.7520382451679697, + "grad_norm": 0.12629228830337524, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 197580 + }, + { + "epoch": 0.7520763076360923, + "grad_norm": 0.12279598414897919, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 197590 + }, + { + "epoch": 0.752114370104215, + "grad_norm": 0.1399388462305069, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 197600 + }, + { + "epoch": 0.7521524325723378, + "grad_norm": 0.12528078258037567, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 197610 + }, + { + "epoch": 0.7521904950404604, + "grad_norm": 0.11265812814235687, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 197620 + }, + { + "epoch": 0.7522285575085831, + "grad_norm": 0.12309877574443817, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 197630 + }, + { + "epoch": 0.7522666199767057, + "grad_norm": 0.12960395216941833, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 197640 + }, + { + "epoch": 0.7523046824448284, + "grad_norm": 0.13863934576511383, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 197650 + }, + { + "epoch": 0.7523427449129512, + "grad_norm": 0.1307358592748642, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 197660 + }, + { + "epoch": 0.7523808073810738, + "grad_norm": 0.13089637458324432, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 197670 + }, + { + "epoch": 0.7524188698491965, + "grad_norm": 0.12067008763551712, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 197680 + }, + { + "epoch": 0.7524569323173191, + "grad_norm": 0.12283031642436981, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 197690 + }, + { + "epoch": 0.7524949947854419, + "grad_norm": 0.12055887281894684, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 197700 + }, + { + "epoch": 0.7525330572535646, + "grad_norm": 0.12535439431667328, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 197710 + }, + { + "epoch": 0.7525711197216872, + "grad_norm": 0.13567043840885162, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 197720 + }, + { + "epoch": 0.7526091821898099, + "grad_norm": 0.12497591227293015, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 197730 + }, + { + "epoch": 0.7526472446579326, + "grad_norm": 0.12373036891222, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 197740 + }, + { + "epoch": 0.7526853071260553, + "grad_norm": 0.1281946897506714, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 197750 + }, + { + "epoch": 0.752723369594178, + "grad_norm": 0.11898750066757202, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 197760 + }, + { + "epoch": 0.7527614320623006, + "grad_norm": 0.11677069962024689, + "learning_rate": 0.0005, + "loss": 2.1301, + "step": 197770 + }, + { + "epoch": 0.7527994945304234, + "grad_norm": 0.14142510294914246, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 197780 + }, + { + "epoch": 0.752837556998546, + "grad_norm": 0.1283315122127533, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 197790 + }, + { + "epoch": 0.7528756194666687, + "grad_norm": 0.13681048154830933, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 197800 + }, + { + "epoch": 0.7529136819347914, + "grad_norm": 0.14908556640148163, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 197810 + }, + { + "epoch": 0.752951744402914, + "grad_norm": 0.12765933573246002, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 197820 + }, + { + "epoch": 0.7529898068710368, + "grad_norm": 0.13758790493011475, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 197830 + }, + { + "epoch": 0.7530278693391594, + "grad_norm": 0.13992281258106232, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 197840 + }, + { + "epoch": 0.7530659318072821, + "grad_norm": 0.12648849189281464, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 197850 + }, + { + "epoch": 0.7531039942754048, + "grad_norm": 0.12558187544345856, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 197860 + }, + { + "epoch": 0.7531420567435275, + "grad_norm": 0.12360218912363052, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 197870 + }, + { + "epoch": 0.7531801192116502, + "grad_norm": 0.137051522731781, + "learning_rate": 0.0005, + "loss": 2.1222, + "step": 197880 + }, + { + "epoch": 0.7532181816797728, + "grad_norm": 0.12066777050495148, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 197890 + }, + { + "epoch": 0.7532562441478955, + "grad_norm": 0.13327810168266296, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 197900 + }, + { + "epoch": 0.7532943066160183, + "grad_norm": 0.11925477534532547, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 197910 + }, + { + "epoch": 0.7533323690841409, + "grad_norm": 0.1300128847360611, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 197920 + }, + { + "epoch": 0.7533704315522636, + "grad_norm": 0.12425397336483002, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 197930 + }, + { + "epoch": 0.7534084940203862, + "grad_norm": 0.1329755336046219, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 197940 + }, + { + "epoch": 0.7534465564885089, + "grad_norm": 0.11931162327528, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 197950 + }, + { + "epoch": 0.7534846189566317, + "grad_norm": 0.1208098903298378, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 197960 + }, + { + "epoch": 0.7535226814247543, + "grad_norm": 0.13338631391525269, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 197970 + }, + { + "epoch": 0.753560743892877, + "grad_norm": 0.12523089349269867, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 197980 + }, + { + "epoch": 0.7535988063609996, + "grad_norm": 0.1101926937699318, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 197990 + }, + { + "epoch": 0.7536368688291224, + "grad_norm": 0.1254616379737854, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 198000 + }, + { + "epoch": 0.753674931297245, + "grad_norm": 0.14773495495319366, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 198010 + }, + { + "epoch": 0.7537129937653677, + "grad_norm": 0.13330087065696716, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 198020 + }, + { + "epoch": 0.7537510562334904, + "grad_norm": 0.11752574890851974, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 198030 + }, + { + "epoch": 0.7537891187016131, + "grad_norm": 0.13528861105442047, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 198040 + }, + { + "epoch": 0.7538271811697358, + "grad_norm": 0.12782931327819824, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 198050 + }, + { + "epoch": 0.7538652436378585, + "grad_norm": 0.1557457000017166, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 198060 + }, + { + "epoch": 0.7539033061059811, + "grad_norm": 0.12887586653232574, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 198070 + }, + { + "epoch": 0.7539413685741038, + "grad_norm": 0.12701363861560822, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 198080 + }, + { + "epoch": 0.7539794310422265, + "grad_norm": 0.12219791114330292, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 198090 + }, + { + "epoch": 0.7540174935103492, + "grad_norm": 0.125774547457695, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 198100 + }, + { + "epoch": 0.7540555559784718, + "grad_norm": 0.11728359758853912, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 198110 + }, + { + "epoch": 0.7540936184465945, + "grad_norm": 0.11758928745985031, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 198120 + }, + { + "epoch": 0.7541316809147173, + "grad_norm": 0.11796324700117111, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 198130 + }, + { + "epoch": 0.7541697433828399, + "grad_norm": 0.1250022053718567, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 198140 + }, + { + "epoch": 0.7542078058509626, + "grad_norm": 0.12501591444015503, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 198150 + }, + { + "epoch": 0.7542458683190852, + "grad_norm": 0.12826436758041382, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 198160 + }, + { + "epoch": 0.754283930787208, + "grad_norm": 0.11726287007331848, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 198170 + }, + { + "epoch": 0.7543219932553307, + "grad_norm": 0.12219440191984177, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 198180 + }, + { + "epoch": 0.7543600557234533, + "grad_norm": 0.11771136522293091, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 198190 + }, + { + "epoch": 0.754398118191576, + "grad_norm": 0.12274880707263947, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 198200 + }, + { + "epoch": 0.7544361806596988, + "grad_norm": 0.12219416350126266, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 198210 + }, + { + "epoch": 0.7544742431278214, + "grad_norm": 0.11942414939403534, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 198220 + }, + { + "epoch": 0.7545123055959441, + "grad_norm": 0.1254412829875946, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 198230 + }, + { + "epoch": 0.7545503680640667, + "grad_norm": 0.12787742912769318, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 198240 + }, + { + "epoch": 0.7545884305321894, + "grad_norm": 0.14581698179244995, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 198250 + }, + { + "epoch": 0.7546264930003121, + "grad_norm": 0.12357164174318314, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 198260 + }, + { + "epoch": 0.7546645554684348, + "grad_norm": 0.13686387240886688, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 198270 + }, + { + "epoch": 0.7547026179365575, + "grad_norm": 0.12348747253417969, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 198280 + }, + { + "epoch": 0.7547406804046801, + "grad_norm": 0.11589358001947403, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 198290 + }, + { + "epoch": 0.7547787428728029, + "grad_norm": 0.12138720601797104, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 198300 + }, + { + "epoch": 0.7548168053409255, + "grad_norm": 0.11561553925275803, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 198310 + }, + { + "epoch": 0.7548548678090482, + "grad_norm": 0.12074074149131775, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 198320 + }, + { + "epoch": 0.7548929302771709, + "grad_norm": 0.11432056128978729, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 198330 + }, + { + "epoch": 0.7549309927452936, + "grad_norm": 0.12902531027793884, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 198340 + }, + { + "epoch": 0.7549690552134163, + "grad_norm": 0.11342615634202957, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 198350 + }, + { + "epoch": 0.7550071176815389, + "grad_norm": 0.12891358137130737, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 198360 + }, + { + "epoch": 0.7550451801496616, + "grad_norm": 0.1385073959827423, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 198370 + }, + { + "epoch": 0.7550832426177843, + "grad_norm": 0.14252620935440063, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 198380 + }, + { + "epoch": 0.755121305085907, + "grad_norm": 0.14449909329414368, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 198390 + }, + { + "epoch": 0.7551593675540297, + "grad_norm": 0.13095831871032715, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 198400 + }, + { + "epoch": 0.7551974300221523, + "grad_norm": 0.12750248610973358, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 198410 + }, + { + "epoch": 0.755235492490275, + "grad_norm": 0.12844283878803253, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 198420 + }, + { + "epoch": 0.7552735549583978, + "grad_norm": 0.12403746694326401, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 198430 + }, + { + "epoch": 0.7553116174265204, + "grad_norm": 0.12165076285600662, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 198440 + }, + { + "epoch": 0.7553496798946431, + "grad_norm": 0.1258837878704071, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 198450 + }, + { + "epoch": 0.7553877423627657, + "grad_norm": 0.12263844907283783, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 198460 + }, + { + "epoch": 0.7554258048308885, + "grad_norm": 0.15989969670772552, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 198470 + }, + { + "epoch": 0.7554638672990112, + "grad_norm": 0.12192397564649582, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 198480 + }, + { + "epoch": 0.7555019297671338, + "grad_norm": 0.1142887994647026, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 198490 + }, + { + "epoch": 0.7555399922352565, + "grad_norm": 0.12102162837982178, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 198500 + }, + { + "epoch": 0.7555780547033791, + "grad_norm": 0.12950734794139862, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 198510 + }, + { + "epoch": 0.7556161171715019, + "grad_norm": 0.11939499527215958, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 198520 + }, + { + "epoch": 0.7556541796396246, + "grad_norm": 0.12329903990030289, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 198530 + }, + { + "epoch": 0.7556922421077472, + "grad_norm": 0.1571332812309265, + "learning_rate": 0.0005, + "loss": 2.0888, + "step": 198540 + }, + { + "epoch": 0.7557303045758699, + "grad_norm": 0.12774313986301422, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 198550 + }, + { + "epoch": 0.7557683670439926, + "grad_norm": 0.11515670269727707, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 198560 + }, + { + "epoch": 0.7558064295121153, + "grad_norm": 0.12537802755832672, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 198570 + }, + { + "epoch": 0.755844491980238, + "grad_norm": 0.13329669833183289, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 198580 + }, + { + "epoch": 0.7558825544483606, + "grad_norm": 0.11480581760406494, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 198590 + }, + { + "epoch": 0.7559206169164834, + "grad_norm": 0.12687240540981293, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 198600 + }, + { + "epoch": 0.755958679384606, + "grad_norm": 0.12661078572273254, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 198610 + }, + { + "epoch": 0.7559967418527287, + "grad_norm": 0.13912835717201233, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 198620 + }, + { + "epoch": 0.7560348043208514, + "grad_norm": 0.1346118450164795, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 198630 + }, + { + "epoch": 0.7560728667889741, + "grad_norm": 0.13172611594200134, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 198640 + }, + { + "epoch": 0.7561109292570968, + "grad_norm": 0.12862594425678253, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 198650 + }, + { + "epoch": 0.7561489917252194, + "grad_norm": 0.12063287943601608, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 198660 + }, + { + "epoch": 0.7561870541933421, + "grad_norm": 0.11567655950784683, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 198670 + }, + { + "epoch": 0.7562251166614647, + "grad_norm": 0.12064255028963089, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 198680 + }, + { + "epoch": 0.7562631791295875, + "grad_norm": 0.12698319554328918, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 198690 + }, + { + "epoch": 0.7563012415977102, + "grad_norm": 0.12099333107471466, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 198700 + }, + { + "epoch": 0.7563393040658328, + "grad_norm": 0.1265001893043518, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 198710 + }, + { + "epoch": 0.7563773665339555, + "grad_norm": 0.12512876093387604, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 198720 + }, + { + "epoch": 0.7564154290020783, + "grad_norm": 0.12623614072799683, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 198730 + }, + { + "epoch": 0.7564534914702009, + "grad_norm": 0.11762641370296478, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 198740 + }, + { + "epoch": 0.7564915539383236, + "grad_norm": 0.11356106400489807, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 198750 + }, + { + "epoch": 0.7565296164064462, + "grad_norm": 0.13299451768398285, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 198760 + }, + { + "epoch": 0.756567678874569, + "grad_norm": 0.13182711601257324, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 198770 + }, + { + "epoch": 0.7566057413426917, + "grad_norm": 0.13028521835803986, + "learning_rate": 0.0005, + "loss": 2.0855, + "step": 198780 + }, + { + "epoch": 0.7566438038108143, + "grad_norm": 0.11709822714328766, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 198790 + }, + { + "epoch": 0.756681866278937, + "grad_norm": 0.14088694751262665, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 198800 + }, + { + "epoch": 0.7567199287470596, + "grad_norm": 0.12376152724027634, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 198810 + }, + { + "epoch": 0.7567579912151824, + "grad_norm": 0.12194863706827164, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 198820 + }, + { + "epoch": 0.756796053683305, + "grad_norm": 0.12565350532531738, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 198830 + }, + { + "epoch": 0.7568341161514277, + "grad_norm": 0.12101472169160843, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 198840 + }, + { + "epoch": 0.7568721786195504, + "grad_norm": 0.11491557955741882, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 198850 + }, + { + "epoch": 0.7569102410876731, + "grad_norm": 0.1167779192328453, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 198860 + }, + { + "epoch": 0.7569483035557958, + "grad_norm": 0.12635232508182526, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 198870 + }, + { + "epoch": 0.7569863660239184, + "grad_norm": 0.12471341341733932, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 198880 + }, + { + "epoch": 0.7570244284920411, + "grad_norm": 0.12258568406105042, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 198890 + }, + { + "epoch": 0.7570624909601639, + "grad_norm": 0.1298074573278427, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 198900 + }, + { + "epoch": 0.7571005534282865, + "grad_norm": 0.13026870787143707, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 198910 + }, + { + "epoch": 0.7571386158964092, + "grad_norm": 0.120946004986763, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 198920 + }, + { + "epoch": 0.7571766783645318, + "grad_norm": 0.12533290684223175, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 198930 + }, + { + "epoch": 0.7572147408326545, + "grad_norm": 0.10581686347723007, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 198940 + }, + { + "epoch": 0.7572528033007773, + "grad_norm": 0.12052454799413681, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 198950 + }, + { + "epoch": 0.7572908657688999, + "grad_norm": 0.13290107250213623, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 198960 + }, + { + "epoch": 0.7573289282370226, + "grad_norm": 0.1411670446395874, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 198970 + }, + { + "epoch": 0.7573669907051452, + "grad_norm": 0.13029508292675018, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 198980 + }, + { + "epoch": 0.757405053173268, + "grad_norm": 0.12159781903028488, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 198990 + }, + { + "epoch": 0.7574431156413907, + "grad_norm": 0.1355499029159546, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 199000 + }, + { + "epoch": 0.7574811781095133, + "grad_norm": 0.12084466218948364, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 199010 + }, + { + "epoch": 0.757519240577636, + "grad_norm": 0.12449130415916443, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 199020 + }, + { + "epoch": 0.7575573030457587, + "grad_norm": 0.1268312633037567, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 199030 + }, + { + "epoch": 0.7575953655138814, + "grad_norm": 0.12807604670524597, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 199040 + }, + { + "epoch": 0.7576334279820041, + "grad_norm": 0.12403193861246109, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 199050 + }, + { + "epoch": 0.7576714904501267, + "grad_norm": 0.12660513818264008, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 199060 + }, + { + "epoch": 0.7577095529182495, + "grad_norm": 0.13343201577663422, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 199070 + }, + { + "epoch": 0.7577476153863721, + "grad_norm": 0.13150076568126678, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 199080 + }, + { + "epoch": 0.7577856778544948, + "grad_norm": 0.12581336498260498, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 199090 + }, + { + "epoch": 0.7578237403226175, + "grad_norm": 0.11675869673490524, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 199100 + }, + { + "epoch": 0.7578618027907401, + "grad_norm": 0.1217801496386528, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 199110 + }, + { + "epoch": 0.7578998652588629, + "grad_norm": 0.11997724324464798, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 199120 + }, + { + "epoch": 0.7579379277269855, + "grad_norm": 0.13471657037734985, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 199130 + }, + { + "epoch": 0.7579759901951082, + "grad_norm": 0.13399042189121246, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 199140 + }, + { + "epoch": 0.7580140526632309, + "grad_norm": 0.1244891956448555, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 199150 + }, + { + "epoch": 0.7580521151313536, + "grad_norm": 0.12309864163398743, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 199160 + }, + { + "epoch": 0.7580901775994763, + "grad_norm": 0.11609305441379547, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 199170 + }, + { + "epoch": 0.7581282400675989, + "grad_norm": 0.12005341053009033, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 199180 + }, + { + "epoch": 0.7581663025357216, + "grad_norm": 0.12097519636154175, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 199190 + }, + { + "epoch": 0.7582043650038444, + "grad_norm": 0.1215854361653328, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 199200 + }, + { + "epoch": 0.758242427471967, + "grad_norm": 0.11642411351203918, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 199210 + }, + { + "epoch": 0.7582804899400897, + "grad_norm": 0.1203550472855568, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 199220 + }, + { + "epoch": 0.7583185524082123, + "grad_norm": 0.14909736812114716, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 199230 + }, + { + "epoch": 0.758356614876335, + "grad_norm": 0.13295716047286987, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 199240 + }, + { + "epoch": 0.7583946773444578, + "grad_norm": 0.13675452768802643, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 199250 + }, + { + "epoch": 0.7584327398125804, + "grad_norm": 0.12822073698043823, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 199260 + }, + { + "epoch": 0.7584708022807031, + "grad_norm": 0.1309756189584732, + "learning_rate": 0.0005, + "loss": 2.0837, + "step": 199270 + }, + { + "epoch": 0.7585088647488257, + "grad_norm": 0.11035849899053574, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 199280 + }, + { + "epoch": 0.7585469272169485, + "grad_norm": 0.12121044844388962, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 199290 + }, + { + "epoch": 0.7585849896850712, + "grad_norm": 0.13430899381637573, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 199300 + }, + { + "epoch": 0.7586230521531938, + "grad_norm": 0.11547856032848358, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 199310 + }, + { + "epoch": 0.7586611146213165, + "grad_norm": 0.12349169701337814, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 199320 + }, + { + "epoch": 0.7586991770894392, + "grad_norm": 0.14382505416870117, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 199330 + }, + { + "epoch": 0.7587372395575619, + "grad_norm": 0.13851898908615112, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 199340 + }, + { + "epoch": 0.7587753020256846, + "grad_norm": 0.12755434215068817, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 199350 + }, + { + "epoch": 0.7588133644938072, + "grad_norm": 0.1360461562871933, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 199360 + }, + { + "epoch": 0.7588514269619299, + "grad_norm": 0.1464185267686844, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 199370 + }, + { + "epoch": 0.7588894894300526, + "grad_norm": 0.14608561992645264, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 199380 + }, + { + "epoch": 0.7589275518981753, + "grad_norm": 0.11467023193836212, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 199390 + }, + { + "epoch": 0.758965614366298, + "grad_norm": 0.1256762444972992, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 199400 + }, + { + "epoch": 0.7590036768344206, + "grad_norm": 0.12068627774715424, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 199410 + }, + { + "epoch": 0.7590417393025434, + "grad_norm": 0.16103871166706085, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 199420 + }, + { + "epoch": 0.759079801770666, + "grad_norm": 0.12145240604877472, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 199430 + }, + { + "epoch": 0.7591178642387887, + "grad_norm": 0.1241162121295929, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 199440 + }, + { + "epoch": 0.7591559267069113, + "grad_norm": 0.11487778276205063, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 199450 + }, + { + "epoch": 0.7591939891750341, + "grad_norm": 0.12314852327108383, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 199460 + }, + { + "epoch": 0.7592320516431568, + "grad_norm": 0.21512353420257568, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 199470 + }, + { + "epoch": 0.7592701141112794, + "grad_norm": 0.1336342692375183, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 199480 + }, + { + "epoch": 0.7593081765794021, + "grad_norm": 0.11381682753562927, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 199490 + }, + { + "epoch": 0.7593462390475249, + "grad_norm": 0.11951464414596558, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 199500 + }, + { + "epoch": 0.7593843015156475, + "grad_norm": 0.11122014373540878, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 199510 + }, + { + "epoch": 0.7594223639837702, + "grad_norm": 0.11937428265810013, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 199520 + }, + { + "epoch": 0.7594604264518928, + "grad_norm": 0.12456253916025162, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 199530 + }, + { + "epoch": 0.7594984889200155, + "grad_norm": 0.11650332808494568, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 199540 + }, + { + "epoch": 0.7595365513881382, + "grad_norm": 0.1122107282280922, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 199550 + }, + { + "epoch": 0.7595746138562609, + "grad_norm": 0.12337896972894669, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 199560 + }, + { + "epoch": 0.7596126763243836, + "grad_norm": 0.11624779552221298, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 199570 + }, + { + "epoch": 0.7596507387925062, + "grad_norm": 0.13512980937957764, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 199580 + }, + { + "epoch": 0.759688801260629, + "grad_norm": 0.13231684267520905, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 199590 + }, + { + "epoch": 0.7597268637287516, + "grad_norm": 0.1199183389544487, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 199600 + }, + { + "epoch": 0.7597649261968743, + "grad_norm": 0.12749294936656952, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 199610 + }, + { + "epoch": 0.759802988664997, + "grad_norm": 0.13242657482624054, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 199620 + }, + { + "epoch": 0.7598410511331197, + "grad_norm": 0.12247808277606964, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 199630 + }, + { + "epoch": 0.7598791136012424, + "grad_norm": 0.1193656176328659, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 199640 + }, + { + "epoch": 0.759917176069365, + "grad_norm": 0.12609358131885529, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 199650 + }, + { + "epoch": 0.7599552385374877, + "grad_norm": 0.12350145727396011, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 199660 + }, + { + "epoch": 0.7599933010056104, + "grad_norm": 0.1205834224820137, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 199670 + }, + { + "epoch": 0.7600313634737331, + "grad_norm": 0.12084044516086578, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 199680 + }, + { + "epoch": 0.7600694259418558, + "grad_norm": 0.12557877600193024, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 199690 + }, + { + "epoch": 0.7601074884099784, + "grad_norm": 0.1353178471326828, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 199700 + }, + { + "epoch": 0.7601455508781011, + "grad_norm": 0.12090218812227249, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 199710 + }, + { + "epoch": 0.7601836133462239, + "grad_norm": 0.13385021686553955, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 199720 + }, + { + "epoch": 0.7602216758143465, + "grad_norm": 0.1355661153793335, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 199730 + }, + { + "epoch": 0.7602597382824692, + "grad_norm": 0.1311371773481369, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 199740 + }, + { + "epoch": 0.7602978007505918, + "grad_norm": 0.14127229154109955, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 199750 + }, + { + "epoch": 0.7603358632187146, + "grad_norm": 0.12275160849094391, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 199760 + }, + { + "epoch": 0.7603739256868373, + "grad_norm": 0.12980370223522186, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 199770 + }, + { + "epoch": 0.7604119881549599, + "grad_norm": 0.12649033963680267, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 199780 + }, + { + "epoch": 0.7604500506230826, + "grad_norm": 0.12469767779111862, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 199790 + }, + { + "epoch": 0.7604881130912052, + "grad_norm": 0.1242150142788887, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 199800 + }, + { + "epoch": 0.760526175559328, + "grad_norm": 0.13924376666545868, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 199810 + }, + { + "epoch": 0.7605642380274507, + "grad_norm": 0.14627453684806824, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 199820 + }, + { + "epoch": 0.7606023004955733, + "grad_norm": 0.14123176038265228, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 199830 + }, + { + "epoch": 0.760640362963696, + "grad_norm": 0.14151117205619812, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 199840 + }, + { + "epoch": 0.7606784254318187, + "grad_norm": 0.12917184829711914, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 199850 + }, + { + "epoch": 0.7607164878999414, + "grad_norm": 0.14700210094451904, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 199860 + }, + { + "epoch": 0.760754550368064, + "grad_norm": 0.12420197576284409, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 199870 + }, + { + "epoch": 0.7607926128361867, + "grad_norm": 0.12090126425027847, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 199880 + }, + { + "epoch": 0.7608306753043095, + "grad_norm": 0.12003166973590851, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 199890 + }, + { + "epoch": 0.7608687377724321, + "grad_norm": 0.11866210401058197, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 199900 + }, + { + "epoch": 0.7609068002405548, + "grad_norm": 0.13188976049423218, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 199910 + }, + { + "epoch": 0.7609448627086774, + "grad_norm": 0.12912617623806, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 199920 + }, + { + "epoch": 0.7609829251768002, + "grad_norm": 0.14618924260139465, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 199930 + }, + { + "epoch": 0.7610209876449229, + "grad_norm": 0.12827840447425842, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 199940 + }, + { + "epoch": 0.7610590501130455, + "grad_norm": 0.12038593739271164, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 199950 + }, + { + "epoch": 0.7610971125811682, + "grad_norm": 0.13553407788276672, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 199960 + }, + { + "epoch": 0.7611351750492908, + "grad_norm": 0.12781314551830292, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 199970 + }, + { + "epoch": 0.7611732375174136, + "grad_norm": 0.11799361556768417, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 199980 + }, + { + "epoch": 0.7612112999855363, + "grad_norm": 0.12439846992492676, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 199990 + }, + { + "epoch": 0.7612493624536589, + "grad_norm": 0.1184629574418068, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 200000 + }, + { + "epoch": 0.7612874249217816, + "grad_norm": 0.12614157795906067, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 200010 + }, + { + "epoch": 0.7613254873899044, + "grad_norm": 0.14780183136463165, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 200020 + }, + { + "epoch": 0.761363549858027, + "grad_norm": 0.12393063306808472, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 200030 + }, + { + "epoch": 0.7614016123261497, + "grad_norm": 0.11860448122024536, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 200040 + }, + { + "epoch": 0.7614396747942723, + "grad_norm": 0.11324331164360046, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 200050 + }, + { + "epoch": 0.7614777372623951, + "grad_norm": 0.12680920958518982, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 200060 + }, + { + "epoch": 0.7615157997305178, + "grad_norm": 0.11807206273078918, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 200070 + }, + { + "epoch": 0.7615538621986404, + "grad_norm": 0.12784326076507568, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 200080 + }, + { + "epoch": 0.7615919246667631, + "grad_norm": 0.12069597095251083, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 200090 + }, + { + "epoch": 0.7616299871348857, + "grad_norm": 0.12362676113843918, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 200100 + }, + { + "epoch": 0.7616680496030085, + "grad_norm": 0.11808501929044724, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 200110 + }, + { + "epoch": 0.7617061120711311, + "grad_norm": 0.1286737024784088, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 200120 + }, + { + "epoch": 0.7617441745392538, + "grad_norm": 0.1303025186061859, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 200130 + }, + { + "epoch": 0.7617822370073765, + "grad_norm": 0.12651576101779938, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 200140 + }, + { + "epoch": 0.7618202994754992, + "grad_norm": 0.1360395848751068, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 200150 + }, + { + "epoch": 0.7618583619436219, + "grad_norm": 0.11983554065227509, + "learning_rate": 0.0005, + "loss": 2.1298, + "step": 200160 + }, + { + "epoch": 0.7618964244117445, + "grad_norm": 0.12265199422836304, + "learning_rate": 0.0005, + "loss": 2.1249, + "step": 200170 + }, + { + "epoch": 0.7619344868798672, + "grad_norm": 0.12509949505329132, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 200180 + }, + { + "epoch": 0.76197254934799, + "grad_norm": 0.11727918684482574, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 200190 + }, + { + "epoch": 0.7620106118161126, + "grad_norm": 0.12236443907022476, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 200200 + }, + { + "epoch": 0.7620486742842353, + "grad_norm": 0.11449822783470154, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 200210 + }, + { + "epoch": 0.7620867367523579, + "grad_norm": 0.1347184032201767, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 200220 + }, + { + "epoch": 0.7621247992204806, + "grad_norm": 0.12472956627607346, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 200230 + }, + { + "epoch": 0.7621628616886034, + "grad_norm": 0.13330808281898499, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 200240 + }, + { + "epoch": 0.762200924156726, + "grad_norm": 0.14842481911182404, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 200250 + }, + { + "epoch": 0.7622389866248487, + "grad_norm": 0.12240416556596756, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 200260 + }, + { + "epoch": 0.7622770490929713, + "grad_norm": 0.12104672193527222, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 200270 + }, + { + "epoch": 0.7623151115610941, + "grad_norm": 0.1383139193058014, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 200280 + }, + { + "epoch": 0.7623531740292168, + "grad_norm": 0.12329646944999695, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 200290 + }, + { + "epoch": 0.7623912364973394, + "grad_norm": 0.1255401223897934, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 200300 + }, + { + "epoch": 0.7624292989654621, + "grad_norm": 0.12841983139514923, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 200310 + }, + { + "epoch": 0.7624673614335848, + "grad_norm": 0.12036871910095215, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 200320 + }, + { + "epoch": 0.7625054239017075, + "grad_norm": 0.11924967914819717, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 200330 + }, + { + "epoch": 0.7625434863698302, + "grad_norm": 0.12155918776988983, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 200340 + }, + { + "epoch": 0.7625815488379528, + "grad_norm": 0.11829260736703873, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 200350 + }, + { + "epoch": 0.7626196113060756, + "grad_norm": 0.1354644000530243, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 200360 + }, + { + "epoch": 0.7626576737741982, + "grad_norm": 0.1325652152299881, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 200370 + }, + { + "epoch": 0.7626957362423209, + "grad_norm": 0.13303276896476746, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 200380 + }, + { + "epoch": 0.7627337987104436, + "grad_norm": 0.11551357805728912, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 200390 + }, + { + "epoch": 0.7627718611785662, + "grad_norm": 0.11996026337146759, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 200400 + }, + { + "epoch": 0.762809923646689, + "grad_norm": 0.12938952445983887, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 200410 + }, + { + "epoch": 0.7628479861148116, + "grad_norm": 0.13679099082946777, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 200420 + }, + { + "epoch": 0.7628860485829343, + "grad_norm": 0.12176800519227982, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 200430 + }, + { + "epoch": 0.762924111051057, + "grad_norm": 0.1698097288608551, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 200440 + }, + { + "epoch": 0.7629621735191797, + "grad_norm": 0.11960088461637497, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 200450 + }, + { + "epoch": 0.7630002359873024, + "grad_norm": 0.1185685396194458, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 200460 + }, + { + "epoch": 0.763038298455425, + "grad_norm": 0.11566752195358276, + "learning_rate": 0.0005, + "loss": 2.1233, + "step": 200470 + }, + { + "epoch": 0.7630763609235477, + "grad_norm": 0.12025351822376251, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 200480 + }, + { + "epoch": 0.7631144233916705, + "grad_norm": 0.12671546638011932, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 200490 + }, + { + "epoch": 0.7631524858597931, + "grad_norm": 0.1308915913105011, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 200500 + }, + { + "epoch": 0.7631905483279158, + "grad_norm": 0.12134160101413727, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 200510 + }, + { + "epoch": 0.7632286107960384, + "grad_norm": 0.11867324262857437, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 200520 + }, + { + "epoch": 0.7632666732641611, + "grad_norm": 0.13097916543483734, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 200530 + }, + { + "epoch": 0.7633047357322839, + "grad_norm": 0.14320102334022522, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 200540 + }, + { + "epoch": 0.7633427982004065, + "grad_norm": 0.12619368731975555, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 200550 + }, + { + "epoch": 0.7633808606685292, + "grad_norm": 0.1211523711681366, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 200560 + }, + { + "epoch": 0.7634189231366518, + "grad_norm": 0.12414732575416565, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 200570 + }, + { + "epoch": 0.7634569856047746, + "grad_norm": 0.12435490638017654, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 200580 + }, + { + "epoch": 0.7634950480728973, + "grad_norm": 0.12518084049224854, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 200590 + }, + { + "epoch": 0.7635331105410199, + "grad_norm": 0.1124156191945076, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 200600 + }, + { + "epoch": 0.7635711730091426, + "grad_norm": 0.12874804437160492, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 200610 + }, + { + "epoch": 0.7636092354772653, + "grad_norm": 0.13261353969573975, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 200620 + }, + { + "epoch": 0.763647297945388, + "grad_norm": 0.11985619366168976, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 200630 + }, + { + "epoch": 0.7636853604135106, + "grad_norm": 0.12247840315103531, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 200640 + }, + { + "epoch": 0.7637234228816333, + "grad_norm": 0.1114402785897255, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 200650 + }, + { + "epoch": 0.763761485349756, + "grad_norm": 0.1159055233001709, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 200660 + }, + { + "epoch": 0.7637995478178787, + "grad_norm": 0.1403997242450714, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 200670 + }, + { + "epoch": 0.7638376102860014, + "grad_norm": 0.12809248268604279, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 200680 + }, + { + "epoch": 0.763875672754124, + "grad_norm": 0.11926622688770294, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 200690 + }, + { + "epoch": 0.7639137352222467, + "grad_norm": 0.12996363639831543, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 200700 + }, + { + "epoch": 0.7639517976903695, + "grad_norm": 0.12981660664081573, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 200710 + }, + { + "epoch": 0.7639898601584921, + "grad_norm": 0.21764759719371796, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 200720 + }, + { + "epoch": 0.7640279226266148, + "grad_norm": 0.21405267715454102, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 200730 + }, + { + "epoch": 0.7640659850947374, + "grad_norm": 0.1331353485584259, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 200740 + }, + { + "epoch": 0.7641040475628602, + "grad_norm": 0.12313000112771988, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 200750 + }, + { + "epoch": 0.7641421100309829, + "grad_norm": 0.12090528011322021, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 200760 + }, + { + "epoch": 0.7641801724991055, + "grad_norm": 0.12569141387939453, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 200770 + }, + { + "epoch": 0.7642182349672282, + "grad_norm": 0.11512047052383423, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 200780 + }, + { + "epoch": 0.764256297435351, + "grad_norm": 0.13008204102516174, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 200790 + }, + { + "epoch": 0.7642943599034736, + "grad_norm": 0.11905677616596222, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 200800 + }, + { + "epoch": 0.7643324223715963, + "grad_norm": 0.12396584451198578, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 200810 + }, + { + "epoch": 0.7643704848397189, + "grad_norm": 0.11942270398139954, + "learning_rate": 0.0005, + "loss": 2.0799, + "step": 200820 + }, + { + "epoch": 0.7644085473078416, + "grad_norm": 0.11927188187837601, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 200830 + }, + { + "epoch": 0.7644466097759643, + "grad_norm": 0.1283026933670044, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 200840 + }, + { + "epoch": 0.764484672244087, + "grad_norm": 0.13145646452903748, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 200850 + }, + { + "epoch": 0.7645227347122097, + "grad_norm": 0.12313584238290787, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 200860 + }, + { + "epoch": 0.7645607971803323, + "grad_norm": 0.13153089582920074, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 200870 + }, + { + "epoch": 0.7645988596484551, + "grad_norm": 0.12019491195678711, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 200880 + }, + { + "epoch": 0.7646369221165777, + "grad_norm": 0.12518441677093506, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 200890 + }, + { + "epoch": 0.7646749845847004, + "grad_norm": 0.11960908025503159, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 200900 + }, + { + "epoch": 0.7647130470528231, + "grad_norm": 0.1407465934753418, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 200910 + }, + { + "epoch": 0.7647511095209458, + "grad_norm": 0.1314762830734253, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 200920 + }, + { + "epoch": 0.7647891719890685, + "grad_norm": 0.11958575248718262, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 200930 + }, + { + "epoch": 0.7648272344571911, + "grad_norm": 0.12189696729183197, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 200940 + }, + { + "epoch": 0.7648652969253138, + "grad_norm": 0.15705542266368866, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 200950 + }, + { + "epoch": 0.7649033593934365, + "grad_norm": 0.11517345160245895, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 200960 + }, + { + "epoch": 0.7649414218615592, + "grad_norm": 0.13677245378494263, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 200970 + }, + { + "epoch": 0.7649794843296819, + "grad_norm": 0.13079141080379486, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 200980 + }, + { + "epoch": 0.7650175467978045, + "grad_norm": 0.12438200414180756, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 200990 + }, + { + "epoch": 0.7650556092659272, + "grad_norm": 0.12100034207105637, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 201000 + }, + { + "epoch": 0.76509367173405, + "grad_norm": 0.1258230209350586, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 201010 + }, + { + "epoch": 0.7651317342021726, + "grad_norm": 0.13739217817783356, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 201020 + }, + { + "epoch": 0.7651697966702953, + "grad_norm": 0.12209373712539673, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 201030 + }, + { + "epoch": 0.7652078591384179, + "grad_norm": 0.11340746283531189, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 201040 + }, + { + "epoch": 0.7652459216065407, + "grad_norm": 0.14591138064861298, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 201050 + }, + { + "epoch": 0.7652839840746634, + "grad_norm": 0.11925285309553146, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 201060 + }, + { + "epoch": 0.765322046542786, + "grad_norm": 0.1297868937253952, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 201070 + }, + { + "epoch": 0.7653601090109087, + "grad_norm": 0.12385216355323792, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 201080 + }, + { + "epoch": 0.7653981714790314, + "grad_norm": 0.12152455747127533, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 201090 + }, + { + "epoch": 0.7654362339471541, + "grad_norm": 0.12108520418405533, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 201100 + }, + { + "epoch": 0.7654742964152768, + "grad_norm": 0.12145870923995972, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 201110 + }, + { + "epoch": 0.7655123588833994, + "grad_norm": 0.12569770216941833, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 201120 + }, + { + "epoch": 0.7655504213515221, + "grad_norm": 0.12960554659366608, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 201130 + }, + { + "epoch": 0.7655884838196448, + "grad_norm": 0.1400604248046875, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 201140 + }, + { + "epoch": 0.7656265462877675, + "grad_norm": 0.13274477422237396, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 201150 + }, + { + "epoch": 0.7656646087558902, + "grad_norm": 0.11507438123226166, + "learning_rate": 0.0005, + "loss": 2.1268, + "step": 201160 + }, + { + "epoch": 0.7657026712240128, + "grad_norm": 0.11901625990867615, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 201170 + }, + { + "epoch": 0.7657407336921356, + "grad_norm": 0.1272687017917633, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 201180 + }, + { + "epoch": 0.7657787961602582, + "grad_norm": 0.13874635100364685, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 201190 + }, + { + "epoch": 0.7658168586283809, + "grad_norm": 0.5305657982826233, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 201200 + }, + { + "epoch": 0.7658549210965035, + "grad_norm": 0.1429230123758316, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 201210 + }, + { + "epoch": 0.7658929835646263, + "grad_norm": 0.11990445107221603, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 201220 + }, + { + "epoch": 0.765931046032749, + "grad_norm": 0.11194054782390594, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 201230 + }, + { + "epoch": 0.7659691085008716, + "grad_norm": 0.12997521460056305, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 201240 + }, + { + "epoch": 0.7660071709689943, + "grad_norm": 0.12883532047271729, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 201250 + }, + { + "epoch": 0.766045233437117, + "grad_norm": 0.11726586520671844, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 201260 + }, + { + "epoch": 0.7660832959052397, + "grad_norm": 0.1368391364812851, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 201270 + }, + { + "epoch": 0.7661213583733624, + "grad_norm": 0.11998681724071503, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 201280 + }, + { + "epoch": 0.766159420841485, + "grad_norm": 0.13713191449642181, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 201290 + }, + { + "epoch": 0.7661974833096077, + "grad_norm": 0.14142447710037231, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 201300 + }, + { + "epoch": 0.7662355457777305, + "grad_norm": 0.12667404115200043, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 201310 + }, + { + "epoch": 0.7662736082458531, + "grad_norm": 0.11173027753829956, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 201320 + }, + { + "epoch": 0.7663116707139758, + "grad_norm": 0.1272125542163849, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 201330 + }, + { + "epoch": 0.7663497331820984, + "grad_norm": 0.11534397304058075, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 201340 + }, + { + "epoch": 0.7663877956502212, + "grad_norm": 0.12607638537883759, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 201350 + }, + { + "epoch": 0.7664258581183439, + "grad_norm": 0.13547714054584503, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 201360 + }, + { + "epoch": 0.7664639205864665, + "grad_norm": 0.13281914591789246, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 201370 + }, + { + "epoch": 0.7665019830545892, + "grad_norm": 0.15444037318229675, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 201380 + }, + { + "epoch": 0.7665400455227118, + "grad_norm": 0.14034345746040344, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 201390 + }, + { + "epoch": 0.7665781079908346, + "grad_norm": 0.3182571530342102, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 201400 + }, + { + "epoch": 0.7666161704589572, + "grad_norm": 0.1384763866662979, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 201410 + }, + { + "epoch": 0.7666542329270799, + "grad_norm": 0.12868386507034302, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 201420 + }, + { + "epoch": 0.7666922953952026, + "grad_norm": 0.12463357299566269, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 201430 + }, + { + "epoch": 0.7667303578633253, + "grad_norm": 0.12342123687267303, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 201440 + }, + { + "epoch": 0.766768420331448, + "grad_norm": 0.13239067792892456, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 201450 + }, + { + "epoch": 0.7668064827995706, + "grad_norm": 0.11232977360486984, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 201460 + }, + { + "epoch": 0.7668445452676933, + "grad_norm": 0.1485995352268219, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 201470 + }, + { + "epoch": 0.7668826077358161, + "grad_norm": 0.12763135135173798, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 201480 + }, + { + "epoch": 0.7669206702039387, + "grad_norm": 0.13601966202259064, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 201490 + }, + { + "epoch": 0.7669587326720614, + "grad_norm": 0.15210683643817902, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 201500 + }, + { + "epoch": 0.766996795140184, + "grad_norm": 0.12328286468982697, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 201510 + }, + { + "epoch": 0.7670348576083068, + "grad_norm": 0.12505197525024414, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 201520 + }, + { + "epoch": 0.7670729200764295, + "grad_norm": 0.1358686238527298, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 201530 + }, + { + "epoch": 0.7671109825445521, + "grad_norm": 0.13519304990768433, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 201540 + }, + { + "epoch": 0.7671490450126748, + "grad_norm": 0.11909667402505875, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 201550 + }, + { + "epoch": 0.7671871074807974, + "grad_norm": 0.12398222088813782, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 201560 + }, + { + "epoch": 0.7672251699489202, + "grad_norm": 0.11791833490133286, + "learning_rate": 0.0005, + "loss": 2.1267, + "step": 201570 + }, + { + "epoch": 0.7672632324170429, + "grad_norm": 0.12576699256896973, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 201580 + }, + { + "epoch": 0.7673012948851655, + "grad_norm": 0.12361087650060654, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 201590 + }, + { + "epoch": 0.7673393573532882, + "grad_norm": 0.24520492553710938, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 201600 + }, + { + "epoch": 0.7673774198214109, + "grad_norm": 0.12726250290870667, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 201610 + }, + { + "epoch": 0.7674154822895336, + "grad_norm": 0.12805522978305817, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 201620 + }, + { + "epoch": 0.7674535447576563, + "grad_norm": 0.24818246066570282, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 201630 + }, + { + "epoch": 0.7674916072257789, + "grad_norm": 0.11738839745521545, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 201640 + }, + { + "epoch": 0.7675296696939017, + "grad_norm": 0.1346309930086136, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 201650 + }, + { + "epoch": 0.7675677321620243, + "grad_norm": 0.11849494278430939, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 201660 + }, + { + "epoch": 0.767605794630147, + "grad_norm": 0.12269473075866699, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 201670 + }, + { + "epoch": 0.7676438570982697, + "grad_norm": 0.13212372362613678, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 201680 + }, + { + "epoch": 0.7676819195663923, + "grad_norm": 0.1193777471780777, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 201690 + }, + { + "epoch": 0.7677199820345151, + "grad_norm": 0.1222861036658287, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 201700 + }, + { + "epoch": 0.7677580445026377, + "grad_norm": 0.13242977857589722, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 201710 + }, + { + "epoch": 0.7677961069707604, + "grad_norm": 0.12794439494609833, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 201720 + }, + { + "epoch": 0.767834169438883, + "grad_norm": 0.2587161064147949, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 201730 + }, + { + "epoch": 0.7678722319070058, + "grad_norm": 0.12928904592990875, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 201740 + }, + { + "epoch": 0.7679102943751285, + "grad_norm": 0.15386945009231567, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 201750 + }, + { + "epoch": 0.7679483568432511, + "grad_norm": 0.13748303055763245, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 201760 + }, + { + "epoch": 0.7679864193113738, + "grad_norm": 0.12886695563793182, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 201770 + }, + { + "epoch": 0.7680244817794966, + "grad_norm": 0.12823881208896637, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 201780 + }, + { + "epoch": 0.7680625442476192, + "grad_norm": 0.11645886301994324, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 201790 + }, + { + "epoch": 0.7681006067157419, + "grad_norm": 0.12088745087385178, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 201800 + }, + { + "epoch": 0.7681386691838645, + "grad_norm": 0.1247321143746376, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 201810 + }, + { + "epoch": 0.7681767316519872, + "grad_norm": 0.21556483209133148, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 201820 + }, + { + "epoch": 0.76821479412011, + "grad_norm": 0.15077947080135345, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 201830 + }, + { + "epoch": 0.7682528565882326, + "grad_norm": 0.14259713888168335, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 201840 + }, + { + "epoch": 0.7682909190563553, + "grad_norm": 0.12281884253025055, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 201850 + }, + { + "epoch": 0.7683289815244779, + "grad_norm": 0.1274409294128418, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 201860 + }, + { + "epoch": 0.7683670439926007, + "grad_norm": 0.13390718400478363, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 201870 + }, + { + "epoch": 0.7684051064607234, + "grad_norm": 0.12256969511508942, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 201880 + }, + { + "epoch": 0.768443168928846, + "grad_norm": 0.13756145536899567, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 201890 + }, + { + "epoch": 0.7684812313969687, + "grad_norm": 0.12480933964252472, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 201900 + }, + { + "epoch": 0.7685192938650914, + "grad_norm": 0.11438208073377609, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 201910 + }, + { + "epoch": 0.7685573563332141, + "grad_norm": 0.12336448580026627, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 201920 + }, + { + "epoch": 0.7685954188013367, + "grad_norm": 0.12992775440216064, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 201930 + }, + { + "epoch": 0.7686334812694594, + "grad_norm": 0.11728628724813461, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 201940 + }, + { + "epoch": 0.7686715437375822, + "grad_norm": 0.12237447500228882, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 201950 + }, + { + "epoch": 0.7687096062057048, + "grad_norm": 0.11688285320997238, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 201960 + }, + { + "epoch": 0.7687476686738275, + "grad_norm": 0.1265353411436081, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 201970 + }, + { + "epoch": 0.7687857311419501, + "grad_norm": 0.11828229576349258, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 201980 + }, + { + "epoch": 0.7688237936100728, + "grad_norm": 0.1361830234527588, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 201990 + }, + { + "epoch": 0.7688618560781956, + "grad_norm": 0.12680485844612122, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 202000 + }, + { + "epoch": 0.7688999185463182, + "grad_norm": 0.12592792510986328, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 202010 + }, + { + "epoch": 0.7689379810144409, + "grad_norm": 0.1196167916059494, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 202020 + }, + { + "epoch": 0.7689760434825635, + "grad_norm": 0.11954740434885025, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 202030 + }, + { + "epoch": 0.7690141059506863, + "grad_norm": 0.13095378875732422, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 202040 + }, + { + "epoch": 0.769052168418809, + "grad_norm": 0.13523481786251068, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 202050 + }, + { + "epoch": 0.7690902308869316, + "grad_norm": 0.14069612324237823, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 202060 + }, + { + "epoch": 0.7691282933550543, + "grad_norm": 0.11833822727203369, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 202070 + }, + { + "epoch": 0.769166355823177, + "grad_norm": 0.13712801039218903, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 202080 + }, + { + "epoch": 0.7692044182912997, + "grad_norm": 0.13787589967250824, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 202090 + }, + { + "epoch": 0.7692424807594224, + "grad_norm": 0.1386062055826187, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 202100 + }, + { + "epoch": 0.769280543227545, + "grad_norm": 0.12532974779605865, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 202110 + }, + { + "epoch": 0.7693186056956677, + "grad_norm": 0.12713083624839783, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 202120 + }, + { + "epoch": 0.7693566681637904, + "grad_norm": 0.12442374974489212, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 202130 + }, + { + "epoch": 0.7693947306319131, + "grad_norm": 0.12369605898857117, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 202140 + }, + { + "epoch": 0.7694327931000358, + "grad_norm": 0.12773537635803223, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 202150 + }, + { + "epoch": 0.7694708555681584, + "grad_norm": 0.13192373514175415, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 202160 + }, + { + "epoch": 0.7695089180362812, + "grad_norm": 0.1284187138080597, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 202170 + }, + { + "epoch": 0.7695469805044038, + "grad_norm": 0.12153860926628113, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 202180 + }, + { + "epoch": 0.7695850429725265, + "grad_norm": 0.1268356591463089, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 202190 + }, + { + "epoch": 0.7696231054406492, + "grad_norm": 0.12742312252521515, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 202200 + }, + { + "epoch": 0.7696611679087719, + "grad_norm": 0.11673905700445175, + "learning_rate": 0.0005, + "loss": 2.0851, + "step": 202210 + }, + { + "epoch": 0.7696992303768946, + "grad_norm": 0.11608495563268661, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 202220 + }, + { + "epoch": 0.7697372928450172, + "grad_norm": 0.12505364418029785, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 202230 + }, + { + "epoch": 0.7697753553131399, + "grad_norm": 0.11059442907571793, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 202240 + }, + { + "epoch": 0.7698134177812626, + "grad_norm": 0.13075938820838928, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 202250 + }, + { + "epoch": 0.7698514802493853, + "grad_norm": 0.13375014066696167, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 202260 + }, + { + "epoch": 0.769889542717508, + "grad_norm": 0.1336306482553482, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 202270 + }, + { + "epoch": 0.7699276051856306, + "grad_norm": 0.1256602555513382, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 202280 + }, + { + "epoch": 0.7699656676537533, + "grad_norm": 0.1295190006494522, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 202290 + }, + { + "epoch": 0.7700037301218761, + "grad_norm": 0.13925063610076904, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 202300 + }, + { + "epoch": 0.7700417925899987, + "grad_norm": 0.13413967192173004, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 202310 + }, + { + "epoch": 0.7700798550581214, + "grad_norm": 0.11593659222126007, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 202320 + }, + { + "epoch": 0.770117917526244, + "grad_norm": 0.12970569729804993, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 202330 + }, + { + "epoch": 0.7701559799943668, + "grad_norm": 0.12410783767700195, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 202340 + }, + { + "epoch": 0.7701940424624895, + "grad_norm": 0.1210460290312767, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 202350 + }, + { + "epoch": 0.7702321049306121, + "grad_norm": 0.13017313182353973, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 202360 + }, + { + "epoch": 0.7702701673987348, + "grad_norm": 0.1134258434176445, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 202370 + }, + { + "epoch": 0.7703082298668575, + "grad_norm": 0.13476477563381195, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 202380 + }, + { + "epoch": 0.7703462923349802, + "grad_norm": 0.11765899509191513, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 202390 + }, + { + "epoch": 0.7703843548031029, + "grad_norm": 0.12300023436546326, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 202400 + }, + { + "epoch": 0.7704224172712255, + "grad_norm": 0.14942879974842072, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 202410 + }, + { + "epoch": 0.7704604797393482, + "grad_norm": 0.12374628335237503, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 202420 + }, + { + "epoch": 0.7704985422074709, + "grad_norm": 0.12553882598876953, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 202430 + }, + { + "epoch": 0.7705366046755936, + "grad_norm": 0.13185171782970428, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 202440 + }, + { + "epoch": 0.7705746671437163, + "grad_norm": 0.12644770741462708, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 202450 + }, + { + "epoch": 0.7706127296118389, + "grad_norm": 0.13731904327869415, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 202460 + }, + { + "epoch": 0.7706507920799617, + "grad_norm": 0.1281971037387848, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 202470 + }, + { + "epoch": 0.7706888545480843, + "grad_norm": 0.12450539320707321, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 202480 + }, + { + "epoch": 0.770726917016207, + "grad_norm": 0.11814303696155548, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 202490 + }, + { + "epoch": 0.7707649794843296, + "grad_norm": 0.13239553570747375, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 202500 + }, + { + "epoch": 0.7708030419524524, + "grad_norm": 0.11851152032613754, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 202510 + }, + { + "epoch": 0.7708411044205751, + "grad_norm": 0.1262698620557785, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 202520 + }, + { + "epoch": 0.7708791668886977, + "grad_norm": 0.13262896239757538, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 202530 + }, + { + "epoch": 0.7709172293568204, + "grad_norm": 0.13419535756111145, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 202540 + }, + { + "epoch": 0.770955291824943, + "grad_norm": 0.13374435901641846, + "learning_rate": 0.0005, + "loss": 2.1263, + "step": 202550 + }, + { + "epoch": 0.7709933542930658, + "grad_norm": 0.1315140277147293, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 202560 + }, + { + "epoch": 0.7710314167611885, + "grad_norm": 0.13533872365951538, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 202570 + }, + { + "epoch": 0.7710694792293111, + "grad_norm": 0.12378238141536713, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 202580 + }, + { + "epoch": 0.7711075416974338, + "grad_norm": 0.1197347342967987, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 202590 + }, + { + "epoch": 0.7711456041655566, + "grad_norm": 0.13101764023303986, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 202600 + }, + { + "epoch": 0.7711836666336792, + "grad_norm": 0.1293804794549942, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 202610 + }, + { + "epoch": 0.7712217291018019, + "grad_norm": 0.12614557147026062, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 202620 + }, + { + "epoch": 0.7712597915699245, + "grad_norm": 0.1223040446639061, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 202630 + }, + { + "epoch": 0.7712978540380473, + "grad_norm": 0.11462346464395523, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 202640 + }, + { + "epoch": 0.77133591650617, + "grad_norm": 0.13106228411197662, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 202650 + }, + { + "epoch": 0.7713739789742926, + "grad_norm": 0.12828728556632996, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 202660 + }, + { + "epoch": 0.7714120414424153, + "grad_norm": 0.13604027032852173, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 202670 + }, + { + "epoch": 0.7714501039105379, + "grad_norm": 0.11754266917705536, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 202680 + }, + { + "epoch": 0.7714881663786607, + "grad_norm": 0.1196461096405983, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 202690 + }, + { + "epoch": 0.7715262288467833, + "grad_norm": 0.14312899112701416, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 202700 + }, + { + "epoch": 0.771564291314906, + "grad_norm": 0.11657819151878357, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 202710 + }, + { + "epoch": 0.7716023537830287, + "grad_norm": 0.12473199516534805, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 202720 + }, + { + "epoch": 0.7716404162511514, + "grad_norm": 0.14507940411567688, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 202730 + }, + { + "epoch": 0.7716784787192741, + "grad_norm": 0.12810270488262177, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 202740 + }, + { + "epoch": 0.7717165411873967, + "grad_norm": 0.12481430917978287, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 202750 + }, + { + "epoch": 0.7717546036555194, + "grad_norm": 0.13375705480575562, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 202760 + }, + { + "epoch": 0.7717926661236422, + "grad_norm": 0.14170774817466736, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 202770 + }, + { + "epoch": 0.7718307285917648, + "grad_norm": 0.1388128399848938, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 202780 + }, + { + "epoch": 0.7718687910598875, + "grad_norm": 0.12213142216205597, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 202790 + }, + { + "epoch": 0.7719068535280101, + "grad_norm": 0.13339495658874512, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 202800 + }, + { + "epoch": 0.7719449159961329, + "grad_norm": 0.11571691185235977, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 202810 + }, + { + "epoch": 0.7719829784642556, + "grad_norm": 0.13037385046482086, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 202820 + }, + { + "epoch": 0.7720210409323782, + "grad_norm": 0.11974932998418808, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 202830 + }, + { + "epoch": 0.7720591034005009, + "grad_norm": 0.115937739610672, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 202840 + }, + { + "epoch": 0.7720971658686235, + "grad_norm": 0.12142065167427063, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 202850 + }, + { + "epoch": 0.7721352283367463, + "grad_norm": 0.13116484880447388, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 202860 + }, + { + "epoch": 0.772173290804869, + "grad_norm": 0.11825598776340485, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 202870 + }, + { + "epoch": 0.7722113532729916, + "grad_norm": 0.11560803651809692, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 202880 + }, + { + "epoch": 0.7722494157411143, + "grad_norm": 0.12228305637836456, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 202890 + }, + { + "epoch": 0.772287478209237, + "grad_norm": 0.1562349647283554, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 202900 + }, + { + "epoch": 0.7723255406773597, + "grad_norm": 0.12264727801084518, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 202910 + }, + { + "epoch": 0.7723636031454824, + "grad_norm": 0.13685239851474762, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 202920 + }, + { + "epoch": 0.772401665613605, + "grad_norm": 0.1482173651456833, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 202930 + }, + { + "epoch": 0.7724397280817278, + "grad_norm": 0.11645165830850601, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 202940 + }, + { + "epoch": 0.7724777905498504, + "grad_norm": 0.12072696536779404, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 202950 + }, + { + "epoch": 0.7725158530179731, + "grad_norm": 0.13543497025966644, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 202960 + }, + { + "epoch": 0.7725539154860958, + "grad_norm": 0.1298336237668991, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 202970 + }, + { + "epoch": 0.7725919779542184, + "grad_norm": 0.14146368205547333, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 202980 + }, + { + "epoch": 0.7726300404223412, + "grad_norm": 0.1257074922323227, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 202990 + }, + { + "epoch": 0.7726681028904638, + "grad_norm": 0.14784252643585205, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 203000 + }, + { + "epoch": 0.7727061653585865, + "grad_norm": 0.11903432011604309, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 203010 + }, + { + "epoch": 0.7727442278267092, + "grad_norm": 0.12048203498125076, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 203020 + }, + { + "epoch": 0.7727822902948319, + "grad_norm": 0.12632669508457184, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 203030 + }, + { + "epoch": 0.7728203527629546, + "grad_norm": 0.11561502516269684, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 203040 + }, + { + "epoch": 0.7728584152310772, + "grad_norm": 0.12799832224845886, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 203050 + }, + { + "epoch": 0.7728964776991999, + "grad_norm": 0.13653217256069183, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 203060 + }, + { + "epoch": 0.7729345401673227, + "grad_norm": 0.12045499682426453, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 203070 + }, + { + "epoch": 0.7729726026354453, + "grad_norm": 0.12969614565372467, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 203080 + }, + { + "epoch": 0.773010665103568, + "grad_norm": 0.12215245515108109, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 203090 + }, + { + "epoch": 0.7730487275716906, + "grad_norm": 0.13063253462314606, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 203100 + }, + { + "epoch": 0.7730867900398133, + "grad_norm": 0.12350734323263168, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 203110 + }, + { + "epoch": 0.7731248525079361, + "grad_norm": 0.12446156144142151, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 203120 + }, + { + "epoch": 0.7731629149760587, + "grad_norm": 0.12515167891979218, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 203130 + }, + { + "epoch": 0.7732009774441814, + "grad_norm": 0.11699115484952927, + "learning_rate": 0.0005, + "loss": 2.0834, + "step": 203140 + }, + { + "epoch": 0.773239039912304, + "grad_norm": 0.13818705081939697, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 203150 + }, + { + "epoch": 0.7732771023804268, + "grad_norm": 0.12045501172542572, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 203160 + }, + { + "epoch": 0.7733151648485495, + "grad_norm": 0.13033844530582428, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 203170 + }, + { + "epoch": 0.7733532273166721, + "grad_norm": 0.1280549317598343, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 203180 + }, + { + "epoch": 0.7733912897847948, + "grad_norm": 0.11952662467956543, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 203190 + }, + { + "epoch": 0.7734293522529175, + "grad_norm": 0.12275929749011993, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 203200 + }, + { + "epoch": 0.7734674147210402, + "grad_norm": 0.12167345732450485, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 203210 + }, + { + "epoch": 0.7735054771891628, + "grad_norm": 0.13266120851039886, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 203220 + }, + { + "epoch": 0.7735435396572855, + "grad_norm": 0.1521034985780716, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 203230 + }, + { + "epoch": 0.7735816021254083, + "grad_norm": 0.12634935975074768, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 203240 + }, + { + "epoch": 0.7736196645935309, + "grad_norm": 0.11577683687210083, + "learning_rate": 0.0005, + "loss": 2.1308, + "step": 203250 + }, + { + "epoch": 0.7736577270616536, + "grad_norm": 0.12791483104228973, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 203260 + }, + { + "epoch": 0.7736957895297762, + "grad_norm": 0.12884661555290222, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 203270 + }, + { + "epoch": 0.7737338519978989, + "grad_norm": 0.12367019057273865, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 203280 + }, + { + "epoch": 0.7737719144660217, + "grad_norm": 0.14287038147449493, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 203290 + }, + { + "epoch": 0.7738099769341443, + "grad_norm": 0.12698234617710114, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 203300 + }, + { + "epoch": 0.773848039402267, + "grad_norm": 0.13310617208480835, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 203310 + }, + { + "epoch": 0.7738861018703896, + "grad_norm": 0.12289171665906906, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 203320 + }, + { + "epoch": 0.7739241643385124, + "grad_norm": 0.13482849299907684, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 203330 + }, + { + "epoch": 0.7739622268066351, + "grad_norm": 0.12322328984737396, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 203340 + }, + { + "epoch": 0.7740002892747577, + "grad_norm": 0.12590280175209045, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 203350 + }, + { + "epoch": 0.7740383517428804, + "grad_norm": 0.12155240774154663, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 203360 + }, + { + "epoch": 0.7740764142110031, + "grad_norm": 0.1313089281320572, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 203370 + }, + { + "epoch": 0.7741144766791258, + "grad_norm": 0.12607799470424652, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 203380 + }, + { + "epoch": 0.7741525391472485, + "grad_norm": 0.13865213096141815, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 203390 + }, + { + "epoch": 0.7741906016153711, + "grad_norm": 0.14330661296844482, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 203400 + }, + { + "epoch": 0.7742286640834938, + "grad_norm": 0.11868394911289215, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 203410 + }, + { + "epoch": 0.7742667265516165, + "grad_norm": 0.1340259611606598, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 203420 + }, + { + "epoch": 0.7743047890197392, + "grad_norm": 0.12151813507080078, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 203430 + }, + { + "epoch": 0.7743428514878619, + "grad_norm": 0.11983367055654526, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 203440 + }, + { + "epoch": 0.7743809139559845, + "grad_norm": 0.12431453168392181, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 203450 + }, + { + "epoch": 0.7744189764241073, + "grad_norm": 0.11596907675266266, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 203460 + }, + { + "epoch": 0.7744570388922299, + "grad_norm": 0.1255701184272766, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 203470 + }, + { + "epoch": 0.7744951013603526, + "grad_norm": 0.126678466796875, + "learning_rate": 0.0005, + "loss": 2.0857, + "step": 203480 + }, + { + "epoch": 0.7745331638284753, + "grad_norm": 0.12204180657863617, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 203490 + }, + { + "epoch": 0.774571226296598, + "grad_norm": 0.13281655311584473, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 203500 + }, + { + "epoch": 0.7746092887647207, + "grad_norm": 0.11845386028289795, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 203510 + }, + { + "epoch": 0.7746473512328433, + "grad_norm": 0.1315074861049652, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 203520 + }, + { + "epoch": 0.774685413700966, + "grad_norm": 0.12224984914064407, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 203530 + }, + { + "epoch": 0.7747234761690887, + "grad_norm": 0.11303869634866714, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 203540 + }, + { + "epoch": 0.7747615386372114, + "grad_norm": 0.12494504451751709, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 203550 + }, + { + "epoch": 0.7747996011053341, + "grad_norm": 0.1141730546951294, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 203560 + }, + { + "epoch": 0.7748376635734567, + "grad_norm": 0.11735429614782333, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 203570 + }, + { + "epoch": 0.7748757260415794, + "grad_norm": 0.12974528968334198, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 203580 + }, + { + "epoch": 0.7749137885097022, + "grad_norm": 0.12256599962711334, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 203590 + }, + { + "epoch": 0.7749518509778248, + "grad_norm": 0.12738636136054993, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 203600 + }, + { + "epoch": 0.7749899134459475, + "grad_norm": 0.11851944774389267, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 203610 + }, + { + "epoch": 0.7750279759140701, + "grad_norm": 0.12773127853870392, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 203620 + }, + { + "epoch": 0.7750660383821929, + "grad_norm": 0.1170206144452095, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 203630 + }, + { + "epoch": 0.7751041008503156, + "grad_norm": 0.1569705307483673, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 203640 + }, + { + "epoch": 0.7751421633184382, + "grad_norm": 0.1502056121826172, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 203650 + }, + { + "epoch": 0.7751802257865609, + "grad_norm": 0.138113334774971, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 203660 + }, + { + "epoch": 0.7752182882546836, + "grad_norm": 0.12981128692626953, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 203670 + }, + { + "epoch": 0.7752563507228063, + "grad_norm": 0.13203611969947815, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 203680 + }, + { + "epoch": 0.775294413190929, + "grad_norm": 0.1312369853258133, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 203690 + }, + { + "epoch": 0.7753324756590516, + "grad_norm": 0.137653648853302, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 203700 + }, + { + "epoch": 0.7753705381271743, + "grad_norm": 0.13897337019443512, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 203710 + }, + { + "epoch": 0.775408600595297, + "grad_norm": 0.11673405766487122, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 203720 + }, + { + "epoch": 0.7754466630634197, + "grad_norm": 0.12671113014221191, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 203730 + }, + { + "epoch": 0.7754847255315424, + "grad_norm": 0.11186876893043518, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 203740 + }, + { + "epoch": 0.775522787999665, + "grad_norm": 0.11881640553474426, + "learning_rate": 0.0005, + "loss": 2.0873, + "step": 203750 + }, + { + "epoch": 0.7755608504677878, + "grad_norm": 0.1171368807554245, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 203760 + }, + { + "epoch": 0.7755989129359104, + "grad_norm": 0.11364400386810303, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 203770 + }, + { + "epoch": 0.7756369754040331, + "grad_norm": 0.1332196146249771, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 203780 + }, + { + "epoch": 0.7756750378721557, + "grad_norm": 0.12130193412303925, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 203790 + }, + { + "epoch": 0.7757131003402785, + "grad_norm": 0.12022104114294052, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 203800 + }, + { + "epoch": 0.7757511628084012, + "grad_norm": 0.12510570883750916, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 203810 + }, + { + "epoch": 0.7757892252765238, + "grad_norm": 0.1242448166012764, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 203820 + }, + { + "epoch": 0.7758272877446465, + "grad_norm": 0.12493916600942612, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 203830 + }, + { + "epoch": 0.7758653502127691, + "grad_norm": 0.1297662854194641, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 203840 + }, + { + "epoch": 0.7759034126808919, + "grad_norm": 0.13508228957653046, + "learning_rate": 0.0005, + "loss": 2.0833, + "step": 203850 + }, + { + "epoch": 0.7759414751490146, + "grad_norm": 0.13191717863082886, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 203860 + }, + { + "epoch": 0.7759795376171372, + "grad_norm": 0.13562016189098358, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 203870 + }, + { + "epoch": 0.7760176000852599, + "grad_norm": 0.1370725929737091, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 203880 + }, + { + "epoch": 0.7760556625533827, + "grad_norm": 0.1385413259267807, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 203890 + }, + { + "epoch": 0.7760937250215053, + "grad_norm": 0.12840357422828674, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 203900 + }, + { + "epoch": 0.776131787489628, + "grad_norm": 0.13046427071094513, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 203910 + }, + { + "epoch": 0.7761698499577506, + "grad_norm": 0.11721883714199066, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 203920 + }, + { + "epoch": 0.7762079124258734, + "grad_norm": 0.18598304688930511, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 203930 + }, + { + "epoch": 0.776245974893996, + "grad_norm": 0.12292864173650742, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 203940 + }, + { + "epoch": 0.7762840373621187, + "grad_norm": 0.13777364790439606, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 203950 + }, + { + "epoch": 0.7763220998302414, + "grad_norm": 0.1157660037279129, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 203960 + }, + { + "epoch": 0.776360162298364, + "grad_norm": 0.13139058649539948, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 203970 + }, + { + "epoch": 0.7763982247664868, + "grad_norm": 0.1354646384716034, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 203980 + }, + { + "epoch": 0.7764362872346094, + "grad_norm": 0.13240858912467957, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 203990 + }, + { + "epoch": 0.7764743497027321, + "grad_norm": 0.1313706338405609, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 204000 + }, + { + "epoch": 0.7765124121708548, + "grad_norm": 0.11614730954170227, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 204010 + }, + { + "epoch": 0.7765504746389775, + "grad_norm": 0.11456860601902008, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 204020 + }, + { + "epoch": 0.7765885371071002, + "grad_norm": 0.1360856592655182, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 204030 + }, + { + "epoch": 0.7766265995752228, + "grad_norm": 0.1354278028011322, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 204040 + }, + { + "epoch": 0.7766646620433455, + "grad_norm": 0.11774899065494537, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 204050 + }, + { + "epoch": 0.7767027245114683, + "grad_norm": 0.14782720804214478, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 204060 + }, + { + "epoch": 0.7767407869795909, + "grad_norm": 0.1334516555070877, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 204070 + }, + { + "epoch": 0.7767788494477136, + "grad_norm": 0.13219749927520752, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 204080 + }, + { + "epoch": 0.7768169119158362, + "grad_norm": 0.12305323034524918, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 204090 + }, + { + "epoch": 0.776854974383959, + "grad_norm": 0.1264452338218689, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 204100 + }, + { + "epoch": 0.7768930368520817, + "grad_norm": 0.13045167922973633, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 204110 + }, + { + "epoch": 0.7769310993202043, + "grad_norm": 0.1378946453332901, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 204120 + }, + { + "epoch": 0.776969161788327, + "grad_norm": 0.14290933310985565, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 204130 + }, + { + "epoch": 0.7770072242564496, + "grad_norm": 0.1294529139995575, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 204140 + }, + { + "epoch": 0.7770452867245724, + "grad_norm": 0.11729662865400314, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 204150 + }, + { + "epoch": 0.7770833491926951, + "grad_norm": 0.13432623445987701, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 204160 + }, + { + "epoch": 0.7771214116608177, + "grad_norm": 0.11898574978113174, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 204170 + }, + { + "epoch": 0.7771594741289404, + "grad_norm": 0.12093368917703629, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 204180 + }, + { + "epoch": 0.7771975365970631, + "grad_norm": 0.1334381401538849, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 204190 + }, + { + "epoch": 0.7772355990651858, + "grad_norm": 0.12980128824710846, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 204200 + }, + { + "epoch": 0.7772736615333085, + "grad_norm": 0.1222313717007637, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 204210 + }, + { + "epoch": 0.7773117240014311, + "grad_norm": 0.13379375636577606, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 204220 + }, + { + "epoch": 0.7773497864695539, + "grad_norm": 0.12348196655511856, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 204230 + }, + { + "epoch": 0.7773878489376765, + "grad_norm": 0.1255643665790558, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 204240 + }, + { + "epoch": 0.7774259114057992, + "grad_norm": 0.1284782737493515, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 204250 + }, + { + "epoch": 0.7774639738739219, + "grad_norm": 0.12363579124212265, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 204260 + }, + { + "epoch": 0.7775020363420445, + "grad_norm": 0.11654678732156754, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 204270 + }, + { + "epoch": 0.7775400988101673, + "grad_norm": 0.12405503541231155, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 204280 + }, + { + "epoch": 0.7775781612782899, + "grad_norm": 0.121688112616539, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 204290 + }, + { + "epoch": 0.7776162237464126, + "grad_norm": 0.12813690304756165, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 204300 + }, + { + "epoch": 0.7776542862145353, + "grad_norm": 0.12217184901237488, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 204310 + }, + { + "epoch": 0.777692348682658, + "grad_norm": 0.12200063467025757, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 204320 + }, + { + "epoch": 0.7777304111507807, + "grad_norm": 0.12824185192584991, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 204330 + }, + { + "epoch": 0.7777684736189033, + "grad_norm": 0.131599560379982, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 204340 + }, + { + "epoch": 0.777806536087026, + "grad_norm": 0.12848694622516632, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 204350 + }, + { + "epoch": 0.7778445985551488, + "grad_norm": 0.13016167283058167, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 204360 + }, + { + "epoch": 0.7778826610232714, + "grad_norm": 0.12172859907150269, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 204370 + }, + { + "epoch": 0.7779207234913941, + "grad_norm": 0.1151774451136589, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 204380 + }, + { + "epoch": 0.7779587859595167, + "grad_norm": 0.1309356987476349, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 204390 + }, + { + "epoch": 0.7779968484276394, + "grad_norm": 0.1296638697385788, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 204400 + }, + { + "epoch": 0.7780349108957622, + "grad_norm": 0.11932416260242462, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 204410 + }, + { + "epoch": 0.7780729733638848, + "grad_norm": 0.12049002200365067, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 204420 + }, + { + "epoch": 0.7781110358320075, + "grad_norm": 0.1300167441368103, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 204430 + }, + { + "epoch": 0.7781490983001301, + "grad_norm": 0.12067482620477676, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 204440 + }, + { + "epoch": 0.7781871607682529, + "grad_norm": 0.12820187211036682, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 204450 + }, + { + "epoch": 0.7782252232363756, + "grad_norm": 0.1288333237171173, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 204460 + }, + { + "epoch": 0.7782632857044982, + "grad_norm": 0.1281125694513321, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 204470 + }, + { + "epoch": 0.7783013481726209, + "grad_norm": 0.12810519337654114, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 204480 + }, + { + "epoch": 0.7783394106407436, + "grad_norm": 0.11424490809440613, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 204490 + }, + { + "epoch": 0.7783774731088663, + "grad_norm": 0.12385211139917374, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 204500 + }, + { + "epoch": 0.778415535576989, + "grad_norm": 0.13899962604045868, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 204510 + }, + { + "epoch": 0.7784535980451116, + "grad_norm": 0.13048166036605835, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 204520 + }, + { + "epoch": 0.7784916605132344, + "grad_norm": 0.12829838693141937, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 204530 + }, + { + "epoch": 0.778529722981357, + "grad_norm": 0.11860474944114685, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 204540 + }, + { + "epoch": 0.7785677854494797, + "grad_norm": 0.12046080082654953, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 204550 + }, + { + "epoch": 0.7786058479176023, + "grad_norm": 0.11780071258544922, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 204560 + }, + { + "epoch": 0.778643910385725, + "grad_norm": 0.1262700855731964, + "learning_rate": 0.0005, + "loss": 2.1383, + "step": 204570 + }, + { + "epoch": 0.7786819728538478, + "grad_norm": 0.14198298752307892, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 204580 + }, + { + "epoch": 0.7787200353219704, + "grad_norm": 0.13928957283496857, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 204590 + }, + { + "epoch": 0.7787580977900931, + "grad_norm": 0.12705102562904358, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 204600 + }, + { + "epoch": 0.7787961602582157, + "grad_norm": 0.1210823580622673, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 204610 + }, + { + "epoch": 0.7788342227263385, + "grad_norm": 0.12910155951976776, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 204620 + }, + { + "epoch": 0.7788722851944612, + "grad_norm": 0.13223743438720703, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 204630 + }, + { + "epoch": 0.7789103476625838, + "grad_norm": 0.1313503235578537, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 204640 + }, + { + "epoch": 0.7789484101307065, + "grad_norm": 0.13831321895122528, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 204650 + }, + { + "epoch": 0.7789864725988292, + "grad_norm": 0.12389397621154785, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 204660 + }, + { + "epoch": 0.7790245350669519, + "grad_norm": 0.12389600276947021, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 204670 + }, + { + "epoch": 0.7790625975350746, + "grad_norm": 0.12264768034219742, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 204680 + }, + { + "epoch": 0.7791006600031972, + "grad_norm": 0.1387430876493454, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 204690 + }, + { + "epoch": 0.7791387224713199, + "grad_norm": 0.129319965839386, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 204700 + }, + { + "epoch": 0.7791767849394426, + "grad_norm": 0.12320511043071747, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 204710 + }, + { + "epoch": 0.7792148474075653, + "grad_norm": 0.11558236181735992, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 204720 + }, + { + "epoch": 0.779252909875688, + "grad_norm": 0.12244782596826553, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 204730 + }, + { + "epoch": 0.7792909723438106, + "grad_norm": 0.13204488158226013, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 204740 + }, + { + "epoch": 0.7793290348119334, + "grad_norm": 0.1383647918701172, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 204750 + }, + { + "epoch": 0.779367097280056, + "grad_norm": 0.11390195786952972, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 204760 + }, + { + "epoch": 0.7794051597481787, + "grad_norm": 0.10992393642663956, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 204770 + }, + { + "epoch": 0.7794432222163014, + "grad_norm": 0.12079618126153946, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 204780 + }, + { + "epoch": 0.7794812846844241, + "grad_norm": 0.13436715304851532, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 204790 + }, + { + "epoch": 0.7795193471525468, + "grad_norm": 0.12580615282058716, + "learning_rate": 0.0005, + "loss": 2.0882, + "step": 204800 + }, + { + "epoch": 0.7795574096206694, + "grad_norm": 0.12018632143735886, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 204810 + }, + { + "epoch": 0.7795954720887921, + "grad_norm": 0.1208050474524498, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 204820 + }, + { + "epoch": 0.7796335345569148, + "grad_norm": 0.12312716245651245, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 204830 + }, + { + "epoch": 0.7796715970250375, + "grad_norm": 0.14190800487995148, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 204840 + }, + { + "epoch": 0.7797096594931602, + "grad_norm": 0.14695966243743896, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 204850 + }, + { + "epoch": 0.7797477219612828, + "grad_norm": 0.12959155440330505, + "learning_rate": 0.0005, + "loss": 2.0896, + "step": 204860 + }, + { + "epoch": 0.7797857844294055, + "grad_norm": 0.12169928848743439, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 204870 + }, + { + "epoch": 0.7798238468975283, + "grad_norm": 0.13090549409389496, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 204880 + }, + { + "epoch": 0.7798619093656509, + "grad_norm": 0.12255292385816574, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 204890 + }, + { + "epoch": 0.7798999718337736, + "grad_norm": 0.1383610963821411, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 204900 + }, + { + "epoch": 0.7799380343018962, + "grad_norm": 0.12175476551055908, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 204910 + }, + { + "epoch": 0.779976096770019, + "grad_norm": 0.1260368973016739, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 204920 + }, + { + "epoch": 0.7800141592381417, + "grad_norm": 0.12578240036964417, + "learning_rate": 0.0005, + "loss": 2.0863, + "step": 204930 + }, + { + "epoch": 0.7800522217062643, + "grad_norm": 0.1140437051653862, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 204940 + }, + { + "epoch": 0.780090284174387, + "grad_norm": 0.13163577020168304, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 204950 + }, + { + "epoch": 0.7801283466425097, + "grad_norm": 0.11661121249198914, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 204960 + }, + { + "epoch": 0.7801664091106324, + "grad_norm": 0.12443295121192932, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 204970 + }, + { + "epoch": 0.780204471578755, + "grad_norm": 0.13133709132671356, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 204980 + }, + { + "epoch": 0.7802425340468777, + "grad_norm": 0.12514156103134155, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 204990 + }, + { + "epoch": 0.7802805965150004, + "grad_norm": 0.13035859167575836, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 205000 + }, + { + "epoch": 0.7803186589831231, + "grad_norm": 0.12508799135684967, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 205010 + }, + { + "epoch": 0.7803567214512458, + "grad_norm": 0.12291455268859863, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 205020 + }, + { + "epoch": 0.7803947839193685, + "grad_norm": 0.12133867293596268, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 205030 + }, + { + "epoch": 0.7804328463874911, + "grad_norm": 0.11362049728631973, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 205040 + }, + { + "epoch": 0.7804709088556139, + "grad_norm": 0.132710263133049, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 205050 + }, + { + "epoch": 0.7805089713237365, + "grad_norm": 0.1385025829076767, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 205060 + }, + { + "epoch": 0.7805470337918592, + "grad_norm": 0.13487720489501953, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 205070 + }, + { + "epoch": 0.7805850962599818, + "grad_norm": 0.11675132066011429, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 205080 + }, + { + "epoch": 0.7806231587281046, + "grad_norm": 0.1177266389131546, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 205090 + }, + { + "epoch": 0.7806612211962273, + "grad_norm": 0.12549777328968048, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 205100 + }, + { + "epoch": 0.7806992836643499, + "grad_norm": 0.12713997066020966, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 205110 + }, + { + "epoch": 0.7807373461324726, + "grad_norm": 0.1339418888092041, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 205120 + }, + { + "epoch": 0.7807754086005952, + "grad_norm": 0.12972566485404968, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 205130 + }, + { + "epoch": 0.780813471068718, + "grad_norm": 0.12319502234458923, + "learning_rate": 0.0005, + "loss": 2.0862, + "step": 205140 + }, + { + "epoch": 0.7808515335368407, + "grad_norm": 0.11615166813135147, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 205150 + }, + { + "epoch": 0.7808895960049633, + "grad_norm": 0.11988827586174011, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 205160 + }, + { + "epoch": 0.780927658473086, + "grad_norm": 0.1277739554643631, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 205170 + }, + { + "epoch": 0.7809657209412088, + "grad_norm": 0.12377830594778061, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 205180 + }, + { + "epoch": 0.7810037834093314, + "grad_norm": 0.11800064146518707, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 205190 + }, + { + "epoch": 0.7810418458774541, + "grad_norm": 0.12103444337844849, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 205200 + }, + { + "epoch": 0.7810799083455767, + "grad_norm": 0.13712644577026367, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 205210 + }, + { + "epoch": 0.7811179708136995, + "grad_norm": 0.1243886649608612, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 205220 + }, + { + "epoch": 0.7811560332818221, + "grad_norm": 0.12707261741161346, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 205230 + }, + { + "epoch": 0.7811940957499448, + "grad_norm": 0.1293025016784668, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 205240 + }, + { + "epoch": 0.7812321582180675, + "grad_norm": 0.11886018514633179, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 205250 + }, + { + "epoch": 0.7812702206861901, + "grad_norm": 0.1304692178964615, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 205260 + }, + { + "epoch": 0.7813082831543129, + "grad_norm": 0.13967598974704742, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 205270 + }, + { + "epoch": 0.7813463456224355, + "grad_norm": 0.14922238886356354, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 205280 + }, + { + "epoch": 0.7813844080905582, + "grad_norm": 0.15449193120002747, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 205290 + }, + { + "epoch": 0.7814224705586809, + "grad_norm": 0.13307182490825653, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 205300 + }, + { + "epoch": 0.7814605330268036, + "grad_norm": 0.1463232934474945, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 205310 + }, + { + "epoch": 0.7814985954949263, + "grad_norm": 0.12033987790346146, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 205320 + }, + { + "epoch": 0.7815366579630489, + "grad_norm": 0.12558874487876892, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 205330 + }, + { + "epoch": 0.7815747204311716, + "grad_norm": 0.1404847800731659, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 205340 + }, + { + "epoch": 0.7816127828992944, + "grad_norm": 0.14405734837055206, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 205350 + }, + { + "epoch": 0.781650845367417, + "grad_norm": 0.11858896166086197, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 205360 + }, + { + "epoch": 0.7816889078355397, + "grad_norm": 0.12527897953987122, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 205370 + }, + { + "epoch": 0.7817269703036623, + "grad_norm": 0.14698106050491333, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 205380 + }, + { + "epoch": 0.7817650327717851, + "grad_norm": 0.13634824752807617, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 205390 + }, + { + "epoch": 0.7818030952399078, + "grad_norm": 0.11414165049791336, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 205400 + }, + { + "epoch": 0.7818411577080304, + "grad_norm": 0.13669812679290771, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 205410 + }, + { + "epoch": 0.7818792201761531, + "grad_norm": 0.1324278861284256, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 205420 + }, + { + "epoch": 0.7819172826442757, + "grad_norm": 0.13586843013763428, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 205430 + }, + { + "epoch": 0.7819553451123985, + "grad_norm": 0.12450312823057175, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 205440 + }, + { + "epoch": 0.7819934075805212, + "grad_norm": 0.12089171260595322, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 205450 + }, + { + "epoch": 0.7820314700486438, + "grad_norm": 0.11433006823062897, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 205460 + }, + { + "epoch": 0.7820695325167665, + "grad_norm": 0.12635235488414764, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 205470 + }, + { + "epoch": 0.7821075949848892, + "grad_norm": 0.12928998470306396, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 205480 + }, + { + "epoch": 0.7821456574530119, + "grad_norm": 0.116602323949337, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 205490 + }, + { + "epoch": 0.7821837199211346, + "grad_norm": 0.1305515021085739, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 205500 + }, + { + "epoch": 0.7822217823892572, + "grad_norm": 0.12704770267009735, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 205510 + }, + { + "epoch": 0.78225984485738, + "grad_norm": 0.1347886472940445, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 205520 + }, + { + "epoch": 0.7822979073255026, + "grad_norm": 0.12773995101451874, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 205530 + }, + { + "epoch": 0.7823359697936253, + "grad_norm": 0.12586063146591187, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 205540 + }, + { + "epoch": 0.782374032261748, + "grad_norm": 0.12715336680412292, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 205550 + }, + { + "epoch": 0.7824120947298706, + "grad_norm": 0.1147814616560936, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 205560 + }, + { + "epoch": 0.7824501571979934, + "grad_norm": 0.1331065148115158, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 205570 + }, + { + "epoch": 0.782488219666116, + "grad_norm": 0.129253551363945, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 205580 + }, + { + "epoch": 0.7825262821342387, + "grad_norm": 0.13816684484481812, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 205590 + }, + { + "epoch": 0.7825643446023614, + "grad_norm": 0.13440492749214172, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 205600 + }, + { + "epoch": 0.7826024070704841, + "grad_norm": 0.1320820450782776, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 205610 + }, + { + "epoch": 0.7826404695386068, + "grad_norm": 0.11831782013177872, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 205620 + }, + { + "epoch": 0.7826785320067294, + "grad_norm": 0.11591319739818573, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 205630 + }, + { + "epoch": 0.7827165944748521, + "grad_norm": 0.11812224239110947, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 205640 + }, + { + "epoch": 0.7827546569429749, + "grad_norm": 0.12301401793956757, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 205650 + }, + { + "epoch": 0.7827927194110975, + "grad_norm": 0.12146145850419998, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 205660 + }, + { + "epoch": 0.7828307818792202, + "grad_norm": 0.13071522116661072, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 205670 + }, + { + "epoch": 0.7828688443473428, + "grad_norm": 0.12665431201457977, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 205680 + }, + { + "epoch": 0.7829069068154656, + "grad_norm": 0.12369175255298615, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 205690 + }, + { + "epoch": 0.7829449692835883, + "grad_norm": 0.14192961156368256, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 205700 + }, + { + "epoch": 0.7829830317517109, + "grad_norm": 0.1278933882713318, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 205710 + }, + { + "epoch": 0.7830210942198336, + "grad_norm": 0.12124519050121307, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 205720 + }, + { + "epoch": 0.7830591566879562, + "grad_norm": 0.13264502584934235, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 205730 + }, + { + "epoch": 0.783097219156079, + "grad_norm": 0.1269812136888504, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 205740 + }, + { + "epoch": 0.7831352816242017, + "grad_norm": 0.1288755089044571, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 205750 + }, + { + "epoch": 0.7831733440923243, + "grad_norm": 0.13168203830718994, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 205760 + }, + { + "epoch": 0.783211406560447, + "grad_norm": 0.11975059658288956, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 205770 + }, + { + "epoch": 0.7832494690285697, + "grad_norm": 0.1149614229798317, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 205780 + }, + { + "epoch": 0.7832875314966924, + "grad_norm": 0.11844199150800705, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 205790 + }, + { + "epoch": 0.783325593964815, + "grad_norm": 0.12237623333930969, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 205800 + }, + { + "epoch": 0.7833636564329377, + "grad_norm": 0.12408531457185745, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 205810 + }, + { + "epoch": 0.7834017189010605, + "grad_norm": 0.11463352292776108, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 205820 + }, + { + "epoch": 0.7834397813691831, + "grad_norm": 0.1344767063856125, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 205830 + }, + { + "epoch": 0.7834778438373058, + "grad_norm": 0.14116428792476654, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 205840 + }, + { + "epoch": 0.7835159063054284, + "grad_norm": 0.14916980266571045, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 205850 + }, + { + "epoch": 0.7835539687735511, + "grad_norm": 0.1306719183921814, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 205860 + }, + { + "epoch": 0.7835920312416739, + "grad_norm": 0.13697031140327454, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 205870 + }, + { + "epoch": 0.7836300937097965, + "grad_norm": 0.13301123678684235, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 205880 + }, + { + "epoch": 0.7836681561779192, + "grad_norm": 0.1257372945547104, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 205890 + }, + { + "epoch": 0.7837062186460418, + "grad_norm": 0.138628751039505, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 205900 + }, + { + "epoch": 0.7837442811141646, + "grad_norm": 0.11938808113336563, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 205910 + }, + { + "epoch": 0.7837823435822873, + "grad_norm": 0.1246340274810791, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 205920 + }, + { + "epoch": 0.7838204060504099, + "grad_norm": 0.1270332634449005, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 205930 + }, + { + "epoch": 0.7838584685185326, + "grad_norm": 0.11850014328956604, + "learning_rate": 0.0005, + "loss": 2.0882, + "step": 205940 + }, + { + "epoch": 0.7838965309866553, + "grad_norm": 0.1358293741941452, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 205950 + }, + { + "epoch": 0.783934593454778, + "grad_norm": 0.11286400258541107, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 205960 + }, + { + "epoch": 0.7839726559229007, + "grad_norm": 0.11473195999860764, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 205970 + }, + { + "epoch": 0.7840107183910233, + "grad_norm": 0.12464620918035507, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 205980 + }, + { + "epoch": 0.784048780859146, + "grad_norm": 0.128046914935112, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 205990 + }, + { + "epoch": 0.7840868433272687, + "grad_norm": 0.13824665546417236, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 206000 + }, + { + "epoch": 0.7841249057953914, + "grad_norm": 0.12734805047512054, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 206010 + }, + { + "epoch": 0.7841629682635141, + "grad_norm": 0.11570902168750763, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 206020 + }, + { + "epoch": 0.7842010307316367, + "grad_norm": 0.13064268231391907, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 206030 + }, + { + "epoch": 0.7842390931997595, + "grad_norm": 0.1355222761631012, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 206040 + }, + { + "epoch": 0.7842771556678821, + "grad_norm": 0.15214493870735168, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 206050 + }, + { + "epoch": 0.7843152181360048, + "grad_norm": 0.12032835930585861, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 206060 + }, + { + "epoch": 0.7843532806041275, + "grad_norm": 0.12022040039300919, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 206070 + }, + { + "epoch": 0.7843913430722502, + "grad_norm": 0.12360106408596039, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 206080 + }, + { + "epoch": 0.7844294055403729, + "grad_norm": 0.12494411319494247, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 206090 + }, + { + "epoch": 0.7844674680084955, + "grad_norm": 0.12421322613954544, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 206100 + }, + { + "epoch": 0.7845055304766182, + "grad_norm": 0.12146128714084625, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 206110 + }, + { + "epoch": 0.784543592944741, + "grad_norm": 0.13392406702041626, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 206120 + }, + { + "epoch": 0.7845816554128636, + "grad_norm": 0.1384228765964508, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 206130 + }, + { + "epoch": 0.7846197178809863, + "grad_norm": 0.13139431178569794, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 206140 + }, + { + "epoch": 0.7846577803491089, + "grad_norm": 0.1144791916012764, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 206150 + }, + { + "epoch": 0.7846958428172316, + "grad_norm": 0.14469364285469055, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 206160 + }, + { + "epoch": 0.7847339052853544, + "grad_norm": 0.13301540911197662, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 206170 + }, + { + "epoch": 0.784771967753477, + "grad_norm": 0.1295434534549713, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 206180 + }, + { + "epoch": 0.7848100302215997, + "grad_norm": 0.12937068939208984, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 206190 + }, + { + "epoch": 0.7848480926897223, + "grad_norm": 0.12335211783647537, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 206200 + }, + { + "epoch": 0.7848861551578451, + "grad_norm": 0.1225755363702774, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 206210 + }, + { + "epoch": 0.7849242176259678, + "grad_norm": 0.11895408481359482, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 206220 + }, + { + "epoch": 0.7849622800940904, + "grad_norm": 0.12693949043750763, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 206230 + }, + { + "epoch": 0.7850003425622131, + "grad_norm": 0.12901674211025238, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 206240 + }, + { + "epoch": 0.7850384050303358, + "grad_norm": 0.11677834391593933, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 206250 + }, + { + "epoch": 0.7850764674984585, + "grad_norm": 0.12869200110435486, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 206260 + }, + { + "epoch": 0.7851145299665812, + "grad_norm": 0.2570255398750305, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 206270 + }, + { + "epoch": 0.7851525924347038, + "grad_norm": 0.12122949957847595, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 206280 + }, + { + "epoch": 0.7851906549028265, + "grad_norm": 0.11915149539709091, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 206290 + }, + { + "epoch": 0.7852287173709492, + "grad_norm": 0.12245742976665497, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 206300 + }, + { + "epoch": 0.7852667798390719, + "grad_norm": 0.1253976821899414, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 206310 + }, + { + "epoch": 0.7853048423071946, + "grad_norm": 0.12328154593706131, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 206320 + }, + { + "epoch": 0.7853429047753172, + "grad_norm": 0.1260962337255478, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 206330 + }, + { + "epoch": 0.78538096724344, + "grad_norm": 0.12645284831523895, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 206340 + }, + { + "epoch": 0.7854190297115626, + "grad_norm": 0.1292407214641571, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 206350 + }, + { + "epoch": 0.7854570921796853, + "grad_norm": 0.12239421904087067, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 206360 + }, + { + "epoch": 0.785495154647808, + "grad_norm": 0.1213301345705986, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 206370 + }, + { + "epoch": 0.7855332171159307, + "grad_norm": 0.12082770466804504, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 206380 + }, + { + "epoch": 0.7855712795840534, + "grad_norm": 0.12510155141353607, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 206390 + }, + { + "epoch": 0.785609342052176, + "grad_norm": 0.1296463906764984, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 206400 + }, + { + "epoch": 0.7856474045202987, + "grad_norm": 0.11684957891702652, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 206410 + }, + { + "epoch": 0.7856854669884213, + "grad_norm": 0.121466264128685, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 206420 + }, + { + "epoch": 0.7857235294565441, + "grad_norm": 0.11626929044723511, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 206430 + }, + { + "epoch": 0.7857615919246668, + "grad_norm": 0.12083613127470016, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 206440 + }, + { + "epoch": 0.7857996543927894, + "grad_norm": 0.1161016970872879, + "learning_rate": 0.0005, + "loss": 2.0856, + "step": 206450 + }, + { + "epoch": 0.7858377168609121, + "grad_norm": 0.12474708259105682, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 206460 + }, + { + "epoch": 0.7858757793290349, + "grad_norm": 0.12049251049757004, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 206470 + }, + { + "epoch": 0.7859138417971575, + "grad_norm": 0.12773816287517548, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 206480 + }, + { + "epoch": 0.7859519042652802, + "grad_norm": 0.12669214606285095, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 206490 + }, + { + "epoch": 0.7859899667334028, + "grad_norm": 0.12713594734668732, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 206500 + }, + { + "epoch": 0.7860280292015256, + "grad_norm": 0.1267489492893219, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 206510 + }, + { + "epoch": 0.7860660916696482, + "grad_norm": 0.12541113793849945, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 206520 + }, + { + "epoch": 0.7861041541377709, + "grad_norm": 0.12049649655818939, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 206530 + }, + { + "epoch": 0.7861422166058936, + "grad_norm": 0.12635451555252075, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 206540 + }, + { + "epoch": 0.7861802790740163, + "grad_norm": 0.11797259002923965, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 206550 + }, + { + "epoch": 0.786218341542139, + "grad_norm": 0.12539760768413544, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 206560 + }, + { + "epoch": 0.7862564040102616, + "grad_norm": 0.12407641857862473, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 206570 + }, + { + "epoch": 0.7862944664783843, + "grad_norm": 0.12364064157009125, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 206580 + }, + { + "epoch": 0.786332528946507, + "grad_norm": 0.13597682118415833, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 206590 + }, + { + "epoch": 0.7863705914146297, + "grad_norm": 0.12736718356609344, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 206600 + }, + { + "epoch": 0.7864086538827524, + "grad_norm": 0.1300850659608841, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 206610 + }, + { + "epoch": 0.786446716350875, + "grad_norm": 0.1280023455619812, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 206620 + }, + { + "epoch": 0.7864847788189977, + "grad_norm": 0.1388653814792633, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 206630 + }, + { + "epoch": 0.7865228412871205, + "grad_norm": 0.12239661067724228, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 206640 + }, + { + "epoch": 0.7865609037552431, + "grad_norm": 0.12179192900657654, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 206650 + }, + { + "epoch": 0.7865989662233658, + "grad_norm": 0.1308254599571228, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 206660 + }, + { + "epoch": 0.7866370286914884, + "grad_norm": 0.12290980666875839, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 206670 + }, + { + "epoch": 0.7866750911596112, + "grad_norm": 0.12178878486156464, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 206680 + }, + { + "epoch": 0.7867131536277339, + "grad_norm": 0.11778522282838821, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 206690 + }, + { + "epoch": 0.7867512160958565, + "grad_norm": 0.12160991132259369, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 206700 + }, + { + "epoch": 0.7867892785639792, + "grad_norm": 0.11683660745620728, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 206710 + }, + { + "epoch": 0.7868273410321018, + "grad_norm": 0.1343355029821396, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 206720 + }, + { + "epoch": 0.7868654035002246, + "grad_norm": 0.12453968822956085, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 206730 + }, + { + "epoch": 0.7869034659683473, + "grad_norm": 0.1248086541891098, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 206740 + }, + { + "epoch": 0.7869415284364699, + "grad_norm": 0.12308883666992188, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 206750 + }, + { + "epoch": 0.7869795909045926, + "grad_norm": 0.12302669137716293, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 206760 + }, + { + "epoch": 0.7870176533727153, + "grad_norm": 0.1207866296172142, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 206770 + }, + { + "epoch": 0.787055715840838, + "grad_norm": 0.13234055042266846, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 206780 + }, + { + "epoch": 0.7870937783089607, + "grad_norm": 0.1425267904996872, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 206790 + }, + { + "epoch": 0.7871318407770833, + "grad_norm": 0.12068969756364822, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 206800 + }, + { + "epoch": 0.7871699032452061, + "grad_norm": 0.11635356396436691, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 206810 + }, + { + "epoch": 0.7872079657133287, + "grad_norm": 0.13005107641220093, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 206820 + }, + { + "epoch": 0.7872460281814514, + "grad_norm": 0.13159166276454926, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 206830 + }, + { + "epoch": 0.787284090649574, + "grad_norm": 0.1244141086935997, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 206840 + }, + { + "epoch": 0.7873221531176967, + "grad_norm": 0.12854860723018646, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 206850 + }, + { + "epoch": 0.7873602155858195, + "grad_norm": 0.12042578309774399, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 206860 + }, + { + "epoch": 0.7873982780539421, + "grad_norm": 0.12977582216262817, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 206870 + }, + { + "epoch": 0.7874363405220648, + "grad_norm": 0.11893955618143082, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 206880 + }, + { + "epoch": 0.7874744029901874, + "grad_norm": 0.13247361779212952, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 206890 + }, + { + "epoch": 0.7875124654583102, + "grad_norm": 0.12134560942649841, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 206900 + }, + { + "epoch": 0.7875505279264329, + "grad_norm": 0.12910045683383942, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 206910 + }, + { + "epoch": 0.7875885903945555, + "grad_norm": 0.11358480155467987, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 206920 + }, + { + "epoch": 0.7876266528626782, + "grad_norm": 0.13005992770195007, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 206930 + }, + { + "epoch": 0.787664715330801, + "grad_norm": 0.12794363498687744, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 206940 + }, + { + "epoch": 0.7877027777989236, + "grad_norm": 0.1298464983701706, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 206950 + }, + { + "epoch": 0.7877408402670463, + "grad_norm": 0.12766271829605103, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 206960 + }, + { + "epoch": 0.7877789027351689, + "grad_norm": 0.1273168921470642, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 206970 + }, + { + "epoch": 0.7878169652032917, + "grad_norm": 0.12702283263206482, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 206980 + }, + { + "epoch": 0.7878550276714144, + "grad_norm": 0.13174878060817719, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 206990 + }, + { + "epoch": 0.787893090139537, + "grad_norm": 0.12054309993982315, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 207000 + }, + { + "epoch": 0.7879311526076597, + "grad_norm": 0.12238851934671402, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 207010 + }, + { + "epoch": 0.7879692150757823, + "grad_norm": 0.14216649532318115, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 207020 + }, + { + "epoch": 0.7880072775439051, + "grad_norm": 0.14556671679019928, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 207030 + }, + { + "epoch": 0.7880453400120278, + "grad_norm": 0.13143803179264069, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 207040 + }, + { + "epoch": 0.7880834024801504, + "grad_norm": 0.12600690126419067, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 207050 + }, + { + "epoch": 0.7881214649482731, + "grad_norm": 0.12676817178726196, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 207060 + }, + { + "epoch": 0.7881595274163958, + "grad_norm": 0.12112493813037872, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 207070 + }, + { + "epoch": 0.7881975898845185, + "grad_norm": 0.12040665000677109, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 207080 + }, + { + "epoch": 0.7882356523526411, + "grad_norm": 0.12102606147527695, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 207090 + }, + { + "epoch": 0.7882737148207638, + "grad_norm": 0.1392698436975479, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 207100 + }, + { + "epoch": 0.7883117772888866, + "grad_norm": 0.13138611614704132, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 207110 + }, + { + "epoch": 0.7883498397570092, + "grad_norm": 0.1238655149936676, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 207120 + }, + { + "epoch": 0.7883879022251319, + "grad_norm": 0.13656646013259888, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 207130 + }, + { + "epoch": 0.7884259646932545, + "grad_norm": 0.12678642570972443, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 207140 + }, + { + "epoch": 0.7884640271613772, + "grad_norm": 0.12136158347129822, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 207150 + }, + { + "epoch": 0.7885020896295, + "grad_norm": 0.14471644163131714, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 207160 + }, + { + "epoch": 0.7885401520976226, + "grad_norm": 0.12014409154653549, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 207170 + }, + { + "epoch": 0.7885782145657453, + "grad_norm": 0.11891982704401016, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 207180 + }, + { + "epoch": 0.7886162770338679, + "grad_norm": 0.12429668009281158, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 207190 + }, + { + "epoch": 0.7886543395019907, + "grad_norm": 0.12298885732889175, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 207200 + }, + { + "epoch": 0.7886924019701134, + "grad_norm": 0.1149677112698555, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 207210 + }, + { + "epoch": 0.788730464438236, + "grad_norm": 0.11625348776578903, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 207220 + }, + { + "epoch": 0.7887685269063587, + "grad_norm": 0.14361906051635742, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 207230 + }, + { + "epoch": 0.7888065893744814, + "grad_norm": 0.14315269887447357, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 207240 + }, + { + "epoch": 0.7888446518426041, + "grad_norm": 0.11848867684602737, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 207250 + }, + { + "epoch": 0.7888827143107268, + "grad_norm": 0.12466177344322205, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 207260 + }, + { + "epoch": 0.7889207767788494, + "grad_norm": 0.12067979574203491, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 207270 + }, + { + "epoch": 0.7889588392469721, + "grad_norm": 0.15696817636489868, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 207280 + }, + { + "epoch": 0.7889969017150948, + "grad_norm": 0.12369900941848755, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 207290 + }, + { + "epoch": 0.7890349641832175, + "grad_norm": 0.1119314655661583, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 207300 + }, + { + "epoch": 0.7890730266513402, + "grad_norm": 0.11977098882198334, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 207310 + }, + { + "epoch": 0.7891110891194628, + "grad_norm": 0.12511758506298065, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 207320 + }, + { + "epoch": 0.7891491515875856, + "grad_norm": 0.12501899898052216, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 207330 + }, + { + "epoch": 0.7891872140557082, + "grad_norm": 0.12755510210990906, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 207340 + }, + { + "epoch": 0.7892252765238309, + "grad_norm": 0.13348741829395294, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 207350 + }, + { + "epoch": 0.7892633389919536, + "grad_norm": 0.13311418890953064, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 207360 + }, + { + "epoch": 0.7893014014600763, + "grad_norm": 0.12708371877670288, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 207370 + }, + { + "epoch": 0.789339463928199, + "grad_norm": 0.1293056756258011, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 207380 + }, + { + "epoch": 0.7893775263963216, + "grad_norm": 0.1280953288078308, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 207390 + }, + { + "epoch": 0.7894155888644443, + "grad_norm": 0.1261862814426422, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 207400 + }, + { + "epoch": 0.7894536513325671, + "grad_norm": 0.13907188177108765, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 207410 + }, + { + "epoch": 0.7894917138006897, + "grad_norm": 0.11450821906328201, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 207420 + }, + { + "epoch": 0.7895297762688124, + "grad_norm": 0.12620657682418823, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 207430 + }, + { + "epoch": 0.789567838736935, + "grad_norm": 0.12586279213428497, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 207440 + }, + { + "epoch": 0.7896059012050577, + "grad_norm": 0.12371962517499924, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 207450 + }, + { + "epoch": 0.7896439636731805, + "grad_norm": 0.11716640740633011, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 207460 + }, + { + "epoch": 0.7896820261413031, + "grad_norm": 0.12114844471216202, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 207470 + }, + { + "epoch": 0.7897200886094258, + "grad_norm": 0.11807013303041458, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 207480 + }, + { + "epoch": 0.7897581510775484, + "grad_norm": 0.13748803734779358, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 207490 + }, + { + "epoch": 0.7897962135456712, + "grad_norm": 0.1313491016626358, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 207500 + }, + { + "epoch": 0.7898342760137939, + "grad_norm": 0.1214139312505722, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 207510 + }, + { + "epoch": 0.7898723384819165, + "grad_norm": 0.12973102927207947, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 207520 + }, + { + "epoch": 0.7899104009500392, + "grad_norm": 0.12636278569698334, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 207530 + }, + { + "epoch": 0.7899484634181619, + "grad_norm": 0.12249350547790527, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 207540 + }, + { + "epoch": 0.7899865258862846, + "grad_norm": 0.1312398761510849, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 207550 + }, + { + "epoch": 0.7900245883544073, + "grad_norm": 0.12716726958751678, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 207560 + }, + { + "epoch": 0.7900626508225299, + "grad_norm": 0.11871406435966492, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 207570 + }, + { + "epoch": 0.7901007132906526, + "grad_norm": 0.129543274641037, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 207580 + }, + { + "epoch": 0.7901387757587753, + "grad_norm": 0.12391343712806702, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 207590 + }, + { + "epoch": 0.790176838226898, + "grad_norm": 0.12911324203014374, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 207600 + }, + { + "epoch": 0.7902149006950207, + "grad_norm": 0.12524741888046265, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 207610 + }, + { + "epoch": 0.7902529631631433, + "grad_norm": 0.12118924409151077, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 207620 + }, + { + "epoch": 0.7902910256312661, + "grad_norm": 0.1353355050086975, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 207630 + }, + { + "epoch": 0.7903290880993887, + "grad_norm": 0.12381858378648758, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 207640 + }, + { + "epoch": 0.7903671505675114, + "grad_norm": 0.13531967997550964, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 207650 + }, + { + "epoch": 0.790405213035634, + "grad_norm": 0.12401936948299408, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 207660 + }, + { + "epoch": 0.7904432755037568, + "grad_norm": 0.1160985603928566, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 207670 + }, + { + "epoch": 0.7904813379718795, + "grad_norm": 0.14534474909305573, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 207680 + }, + { + "epoch": 0.7905194004400021, + "grad_norm": 0.13224861025810242, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 207690 + }, + { + "epoch": 0.7905574629081248, + "grad_norm": 0.1339549869298935, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 207700 + }, + { + "epoch": 0.7905955253762474, + "grad_norm": 0.14083094894886017, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 207710 + }, + { + "epoch": 0.7906335878443702, + "grad_norm": 0.11693526059389114, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 207720 + }, + { + "epoch": 0.7906716503124929, + "grad_norm": 0.12638410925865173, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 207730 + }, + { + "epoch": 0.7907097127806155, + "grad_norm": 0.13595831394195557, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 207740 + }, + { + "epoch": 0.7907477752487382, + "grad_norm": 0.12722502648830414, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 207750 + }, + { + "epoch": 0.790785837716861, + "grad_norm": 0.15252414345741272, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 207760 + }, + { + "epoch": 0.7908239001849836, + "grad_norm": 0.12534798681735992, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 207770 + }, + { + "epoch": 0.7908619626531063, + "grad_norm": 0.12866796553134918, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 207780 + }, + { + "epoch": 0.7909000251212289, + "grad_norm": 0.12148977071046829, + "learning_rate": 0.0005, + "loss": 2.0836, + "step": 207790 + }, + { + "epoch": 0.7909380875893517, + "grad_norm": 0.13029548525810242, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 207800 + }, + { + "epoch": 0.7909761500574743, + "grad_norm": 0.125240296125412, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 207810 + }, + { + "epoch": 0.791014212525597, + "grad_norm": 0.1148528978228569, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 207820 + }, + { + "epoch": 0.7910522749937197, + "grad_norm": 0.11937754601240158, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 207830 + }, + { + "epoch": 0.7910903374618424, + "grad_norm": 0.1433212012052536, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 207840 + }, + { + "epoch": 0.7911283999299651, + "grad_norm": 0.12246479839086533, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 207850 + }, + { + "epoch": 0.7911664623980877, + "grad_norm": 0.12843631207942963, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 207860 + }, + { + "epoch": 0.7912045248662104, + "grad_norm": 0.13483227789402008, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 207870 + }, + { + "epoch": 0.7912425873343331, + "grad_norm": 0.11968355625867844, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 207880 + }, + { + "epoch": 0.7912806498024558, + "grad_norm": 0.13005025684833527, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 207890 + }, + { + "epoch": 0.7913187122705785, + "grad_norm": 0.11541354656219482, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 207900 + }, + { + "epoch": 0.7913567747387011, + "grad_norm": 0.14411550760269165, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 207910 + }, + { + "epoch": 0.7913948372068238, + "grad_norm": 0.1196213811635971, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 207920 + }, + { + "epoch": 0.7914328996749466, + "grad_norm": 0.13634739816188812, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 207930 + }, + { + "epoch": 0.7914709621430692, + "grad_norm": 0.1274816393852234, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 207940 + }, + { + "epoch": 0.7915090246111919, + "grad_norm": 0.11603652685880661, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 207950 + }, + { + "epoch": 0.7915470870793145, + "grad_norm": 0.11945189535617828, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 207960 + }, + { + "epoch": 0.7915851495474373, + "grad_norm": 0.12008696794509888, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 207970 + }, + { + "epoch": 0.79162321201556, + "grad_norm": 0.12966056168079376, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 207980 + }, + { + "epoch": 0.7916612744836826, + "grad_norm": 0.11781937628984451, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 207990 + }, + { + "epoch": 0.7916993369518053, + "grad_norm": 0.1327858716249466, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 208000 + }, + { + "epoch": 0.7917373994199279, + "grad_norm": 0.16530166566371918, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 208010 + }, + { + "epoch": 0.7917754618880507, + "grad_norm": 0.12523028254508972, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 208020 + }, + { + "epoch": 0.7918135243561734, + "grad_norm": 0.11686685681343079, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 208030 + }, + { + "epoch": 0.791851586824296, + "grad_norm": 0.11279519647359848, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 208040 + }, + { + "epoch": 0.7918896492924187, + "grad_norm": 0.14504992961883545, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 208050 + }, + { + "epoch": 0.7919277117605414, + "grad_norm": 0.1297408491373062, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 208060 + }, + { + "epoch": 0.7919657742286641, + "grad_norm": 0.1253969669342041, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 208070 + }, + { + "epoch": 0.7920038366967868, + "grad_norm": 0.11713798344135284, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 208080 + }, + { + "epoch": 0.7920418991649094, + "grad_norm": 0.12483670562505722, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 208090 + }, + { + "epoch": 0.7920799616330322, + "grad_norm": 0.13903804123401642, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 208100 + }, + { + "epoch": 0.7921180241011548, + "grad_norm": 0.13183006644248962, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 208110 + }, + { + "epoch": 0.7921560865692775, + "grad_norm": 0.12532487511634827, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 208120 + }, + { + "epoch": 0.7921941490374002, + "grad_norm": 0.12564100325107574, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 208130 + }, + { + "epoch": 0.7922322115055228, + "grad_norm": 0.12045514583587646, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 208140 + }, + { + "epoch": 0.7922702739736456, + "grad_norm": 0.11653630435466766, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 208150 + }, + { + "epoch": 0.7923083364417682, + "grad_norm": 0.12388094514608383, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 208160 + }, + { + "epoch": 0.7923463989098909, + "grad_norm": 0.12007354944944382, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 208170 + }, + { + "epoch": 0.7923844613780135, + "grad_norm": 0.1252797544002533, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 208180 + }, + { + "epoch": 0.7924225238461363, + "grad_norm": 0.11857400834560394, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 208190 + }, + { + "epoch": 0.792460586314259, + "grad_norm": 0.19271457195281982, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 208200 + }, + { + "epoch": 0.7924986487823816, + "grad_norm": 0.13805493712425232, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 208210 + }, + { + "epoch": 0.7925367112505043, + "grad_norm": 0.13163357973098755, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 208220 + }, + { + "epoch": 0.7925747737186271, + "grad_norm": 0.1201360672712326, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 208230 + }, + { + "epoch": 0.7926128361867497, + "grad_norm": 0.14493165910243988, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 208240 + }, + { + "epoch": 0.7926508986548724, + "grad_norm": 0.1183045506477356, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 208250 + }, + { + "epoch": 0.792688961122995, + "grad_norm": 0.14185409247875214, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 208260 + }, + { + "epoch": 0.7927270235911178, + "grad_norm": 0.13562677800655365, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 208270 + }, + { + "epoch": 0.7927650860592405, + "grad_norm": 0.13558503985404968, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 208280 + }, + { + "epoch": 0.7928031485273631, + "grad_norm": 0.11868877708911896, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 208290 + }, + { + "epoch": 0.7928412109954858, + "grad_norm": 0.1223066970705986, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 208300 + }, + { + "epoch": 0.7928792734636084, + "grad_norm": 0.12212125211954117, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 208310 + }, + { + "epoch": 0.7929173359317312, + "grad_norm": 0.12405651807785034, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 208320 + }, + { + "epoch": 0.7929553983998539, + "grad_norm": 0.11782582849264145, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 208330 + }, + { + "epoch": 0.7929934608679765, + "grad_norm": 0.15370948612689972, + "learning_rate": 0.0005, + "loss": 2.087, + "step": 208340 + }, + { + "epoch": 0.7930315233360992, + "grad_norm": 0.13403061032295227, + "learning_rate": 0.0005, + "loss": 2.1237, + "step": 208350 + }, + { + "epoch": 0.7930695858042219, + "grad_norm": 0.130899116396904, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 208360 + }, + { + "epoch": 0.7931076482723446, + "grad_norm": 0.119450643658638, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 208370 + }, + { + "epoch": 0.7931457107404672, + "grad_norm": 0.11362778395414352, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 208380 + }, + { + "epoch": 0.7931837732085899, + "grad_norm": 0.12749043107032776, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 208390 + }, + { + "epoch": 0.7932218356767127, + "grad_norm": 0.11601119488477707, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 208400 + }, + { + "epoch": 0.7932598981448353, + "grad_norm": 0.1223764568567276, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 208410 + }, + { + "epoch": 0.793297960612958, + "grad_norm": 0.1120501160621643, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 208420 + }, + { + "epoch": 0.7933360230810806, + "grad_norm": 0.12687718868255615, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 208430 + }, + { + "epoch": 0.7933740855492033, + "grad_norm": 0.13210518658161163, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 208440 + }, + { + "epoch": 0.7934121480173261, + "grad_norm": 0.12925389409065247, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 208450 + }, + { + "epoch": 0.7934502104854487, + "grad_norm": 0.13438375294208527, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 208460 + }, + { + "epoch": 0.7934882729535714, + "grad_norm": 0.12347417324781418, + "learning_rate": 0.0005, + "loss": 2.0848, + "step": 208470 + }, + { + "epoch": 0.793526335421694, + "grad_norm": 0.11717919260263443, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 208480 + }, + { + "epoch": 0.7935643978898168, + "grad_norm": 0.12019776552915573, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 208490 + }, + { + "epoch": 0.7936024603579395, + "grad_norm": 0.15380197763442993, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 208500 + }, + { + "epoch": 0.7936405228260621, + "grad_norm": 0.12014266103506088, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 208510 + }, + { + "epoch": 0.7936785852941848, + "grad_norm": 0.1318885236978531, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 208520 + }, + { + "epoch": 0.7937166477623075, + "grad_norm": 0.12295283377170563, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 208530 + }, + { + "epoch": 0.7937547102304302, + "grad_norm": 0.1341993659734726, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 208540 + }, + { + "epoch": 0.7937927726985529, + "grad_norm": 0.1195361539721489, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 208550 + }, + { + "epoch": 0.7938308351666755, + "grad_norm": 0.12576062977313995, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 208560 + }, + { + "epoch": 0.7938688976347982, + "grad_norm": 0.12561865150928497, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 208570 + }, + { + "epoch": 0.7939069601029209, + "grad_norm": 0.1389131397008896, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 208580 + }, + { + "epoch": 0.7939450225710436, + "grad_norm": 0.11707048863172531, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 208590 + }, + { + "epoch": 0.7939830850391663, + "grad_norm": 0.1230204775929451, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 208600 + }, + { + "epoch": 0.7940211475072889, + "grad_norm": 0.12738430500030518, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 208610 + }, + { + "epoch": 0.7940592099754117, + "grad_norm": 0.1250593066215515, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 208620 + }, + { + "epoch": 0.7940972724435343, + "grad_norm": 0.1447353959083557, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 208630 + }, + { + "epoch": 0.794135334911657, + "grad_norm": 0.1509043425321579, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 208640 + }, + { + "epoch": 0.7941733973797797, + "grad_norm": 0.12834376096725464, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 208650 + }, + { + "epoch": 0.7942114598479024, + "grad_norm": 0.11595456302165985, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 208660 + }, + { + "epoch": 0.7942495223160251, + "grad_norm": 0.1368151754140854, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 208670 + }, + { + "epoch": 0.7942875847841477, + "grad_norm": 0.12382905185222626, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 208680 + }, + { + "epoch": 0.7943256472522704, + "grad_norm": 0.11919035017490387, + "learning_rate": 0.0005, + "loss": 2.0879, + "step": 208690 + }, + { + "epoch": 0.7943637097203932, + "grad_norm": 0.1275068074464798, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 208700 + }, + { + "epoch": 0.7944017721885158, + "grad_norm": 0.12816788256168365, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 208710 + }, + { + "epoch": 0.7944398346566385, + "grad_norm": 0.2003813534975052, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 208720 + }, + { + "epoch": 0.7944778971247611, + "grad_norm": 0.11509547382593155, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 208730 + }, + { + "epoch": 0.7945159595928838, + "grad_norm": 0.13221478462219238, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 208740 + }, + { + "epoch": 0.7945540220610066, + "grad_norm": 0.11940953880548477, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 208750 + }, + { + "epoch": 0.7945920845291292, + "grad_norm": 0.13081574440002441, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 208760 + }, + { + "epoch": 0.7946301469972519, + "grad_norm": 0.11604013293981552, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 208770 + }, + { + "epoch": 0.7946682094653745, + "grad_norm": 0.13396811485290527, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 208780 + }, + { + "epoch": 0.7947062719334973, + "grad_norm": 0.14683446288108826, + "learning_rate": 0.0005, + "loss": 2.1295, + "step": 208790 + }, + { + "epoch": 0.79474433440162, + "grad_norm": 0.13151712715625763, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 208800 + }, + { + "epoch": 0.7947823968697426, + "grad_norm": 0.13657481968402863, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 208810 + }, + { + "epoch": 0.7948204593378653, + "grad_norm": 0.12927566468715668, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 208820 + }, + { + "epoch": 0.794858521805988, + "grad_norm": 0.11834193766117096, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 208830 + }, + { + "epoch": 0.7948965842741107, + "grad_norm": 0.12424030900001526, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 208840 + }, + { + "epoch": 0.7949346467422334, + "grad_norm": 0.12404995411634445, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 208850 + }, + { + "epoch": 0.794972709210356, + "grad_norm": 0.12764133512973785, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 208860 + }, + { + "epoch": 0.7950107716784787, + "grad_norm": 0.1281396746635437, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 208870 + }, + { + "epoch": 0.7950488341466014, + "grad_norm": 0.13411007821559906, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 208880 + }, + { + "epoch": 0.7950868966147241, + "grad_norm": 0.12388242781162262, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 208890 + }, + { + "epoch": 0.7951249590828467, + "grad_norm": 0.15650135278701782, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 208900 + }, + { + "epoch": 0.7951630215509694, + "grad_norm": 0.11632000654935837, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 208910 + }, + { + "epoch": 0.7952010840190922, + "grad_norm": 0.1179938018321991, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 208920 + }, + { + "epoch": 0.7952391464872148, + "grad_norm": 0.12669743597507477, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 208930 + }, + { + "epoch": 0.7952772089553375, + "grad_norm": 0.13332943618297577, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 208940 + }, + { + "epoch": 0.7953152714234601, + "grad_norm": 0.1121673732995987, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 208950 + }, + { + "epoch": 0.7953533338915829, + "grad_norm": 0.11808997392654419, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 208960 + }, + { + "epoch": 0.7953913963597056, + "grad_norm": 0.1382598578929901, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 208970 + }, + { + "epoch": 0.7954294588278282, + "grad_norm": 0.1268141269683838, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 208980 + }, + { + "epoch": 0.7954675212959509, + "grad_norm": 0.13062426447868347, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 208990 + }, + { + "epoch": 0.7955055837640735, + "grad_norm": 0.12167482823133469, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 209000 + }, + { + "epoch": 0.7955436462321963, + "grad_norm": 0.12169355899095535, + "learning_rate": 0.0005, + "loss": 2.0866, + "step": 209010 + }, + { + "epoch": 0.795581708700319, + "grad_norm": 0.12138015776872635, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 209020 + }, + { + "epoch": 0.7956197711684416, + "grad_norm": 0.16487336158752441, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 209030 + }, + { + "epoch": 0.7956578336365643, + "grad_norm": 0.1197362020611763, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 209040 + }, + { + "epoch": 0.795695896104687, + "grad_norm": 0.1222614049911499, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 209050 + }, + { + "epoch": 0.7957339585728097, + "grad_norm": 0.13899779319763184, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 209060 + }, + { + "epoch": 0.7957720210409324, + "grad_norm": 0.12792204320430756, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 209070 + }, + { + "epoch": 0.795810083509055, + "grad_norm": 0.12948037683963776, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 209080 + }, + { + "epoch": 0.7958481459771778, + "grad_norm": 0.11667436361312866, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 209090 + }, + { + "epoch": 0.7958862084453004, + "grad_norm": 0.12511169910430908, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 209100 + }, + { + "epoch": 0.7959242709134231, + "grad_norm": 0.12640981376171112, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 209110 + }, + { + "epoch": 0.7959623333815458, + "grad_norm": 0.13863001763820648, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 209120 + }, + { + "epoch": 0.7960003958496685, + "grad_norm": 0.12970925867557526, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 209130 + }, + { + "epoch": 0.7960384583177912, + "grad_norm": 0.12926508486270905, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 209140 + }, + { + "epoch": 0.7960765207859138, + "grad_norm": 0.139383926987648, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 209150 + }, + { + "epoch": 0.7961145832540365, + "grad_norm": 0.17026956379413605, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 209160 + }, + { + "epoch": 0.7961526457221592, + "grad_norm": 0.12663201987743378, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 209170 + }, + { + "epoch": 0.7961907081902819, + "grad_norm": 0.1172584593296051, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 209180 + }, + { + "epoch": 0.7962287706584046, + "grad_norm": 0.11819595843553543, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 209190 + }, + { + "epoch": 0.7962668331265272, + "grad_norm": 0.13188645243644714, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 209200 + }, + { + "epoch": 0.7963048955946499, + "grad_norm": 0.1291705071926117, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 209210 + }, + { + "epoch": 0.7963429580627727, + "grad_norm": 0.13012315332889557, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 209220 + }, + { + "epoch": 0.7963810205308953, + "grad_norm": 0.1288910061120987, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 209230 + }, + { + "epoch": 0.796419082999018, + "grad_norm": 0.12641553580760956, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 209240 + }, + { + "epoch": 0.7964571454671406, + "grad_norm": 0.12806424498558044, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 209250 + }, + { + "epoch": 0.7964952079352634, + "grad_norm": 0.1583370417356491, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 209260 + }, + { + "epoch": 0.7965332704033861, + "grad_norm": 0.1468966156244278, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 209270 + }, + { + "epoch": 0.7965713328715087, + "grad_norm": 0.11811856180429459, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 209280 + }, + { + "epoch": 0.7966093953396314, + "grad_norm": 0.12157811969518661, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 209290 + }, + { + "epoch": 0.796647457807754, + "grad_norm": 0.13439470529556274, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 209300 + }, + { + "epoch": 0.7966855202758768, + "grad_norm": 0.12155460566282272, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 209310 + }, + { + "epoch": 0.7967235827439995, + "grad_norm": 0.12126308679580688, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 209320 + }, + { + "epoch": 0.7967616452121221, + "grad_norm": 0.11862965673208237, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 209330 + }, + { + "epoch": 0.7967997076802448, + "grad_norm": 0.12274859100580215, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 209340 + }, + { + "epoch": 0.7968377701483675, + "grad_norm": 0.12550798058509827, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 209350 + }, + { + "epoch": 0.7968758326164902, + "grad_norm": 0.14728228747844696, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 209360 + }, + { + "epoch": 0.7969138950846129, + "grad_norm": 0.11769311875104904, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 209370 + }, + { + "epoch": 0.7969519575527355, + "grad_norm": 0.13724328577518463, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 209380 + }, + { + "epoch": 0.7969900200208583, + "grad_norm": 0.1331387758255005, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 209390 + }, + { + "epoch": 0.7970280824889809, + "grad_norm": 0.12574204802513123, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 209400 + }, + { + "epoch": 0.7970661449571036, + "grad_norm": 0.12337487936019897, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 209410 + }, + { + "epoch": 0.7971042074252263, + "grad_norm": 0.1246960312128067, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 209420 + }, + { + "epoch": 0.7971422698933489, + "grad_norm": 0.12354270368814468, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 209430 + }, + { + "epoch": 0.7971803323614717, + "grad_norm": 0.13482049107551575, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 209440 + }, + { + "epoch": 0.7972183948295943, + "grad_norm": 0.11957293748855591, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 209450 + }, + { + "epoch": 0.797256457297717, + "grad_norm": 0.11577066034078598, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 209460 + }, + { + "epoch": 0.7972945197658396, + "grad_norm": 0.12052362412214279, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 209470 + }, + { + "epoch": 0.7973325822339624, + "grad_norm": 0.12633970379829407, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 209480 + }, + { + "epoch": 0.7973706447020851, + "grad_norm": 0.12112203985452652, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 209490 + }, + { + "epoch": 0.7974087071702077, + "grad_norm": 0.1264239400625229, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 209500 + }, + { + "epoch": 0.7974467696383304, + "grad_norm": 0.11804235726594925, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 209510 + }, + { + "epoch": 0.7974848321064532, + "grad_norm": 0.12418785691261292, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 209520 + }, + { + "epoch": 0.7975228945745758, + "grad_norm": 0.11656662076711655, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 209530 + }, + { + "epoch": 0.7975609570426985, + "grad_norm": 0.12503372132778168, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 209540 + }, + { + "epoch": 0.7975990195108211, + "grad_norm": 0.12820298969745636, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 209550 + }, + { + "epoch": 0.7976370819789439, + "grad_norm": 0.13916277885437012, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 209560 + }, + { + "epoch": 0.7976751444470666, + "grad_norm": 0.13408006727695465, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 209570 + }, + { + "epoch": 0.7977132069151892, + "grad_norm": 0.12541238963603973, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 209580 + }, + { + "epoch": 0.7977512693833119, + "grad_norm": 0.11731202900409698, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 209590 + }, + { + "epoch": 0.7977893318514345, + "grad_norm": 0.12128131836652756, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 209600 + }, + { + "epoch": 0.7978273943195573, + "grad_norm": 0.12297343462705612, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 209610 + }, + { + "epoch": 0.79786545678768, + "grad_norm": 0.13395269215106964, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 209620 + }, + { + "epoch": 0.7979035192558026, + "grad_norm": 0.13858111202716827, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 209630 + }, + { + "epoch": 0.7979415817239253, + "grad_norm": 0.12004192918539047, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 209640 + }, + { + "epoch": 0.797979644192048, + "grad_norm": 0.12386671453714371, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 209650 + }, + { + "epoch": 0.7980177066601707, + "grad_norm": 0.13386517763137817, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 209660 + }, + { + "epoch": 0.7980557691282933, + "grad_norm": 0.1264449954032898, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 209670 + }, + { + "epoch": 0.798093831596416, + "grad_norm": 0.1198006197810173, + "learning_rate": 0.0005, + "loss": 2.0796, + "step": 209680 + }, + { + "epoch": 0.7981318940645388, + "grad_norm": 0.13347522914409637, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 209690 + }, + { + "epoch": 0.7981699565326614, + "grad_norm": 0.13666976988315582, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 209700 + }, + { + "epoch": 0.7982080190007841, + "grad_norm": 0.12404890358448029, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 209710 + }, + { + "epoch": 0.7982460814689067, + "grad_norm": 0.1316789984703064, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 209720 + }, + { + "epoch": 0.7982841439370294, + "grad_norm": 0.13406074047088623, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 209730 + }, + { + "epoch": 0.7983222064051522, + "grad_norm": 0.13223187625408173, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 209740 + }, + { + "epoch": 0.7983602688732748, + "grad_norm": 0.12085633724927902, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 209750 + }, + { + "epoch": 0.7983983313413975, + "grad_norm": 0.12477198988199234, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 209760 + }, + { + "epoch": 0.7984363938095201, + "grad_norm": 0.12964096665382385, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 209770 + }, + { + "epoch": 0.7984744562776429, + "grad_norm": 0.12128846347332001, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 209780 + }, + { + "epoch": 0.7985125187457656, + "grad_norm": 0.11454357951879501, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 209790 + }, + { + "epoch": 0.7985505812138882, + "grad_norm": 0.12122207880020142, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 209800 + }, + { + "epoch": 0.7985886436820109, + "grad_norm": 0.12776987254619598, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 209810 + }, + { + "epoch": 0.7986267061501336, + "grad_norm": 0.11924677342176437, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 209820 + }, + { + "epoch": 0.7986647686182563, + "grad_norm": 0.125722736120224, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 209830 + }, + { + "epoch": 0.798702831086379, + "grad_norm": 0.12875454127788544, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 209840 + }, + { + "epoch": 0.7987408935545016, + "grad_norm": 0.129396453499794, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 209850 + }, + { + "epoch": 0.7987789560226243, + "grad_norm": 0.13016273081302643, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 209860 + }, + { + "epoch": 0.798817018490747, + "grad_norm": 0.1285935938358307, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 209870 + }, + { + "epoch": 0.7988550809588697, + "grad_norm": 0.14093440771102905, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 209880 + }, + { + "epoch": 0.7988931434269924, + "grad_norm": 0.1263287216424942, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 209890 + }, + { + "epoch": 0.798931205895115, + "grad_norm": 0.1366177499294281, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 209900 + }, + { + "epoch": 0.7989692683632378, + "grad_norm": 0.11795708537101746, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 209910 + }, + { + "epoch": 0.7990073308313604, + "grad_norm": 0.1111733689904213, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 209920 + }, + { + "epoch": 0.7990453932994831, + "grad_norm": 0.13642236590385437, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 209930 + }, + { + "epoch": 0.7990834557676058, + "grad_norm": 0.12918546795845032, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 209940 + }, + { + "epoch": 0.7991215182357285, + "grad_norm": 0.1210419088602066, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 209950 + }, + { + "epoch": 0.7991595807038512, + "grad_norm": 0.13431444764137268, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 209960 + }, + { + "epoch": 0.7991976431719738, + "grad_norm": 0.1349552571773529, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 209970 + }, + { + "epoch": 0.7992357056400965, + "grad_norm": 0.1264919936656952, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 209980 + }, + { + "epoch": 0.7992737681082193, + "grad_norm": 0.12178479880094528, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 209990 + }, + { + "epoch": 0.7993118305763419, + "grad_norm": 0.13122200965881348, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 210000 + }, + { + "epoch": 0.7993498930444646, + "grad_norm": 0.12984102964401245, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 210010 + }, + { + "epoch": 0.7993879555125872, + "grad_norm": 0.11454375833272934, + "learning_rate": 0.0005, + "loss": 2.0861, + "step": 210020 + }, + { + "epoch": 0.7994260179807099, + "grad_norm": 0.1272624135017395, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 210030 + }, + { + "epoch": 0.7994640804488327, + "grad_norm": 0.12016329169273376, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 210040 + }, + { + "epoch": 0.7995021429169553, + "grad_norm": 0.13649529218673706, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 210050 + }, + { + "epoch": 0.799540205385078, + "grad_norm": 0.1260644644498825, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 210060 + }, + { + "epoch": 0.7995782678532006, + "grad_norm": 0.1434025764465332, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 210070 + }, + { + "epoch": 0.7996163303213234, + "grad_norm": 0.1485346555709839, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 210080 + }, + { + "epoch": 0.7996543927894461, + "grad_norm": 0.12919403612613678, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 210090 + }, + { + "epoch": 0.7996924552575687, + "grad_norm": 0.13078376650810242, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 210100 + }, + { + "epoch": 0.7997305177256914, + "grad_norm": 0.13152313232421875, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 210110 + }, + { + "epoch": 0.7997685801938141, + "grad_norm": 0.12501022219657898, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 210120 + }, + { + "epoch": 0.7998066426619368, + "grad_norm": 0.12021830677986145, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 210130 + }, + { + "epoch": 0.7998447051300595, + "grad_norm": 0.11749450862407684, + "learning_rate": 0.0005, + "loss": 2.1282, + "step": 210140 + }, + { + "epoch": 0.7998827675981821, + "grad_norm": 0.12211523205041885, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 210150 + }, + { + "epoch": 0.7999208300663048, + "grad_norm": 0.12200842797756195, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 210160 + }, + { + "epoch": 0.7999588925344275, + "grad_norm": 0.11807631701231003, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 210170 + }, + { + "epoch": 0.7999969550025502, + "grad_norm": 0.11688867211341858, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 210180 + }, + { + "epoch": 0.8000350174706728, + "grad_norm": 0.12884896993637085, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 210190 + }, + { + "epoch": 0.8000730799387955, + "grad_norm": 0.11895192414522171, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 210200 + }, + { + "epoch": 0.8001111424069183, + "grad_norm": 0.13126152753829956, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 210210 + }, + { + "epoch": 0.8001492048750409, + "grad_norm": 0.1251167356967926, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 210220 + }, + { + "epoch": 0.8001872673431636, + "grad_norm": 0.11805843561887741, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 210230 + }, + { + "epoch": 0.8002253298112862, + "grad_norm": 0.12272538989782333, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 210240 + }, + { + "epoch": 0.800263392279409, + "grad_norm": 0.1270427703857422, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 210250 + }, + { + "epoch": 0.8003014547475317, + "grad_norm": 0.11693274229764938, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 210260 + }, + { + "epoch": 0.8003395172156543, + "grad_norm": 0.12159300595521927, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 210270 + }, + { + "epoch": 0.800377579683777, + "grad_norm": 0.12339331954717636, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 210280 + }, + { + "epoch": 0.8004156421518998, + "grad_norm": 0.11767217516899109, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 210290 + }, + { + "epoch": 0.8004537046200224, + "grad_norm": 0.12712466716766357, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 210300 + }, + { + "epoch": 0.8004917670881451, + "grad_norm": 0.11670083552598953, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 210310 + }, + { + "epoch": 0.8005298295562677, + "grad_norm": 0.13408468663692474, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 210320 + }, + { + "epoch": 0.8005678920243904, + "grad_norm": 0.12165634334087372, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 210330 + }, + { + "epoch": 0.8006059544925131, + "grad_norm": 0.14448975026607513, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 210340 + }, + { + "epoch": 0.8006440169606358, + "grad_norm": 0.12268371880054474, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 210350 + }, + { + "epoch": 0.8006820794287585, + "grad_norm": 0.14124977588653564, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 210360 + }, + { + "epoch": 0.8007201418968811, + "grad_norm": 0.12242026627063751, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 210370 + }, + { + "epoch": 0.8007582043650039, + "grad_norm": 0.1343546211719513, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 210380 + }, + { + "epoch": 0.8007962668331265, + "grad_norm": 0.13093207776546478, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 210390 + }, + { + "epoch": 0.8008343293012492, + "grad_norm": 0.11842503398656845, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 210400 + }, + { + "epoch": 0.8008723917693719, + "grad_norm": 0.12339796870946884, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 210410 + }, + { + "epoch": 0.8009104542374946, + "grad_norm": 0.11186055094003677, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 210420 + }, + { + "epoch": 0.8009485167056173, + "grad_norm": 0.12415434420108795, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 210430 + }, + { + "epoch": 0.8009865791737399, + "grad_norm": 0.121949702501297, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 210440 + }, + { + "epoch": 0.8010246416418626, + "grad_norm": 0.14509126543998718, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 210450 + }, + { + "epoch": 0.8010627041099853, + "grad_norm": 0.12276475131511688, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 210460 + }, + { + "epoch": 0.801100766578108, + "grad_norm": 0.12225574254989624, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 210470 + }, + { + "epoch": 0.8011388290462307, + "grad_norm": 0.12323661148548126, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 210480 + }, + { + "epoch": 0.8011768915143533, + "grad_norm": 0.12105714529752731, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 210490 + }, + { + "epoch": 0.801214953982476, + "grad_norm": 0.1379852592945099, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 210500 + }, + { + "epoch": 0.8012530164505988, + "grad_norm": 0.1205686628818512, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 210510 + }, + { + "epoch": 0.8012910789187214, + "grad_norm": 0.1286148726940155, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 210520 + }, + { + "epoch": 0.8013291413868441, + "grad_norm": 0.1285942792892456, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 210530 + }, + { + "epoch": 0.8013672038549667, + "grad_norm": 0.13062357902526855, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 210540 + }, + { + "epoch": 0.8014052663230895, + "grad_norm": 0.12718930840492249, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 210550 + }, + { + "epoch": 0.8014433287912122, + "grad_norm": 0.134733647108078, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 210560 + }, + { + "epoch": 0.8014813912593348, + "grad_norm": 0.12464946508407593, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 210570 + }, + { + "epoch": 0.8015194537274575, + "grad_norm": 0.1174856424331665, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 210580 + }, + { + "epoch": 0.8015575161955801, + "grad_norm": 0.12474462389945984, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 210590 + }, + { + "epoch": 0.8015955786637029, + "grad_norm": 0.12405211478471756, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 210600 + }, + { + "epoch": 0.8016336411318256, + "grad_norm": 0.13704289495944977, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 210610 + }, + { + "epoch": 0.8016717035999482, + "grad_norm": 0.1354646533727646, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 210620 + }, + { + "epoch": 0.8017097660680709, + "grad_norm": 0.12755626440048218, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 210630 + }, + { + "epoch": 0.8017478285361936, + "grad_norm": 0.12112542241811752, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 210640 + }, + { + "epoch": 0.8017858910043163, + "grad_norm": 0.12758193910121918, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 210650 + }, + { + "epoch": 0.801823953472439, + "grad_norm": 0.11487865447998047, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 210660 + }, + { + "epoch": 0.8018620159405616, + "grad_norm": 0.1294761449098587, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 210670 + }, + { + "epoch": 0.8019000784086844, + "grad_norm": 0.11908960342407227, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 210680 + }, + { + "epoch": 0.801938140876807, + "grad_norm": 0.12050192803144455, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 210690 + }, + { + "epoch": 0.8019762033449297, + "grad_norm": 0.13563457131385803, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 210700 + }, + { + "epoch": 0.8020142658130524, + "grad_norm": 0.19248554110527039, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 210710 + }, + { + "epoch": 0.8020523282811751, + "grad_norm": 0.11567103862762451, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 210720 + }, + { + "epoch": 0.8020903907492978, + "grad_norm": 0.13898178935050964, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 210730 + }, + { + "epoch": 0.8021284532174204, + "grad_norm": 0.1283826380968094, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 210740 + }, + { + "epoch": 0.8021665156855431, + "grad_norm": 0.12306949496269226, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 210750 + }, + { + "epoch": 0.8022045781536657, + "grad_norm": 0.13112527132034302, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 210760 + }, + { + "epoch": 0.8022426406217885, + "grad_norm": 0.11857830733060837, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 210770 + }, + { + "epoch": 0.8022807030899112, + "grad_norm": 0.12262977659702301, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 210780 + }, + { + "epoch": 0.8023187655580338, + "grad_norm": 0.14496393501758575, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 210790 + }, + { + "epoch": 0.8023568280261565, + "grad_norm": 0.12095699459314346, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 210800 + }, + { + "epoch": 0.8023948904942793, + "grad_norm": 0.12212936580181122, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 210810 + }, + { + "epoch": 0.8024329529624019, + "grad_norm": 0.1293233036994934, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 210820 + }, + { + "epoch": 0.8024710154305246, + "grad_norm": 0.12779028713703156, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 210830 + }, + { + "epoch": 0.8025090778986472, + "grad_norm": 0.12210148572921753, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 210840 + }, + { + "epoch": 0.80254714036677, + "grad_norm": 0.13600996136665344, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 210850 + }, + { + "epoch": 0.8025852028348927, + "grad_norm": 0.11877809464931488, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 210860 + }, + { + "epoch": 0.8026232653030153, + "grad_norm": 0.1277652084827423, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 210870 + }, + { + "epoch": 0.802661327771138, + "grad_norm": 0.12291332334280014, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 210880 + }, + { + "epoch": 0.8026993902392606, + "grad_norm": 0.12449786812067032, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 210890 + }, + { + "epoch": 0.8027374527073834, + "grad_norm": 0.11884238570928574, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 210900 + }, + { + "epoch": 0.802775515175506, + "grad_norm": 0.1160527914762497, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 210910 + }, + { + "epoch": 0.8028135776436287, + "grad_norm": 0.11842243373394012, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 210920 + }, + { + "epoch": 0.8028516401117514, + "grad_norm": 0.13278530538082123, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 210930 + }, + { + "epoch": 0.8028897025798741, + "grad_norm": 0.13262298703193665, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 210940 + }, + { + "epoch": 0.8029277650479968, + "grad_norm": 0.1319943219423294, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 210950 + }, + { + "epoch": 0.8029658275161194, + "grad_norm": 0.12588515877723694, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 210960 + }, + { + "epoch": 0.8030038899842421, + "grad_norm": 0.1147276908159256, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 210970 + }, + { + "epoch": 0.8030419524523649, + "grad_norm": 0.11792852729558945, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 210980 + }, + { + "epoch": 0.8030800149204875, + "grad_norm": 0.12042088061571121, + "learning_rate": 0.0005, + "loss": 2.0852, + "step": 210990 + }, + { + "epoch": 0.8031180773886102, + "grad_norm": 0.11919981986284256, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 211000 + }, + { + "epoch": 0.8031561398567328, + "grad_norm": 0.13294468820095062, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 211010 + }, + { + "epoch": 0.8031942023248555, + "grad_norm": 0.13578881323337555, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 211020 + }, + { + "epoch": 0.8032322647929783, + "grad_norm": 0.13631047308444977, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 211030 + }, + { + "epoch": 0.8032703272611009, + "grad_norm": 0.12270741909742355, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 211040 + }, + { + "epoch": 0.8033083897292236, + "grad_norm": 0.14174945652484894, + "learning_rate": 0.0005, + "loss": 2.0858, + "step": 211050 + }, + { + "epoch": 0.8033464521973462, + "grad_norm": 0.12768588960170746, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 211060 + }, + { + "epoch": 0.803384514665469, + "grad_norm": 0.1305711418390274, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 211070 + }, + { + "epoch": 0.8034225771335917, + "grad_norm": 0.1230345070362091, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 211080 + }, + { + "epoch": 0.8034606396017143, + "grad_norm": 0.12930433452129364, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 211090 + }, + { + "epoch": 0.803498702069837, + "grad_norm": 0.14357741177082062, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 211100 + }, + { + "epoch": 0.8035367645379597, + "grad_norm": 0.12526747584342957, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 211110 + }, + { + "epoch": 0.8035748270060824, + "grad_norm": 0.15204723179340363, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 211120 + }, + { + "epoch": 0.8036128894742051, + "grad_norm": 0.13927868008613586, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 211130 + }, + { + "epoch": 0.8036509519423277, + "grad_norm": 0.12264429032802582, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 211140 + }, + { + "epoch": 0.8036890144104505, + "grad_norm": 0.12423133850097656, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 211150 + }, + { + "epoch": 0.8037270768785731, + "grad_norm": 0.1225658655166626, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 211160 + }, + { + "epoch": 0.8037651393466958, + "grad_norm": 0.12389955669641495, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 211170 + }, + { + "epoch": 0.8038032018148185, + "grad_norm": 0.1286647617816925, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 211180 + }, + { + "epoch": 0.8038412642829411, + "grad_norm": 0.12770266830921173, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 211190 + }, + { + "epoch": 0.8038793267510639, + "grad_norm": 0.1357760727405548, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 211200 + }, + { + "epoch": 0.8039173892191865, + "grad_norm": 0.12599506974220276, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 211210 + }, + { + "epoch": 0.8039554516873092, + "grad_norm": 0.11944464594125748, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 211220 + }, + { + "epoch": 0.8039935141554319, + "grad_norm": 0.12599098682403564, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 211230 + }, + { + "epoch": 0.8040315766235546, + "grad_norm": 0.13443703949451447, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 211240 + }, + { + "epoch": 0.8040696390916773, + "grad_norm": 0.138460174202919, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 211250 + }, + { + "epoch": 0.8041077015597999, + "grad_norm": 0.12518736720085144, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 211260 + }, + { + "epoch": 0.8041457640279226, + "grad_norm": 0.12137263268232346, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 211270 + }, + { + "epoch": 0.8041838264960454, + "grad_norm": 0.12049419432878494, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 211280 + }, + { + "epoch": 0.804221888964168, + "grad_norm": 0.12654536962509155, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 211290 + }, + { + "epoch": 0.8042599514322907, + "grad_norm": 0.12805117666721344, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 211300 + }, + { + "epoch": 0.8042980139004133, + "grad_norm": 0.12165186554193497, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 211310 + }, + { + "epoch": 0.804336076368536, + "grad_norm": 0.13391226530075073, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 211320 + }, + { + "epoch": 0.8043741388366588, + "grad_norm": 0.12267400324344635, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 211330 + }, + { + "epoch": 0.8044122013047814, + "grad_norm": 0.123236745595932, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 211340 + }, + { + "epoch": 0.8044502637729041, + "grad_norm": 0.12270749360322952, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 211350 + }, + { + "epoch": 0.8044883262410267, + "grad_norm": 0.11430325359106064, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 211360 + }, + { + "epoch": 0.8045263887091495, + "grad_norm": 0.12952934205532074, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 211370 + }, + { + "epoch": 0.8045644511772722, + "grad_norm": 0.12921790778636932, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 211380 + }, + { + "epoch": 0.8046025136453948, + "grad_norm": 0.13494722545146942, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 211390 + }, + { + "epoch": 0.8046405761135175, + "grad_norm": 0.13030272722244263, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 211400 + }, + { + "epoch": 0.8046786385816402, + "grad_norm": 0.1260061413049698, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 211410 + }, + { + "epoch": 0.8047167010497629, + "grad_norm": 0.1300874650478363, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 211420 + }, + { + "epoch": 0.8047547635178856, + "grad_norm": 0.11533726006746292, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 211430 + }, + { + "epoch": 0.8047928259860082, + "grad_norm": 0.12814481556415558, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 211440 + }, + { + "epoch": 0.8048308884541309, + "grad_norm": 0.11733753979206085, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 211450 + }, + { + "epoch": 0.8048689509222536, + "grad_norm": 0.14942340552806854, + "learning_rate": 0.0005, + "loss": 2.0851, + "step": 211460 + }, + { + "epoch": 0.8049070133903763, + "grad_norm": 0.11945247650146484, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 211470 + }, + { + "epoch": 0.804945075858499, + "grad_norm": 0.13306763768196106, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 211480 + }, + { + "epoch": 0.8049831383266216, + "grad_norm": 0.12273822724819183, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 211490 + }, + { + "epoch": 0.8050212007947444, + "grad_norm": 0.1341172754764557, + "learning_rate": 0.0005, + "loss": 2.0872, + "step": 211500 + }, + { + "epoch": 0.805059263262867, + "grad_norm": 0.118862085044384, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 211510 + }, + { + "epoch": 0.8050973257309897, + "grad_norm": 0.1313682496547699, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 211520 + }, + { + "epoch": 0.8051353881991123, + "grad_norm": 0.11379699409008026, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 211530 + }, + { + "epoch": 0.8051734506672351, + "grad_norm": 0.12442870438098907, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 211540 + }, + { + "epoch": 0.8052115131353578, + "grad_norm": 0.1156822144985199, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 211550 + }, + { + "epoch": 0.8052495756034804, + "grad_norm": 0.12452563643455505, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 211560 + }, + { + "epoch": 0.8052876380716031, + "grad_norm": 0.13099755346775055, + "learning_rate": 0.0005, + "loss": 2.1256, + "step": 211570 + }, + { + "epoch": 0.8053257005397259, + "grad_norm": 0.1255125254392624, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 211580 + }, + { + "epoch": 0.8053637630078485, + "grad_norm": 0.12978138029575348, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 211590 + }, + { + "epoch": 0.8054018254759712, + "grad_norm": 0.1305938959121704, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 211600 + }, + { + "epoch": 0.8054398879440938, + "grad_norm": 0.11762437969446182, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 211610 + }, + { + "epoch": 0.8054779504122165, + "grad_norm": 0.13285085558891296, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 211620 + }, + { + "epoch": 0.8055160128803392, + "grad_norm": 0.11013451218605042, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 211630 + }, + { + "epoch": 0.8055540753484619, + "grad_norm": 0.11815615743398666, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 211640 + }, + { + "epoch": 0.8055921378165846, + "grad_norm": 0.12628640234470367, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 211650 + }, + { + "epoch": 0.8056302002847072, + "grad_norm": 0.12378709763288498, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 211660 + }, + { + "epoch": 0.80566826275283, + "grad_norm": 0.13026103377342224, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 211670 + }, + { + "epoch": 0.8057063252209526, + "grad_norm": 0.13067007064819336, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 211680 + }, + { + "epoch": 0.8057443876890753, + "grad_norm": 0.13219596445560455, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 211690 + }, + { + "epoch": 0.805782450157198, + "grad_norm": 0.12373536080121994, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 211700 + }, + { + "epoch": 0.8058205126253207, + "grad_norm": 0.12243331968784332, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 211710 + }, + { + "epoch": 0.8058585750934434, + "grad_norm": 0.12543143332004547, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 211720 + }, + { + "epoch": 0.805896637561566, + "grad_norm": 0.12280245870351791, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 211730 + }, + { + "epoch": 0.8059347000296887, + "grad_norm": 0.12935133278369904, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 211740 + }, + { + "epoch": 0.8059727624978114, + "grad_norm": 0.12362467497587204, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 211750 + }, + { + "epoch": 0.8060108249659341, + "grad_norm": 0.12046709656715393, + "learning_rate": 0.0005, + "loss": 2.0875, + "step": 211760 + }, + { + "epoch": 0.8060488874340568, + "grad_norm": 0.12583065032958984, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 211770 + }, + { + "epoch": 0.8060869499021794, + "grad_norm": 0.13486357033252716, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 211780 + }, + { + "epoch": 0.8061250123703021, + "grad_norm": 0.11814618855714798, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 211790 + }, + { + "epoch": 0.8061630748384249, + "grad_norm": 0.12391425669193268, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 211800 + }, + { + "epoch": 0.8062011373065475, + "grad_norm": 0.12215106189250946, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 211810 + }, + { + "epoch": 0.8062391997746702, + "grad_norm": 0.13146349787712097, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 211820 + }, + { + "epoch": 0.8062772622427928, + "grad_norm": 0.12323244661092758, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 211830 + }, + { + "epoch": 0.8063153247109156, + "grad_norm": 0.11947532743215561, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 211840 + }, + { + "epoch": 0.8063533871790383, + "grad_norm": 0.13553132116794586, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 211850 + }, + { + "epoch": 0.8063914496471609, + "grad_norm": 0.1189856007695198, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 211860 + }, + { + "epoch": 0.8064295121152836, + "grad_norm": 0.12500151991844177, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 211870 + }, + { + "epoch": 0.8064675745834062, + "grad_norm": 0.11782065778970718, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 211880 + }, + { + "epoch": 0.806505637051529, + "grad_norm": 0.1276840716600418, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 211890 + }, + { + "epoch": 0.8065436995196517, + "grad_norm": 0.13151061534881592, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 211900 + }, + { + "epoch": 0.8065817619877743, + "grad_norm": 0.12591581046581268, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 211910 + }, + { + "epoch": 0.806619824455897, + "grad_norm": 0.13186302781105042, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 211920 + }, + { + "epoch": 0.8066578869240197, + "grad_norm": 0.1300540715456009, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 211930 + }, + { + "epoch": 0.8066959493921424, + "grad_norm": 0.12366016209125519, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 211940 + }, + { + "epoch": 0.806734011860265, + "grad_norm": 0.13649655878543854, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 211950 + }, + { + "epoch": 0.8067720743283877, + "grad_norm": 0.12333094328641891, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 211960 + }, + { + "epoch": 0.8068101367965105, + "grad_norm": 0.12637990713119507, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 211970 + }, + { + "epoch": 0.8068481992646331, + "grad_norm": 0.123201884329319, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 211980 + }, + { + "epoch": 0.8068862617327558, + "grad_norm": 0.14880193769931793, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 211990 + }, + { + "epoch": 0.8069243242008785, + "grad_norm": 0.11887034773826599, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 212000 + }, + { + "epoch": 0.8069623866690012, + "grad_norm": 0.11663667112588882, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 212010 + }, + { + "epoch": 0.8070004491371239, + "grad_norm": 0.1256750077009201, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 212020 + }, + { + "epoch": 0.8070385116052465, + "grad_norm": 0.12154050171375275, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 212030 + }, + { + "epoch": 0.8070765740733692, + "grad_norm": 0.12251798063516617, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 212040 + }, + { + "epoch": 0.8071146365414918, + "grad_norm": 0.12491164356470108, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 212050 + }, + { + "epoch": 0.8071526990096146, + "grad_norm": 0.11804839223623276, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 212060 + }, + { + "epoch": 0.8071907614777373, + "grad_norm": 0.11745022982358932, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 212070 + }, + { + "epoch": 0.8072288239458599, + "grad_norm": 0.12897460162639618, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 212080 + }, + { + "epoch": 0.8072668864139826, + "grad_norm": 0.12131928652524948, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 212090 + }, + { + "epoch": 0.8073049488821054, + "grad_norm": 0.12376783043146133, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 212100 + }, + { + "epoch": 0.807343011350228, + "grad_norm": 0.11986812204122543, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 212110 + }, + { + "epoch": 0.8073810738183507, + "grad_norm": 0.1264466643333435, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 212120 + }, + { + "epoch": 0.8074191362864733, + "grad_norm": 0.11758770048618317, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 212130 + }, + { + "epoch": 0.8074571987545961, + "grad_norm": 0.12155801057815552, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 212140 + }, + { + "epoch": 0.8074952612227188, + "grad_norm": 0.13400864601135254, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 212150 + }, + { + "epoch": 0.8075333236908414, + "grad_norm": 0.14201660454273224, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 212160 + }, + { + "epoch": 0.8075713861589641, + "grad_norm": 0.12833969295024872, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 212170 + }, + { + "epoch": 0.8076094486270867, + "grad_norm": 0.12091749161481857, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 212180 + }, + { + "epoch": 0.8076475110952095, + "grad_norm": 0.12184452265501022, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 212190 + }, + { + "epoch": 0.8076855735633321, + "grad_norm": 0.12771421670913696, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 212200 + }, + { + "epoch": 0.8077236360314548, + "grad_norm": 0.12905149161815643, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 212210 + }, + { + "epoch": 0.8077616984995775, + "grad_norm": 0.11964689195156097, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 212220 + }, + { + "epoch": 0.8077997609677002, + "grad_norm": 0.12076413631439209, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 212230 + }, + { + "epoch": 0.8078378234358229, + "grad_norm": 0.12492384761571884, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 212240 + }, + { + "epoch": 0.8078758859039455, + "grad_norm": 0.1287720799446106, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 212250 + }, + { + "epoch": 0.8079139483720682, + "grad_norm": 0.12003912776708603, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 212260 + }, + { + "epoch": 0.807952010840191, + "grad_norm": 0.12055928260087967, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 212270 + }, + { + "epoch": 0.8079900733083136, + "grad_norm": 0.12833811342716217, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 212280 + }, + { + "epoch": 0.8080281357764363, + "grad_norm": 0.13042432069778442, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 212290 + }, + { + "epoch": 0.8080661982445589, + "grad_norm": 0.13135427236557007, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 212300 + }, + { + "epoch": 0.8081042607126816, + "grad_norm": 0.13378044962882996, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 212310 + }, + { + "epoch": 0.8081423231808044, + "grad_norm": 0.13855749368667603, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 212320 + }, + { + "epoch": 0.808180385648927, + "grad_norm": 0.1329144835472107, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 212330 + }, + { + "epoch": 0.8082184481170497, + "grad_norm": 0.11958435922861099, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 212340 + }, + { + "epoch": 0.8082565105851723, + "grad_norm": 0.12032509595155716, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 212350 + }, + { + "epoch": 0.8082945730532951, + "grad_norm": 0.12137807160615921, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 212360 + }, + { + "epoch": 0.8083326355214178, + "grad_norm": 0.11983451247215271, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 212370 + }, + { + "epoch": 0.8083706979895404, + "grad_norm": 0.1348404586315155, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 212380 + }, + { + "epoch": 0.8084087604576631, + "grad_norm": 0.12229456007480621, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 212390 + }, + { + "epoch": 0.8084468229257858, + "grad_norm": 0.13219401240348816, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 212400 + }, + { + "epoch": 0.8084848853939085, + "grad_norm": 0.12037495523691177, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 212410 + }, + { + "epoch": 0.8085229478620312, + "grad_norm": 0.1404789388179779, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 212420 + }, + { + "epoch": 0.8085610103301538, + "grad_norm": 0.12103266268968582, + "learning_rate": 0.0005, + "loss": 2.0872, + "step": 212430 + }, + { + "epoch": 0.8085990727982766, + "grad_norm": 0.1356445997953415, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 212440 + }, + { + "epoch": 0.8086371352663992, + "grad_norm": 0.14060430228710175, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 212450 + }, + { + "epoch": 0.8086751977345219, + "grad_norm": 0.12964075803756714, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 212460 + }, + { + "epoch": 0.8087132602026446, + "grad_norm": 0.12052503228187561, + "learning_rate": 0.0005, + "loss": 2.1291, + "step": 212470 + }, + { + "epoch": 0.8087513226707672, + "grad_norm": 0.13579916954040527, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 212480 + }, + { + "epoch": 0.80878938513889, + "grad_norm": 0.12856526672840118, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 212490 + }, + { + "epoch": 0.8088274476070126, + "grad_norm": 0.12353648245334625, + "learning_rate": 0.0005, + "loss": 2.0864, + "step": 212500 + }, + { + "epoch": 0.8088655100751353, + "grad_norm": 0.128973588347435, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 212510 + }, + { + "epoch": 0.808903572543258, + "grad_norm": 0.12027554214000702, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 212520 + }, + { + "epoch": 0.8089416350113807, + "grad_norm": 0.1417223960161209, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 212530 + }, + { + "epoch": 0.8089796974795034, + "grad_norm": 0.11463826149702072, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 212540 + }, + { + "epoch": 0.809017759947626, + "grad_norm": 0.11450222134590149, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 212550 + }, + { + "epoch": 0.8090558224157487, + "grad_norm": 0.11749234795570374, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 212560 + }, + { + "epoch": 0.8090938848838715, + "grad_norm": 0.12792567908763885, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 212570 + }, + { + "epoch": 0.8091319473519941, + "grad_norm": 0.12287959456443787, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 212580 + }, + { + "epoch": 0.8091700098201168, + "grad_norm": 0.12542614340782166, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 212590 + }, + { + "epoch": 0.8092080722882394, + "grad_norm": 0.12899567186832428, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 212600 + }, + { + "epoch": 0.8092461347563621, + "grad_norm": 0.12323947995901108, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 212610 + }, + { + "epoch": 0.8092841972244849, + "grad_norm": 0.13235338032245636, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 212620 + }, + { + "epoch": 0.8093222596926075, + "grad_norm": 0.12860196828842163, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 212630 + }, + { + "epoch": 0.8093603221607302, + "grad_norm": 0.13267847895622253, + "learning_rate": 0.0005, + "loss": 2.0847, + "step": 212640 + }, + { + "epoch": 0.8093983846288528, + "grad_norm": 0.14531466364860535, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 212650 + }, + { + "epoch": 0.8094364470969756, + "grad_norm": 0.12676188349723816, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 212660 + }, + { + "epoch": 0.8094745095650983, + "grad_norm": 0.11910112202167511, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 212670 + }, + { + "epoch": 0.8095125720332209, + "grad_norm": 0.12504889070987701, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 212680 + }, + { + "epoch": 0.8095506345013436, + "grad_norm": 0.12771740555763245, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 212690 + }, + { + "epoch": 0.8095886969694663, + "grad_norm": 0.12403963506221771, + "learning_rate": 0.0005, + "loss": 2.1316, + "step": 212700 + }, + { + "epoch": 0.809626759437589, + "grad_norm": 0.11381793767213821, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 212710 + }, + { + "epoch": 0.8096648219057117, + "grad_norm": 0.13926367461681366, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 212720 + }, + { + "epoch": 0.8097028843738343, + "grad_norm": 0.14142242074012756, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 212730 + }, + { + "epoch": 0.809740946841957, + "grad_norm": 0.12207679450511932, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 212740 + }, + { + "epoch": 0.8097790093100797, + "grad_norm": 0.11419576406478882, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 212750 + }, + { + "epoch": 0.8098170717782024, + "grad_norm": 0.11438912153244019, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 212760 + }, + { + "epoch": 0.809855134246325, + "grad_norm": 0.12209003418684006, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 212770 + }, + { + "epoch": 0.8098931967144477, + "grad_norm": 0.1303257942199707, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 212780 + }, + { + "epoch": 0.8099312591825705, + "grad_norm": 0.12480229139328003, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 212790 + }, + { + "epoch": 0.8099693216506931, + "grad_norm": 0.13041506707668304, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 212800 + }, + { + "epoch": 0.8100073841188158, + "grad_norm": 0.1102275401353836, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 212810 + }, + { + "epoch": 0.8100454465869384, + "grad_norm": 0.11377348005771637, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 212820 + }, + { + "epoch": 0.8100835090550612, + "grad_norm": 0.13421009480953217, + "learning_rate": 0.0005, + "loss": 2.0827, + "step": 212830 + }, + { + "epoch": 0.8101215715231839, + "grad_norm": 0.13049618899822235, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 212840 + }, + { + "epoch": 0.8101596339913065, + "grad_norm": 0.1221061423420906, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 212850 + }, + { + "epoch": 0.8101976964594292, + "grad_norm": 0.13240858912467957, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 212860 + }, + { + "epoch": 0.810235758927552, + "grad_norm": 0.1204882338643074, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 212870 + }, + { + "epoch": 0.8102738213956746, + "grad_norm": 0.14405842125415802, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 212880 + }, + { + "epoch": 0.8103118838637973, + "grad_norm": 0.12688469886779785, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 212890 + }, + { + "epoch": 0.8103499463319199, + "grad_norm": 0.12993687391281128, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 212900 + }, + { + "epoch": 0.8103880088000426, + "grad_norm": 0.12682536244392395, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 212910 + }, + { + "epoch": 0.8104260712681653, + "grad_norm": 0.12129734456539154, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 212920 + }, + { + "epoch": 0.810464133736288, + "grad_norm": 0.11905720084905624, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 212930 + }, + { + "epoch": 0.8105021962044107, + "grad_norm": 0.1375799924135208, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 212940 + }, + { + "epoch": 0.8105402586725333, + "grad_norm": 0.13128814101219177, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 212950 + }, + { + "epoch": 0.8105783211406561, + "grad_norm": 0.12356577068567276, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 212960 + }, + { + "epoch": 0.8106163836087787, + "grad_norm": 0.12033846974372864, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 212970 + }, + { + "epoch": 0.8106544460769014, + "grad_norm": 0.12359870970249176, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 212980 + }, + { + "epoch": 0.8106925085450241, + "grad_norm": 0.12279484421014786, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 212990 + }, + { + "epoch": 0.8107305710131468, + "grad_norm": 0.11939160525798798, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 213000 + }, + { + "epoch": 0.8107686334812695, + "grad_norm": 0.12679126858711243, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 213010 + }, + { + "epoch": 0.8108066959493921, + "grad_norm": 0.12365018576383591, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 213020 + }, + { + "epoch": 0.8108447584175148, + "grad_norm": 0.12529915571212769, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 213030 + }, + { + "epoch": 0.8108828208856375, + "grad_norm": 0.13124340772628784, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 213040 + }, + { + "epoch": 0.8109208833537602, + "grad_norm": 0.1141396164894104, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 213050 + }, + { + "epoch": 0.8109589458218829, + "grad_norm": 0.12517715990543365, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 213060 + }, + { + "epoch": 0.8109970082900055, + "grad_norm": 0.1294000744819641, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 213070 + }, + { + "epoch": 0.8110350707581282, + "grad_norm": 0.13378405570983887, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 213080 + }, + { + "epoch": 0.811073133226251, + "grad_norm": 0.12976671755313873, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 213090 + }, + { + "epoch": 0.8111111956943736, + "grad_norm": 0.132253035902977, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 213100 + }, + { + "epoch": 0.8111492581624963, + "grad_norm": 0.13804452121257782, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 213110 + }, + { + "epoch": 0.8111873206306189, + "grad_norm": 0.12440338730812073, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 213120 + }, + { + "epoch": 0.8112253830987417, + "grad_norm": 0.14301490783691406, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 213130 + }, + { + "epoch": 0.8112634455668644, + "grad_norm": 0.13201947510242462, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 213140 + }, + { + "epoch": 0.811301508034987, + "grad_norm": 0.12042871117591858, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 213150 + }, + { + "epoch": 0.8113395705031097, + "grad_norm": 0.12254253029823303, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 213160 + }, + { + "epoch": 0.8113776329712323, + "grad_norm": 0.1616806983947754, + "learning_rate": 0.0005, + "loss": 2.0858, + "step": 213170 + }, + { + "epoch": 0.8114156954393551, + "grad_norm": 0.14708156883716583, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 213180 + }, + { + "epoch": 0.8114537579074778, + "grad_norm": 0.11895033717155457, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 213190 + }, + { + "epoch": 0.8114918203756004, + "grad_norm": 0.11877939850091934, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 213200 + }, + { + "epoch": 0.8115298828437231, + "grad_norm": 0.11637184768915176, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 213210 + }, + { + "epoch": 0.8115679453118458, + "grad_norm": 0.12218356132507324, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 213220 + }, + { + "epoch": 0.8116060077799685, + "grad_norm": 0.1255423128604889, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 213230 + }, + { + "epoch": 0.8116440702480912, + "grad_norm": 0.14560627937316895, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 213240 + }, + { + "epoch": 0.8116821327162138, + "grad_norm": 0.12048596888780594, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 213250 + }, + { + "epoch": 0.8117201951843366, + "grad_norm": 0.12892501056194305, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 213260 + }, + { + "epoch": 0.8117582576524592, + "grad_norm": 0.12159749120473862, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 213270 + }, + { + "epoch": 0.8117963201205819, + "grad_norm": 0.13641926646232605, + "learning_rate": 0.0005, + "loss": 2.0846, + "step": 213280 + }, + { + "epoch": 0.8118343825887046, + "grad_norm": 0.13524945080280304, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 213290 + }, + { + "epoch": 0.8118724450568273, + "grad_norm": 0.1168695017695427, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 213300 + }, + { + "epoch": 0.81191050752495, + "grad_norm": 0.1244073137640953, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 213310 + }, + { + "epoch": 0.8119485699930726, + "grad_norm": 0.11838249862194061, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 213320 + }, + { + "epoch": 0.8119866324611953, + "grad_norm": 0.11801798641681671, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 213330 + }, + { + "epoch": 0.812024694929318, + "grad_norm": 0.12498720735311508, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 213340 + }, + { + "epoch": 0.8120627573974407, + "grad_norm": 0.12367401272058487, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 213350 + }, + { + "epoch": 0.8121008198655634, + "grad_norm": 0.12564094364643097, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 213360 + }, + { + "epoch": 0.812138882333686, + "grad_norm": 0.12600673735141754, + "learning_rate": 0.0005, + "loss": 2.0857, + "step": 213370 + }, + { + "epoch": 0.8121769448018087, + "grad_norm": 0.13035088777542114, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 213380 + }, + { + "epoch": 0.8122150072699315, + "grad_norm": 0.1314198225736618, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 213390 + }, + { + "epoch": 0.8122530697380541, + "grad_norm": 0.1195560097694397, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 213400 + }, + { + "epoch": 0.8122911322061768, + "grad_norm": 0.12452965974807739, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 213410 + }, + { + "epoch": 0.8123291946742994, + "grad_norm": 0.7531483769416809, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 213420 + }, + { + "epoch": 0.8123672571424222, + "grad_norm": 0.1292552500963211, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 213430 + }, + { + "epoch": 0.8124053196105449, + "grad_norm": 0.1480943262577057, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 213440 + }, + { + "epoch": 0.8124433820786675, + "grad_norm": 0.12335444241762161, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 213450 + }, + { + "epoch": 0.8124814445467902, + "grad_norm": 0.13456949591636658, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 213460 + }, + { + "epoch": 0.8125195070149128, + "grad_norm": 0.12705455720424652, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 213470 + }, + { + "epoch": 0.8125575694830356, + "grad_norm": 0.1286783516407013, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 213480 + }, + { + "epoch": 0.8125956319511582, + "grad_norm": 0.12179796397686005, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 213490 + }, + { + "epoch": 0.8126336944192809, + "grad_norm": 0.12345118820667267, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 213500 + }, + { + "epoch": 0.8126717568874036, + "grad_norm": 0.11786182224750519, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 213510 + }, + { + "epoch": 0.8127098193555263, + "grad_norm": 0.12873464822769165, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 213520 + }, + { + "epoch": 0.812747881823649, + "grad_norm": 0.12281100451946259, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 213530 + }, + { + "epoch": 0.8127859442917716, + "grad_norm": 0.12193473428487778, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 213540 + }, + { + "epoch": 0.8128240067598943, + "grad_norm": 0.12629088759422302, + "learning_rate": 0.0005, + "loss": 2.087, + "step": 213550 + }, + { + "epoch": 0.8128620692280171, + "grad_norm": 0.13471092283725739, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 213560 + }, + { + "epoch": 0.8129001316961397, + "grad_norm": 0.13517582416534424, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 213570 + }, + { + "epoch": 0.8129381941642624, + "grad_norm": 0.11960075050592422, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 213580 + }, + { + "epoch": 0.812976256632385, + "grad_norm": 0.12184352427721024, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 213590 + }, + { + "epoch": 0.8130143191005077, + "grad_norm": 0.12594962120056152, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 213600 + }, + { + "epoch": 0.8130523815686305, + "grad_norm": 0.12507972121238708, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 213610 + }, + { + "epoch": 0.8130904440367531, + "grad_norm": 0.13315139710903168, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 213620 + }, + { + "epoch": 0.8131285065048758, + "grad_norm": 0.11814764887094498, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 213630 + }, + { + "epoch": 0.8131665689729984, + "grad_norm": 0.12203751504421234, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 213640 + }, + { + "epoch": 0.8132046314411212, + "grad_norm": 0.1237272098660469, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 213650 + }, + { + "epoch": 0.8132426939092439, + "grad_norm": 0.15392014384269714, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 213660 + }, + { + "epoch": 0.8132807563773665, + "grad_norm": 0.12798550724983215, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 213670 + }, + { + "epoch": 0.8133188188454892, + "grad_norm": 0.12292656302452087, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 213680 + }, + { + "epoch": 0.813356881313612, + "grad_norm": 0.14073209464550018, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 213690 + }, + { + "epoch": 0.8133949437817346, + "grad_norm": 0.1310732662677765, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 213700 + }, + { + "epoch": 0.8134330062498573, + "grad_norm": 0.14236874878406525, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 213710 + }, + { + "epoch": 0.8134710687179799, + "grad_norm": 0.12861856818199158, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 213720 + }, + { + "epoch": 0.8135091311861027, + "grad_norm": 0.1245727613568306, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 213730 + }, + { + "epoch": 0.8135471936542253, + "grad_norm": 0.11919999122619629, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 213740 + }, + { + "epoch": 0.813585256122348, + "grad_norm": 0.11863479018211365, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 213750 + }, + { + "epoch": 0.8136233185904707, + "grad_norm": 0.12423855811357498, + "learning_rate": 0.0005, + "loss": 2.0852, + "step": 213760 + }, + { + "epoch": 0.8136613810585933, + "grad_norm": 0.11683562397956848, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 213770 + }, + { + "epoch": 0.8136994435267161, + "grad_norm": 0.12702502310276031, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 213780 + }, + { + "epoch": 0.8137375059948387, + "grad_norm": 0.12645933032035828, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 213790 + }, + { + "epoch": 0.8137755684629614, + "grad_norm": 0.11896852403879166, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 213800 + }, + { + "epoch": 0.813813630931084, + "grad_norm": 0.13744334876537323, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 213810 + }, + { + "epoch": 0.8138516933992068, + "grad_norm": 0.1264820694923401, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 213820 + }, + { + "epoch": 0.8138897558673295, + "grad_norm": 0.13732865452766418, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 213830 + }, + { + "epoch": 0.8139278183354521, + "grad_norm": 0.12342477589845657, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 213840 + }, + { + "epoch": 0.8139658808035748, + "grad_norm": 0.12727709114551544, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 213850 + }, + { + "epoch": 0.8140039432716976, + "grad_norm": 0.11827078461647034, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 213860 + }, + { + "epoch": 0.8140420057398202, + "grad_norm": 0.12728027999401093, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 213870 + }, + { + "epoch": 0.8140800682079429, + "grad_norm": 0.13272863626480103, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 213880 + }, + { + "epoch": 0.8141181306760655, + "grad_norm": 0.13463446497917175, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 213890 + }, + { + "epoch": 0.8141561931441882, + "grad_norm": 0.12459887564182281, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 213900 + }, + { + "epoch": 0.814194255612311, + "grad_norm": 0.1231890395283699, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 213910 + }, + { + "epoch": 0.8142323180804336, + "grad_norm": 0.11568400263786316, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 213920 + }, + { + "epoch": 0.8142703805485563, + "grad_norm": 0.11822357028722763, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 213930 + }, + { + "epoch": 0.8143084430166789, + "grad_norm": 0.15082870423793793, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 213940 + }, + { + "epoch": 0.8143465054848017, + "grad_norm": 0.13176187872886658, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 213950 + }, + { + "epoch": 0.8143845679529244, + "grad_norm": 0.12025332450866699, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 213960 + }, + { + "epoch": 0.814422630421047, + "grad_norm": 0.1275869607925415, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 213970 + }, + { + "epoch": 0.8144606928891697, + "grad_norm": 0.12367784976959229, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 213980 + }, + { + "epoch": 0.8144987553572924, + "grad_norm": 0.14042086899280548, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 213990 + }, + { + "epoch": 0.8145368178254151, + "grad_norm": 0.13881553709506989, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 214000 + }, + { + "epoch": 0.8145748802935378, + "grad_norm": 0.1477145552635193, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 214010 + }, + { + "epoch": 0.8146129427616604, + "grad_norm": 0.11577677726745605, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 214020 + }, + { + "epoch": 0.8146510052297831, + "grad_norm": 0.12480149418115616, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 214030 + }, + { + "epoch": 0.8146890676979058, + "grad_norm": 0.13816551864147186, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 214040 + }, + { + "epoch": 0.8147271301660285, + "grad_norm": 0.1139967143535614, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 214050 + }, + { + "epoch": 0.8147651926341511, + "grad_norm": 0.11555656045675278, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 214060 + }, + { + "epoch": 0.8148032551022738, + "grad_norm": 0.12136835604906082, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 214070 + }, + { + "epoch": 0.8148413175703966, + "grad_norm": 0.13405480980873108, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 214080 + }, + { + "epoch": 0.8148793800385192, + "grad_norm": 0.1206866204738617, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 214090 + }, + { + "epoch": 0.8149174425066419, + "grad_norm": 0.13503244519233704, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 214100 + }, + { + "epoch": 0.8149555049747645, + "grad_norm": 0.12347983568906784, + "learning_rate": 0.0005, + "loss": 2.1223, + "step": 214110 + }, + { + "epoch": 0.8149935674428873, + "grad_norm": 0.13352620601654053, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 214120 + }, + { + "epoch": 0.81503162991101, + "grad_norm": 0.12844537198543549, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 214130 + }, + { + "epoch": 0.8150696923791326, + "grad_norm": 0.12517356872558594, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 214140 + }, + { + "epoch": 0.8151077548472553, + "grad_norm": 0.12121585011482239, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 214150 + }, + { + "epoch": 0.815145817315378, + "grad_norm": 0.12959855794906616, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 214160 + }, + { + "epoch": 0.8151838797835007, + "grad_norm": 0.12382923066616058, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 214170 + }, + { + "epoch": 0.8152219422516234, + "grad_norm": 0.12631931900978088, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 214180 + }, + { + "epoch": 0.815260004719746, + "grad_norm": 0.12381136417388916, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 214190 + }, + { + "epoch": 0.8152980671878687, + "grad_norm": 0.1158674955368042, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 214200 + }, + { + "epoch": 0.8153361296559914, + "grad_norm": 0.1376393884420395, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 214210 + }, + { + "epoch": 0.8153741921241141, + "grad_norm": 0.12953539192676544, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 214220 + }, + { + "epoch": 0.8154122545922368, + "grad_norm": 0.12734073400497437, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 214230 + }, + { + "epoch": 0.8154503170603594, + "grad_norm": 0.12529154121875763, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 214240 + }, + { + "epoch": 0.8154883795284822, + "grad_norm": 0.12359047681093216, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 214250 + }, + { + "epoch": 0.8155264419966048, + "grad_norm": 0.13195255398750305, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 214260 + }, + { + "epoch": 0.8155645044647275, + "grad_norm": 0.12542951107025146, + "learning_rate": 0.0005, + "loss": 2.0786, + "step": 214270 + }, + { + "epoch": 0.8156025669328502, + "grad_norm": 0.12773358821868896, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 214280 + }, + { + "epoch": 0.8156406294009729, + "grad_norm": 0.12585784494876862, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 214290 + }, + { + "epoch": 0.8156786918690956, + "grad_norm": 0.11464321613311768, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 214300 + }, + { + "epoch": 0.8157167543372182, + "grad_norm": 0.12004934996366501, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 214310 + }, + { + "epoch": 0.8157548168053409, + "grad_norm": 0.13350030779838562, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 214320 + }, + { + "epoch": 0.8157928792734636, + "grad_norm": 0.12730374932289124, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 214330 + }, + { + "epoch": 0.8158309417415863, + "grad_norm": 0.13901080191135406, + "learning_rate": 0.0005, + "loss": 2.0859, + "step": 214340 + }, + { + "epoch": 0.815869004209709, + "grad_norm": 0.1676304042339325, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 214350 + }, + { + "epoch": 0.8159070666778316, + "grad_norm": 0.13866017758846283, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 214360 + }, + { + "epoch": 0.8159451291459543, + "grad_norm": 0.13269653916358948, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 214370 + }, + { + "epoch": 0.8159831916140771, + "grad_norm": 0.12412741780281067, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 214380 + }, + { + "epoch": 0.8160212540821997, + "grad_norm": 0.11861774325370789, + "learning_rate": 0.0005, + "loss": 2.1235, + "step": 214390 + }, + { + "epoch": 0.8160593165503224, + "grad_norm": 0.12456288188695908, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 214400 + }, + { + "epoch": 0.816097379018445, + "grad_norm": 0.1261448860168457, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 214410 + }, + { + "epoch": 0.8161354414865678, + "grad_norm": 0.134896382689476, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 214420 + }, + { + "epoch": 0.8161735039546905, + "grad_norm": 0.14073701202869415, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 214430 + }, + { + "epoch": 0.8162115664228131, + "grad_norm": 0.12699000537395477, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 214440 + }, + { + "epoch": 0.8162496288909358, + "grad_norm": 0.12732018530368805, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 214450 + }, + { + "epoch": 0.8162876913590584, + "grad_norm": 0.1431722640991211, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 214460 + }, + { + "epoch": 0.8163257538271812, + "grad_norm": 0.1346866488456726, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 214470 + }, + { + "epoch": 0.8163638162953039, + "grad_norm": 0.14443494379520416, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 214480 + }, + { + "epoch": 0.8164018787634265, + "grad_norm": 0.12590277194976807, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 214490 + }, + { + "epoch": 0.8164399412315492, + "grad_norm": 0.11266959458589554, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 214500 + }, + { + "epoch": 0.8164780036996719, + "grad_norm": 0.12907174229621887, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 214510 + }, + { + "epoch": 0.8165160661677946, + "grad_norm": 0.13524018228054047, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 214520 + }, + { + "epoch": 0.8165541286359173, + "grad_norm": 0.12719334661960602, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 214530 + }, + { + "epoch": 0.8165921911040399, + "grad_norm": 0.12704965472221375, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 214540 + }, + { + "epoch": 0.8166302535721627, + "grad_norm": 0.11621110886335373, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 214550 + }, + { + "epoch": 0.8166683160402853, + "grad_norm": 0.1382187306880951, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 214560 + }, + { + "epoch": 0.816706378508408, + "grad_norm": 0.12210649996995926, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 214570 + }, + { + "epoch": 0.8167444409765307, + "grad_norm": 0.11293422430753708, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 214580 + }, + { + "epoch": 0.8167825034446534, + "grad_norm": 0.12237521260976791, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 214590 + }, + { + "epoch": 0.8168205659127761, + "grad_norm": 0.12014593183994293, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 214600 + }, + { + "epoch": 0.8168586283808987, + "grad_norm": 0.12235447764396667, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 214610 + }, + { + "epoch": 0.8168966908490214, + "grad_norm": 0.12861338257789612, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 214620 + }, + { + "epoch": 0.816934753317144, + "grad_norm": 0.12567169964313507, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 214630 + }, + { + "epoch": 0.8169728157852668, + "grad_norm": 0.24400149285793304, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 214640 + }, + { + "epoch": 0.8170108782533895, + "grad_norm": 0.12880581617355347, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 214650 + }, + { + "epoch": 0.8170489407215121, + "grad_norm": 0.11873859167098999, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 214660 + }, + { + "epoch": 0.8170870031896348, + "grad_norm": 0.12728740274906158, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 214670 + }, + { + "epoch": 0.8171250656577576, + "grad_norm": 0.11791323870420456, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 214680 + }, + { + "epoch": 0.8171631281258802, + "grad_norm": 0.1313634216785431, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 214690 + }, + { + "epoch": 0.8172011905940029, + "grad_norm": 0.13051530718803406, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 214700 + }, + { + "epoch": 0.8172392530621255, + "grad_norm": 0.13162212073802948, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 214710 + }, + { + "epoch": 0.8172773155302483, + "grad_norm": 0.1316559910774231, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 214720 + }, + { + "epoch": 0.817315377998371, + "grad_norm": 0.12329455465078354, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 214730 + }, + { + "epoch": 0.8173534404664936, + "grad_norm": 0.11354649811983109, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 214740 + }, + { + "epoch": 0.8173915029346163, + "grad_norm": 0.12798650562763214, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 214750 + }, + { + "epoch": 0.8174295654027389, + "grad_norm": 0.11971401423215866, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 214760 + }, + { + "epoch": 0.8174676278708617, + "grad_norm": 0.12153248488903046, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 214770 + }, + { + "epoch": 0.8175056903389843, + "grad_norm": 0.12790264189243317, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 214780 + }, + { + "epoch": 0.817543752807107, + "grad_norm": 0.11219383031129837, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 214790 + }, + { + "epoch": 0.8175818152752297, + "grad_norm": 0.12153936922550201, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 214800 + }, + { + "epoch": 0.8176198777433524, + "grad_norm": 0.13037075102329254, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 214810 + }, + { + "epoch": 0.8176579402114751, + "grad_norm": 0.15542103350162506, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 214820 + }, + { + "epoch": 0.8176960026795977, + "grad_norm": 0.13262136280536652, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 214830 + }, + { + "epoch": 0.8177340651477204, + "grad_norm": 0.1375369131565094, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 214840 + }, + { + "epoch": 0.8177721276158432, + "grad_norm": 0.11928694695234299, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 214850 + }, + { + "epoch": 0.8178101900839658, + "grad_norm": 0.11859259009361267, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 214860 + }, + { + "epoch": 0.8178482525520885, + "grad_norm": 0.11539218574762344, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 214870 + }, + { + "epoch": 0.8178863150202111, + "grad_norm": 0.11819108575582504, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 214880 + }, + { + "epoch": 0.8179243774883338, + "grad_norm": 0.14430873095989227, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 214890 + }, + { + "epoch": 0.8179624399564566, + "grad_norm": 0.12576286494731903, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 214900 + }, + { + "epoch": 0.8180005024245792, + "grad_norm": 0.12197782844305038, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 214910 + }, + { + "epoch": 0.8180385648927019, + "grad_norm": 0.12566708028316498, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 214920 + }, + { + "epoch": 0.8180766273608245, + "grad_norm": 0.12516872584819794, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 214930 + }, + { + "epoch": 0.8181146898289473, + "grad_norm": 0.13733117282390594, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 214940 + }, + { + "epoch": 0.81815275229707, + "grad_norm": 0.1253124177455902, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 214950 + }, + { + "epoch": 0.8181908147651926, + "grad_norm": 0.12638896703720093, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 214960 + }, + { + "epoch": 0.8182288772333153, + "grad_norm": 0.13121956586837769, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 214970 + }, + { + "epoch": 0.818266939701438, + "grad_norm": 0.1312199980020523, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 214980 + }, + { + "epoch": 0.8183050021695607, + "grad_norm": 0.12410897016525269, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 214990 + }, + { + "epoch": 0.8183430646376834, + "grad_norm": 0.12948821485042572, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 215000 + }, + { + "epoch": 0.818381127105806, + "grad_norm": 0.12356840074062347, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 215010 + }, + { + "epoch": 0.8184191895739288, + "grad_norm": 0.12674380838871002, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 215020 + }, + { + "epoch": 0.8184572520420514, + "grad_norm": 0.13833408057689667, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 215030 + }, + { + "epoch": 0.8184953145101741, + "grad_norm": 0.12283878773450851, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 215040 + }, + { + "epoch": 0.8185333769782968, + "grad_norm": 0.12343966960906982, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 215050 + }, + { + "epoch": 0.8185714394464194, + "grad_norm": 0.1294650286436081, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 215060 + }, + { + "epoch": 0.8186095019145422, + "grad_norm": 0.12338768690824509, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 215070 + }, + { + "epoch": 0.8186475643826648, + "grad_norm": 0.11877749860286713, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 215080 + }, + { + "epoch": 0.8186856268507875, + "grad_norm": 0.12078936398029327, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 215090 + }, + { + "epoch": 0.8187236893189102, + "grad_norm": 0.11437283456325531, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 215100 + }, + { + "epoch": 0.8187617517870329, + "grad_norm": 0.11495641618967056, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 215110 + }, + { + "epoch": 0.8187998142551556, + "grad_norm": 0.5764119029045105, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 215120 + }, + { + "epoch": 0.8188378767232782, + "grad_norm": 0.11969491839408875, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 215130 + }, + { + "epoch": 0.8188759391914009, + "grad_norm": 0.1191493421792984, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 215140 + }, + { + "epoch": 0.8189140016595237, + "grad_norm": 0.11786416918039322, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 215150 + }, + { + "epoch": 0.8189520641276463, + "grad_norm": 0.12184026092290878, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 215160 + }, + { + "epoch": 0.818990126595769, + "grad_norm": 0.13123203814029694, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 215170 + }, + { + "epoch": 0.8190281890638916, + "grad_norm": 0.12545669078826904, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 215180 + }, + { + "epoch": 0.8190662515320143, + "grad_norm": 0.14899766445159912, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 215190 + }, + { + "epoch": 0.8191043140001371, + "grad_norm": 0.12839815020561218, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 215200 + }, + { + "epoch": 0.8191423764682597, + "grad_norm": 0.12477117031812668, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 215210 + }, + { + "epoch": 0.8191804389363824, + "grad_norm": 0.1285984218120575, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 215220 + }, + { + "epoch": 0.819218501404505, + "grad_norm": 0.1194237545132637, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 215230 + }, + { + "epoch": 0.8192565638726278, + "grad_norm": 0.12809935212135315, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 215240 + }, + { + "epoch": 0.8192946263407505, + "grad_norm": 0.12358009070158005, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 215250 + }, + { + "epoch": 0.8193326888088731, + "grad_norm": 0.1358308643102646, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 215260 + }, + { + "epoch": 0.8193707512769958, + "grad_norm": 0.12565359473228455, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 215270 + }, + { + "epoch": 0.8194088137451185, + "grad_norm": 0.13406111299991608, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 215280 + }, + { + "epoch": 0.8194468762132412, + "grad_norm": 0.11594336479902267, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 215290 + }, + { + "epoch": 0.8194849386813639, + "grad_norm": 0.13356558978557587, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 215300 + }, + { + "epoch": 0.8195230011494865, + "grad_norm": 0.1209784597158432, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 215310 + }, + { + "epoch": 0.8195610636176093, + "grad_norm": 0.12455625832080841, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 215320 + }, + { + "epoch": 0.8195991260857319, + "grad_norm": 0.12635934352874756, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 215330 + }, + { + "epoch": 0.8196371885538546, + "grad_norm": 0.12177467346191406, + "learning_rate": 0.0005, + "loss": 2.087, + "step": 215340 + }, + { + "epoch": 0.8196752510219772, + "grad_norm": 0.1592981070280075, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 215350 + }, + { + "epoch": 0.8197133134900999, + "grad_norm": 0.13388434052467346, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 215360 + }, + { + "epoch": 0.8197513759582227, + "grad_norm": 0.11722065508365631, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 215370 + }, + { + "epoch": 0.8197894384263453, + "grad_norm": 0.1249929741024971, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 215380 + }, + { + "epoch": 0.819827500894468, + "grad_norm": 0.12349975854158401, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 215390 + }, + { + "epoch": 0.8198655633625906, + "grad_norm": 0.1266603320837021, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 215400 + }, + { + "epoch": 0.8199036258307134, + "grad_norm": 0.12294841557741165, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 215410 + }, + { + "epoch": 0.8199416882988361, + "grad_norm": 0.12204351276159286, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 215420 + }, + { + "epoch": 0.8199797507669587, + "grad_norm": 0.12130658328533173, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 215430 + }, + { + "epoch": 0.8200178132350814, + "grad_norm": 0.11593271046876907, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 215440 + }, + { + "epoch": 0.8200558757032042, + "grad_norm": 0.1328611522912979, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 215450 + }, + { + "epoch": 0.8200939381713268, + "grad_norm": 0.1163841113448143, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 215460 + }, + { + "epoch": 0.8201320006394495, + "grad_norm": 0.1402568370103836, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 215470 + }, + { + "epoch": 0.8201700631075721, + "grad_norm": 0.11890465766191483, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 215480 + }, + { + "epoch": 0.8202081255756948, + "grad_norm": 0.12252867221832275, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 215490 + }, + { + "epoch": 0.8202461880438175, + "grad_norm": 0.12809689342975616, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 215500 + }, + { + "epoch": 0.8202842505119402, + "grad_norm": 0.12146003544330597, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 215510 + }, + { + "epoch": 0.8203223129800629, + "grad_norm": 0.12264453619718552, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 215520 + }, + { + "epoch": 0.8203603754481855, + "grad_norm": 0.12639731168746948, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 215530 + }, + { + "epoch": 0.8203984379163083, + "grad_norm": 0.17479752004146576, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 215540 + }, + { + "epoch": 0.820436500384431, + "grad_norm": 0.11922673135995865, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 215550 + }, + { + "epoch": 0.8204745628525536, + "grad_norm": 0.11989131569862366, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 215560 + }, + { + "epoch": 0.8205126253206763, + "grad_norm": 0.10601154714822769, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 215570 + }, + { + "epoch": 0.820550687788799, + "grad_norm": 0.1295652985572815, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 215580 + }, + { + "epoch": 0.8205887502569217, + "grad_norm": 0.13028371334075928, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 215590 + }, + { + "epoch": 0.8206268127250443, + "grad_norm": 0.12443135678768158, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 215600 + }, + { + "epoch": 0.820664875193167, + "grad_norm": 0.13287542760372162, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 215610 + }, + { + "epoch": 0.8207029376612897, + "grad_norm": 0.13548590242862701, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 215620 + }, + { + "epoch": 0.8207410001294124, + "grad_norm": 0.11423881351947784, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 215630 + }, + { + "epoch": 0.8207790625975351, + "grad_norm": 0.15103867650032043, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 215640 + }, + { + "epoch": 0.8208171250656577, + "grad_norm": 0.1434904932975769, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 215650 + }, + { + "epoch": 0.8208551875337804, + "grad_norm": 0.12201616168022156, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 215660 + }, + { + "epoch": 0.8208932500019032, + "grad_norm": 0.11950390040874481, + "learning_rate": 0.0005, + "loss": 2.0888, + "step": 215670 + }, + { + "epoch": 0.8209313124700258, + "grad_norm": 0.18592995405197144, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 215680 + }, + { + "epoch": 0.8209693749381485, + "grad_norm": 0.13558170199394226, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 215690 + }, + { + "epoch": 0.8210074374062711, + "grad_norm": 0.12218789756298065, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 215700 + }, + { + "epoch": 0.8210454998743939, + "grad_norm": 0.12565554678440094, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 215710 + }, + { + "epoch": 0.8210835623425166, + "grad_norm": 0.1262756884098053, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 215720 + }, + { + "epoch": 0.8211216248106392, + "grad_norm": 0.12086983770132065, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 215730 + }, + { + "epoch": 0.8211596872787619, + "grad_norm": 0.11935533583164215, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 215740 + }, + { + "epoch": 0.8211977497468846, + "grad_norm": 0.12646500766277313, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 215750 + }, + { + "epoch": 0.8212358122150073, + "grad_norm": 0.12062659114599228, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 215760 + }, + { + "epoch": 0.82127387468313, + "grad_norm": 0.1336217075586319, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 215770 + }, + { + "epoch": 0.8213119371512526, + "grad_norm": 0.12084248661994934, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 215780 + }, + { + "epoch": 0.8213499996193753, + "grad_norm": 0.11078011244535446, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 215790 + }, + { + "epoch": 0.821388062087498, + "grad_norm": 0.11964815855026245, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 215800 + }, + { + "epoch": 0.8214261245556207, + "grad_norm": 0.11715470254421234, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 215810 + }, + { + "epoch": 0.8214641870237434, + "grad_norm": 0.133929044008255, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 215820 + }, + { + "epoch": 0.821502249491866, + "grad_norm": 0.127930149435997, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 215830 + }, + { + "epoch": 0.8215403119599888, + "grad_norm": 0.1305428445339203, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 215840 + }, + { + "epoch": 0.8215783744281114, + "grad_norm": 0.12557300925254822, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 215850 + }, + { + "epoch": 0.8216164368962341, + "grad_norm": 0.12656274437904358, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 215860 + }, + { + "epoch": 0.8216544993643567, + "grad_norm": 0.12123452126979828, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 215870 + }, + { + "epoch": 0.8216925618324795, + "grad_norm": 0.12982730567455292, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 215880 + }, + { + "epoch": 0.8217306243006022, + "grad_norm": 0.13296344876289368, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 215890 + }, + { + "epoch": 0.8217686867687248, + "grad_norm": 0.1178731918334961, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 215900 + }, + { + "epoch": 0.8218067492368475, + "grad_norm": 0.1229533925652504, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 215910 + }, + { + "epoch": 0.8218448117049701, + "grad_norm": 0.12563879787921906, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 215920 + }, + { + "epoch": 0.8218828741730929, + "grad_norm": 0.15010017156600952, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 215930 + }, + { + "epoch": 0.8219209366412156, + "grad_norm": 0.11918827891349792, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 215940 + }, + { + "epoch": 0.8219589991093382, + "grad_norm": 0.1260400414466858, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 215950 + }, + { + "epoch": 0.8219970615774609, + "grad_norm": 0.12422250956296921, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 215960 + }, + { + "epoch": 0.8220351240455837, + "grad_norm": 0.12583960592746735, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 215970 + }, + { + "epoch": 0.8220731865137063, + "grad_norm": 0.12483104318380356, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 215980 + }, + { + "epoch": 0.822111248981829, + "grad_norm": 0.11402330547571182, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 215990 + }, + { + "epoch": 0.8221493114499516, + "grad_norm": 0.1227409839630127, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 216000 + }, + { + "epoch": 0.8221873739180744, + "grad_norm": 0.12268001586198807, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 216010 + }, + { + "epoch": 0.822225436386197, + "grad_norm": 0.12708698213100433, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 216020 + }, + { + "epoch": 0.8222634988543197, + "grad_norm": 0.1555493175983429, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 216030 + }, + { + "epoch": 0.8223015613224424, + "grad_norm": 0.14136861264705658, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 216040 + }, + { + "epoch": 0.822339623790565, + "grad_norm": 0.12100417912006378, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 216050 + }, + { + "epoch": 0.8223776862586878, + "grad_norm": 0.11855873465538025, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 216060 + }, + { + "epoch": 0.8224157487268104, + "grad_norm": 0.12413900345563889, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 216070 + }, + { + "epoch": 0.8224538111949331, + "grad_norm": 0.12667207419872284, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 216080 + }, + { + "epoch": 0.8224918736630558, + "grad_norm": 0.1186310201883316, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 216090 + }, + { + "epoch": 0.8225299361311785, + "grad_norm": 0.11538801342248917, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 216100 + }, + { + "epoch": 0.8225679985993012, + "grad_norm": 0.12002494186162949, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 216110 + }, + { + "epoch": 0.8226060610674238, + "grad_norm": 0.12906166911125183, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 216120 + }, + { + "epoch": 0.8226441235355465, + "grad_norm": 0.1414247453212738, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 216130 + }, + { + "epoch": 0.8226821860036693, + "grad_norm": 0.1321527659893036, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 216140 + }, + { + "epoch": 0.8227202484717919, + "grad_norm": 0.12948259711265564, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 216150 + }, + { + "epoch": 0.8227583109399146, + "grad_norm": 0.12453103810548782, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 216160 + }, + { + "epoch": 0.8227963734080372, + "grad_norm": 0.11883663386106491, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 216170 + }, + { + "epoch": 0.82283443587616, + "grad_norm": 0.12888406217098236, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 216180 + }, + { + "epoch": 0.8228724983442827, + "grad_norm": 0.12209624797105789, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 216190 + }, + { + "epoch": 0.8229105608124053, + "grad_norm": 0.1349632740020752, + "learning_rate": 0.0005, + "loss": 2.0763, + "step": 216200 + }, + { + "epoch": 0.822948623280528, + "grad_norm": 0.12418237328529358, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 216210 + }, + { + "epoch": 0.8229866857486506, + "grad_norm": 0.1306927651166916, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 216220 + }, + { + "epoch": 0.8230247482167734, + "grad_norm": 0.1376977562904358, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 216230 + }, + { + "epoch": 0.8230628106848961, + "grad_norm": 0.11857961118221283, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 216240 + }, + { + "epoch": 0.8231008731530187, + "grad_norm": 0.11643446236848831, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 216250 + }, + { + "epoch": 0.8231389356211414, + "grad_norm": 0.1297149360179901, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 216260 + }, + { + "epoch": 0.8231769980892641, + "grad_norm": 0.13119326531887054, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 216270 + }, + { + "epoch": 0.8232150605573868, + "grad_norm": 0.11515495181083679, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 216280 + }, + { + "epoch": 0.8232531230255095, + "grad_norm": 0.1289171725511551, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 216290 + }, + { + "epoch": 0.8232911854936321, + "grad_norm": 0.12597043812274933, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 216300 + }, + { + "epoch": 0.8233292479617549, + "grad_norm": 0.11690404266119003, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 216310 + }, + { + "epoch": 0.8233673104298775, + "grad_norm": 0.12119033187627792, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 216320 + }, + { + "epoch": 0.8234053728980002, + "grad_norm": 0.14844262599945068, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 216330 + }, + { + "epoch": 0.8234434353661229, + "grad_norm": 0.11987555772066116, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 216340 + }, + { + "epoch": 0.8234814978342455, + "grad_norm": 0.13595780730247498, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 216350 + }, + { + "epoch": 0.8235195603023683, + "grad_norm": 0.11209575086832047, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 216360 + }, + { + "epoch": 0.8235576227704909, + "grad_norm": 0.12953747808933258, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 216370 + }, + { + "epoch": 0.8235956852386136, + "grad_norm": 0.12427164614200592, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 216380 + }, + { + "epoch": 0.8236337477067363, + "grad_norm": 0.12212230265140533, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 216390 + }, + { + "epoch": 0.823671810174859, + "grad_norm": 0.12323116511106491, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 216400 + }, + { + "epoch": 0.8237098726429817, + "grad_norm": 0.12646298110485077, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 216410 + }, + { + "epoch": 0.8237479351111043, + "grad_norm": 0.14122170209884644, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 216420 + }, + { + "epoch": 0.823785997579227, + "grad_norm": 0.1250144988298416, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 216430 + }, + { + "epoch": 0.8238240600473498, + "grad_norm": 0.12037564814090729, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 216440 + }, + { + "epoch": 0.8238621225154724, + "grad_norm": 0.12027294188737869, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 216450 + }, + { + "epoch": 0.8239001849835951, + "grad_norm": 0.13762728869915009, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 216460 + }, + { + "epoch": 0.8239382474517177, + "grad_norm": 0.12742702662944794, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 216470 + }, + { + "epoch": 0.8239763099198404, + "grad_norm": 0.1193777471780777, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 216480 + }, + { + "epoch": 0.8240143723879632, + "grad_norm": 0.11855217069387436, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 216490 + }, + { + "epoch": 0.8240524348560858, + "grad_norm": 0.12601156532764435, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 216500 + }, + { + "epoch": 0.8240904973242085, + "grad_norm": 0.13579390943050385, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 216510 + }, + { + "epoch": 0.8241285597923311, + "grad_norm": 0.12577751278877258, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 216520 + }, + { + "epoch": 0.8241666222604539, + "grad_norm": 0.1241578757762909, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 216530 + }, + { + "epoch": 0.8242046847285766, + "grad_norm": 0.11784977465867996, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 216540 + }, + { + "epoch": 0.8242427471966992, + "grad_norm": 0.12446720898151398, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 216550 + }, + { + "epoch": 0.8242808096648219, + "grad_norm": 0.10872805118560791, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 216560 + }, + { + "epoch": 0.8243188721329446, + "grad_norm": 0.1375352293252945, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 216570 + }, + { + "epoch": 0.8243569346010673, + "grad_norm": 0.12831570208072662, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 216580 + }, + { + "epoch": 0.82439499706919, + "grad_norm": 0.11933586746454239, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 216590 + }, + { + "epoch": 0.8244330595373126, + "grad_norm": 0.12159659713506699, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 216600 + }, + { + "epoch": 0.8244711220054354, + "grad_norm": 0.1195387914776802, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 216610 + }, + { + "epoch": 0.824509184473558, + "grad_norm": 0.11985363066196442, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 216620 + }, + { + "epoch": 0.8245472469416807, + "grad_norm": 0.1207309141755104, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 216630 + }, + { + "epoch": 0.8245853094098033, + "grad_norm": 0.1313052773475647, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 216640 + }, + { + "epoch": 0.824623371877926, + "grad_norm": 0.11789082735776901, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 216650 + }, + { + "epoch": 0.8246614343460488, + "grad_norm": 0.13953982293605804, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 216660 + }, + { + "epoch": 0.8246994968141714, + "grad_norm": 0.12093924731016159, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 216670 + }, + { + "epoch": 0.8247375592822941, + "grad_norm": 0.13901619613170624, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 216680 + }, + { + "epoch": 0.8247756217504167, + "grad_norm": 0.1243644654750824, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 216690 + }, + { + "epoch": 0.8248136842185395, + "grad_norm": 0.1316295564174652, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 216700 + }, + { + "epoch": 0.8248517466866622, + "grad_norm": 0.1418789029121399, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 216710 + }, + { + "epoch": 0.8248898091547848, + "grad_norm": 0.15817764401435852, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 216720 + }, + { + "epoch": 0.8249278716229075, + "grad_norm": 0.12203866243362427, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 216730 + }, + { + "epoch": 0.8249659340910303, + "grad_norm": 0.12293455749750137, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 216740 + }, + { + "epoch": 0.8250039965591529, + "grad_norm": 0.12383758276700974, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 216750 + }, + { + "epoch": 0.8250420590272756, + "grad_norm": 0.1415373533964157, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 216760 + }, + { + "epoch": 0.8250801214953982, + "grad_norm": 0.13275422155857086, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 216770 + }, + { + "epoch": 0.8251181839635209, + "grad_norm": 0.12756270170211792, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 216780 + }, + { + "epoch": 0.8251562464316436, + "grad_norm": 0.12516839802265167, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 216790 + }, + { + "epoch": 0.8251943088997663, + "grad_norm": 0.13058559596538544, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 216800 + }, + { + "epoch": 0.825232371367889, + "grad_norm": 0.1205880343914032, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 216810 + }, + { + "epoch": 0.8252704338360116, + "grad_norm": 0.12645703554153442, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 216820 + }, + { + "epoch": 0.8253084963041344, + "grad_norm": 0.12271429598331451, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 216830 + }, + { + "epoch": 0.825346558772257, + "grad_norm": 0.1316102296113968, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 216840 + }, + { + "epoch": 0.8253846212403797, + "grad_norm": 0.13087411224842072, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 216850 + }, + { + "epoch": 0.8254226837085024, + "grad_norm": 0.14741811156272888, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 216860 + }, + { + "epoch": 0.8254607461766251, + "grad_norm": 0.13531188666820526, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 216870 + }, + { + "epoch": 0.8254988086447478, + "grad_norm": 0.10500217229127884, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 216880 + }, + { + "epoch": 0.8255368711128704, + "grad_norm": 0.13994759321212769, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 216890 + }, + { + "epoch": 0.8255749335809931, + "grad_norm": 0.12008815258741379, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 216900 + }, + { + "epoch": 0.8256129960491158, + "grad_norm": 0.12398341298103333, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 216910 + }, + { + "epoch": 0.8256510585172385, + "grad_norm": 0.11703905463218689, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 216920 + }, + { + "epoch": 0.8256891209853612, + "grad_norm": 0.12600445747375488, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 216930 + }, + { + "epoch": 0.8257271834534838, + "grad_norm": 0.12901932001113892, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 216940 + }, + { + "epoch": 0.8257652459216065, + "grad_norm": 0.12366965413093567, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 216950 + }, + { + "epoch": 0.8258033083897293, + "grad_norm": 0.11896523088216782, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 216960 + }, + { + "epoch": 0.8258413708578519, + "grad_norm": 0.1275978684425354, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 216970 + }, + { + "epoch": 0.8258794333259746, + "grad_norm": 0.13192734122276306, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 216980 + }, + { + "epoch": 0.8259174957940972, + "grad_norm": 0.12553296983242035, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 216990 + }, + { + "epoch": 0.82595555826222, + "grad_norm": 0.1140354722738266, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 217000 + }, + { + "epoch": 0.8259936207303427, + "grad_norm": 0.12425722181797028, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 217010 + }, + { + "epoch": 0.8260316831984653, + "grad_norm": 0.12796053290367126, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 217020 + }, + { + "epoch": 0.826069745666588, + "grad_norm": 0.11599701642990112, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 217030 + }, + { + "epoch": 0.8261078081347107, + "grad_norm": 0.12334559112787247, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 217040 + }, + { + "epoch": 0.8261458706028334, + "grad_norm": 0.12353719025850296, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 217050 + }, + { + "epoch": 0.8261839330709561, + "grad_norm": 0.13022439181804657, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 217060 + }, + { + "epoch": 0.8262219955390787, + "grad_norm": 0.12294045090675354, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 217070 + }, + { + "epoch": 0.8262600580072014, + "grad_norm": 0.1288580745458603, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 217080 + }, + { + "epoch": 0.8262981204753241, + "grad_norm": 0.12367033958435059, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 217090 + }, + { + "epoch": 0.8263361829434468, + "grad_norm": 0.12451356649398804, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 217100 + }, + { + "epoch": 0.8263742454115695, + "grad_norm": 0.130989670753479, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 217110 + }, + { + "epoch": 0.8264123078796921, + "grad_norm": 0.12526611983776093, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 217120 + }, + { + "epoch": 0.8264503703478149, + "grad_norm": 0.12423757463693619, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 217130 + }, + { + "epoch": 0.8264884328159375, + "grad_norm": 0.12939000129699707, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 217140 + }, + { + "epoch": 0.8265264952840602, + "grad_norm": 0.13361996412277222, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 217150 + }, + { + "epoch": 0.8265645577521828, + "grad_norm": 0.13030046224594116, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 217160 + }, + { + "epoch": 0.8266026202203056, + "grad_norm": 0.11973579227924347, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 217170 + }, + { + "epoch": 0.8266406826884283, + "grad_norm": 0.13034309446811676, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 217180 + }, + { + "epoch": 0.8266787451565509, + "grad_norm": 0.12459006905555725, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 217190 + }, + { + "epoch": 0.8267168076246736, + "grad_norm": 0.12662822008132935, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 217200 + }, + { + "epoch": 0.8267548700927962, + "grad_norm": 0.1360860913991928, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 217210 + }, + { + "epoch": 0.826792932560919, + "grad_norm": 0.12168723344802856, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 217220 + }, + { + "epoch": 0.8268309950290417, + "grad_norm": 0.12574991583824158, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 217230 + }, + { + "epoch": 0.8268690574971643, + "grad_norm": 0.11954498291015625, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 217240 + }, + { + "epoch": 0.826907119965287, + "grad_norm": 0.11757103353738785, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 217250 + }, + { + "epoch": 0.8269451824334098, + "grad_norm": 0.11557068675756454, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 217260 + }, + { + "epoch": 0.8269832449015324, + "grad_norm": 0.12298137694597244, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 217270 + }, + { + "epoch": 0.8270213073696551, + "grad_norm": 0.1493585854768753, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 217280 + }, + { + "epoch": 0.8270593698377777, + "grad_norm": 0.12169773876667023, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 217290 + }, + { + "epoch": 0.8270974323059005, + "grad_norm": 0.11764802783727646, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 217300 + }, + { + "epoch": 0.8271354947740232, + "grad_norm": 0.11365995556116104, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 217310 + }, + { + "epoch": 0.8271735572421458, + "grad_norm": 0.1227889209985733, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 217320 + }, + { + "epoch": 0.8272116197102685, + "grad_norm": 0.12642797827720642, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 217330 + }, + { + "epoch": 0.8272496821783911, + "grad_norm": 0.1310013085603714, + "learning_rate": 0.0005, + "loss": 2.1281, + "step": 217340 + }, + { + "epoch": 0.8272877446465139, + "grad_norm": 0.12254031747579575, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 217350 + }, + { + "epoch": 0.8273258071146365, + "grad_norm": 0.1295580267906189, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 217360 + }, + { + "epoch": 0.8273638695827592, + "grad_norm": 0.1251552402973175, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 217370 + }, + { + "epoch": 0.8274019320508819, + "grad_norm": 0.13117572665214539, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 217380 + }, + { + "epoch": 0.8274399945190046, + "grad_norm": 0.14766137301921844, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 217390 + }, + { + "epoch": 0.8274780569871273, + "grad_norm": 0.1245618611574173, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 217400 + }, + { + "epoch": 0.8275161194552499, + "grad_norm": 0.12521414458751678, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 217410 + }, + { + "epoch": 0.8275541819233726, + "grad_norm": 0.1354859322309494, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 217420 + }, + { + "epoch": 0.8275922443914954, + "grad_norm": 0.1282625049352646, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 217430 + }, + { + "epoch": 0.827630306859618, + "grad_norm": 0.12341107428073883, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 217440 + }, + { + "epoch": 0.8276683693277407, + "grad_norm": 0.12624134123325348, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 217450 + }, + { + "epoch": 0.8277064317958633, + "grad_norm": 0.13044513761997223, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 217460 + }, + { + "epoch": 0.8277444942639861, + "grad_norm": 0.11772848665714264, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 217470 + }, + { + "epoch": 0.8277825567321088, + "grad_norm": 0.11918339878320694, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 217480 + }, + { + "epoch": 0.8278206192002314, + "grad_norm": 0.11408302932977676, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 217490 + }, + { + "epoch": 0.8278586816683541, + "grad_norm": 0.12150252610445023, + "learning_rate": 0.0005, + "loss": 2.0851, + "step": 217500 + }, + { + "epoch": 0.8278967441364767, + "grad_norm": 0.12904538214206696, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 217510 + }, + { + "epoch": 0.8279348066045995, + "grad_norm": 0.13957063853740692, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 217520 + }, + { + "epoch": 0.8279728690727222, + "grad_norm": 0.1259831190109253, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 217530 + }, + { + "epoch": 0.8280109315408448, + "grad_norm": 0.13127802312374115, + "learning_rate": 0.0005, + "loss": 2.13, + "step": 217540 + }, + { + "epoch": 0.8280489940089675, + "grad_norm": 0.12426311522722244, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 217550 + }, + { + "epoch": 0.8280870564770902, + "grad_norm": 0.116221584379673, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 217560 + }, + { + "epoch": 0.8281251189452129, + "grad_norm": 0.1291026920080185, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 217570 + }, + { + "epoch": 0.8281631814133356, + "grad_norm": 0.11487631499767303, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 217580 + }, + { + "epoch": 0.8282012438814582, + "grad_norm": 0.11711179465055466, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 217590 + }, + { + "epoch": 0.828239306349581, + "grad_norm": 0.132344588637352, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 217600 + }, + { + "epoch": 0.8282773688177036, + "grad_norm": 0.13396376371383667, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 217610 + }, + { + "epoch": 0.8283154312858263, + "grad_norm": 0.12230243533849716, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 217620 + }, + { + "epoch": 0.828353493753949, + "grad_norm": 0.1329038143157959, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 217630 + }, + { + "epoch": 0.8283915562220716, + "grad_norm": 0.12074906378984451, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 217640 + }, + { + "epoch": 0.8284296186901944, + "grad_norm": 0.12414073944091797, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 217650 + }, + { + "epoch": 0.828467681158317, + "grad_norm": 0.12632207572460175, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 217660 + }, + { + "epoch": 0.8285057436264397, + "grad_norm": 0.12833178043365479, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 217670 + }, + { + "epoch": 0.8285438060945624, + "grad_norm": 0.1254252791404724, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 217680 + }, + { + "epoch": 0.8285818685626851, + "grad_norm": 0.12022615224123001, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 217690 + }, + { + "epoch": 0.8286199310308078, + "grad_norm": 0.121268130838871, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 217700 + }, + { + "epoch": 0.8286579934989304, + "grad_norm": 0.12961791455745697, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 217710 + }, + { + "epoch": 0.8286960559670531, + "grad_norm": 0.1238061934709549, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 217720 + }, + { + "epoch": 0.8287341184351759, + "grad_norm": 0.12405844032764435, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 217730 + }, + { + "epoch": 0.8287721809032985, + "grad_norm": 0.11987213045358658, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 217740 + }, + { + "epoch": 0.8288102433714212, + "grad_norm": 0.1331670880317688, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 217750 + }, + { + "epoch": 0.8288483058395438, + "grad_norm": 0.11618906259536743, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 217760 + }, + { + "epoch": 0.8288863683076665, + "grad_norm": 0.12605607509613037, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 217770 + }, + { + "epoch": 0.8289244307757893, + "grad_norm": 0.12620078027248383, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 217780 + }, + { + "epoch": 0.8289624932439119, + "grad_norm": 0.12945833802223206, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 217790 + }, + { + "epoch": 0.8290005557120346, + "grad_norm": 0.13022899627685547, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 217800 + }, + { + "epoch": 0.8290386181801572, + "grad_norm": 0.13662181794643402, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 217810 + }, + { + "epoch": 0.82907668064828, + "grad_norm": 0.13192316889762878, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 217820 + }, + { + "epoch": 0.8291147431164027, + "grad_norm": 0.13144901394844055, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 217830 + }, + { + "epoch": 0.8291528055845253, + "grad_norm": 0.11243834346532822, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 217840 + }, + { + "epoch": 0.829190868052648, + "grad_norm": 0.1198180615901947, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 217850 + }, + { + "epoch": 0.8292289305207707, + "grad_norm": 0.11610034108161926, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 217860 + }, + { + "epoch": 0.8292669929888934, + "grad_norm": 0.13124071061611176, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 217870 + }, + { + "epoch": 0.829305055457016, + "grad_norm": 0.11900652945041656, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 217880 + }, + { + "epoch": 0.8293431179251387, + "grad_norm": 0.13047805428504944, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 217890 + }, + { + "epoch": 0.8293811803932615, + "grad_norm": 0.12463737279176712, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 217900 + }, + { + "epoch": 0.8294192428613841, + "grad_norm": 0.12787851691246033, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 217910 + }, + { + "epoch": 0.8294573053295068, + "grad_norm": 0.13656316697597504, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 217920 + }, + { + "epoch": 0.8294953677976294, + "grad_norm": 0.1297297179698944, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 217930 + }, + { + "epoch": 0.8295334302657521, + "grad_norm": 0.13846826553344727, + "learning_rate": 0.0005, + "loss": 2.0852, + "step": 217940 + }, + { + "epoch": 0.8295714927338749, + "grad_norm": 0.1337054818868637, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 217950 + }, + { + "epoch": 0.8296095552019975, + "grad_norm": 0.11899015307426453, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 217960 + }, + { + "epoch": 0.8296476176701202, + "grad_norm": 0.12474185228347778, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 217970 + }, + { + "epoch": 0.8296856801382428, + "grad_norm": 0.12763509154319763, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 217980 + }, + { + "epoch": 0.8297237426063656, + "grad_norm": 0.13220475614070892, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 217990 + }, + { + "epoch": 0.8297618050744883, + "grad_norm": 0.12977369129657745, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 218000 + }, + { + "epoch": 0.8297998675426109, + "grad_norm": 0.13516859710216522, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 218010 + }, + { + "epoch": 0.8298379300107336, + "grad_norm": 0.1203380823135376, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 218020 + }, + { + "epoch": 0.8298759924788564, + "grad_norm": 0.11484825611114502, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 218030 + }, + { + "epoch": 0.829914054946979, + "grad_norm": 0.13916315138339996, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 218040 + }, + { + "epoch": 0.8299521174151017, + "grad_norm": 0.16566237807273865, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 218050 + }, + { + "epoch": 0.8299901798832243, + "grad_norm": 0.1584080308675766, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 218060 + }, + { + "epoch": 0.830028242351347, + "grad_norm": 0.15324938297271729, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 218070 + }, + { + "epoch": 0.8300663048194697, + "grad_norm": 0.12328063696622849, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 218080 + }, + { + "epoch": 0.8301043672875924, + "grad_norm": 0.1370043009519577, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 218090 + }, + { + "epoch": 0.8301424297557151, + "grad_norm": 0.11810302734375, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 218100 + }, + { + "epoch": 0.8301804922238377, + "grad_norm": 0.12439907342195511, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 218110 + }, + { + "epoch": 0.8302185546919605, + "grad_norm": 0.12648242712020874, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 218120 + }, + { + "epoch": 0.8302566171600831, + "grad_norm": 0.13632889091968536, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 218130 + }, + { + "epoch": 0.8302946796282058, + "grad_norm": 0.13036006689071655, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 218140 + }, + { + "epoch": 0.8303327420963285, + "grad_norm": 0.12066423892974854, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 218150 + }, + { + "epoch": 0.8303708045644512, + "grad_norm": 0.138320654630661, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 218160 + }, + { + "epoch": 0.8304088670325739, + "grad_norm": 0.12363722175359726, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 218170 + }, + { + "epoch": 0.8304469295006965, + "grad_norm": 0.13549621403217316, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 218180 + }, + { + "epoch": 0.8304849919688192, + "grad_norm": 0.1275758296251297, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 218190 + }, + { + "epoch": 0.8305230544369419, + "grad_norm": 0.1225966215133667, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 218200 + }, + { + "epoch": 0.8305611169050646, + "grad_norm": 0.11592172086238861, + "learning_rate": 0.0005, + "loss": 2.077, + "step": 218210 + }, + { + "epoch": 0.8305991793731873, + "grad_norm": 0.12450257688760757, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 218220 + }, + { + "epoch": 0.8306372418413099, + "grad_norm": 0.20624670386314392, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 218230 + }, + { + "epoch": 0.8306753043094326, + "grad_norm": 0.12865829467773438, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 218240 + }, + { + "epoch": 0.8307133667775554, + "grad_norm": 0.12566210329532623, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 218250 + }, + { + "epoch": 0.830751429245678, + "grad_norm": 0.12799441814422607, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 218260 + }, + { + "epoch": 0.8307894917138007, + "grad_norm": 0.12885446846485138, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 218270 + }, + { + "epoch": 0.8308275541819233, + "grad_norm": 0.13233810663223267, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 218280 + }, + { + "epoch": 0.8308656166500461, + "grad_norm": 0.11711487174034119, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 218290 + }, + { + "epoch": 0.8309036791181688, + "grad_norm": 0.11240831017494202, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 218300 + }, + { + "epoch": 0.8309417415862914, + "grad_norm": 0.12619911134243011, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 218310 + }, + { + "epoch": 0.8309798040544141, + "grad_norm": 0.12383470684289932, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 218320 + }, + { + "epoch": 0.8310178665225368, + "grad_norm": 0.12004560232162476, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 218330 + }, + { + "epoch": 0.8310559289906595, + "grad_norm": 0.12753629684448242, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 218340 + }, + { + "epoch": 0.8310939914587822, + "grad_norm": 0.11608237028121948, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 218350 + }, + { + "epoch": 0.8311320539269048, + "grad_norm": 0.1277332901954651, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 218360 + }, + { + "epoch": 0.8311701163950275, + "grad_norm": 0.14176727831363678, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 218370 + }, + { + "epoch": 0.8312081788631502, + "grad_norm": 0.12776508927345276, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 218380 + }, + { + "epoch": 0.8312462413312729, + "grad_norm": 0.12426704168319702, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 218390 + }, + { + "epoch": 0.8312843037993956, + "grad_norm": 0.12347155809402466, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 218400 + }, + { + "epoch": 0.8313223662675182, + "grad_norm": 0.12837913632392883, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 218410 + }, + { + "epoch": 0.831360428735641, + "grad_norm": 0.12590497732162476, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 218420 + }, + { + "epoch": 0.8313984912037636, + "grad_norm": 0.11684665083885193, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 218430 + }, + { + "epoch": 0.8314365536718863, + "grad_norm": 0.1202918216586113, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 218440 + }, + { + "epoch": 0.831474616140009, + "grad_norm": 0.12684597074985504, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 218450 + }, + { + "epoch": 0.8315126786081317, + "grad_norm": 0.11629930138587952, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 218460 + }, + { + "epoch": 0.8315507410762544, + "grad_norm": 0.1290973573923111, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 218470 + }, + { + "epoch": 0.831588803544377, + "grad_norm": 0.11917988955974579, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 218480 + }, + { + "epoch": 0.8316268660124997, + "grad_norm": 0.11952853202819824, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 218490 + }, + { + "epoch": 0.8316649284806223, + "grad_norm": 0.1278664916753769, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 218500 + }, + { + "epoch": 0.8317029909487451, + "grad_norm": 0.12799476087093353, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 218510 + }, + { + "epoch": 0.8317410534168678, + "grad_norm": 0.17281503975391388, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 218520 + }, + { + "epoch": 0.8317791158849904, + "grad_norm": 0.13788138329982758, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 218530 + }, + { + "epoch": 0.8318171783531131, + "grad_norm": 0.12696509063243866, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 218540 + }, + { + "epoch": 0.8318552408212359, + "grad_norm": 0.12793564796447754, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 218550 + }, + { + "epoch": 0.8318933032893585, + "grad_norm": 0.2812157869338989, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 218560 + }, + { + "epoch": 0.8319313657574812, + "grad_norm": 0.11583702266216278, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 218570 + }, + { + "epoch": 0.8319694282256038, + "grad_norm": 0.12513552606105804, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 218580 + }, + { + "epoch": 0.8320074906937266, + "grad_norm": 0.12092099338769913, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 218590 + }, + { + "epoch": 0.8320455531618492, + "grad_norm": 0.14139620959758759, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 218600 + }, + { + "epoch": 0.8320836156299719, + "grad_norm": 0.12599727511405945, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 218610 + }, + { + "epoch": 0.8321216780980946, + "grad_norm": 0.11518718302249908, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 218620 + }, + { + "epoch": 0.8321597405662172, + "grad_norm": 0.1322387456893921, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 218630 + }, + { + "epoch": 0.83219780303434, + "grad_norm": 0.11819644272327423, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 218640 + }, + { + "epoch": 0.8322358655024626, + "grad_norm": 0.13278405368328094, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 218650 + }, + { + "epoch": 0.8322739279705853, + "grad_norm": 0.1323469579219818, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 218660 + }, + { + "epoch": 0.832311990438708, + "grad_norm": 0.11011801660060883, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 218670 + }, + { + "epoch": 0.8323500529068307, + "grad_norm": 0.11899255216121674, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 218680 + }, + { + "epoch": 0.8323881153749534, + "grad_norm": 0.12844981253147125, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 218690 + }, + { + "epoch": 0.832426177843076, + "grad_norm": 0.13125044107437134, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 218700 + }, + { + "epoch": 0.8324642403111987, + "grad_norm": 0.13138504326343536, + "learning_rate": 0.0005, + "loss": 2.0873, + "step": 218710 + }, + { + "epoch": 0.8325023027793215, + "grad_norm": 0.15032434463500977, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 218720 + }, + { + "epoch": 0.8325403652474441, + "grad_norm": 0.12644049525260925, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 218730 + }, + { + "epoch": 0.8325784277155668, + "grad_norm": 0.11857806891202927, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 218740 + }, + { + "epoch": 0.8326164901836894, + "grad_norm": 0.11589282751083374, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 218750 + }, + { + "epoch": 0.8326545526518122, + "grad_norm": 0.1254875361919403, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 218760 + }, + { + "epoch": 0.8326926151199349, + "grad_norm": 0.15716472268104553, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 218770 + }, + { + "epoch": 0.8327306775880575, + "grad_norm": 0.14979439973831177, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 218780 + }, + { + "epoch": 0.8327687400561802, + "grad_norm": 0.12744982540607452, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 218790 + }, + { + "epoch": 0.8328068025243028, + "grad_norm": 0.1272817850112915, + "learning_rate": 0.0005, + "loss": 2.0867, + "step": 218800 + }, + { + "epoch": 0.8328448649924256, + "grad_norm": 0.12373799085617065, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 218810 + }, + { + "epoch": 0.8328829274605483, + "grad_norm": 0.12223444879055023, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 218820 + }, + { + "epoch": 0.8329209899286709, + "grad_norm": 0.12802813947200775, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 218830 + }, + { + "epoch": 0.8329590523967936, + "grad_norm": 0.12781710922718048, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 218840 + }, + { + "epoch": 0.8329971148649163, + "grad_norm": 0.12687340378761292, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 218850 + }, + { + "epoch": 0.833035177333039, + "grad_norm": 0.12185090035200119, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 218860 + }, + { + "epoch": 0.8330732398011617, + "grad_norm": 0.1304478794336319, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 218870 + }, + { + "epoch": 0.8331113022692843, + "grad_norm": 0.13264945149421692, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 218880 + }, + { + "epoch": 0.8331493647374071, + "grad_norm": 0.1238386332988739, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 218890 + }, + { + "epoch": 0.8331874272055297, + "grad_norm": 0.12317591905593872, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 218900 + }, + { + "epoch": 0.8332254896736524, + "grad_norm": 0.12547940015792847, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 218910 + }, + { + "epoch": 0.833263552141775, + "grad_norm": 0.14949534833431244, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 218920 + }, + { + "epoch": 0.8333016146098977, + "grad_norm": 0.13050684332847595, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 218930 + }, + { + "epoch": 0.8333396770780205, + "grad_norm": 0.12255212664604187, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 218940 + }, + { + "epoch": 0.8333777395461431, + "grad_norm": 0.1258205771446228, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 218950 + }, + { + "epoch": 0.8334158020142658, + "grad_norm": 0.11713549494743347, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 218960 + }, + { + "epoch": 0.8334538644823885, + "grad_norm": 0.12944845855236053, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 218970 + }, + { + "epoch": 0.8334919269505112, + "grad_norm": 0.13849878311157227, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 218980 + }, + { + "epoch": 0.8335299894186339, + "grad_norm": 0.13154521584510803, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 218990 + }, + { + "epoch": 0.8335680518867565, + "grad_norm": 0.12196949869394302, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 219000 + }, + { + "epoch": 0.8336061143548792, + "grad_norm": 0.12521928548812866, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 219010 + }, + { + "epoch": 0.833644176823002, + "grad_norm": 0.12222328037023544, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 219020 + }, + { + "epoch": 0.8336822392911246, + "grad_norm": 0.12767820060253143, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 219030 + }, + { + "epoch": 0.8337203017592473, + "grad_norm": 0.15104414522647858, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 219040 + }, + { + "epoch": 0.8337583642273699, + "grad_norm": 0.14058572053909302, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 219050 + }, + { + "epoch": 0.8337964266954926, + "grad_norm": 0.18238197267055511, + "learning_rate": 0.0005, + "loss": 2.1265, + "step": 219060 + }, + { + "epoch": 0.8338344891636154, + "grad_norm": 0.12206801027059555, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 219070 + }, + { + "epoch": 0.833872551631738, + "grad_norm": 0.14061588048934937, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 219080 + }, + { + "epoch": 0.8339106140998607, + "grad_norm": 0.13219532370567322, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 219090 + }, + { + "epoch": 0.8339486765679833, + "grad_norm": 0.13353204727172852, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 219100 + }, + { + "epoch": 0.8339867390361061, + "grad_norm": 0.12455740571022034, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 219110 + }, + { + "epoch": 0.8340248015042288, + "grad_norm": 0.12448125332593918, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 219120 + }, + { + "epoch": 0.8340628639723514, + "grad_norm": 0.12801167368888855, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 219130 + }, + { + "epoch": 0.8341009264404741, + "grad_norm": 0.12430154532194138, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 219140 + }, + { + "epoch": 0.8341389889085968, + "grad_norm": 0.13335710763931274, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 219150 + }, + { + "epoch": 0.8341770513767195, + "grad_norm": 0.12447947263717651, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 219160 + }, + { + "epoch": 0.8342151138448421, + "grad_norm": 0.1202947348356247, + "learning_rate": 0.0005, + "loss": 2.0845, + "step": 219170 + }, + { + "epoch": 0.8342531763129648, + "grad_norm": 0.12497258931398392, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 219180 + }, + { + "epoch": 0.8342912387810876, + "grad_norm": 0.12731964886188507, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 219190 + }, + { + "epoch": 0.8343293012492102, + "grad_norm": 0.11766664683818817, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 219200 + }, + { + "epoch": 0.8343673637173329, + "grad_norm": 0.11760242283344269, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 219210 + }, + { + "epoch": 0.8344054261854555, + "grad_norm": 0.12806595861911774, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 219220 + }, + { + "epoch": 0.8344434886535782, + "grad_norm": 0.1269293874502182, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 219230 + }, + { + "epoch": 0.834481551121701, + "grad_norm": 0.11925157159566879, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 219240 + }, + { + "epoch": 0.8345196135898236, + "grad_norm": 0.14136989414691925, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 219250 + }, + { + "epoch": 0.8345576760579463, + "grad_norm": 0.12632125616073608, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 219260 + }, + { + "epoch": 0.8345957385260689, + "grad_norm": 0.12249311804771423, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 219270 + }, + { + "epoch": 0.8346338009941917, + "grad_norm": 0.13576222956180573, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 219280 + }, + { + "epoch": 0.8346718634623144, + "grad_norm": 0.35710614919662476, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 219290 + }, + { + "epoch": 0.834709925930437, + "grad_norm": 0.11858486384153366, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 219300 + }, + { + "epoch": 0.8347479883985597, + "grad_norm": 0.11976990848779678, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 219310 + }, + { + "epoch": 0.8347860508666824, + "grad_norm": 0.12474948912858963, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 219320 + }, + { + "epoch": 0.8348241133348051, + "grad_norm": 0.1316811442375183, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 219330 + }, + { + "epoch": 0.8348621758029278, + "grad_norm": 0.11638589948415756, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 219340 + }, + { + "epoch": 0.8349002382710504, + "grad_norm": 0.12653575837612152, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 219350 + }, + { + "epoch": 0.8349383007391731, + "grad_norm": 0.1143994927406311, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 219360 + }, + { + "epoch": 0.8349763632072958, + "grad_norm": 0.1304251253604889, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 219370 + }, + { + "epoch": 0.8350144256754185, + "grad_norm": 0.14210690557956696, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 219380 + }, + { + "epoch": 0.8350524881435412, + "grad_norm": 0.1346653550863266, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 219390 + }, + { + "epoch": 0.8350905506116638, + "grad_norm": 0.12486615777015686, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 219400 + }, + { + "epoch": 0.8351286130797866, + "grad_norm": 0.1316107213497162, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 219410 + }, + { + "epoch": 0.8351666755479092, + "grad_norm": 0.12303043156862259, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 219420 + }, + { + "epoch": 0.8352047380160319, + "grad_norm": 0.17756234109401703, + "learning_rate": 0.0005, + "loss": 2.0772, + "step": 219430 + }, + { + "epoch": 0.8352428004841546, + "grad_norm": 0.11724088340997696, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 219440 + }, + { + "epoch": 0.8352808629522773, + "grad_norm": 0.1304939091205597, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 219450 + }, + { + "epoch": 0.8353189254204, + "grad_norm": 0.13139232993125916, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 219460 + }, + { + "epoch": 0.8353569878885226, + "grad_norm": 0.14084558188915253, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 219470 + }, + { + "epoch": 0.8353950503566453, + "grad_norm": 0.12776871025562286, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 219480 + }, + { + "epoch": 0.835433112824768, + "grad_norm": 0.12956500053405762, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 219490 + }, + { + "epoch": 0.8354711752928907, + "grad_norm": 0.12888874113559723, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 219500 + }, + { + "epoch": 0.8355092377610134, + "grad_norm": 0.1196303591132164, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 219510 + }, + { + "epoch": 0.835547300229136, + "grad_norm": 0.1274517923593521, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 219520 + }, + { + "epoch": 0.8355853626972587, + "grad_norm": 0.12661273777484894, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 219530 + }, + { + "epoch": 0.8356234251653815, + "grad_norm": 0.11761227250099182, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 219540 + }, + { + "epoch": 0.8356614876335041, + "grad_norm": 0.1256171017885208, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 219550 + }, + { + "epoch": 0.8356995501016268, + "grad_norm": 0.11300463229417801, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 219560 + }, + { + "epoch": 0.8357376125697494, + "grad_norm": 0.11957667022943497, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 219570 + }, + { + "epoch": 0.8357756750378722, + "grad_norm": 0.11883150041103363, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 219580 + }, + { + "epoch": 0.8358137375059949, + "grad_norm": 0.13056206703186035, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 219590 + }, + { + "epoch": 0.8358517999741175, + "grad_norm": 0.13161806762218475, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 219600 + }, + { + "epoch": 0.8358898624422402, + "grad_norm": 0.4031476378440857, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 219610 + }, + { + "epoch": 0.8359279249103629, + "grad_norm": 0.14822180569171906, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 219620 + }, + { + "epoch": 0.8359659873784856, + "grad_norm": 0.12903235852718353, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 219630 + }, + { + "epoch": 0.8360040498466083, + "grad_norm": 0.12346906960010529, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 219640 + }, + { + "epoch": 0.8360421123147309, + "grad_norm": 0.127950057387352, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 219650 + }, + { + "epoch": 0.8360801747828536, + "grad_norm": 0.12633047997951508, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 219660 + }, + { + "epoch": 0.8361182372509763, + "grad_norm": 0.1212027296423912, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 219670 + }, + { + "epoch": 0.836156299719099, + "grad_norm": 0.11412563174962997, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 219680 + }, + { + "epoch": 0.8361943621872217, + "grad_norm": 0.13156822323799133, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 219690 + }, + { + "epoch": 0.8362324246553443, + "grad_norm": 0.11729413270950317, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 219700 + }, + { + "epoch": 0.8362704871234671, + "grad_norm": 0.12015419453382492, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 219710 + }, + { + "epoch": 0.8363085495915897, + "grad_norm": 0.11273893713951111, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 219720 + }, + { + "epoch": 0.8363466120597124, + "grad_norm": 0.1223645731806755, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 219730 + }, + { + "epoch": 0.836384674527835, + "grad_norm": 0.15054258704185486, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 219740 + }, + { + "epoch": 0.8364227369959578, + "grad_norm": 0.11842544376850128, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 219750 + }, + { + "epoch": 0.8364607994640805, + "grad_norm": 0.12672503292560577, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 219760 + }, + { + "epoch": 0.8364988619322031, + "grad_norm": 0.1286192536354065, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 219770 + }, + { + "epoch": 0.8365369244003258, + "grad_norm": 0.12729696929454803, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 219780 + }, + { + "epoch": 0.8365749868684484, + "grad_norm": 0.13377317786216736, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 219790 + }, + { + "epoch": 0.8366130493365712, + "grad_norm": 0.12418297678232193, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 219800 + }, + { + "epoch": 0.8366511118046939, + "grad_norm": 0.12871158123016357, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 219810 + }, + { + "epoch": 0.8366891742728165, + "grad_norm": 0.12255019694566727, + "learning_rate": 0.0005, + "loss": 2.0834, + "step": 219820 + }, + { + "epoch": 0.8367272367409392, + "grad_norm": 0.12640362977981567, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 219830 + }, + { + "epoch": 0.836765299209062, + "grad_norm": 0.13552077114582062, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 219840 + }, + { + "epoch": 0.8368033616771846, + "grad_norm": 0.11782406270503998, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 219850 + }, + { + "epoch": 0.8368414241453073, + "grad_norm": 0.13368447124958038, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 219860 + }, + { + "epoch": 0.8368794866134299, + "grad_norm": 0.13141457736492157, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 219870 + }, + { + "epoch": 0.8369175490815527, + "grad_norm": 0.12528492510318756, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 219880 + }, + { + "epoch": 0.8369556115496753, + "grad_norm": 0.1313919723033905, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 219890 + }, + { + "epoch": 0.836993674017798, + "grad_norm": 0.1197948232293129, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 219900 + }, + { + "epoch": 0.8370317364859207, + "grad_norm": 0.12959925830364227, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 219910 + }, + { + "epoch": 0.8370697989540434, + "grad_norm": 0.12254883348941803, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 219920 + }, + { + "epoch": 0.8371078614221661, + "grad_norm": 0.12791576981544495, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 219930 + }, + { + "epoch": 0.8371459238902887, + "grad_norm": 0.12873512506484985, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 219940 + }, + { + "epoch": 0.8371839863584114, + "grad_norm": 0.12622743844985962, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 219950 + }, + { + "epoch": 0.8372220488265341, + "grad_norm": 0.1307528167963028, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 219960 + }, + { + "epoch": 0.8372601112946568, + "grad_norm": 0.11617814004421234, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 219970 + }, + { + "epoch": 0.8372981737627795, + "grad_norm": 0.12317516654729843, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 219980 + }, + { + "epoch": 0.8373362362309021, + "grad_norm": 0.11890271306037903, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 219990 + }, + { + "epoch": 0.8373742986990248, + "grad_norm": 0.1206265315413475, + "learning_rate": 0.0005, + "loss": 2.0789, + "step": 220000 + }, + { + "epoch": 0.8374123611671476, + "grad_norm": 0.12651634216308594, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 220010 + }, + { + "epoch": 0.8374504236352702, + "grad_norm": 0.11712469160556793, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 220020 + }, + { + "epoch": 0.8374884861033929, + "grad_norm": 0.1293676495552063, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 220030 + }, + { + "epoch": 0.8375265485715155, + "grad_norm": 0.13680680096149445, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 220040 + }, + { + "epoch": 0.8375646110396383, + "grad_norm": 0.11812064796686172, + "learning_rate": 0.0005, + "loss": 2.0852, + "step": 220050 + }, + { + "epoch": 0.837602673507761, + "grad_norm": 0.12090223282575607, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 220060 + }, + { + "epoch": 0.8376407359758836, + "grad_norm": 0.13474848866462708, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 220070 + }, + { + "epoch": 0.8376787984440063, + "grad_norm": 0.13077902793884277, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 220080 + }, + { + "epoch": 0.8377168609121289, + "grad_norm": 0.13111700117588043, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 220090 + }, + { + "epoch": 0.8377549233802517, + "grad_norm": 0.13522939383983612, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 220100 + }, + { + "epoch": 0.8377929858483744, + "grad_norm": 0.11386032402515411, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 220110 + }, + { + "epoch": 0.837831048316497, + "grad_norm": 0.12607534229755402, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 220120 + }, + { + "epoch": 0.8378691107846197, + "grad_norm": 0.12744760513305664, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 220130 + }, + { + "epoch": 0.8379071732527424, + "grad_norm": 0.12559397518634796, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 220140 + }, + { + "epoch": 0.8379452357208651, + "grad_norm": 0.1274642050266266, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 220150 + }, + { + "epoch": 0.8379832981889878, + "grad_norm": 0.12040489912033081, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 220160 + }, + { + "epoch": 0.8380213606571104, + "grad_norm": 0.1237734705209732, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 220170 + }, + { + "epoch": 0.8380594231252332, + "grad_norm": 0.13057562708854675, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 220180 + }, + { + "epoch": 0.8380974855933558, + "grad_norm": 0.1266573965549469, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 220190 + }, + { + "epoch": 0.8381355480614785, + "grad_norm": 0.13707292079925537, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 220200 + }, + { + "epoch": 0.8381736105296012, + "grad_norm": 0.12204498797655106, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 220210 + }, + { + "epoch": 0.8382116729977238, + "grad_norm": 0.1297737956047058, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 220220 + }, + { + "epoch": 0.8382497354658466, + "grad_norm": 0.11896966397762299, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 220230 + }, + { + "epoch": 0.8382877979339692, + "grad_norm": 0.12746648490428925, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 220240 + }, + { + "epoch": 0.8383258604020919, + "grad_norm": 0.13134010136127472, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 220250 + }, + { + "epoch": 0.8383639228702146, + "grad_norm": 0.1297721415758133, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 220260 + }, + { + "epoch": 0.8384019853383373, + "grad_norm": 0.11566124856472015, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 220270 + }, + { + "epoch": 0.83844004780646, + "grad_norm": 0.1283719837665558, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 220280 + }, + { + "epoch": 0.8384781102745826, + "grad_norm": 0.13768593966960907, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 220290 + }, + { + "epoch": 0.8385161727427053, + "grad_norm": 0.13208796083927155, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 220300 + }, + { + "epoch": 0.8385542352108281, + "grad_norm": 0.15101394057273865, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 220310 + }, + { + "epoch": 0.8385922976789507, + "grad_norm": 0.12248805165290833, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 220320 + }, + { + "epoch": 0.8386303601470734, + "grad_norm": 0.1368759274482727, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 220330 + }, + { + "epoch": 0.838668422615196, + "grad_norm": 0.12396056950092316, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 220340 + }, + { + "epoch": 0.8387064850833188, + "grad_norm": 0.12895584106445312, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 220350 + }, + { + "epoch": 0.8387445475514415, + "grad_norm": 0.140159010887146, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 220360 + }, + { + "epoch": 0.8387826100195641, + "grad_norm": 0.12067504972219467, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 220370 + }, + { + "epoch": 0.8388206724876868, + "grad_norm": 0.11043280363082886, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 220380 + }, + { + "epoch": 0.8388587349558094, + "grad_norm": 0.12122703343629837, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 220390 + }, + { + "epoch": 0.8388967974239322, + "grad_norm": 0.11654380708932877, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 220400 + }, + { + "epoch": 0.8389348598920549, + "grad_norm": 0.12953002750873566, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 220410 + }, + { + "epoch": 0.8389729223601775, + "grad_norm": 0.12626194953918457, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 220420 + }, + { + "epoch": 0.8390109848283002, + "grad_norm": 0.1339138299226761, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 220430 + }, + { + "epoch": 0.8390490472964229, + "grad_norm": 0.12890183925628662, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 220440 + }, + { + "epoch": 0.8390871097645456, + "grad_norm": 0.12708349525928497, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 220450 + }, + { + "epoch": 0.8391251722326682, + "grad_norm": 0.12480761110782623, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 220460 + }, + { + "epoch": 0.8391632347007909, + "grad_norm": 0.11401735246181488, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 220470 + }, + { + "epoch": 0.8392012971689137, + "grad_norm": 0.12634459137916565, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 220480 + }, + { + "epoch": 0.8392393596370363, + "grad_norm": 0.11847994476556778, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 220490 + }, + { + "epoch": 0.839277422105159, + "grad_norm": 0.1256241649389267, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 220500 + }, + { + "epoch": 0.8393154845732816, + "grad_norm": 0.12276550382375717, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 220510 + }, + { + "epoch": 0.8393535470414043, + "grad_norm": 0.13058394193649292, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 220520 + }, + { + "epoch": 0.8393916095095271, + "grad_norm": 0.13051599264144897, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 220530 + }, + { + "epoch": 0.8394296719776497, + "grad_norm": 0.12998336553573608, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 220540 + }, + { + "epoch": 0.8394677344457724, + "grad_norm": 0.12816651165485382, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 220550 + }, + { + "epoch": 0.839505796913895, + "grad_norm": 0.12044844776391983, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 220560 + }, + { + "epoch": 0.8395438593820178, + "grad_norm": 0.13804836571216583, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 220570 + }, + { + "epoch": 0.8395819218501405, + "grad_norm": 0.12196008116006851, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 220580 + }, + { + "epoch": 0.8396199843182631, + "grad_norm": 0.127668097615242, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 220590 + }, + { + "epoch": 0.8396580467863858, + "grad_norm": 0.13133740425109863, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 220600 + }, + { + "epoch": 0.8396961092545085, + "grad_norm": 0.13879632949829102, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 220610 + }, + { + "epoch": 0.8397341717226312, + "grad_norm": 0.12597687542438507, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 220620 + }, + { + "epoch": 0.8397722341907539, + "grad_norm": 0.1211690902709961, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 220630 + }, + { + "epoch": 0.8398102966588765, + "grad_norm": 0.13215793669223785, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 220640 + }, + { + "epoch": 0.8398483591269992, + "grad_norm": 0.1430741250514984, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 220650 + }, + { + "epoch": 0.839886421595122, + "grad_norm": 0.1376802921295166, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 220660 + }, + { + "epoch": 0.8399244840632446, + "grad_norm": 0.12379004806280136, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 220670 + }, + { + "epoch": 0.8399625465313673, + "grad_norm": 0.12066813558340073, + "learning_rate": 0.0005, + "loss": 2.0853, + "step": 220680 + }, + { + "epoch": 0.8400006089994899, + "grad_norm": 0.1271403282880783, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 220690 + }, + { + "epoch": 0.8400386714676127, + "grad_norm": 0.12651439011096954, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 220700 + }, + { + "epoch": 0.8400767339357353, + "grad_norm": 0.1287226527929306, + "learning_rate": 0.0005, + "loss": 2.0833, + "step": 220710 + }, + { + "epoch": 0.840114796403858, + "grad_norm": 0.13046154379844666, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 220720 + }, + { + "epoch": 0.8401528588719807, + "grad_norm": 0.11079791933298111, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 220730 + }, + { + "epoch": 0.8401909213401034, + "grad_norm": 0.1274874359369278, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 220740 + }, + { + "epoch": 0.8402289838082261, + "grad_norm": 0.1260407567024231, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 220750 + }, + { + "epoch": 0.8402670462763487, + "grad_norm": 0.11859792470932007, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 220760 + }, + { + "epoch": 0.8403051087444714, + "grad_norm": 0.12220750004053116, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 220770 + }, + { + "epoch": 0.8403431712125942, + "grad_norm": 0.13098108768463135, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 220780 + }, + { + "epoch": 0.8403812336807168, + "grad_norm": 0.13276565074920654, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 220790 + }, + { + "epoch": 0.8404192961488395, + "grad_norm": 0.12173032015562057, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 220800 + }, + { + "epoch": 0.8404573586169621, + "grad_norm": 0.12042824178934097, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 220810 + }, + { + "epoch": 0.8404954210850848, + "grad_norm": 0.12178431451320648, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 220820 + }, + { + "epoch": 0.8405334835532076, + "grad_norm": 0.11848913133144379, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 220830 + }, + { + "epoch": 0.8405715460213302, + "grad_norm": 0.11565049737691879, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 220840 + }, + { + "epoch": 0.8406096084894529, + "grad_norm": 0.12566454708576202, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 220850 + }, + { + "epoch": 0.8406476709575755, + "grad_norm": 0.12305108457803726, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 220860 + }, + { + "epoch": 0.8406857334256983, + "grad_norm": 0.12301186472177505, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 220870 + }, + { + "epoch": 0.840723795893821, + "grad_norm": 0.1459309160709381, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 220880 + }, + { + "epoch": 0.8407618583619436, + "grad_norm": 0.12649591267108917, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 220890 + }, + { + "epoch": 0.8407999208300663, + "grad_norm": 0.12986882030963898, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 220900 + }, + { + "epoch": 0.840837983298189, + "grad_norm": 0.11345621198415756, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 220910 + }, + { + "epoch": 0.8408760457663117, + "grad_norm": 0.12991900742053986, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 220920 + }, + { + "epoch": 0.8409141082344344, + "grad_norm": 0.11783597618341446, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 220930 + }, + { + "epoch": 0.840952170702557, + "grad_norm": 0.13129445910453796, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 220940 + }, + { + "epoch": 0.8409902331706797, + "grad_norm": 0.15973244607448578, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 220950 + }, + { + "epoch": 0.8410282956388024, + "grad_norm": 0.11920005828142166, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 220960 + }, + { + "epoch": 0.8410663581069251, + "grad_norm": 0.12620969116687775, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 220970 + }, + { + "epoch": 0.8411044205750478, + "grad_norm": 0.12203420698642731, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 220980 + }, + { + "epoch": 0.8411424830431704, + "grad_norm": 0.13088800013065338, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 220990 + }, + { + "epoch": 0.8411805455112932, + "grad_norm": 0.12926673889160156, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 221000 + }, + { + "epoch": 0.8412186079794158, + "grad_norm": 0.12309526652097702, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 221010 + }, + { + "epoch": 0.8412566704475385, + "grad_norm": 0.1410890370607376, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 221020 + }, + { + "epoch": 0.8412947329156611, + "grad_norm": 0.14877110719680786, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 221030 + }, + { + "epoch": 0.8413327953837839, + "grad_norm": 0.1271168440580368, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 221040 + }, + { + "epoch": 0.8413708578519066, + "grad_norm": 0.12515568733215332, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 221050 + }, + { + "epoch": 0.8414089203200292, + "grad_norm": 0.1173098087310791, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 221060 + }, + { + "epoch": 0.8414469827881519, + "grad_norm": 0.12076503783464432, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 221070 + }, + { + "epoch": 0.8414850452562745, + "grad_norm": 0.12368505448102951, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 221080 + }, + { + "epoch": 0.8415231077243973, + "grad_norm": 0.1254063993692398, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 221090 + }, + { + "epoch": 0.84156117019252, + "grad_norm": 0.1290888488292694, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 221100 + }, + { + "epoch": 0.8415992326606426, + "grad_norm": 0.12299709022045135, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 221110 + }, + { + "epoch": 0.8416372951287653, + "grad_norm": 0.14081443846225739, + "learning_rate": 0.0005, + "loss": 2.0776, + "step": 221120 + }, + { + "epoch": 0.841675357596888, + "grad_norm": 0.13779985904693604, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 221130 + }, + { + "epoch": 0.8417134200650107, + "grad_norm": 0.1326400339603424, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 221140 + }, + { + "epoch": 0.8417514825331334, + "grad_norm": 0.13280388712882996, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 221150 + }, + { + "epoch": 0.841789545001256, + "grad_norm": 0.12059993296861649, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 221160 + }, + { + "epoch": 0.8418276074693788, + "grad_norm": 0.12586145102977753, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 221170 + }, + { + "epoch": 0.8418656699375014, + "grad_norm": 0.12924747169017792, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 221180 + }, + { + "epoch": 0.8419037324056241, + "grad_norm": 0.11559860408306122, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 221190 + }, + { + "epoch": 0.8419417948737468, + "grad_norm": 0.13393127918243408, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 221200 + }, + { + "epoch": 0.8419798573418695, + "grad_norm": 0.11904259026050568, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 221210 + }, + { + "epoch": 0.8420179198099922, + "grad_norm": 0.11419398337602615, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 221220 + }, + { + "epoch": 0.8420559822781148, + "grad_norm": 0.12206226587295532, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 221230 + }, + { + "epoch": 0.8420940447462375, + "grad_norm": 0.11776451766490936, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 221240 + }, + { + "epoch": 0.8421321072143602, + "grad_norm": 0.12946538627147675, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 221250 + }, + { + "epoch": 0.8421701696824829, + "grad_norm": 0.12498574703931808, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 221260 + }, + { + "epoch": 0.8422082321506056, + "grad_norm": 0.13250340521335602, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 221270 + }, + { + "epoch": 0.8422462946187282, + "grad_norm": 0.11953128129243851, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 221280 + }, + { + "epoch": 0.8422843570868509, + "grad_norm": 0.12299077957868576, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 221290 + }, + { + "epoch": 0.8423224195549737, + "grad_norm": 0.13264764845371246, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 221300 + }, + { + "epoch": 0.8423604820230963, + "grad_norm": 0.12111787497997284, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 221310 + }, + { + "epoch": 0.842398544491219, + "grad_norm": 0.13892684876918793, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 221320 + }, + { + "epoch": 0.8424366069593416, + "grad_norm": 0.12118415534496307, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 221330 + }, + { + "epoch": 0.8424746694274644, + "grad_norm": 0.135015606880188, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 221340 + }, + { + "epoch": 0.8425127318955871, + "grad_norm": 0.13240301609039307, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 221350 + }, + { + "epoch": 0.8425507943637097, + "grad_norm": 0.12435257434844971, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 221360 + }, + { + "epoch": 0.8425888568318324, + "grad_norm": 0.11481736600399017, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 221370 + }, + { + "epoch": 0.842626919299955, + "grad_norm": 0.12455075234174728, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 221380 + }, + { + "epoch": 0.8426649817680778, + "grad_norm": 0.1482924073934555, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 221390 + }, + { + "epoch": 0.8427030442362005, + "grad_norm": 0.1364888846874237, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 221400 + }, + { + "epoch": 0.8427411067043231, + "grad_norm": 0.13500209152698517, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 221410 + }, + { + "epoch": 0.8427791691724458, + "grad_norm": 0.11597223579883575, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 221420 + }, + { + "epoch": 0.8428172316405685, + "grad_norm": 0.12641501426696777, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 221430 + }, + { + "epoch": 0.8428552941086912, + "grad_norm": 0.11433349549770355, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 221440 + }, + { + "epoch": 0.8428933565768139, + "grad_norm": 0.1248387023806572, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 221450 + }, + { + "epoch": 0.8429314190449365, + "grad_norm": 0.12403486669063568, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 221460 + }, + { + "epoch": 0.8429694815130593, + "grad_norm": 0.12142638862133026, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 221470 + }, + { + "epoch": 0.8430075439811819, + "grad_norm": 0.12724417448043823, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 221480 + }, + { + "epoch": 0.8430456064493046, + "grad_norm": 0.1116470918059349, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 221490 + }, + { + "epoch": 0.8430836689174273, + "grad_norm": 0.12297196686267853, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 221500 + }, + { + "epoch": 0.8431217313855499, + "grad_norm": 0.11790607869625092, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 221510 + }, + { + "epoch": 0.8431597938536727, + "grad_norm": 0.14102840423583984, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 221520 + }, + { + "epoch": 0.8431978563217953, + "grad_norm": 0.12849067151546478, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 221530 + }, + { + "epoch": 0.843235918789918, + "grad_norm": 0.12495825439691544, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 221540 + }, + { + "epoch": 0.8432739812580407, + "grad_norm": 0.11713171750307083, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 221550 + }, + { + "epoch": 0.8433120437261634, + "grad_norm": 0.1117461696267128, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 221560 + }, + { + "epoch": 0.8433501061942861, + "grad_norm": 0.13182447850704193, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 221570 + }, + { + "epoch": 0.8433881686624087, + "grad_norm": 0.11467888951301575, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 221580 + }, + { + "epoch": 0.8434262311305314, + "grad_norm": 0.12174401432275772, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 221590 + }, + { + "epoch": 0.8434642935986542, + "grad_norm": 0.12947309017181396, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 221600 + }, + { + "epoch": 0.8435023560667768, + "grad_norm": 0.1451372653245926, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 221610 + }, + { + "epoch": 0.8435404185348995, + "grad_norm": 0.12875647842884064, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 221620 + }, + { + "epoch": 0.8435784810030221, + "grad_norm": 0.11792705953121185, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 221630 + }, + { + "epoch": 0.8436165434711449, + "grad_norm": 0.11868022382259369, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 221640 + }, + { + "epoch": 0.8436546059392676, + "grad_norm": 0.12856294214725494, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 221650 + }, + { + "epoch": 0.8436926684073902, + "grad_norm": 0.12594842910766602, + "learning_rate": 0.0005, + "loss": 2.081, + "step": 221660 + }, + { + "epoch": 0.8437307308755129, + "grad_norm": 0.11838579177856445, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 221670 + }, + { + "epoch": 0.8437687933436355, + "grad_norm": 0.125643789768219, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 221680 + }, + { + "epoch": 0.8438068558117583, + "grad_norm": 0.12387394905090332, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 221690 + }, + { + "epoch": 0.843844918279881, + "grad_norm": 0.11876220256090164, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 221700 + }, + { + "epoch": 0.8438829807480036, + "grad_norm": 0.11892592161893845, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 221710 + }, + { + "epoch": 0.8439210432161263, + "grad_norm": 0.14295817911624908, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 221720 + }, + { + "epoch": 0.843959105684249, + "grad_norm": 0.12424919009208679, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 221730 + }, + { + "epoch": 0.8439971681523717, + "grad_norm": 0.13023801147937775, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 221740 + }, + { + "epoch": 0.8440352306204943, + "grad_norm": 0.137488454580307, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 221750 + }, + { + "epoch": 0.844073293088617, + "grad_norm": 0.12765270471572876, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 221760 + }, + { + "epoch": 0.8441113555567398, + "grad_norm": 0.11833908408880234, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 221770 + }, + { + "epoch": 0.8441494180248624, + "grad_norm": 0.12693412601947784, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 221780 + }, + { + "epoch": 0.8441874804929851, + "grad_norm": 0.13138915598392487, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 221790 + }, + { + "epoch": 0.8442255429611077, + "grad_norm": 0.11366184055805206, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 221800 + }, + { + "epoch": 0.8442636054292304, + "grad_norm": 0.14213193953037262, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 221810 + }, + { + "epoch": 0.8443016678973532, + "grad_norm": 0.13295476138591766, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 221820 + }, + { + "epoch": 0.8443397303654758, + "grad_norm": 0.15998639166355133, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 221830 + }, + { + "epoch": 0.8443777928335985, + "grad_norm": 0.13382495939731598, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 221840 + }, + { + "epoch": 0.8444158553017211, + "grad_norm": 0.14077311754226685, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 221850 + }, + { + "epoch": 0.8444539177698439, + "grad_norm": 0.13966163992881775, + "learning_rate": 0.0005, + "loss": 2.0827, + "step": 221860 + }, + { + "epoch": 0.8444919802379666, + "grad_norm": 0.13039542734622955, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 221870 + }, + { + "epoch": 0.8445300427060892, + "grad_norm": 0.12404860556125641, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 221880 + }, + { + "epoch": 0.8445681051742119, + "grad_norm": 0.14446274936199188, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 221890 + }, + { + "epoch": 0.8446061676423346, + "grad_norm": 0.12789031863212585, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 221900 + }, + { + "epoch": 0.8446442301104573, + "grad_norm": 0.1414797306060791, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 221910 + }, + { + "epoch": 0.84468229257858, + "grad_norm": 0.12627270817756653, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 221920 + }, + { + "epoch": 0.8447203550467026, + "grad_norm": 0.13195939362049103, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 221930 + }, + { + "epoch": 0.8447584175148253, + "grad_norm": 0.13768230378627777, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 221940 + }, + { + "epoch": 0.844796479982948, + "grad_norm": 0.12444497644901276, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 221950 + }, + { + "epoch": 0.8448345424510707, + "grad_norm": 0.130240336060524, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 221960 + }, + { + "epoch": 0.8448726049191934, + "grad_norm": 0.1341758668422699, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 221970 + }, + { + "epoch": 0.844910667387316, + "grad_norm": 0.1302516609430313, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 221980 + }, + { + "epoch": 0.8449487298554388, + "grad_norm": 0.1489470899105072, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 221990 + }, + { + "epoch": 0.8449867923235614, + "grad_norm": 0.11727765202522278, + "learning_rate": 0.0005, + "loss": 2.0888, + "step": 222000 + }, + { + "epoch": 0.8450248547916841, + "grad_norm": 0.12005919963121414, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 222010 + }, + { + "epoch": 0.8450629172598068, + "grad_norm": 0.12615996599197388, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 222020 + }, + { + "epoch": 0.8451009797279295, + "grad_norm": 0.12618914246559143, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 222030 + }, + { + "epoch": 0.8451390421960522, + "grad_norm": 0.12011348456144333, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 222040 + }, + { + "epoch": 0.8451771046641748, + "grad_norm": 0.12232651561498642, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 222050 + }, + { + "epoch": 0.8452151671322975, + "grad_norm": 0.1303800344467163, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 222060 + }, + { + "epoch": 0.8452532296004203, + "grad_norm": 0.13714206218719482, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 222070 + }, + { + "epoch": 0.8452912920685429, + "grad_norm": 0.1355716437101364, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 222080 + }, + { + "epoch": 0.8453293545366656, + "grad_norm": 0.11672431975603104, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 222090 + }, + { + "epoch": 0.8453674170047882, + "grad_norm": 0.12329725921154022, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 222100 + }, + { + "epoch": 0.8454054794729109, + "grad_norm": 0.12570767104625702, + "learning_rate": 0.0005, + "loss": 2.1283, + "step": 222110 + }, + { + "epoch": 0.8454435419410337, + "grad_norm": 0.13327881693840027, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 222120 + }, + { + "epoch": 0.8454816044091563, + "grad_norm": 0.12163237482309341, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 222130 + }, + { + "epoch": 0.845519666877279, + "grad_norm": 0.14293643832206726, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 222140 + }, + { + "epoch": 0.8455577293454016, + "grad_norm": 0.12491180747747421, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 222150 + }, + { + "epoch": 0.8455957918135244, + "grad_norm": 0.14374269545078278, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 222160 + }, + { + "epoch": 0.8456338542816471, + "grad_norm": 0.11893656104803085, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 222170 + }, + { + "epoch": 0.8456719167497697, + "grad_norm": 0.1267782598733902, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 222180 + }, + { + "epoch": 0.8457099792178924, + "grad_norm": 0.13819265365600586, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 222190 + }, + { + "epoch": 0.8457480416860151, + "grad_norm": 0.11883528530597687, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 222200 + }, + { + "epoch": 0.8457861041541378, + "grad_norm": 0.11555849015712738, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 222210 + }, + { + "epoch": 0.8458241666222605, + "grad_norm": 0.13113310933113098, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 222220 + }, + { + "epoch": 0.8458622290903831, + "grad_norm": 0.12390421330928802, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 222230 + }, + { + "epoch": 0.8459002915585058, + "grad_norm": 0.13246551156044006, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 222240 + }, + { + "epoch": 0.8459383540266285, + "grad_norm": 0.12514223158359528, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 222250 + }, + { + "epoch": 0.8459764164947512, + "grad_norm": 0.12240414321422577, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 222260 + }, + { + "epoch": 0.8460144789628739, + "grad_norm": 0.1251474767923355, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 222270 + }, + { + "epoch": 0.8460525414309965, + "grad_norm": 0.13345548510551453, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 222280 + }, + { + "epoch": 0.8460906038991193, + "grad_norm": 0.13378407061100006, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 222290 + }, + { + "epoch": 0.8461286663672419, + "grad_norm": 0.1198481023311615, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 222300 + }, + { + "epoch": 0.8461667288353646, + "grad_norm": 0.12984788417816162, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 222310 + }, + { + "epoch": 0.8462047913034872, + "grad_norm": 0.125489741563797, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 222320 + }, + { + "epoch": 0.84624285377161, + "grad_norm": 0.1202753558754921, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 222330 + }, + { + "epoch": 0.8462809162397327, + "grad_norm": 0.12433353066444397, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 222340 + }, + { + "epoch": 0.8463189787078553, + "grad_norm": 0.11911118030548096, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 222350 + }, + { + "epoch": 0.846357041175978, + "grad_norm": 0.11664999276399612, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 222360 + }, + { + "epoch": 0.8463951036441006, + "grad_norm": 0.13223478198051453, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 222370 + }, + { + "epoch": 0.8464331661122234, + "grad_norm": 0.14218950271606445, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 222380 + }, + { + "epoch": 0.8464712285803461, + "grad_norm": 0.11777843534946442, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 222390 + }, + { + "epoch": 0.8465092910484687, + "grad_norm": 0.12073778361082077, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 222400 + }, + { + "epoch": 0.8465473535165914, + "grad_norm": 0.11218991130590439, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 222410 + }, + { + "epoch": 0.8465854159847142, + "grad_norm": 0.12169850617647171, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 222420 + }, + { + "epoch": 0.8466234784528368, + "grad_norm": 0.11669646948575974, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 222430 + }, + { + "epoch": 0.8466615409209595, + "grad_norm": 0.12841486930847168, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 222440 + }, + { + "epoch": 0.8466996033890821, + "grad_norm": 0.11732825636863708, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 222450 + }, + { + "epoch": 0.8467376658572049, + "grad_norm": 0.12267529964447021, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 222460 + }, + { + "epoch": 0.8467757283253275, + "grad_norm": 0.13632412254810333, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 222470 + }, + { + "epoch": 0.8468137907934502, + "grad_norm": 0.12303720414638519, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 222480 + }, + { + "epoch": 0.8468518532615729, + "grad_norm": 0.1213759183883667, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 222490 + }, + { + "epoch": 0.8468899157296956, + "grad_norm": 0.12312841415405273, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 222500 + }, + { + "epoch": 0.8469279781978183, + "grad_norm": 0.12350822985172272, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 222510 + }, + { + "epoch": 0.846966040665941, + "grad_norm": 0.14025971293449402, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 222520 + }, + { + "epoch": 0.8470041031340636, + "grad_norm": 0.14085686206817627, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 222530 + }, + { + "epoch": 0.8470421656021863, + "grad_norm": 0.12586809694766998, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 222540 + }, + { + "epoch": 0.847080228070309, + "grad_norm": 0.14620359241962433, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 222550 + }, + { + "epoch": 0.8471182905384317, + "grad_norm": 0.12692441046237946, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 222560 + }, + { + "epoch": 0.8471563530065543, + "grad_norm": 0.13988254964351654, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 222570 + }, + { + "epoch": 0.847194415474677, + "grad_norm": 0.13647693395614624, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 222580 + }, + { + "epoch": 0.8472324779427998, + "grad_norm": 0.12744392454624176, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 222590 + }, + { + "epoch": 0.8472705404109224, + "grad_norm": 0.1267772763967514, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 222600 + }, + { + "epoch": 0.8473086028790451, + "grad_norm": 0.12130838632583618, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 222610 + }, + { + "epoch": 0.8473466653471677, + "grad_norm": 0.143631711602211, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 222620 + }, + { + "epoch": 0.8473847278152905, + "grad_norm": 0.12004122883081436, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 222630 + }, + { + "epoch": 0.8474227902834132, + "grad_norm": 0.11501511931419373, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 222640 + }, + { + "epoch": 0.8474608527515358, + "grad_norm": 0.1254333108663559, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 222650 + }, + { + "epoch": 0.8474989152196585, + "grad_norm": 0.12307880818843842, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 222660 + }, + { + "epoch": 0.8475369776877811, + "grad_norm": 0.12675033509731293, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 222670 + }, + { + "epoch": 0.8475750401559039, + "grad_norm": 0.1205359548330307, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 222680 + }, + { + "epoch": 0.8476131026240266, + "grad_norm": 0.13939648866653442, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 222690 + }, + { + "epoch": 0.8476511650921492, + "grad_norm": 0.12317508459091187, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 222700 + }, + { + "epoch": 0.8476892275602719, + "grad_norm": 0.1242111399769783, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 222710 + }, + { + "epoch": 0.8477272900283946, + "grad_norm": 0.13081376254558563, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 222720 + }, + { + "epoch": 0.8477653524965173, + "grad_norm": 0.13119825720787048, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 222730 + }, + { + "epoch": 0.84780341496464, + "grad_norm": 0.16249608993530273, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 222740 + }, + { + "epoch": 0.8478414774327626, + "grad_norm": 0.12679052352905273, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 222750 + }, + { + "epoch": 0.8478795399008854, + "grad_norm": 0.15585845708847046, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 222760 + }, + { + "epoch": 0.847917602369008, + "grad_norm": 0.12806294858455658, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 222770 + }, + { + "epoch": 0.8479556648371307, + "grad_norm": 0.11923015862703323, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 222780 + }, + { + "epoch": 0.8479937273052534, + "grad_norm": 0.12365025281906128, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 222790 + }, + { + "epoch": 0.848031789773376, + "grad_norm": 0.15433207154273987, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 222800 + }, + { + "epoch": 0.8480698522414988, + "grad_norm": 0.1409216821193695, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 222810 + }, + { + "epoch": 0.8481079147096214, + "grad_norm": 0.12249742448329926, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 222820 + }, + { + "epoch": 0.8481459771777441, + "grad_norm": 0.1369142085313797, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 222830 + }, + { + "epoch": 0.8481840396458667, + "grad_norm": 0.1325635313987732, + "learning_rate": 0.0005, + "loss": 2.1266, + "step": 222840 + }, + { + "epoch": 0.8482221021139895, + "grad_norm": 0.11897878348827362, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 222850 + }, + { + "epoch": 0.8482601645821122, + "grad_norm": 0.1231696829199791, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 222860 + }, + { + "epoch": 0.8482982270502348, + "grad_norm": 0.1305019110441208, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 222870 + }, + { + "epoch": 0.8483362895183575, + "grad_norm": 0.11948280781507492, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 222880 + }, + { + "epoch": 0.8483743519864803, + "grad_norm": 0.12036039680242538, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 222890 + }, + { + "epoch": 0.8484124144546029, + "grad_norm": 0.11098724603652954, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 222900 + }, + { + "epoch": 0.8484504769227256, + "grad_norm": 0.1361108273267746, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 222910 + }, + { + "epoch": 0.8484885393908482, + "grad_norm": 0.13085906207561493, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 222920 + }, + { + "epoch": 0.848526601858971, + "grad_norm": 0.1153787150979042, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 222930 + }, + { + "epoch": 0.8485646643270937, + "grad_norm": 0.13161927461624146, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 222940 + }, + { + "epoch": 0.8486027267952163, + "grad_norm": 0.11290472000837326, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 222950 + }, + { + "epoch": 0.848640789263339, + "grad_norm": 0.12351827323436737, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 222960 + }, + { + "epoch": 0.8486788517314616, + "grad_norm": 0.11744271963834763, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 222970 + }, + { + "epoch": 0.8487169141995844, + "grad_norm": 0.11657220125198364, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 222980 + }, + { + "epoch": 0.848754976667707, + "grad_norm": 0.11970972269773483, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 222990 + }, + { + "epoch": 0.8487930391358297, + "grad_norm": 0.14427636563777924, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 223000 + }, + { + "epoch": 0.8488311016039524, + "grad_norm": 0.12226763367652893, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 223010 + }, + { + "epoch": 0.8488691640720751, + "grad_norm": 0.12056204676628113, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 223020 + }, + { + "epoch": 0.8489072265401978, + "grad_norm": 0.11601797491312027, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 223030 + }, + { + "epoch": 0.8489452890083204, + "grad_norm": 0.12741492688655853, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 223040 + }, + { + "epoch": 0.8489833514764431, + "grad_norm": 0.13284242153167725, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 223050 + }, + { + "epoch": 0.8490214139445659, + "grad_norm": 0.120316281914711, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 223060 + }, + { + "epoch": 0.8490594764126885, + "grad_norm": 0.12143097072839737, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 223070 + }, + { + "epoch": 0.8490975388808112, + "grad_norm": 0.1196446344256401, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 223080 + }, + { + "epoch": 0.8491356013489338, + "grad_norm": 0.12994390726089478, + "learning_rate": 0.0005, + "loss": 2.0839, + "step": 223090 + }, + { + "epoch": 0.8491736638170565, + "grad_norm": 0.12859565019607544, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 223100 + }, + { + "epoch": 0.8492117262851793, + "grad_norm": 0.12666714191436768, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 223110 + }, + { + "epoch": 0.8492497887533019, + "grad_norm": 0.13240092992782593, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 223120 + }, + { + "epoch": 0.8492878512214246, + "grad_norm": 0.13775987923145294, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 223130 + }, + { + "epoch": 0.8493259136895472, + "grad_norm": 0.13414819538593292, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 223140 + }, + { + "epoch": 0.84936397615767, + "grad_norm": 0.12259962409734726, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 223150 + }, + { + "epoch": 0.8494020386257927, + "grad_norm": 0.11610279977321625, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 223160 + }, + { + "epoch": 0.8494401010939153, + "grad_norm": 0.13720989227294922, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 223170 + }, + { + "epoch": 0.849478163562038, + "grad_norm": 0.13144254684448242, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 223180 + }, + { + "epoch": 0.8495162260301607, + "grad_norm": 0.13327936828136444, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 223190 + }, + { + "epoch": 0.8495542884982834, + "grad_norm": 0.13548313081264496, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 223200 + }, + { + "epoch": 0.8495923509664061, + "grad_norm": 0.12639567255973816, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 223210 + }, + { + "epoch": 0.8496304134345287, + "grad_norm": 0.12638792395591736, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 223220 + }, + { + "epoch": 0.8496684759026514, + "grad_norm": 0.1326935738325119, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 223230 + }, + { + "epoch": 0.8497065383707741, + "grad_norm": 0.12060036510229111, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 223240 + }, + { + "epoch": 0.8497446008388968, + "grad_norm": 0.13392925262451172, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 223250 + }, + { + "epoch": 0.8497826633070195, + "grad_norm": 0.12088333815336227, + "learning_rate": 0.0005, + "loss": 2.1262, + "step": 223260 + }, + { + "epoch": 0.8498207257751421, + "grad_norm": 0.12968561053276062, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 223270 + }, + { + "epoch": 0.8498587882432649, + "grad_norm": 0.1378614455461502, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 223280 + }, + { + "epoch": 0.8498968507113875, + "grad_norm": 0.11719853430986404, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 223290 + }, + { + "epoch": 0.8499349131795102, + "grad_norm": 0.1415800154209137, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 223300 + }, + { + "epoch": 0.8499729756476329, + "grad_norm": 0.13601985573768616, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 223310 + }, + { + "epoch": 0.8500110381157556, + "grad_norm": 0.12387509644031525, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 223320 + }, + { + "epoch": 0.8500491005838783, + "grad_norm": 0.1318059265613556, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 223330 + }, + { + "epoch": 0.8500871630520009, + "grad_norm": 0.14278966188430786, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 223340 + }, + { + "epoch": 0.8501252255201236, + "grad_norm": 0.12081281840801239, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 223350 + }, + { + "epoch": 0.8501632879882464, + "grad_norm": 0.12138575315475464, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 223360 + }, + { + "epoch": 0.850201350456369, + "grad_norm": 0.12061097472906113, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 223370 + }, + { + "epoch": 0.8502394129244917, + "grad_norm": 0.12924207746982574, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 223380 + }, + { + "epoch": 0.8502774753926143, + "grad_norm": 0.1417977660894394, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 223390 + }, + { + "epoch": 0.850315537860737, + "grad_norm": 0.13214237987995148, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 223400 + }, + { + "epoch": 0.8503536003288598, + "grad_norm": 0.1482153981924057, + "learning_rate": 0.0005, + "loss": 2.1215, + "step": 223410 + }, + { + "epoch": 0.8503916627969824, + "grad_norm": 0.12344057857990265, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 223420 + }, + { + "epoch": 0.8504297252651051, + "grad_norm": 0.1381489783525467, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 223430 + }, + { + "epoch": 0.8504677877332277, + "grad_norm": 0.13054247200489044, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 223440 + }, + { + "epoch": 0.8505058502013505, + "grad_norm": 0.12159658968448639, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 223450 + }, + { + "epoch": 0.8505439126694732, + "grad_norm": 0.11975965648889542, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 223460 + }, + { + "epoch": 0.8505819751375958, + "grad_norm": 0.12309782952070236, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 223470 + }, + { + "epoch": 0.8506200376057185, + "grad_norm": 0.12559787929058075, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 223480 + }, + { + "epoch": 0.8506581000738412, + "grad_norm": 0.1299746036529541, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 223490 + }, + { + "epoch": 0.8506961625419639, + "grad_norm": 0.12091419100761414, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 223500 + }, + { + "epoch": 0.8507342250100866, + "grad_norm": 0.12265986949205399, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 223510 + }, + { + "epoch": 0.8507722874782092, + "grad_norm": 0.12044291198253632, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 223520 + }, + { + "epoch": 0.8508103499463319, + "grad_norm": 0.1332319676876068, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 223530 + }, + { + "epoch": 0.8508484124144546, + "grad_norm": 0.1274791657924652, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 223540 + }, + { + "epoch": 0.8508864748825773, + "grad_norm": 0.1327950358390808, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 223550 + }, + { + "epoch": 0.8509245373507, + "grad_norm": 0.12357247620820999, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 223560 + }, + { + "epoch": 0.8509625998188226, + "grad_norm": 0.11313042044639587, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 223570 + }, + { + "epoch": 0.8510006622869454, + "grad_norm": 0.12048235535621643, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 223580 + }, + { + "epoch": 0.851038724755068, + "grad_norm": 0.1278276890516281, + "learning_rate": 0.0005, + "loss": 2.1242, + "step": 223590 + }, + { + "epoch": 0.8510767872231907, + "grad_norm": 0.13198168575763702, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 223600 + }, + { + "epoch": 0.8511148496913133, + "grad_norm": 0.13076524436473846, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 223610 + }, + { + "epoch": 0.8511529121594361, + "grad_norm": 0.14714770019054413, + "learning_rate": 0.0005, + "loss": 2.1264, + "step": 223620 + }, + { + "epoch": 0.8511909746275588, + "grad_norm": 0.1160232275724411, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 223630 + }, + { + "epoch": 0.8512290370956814, + "grad_norm": 0.13508348166942596, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 223640 + }, + { + "epoch": 0.8512670995638041, + "grad_norm": 0.12602970004081726, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 223650 + }, + { + "epoch": 0.8513051620319267, + "grad_norm": 0.12282170355319977, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 223660 + }, + { + "epoch": 0.8513432245000495, + "grad_norm": 0.12498307228088379, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 223670 + }, + { + "epoch": 0.8513812869681722, + "grad_norm": 0.12019147723913193, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 223680 + }, + { + "epoch": 0.8514193494362948, + "grad_norm": 0.1340957134962082, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 223690 + }, + { + "epoch": 0.8514574119044175, + "grad_norm": 0.118826724588871, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 223700 + }, + { + "epoch": 0.8514954743725403, + "grad_norm": 0.12180516868829727, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 223710 + }, + { + "epoch": 0.8515335368406629, + "grad_norm": 0.1166292056441307, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 223720 + }, + { + "epoch": 0.8515715993087856, + "grad_norm": 0.1198718249797821, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 223730 + }, + { + "epoch": 0.8516096617769082, + "grad_norm": 0.14058130979537964, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 223740 + }, + { + "epoch": 0.851647724245031, + "grad_norm": 0.13598237931728363, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 223750 + }, + { + "epoch": 0.8516857867131536, + "grad_norm": 0.14740775525569916, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 223760 + }, + { + "epoch": 0.8517238491812763, + "grad_norm": 0.12089115381240845, + "learning_rate": 0.0005, + "loss": 2.1311, + "step": 223770 + }, + { + "epoch": 0.851761911649399, + "grad_norm": 0.1189613789319992, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 223780 + }, + { + "epoch": 0.8517999741175217, + "grad_norm": 0.12509900331497192, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 223790 + }, + { + "epoch": 0.8518380365856444, + "grad_norm": 0.12147897481918335, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 223800 + }, + { + "epoch": 0.851876099053767, + "grad_norm": 0.13214780390262604, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 223810 + }, + { + "epoch": 0.8519141615218897, + "grad_norm": 0.13353584706783295, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 223820 + }, + { + "epoch": 0.8519522239900124, + "grad_norm": 0.14544062316417694, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 223830 + }, + { + "epoch": 0.8519902864581351, + "grad_norm": 0.13279809057712555, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 223840 + }, + { + "epoch": 0.8520283489262578, + "grad_norm": 0.13682380318641663, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 223850 + }, + { + "epoch": 0.8520664113943804, + "grad_norm": 0.1213570311665535, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 223860 + }, + { + "epoch": 0.8521044738625031, + "grad_norm": 0.116135373711586, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 223870 + }, + { + "epoch": 0.8521425363306259, + "grad_norm": 0.11622577905654907, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 223880 + }, + { + "epoch": 0.8521805987987485, + "grad_norm": 0.14203287661075592, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 223890 + }, + { + "epoch": 0.8522186612668712, + "grad_norm": 0.14461076259613037, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 223900 + }, + { + "epoch": 0.8522567237349938, + "grad_norm": 0.12584730982780457, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 223910 + }, + { + "epoch": 0.8522947862031166, + "grad_norm": 0.11870056390762329, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 223920 + }, + { + "epoch": 0.8523328486712393, + "grad_norm": 0.1315356343984604, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 223930 + }, + { + "epoch": 0.8523709111393619, + "grad_norm": 0.12040794640779495, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 223940 + }, + { + "epoch": 0.8524089736074846, + "grad_norm": 0.1251789778470993, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 223950 + }, + { + "epoch": 0.8524470360756072, + "grad_norm": 0.13328661024570465, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 223960 + }, + { + "epoch": 0.85248509854373, + "grad_norm": 0.12740342319011688, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 223970 + }, + { + "epoch": 0.8525231610118527, + "grad_norm": 0.12411228567361832, + "learning_rate": 0.0005, + "loss": 2.0802, + "step": 223980 + }, + { + "epoch": 0.8525612234799753, + "grad_norm": 0.13667641580104828, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 223990 + }, + { + "epoch": 0.852599285948098, + "grad_norm": 0.12220586091279984, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 224000 + }, + { + "epoch": 0.8526373484162207, + "grad_norm": 0.1269436925649643, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 224010 + }, + { + "epoch": 0.8526754108843434, + "grad_norm": 0.12647700309753418, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 224020 + }, + { + "epoch": 0.8527134733524661, + "grad_norm": 0.11566011607646942, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 224030 + }, + { + "epoch": 0.8527515358205887, + "grad_norm": 0.12147703021764755, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 224040 + }, + { + "epoch": 0.8527895982887115, + "grad_norm": 0.11546335369348526, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 224050 + }, + { + "epoch": 0.8528276607568341, + "grad_norm": 0.1299871951341629, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 224060 + }, + { + "epoch": 0.8528657232249568, + "grad_norm": 0.12042056024074554, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 224070 + }, + { + "epoch": 0.8529037856930795, + "grad_norm": 0.11888284981250763, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 224080 + }, + { + "epoch": 0.8529418481612021, + "grad_norm": 0.13539917767047882, + "learning_rate": 0.0005, + "loss": 2.0836, + "step": 224090 + }, + { + "epoch": 0.8529799106293249, + "grad_norm": 0.13094571232795715, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 224100 + }, + { + "epoch": 0.8530179730974475, + "grad_norm": 0.11932310461997986, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 224110 + }, + { + "epoch": 0.8530560355655702, + "grad_norm": 0.1398283690214157, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 224120 + }, + { + "epoch": 0.8530940980336928, + "grad_norm": 0.12287000566720963, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 224130 + }, + { + "epoch": 0.8531321605018156, + "grad_norm": 0.12652814388275146, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 224140 + }, + { + "epoch": 0.8531702229699383, + "grad_norm": 0.1283009648323059, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 224150 + }, + { + "epoch": 0.8532082854380609, + "grad_norm": 0.1335098296403885, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 224160 + }, + { + "epoch": 0.8532463479061836, + "grad_norm": 0.12675894796848297, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 224170 + }, + { + "epoch": 0.8532844103743064, + "grad_norm": 0.12911538779735565, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 224180 + }, + { + "epoch": 0.853322472842429, + "grad_norm": 0.12739954888820648, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 224190 + }, + { + "epoch": 0.8533605353105517, + "grad_norm": 0.17836807668209076, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 224200 + }, + { + "epoch": 0.8533985977786743, + "grad_norm": 0.12320113927125931, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 224210 + }, + { + "epoch": 0.8534366602467971, + "grad_norm": 0.12615108489990234, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 224220 + }, + { + "epoch": 0.8534747227149198, + "grad_norm": 0.11938022822141647, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 224230 + }, + { + "epoch": 0.8535127851830424, + "grad_norm": 0.1230238601565361, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 224240 + }, + { + "epoch": 0.8535508476511651, + "grad_norm": 0.12678667902946472, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 224250 + }, + { + "epoch": 0.8535889101192877, + "grad_norm": 0.1329822987318039, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 224260 + }, + { + "epoch": 0.8536269725874105, + "grad_norm": 0.12265556305646896, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 224270 + }, + { + "epoch": 0.8536650350555332, + "grad_norm": 0.12903083860874176, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 224280 + }, + { + "epoch": 0.8537030975236558, + "grad_norm": 0.13243375718593597, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 224290 + }, + { + "epoch": 0.8537411599917785, + "grad_norm": 0.12826158106327057, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 224300 + }, + { + "epoch": 0.8537792224599012, + "grad_norm": 0.12951309978961945, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 224310 + }, + { + "epoch": 0.8538172849280239, + "grad_norm": 0.12756584584712982, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 224320 + }, + { + "epoch": 0.8538553473961465, + "grad_norm": 0.12267711013555527, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 224330 + }, + { + "epoch": 0.8538934098642692, + "grad_norm": 0.14301878213882446, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 224340 + }, + { + "epoch": 0.853931472332392, + "grad_norm": 0.14637650549411774, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 224350 + }, + { + "epoch": 0.8539695348005146, + "grad_norm": 0.11642546951770782, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 224360 + }, + { + "epoch": 0.8540075972686373, + "grad_norm": 0.11581864207983017, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 224370 + }, + { + "epoch": 0.8540456597367599, + "grad_norm": 0.1454319804906845, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 224380 + }, + { + "epoch": 0.8540837222048826, + "grad_norm": 0.12507648766040802, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 224390 + }, + { + "epoch": 0.8541217846730054, + "grad_norm": 0.11913920938968658, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 224400 + }, + { + "epoch": 0.854159847141128, + "grad_norm": 0.1339544951915741, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 224410 + }, + { + "epoch": 0.8541979096092507, + "grad_norm": 0.14385294914245605, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 224420 + }, + { + "epoch": 0.8542359720773733, + "grad_norm": 0.14541102945804596, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 224430 + }, + { + "epoch": 0.8542740345454961, + "grad_norm": 0.12655025720596313, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 224440 + }, + { + "epoch": 0.8543120970136188, + "grad_norm": 0.1332712322473526, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 224450 + }, + { + "epoch": 0.8543501594817414, + "grad_norm": 0.11915437132120132, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 224460 + }, + { + "epoch": 0.8543882219498641, + "grad_norm": 0.13157260417938232, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 224470 + }, + { + "epoch": 0.8544262844179868, + "grad_norm": 0.12617330253124237, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 224480 + }, + { + "epoch": 0.8544643468861095, + "grad_norm": 0.116673544049263, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 224490 + }, + { + "epoch": 0.8545024093542322, + "grad_norm": 0.1166527196764946, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 224500 + }, + { + "epoch": 0.8545404718223548, + "grad_norm": 0.12571091949939728, + "learning_rate": 0.0005, + "loss": 2.0896, + "step": 224510 + }, + { + "epoch": 0.8545785342904776, + "grad_norm": 0.1225324347615242, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 224520 + }, + { + "epoch": 0.8546165967586002, + "grad_norm": 0.13173021376132965, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 224530 + }, + { + "epoch": 0.8546546592267229, + "grad_norm": 0.11553698778152466, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 224540 + }, + { + "epoch": 0.8546927216948456, + "grad_norm": 0.12536373734474182, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 224550 + }, + { + "epoch": 0.8547307841629682, + "grad_norm": 0.13124406337738037, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 224560 + }, + { + "epoch": 0.854768846631091, + "grad_norm": 0.1201077252626419, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 224570 + }, + { + "epoch": 0.8548069090992136, + "grad_norm": 0.13268786668777466, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 224580 + }, + { + "epoch": 0.8548449715673363, + "grad_norm": 0.1516837626695633, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 224590 + }, + { + "epoch": 0.854883034035459, + "grad_norm": 0.11803478002548218, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 224600 + }, + { + "epoch": 0.8549210965035817, + "grad_norm": 0.11986857652664185, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 224610 + }, + { + "epoch": 0.8549591589717044, + "grad_norm": 0.11951293796300888, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 224620 + }, + { + "epoch": 0.854997221439827, + "grad_norm": 0.1232936680316925, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 224630 + }, + { + "epoch": 0.8550352839079497, + "grad_norm": 0.12167305499315262, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 224640 + }, + { + "epoch": 0.8550733463760725, + "grad_norm": 0.12915948033332825, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 224650 + }, + { + "epoch": 0.8551114088441951, + "grad_norm": 0.12672092020511627, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 224660 + }, + { + "epoch": 0.8551494713123178, + "grad_norm": 0.1322372853755951, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 224670 + }, + { + "epoch": 0.8551875337804404, + "grad_norm": 0.13333991169929504, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 224680 + }, + { + "epoch": 0.8552255962485631, + "grad_norm": 0.11972727626562119, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 224690 + }, + { + "epoch": 0.8552636587166859, + "grad_norm": 0.12012451142072678, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 224700 + }, + { + "epoch": 0.8553017211848085, + "grad_norm": 0.1278320550918579, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 224710 + }, + { + "epoch": 0.8553397836529312, + "grad_norm": 0.12670546770095825, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 224720 + }, + { + "epoch": 0.8553778461210538, + "grad_norm": 0.12710654735565186, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 224730 + }, + { + "epoch": 0.8554159085891766, + "grad_norm": 0.12144336849451065, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 224740 + }, + { + "epoch": 0.8554539710572993, + "grad_norm": 0.136688232421875, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 224750 + }, + { + "epoch": 0.8554920335254219, + "grad_norm": 0.12287929654121399, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 224760 + }, + { + "epoch": 0.8555300959935446, + "grad_norm": 0.13087445497512817, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 224770 + }, + { + "epoch": 0.8555681584616673, + "grad_norm": 0.1203342080116272, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 224780 + }, + { + "epoch": 0.85560622092979, + "grad_norm": 0.1329360008239746, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 224790 + }, + { + "epoch": 0.8556442833979127, + "grad_norm": 0.13662736117839813, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 224800 + }, + { + "epoch": 0.8556823458660353, + "grad_norm": 0.12587331235408783, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 224810 + }, + { + "epoch": 0.855720408334158, + "grad_norm": 0.1271894872188568, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 224820 + }, + { + "epoch": 0.8557584708022807, + "grad_norm": 0.13241060078144073, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 224830 + }, + { + "epoch": 0.8557965332704034, + "grad_norm": 0.14475074410438538, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 224840 + }, + { + "epoch": 0.855834595738526, + "grad_norm": 0.12807601690292358, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 224850 + }, + { + "epoch": 0.8558726582066487, + "grad_norm": 0.13161490857601166, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 224860 + }, + { + "epoch": 0.8559107206747715, + "grad_norm": 0.12225492298603058, + "learning_rate": 0.0005, + "loss": 2.088, + "step": 224870 + }, + { + "epoch": 0.8559487831428941, + "grad_norm": 0.11510666459798813, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 224880 + }, + { + "epoch": 0.8559868456110168, + "grad_norm": 0.11517883092164993, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 224890 + }, + { + "epoch": 0.8560249080791394, + "grad_norm": 0.1427791714668274, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 224900 + }, + { + "epoch": 0.8560629705472622, + "grad_norm": 0.14692747592926025, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 224910 + }, + { + "epoch": 0.8561010330153849, + "grad_norm": 0.13827954232692719, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 224920 + }, + { + "epoch": 0.8561390954835075, + "grad_norm": 0.11719125509262085, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 224930 + }, + { + "epoch": 0.8561771579516302, + "grad_norm": 0.12412826716899872, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 224940 + }, + { + "epoch": 0.856215220419753, + "grad_norm": 0.12181001156568527, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 224950 + }, + { + "epoch": 0.8562532828878756, + "grad_norm": 0.1239355206489563, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 224960 + }, + { + "epoch": 0.8562913453559983, + "grad_norm": 0.12484218925237656, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 224970 + }, + { + "epoch": 0.8563294078241209, + "grad_norm": 0.12160801142454147, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 224980 + }, + { + "epoch": 0.8563674702922436, + "grad_norm": 0.12800124287605286, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 224990 + }, + { + "epoch": 0.8564055327603664, + "grad_norm": 0.12416477501392365, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 225000 + }, + { + "epoch": 0.856443595228489, + "grad_norm": 0.14295855164527893, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 225010 + }, + { + "epoch": 0.8564816576966117, + "grad_norm": 0.21226909756660461, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 225020 + }, + { + "epoch": 0.8565197201647343, + "grad_norm": 0.11625513434410095, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 225030 + }, + { + "epoch": 0.8565577826328571, + "grad_norm": 0.1246718093752861, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 225040 + }, + { + "epoch": 0.8565958451009797, + "grad_norm": 0.13294856250286102, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 225050 + }, + { + "epoch": 0.8566339075691024, + "grad_norm": 0.13694648444652557, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 225060 + }, + { + "epoch": 0.8566719700372251, + "grad_norm": 0.11973528563976288, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 225070 + }, + { + "epoch": 0.8567100325053478, + "grad_norm": 0.13602188229560852, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 225080 + }, + { + "epoch": 0.8567480949734705, + "grad_norm": 0.13067738711833954, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 225090 + }, + { + "epoch": 0.8567861574415931, + "grad_norm": 0.13641096651554108, + "learning_rate": 0.0005, + "loss": 2.0835, + "step": 225100 + }, + { + "epoch": 0.8568242199097158, + "grad_norm": 0.12239253520965576, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 225110 + }, + { + "epoch": 0.8568622823778385, + "grad_norm": 0.127982959151268, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 225120 + }, + { + "epoch": 0.8569003448459612, + "grad_norm": 0.11979227513074875, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 225130 + }, + { + "epoch": 0.8569384073140839, + "grad_norm": 0.14429311454296112, + "learning_rate": 0.0005, + "loss": 2.1208, + "step": 225140 + }, + { + "epoch": 0.8569764697822065, + "grad_norm": 0.12793608009815216, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 225150 + }, + { + "epoch": 0.8570145322503292, + "grad_norm": 0.12261246889829636, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 225160 + }, + { + "epoch": 0.857052594718452, + "grad_norm": 0.12844735383987427, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 225170 + }, + { + "epoch": 0.8570906571865746, + "grad_norm": 0.12870533764362335, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 225180 + }, + { + "epoch": 0.8571287196546973, + "grad_norm": 0.1424136906862259, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 225190 + }, + { + "epoch": 0.8571667821228199, + "grad_norm": 0.12998011708259583, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 225200 + }, + { + "epoch": 0.8572048445909427, + "grad_norm": 0.1247173473238945, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 225210 + }, + { + "epoch": 0.8572429070590654, + "grad_norm": 0.12054622918367386, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 225220 + }, + { + "epoch": 0.857280969527188, + "grad_norm": 0.1245812177658081, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 225230 + }, + { + "epoch": 0.8573190319953107, + "grad_norm": 0.13140606880187988, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 225240 + }, + { + "epoch": 0.8573570944634333, + "grad_norm": 0.13697367906570435, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 225250 + }, + { + "epoch": 0.8573951569315561, + "grad_norm": 0.13572590053081512, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 225260 + }, + { + "epoch": 0.8574332193996788, + "grad_norm": 0.1267860233783722, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 225270 + }, + { + "epoch": 0.8574712818678014, + "grad_norm": 0.1345335990190506, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 225280 + }, + { + "epoch": 0.8575093443359241, + "grad_norm": 0.12333787232637405, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 225290 + }, + { + "epoch": 0.8575474068040468, + "grad_norm": 0.12491423636674881, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 225300 + }, + { + "epoch": 0.8575854692721695, + "grad_norm": 0.12076954543590546, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 225310 + }, + { + "epoch": 0.8576235317402922, + "grad_norm": 0.127850741147995, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 225320 + }, + { + "epoch": 0.8576615942084148, + "grad_norm": 0.12069816142320633, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 225330 + }, + { + "epoch": 0.8576996566765376, + "grad_norm": 0.13940003514289856, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 225340 + }, + { + "epoch": 0.8577377191446602, + "grad_norm": 0.13708564639091492, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 225350 + }, + { + "epoch": 0.8577757816127829, + "grad_norm": 0.12923499941825867, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 225360 + }, + { + "epoch": 0.8578138440809056, + "grad_norm": 0.12485264986753464, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 225370 + }, + { + "epoch": 0.8578519065490283, + "grad_norm": 0.131544828414917, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 225380 + }, + { + "epoch": 0.857889969017151, + "grad_norm": 0.13050268590450287, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 225390 + }, + { + "epoch": 0.8579280314852736, + "grad_norm": 0.1251644492149353, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 225400 + }, + { + "epoch": 0.8579660939533963, + "grad_norm": 0.13173510134220123, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 225410 + }, + { + "epoch": 0.858004156421519, + "grad_norm": 0.12391963601112366, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 225420 + }, + { + "epoch": 0.8580422188896417, + "grad_norm": 0.1279720813035965, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 225430 + }, + { + "epoch": 0.8580802813577644, + "grad_norm": 0.11828334629535675, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 225440 + }, + { + "epoch": 0.858118343825887, + "grad_norm": 0.12441591918468475, + "learning_rate": 0.0005, + "loss": 2.1229, + "step": 225450 + }, + { + "epoch": 0.8581564062940097, + "grad_norm": 0.12340865284204483, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 225460 + }, + { + "epoch": 0.8581944687621325, + "grad_norm": 0.11606843024492264, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 225470 + }, + { + "epoch": 0.8582325312302551, + "grad_norm": 0.12435857206583023, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 225480 + }, + { + "epoch": 0.8582705936983778, + "grad_norm": 0.12200822681188583, + "learning_rate": 0.0005, + "loss": 2.124, + "step": 225490 + }, + { + "epoch": 0.8583086561665004, + "grad_norm": 0.1329309344291687, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 225500 + }, + { + "epoch": 0.8583467186346232, + "grad_norm": 0.12208396941423416, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 225510 + }, + { + "epoch": 0.8583847811027459, + "grad_norm": 0.12494415789842606, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 225520 + }, + { + "epoch": 0.8584228435708685, + "grad_norm": 0.13383197784423828, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 225530 + }, + { + "epoch": 0.8584609060389912, + "grad_norm": 0.13817362487316132, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 225540 + }, + { + "epoch": 0.8584989685071138, + "grad_norm": 0.12982138991355896, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 225550 + }, + { + "epoch": 0.8585370309752366, + "grad_norm": 0.12778323888778687, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 225560 + }, + { + "epoch": 0.8585750934433592, + "grad_norm": 0.13495448231697083, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 225570 + }, + { + "epoch": 0.8586131559114819, + "grad_norm": 0.12340793013572693, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 225580 + }, + { + "epoch": 0.8586512183796046, + "grad_norm": 0.12468979507684708, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 225590 + }, + { + "epoch": 0.8586892808477273, + "grad_norm": 0.12642750144004822, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 225600 + }, + { + "epoch": 0.85872734331585, + "grad_norm": 0.1208629161119461, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 225610 + }, + { + "epoch": 0.8587654057839726, + "grad_norm": 0.11862723529338837, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 225620 + }, + { + "epoch": 0.8588034682520953, + "grad_norm": 0.1195588931441307, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 225630 + }, + { + "epoch": 0.8588415307202181, + "grad_norm": 0.12250778824090958, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 225640 + }, + { + "epoch": 0.8588795931883407, + "grad_norm": 0.13086935877799988, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 225650 + }, + { + "epoch": 0.8589176556564634, + "grad_norm": 0.11836916953325272, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 225660 + }, + { + "epoch": 0.858955718124586, + "grad_norm": 0.12411430478096008, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 225670 + }, + { + "epoch": 0.8589937805927087, + "grad_norm": 0.12509943544864655, + "learning_rate": 0.0005, + "loss": 2.0843, + "step": 225680 + }, + { + "epoch": 0.8590318430608315, + "grad_norm": 0.14191097021102905, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 225690 + }, + { + "epoch": 0.8590699055289541, + "grad_norm": 0.13417589664459229, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 225700 + }, + { + "epoch": 0.8591079679970768, + "grad_norm": 0.1313231736421585, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 225710 + }, + { + "epoch": 0.8591460304651994, + "grad_norm": 0.12625999748706818, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 225720 + }, + { + "epoch": 0.8591840929333222, + "grad_norm": 0.13181033730506897, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 225730 + }, + { + "epoch": 0.8592221554014449, + "grad_norm": 0.13501916825771332, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 225740 + }, + { + "epoch": 0.8592602178695675, + "grad_norm": 0.12991739809513092, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 225750 + }, + { + "epoch": 0.8592982803376902, + "grad_norm": 0.13025683164596558, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 225760 + }, + { + "epoch": 0.859336342805813, + "grad_norm": 0.12073546648025513, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 225770 + }, + { + "epoch": 0.8593744052739356, + "grad_norm": 0.12659108638763428, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 225780 + }, + { + "epoch": 0.8594124677420583, + "grad_norm": 0.12313074618577957, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 225790 + }, + { + "epoch": 0.8594505302101809, + "grad_norm": 0.12898115813732147, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 225800 + }, + { + "epoch": 0.8594885926783037, + "grad_norm": 0.1190689355134964, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 225810 + }, + { + "epoch": 0.8595266551464263, + "grad_norm": 0.13219016790390015, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 225820 + }, + { + "epoch": 0.859564717614549, + "grad_norm": 0.14062274992465973, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 225830 + }, + { + "epoch": 0.8596027800826717, + "grad_norm": 0.11882352828979492, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 225840 + }, + { + "epoch": 0.8596408425507943, + "grad_norm": 0.13080590963363647, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 225850 + }, + { + "epoch": 0.8596789050189171, + "grad_norm": 0.1176263764500618, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 225860 + }, + { + "epoch": 0.8597169674870397, + "grad_norm": 0.12545640766620636, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 225870 + }, + { + "epoch": 0.8597550299551624, + "grad_norm": 0.13276277482509613, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 225880 + }, + { + "epoch": 0.859793092423285, + "grad_norm": 0.13441850244998932, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 225890 + }, + { + "epoch": 0.8598311548914078, + "grad_norm": 0.12007158249616623, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 225900 + }, + { + "epoch": 0.8598692173595305, + "grad_norm": 0.13201579451560974, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 225910 + }, + { + "epoch": 0.8599072798276531, + "grad_norm": 0.12150247395038605, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 225920 + }, + { + "epoch": 0.8599453422957758, + "grad_norm": 0.12309324741363525, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 225930 + }, + { + "epoch": 0.8599834047638986, + "grad_norm": 0.11839766800403595, + "learning_rate": 0.0005, + "loss": 2.0867, + "step": 225940 + }, + { + "epoch": 0.8600214672320212, + "grad_norm": 0.13090623915195465, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 225950 + }, + { + "epoch": 0.8600595297001439, + "grad_norm": 0.13575056195259094, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 225960 + }, + { + "epoch": 0.8600975921682665, + "grad_norm": 0.127581387758255, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 225970 + }, + { + "epoch": 0.8601356546363892, + "grad_norm": 0.12778790295124054, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 225980 + }, + { + "epoch": 0.860173717104512, + "grad_norm": 0.12477439641952515, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 225990 + }, + { + "epoch": 0.8602117795726346, + "grad_norm": 0.11519020795822144, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 226000 + }, + { + "epoch": 0.8602498420407573, + "grad_norm": 0.12137072533369064, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 226010 + }, + { + "epoch": 0.8602879045088799, + "grad_norm": 0.12396124005317688, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 226020 + }, + { + "epoch": 0.8603259669770027, + "grad_norm": 0.13137753307819366, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 226030 + }, + { + "epoch": 0.8603640294451254, + "grad_norm": 0.1262117326259613, + "learning_rate": 0.0005, + "loss": 2.1239, + "step": 226040 + }, + { + "epoch": 0.860402091913248, + "grad_norm": 0.12531767785549164, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 226050 + }, + { + "epoch": 0.8604401543813707, + "grad_norm": 0.12092337757349014, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 226060 + }, + { + "epoch": 0.8604782168494934, + "grad_norm": 0.13076327741146088, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 226070 + }, + { + "epoch": 0.8605162793176161, + "grad_norm": 0.12410133332014084, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 226080 + }, + { + "epoch": 0.8605543417857388, + "grad_norm": 0.1234215795993805, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 226090 + }, + { + "epoch": 0.8605924042538614, + "grad_norm": 0.13111284375190735, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 226100 + }, + { + "epoch": 0.8606304667219841, + "grad_norm": 0.11520785838365555, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 226110 + }, + { + "epoch": 0.8606685291901068, + "grad_norm": 0.12031671404838562, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 226120 + }, + { + "epoch": 0.8607065916582295, + "grad_norm": 0.12421654164791107, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 226130 + }, + { + "epoch": 0.8607446541263521, + "grad_norm": 0.12553544342517853, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 226140 + }, + { + "epoch": 0.8607827165944748, + "grad_norm": 0.12797226011753082, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 226150 + }, + { + "epoch": 0.8608207790625976, + "grad_norm": 0.13866060972213745, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 226160 + }, + { + "epoch": 0.8608588415307202, + "grad_norm": 0.1263991892337799, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 226170 + }, + { + "epoch": 0.8608969039988429, + "grad_norm": 0.1374591439962387, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 226180 + }, + { + "epoch": 0.8609349664669655, + "grad_norm": 0.12693995237350464, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 226190 + }, + { + "epoch": 0.8609730289350883, + "grad_norm": 0.1270032674074173, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 226200 + }, + { + "epoch": 0.861011091403211, + "grad_norm": 0.13378532230854034, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 226210 + }, + { + "epoch": 0.8610491538713336, + "grad_norm": 0.1346651166677475, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 226220 + }, + { + "epoch": 0.8610872163394563, + "grad_norm": 0.12393094599246979, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 226230 + }, + { + "epoch": 0.861125278807579, + "grad_norm": 0.15087266266345978, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 226240 + }, + { + "epoch": 0.8611633412757017, + "grad_norm": 0.11483550816774368, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 226250 + }, + { + "epoch": 0.8612014037438244, + "grad_norm": 0.1254885494709015, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 226260 + }, + { + "epoch": 0.861239466211947, + "grad_norm": 0.12374576181173325, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 226270 + }, + { + "epoch": 0.8612775286800697, + "grad_norm": 0.12660790979862213, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 226280 + }, + { + "epoch": 0.8613155911481924, + "grad_norm": 0.12744198739528656, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 226290 + }, + { + "epoch": 0.8613536536163151, + "grad_norm": 0.12421263754367828, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 226300 + }, + { + "epoch": 0.8613917160844378, + "grad_norm": 0.11595192551612854, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 226310 + }, + { + "epoch": 0.8614297785525604, + "grad_norm": 0.12452074885368347, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 226320 + }, + { + "epoch": 0.8614678410206832, + "grad_norm": 0.12246125936508179, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 226330 + }, + { + "epoch": 0.8615059034888058, + "grad_norm": 0.13508029282093048, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 226340 + }, + { + "epoch": 0.8615439659569285, + "grad_norm": 0.11969486624002457, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 226350 + }, + { + "epoch": 0.8615820284250512, + "grad_norm": 0.1402081400156021, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 226360 + }, + { + "epoch": 0.8616200908931739, + "grad_norm": 0.1315373182296753, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 226370 + }, + { + "epoch": 0.8616581533612966, + "grad_norm": 0.13985498249530792, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 226380 + }, + { + "epoch": 0.8616962158294192, + "grad_norm": 0.12545058131217957, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 226390 + }, + { + "epoch": 0.8617342782975419, + "grad_norm": 0.17630834877490997, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 226400 + }, + { + "epoch": 0.8617723407656646, + "grad_norm": 0.13177427649497986, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 226410 + }, + { + "epoch": 0.8618104032337873, + "grad_norm": 0.12645024061203003, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 226420 + }, + { + "epoch": 0.86184846570191, + "grad_norm": 0.13735495507717133, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 226430 + }, + { + "epoch": 0.8618865281700326, + "grad_norm": 0.13273243606090546, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 226440 + }, + { + "epoch": 0.8619245906381553, + "grad_norm": 0.13654807209968567, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 226450 + }, + { + "epoch": 0.8619626531062781, + "grad_norm": 0.11886227875947952, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 226460 + }, + { + "epoch": 0.8620007155744007, + "grad_norm": 0.1377406120300293, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 226470 + }, + { + "epoch": 0.8620387780425234, + "grad_norm": 0.13204039633274078, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 226480 + }, + { + "epoch": 0.862076840510646, + "grad_norm": 0.13246789574623108, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 226490 + }, + { + "epoch": 0.8621149029787688, + "grad_norm": 0.14469172060489655, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 226500 + }, + { + "epoch": 0.8621529654468915, + "grad_norm": 0.13322791457176208, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 226510 + }, + { + "epoch": 0.8621910279150141, + "grad_norm": 0.13450798392295837, + "learning_rate": 0.0005, + "loss": 2.0884, + "step": 226520 + }, + { + "epoch": 0.8622290903831368, + "grad_norm": 0.12661725282669067, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 226530 + }, + { + "epoch": 0.8622671528512594, + "grad_norm": 0.12003296613693237, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 226540 + }, + { + "epoch": 0.8623052153193822, + "grad_norm": 0.16400742530822754, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 226550 + }, + { + "epoch": 0.8623432777875049, + "grad_norm": 0.12328396737575531, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 226560 + }, + { + "epoch": 0.8623813402556275, + "grad_norm": 0.11483404785394669, + "learning_rate": 0.0005, + "loss": 2.0864, + "step": 226570 + }, + { + "epoch": 0.8624194027237502, + "grad_norm": 0.12388511002063751, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 226580 + }, + { + "epoch": 0.8624574651918729, + "grad_norm": 0.11999114602804184, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 226590 + }, + { + "epoch": 0.8624955276599956, + "grad_norm": 0.12741059064865112, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 226600 + }, + { + "epoch": 0.8625335901281183, + "grad_norm": 0.12094668298959732, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 226610 + }, + { + "epoch": 0.8625716525962409, + "grad_norm": 0.11427821218967438, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 226620 + }, + { + "epoch": 0.8626097150643637, + "grad_norm": 0.12909498810768127, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 226630 + }, + { + "epoch": 0.8626477775324863, + "grad_norm": 0.29847195744514465, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 226640 + }, + { + "epoch": 0.862685840000609, + "grad_norm": 0.1321536898612976, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 226650 + }, + { + "epoch": 0.8627239024687317, + "grad_norm": 0.12902212142944336, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 226660 + }, + { + "epoch": 0.8627619649368544, + "grad_norm": 0.13099472224712372, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 226670 + }, + { + "epoch": 0.8628000274049771, + "grad_norm": 0.12781858444213867, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 226680 + }, + { + "epoch": 0.8628380898730997, + "grad_norm": 0.12633411586284637, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 226690 + }, + { + "epoch": 0.8628761523412224, + "grad_norm": 0.13626986742019653, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 226700 + }, + { + "epoch": 0.862914214809345, + "grad_norm": 0.13220657408237457, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 226710 + }, + { + "epoch": 0.8629522772774678, + "grad_norm": 0.11806018650531769, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 226720 + }, + { + "epoch": 0.8629903397455905, + "grad_norm": 0.12303390353918076, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 226730 + }, + { + "epoch": 0.8630284022137131, + "grad_norm": 0.12535367906093597, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 226740 + }, + { + "epoch": 0.8630664646818358, + "grad_norm": 0.12739968299865723, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 226750 + }, + { + "epoch": 0.8631045271499586, + "grad_norm": 0.12450426071882248, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 226760 + }, + { + "epoch": 0.8631425896180812, + "grad_norm": 0.12579986453056335, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 226770 + }, + { + "epoch": 0.8631806520862039, + "grad_norm": 0.1489512324333191, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 226780 + }, + { + "epoch": 0.8632187145543265, + "grad_norm": 0.12502668797969818, + "learning_rate": 0.0005, + "loss": 2.0738, + "step": 226790 + }, + { + "epoch": 0.8632567770224493, + "grad_norm": 0.13363705575466156, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 226800 + }, + { + "epoch": 0.863294839490572, + "grad_norm": 0.12163389474153519, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 226810 + }, + { + "epoch": 0.8633329019586946, + "grad_norm": 0.13291381299495697, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 226820 + }, + { + "epoch": 0.8633709644268173, + "grad_norm": 0.12920823693275452, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 226830 + }, + { + "epoch": 0.8634090268949399, + "grad_norm": 0.1237969845533371, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 226840 + }, + { + "epoch": 0.8634470893630627, + "grad_norm": 0.1371234804391861, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 226850 + }, + { + "epoch": 0.8634851518311853, + "grad_norm": 0.12614992260932922, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 226860 + }, + { + "epoch": 0.863523214299308, + "grad_norm": 0.13154858350753784, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 226870 + }, + { + "epoch": 0.8635612767674307, + "grad_norm": 0.13891302049160004, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 226880 + }, + { + "epoch": 0.8635993392355534, + "grad_norm": 0.147722527384758, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 226890 + }, + { + "epoch": 0.8636374017036761, + "grad_norm": 0.1390915811061859, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 226900 + }, + { + "epoch": 0.8636754641717987, + "grad_norm": 0.11712667346000671, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 226910 + }, + { + "epoch": 0.8637135266399214, + "grad_norm": 0.12969042360782623, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 226920 + }, + { + "epoch": 0.8637515891080442, + "grad_norm": 0.12357570976018906, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 226930 + }, + { + "epoch": 0.8637896515761668, + "grad_norm": 0.12982304394245148, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 226940 + }, + { + "epoch": 0.8638277140442895, + "grad_norm": 0.11862597614526749, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 226950 + }, + { + "epoch": 0.8638657765124121, + "grad_norm": 0.11904960125684738, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 226960 + }, + { + "epoch": 0.8639038389805348, + "grad_norm": 0.11986031383275986, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 226970 + }, + { + "epoch": 0.8639419014486576, + "grad_norm": 0.13269126415252686, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 226980 + }, + { + "epoch": 0.8639799639167802, + "grad_norm": 0.16061635315418243, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 226990 + }, + { + "epoch": 0.8640180263849029, + "grad_norm": 0.12891733646392822, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 227000 + }, + { + "epoch": 0.8640560888530255, + "grad_norm": 0.12706734240055084, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 227010 + }, + { + "epoch": 0.8640941513211483, + "grad_norm": 0.12456909567117691, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 227020 + }, + { + "epoch": 0.864132213789271, + "grad_norm": 0.13296130299568176, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 227030 + }, + { + "epoch": 0.8641702762573936, + "grad_norm": 0.1313837468624115, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 227040 + }, + { + "epoch": 0.8642083387255163, + "grad_norm": 0.12122230976819992, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 227050 + }, + { + "epoch": 0.864246401193639, + "grad_norm": 0.12515757977962494, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 227060 + }, + { + "epoch": 0.8642844636617617, + "grad_norm": 0.11461256444454193, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 227070 + }, + { + "epoch": 0.8643225261298844, + "grad_norm": 0.12336868047714233, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 227080 + }, + { + "epoch": 0.864360588598007, + "grad_norm": 0.12718035280704498, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 227090 + }, + { + "epoch": 0.8643986510661298, + "grad_norm": 0.13720561563968658, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 227100 + }, + { + "epoch": 0.8644367135342524, + "grad_norm": 0.11648658663034439, + "learning_rate": 0.0005, + "loss": 2.086, + "step": 227110 + }, + { + "epoch": 0.8644747760023751, + "grad_norm": 0.12198827415704727, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 227120 + }, + { + "epoch": 0.8645128384704978, + "grad_norm": 0.12043623626232147, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 227130 + }, + { + "epoch": 0.8645509009386204, + "grad_norm": 0.11719216406345367, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 227140 + }, + { + "epoch": 0.8645889634067432, + "grad_norm": 0.1277819722890854, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 227150 + }, + { + "epoch": 0.8646270258748658, + "grad_norm": 0.1243322491645813, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 227160 + }, + { + "epoch": 0.8646650883429885, + "grad_norm": 0.12716972827911377, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 227170 + }, + { + "epoch": 0.8647031508111112, + "grad_norm": 0.14013415575027466, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 227180 + }, + { + "epoch": 0.8647412132792339, + "grad_norm": 0.12532857060432434, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 227190 + }, + { + "epoch": 0.8647792757473566, + "grad_norm": 0.12132761627435684, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 227200 + }, + { + "epoch": 0.8648173382154792, + "grad_norm": 0.1268184930086136, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 227210 + }, + { + "epoch": 0.8648554006836019, + "grad_norm": 0.12742270529270172, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 227220 + }, + { + "epoch": 0.8648934631517247, + "grad_norm": 0.12107336521148682, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 227230 + }, + { + "epoch": 0.8649315256198473, + "grad_norm": 0.13903018832206726, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 227240 + }, + { + "epoch": 0.86496958808797, + "grad_norm": 0.1315573751926422, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 227250 + }, + { + "epoch": 0.8650076505560926, + "grad_norm": 0.1224905326962471, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 227260 + }, + { + "epoch": 0.8650457130242153, + "grad_norm": 0.122121661901474, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 227270 + }, + { + "epoch": 0.8650837754923381, + "grad_norm": 0.12318921089172363, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 227280 + }, + { + "epoch": 0.8651218379604607, + "grad_norm": 0.12507276237010956, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 227290 + }, + { + "epoch": 0.8651599004285834, + "grad_norm": 0.12425078451633453, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 227300 + }, + { + "epoch": 0.865197962896706, + "grad_norm": 0.1246219128370285, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 227310 + }, + { + "epoch": 0.8652360253648288, + "grad_norm": 0.12110217660665512, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 227320 + }, + { + "epoch": 0.8652740878329515, + "grad_norm": 0.11924567073583603, + "learning_rate": 0.0005, + "loss": 2.0879, + "step": 227330 + }, + { + "epoch": 0.8653121503010741, + "grad_norm": 0.11475517600774765, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 227340 + }, + { + "epoch": 0.8653502127691968, + "grad_norm": 0.12574663758277893, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 227350 + }, + { + "epoch": 0.8653882752373195, + "grad_norm": 0.12827351689338684, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 227360 + }, + { + "epoch": 0.8654263377054422, + "grad_norm": 0.11690565198659897, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 227370 + }, + { + "epoch": 0.8654644001735649, + "grad_norm": 0.137576162815094, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 227380 + }, + { + "epoch": 0.8655024626416875, + "grad_norm": 0.11919543892145157, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 227390 + }, + { + "epoch": 0.8655405251098102, + "grad_norm": 0.1196046695113182, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 227400 + }, + { + "epoch": 0.8655785875779329, + "grad_norm": 0.11969230324029922, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 227410 + }, + { + "epoch": 0.8656166500460556, + "grad_norm": 0.13001693785190582, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 227420 + }, + { + "epoch": 0.8656547125141782, + "grad_norm": 0.1436636596918106, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 227430 + }, + { + "epoch": 0.8656927749823009, + "grad_norm": 0.12186801433563232, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 227440 + }, + { + "epoch": 0.8657308374504237, + "grad_norm": 0.11917836219072342, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 227450 + }, + { + "epoch": 0.8657688999185463, + "grad_norm": 0.16602906584739685, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 227460 + }, + { + "epoch": 0.865806962386669, + "grad_norm": 0.12305092066526413, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 227470 + }, + { + "epoch": 0.8658450248547916, + "grad_norm": 0.12628142535686493, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 227480 + }, + { + "epoch": 0.8658830873229144, + "grad_norm": 0.12511824071407318, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 227490 + }, + { + "epoch": 0.8659211497910371, + "grad_norm": 0.12373516708612442, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 227500 + }, + { + "epoch": 0.8659592122591597, + "grad_norm": 0.12012782692909241, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 227510 + }, + { + "epoch": 0.8659972747272824, + "grad_norm": 0.1300332099199295, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 227520 + }, + { + "epoch": 0.8660353371954052, + "grad_norm": 0.12339738011360168, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 227530 + }, + { + "epoch": 0.8660733996635278, + "grad_norm": 0.13832895457744598, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 227540 + }, + { + "epoch": 0.8661114621316505, + "grad_norm": 0.12086351215839386, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 227550 + }, + { + "epoch": 0.8661495245997731, + "grad_norm": 0.1234988421201706, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 227560 + }, + { + "epoch": 0.8661875870678958, + "grad_norm": 0.1281185895204544, + "learning_rate": 0.0005, + "loss": 2.1284, + "step": 227570 + }, + { + "epoch": 0.8662256495360185, + "grad_norm": 0.12379458546638489, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 227580 + }, + { + "epoch": 0.8662637120041412, + "grad_norm": 0.12381254881620407, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 227590 + }, + { + "epoch": 0.8663017744722639, + "grad_norm": 0.12239328026771545, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 227600 + }, + { + "epoch": 0.8663398369403865, + "grad_norm": 0.12826788425445557, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 227610 + }, + { + "epoch": 0.8663778994085093, + "grad_norm": 0.11990267038345337, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 227620 + }, + { + "epoch": 0.866415961876632, + "grad_norm": 0.12026792019605637, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 227630 + }, + { + "epoch": 0.8664540243447546, + "grad_norm": 0.12082263827323914, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 227640 + }, + { + "epoch": 0.8664920868128773, + "grad_norm": 0.13680067658424377, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 227650 + }, + { + "epoch": 0.866530149281, + "grad_norm": 0.13015596568584442, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 227660 + }, + { + "epoch": 0.8665682117491227, + "grad_norm": 0.1317087709903717, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 227670 + }, + { + "epoch": 0.8666062742172453, + "grad_norm": 0.13107813894748688, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 227680 + }, + { + "epoch": 0.866644336685368, + "grad_norm": 0.11237580329179764, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 227690 + }, + { + "epoch": 0.8666823991534907, + "grad_norm": 0.13148245215415955, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 227700 + }, + { + "epoch": 0.8667204616216134, + "grad_norm": 0.12727142870426178, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 227710 + }, + { + "epoch": 0.8667585240897361, + "grad_norm": 0.12946079671382904, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 227720 + }, + { + "epoch": 0.8667965865578587, + "grad_norm": 0.1307932585477829, + "learning_rate": 0.0005, + "loss": 2.0848, + "step": 227730 + }, + { + "epoch": 0.8668346490259814, + "grad_norm": 0.13611574470996857, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 227740 + }, + { + "epoch": 0.8668727114941042, + "grad_norm": 0.12899921834468842, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 227750 + }, + { + "epoch": 0.8669107739622268, + "grad_norm": 0.14597873389720917, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 227760 + }, + { + "epoch": 0.8669488364303495, + "grad_norm": 0.13091064989566803, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 227770 + }, + { + "epoch": 0.8669868988984721, + "grad_norm": 0.13377290964126587, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 227780 + }, + { + "epoch": 0.8670249613665949, + "grad_norm": 0.12073376029729843, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 227790 + }, + { + "epoch": 0.8670630238347176, + "grad_norm": 0.12615399062633514, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 227800 + }, + { + "epoch": 0.8671010863028402, + "grad_norm": 0.12113891541957855, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 227810 + }, + { + "epoch": 0.8671391487709629, + "grad_norm": 0.1192716658115387, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 227820 + }, + { + "epoch": 0.8671772112390855, + "grad_norm": 0.1256159394979477, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 227830 + }, + { + "epoch": 0.8672152737072083, + "grad_norm": 0.12360091507434845, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 227840 + }, + { + "epoch": 0.867253336175331, + "grad_norm": 0.11907059699296951, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 227850 + }, + { + "epoch": 0.8672913986434536, + "grad_norm": 0.12359027564525604, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 227860 + }, + { + "epoch": 0.8673294611115763, + "grad_norm": 0.12925948202610016, + "learning_rate": 0.0005, + "loss": 2.0875, + "step": 227870 + }, + { + "epoch": 0.867367523579699, + "grad_norm": 0.1352248638868332, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 227880 + }, + { + "epoch": 0.8674055860478217, + "grad_norm": 0.1331569403409958, + "learning_rate": 0.0005, + "loss": 2.0831, + "step": 227890 + }, + { + "epoch": 0.8674436485159444, + "grad_norm": 0.12908479571342468, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 227900 + }, + { + "epoch": 0.867481710984067, + "grad_norm": 0.1242067888379097, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 227910 + }, + { + "epoch": 0.8675197734521898, + "grad_norm": 0.13277634978294373, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 227920 + }, + { + "epoch": 0.8675578359203124, + "grad_norm": 0.12008011341094971, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 227930 + }, + { + "epoch": 0.8675958983884351, + "grad_norm": 0.11581023037433624, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 227940 + }, + { + "epoch": 0.8676339608565578, + "grad_norm": 0.12725763022899628, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 227950 + }, + { + "epoch": 0.8676720233246805, + "grad_norm": 0.11594830453395844, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 227960 + }, + { + "epoch": 0.8677100857928032, + "grad_norm": 0.12017028033733368, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 227970 + }, + { + "epoch": 0.8677481482609258, + "grad_norm": 0.1227264478802681, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 227980 + }, + { + "epoch": 0.8677862107290485, + "grad_norm": 0.12182033061981201, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 227990 + }, + { + "epoch": 0.8678242731971711, + "grad_norm": 0.13833528757095337, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 228000 + }, + { + "epoch": 0.8678623356652939, + "grad_norm": 0.13079838454723358, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 228010 + }, + { + "epoch": 0.8679003981334166, + "grad_norm": 0.13576842844486237, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 228020 + }, + { + "epoch": 0.8679384606015392, + "grad_norm": 0.12961263954639435, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 228030 + }, + { + "epoch": 0.8679765230696619, + "grad_norm": 0.12877993285655975, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 228040 + }, + { + "epoch": 0.8680145855377847, + "grad_norm": 0.12145891785621643, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 228050 + }, + { + "epoch": 0.8680526480059073, + "grad_norm": 0.11703445017337799, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 228060 + }, + { + "epoch": 0.86809071047403, + "grad_norm": 0.12900608777999878, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 228070 + }, + { + "epoch": 0.8681287729421526, + "grad_norm": 0.11951258033514023, + "learning_rate": 0.0005, + "loss": 2.0884, + "step": 228080 + }, + { + "epoch": 0.8681668354102754, + "grad_norm": 0.1197548508644104, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 228090 + }, + { + "epoch": 0.868204897878398, + "grad_norm": 0.12547695636749268, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 228100 + }, + { + "epoch": 0.8682429603465207, + "grad_norm": 0.12415210902690887, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 228110 + }, + { + "epoch": 0.8682810228146434, + "grad_norm": 0.12451770156621933, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 228120 + }, + { + "epoch": 0.868319085282766, + "grad_norm": 0.12714098393917084, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 228130 + }, + { + "epoch": 0.8683571477508888, + "grad_norm": 0.12449847161769867, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 228140 + }, + { + "epoch": 0.8683952102190114, + "grad_norm": 0.11371887475252151, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 228150 + }, + { + "epoch": 0.8684332726871341, + "grad_norm": 0.11736097186803818, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 228160 + }, + { + "epoch": 0.8684713351552568, + "grad_norm": 0.1239897757768631, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 228170 + }, + { + "epoch": 0.8685093976233795, + "grad_norm": 0.13431617617607117, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 228180 + }, + { + "epoch": 0.8685474600915022, + "grad_norm": 0.1289248764514923, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 228190 + }, + { + "epoch": 0.8685855225596248, + "grad_norm": 0.11664478480815887, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 228200 + }, + { + "epoch": 0.8686235850277475, + "grad_norm": 0.13370691239833832, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 228210 + }, + { + "epoch": 0.8686616474958703, + "grad_norm": 0.12480060011148453, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 228220 + }, + { + "epoch": 0.8686997099639929, + "grad_norm": 0.14454995095729828, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 228230 + }, + { + "epoch": 0.8687377724321156, + "grad_norm": 0.13794377446174622, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 228240 + }, + { + "epoch": 0.8687758349002382, + "grad_norm": 0.13001668453216553, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 228250 + }, + { + "epoch": 0.8688138973683609, + "grad_norm": 0.1631646305322647, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 228260 + }, + { + "epoch": 0.8688519598364837, + "grad_norm": 0.13760943710803986, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 228270 + }, + { + "epoch": 0.8688900223046063, + "grad_norm": 0.12570780515670776, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 228280 + }, + { + "epoch": 0.868928084772729, + "grad_norm": 0.1366773247718811, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 228290 + }, + { + "epoch": 0.8689661472408516, + "grad_norm": 0.12486300617456436, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 228300 + }, + { + "epoch": 0.8690042097089744, + "grad_norm": 0.12038671225309372, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 228310 + }, + { + "epoch": 0.8690422721770971, + "grad_norm": 0.12719301879405975, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 228320 + }, + { + "epoch": 0.8690803346452197, + "grad_norm": 0.1270996630191803, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 228330 + }, + { + "epoch": 0.8691183971133424, + "grad_norm": 0.13092835247516632, + "learning_rate": 0.0005, + "loss": 2.0882, + "step": 228340 + }, + { + "epoch": 0.8691564595814651, + "grad_norm": 0.12825457751750946, + "learning_rate": 0.0005, + "loss": 2.0839, + "step": 228350 + }, + { + "epoch": 0.8691945220495878, + "grad_norm": 0.12021090090274811, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 228360 + }, + { + "epoch": 0.8692325845177105, + "grad_norm": 0.13270549476146698, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 228370 + }, + { + "epoch": 0.8692706469858331, + "grad_norm": 0.11613058298826218, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 228380 + }, + { + "epoch": 0.8693087094539559, + "grad_norm": 0.12812206149101257, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 228390 + }, + { + "epoch": 0.8693467719220785, + "grad_norm": 0.13474640250205994, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 228400 + }, + { + "epoch": 0.8693848343902012, + "grad_norm": 0.12155559659004211, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 228410 + }, + { + "epoch": 0.8694228968583239, + "grad_norm": 0.5138123035430908, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 228420 + }, + { + "epoch": 0.8694609593264465, + "grad_norm": 0.12644684314727783, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 228430 + }, + { + "epoch": 0.8694990217945693, + "grad_norm": 0.1481925994157791, + "learning_rate": 0.0005, + "loss": 2.0812, + "step": 228440 + }, + { + "epoch": 0.8695370842626919, + "grad_norm": 0.11693347245454788, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 228450 + }, + { + "epoch": 0.8695751467308146, + "grad_norm": 0.13215351104736328, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 228460 + }, + { + "epoch": 0.8696132091989373, + "grad_norm": 0.11924117803573608, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 228470 + }, + { + "epoch": 0.86965127166706, + "grad_norm": 0.13626554608345032, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 228480 + }, + { + "epoch": 0.8696893341351827, + "grad_norm": 0.13347983360290527, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 228490 + }, + { + "epoch": 0.8697273966033053, + "grad_norm": 0.13828565180301666, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 228500 + }, + { + "epoch": 0.869765459071428, + "grad_norm": 0.1184941977262497, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 228510 + }, + { + "epoch": 0.8698035215395508, + "grad_norm": 0.13307888805866241, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 228520 + }, + { + "epoch": 0.8698415840076734, + "grad_norm": 0.13183127343654633, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 228530 + }, + { + "epoch": 0.8698796464757961, + "grad_norm": 0.13290202617645264, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 228540 + }, + { + "epoch": 0.8699177089439187, + "grad_norm": 0.12918685376644135, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 228550 + }, + { + "epoch": 0.8699557714120414, + "grad_norm": 0.12262940406799316, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 228560 + }, + { + "epoch": 0.8699938338801642, + "grad_norm": 0.12436863780021667, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 228570 + }, + { + "epoch": 0.8700318963482868, + "grad_norm": 0.12879203259944916, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 228580 + }, + { + "epoch": 0.8700699588164095, + "grad_norm": 0.12833668291568756, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 228590 + }, + { + "epoch": 0.8701080212845321, + "grad_norm": 0.13610157370567322, + "learning_rate": 0.0005, + "loss": 2.0854, + "step": 228600 + }, + { + "epoch": 0.8701460837526549, + "grad_norm": 0.12968987226486206, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 228610 + }, + { + "epoch": 0.8701841462207776, + "grad_norm": 0.1264941394329071, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 228620 + }, + { + "epoch": 0.8702222086889002, + "grad_norm": 0.12434609234333038, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 228630 + }, + { + "epoch": 0.8702602711570229, + "grad_norm": 0.12505097687244415, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 228640 + }, + { + "epoch": 0.8702983336251456, + "grad_norm": 0.12018077075481415, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 228650 + }, + { + "epoch": 0.8703363960932683, + "grad_norm": 0.11911716312170029, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 228660 + }, + { + "epoch": 0.870374458561391, + "grad_norm": 0.12822499871253967, + "learning_rate": 0.0005, + "loss": 2.1287, + "step": 228670 + }, + { + "epoch": 0.8704125210295136, + "grad_norm": 0.11791371554136276, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 228680 + }, + { + "epoch": 0.8704505834976363, + "grad_norm": 0.12105327099561691, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 228690 + }, + { + "epoch": 0.870488645965759, + "grad_norm": 0.13159845769405365, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 228700 + }, + { + "epoch": 0.8705267084338817, + "grad_norm": 0.12858182191848755, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 228710 + }, + { + "epoch": 0.8705647709020043, + "grad_norm": 0.12227248400449753, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 228720 + }, + { + "epoch": 0.870602833370127, + "grad_norm": 0.13533693552017212, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 228730 + }, + { + "epoch": 0.8706408958382498, + "grad_norm": 0.128210186958313, + "learning_rate": 0.0005, + "loss": 2.0864, + "step": 228740 + }, + { + "epoch": 0.8706789583063724, + "grad_norm": 0.33541253209114075, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 228750 + }, + { + "epoch": 0.8707170207744951, + "grad_norm": 0.13576674461364746, + "learning_rate": 0.0005, + "loss": 2.1212, + "step": 228760 + }, + { + "epoch": 0.8707550832426177, + "grad_norm": 0.11649937182664871, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 228770 + }, + { + "epoch": 0.8707931457107405, + "grad_norm": 0.12344325333833694, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 228780 + }, + { + "epoch": 0.8708312081788632, + "grad_norm": 0.12784738838672638, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 228790 + }, + { + "epoch": 0.8708692706469858, + "grad_norm": 0.1322634369134903, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 228800 + }, + { + "epoch": 0.8709073331151085, + "grad_norm": 0.1282651573419571, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 228810 + }, + { + "epoch": 0.8709453955832313, + "grad_norm": 0.13180986046791077, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 228820 + }, + { + "epoch": 0.8709834580513539, + "grad_norm": 0.1403912752866745, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 228830 + }, + { + "epoch": 0.8710215205194766, + "grad_norm": 0.12029944360256195, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 228840 + }, + { + "epoch": 0.8710595829875992, + "grad_norm": 0.13395288586616516, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 228850 + }, + { + "epoch": 0.8710976454557219, + "grad_norm": 0.13040871918201447, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 228860 + }, + { + "epoch": 0.8711357079238446, + "grad_norm": 0.12356771528720856, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 228870 + }, + { + "epoch": 0.8711737703919673, + "grad_norm": 0.12945036590099335, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 228880 + }, + { + "epoch": 0.87121183286009, + "grad_norm": 0.12240960448980331, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 228890 + }, + { + "epoch": 0.8712498953282126, + "grad_norm": 0.12477682530879974, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 228900 + }, + { + "epoch": 0.8712879577963354, + "grad_norm": 0.15387369692325592, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 228910 + }, + { + "epoch": 0.871326020264458, + "grad_norm": 0.14190644025802612, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 228920 + }, + { + "epoch": 0.8713640827325807, + "grad_norm": 0.1381376087665558, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 228930 + }, + { + "epoch": 0.8714021452007034, + "grad_norm": 0.16371963918209076, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 228940 + }, + { + "epoch": 0.8714402076688261, + "grad_norm": 0.13157308101654053, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 228950 + }, + { + "epoch": 0.8714782701369488, + "grad_norm": 0.14220169186592102, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 228960 + }, + { + "epoch": 0.8715163326050714, + "grad_norm": 0.13458815217018127, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 228970 + }, + { + "epoch": 0.8715543950731941, + "grad_norm": 0.13978664577007294, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 228980 + }, + { + "epoch": 0.8715924575413168, + "grad_norm": 0.11965165287256241, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 228990 + }, + { + "epoch": 0.8716305200094395, + "grad_norm": 0.1364961862564087, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 229000 + }, + { + "epoch": 0.8716685824775622, + "grad_norm": 0.12621454894542694, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 229010 + }, + { + "epoch": 0.8717066449456848, + "grad_norm": 0.12990035116672516, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 229020 + }, + { + "epoch": 0.8717447074138075, + "grad_norm": 0.13059154152870178, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 229030 + }, + { + "epoch": 0.8717827698819303, + "grad_norm": 0.13521890342235565, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 229040 + }, + { + "epoch": 0.8718208323500529, + "grad_norm": 0.12604497373104095, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 229050 + }, + { + "epoch": 0.8718588948181756, + "grad_norm": 0.13183411955833435, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 229060 + }, + { + "epoch": 0.8718969572862982, + "grad_norm": 0.13138261437416077, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 229070 + }, + { + "epoch": 0.871935019754421, + "grad_norm": 0.1265527904033661, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 229080 + }, + { + "epoch": 0.8719730822225437, + "grad_norm": 0.14647045731544495, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 229090 + }, + { + "epoch": 0.8720111446906663, + "grad_norm": 0.12061312049627304, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 229100 + }, + { + "epoch": 0.872049207158789, + "grad_norm": 0.1324048638343811, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 229110 + }, + { + "epoch": 0.8720872696269116, + "grad_norm": 0.12923435866832733, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 229120 + }, + { + "epoch": 0.8721253320950344, + "grad_norm": 0.13202157616615295, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 229130 + }, + { + "epoch": 0.8721633945631571, + "grad_norm": 0.12037435173988342, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 229140 + }, + { + "epoch": 0.8722014570312797, + "grad_norm": 0.12143620103597641, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 229150 + }, + { + "epoch": 0.8722395194994024, + "grad_norm": 0.1236945390701294, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 229160 + }, + { + "epoch": 0.8722775819675251, + "grad_norm": 0.13332204520702362, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 229170 + }, + { + "epoch": 0.8723156444356478, + "grad_norm": 0.14076372981071472, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 229180 + }, + { + "epoch": 0.8723537069037705, + "grad_norm": 0.13058961927890778, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 229190 + }, + { + "epoch": 0.8723917693718931, + "grad_norm": 0.13476161658763885, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 229200 + }, + { + "epoch": 0.8724298318400159, + "grad_norm": 0.1271485835313797, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 229210 + }, + { + "epoch": 0.8724678943081385, + "grad_norm": 0.12276792526245117, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 229220 + }, + { + "epoch": 0.8725059567762612, + "grad_norm": 0.12720341980457306, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 229230 + }, + { + "epoch": 0.8725440192443839, + "grad_norm": 0.13597077131271362, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 229240 + }, + { + "epoch": 0.8725820817125066, + "grad_norm": 0.1279052197933197, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 229250 + }, + { + "epoch": 0.8726201441806293, + "grad_norm": 0.10936145484447479, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 229260 + }, + { + "epoch": 0.8726582066487519, + "grad_norm": 0.12631534039974213, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 229270 + }, + { + "epoch": 0.8726962691168746, + "grad_norm": 0.12847156822681427, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 229280 + }, + { + "epoch": 0.8727343315849972, + "grad_norm": 0.12804313004016876, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 229290 + }, + { + "epoch": 0.87277239405312, + "grad_norm": 0.1265999674797058, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 229300 + }, + { + "epoch": 0.8728104565212427, + "grad_norm": 0.11968325823545456, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 229310 + }, + { + "epoch": 0.8728485189893653, + "grad_norm": 0.11957523971796036, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 229320 + }, + { + "epoch": 0.872886581457488, + "grad_norm": 0.12701158225536346, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 229330 + }, + { + "epoch": 0.8729246439256108, + "grad_norm": 0.11046332120895386, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 229340 + }, + { + "epoch": 0.8729627063937334, + "grad_norm": 0.1372259110212326, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 229350 + }, + { + "epoch": 0.8730007688618561, + "grad_norm": 0.12271147966384888, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 229360 + }, + { + "epoch": 0.8730388313299787, + "grad_norm": 0.11774443835020065, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 229370 + }, + { + "epoch": 0.8730768937981015, + "grad_norm": 0.11719627678394318, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 229380 + }, + { + "epoch": 0.8731149562662242, + "grad_norm": 0.11640815436840057, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 229390 + }, + { + "epoch": 0.8731530187343468, + "grad_norm": 0.1204354465007782, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 229400 + }, + { + "epoch": 0.8731910812024695, + "grad_norm": 0.12202809005975723, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 229410 + }, + { + "epoch": 0.8732291436705921, + "grad_norm": 0.13924041390419006, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 229420 + }, + { + "epoch": 0.8732672061387149, + "grad_norm": 0.11984602361917496, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 229430 + }, + { + "epoch": 0.8733052686068375, + "grad_norm": 0.11162258684635162, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 229440 + }, + { + "epoch": 0.8733433310749602, + "grad_norm": 0.11645910888910294, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 229450 + }, + { + "epoch": 0.8733813935430829, + "grad_norm": 0.1287803053855896, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 229460 + }, + { + "epoch": 0.8734194560112056, + "grad_norm": 0.1326991766691208, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 229470 + }, + { + "epoch": 0.8734575184793283, + "grad_norm": 0.13610665500164032, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 229480 + }, + { + "epoch": 0.873495580947451, + "grad_norm": 0.13597998023033142, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 229490 + }, + { + "epoch": 0.8735336434155736, + "grad_norm": 0.14227288961410522, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 229500 + }, + { + "epoch": 0.8735717058836964, + "grad_norm": 0.12824498116970062, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 229510 + }, + { + "epoch": 0.873609768351819, + "grad_norm": 0.1353910118341446, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 229520 + }, + { + "epoch": 0.8736478308199417, + "grad_norm": 0.13321471214294434, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 229530 + }, + { + "epoch": 0.8736858932880643, + "grad_norm": 0.12848351895809174, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 229540 + }, + { + "epoch": 0.8737239557561871, + "grad_norm": 0.12468912452459335, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 229550 + }, + { + "epoch": 0.8737620182243098, + "grad_norm": 0.13091306388378143, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 229560 + }, + { + "epoch": 0.8738000806924324, + "grad_norm": 0.12601301074028015, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 229570 + }, + { + "epoch": 0.8738381431605551, + "grad_norm": 0.1305581033229828, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 229580 + }, + { + "epoch": 0.8738762056286777, + "grad_norm": 0.14347653090953827, + "learning_rate": 0.0005, + "loss": 2.0863, + "step": 229590 + }, + { + "epoch": 0.8739142680968005, + "grad_norm": 0.12171580642461777, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 229600 + }, + { + "epoch": 0.8739523305649232, + "grad_norm": 0.11917494237422943, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 229610 + }, + { + "epoch": 0.8739903930330458, + "grad_norm": 0.11979630589485168, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 229620 + }, + { + "epoch": 0.8740284555011685, + "grad_norm": 0.12840092182159424, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 229630 + }, + { + "epoch": 0.8740665179692912, + "grad_norm": 0.1223602220416069, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 229640 + }, + { + "epoch": 0.8741045804374139, + "grad_norm": 0.12229134887456894, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 229650 + }, + { + "epoch": 0.8741426429055366, + "grad_norm": 0.16060106456279755, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 229660 + }, + { + "epoch": 0.8741807053736592, + "grad_norm": 0.11981988698244095, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 229670 + }, + { + "epoch": 0.874218767841782, + "grad_norm": 0.1297769695520401, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 229680 + }, + { + "epoch": 0.8742568303099046, + "grad_norm": 0.12590380012989044, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 229690 + }, + { + "epoch": 0.8742948927780273, + "grad_norm": 0.1368873566389084, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 229700 + }, + { + "epoch": 0.87433295524615, + "grad_norm": 0.1381053775548935, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 229710 + }, + { + "epoch": 0.8743710177142726, + "grad_norm": 0.11610874533653259, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 229720 + }, + { + "epoch": 0.8744090801823954, + "grad_norm": 0.12096839398145676, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 229730 + }, + { + "epoch": 0.874447142650518, + "grad_norm": 0.130154088139534, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 229740 + }, + { + "epoch": 0.8744852051186407, + "grad_norm": 0.13154588639736176, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 229750 + }, + { + "epoch": 0.8745232675867634, + "grad_norm": 0.13115628063678741, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 229760 + }, + { + "epoch": 0.8745613300548861, + "grad_norm": 0.1279299259185791, + "learning_rate": 0.0005, + "loss": 2.0815, + "step": 229770 + }, + { + "epoch": 0.8745993925230088, + "grad_norm": 0.11625447869300842, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 229780 + }, + { + "epoch": 0.8746374549911314, + "grad_norm": 0.12501411139965057, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 229790 + }, + { + "epoch": 0.8746755174592541, + "grad_norm": 0.12068720906972885, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 229800 + }, + { + "epoch": 0.8747135799273769, + "grad_norm": 0.12535758316516876, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 229810 + }, + { + "epoch": 0.8747516423954995, + "grad_norm": 0.12687882781028748, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 229820 + }, + { + "epoch": 0.8747897048636222, + "grad_norm": 0.127311110496521, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 229830 + }, + { + "epoch": 0.8748277673317448, + "grad_norm": 0.12320202589035034, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 229840 + }, + { + "epoch": 0.8748658297998675, + "grad_norm": 0.12253516167402267, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 229850 + }, + { + "epoch": 0.8749038922679903, + "grad_norm": 0.12297092378139496, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 229860 + }, + { + "epoch": 0.8749419547361129, + "grad_norm": 0.11488507688045502, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 229870 + }, + { + "epoch": 0.8749800172042356, + "grad_norm": 0.12407977879047394, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 229880 + }, + { + "epoch": 0.8750180796723582, + "grad_norm": 0.11904379725456238, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 229890 + }, + { + "epoch": 0.875056142140481, + "grad_norm": 0.13884297013282776, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 229900 + }, + { + "epoch": 0.8750942046086037, + "grad_norm": 0.1380092203617096, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 229910 + }, + { + "epoch": 0.8751322670767263, + "grad_norm": 0.1314605325460434, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 229920 + }, + { + "epoch": 0.875170329544849, + "grad_norm": 0.13077495992183685, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 229930 + }, + { + "epoch": 0.8752083920129717, + "grad_norm": 0.11464505642652512, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 229940 + }, + { + "epoch": 0.8752464544810944, + "grad_norm": 0.1231277585029602, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 229950 + }, + { + "epoch": 0.875284516949217, + "grad_norm": 0.11799775063991547, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 229960 + }, + { + "epoch": 0.8753225794173397, + "grad_norm": 0.1273152232170105, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 229970 + }, + { + "epoch": 0.8753606418854625, + "grad_norm": 0.12103879451751709, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 229980 + }, + { + "epoch": 0.8753987043535851, + "grad_norm": 0.12625528872013092, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 229990 + }, + { + "epoch": 0.8754367668217078, + "grad_norm": 0.14012883603572845, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 230000 + }, + { + "epoch": 0.8754748292898304, + "grad_norm": 0.14058808982372284, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 230010 + }, + { + "epoch": 0.8755128917579531, + "grad_norm": 0.1262475699186325, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 230020 + }, + { + "epoch": 0.8755509542260759, + "grad_norm": 0.12014681100845337, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 230030 + }, + { + "epoch": 0.8755890166941985, + "grad_norm": 0.12667715549468994, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 230040 + }, + { + "epoch": 0.8756270791623212, + "grad_norm": 0.12279102951288223, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 230050 + }, + { + "epoch": 0.8756651416304438, + "grad_norm": 0.13368308544158936, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 230060 + }, + { + "epoch": 0.8757032040985666, + "grad_norm": 0.12176341563463211, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 230070 + }, + { + "epoch": 0.8757412665666893, + "grad_norm": 0.12322124093770981, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 230080 + }, + { + "epoch": 0.8757793290348119, + "grad_norm": 0.12963128089904785, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 230090 + }, + { + "epoch": 0.8758173915029346, + "grad_norm": 0.1318719983100891, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 230100 + }, + { + "epoch": 0.8758554539710574, + "grad_norm": 0.12382566928863525, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 230110 + }, + { + "epoch": 0.87589351643918, + "grad_norm": 0.13396726548671722, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 230120 + }, + { + "epoch": 0.8759315789073027, + "grad_norm": 0.12979789078235626, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 230130 + }, + { + "epoch": 0.8759696413754253, + "grad_norm": 0.12729598581790924, + "learning_rate": 0.0005, + "loss": 2.0814, + "step": 230140 + }, + { + "epoch": 0.876007703843548, + "grad_norm": 0.12816545367240906, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 230150 + }, + { + "epoch": 0.8760457663116707, + "grad_norm": 0.1208256185054779, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 230160 + }, + { + "epoch": 0.8760838287797934, + "grad_norm": 0.13537472486495972, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 230170 + }, + { + "epoch": 0.8761218912479161, + "grad_norm": 0.133370503783226, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 230180 + }, + { + "epoch": 0.8761599537160387, + "grad_norm": 0.13083863258361816, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 230190 + }, + { + "epoch": 0.8761980161841615, + "grad_norm": 0.13987453281879425, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 230200 + }, + { + "epoch": 0.8762360786522841, + "grad_norm": 0.1147368848323822, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 230210 + }, + { + "epoch": 0.8762741411204068, + "grad_norm": 0.1245139092206955, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 230220 + }, + { + "epoch": 0.8763122035885295, + "grad_norm": 0.13482628762722015, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 230230 + }, + { + "epoch": 0.8763502660566522, + "grad_norm": 0.129679337143898, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 230240 + }, + { + "epoch": 0.8763883285247749, + "grad_norm": 0.15952397882938385, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 230250 + }, + { + "epoch": 0.8764263909928975, + "grad_norm": 1.1382399797439575, + "learning_rate": 0.0005, + "loss": 2.0845, + "step": 230260 + }, + { + "epoch": 0.8764644534610202, + "grad_norm": 0.14629729092121124, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 230270 + }, + { + "epoch": 0.8765025159291429, + "grad_norm": 0.12604254484176636, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 230280 + }, + { + "epoch": 0.8765405783972656, + "grad_norm": 0.1218118742108345, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 230290 + }, + { + "epoch": 0.8765786408653883, + "grad_norm": 0.13148868083953857, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 230300 + }, + { + "epoch": 0.8766167033335109, + "grad_norm": 0.11712725460529327, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 230310 + }, + { + "epoch": 0.8766547658016336, + "grad_norm": 0.14665967226028442, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 230320 + }, + { + "epoch": 0.8766928282697564, + "grad_norm": 0.12293502688407898, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 230330 + }, + { + "epoch": 0.876730890737879, + "grad_norm": 0.13239432871341705, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 230340 + }, + { + "epoch": 0.8767689532060017, + "grad_norm": 0.11775525659322739, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 230350 + }, + { + "epoch": 0.8768070156741243, + "grad_norm": 0.14179232716560364, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 230360 + }, + { + "epoch": 0.8768450781422471, + "grad_norm": 0.12616828083992004, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 230370 + }, + { + "epoch": 0.8768831406103698, + "grad_norm": 0.12409175932407379, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 230380 + }, + { + "epoch": 0.8769212030784924, + "grad_norm": 0.1291017383337021, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 230390 + }, + { + "epoch": 0.8769592655466151, + "grad_norm": 0.1327333301305771, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 230400 + }, + { + "epoch": 0.8769973280147378, + "grad_norm": 0.12073655426502228, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 230410 + }, + { + "epoch": 0.8770353904828605, + "grad_norm": 0.12345512956380844, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 230420 + }, + { + "epoch": 0.8770734529509832, + "grad_norm": 0.12081936746835709, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 230430 + }, + { + "epoch": 0.8771115154191058, + "grad_norm": 0.1453080177307129, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 230440 + }, + { + "epoch": 0.8771495778872285, + "grad_norm": 0.13501416146755219, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 230450 + }, + { + "epoch": 0.8771876403553512, + "grad_norm": 0.12920066714286804, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 230460 + }, + { + "epoch": 0.8772257028234739, + "grad_norm": 0.13471481204032898, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 230470 + }, + { + "epoch": 0.8772637652915966, + "grad_norm": 0.12981899082660675, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 230480 + }, + { + "epoch": 0.8773018277597192, + "grad_norm": 0.13292816281318665, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 230490 + }, + { + "epoch": 0.877339890227842, + "grad_norm": 0.12929858267307281, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 230500 + }, + { + "epoch": 0.8773779526959646, + "grad_norm": 0.13031207025051117, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 230510 + }, + { + "epoch": 0.8774160151640873, + "grad_norm": 0.11181312799453735, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 230520 + }, + { + "epoch": 0.87745407763221, + "grad_norm": 0.13998934626579285, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 230530 + }, + { + "epoch": 0.8774921401003327, + "grad_norm": 0.13008730113506317, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 230540 + }, + { + "epoch": 0.8775302025684554, + "grad_norm": 0.13119496405124664, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 230550 + }, + { + "epoch": 0.877568265036578, + "grad_norm": 0.12499556690454483, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 230560 + }, + { + "epoch": 0.8776063275047007, + "grad_norm": 0.1330575793981552, + "learning_rate": 0.0005, + "loss": 2.083, + "step": 230570 + }, + { + "epoch": 0.8776443899728233, + "grad_norm": 0.13125604391098022, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 230580 + }, + { + "epoch": 0.8776824524409461, + "grad_norm": 0.13973161578178406, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 230590 + }, + { + "epoch": 0.8777205149090688, + "grad_norm": 0.12531033158302307, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 230600 + }, + { + "epoch": 0.8777585773771914, + "grad_norm": 0.14181895554065704, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 230610 + }, + { + "epoch": 0.8777966398453141, + "grad_norm": 0.14905595779418945, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 230620 + }, + { + "epoch": 0.8778347023134369, + "grad_norm": 0.1413782238960266, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 230630 + }, + { + "epoch": 0.8778727647815595, + "grad_norm": 0.12952205538749695, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 230640 + }, + { + "epoch": 0.8779108272496822, + "grad_norm": 0.12465883791446686, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 230650 + }, + { + "epoch": 0.8779488897178048, + "grad_norm": 0.13702189922332764, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 230660 + }, + { + "epoch": 0.8779869521859276, + "grad_norm": 0.15412265062332153, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 230670 + }, + { + "epoch": 0.8780250146540503, + "grad_norm": 0.13058574497699738, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 230680 + }, + { + "epoch": 0.8780630771221729, + "grad_norm": 0.12237841635942459, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 230690 + }, + { + "epoch": 0.8781011395902956, + "grad_norm": 0.13826708495616913, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 230700 + }, + { + "epoch": 0.8781392020584182, + "grad_norm": 0.13739526271820068, + "learning_rate": 0.0005, + "loss": 2.0878, + "step": 230710 + }, + { + "epoch": 0.878177264526541, + "grad_norm": 0.12953074276447296, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 230720 + }, + { + "epoch": 0.8782153269946636, + "grad_norm": 0.13328810036182404, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 230730 + }, + { + "epoch": 0.8782533894627863, + "grad_norm": 0.11889463663101196, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 230740 + }, + { + "epoch": 0.878291451930909, + "grad_norm": 0.12260544300079346, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 230750 + }, + { + "epoch": 0.8783295143990317, + "grad_norm": 0.12881432473659515, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 230760 + }, + { + "epoch": 0.8783675768671544, + "grad_norm": 0.12546837329864502, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 230770 + }, + { + "epoch": 0.878405639335277, + "grad_norm": 0.13221587240695953, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 230780 + }, + { + "epoch": 0.8784437018033997, + "grad_norm": 0.128005713224411, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 230790 + }, + { + "epoch": 0.8784817642715225, + "grad_norm": 0.11627799272537231, + "learning_rate": 0.0005, + "loss": 2.0746, + "step": 230800 + }, + { + "epoch": 0.8785198267396451, + "grad_norm": 0.13420972228050232, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 230810 + }, + { + "epoch": 0.8785578892077678, + "grad_norm": 0.14359724521636963, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 230820 + }, + { + "epoch": 0.8785959516758904, + "grad_norm": 0.12914758920669556, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 230830 + }, + { + "epoch": 0.8786340141440132, + "grad_norm": 0.13227564096450806, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 230840 + }, + { + "epoch": 0.8786720766121359, + "grad_norm": 0.12736381590366364, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 230850 + }, + { + "epoch": 0.8787101390802585, + "grad_norm": 0.13434436917304993, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 230860 + }, + { + "epoch": 0.8787482015483812, + "grad_norm": 0.12227039039134979, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 230870 + }, + { + "epoch": 0.8787862640165038, + "grad_norm": 0.12851709127426147, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 230880 + }, + { + "epoch": 0.8788243264846266, + "grad_norm": 0.12752105295658112, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 230890 + }, + { + "epoch": 0.8788623889527493, + "grad_norm": 0.12396000325679779, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 230900 + }, + { + "epoch": 0.8789004514208719, + "grad_norm": 0.12684553861618042, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 230910 + }, + { + "epoch": 0.8789385138889946, + "grad_norm": 0.1198529377579689, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 230920 + }, + { + "epoch": 0.8789765763571173, + "grad_norm": 0.13882936537265778, + "learning_rate": 0.0005, + "loss": 2.0874, + "step": 230930 + }, + { + "epoch": 0.87901463882524, + "grad_norm": 0.12034550309181213, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 230940 + }, + { + "epoch": 0.8790527012933627, + "grad_norm": 0.12130250036716461, + "learning_rate": 0.0005, + "loss": 2.1278, + "step": 230950 + }, + { + "epoch": 0.8790907637614853, + "grad_norm": 0.13701075315475464, + "learning_rate": 0.0005, + "loss": 2.0874, + "step": 230960 + }, + { + "epoch": 0.8791288262296081, + "grad_norm": 0.1234058365225792, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 230970 + }, + { + "epoch": 0.8791668886977307, + "grad_norm": 0.13145093619823456, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 230980 + }, + { + "epoch": 0.8792049511658534, + "grad_norm": 0.12291737645864487, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 230990 + }, + { + "epoch": 0.8792430136339761, + "grad_norm": 0.13090135157108307, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 231000 + }, + { + "epoch": 0.8792810761020987, + "grad_norm": 0.1326054185628891, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 231010 + }, + { + "epoch": 0.8793191385702215, + "grad_norm": 0.1226671040058136, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 231020 + }, + { + "epoch": 0.8793572010383441, + "grad_norm": 0.1419045627117157, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 231030 + }, + { + "epoch": 0.8793952635064668, + "grad_norm": 0.1294616311788559, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 231040 + }, + { + "epoch": 0.8794333259745895, + "grad_norm": 0.12092513591051102, + "learning_rate": 0.0005, + "loss": 2.0836, + "step": 231050 + }, + { + "epoch": 0.8794713884427122, + "grad_norm": 0.12082849442958832, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 231060 + }, + { + "epoch": 0.8795094509108349, + "grad_norm": 0.14724799990653992, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 231070 + }, + { + "epoch": 0.8795475133789575, + "grad_norm": 0.1306437999010086, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 231080 + }, + { + "epoch": 0.8795855758470802, + "grad_norm": 0.13486473262310028, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 231090 + }, + { + "epoch": 0.879623638315203, + "grad_norm": 0.12858137488365173, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 231100 + }, + { + "epoch": 0.8796617007833256, + "grad_norm": 0.12715476751327515, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 231110 + }, + { + "epoch": 0.8796997632514483, + "grad_norm": 0.11756472289562225, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 231120 + }, + { + "epoch": 0.8797378257195709, + "grad_norm": 0.11899585276842117, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 231130 + }, + { + "epoch": 0.8797758881876936, + "grad_norm": 0.12455402314662933, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 231140 + }, + { + "epoch": 0.8798139506558164, + "grad_norm": 0.12440775334835052, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 231150 + }, + { + "epoch": 0.879852013123939, + "grad_norm": 0.13117524981498718, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 231160 + }, + { + "epoch": 0.8798900755920617, + "grad_norm": 0.11392761021852493, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 231170 + }, + { + "epoch": 0.8799281380601843, + "grad_norm": 0.15131865441799164, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 231180 + }, + { + "epoch": 0.8799662005283071, + "grad_norm": 0.1557011753320694, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 231190 + }, + { + "epoch": 0.8800042629964298, + "grad_norm": 0.1395118534564972, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 231200 + }, + { + "epoch": 0.8800423254645524, + "grad_norm": 0.13964758813381195, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 231210 + }, + { + "epoch": 0.8800803879326751, + "grad_norm": 0.13237391412258148, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 231220 + }, + { + "epoch": 0.8801184504007978, + "grad_norm": 0.12277648597955704, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 231230 + }, + { + "epoch": 0.8801565128689205, + "grad_norm": 0.12859274446964264, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 231240 + }, + { + "epoch": 0.8801945753370432, + "grad_norm": 0.13040199875831604, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 231250 + }, + { + "epoch": 0.8802326378051658, + "grad_norm": 0.12296934425830841, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 231260 + }, + { + "epoch": 0.8802707002732886, + "grad_norm": 0.12698037922382355, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 231270 + }, + { + "epoch": 0.8803087627414112, + "grad_norm": 0.11731021106243134, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 231280 + }, + { + "epoch": 0.8803468252095339, + "grad_norm": 0.13041014969348907, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 231290 + }, + { + "epoch": 0.8803848876776565, + "grad_norm": 0.11453888565301895, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 231300 + }, + { + "epoch": 0.8804229501457792, + "grad_norm": 0.14653709530830383, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 231310 + }, + { + "epoch": 0.880461012613902, + "grad_norm": 0.13944950699806213, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 231320 + }, + { + "epoch": 0.8804990750820246, + "grad_norm": 0.11421007663011551, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 231330 + }, + { + "epoch": 0.8805371375501473, + "grad_norm": 0.11948207765817642, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 231340 + }, + { + "epoch": 0.8805752000182699, + "grad_norm": 0.12842802703380585, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 231350 + }, + { + "epoch": 0.8806132624863927, + "grad_norm": 0.12044843286275864, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 231360 + }, + { + "epoch": 0.8806513249545154, + "grad_norm": 0.12985649704933167, + "learning_rate": 0.0005, + "loss": 2.122, + "step": 231370 + }, + { + "epoch": 0.880689387422638, + "grad_norm": 0.13624177873134613, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 231380 + }, + { + "epoch": 0.8807274498907607, + "grad_norm": 0.1136452928185463, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 231390 + }, + { + "epoch": 0.8807655123588835, + "grad_norm": 0.13304318487644196, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 231400 + }, + { + "epoch": 0.8808035748270061, + "grad_norm": 0.13884195685386658, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 231410 + }, + { + "epoch": 0.8808416372951288, + "grad_norm": 0.13247938454151154, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 231420 + }, + { + "epoch": 0.8808796997632514, + "grad_norm": 0.1284450888633728, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 231430 + }, + { + "epoch": 0.8809177622313741, + "grad_norm": 0.12137272953987122, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 231440 + }, + { + "epoch": 0.8809558246994968, + "grad_norm": 0.13511404395103455, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 231450 + }, + { + "epoch": 0.8809938871676195, + "grad_norm": 0.12272574752569199, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 231460 + }, + { + "epoch": 0.8810319496357422, + "grad_norm": 0.14283962547779083, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 231470 + }, + { + "epoch": 0.8810700121038648, + "grad_norm": 0.11777593195438385, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 231480 + }, + { + "epoch": 0.8811080745719876, + "grad_norm": 0.12992756068706512, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 231490 + }, + { + "epoch": 0.8811461370401102, + "grad_norm": 0.11626632511615753, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 231500 + }, + { + "epoch": 0.8811841995082329, + "grad_norm": 0.12404317408800125, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 231510 + }, + { + "epoch": 0.8812222619763556, + "grad_norm": 0.12410927563905716, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 231520 + }, + { + "epoch": 0.8812603244444783, + "grad_norm": 0.1409444361925125, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 231530 + }, + { + "epoch": 0.881298386912601, + "grad_norm": 0.12675026059150696, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 231540 + }, + { + "epoch": 0.8813364493807236, + "grad_norm": 0.13229568302631378, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 231550 + }, + { + "epoch": 0.8813745118488463, + "grad_norm": 0.12156939506530762, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 231560 + }, + { + "epoch": 0.881412574316969, + "grad_norm": 0.12610271573066711, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 231570 + }, + { + "epoch": 0.8814506367850917, + "grad_norm": 0.12162590771913528, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 231580 + }, + { + "epoch": 0.8814886992532144, + "grad_norm": 0.16064129769802094, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 231590 + }, + { + "epoch": 0.881526761721337, + "grad_norm": 0.13760524988174438, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 231600 + }, + { + "epoch": 0.8815648241894597, + "grad_norm": 0.13056804239749908, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 231610 + }, + { + "epoch": 0.8816028866575825, + "grad_norm": 0.1253260225057602, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 231620 + }, + { + "epoch": 0.8816409491257051, + "grad_norm": 0.12314766645431519, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 231630 + }, + { + "epoch": 0.8816790115938278, + "grad_norm": 0.12430446594953537, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 231640 + }, + { + "epoch": 0.8817170740619504, + "grad_norm": 0.12304916977882385, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 231650 + }, + { + "epoch": 0.8817551365300732, + "grad_norm": 0.12238744646310806, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 231660 + }, + { + "epoch": 0.8817931989981959, + "grad_norm": 0.12784691154956818, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 231670 + }, + { + "epoch": 0.8818312614663185, + "grad_norm": 0.12692983448505402, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 231680 + }, + { + "epoch": 0.8818693239344412, + "grad_norm": 0.11969311535358429, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 231690 + }, + { + "epoch": 0.8819073864025639, + "grad_norm": 0.24011379480361938, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 231700 + }, + { + "epoch": 0.8819454488706866, + "grad_norm": 0.11667758971452713, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 231710 + }, + { + "epoch": 0.8819835113388093, + "grad_norm": 0.12892790138721466, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 231720 + }, + { + "epoch": 0.8820215738069319, + "grad_norm": 0.12426768988370895, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 231730 + }, + { + "epoch": 0.8820596362750546, + "grad_norm": 0.13126088678836823, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 231740 + }, + { + "epoch": 0.8820976987431773, + "grad_norm": 0.13232356309890747, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 231750 + }, + { + "epoch": 0.8821357612113, + "grad_norm": 0.13249552249908447, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 231760 + }, + { + "epoch": 0.8821738236794227, + "grad_norm": 0.11961726099252701, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 231770 + }, + { + "epoch": 0.8822118861475453, + "grad_norm": 0.1333591789007187, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 231780 + }, + { + "epoch": 0.8822499486156681, + "grad_norm": 0.13824038207530975, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 231790 + }, + { + "epoch": 0.8822880110837907, + "grad_norm": 0.1226683259010315, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 231800 + }, + { + "epoch": 0.8823260735519134, + "grad_norm": 0.16337880492210388, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 231810 + }, + { + "epoch": 0.882364136020036, + "grad_norm": 0.1357499063014984, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 231820 + }, + { + "epoch": 0.8824021984881588, + "grad_norm": 0.12789271771907806, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 231830 + }, + { + "epoch": 0.8824402609562815, + "grad_norm": 0.12225945293903351, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 231840 + }, + { + "epoch": 0.8824783234244041, + "grad_norm": 0.11710258573293686, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 231850 + }, + { + "epoch": 0.8825163858925268, + "grad_norm": 0.11904674023389816, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 231860 + }, + { + "epoch": 0.8825544483606494, + "grad_norm": 0.12898290157318115, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 231870 + }, + { + "epoch": 0.8825925108287722, + "grad_norm": 0.1373780220746994, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 231880 + }, + { + "epoch": 0.8826305732968949, + "grad_norm": 0.12841899693012238, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 231890 + }, + { + "epoch": 0.8826686357650175, + "grad_norm": 0.12314864248037338, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 231900 + }, + { + "epoch": 0.8827066982331402, + "grad_norm": 0.13329792022705078, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 231910 + }, + { + "epoch": 0.882744760701263, + "grad_norm": 0.14434537291526794, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 231920 + }, + { + "epoch": 0.8827828231693856, + "grad_norm": 0.11754970997571945, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 231930 + }, + { + "epoch": 0.8828208856375083, + "grad_norm": 0.12650670111179352, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 231940 + }, + { + "epoch": 0.8828589481056309, + "grad_norm": 0.121844083070755, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 231950 + }, + { + "epoch": 0.8828970105737537, + "grad_norm": 0.11975842714309692, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 231960 + }, + { + "epoch": 0.8829350730418764, + "grad_norm": 0.14225952327251434, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 231970 + }, + { + "epoch": 0.882973135509999, + "grad_norm": 0.12241010367870331, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 231980 + }, + { + "epoch": 0.8830111979781217, + "grad_norm": 0.12553659081459045, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 231990 + }, + { + "epoch": 0.8830492604462443, + "grad_norm": 0.14470550417900085, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 232000 + }, + { + "epoch": 0.8830873229143671, + "grad_norm": 0.11787319928407669, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 232010 + }, + { + "epoch": 0.8831253853824897, + "grad_norm": 0.12110399454832077, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 232020 + }, + { + "epoch": 0.8831634478506124, + "grad_norm": 0.14320997893810272, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 232030 + }, + { + "epoch": 0.8832015103187351, + "grad_norm": 0.1214723140001297, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 232040 + }, + { + "epoch": 0.8832395727868578, + "grad_norm": 0.13799428939819336, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 232050 + }, + { + "epoch": 0.8832776352549805, + "grad_norm": 0.13196426630020142, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 232060 + }, + { + "epoch": 0.8833156977231031, + "grad_norm": 0.11245911568403244, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 232070 + }, + { + "epoch": 0.8833537601912258, + "grad_norm": 0.12483595311641693, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 232080 + }, + { + "epoch": 0.8833918226593486, + "grad_norm": 0.12437473982572556, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 232090 + }, + { + "epoch": 0.8834298851274712, + "grad_norm": 0.1243564561009407, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 232100 + }, + { + "epoch": 0.8834679475955939, + "grad_norm": 0.12160544097423553, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 232110 + }, + { + "epoch": 0.8835060100637165, + "grad_norm": 0.13572929799556732, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 232120 + }, + { + "epoch": 0.8835440725318393, + "grad_norm": 0.12826408445835114, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 232130 + }, + { + "epoch": 0.883582134999962, + "grad_norm": 0.12957674264907837, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 232140 + }, + { + "epoch": 0.8836201974680846, + "grad_norm": 0.12144125998020172, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 232150 + }, + { + "epoch": 0.8836582599362073, + "grad_norm": 0.12279727309942245, + "learning_rate": 0.0005, + "loss": 2.1198, + "step": 232160 + }, + { + "epoch": 0.8836963224043299, + "grad_norm": 0.13412638008594513, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 232170 + }, + { + "epoch": 0.8837343848724527, + "grad_norm": 0.12279703468084335, + "learning_rate": 0.0005, + "loss": 2.088, + "step": 232180 + }, + { + "epoch": 0.8837724473405754, + "grad_norm": 0.12408280372619629, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 232190 + }, + { + "epoch": 0.883810509808698, + "grad_norm": 0.1378985494375229, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 232200 + }, + { + "epoch": 0.8838485722768207, + "grad_norm": 0.12556681036949158, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 232210 + }, + { + "epoch": 0.8838866347449434, + "grad_norm": 4.76461935043335, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 232220 + }, + { + "epoch": 0.8839246972130661, + "grad_norm": 0.22819869220256805, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 232230 + }, + { + "epoch": 0.8839627596811888, + "grad_norm": 0.13506510853767395, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 232240 + }, + { + "epoch": 0.8840008221493114, + "grad_norm": 0.12306538969278336, + "learning_rate": 0.0005, + "loss": 2.1241, + "step": 232250 + }, + { + "epoch": 0.8840388846174342, + "grad_norm": 0.1268211454153061, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 232260 + }, + { + "epoch": 0.8840769470855568, + "grad_norm": 0.13606832921504974, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 232270 + }, + { + "epoch": 0.8841150095536795, + "grad_norm": 0.1291336715221405, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 232280 + }, + { + "epoch": 0.8841530720218022, + "grad_norm": 0.11495401710271835, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 232290 + }, + { + "epoch": 0.8841911344899248, + "grad_norm": 0.12901656329631805, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 232300 + }, + { + "epoch": 0.8842291969580476, + "grad_norm": 0.14306919276714325, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 232310 + }, + { + "epoch": 0.8842672594261702, + "grad_norm": 0.1347128301858902, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 232320 + }, + { + "epoch": 0.8843053218942929, + "grad_norm": 0.13565176725387573, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 232330 + }, + { + "epoch": 0.8843433843624156, + "grad_norm": 0.12219851464033127, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 232340 + }, + { + "epoch": 0.8843814468305383, + "grad_norm": 0.12421177327632904, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 232350 + }, + { + "epoch": 0.884419509298661, + "grad_norm": 0.1276005059480667, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 232360 + }, + { + "epoch": 0.8844575717667836, + "grad_norm": 0.12903755903244019, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 232370 + }, + { + "epoch": 0.8844956342349063, + "grad_norm": 0.12208840996026993, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 232380 + }, + { + "epoch": 0.8845336967030291, + "grad_norm": 0.1322821080684662, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 232390 + }, + { + "epoch": 0.8845717591711517, + "grad_norm": 0.14976076781749725, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 232400 + }, + { + "epoch": 0.8846098216392744, + "grad_norm": 0.12396979331970215, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 232410 + }, + { + "epoch": 0.884647884107397, + "grad_norm": 0.13356992602348328, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 232420 + }, + { + "epoch": 0.8846859465755197, + "grad_norm": 0.11742783337831497, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 232430 + }, + { + "epoch": 0.8847240090436425, + "grad_norm": 0.12508390843868256, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 232440 + }, + { + "epoch": 0.8847620715117651, + "grad_norm": 0.11658743023872375, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 232450 + }, + { + "epoch": 0.8848001339798878, + "grad_norm": 0.11587092280387878, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 232460 + }, + { + "epoch": 0.8848381964480104, + "grad_norm": 0.12888342142105103, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 232470 + }, + { + "epoch": 0.8848762589161332, + "grad_norm": 0.12616628408432007, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 232480 + }, + { + "epoch": 0.8849143213842559, + "grad_norm": 0.13656558096408844, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 232490 + }, + { + "epoch": 0.8849523838523785, + "grad_norm": 0.14839287102222443, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 232500 + }, + { + "epoch": 0.8849904463205012, + "grad_norm": 0.1278732717037201, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 232510 + }, + { + "epoch": 0.8850285087886239, + "grad_norm": 0.5294134020805359, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 232520 + }, + { + "epoch": 0.8850665712567466, + "grad_norm": 0.14765897393226624, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 232530 + }, + { + "epoch": 0.8851046337248692, + "grad_norm": 0.12571071088314056, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 232540 + }, + { + "epoch": 0.8851426961929919, + "grad_norm": 0.12959067523479462, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 232550 + }, + { + "epoch": 0.8851807586611147, + "grad_norm": 0.14037704467773438, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 232560 + }, + { + "epoch": 0.8852188211292373, + "grad_norm": 0.12762996554374695, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 232570 + }, + { + "epoch": 0.88525688359736, + "grad_norm": 0.12175919860601425, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 232580 + }, + { + "epoch": 0.8852949460654826, + "grad_norm": 0.1340930014848709, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 232590 + }, + { + "epoch": 0.8853330085336053, + "grad_norm": 0.12818540632724762, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 232600 + }, + { + "epoch": 0.8853710710017281, + "grad_norm": 0.11314734071493149, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 232610 + }, + { + "epoch": 0.8854091334698507, + "grad_norm": 0.1258586347103119, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 232620 + }, + { + "epoch": 0.8854471959379734, + "grad_norm": 0.131071999669075, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 232630 + }, + { + "epoch": 0.885485258406096, + "grad_norm": 0.1313721090555191, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 232640 + }, + { + "epoch": 0.8855233208742188, + "grad_norm": 0.11949295550584793, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 232650 + }, + { + "epoch": 0.8855613833423415, + "grad_norm": 0.13298510015010834, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 232660 + }, + { + "epoch": 0.8855994458104641, + "grad_norm": 0.12202002853155136, + "learning_rate": 0.0005, + "loss": 2.0873, + "step": 232670 + }, + { + "epoch": 0.8856375082785868, + "grad_norm": 0.11990275233983994, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 232680 + }, + { + "epoch": 0.8856755707467096, + "grad_norm": 0.11966723203659058, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 232690 + }, + { + "epoch": 0.8857136332148322, + "grad_norm": 0.12311986833810806, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 232700 + }, + { + "epoch": 0.8857516956829549, + "grad_norm": 0.1322653889656067, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 232710 + }, + { + "epoch": 0.8857897581510775, + "grad_norm": 0.1241132989525795, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 232720 + }, + { + "epoch": 0.8858278206192002, + "grad_norm": 0.112840935587883, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 232730 + }, + { + "epoch": 0.885865883087323, + "grad_norm": 0.14808683097362518, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 232740 + }, + { + "epoch": 0.8859039455554456, + "grad_norm": 0.1410990208387375, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 232750 + }, + { + "epoch": 0.8859420080235683, + "grad_norm": 0.12336340546607971, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 232760 + }, + { + "epoch": 0.8859800704916909, + "grad_norm": 0.1287340223789215, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 232770 + }, + { + "epoch": 0.8860181329598137, + "grad_norm": 0.14114344120025635, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 232780 + }, + { + "epoch": 0.8860561954279363, + "grad_norm": 0.13871510326862335, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 232790 + }, + { + "epoch": 0.886094257896059, + "grad_norm": 0.11456874012947083, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 232800 + }, + { + "epoch": 0.8861323203641817, + "grad_norm": 0.13572253286838531, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 232810 + }, + { + "epoch": 0.8861703828323044, + "grad_norm": 0.12444180250167847, + "learning_rate": 0.0005, + "loss": 2.0842, + "step": 232820 + }, + { + "epoch": 0.8862084453004271, + "grad_norm": 0.134059876203537, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 232830 + }, + { + "epoch": 0.8862465077685497, + "grad_norm": 0.13002780079841614, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 232840 + }, + { + "epoch": 0.8862845702366724, + "grad_norm": 0.1284944862127304, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 232850 + }, + { + "epoch": 0.886322632704795, + "grad_norm": 0.3809811770915985, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 232860 + }, + { + "epoch": 0.8863606951729178, + "grad_norm": 0.13519035279750824, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 232870 + }, + { + "epoch": 0.8863987576410405, + "grad_norm": 0.12790334224700928, + "learning_rate": 0.0005, + "loss": 2.0823, + "step": 232880 + }, + { + "epoch": 0.8864368201091631, + "grad_norm": 0.12451233714818954, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 232890 + }, + { + "epoch": 0.8864748825772858, + "grad_norm": 0.12516069412231445, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 232900 + }, + { + "epoch": 0.8865129450454086, + "grad_norm": 0.15014207363128662, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 232910 + }, + { + "epoch": 0.8865510075135312, + "grad_norm": 0.126219242811203, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 232920 + }, + { + "epoch": 0.8865890699816539, + "grad_norm": 0.1344812512397766, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 232930 + }, + { + "epoch": 0.8866271324497765, + "grad_norm": 0.12141722440719604, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 232940 + }, + { + "epoch": 0.8866651949178993, + "grad_norm": 0.1323108971118927, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 232950 + }, + { + "epoch": 0.886703257386022, + "grad_norm": 0.12559595704078674, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 232960 + }, + { + "epoch": 0.8867413198541446, + "grad_norm": 0.11912598460912704, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 232970 + }, + { + "epoch": 0.8867793823222673, + "grad_norm": 0.12086867541074753, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 232980 + }, + { + "epoch": 0.88681744479039, + "grad_norm": 0.13172176480293274, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 232990 + }, + { + "epoch": 0.8868555072585127, + "grad_norm": 0.12233125418424606, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 233000 + }, + { + "epoch": 0.8868935697266354, + "grad_norm": 0.11781726777553558, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 233010 + }, + { + "epoch": 0.886931632194758, + "grad_norm": 0.126663938164711, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 233020 + }, + { + "epoch": 0.8869696946628807, + "grad_norm": 0.11784598231315613, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 233030 + }, + { + "epoch": 0.8870077571310034, + "grad_norm": 0.12991993129253387, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 233040 + }, + { + "epoch": 0.8870458195991261, + "grad_norm": 0.11740230768918991, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 233050 + }, + { + "epoch": 0.8870838820672488, + "grad_norm": 0.1238425225019455, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 233060 + }, + { + "epoch": 0.8871219445353714, + "grad_norm": 0.13130146265029907, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 233070 + }, + { + "epoch": 0.8871600070034942, + "grad_norm": 0.12228170037269592, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 233080 + }, + { + "epoch": 0.8871980694716168, + "grad_norm": 0.13704465329647064, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 233090 + }, + { + "epoch": 0.8872361319397395, + "grad_norm": 0.15340986847877502, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 233100 + }, + { + "epoch": 0.8872741944078621, + "grad_norm": 0.12370522320270538, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 233110 + }, + { + "epoch": 0.8873122568759849, + "grad_norm": 0.11676563322544098, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 233120 + }, + { + "epoch": 0.8873503193441076, + "grad_norm": 0.12067238241434097, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 233130 + }, + { + "epoch": 0.8873883818122302, + "grad_norm": 0.1201990470290184, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 233140 + }, + { + "epoch": 0.8874264442803529, + "grad_norm": 0.11820308119058609, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 233150 + }, + { + "epoch": 0.8874645067484755, + "grad_norm": 0.13470560312271118, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 233160 + }, + { + "epoch": 0.8875025692165983, + "grad_norm": 0.13587632775306702, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 233170 + }, + { + "epoch": 0.887540631684721, + "grad_norm": 0.13131999969482422, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 233180 + }, + { + "epoch": 0.8875786941528436, + "grad_norm": 0.1288503259420395, + "learning_rate": 0.0005, + "loss": 2.0842, + "step": 233190 + }, + { + "epoch": 0.8876167566209663, + "grad_norm": 0.1321592479944229, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 233200 + }, + { + "epoch": 0.887654819089089, + "grad_norm": 0.8052067160606384, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 233210 + }, + { + "epoch": 0.8876928815572117, + "grad_norm": 0.13146033883094788, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 233220 + }, + { + "epoch": 0.8877309440253344, + "grad_norm": 0.11876517534255981, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 233230 + }, + { + "epoch": 0.887769006493457, + "grad_norm": 0.12811189889907837, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 233240 + }, + { + "epoch": 0.8878070689615798, + "grad_norm": 0.13559778034687042, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 233250 + }, + { + "epoch": 0.8878451314297024, + "grad_norm": 0.12814271450042725, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 233260 + }, + { + "epoch": 0.8878831938978251, + "grad_norm": 0.1480410397052765, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 233270 + }, + { + "epoch": 0.8879212563659478, + "grad_norm": 0.12418772280216217, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 233280 + }, + { + "epoch": 0.8879593188340704, + "grad_norm": 0.1289658099412918, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 233290 + }, + { + "epoch": 0.8879973813021932, + "grad_norm": 0.12939070165157318, + "learning_rate": 0.0005, + "loss": 2.1247, + "step": 233300 + }, + { + "epoch": 0.8880354437703158, + "grad_norm": 0.11741060763597488, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 233310 + }, + { + "epoch": 0.8880735062384385, + "grad_norm": 0.12273950129747391, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 233320 + }, + { + "epoch": 0.8881115687065612, + "grad_norm": 0.12378660589456558, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 233330 + }, + { + "epoch": 0.8881496311746839, + "grad_norm": 0.1443643718957901, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 233340 + }, + { + "epoch": 0.8881876936428066, + "grad_norm": 0.1268627941608429, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 233350 + }, + { + "epoch": 0.8882257561109292, + "grad_norm": 0.1410587728023529, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 233360 + }, + { + "epoch": 0.8882638185790519, + "grad_norm": 0.14428554475307465, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 233370 + }, + { + "epoch": 0.8883018810471747, + "grad_norm": 0.1303177773952484, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 233380 + }, + { + "epoch": 0.8883399435152973, + "grad_norm": 0.1261586993932724, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 233390 + }, + { + "epoch": 0.88837800598342, + "grad_norm": 0.13833171129226685, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 233400 + }, + { + "epoch": 0.8884160684515426, + "grad_norm": 0.12124264240264893, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 233410 + }, + { + "epoch": 0.8884541309196654, + "grad_norm": 0.11245989799499512, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 233420 + }, + { + "epoch": 0.8884921933877881, + "grad_norm": 0.11601608991622925, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 233430 + }, + { + "epoch": 0.8885302558559107, + "grad_norm": 0.1283676028251648, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 233440 + }, + { + "epoch": 0.8885683183240334, + "grad_norm": 0.1229342520236969, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 233450 + }, + { + "epoch": 0.888606380792156, + "grad_norm": 0.1284010112285614, + "learning_rate": 0.0005, + "loss": 2.1244, + "step": 233460 + }, + { + "epoch": 0.8886444432602788, + "grad_norm": 0.12559211254119873, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 233470 + }, + { + "epoch": 0.8886825057284015, + "grad_norm": 0.13134633004665375, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 233480 + }, + { + "epoch": 0.8887205681965241, + "grad_norm": 0.1325225532054901, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 233490 + }, + { + "epoch": 0.8887586306646468, + "grad_norm": 0.1136227697134018, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 233500 + }, + { + "epoch": 0.8887966931327695, + "grad_norm": 0.12274109572172165, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 233510 + }, + { + "epoch": 0.8888347556008922, + "grad_norm": 0.14137963950634003, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 233520 + }, + { + "epoch": 0.8888728180690149, + "grad_norm": 0.13006393611431122, + "learning_rate": 0.0005, + "loss": 2.0874, + "step": 233530 + }, + { + "epoch": 0.8889108805371375, + "grad_norm": 0.13273341953754425, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 233540 + }, + { + "epoch": 0.8889489430052603, + "grad_norm": 0.12141282111406326, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 233550 + }, + { + "epoch": 0.8889870054733829, + "grad_norm": 0.13110850751399994, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 233560 + }, + { + "epoch": 0.8890250679415056, + "grad_norm": 0.1146354153752327, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 233570 + }, + { + "epoch": 0.8890631304096283, + "grad_norm": 0.13466612994670868, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 233580 + }, + { + "epoch": 0.8891011928777509, + "grad_norm": 0.1334793120622635, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 233590 + }, + { + "epoch": 0.8891392553458737, + "grad_norm": 0.14149747788906097, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 233600 + }, + { + "epoch": 0.8891773178139963, + "grad_norm": 0.13406780362129211, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 233610 + }, + { + "epoch": 0.889215380282119, + "grad_norm": 0.12005721032619476, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 233620 + }, + { + "epoch": 0.8892534427502417, + "grad_norm": 0.1475459784269333, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 233630 + }, + { + "epoch": 0.8892915052183644, + "grad_norm": 0.12235203385353088, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 233640 + }, + { + "epoch": 0.8893295676864871, + "grad_norm": 0.12408211827278137, + "learning_rate": 0.0005, + "loss": 2.0776, + "step": 233650 + }, + { + "epoch": 0.8893676301546097, + "grad_norm": 0.12142856419086456, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 233660 + }, + { + "epoch": 0.8894056926227324, + "grad_norm": 0.13211260735988617, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 233670 + }, + { + "epoch": 0.8894437550908552, + "grad_norm": 0.13942740857601166, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 233680 + }, + { + "epoch": 0.8894818175589778, + "grad_norm": 0.12578365206718445, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 233690 + }, + { + "epoch": 0.8895198800271005, + "grad_norm": 0.11925357580184937, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 233700 + }, + { + "epoch": 0.8895579424952231, + "grad_norm": 0.13358432054519653, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 233710 + }, + { + "epoch": 0.8895960049633458, + "grad_norm": 0.13842090964317322, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 233720 + }, + { + "epoch": 0.8896340674314686, + "grad_norm": 0.13777171075344086, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 233730 + }, + { + "epoch": 0.8896721298995912, + "grad_norm": 0.12120426446199417, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 233740 + }, + { + "epoch": 0.8897101923677139, + "grad_norm": 0.12940014898777008, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 233750 + }, + { + "epoch": 0.8897482548358365, + "grad_norm": 0.11755038052797318, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 233760 + }, + { + "epoch": 0.8897863173039593, + "grad_norm": 0.11646275222301483, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 233770 + }, + { + "epoch": 0.889824379772082, + "grad_norm": 0.13029074668884277, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 233780 + }, + { + "epoch": 0.8898624422402046, + "grad_norm": 0.12717968225479126, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 233790 + }, + { + "epoch": 0.8899005047083273, + "grad_norm": 0.11478909105062485, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 233800 + }, + { + "epoch": 0.88993856717645, + "grad_norm": 0.11571209132671356, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 233810 + }, + { + "epoch": 0.8899766296445727, + "grad_norm": 0.13696546852588654, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 233820 + }, + { + "epoch": 0.8900146921126953, + "grad_norm": 0.11886905133724213, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 233830 + }, + { + "epoch": 0.890052754580818, + "grad_norm": 0.1315298080444336, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 233840 + }, + { + "epoch": 0.8900908170489408, + "grad_norm": 0.11898674815893173, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 233850 + }, + { + "epoch": 0.8901288795170634, + "grad_norm": 0.124471515417099, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 233860 + }, + { + "epoch": 0.8901669419851861, + "grad_norm": 0.13916394114494324, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 233870 + }, + { + "epoch": 0.8902050044533087, + "grad_norm": 0.13877750933170319, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 233880 + }, + { + "epoch": 0.8902430669214314, + "grad_norm": 0.12235158681869507, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 233890 + }, + { + "epoch": 0.8902811293895542, + "grad_norm": 0.12139211595058441, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 233900 + }, + { + "epoch": 0.8903191918576768, + "grad_norm": 0.12321995943784714, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 233910 + }, + { + "epoch": 0.8903572543257995, + "grad_norm": 0.1337067186832428, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 233920 + }, + { + "epoch": 0.8903953167939221, + "grad_norm": 0.17027568817138672, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 233930 + }, + { + "epoch": 0.8904333792620449, + "grad_norm": 0.14118517935276031, + "learning_rate": 0.0005, + "loss": 2.1197, + "step": 233940 + }, + { + "epoch": 0.8904714417301676, + "grad_norm": 0.1275780349969864, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 233950 + }, + { + "epoch": 0.8905095041982902, + "grad_norm": 0.16922014951705933, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 233960 + }, + { + "epoch": 0.8905475666664129, + "grad_norm": 0.13354507088661194, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 233970 + }, + { + "epoch": 0.8905856291345357, + "grad_norm": 0.1327764391899109, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 233980 + }, + { + "epoch": 0.8906236916026583, + "grad_norm": 0.12903861701488495, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 233990 + }, + { + "epoch": 0.890661754070781, + "grad_norm": 0.13357609510421753, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 234000 + }, + { + "epoch": 0.8906998165389036, + "grad_norm": 0.15990236401557922, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 234010 + }, + { + "epoch": 0.8907378790070263, + "grad_norm": 0.13477084040641785, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 234020 + }, + { + "epoch": 0.890775941475149, + "grad_norm": 0.12353883683681488, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 234030 + }, + { + "epoch": 0.8908140039432717, + "grad_norm": 0.12096250802278519, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 234040 + }, + { + "epoch": 0.8908520664113944, + "grad_norm": 0.12207420915365219, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 234050 + }, + { + "epoch": 0.890890128879517, + "grad_norm": 0.13806971907615662, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 234060 + }, + { + "epoch": 0.8909281913476398, + "grad_norm": 0.12487991154193878, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 234070 + }, + { + "epoch": 0.8909662538157624, + "grad_norm": 0.12291319668292999, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 234080 + }, + { + "epoch": 0.8910043162838851, + "grad_norm": 0.12147248536348343, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 234090 + }, + { + "epoch": 0.8910423787520078, + "grad_norm": 0.120960533618927, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 234100 + }, + { + "epoch": 0.8910804412201305, + "grad_norm": 0.11734280735254288, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 234110 + }, + { + "epoch": 0.8911185036882532, + "grad_norm": 0.11988954246044159, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 234120 + }, + { + "epoch": 0.8911565661563758, + "grad_norm": 0.11701469123363495, + "learning_rate": 0.0005, + "loss": 2.1209, + "step": 234130 + }, + { + "epoch": 0.8911946286244985, + "grad_norm": 0.12386269122362137, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 234140 + }, + { + "epoch": 0.8912326910926213, + "grad_norm": 0.14417067170143127, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 234150 + }, + { + "epoch": 0.8912707535607439, + "grad_norm": 0.14032790064811707, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 234160 + }, + { + "epoch": 0.8913088160288666, + "grad_norm": 0.1238948404788971, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 234170 + }, + { + "epoch": 0.8913468784969892, + "grad_norm": 0.13059444725513458, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 234180 + }, + { + "epoch": 0.8913849409651119, + "grad_norm": 0.1279069483280182, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 234190 + }, + { + "epoch": 0.8914230034332347, + "grad_norm": 0.12548679113388062, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 234200 + }, + { + "epoch": 0.8914610659013573, + "grad_norm": 0.12830105423927307, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 234210 + }, + { + "epoch": 0.89149912836948, + "grad_norm": 0.12883508205413818, + "learning_rate": 0.0005, + "loss": 2.1227, + "step": 234220 + }, + { + "epoch": 0.8915371908376026, + "grad_norm": 0.12640762329101562, + "learning_rate": 0.0005, + "loss": 2.0874, + "step": 234230 + }, + { + "epoch": 0.8915752533057254, + "grad_norm": 0.12490357458591461, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 234240 + }, + { + "epoch": 0.8916133157738481, + "grad_norm": 0.11947716027498245, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 234250 + }, + { + "epoch": 0.8916513782419707, + "grad_norm": 0.11541569977998734, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 234260 + }, + { + "epoch": 0.8916894407100934, + "grad_norm": 0.1117173358798027, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 234270 + }, + { + "epoch": 0.8917275031782161, + "grad_norm": 0.13849619030952454, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 234280 + }, + { + "epoch": 0.8917655656463388, + "grad_norm": 0.12830138206481934, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 234290 + }, + { + "epoch": 0.8918036281144615, + "grad_norm": 0.13390886783599854, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 234300 + }, + { + "epoch": 0.8918416905825841, + "grad_norm": 0.11898814886808395, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 234310 + }, + { + "epoch": 0.8918797530507068, + "grad_norm": 0.12488134950399399, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 234320 + }, + { + "epoch": 0.8919178155188295, + "grad_norm": 0.1258251667022705, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 234330 + }, + { + "epoch": 0.8919558779869522, + "grad_norm": 0.1283978968858719, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 234340 + }, + { + "epoch": 0.8919939404550749, + "grad_norm": 0.11223854869604111, + "learning_rate": 0.0005, + "loss": 2.1218, + "step": 234350 + }, + { + "epoch": 0.8920320029231975, + "grad_norm": 0.13979001343250275, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 234360 + }, + { + "epoch": 0.8920700653913203, + "grad_norm": 0.1531330645084381, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 234370 + }, + { + "epoch": 0.8921081278594429, + "grad_norm": 0.12277109175920486, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 234380 + }, + { + "epoch": 0.8921461903275656, + "grad_norm": 0.12363363057374954, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 234390 + }, + { + "epoch": 0.8921842527956882, + "grad_norm": 0.14143170416355133, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 234400 + }, + { + "epoch": 0.892222315263811, + "grad_norm": 0.12829793989658356, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 234410 + }, + { + "epoch": 0.8922603777319337, + "grad_norm": 0.13423499464988708, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 234420 + }, + { + "epoch": 0.8922984402000563, + "grad_norm": 0.12552112340927124, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 234430 + }, + { + "epoch": 0.892336502668179, + "grad_norm": 0.14071422815322876, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 234440 + }, + { + "epoch": 0.8923745651363016, + "grad_norm": 0.1370924413204193, + "learning_rate": 0.0005, + "loss": 2.0879, + "step": 234450 + }, + { + "epoch": 0.8924126276044244, + "grad_norm": 0.12983131408691406, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 234460 + }, + { + "epoch": 0.8924506900725471, + "grad_norm": 0.1345546841621399, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 234470 + }, + { + "epoch": 0.8924887525406697, + "grad_norm": 0.1273956000804901, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 234480 + }, + { + "epoch": 0.8925268150087924, + "grad_norm": 0.14083106815814972, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 234490 + }, + { + "epoch": 0.8925648774769152, + "grad_norm": 0.1267554610967636, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 234500 + }, + { + "epoch": 0.8926029399450378, + "grad_norm": 0.1280146986246109, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 234510 + }, + { + "epoch": 0.8926410024131605, + "grad_norm": 0.1163170114159584, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 234520 + }, + { + "epoch": 0.8926790648812831, + "grad_norm": 0.14170005917549133, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 234530 + }, + { + "epoch": 0.8927171273494059, + "grad_norm": 0.12691031396389008, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 234540 + }, + { + "epoch": 0.8927551898175285, + "grad_norm": 0.1331014186143875, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 234550 + }, + { + "epoch": 0.8927932522856512, + "grad_norm": 0.1329735368490219, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 234560 + }, + { + "epoch": 0.8928313147537739, + "grad_norm": 0.11880216747522354, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 234570 + }, + { + "epoch": 0.8928693772218966, + "grad_norm": 0.1179804652929306, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 234580 + }, + { + "epoch": 0.8929074396900193, + "grad_norm": 0.12276770919561386, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 234590 + }, + { + "epoch": 0.892945502158142, + "grad_norm": 0.13497260212898254, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 234600 + }, + { + "epoch": 0.8929835646262646, + "grad_norm": 0.12063473463058472, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 234610 + }, + { + "epoch": 0.8930216270943873, + "grad_norm": 0.13163186609745026, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 234620 + }, + { + "epoch": 0.89305968956251, + "grad_norm": 0.14808395504951477, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 234630 + }, + { + "epoch": 0.8930977520306327, + "grad_norm": 0.11621870845556259, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 234640 + }, + { + "epoch": 0.8931358144987553, + "grad_norm": 0.1356925368309021, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 234650 + }, + { + "epoch": 0.893173876966878, + "grad_norm": 0.12438897788524628, + "learning_rate": 0.0005, + "loss": 2.1234, + "step": 234660 + }, + { + "epoch": 0.8932119394350008, + "grad_norm": 0.12834861874580383, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 234670 + }, + { + "epoch": 0.8932500019031234, + "grad_norm": 0.12933696806430817, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 234680 + }, + { + "epoch": 0.8932880643712461, + "grad_norm": 0.11973818391561508, + "learning_rate": 0.0005, + "loss": 2.0867, + "step": 234690 + }, + { + "epoch": 0.8933261268393687, + "grad_norm": 0.12239322066307068, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 234700 + }, + { + "epoch": 0.8933641893074915, + "grad_norm": 0.12869210541248322, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 234710 + }, + { + "epoch": 0.8934022517756142, + "grad_norm": 0.12452254444360733, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 234720 + }, + { + "epoch": 0.8934403142437368, + "grad_norm": 0.1302383542060852, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 234730 + }, + { + "epoch": 0.8934783767118595, + "grad_norm": 0.15368464589118958, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 234740 + }, + { + "epoch": 0.8935164391799821, + "grad_norm": 0.17728041112422943, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 234750 + }, + { + "epoch": 0.8935545016481049, + "grad_norm": 0.13167928159236908, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 234760 + }, + { + "epoch": 0.8935925641162276, + "grad_norm": 0.13544803857803345, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 234770 + }, + { + "epoch": 0.8936306265843502, + "grad_norm": 0.1211390420794487, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 234780 + }, + { + "epoch": 0.8936686890524729, + "grad_norm": 0.13180460035800934, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 234790 + }, + { + "epoch": 0.8937067515205956, + "grad_norm": 0.12960641086101532, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 234800 + }, + { + "epoch": 0.8937448139887183, + "grad_norm": 0.12918585538864136, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 234810 + }, + { + "epoch": 0.893782876456841, + "grad_norm": 0.18785136938095093, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 234820 + }, + { + "epoch": 0.8938209389249636, + "grad_norm": 0.11553603410720825, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 234830 + }, + { + "epoch": 0.8938590013930864, + "grad_norm": 0.12703922390937805, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 234840 + }, + { + "epoch": 0.893897063861209, + "grad_norm": 0.13607758283615112, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 234850 + }, + { + "epoch": 0.8939351263293317, + "grad_norm": 0.12318916618824005, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 234860 + }, + { + "epoch": 0.8939731887974544, + "grad_norm": 0.13668863475322723, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 234870 + }, + { + "epoch": 0.894011251265577, + "grad_norm": 0.1230866089463234, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 234880 + }, + { + "epoch": 0.8940493137336998, + "grad_norm": 0.1301426887512207, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 234890 + }, + { + "epoch": 0.8940873762018224, + "grad_norm": 0.2735763490200043, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 234900 + }, + { + "epoch": 0.8941254386699451, + "grad_norm": 0.12629179656505585, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 234910 + }, + { + "epoch": 0.8941635011380678, + "grad_norm": 0.1262362003326416, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 234920 + }, + { + "epoch": 0.8942015636061905, + "grad_norm": 0.1381853222846985, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 234930 + }, + { + "epoch": 0.8942396260743132, + "grad_norm": 0.12422885000705719, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 234940 + }, + { + "epoch": 0.8942776885424358, + "grad_norm": 0.15908923745155334, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 234950 + }, + { + "epoch": 0.8943157510105585, + "grad_norm": 0.12250334769487381, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 234960 + }, + { + "epoch": 0.8943538134786813, + "grad_norm": 0.1374390572309494, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 234970 + }, + { + "epoch": 0.8943918759468039, + "grad_norm": 0.1280045360326767, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 234980 + }, + { + "epoch": 0.8944299384149266, + "grad_norm": 0.13371485471725464, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 234990 + }, + { + "epoch": 0.8944680008830492, + "grad_norm": 0.14468924701213837, + "learning_rate": 0.0005, + "loss": 2.085, + "step": 235000 + }, + { + "epoch": 0.894506063351172, + "grad_norm": 0.12231716513633728, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 235010 + }, + { + "epoch": 0.8945441258192947, + "grad_norm": 0.12150655686855316, + "learning_rate": 0.0005, + "loss": 2.0854, + "step": 235020 + }, + { + "epoch": 0.8945821882874173, + "grad_norm": 0.13228917121887207, + "learning_rate": 0.0005, + "loss": 2.1248, + "step": 235030 + }, + { + "epoch": 0.89462025075554, + "grad_norm": 0.12393835932016373, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 235040 + }, + { + "epoch": 0.8946583132236626, + "grad_norm": 0.13002344965934753, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 235050 + }, + { + "epoch": 0.8946963756917854, + "grad_norm": 0.12541408836841583, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 235060 + }, + { + "epoch": 0.894734438159908, + "grad_norm": 0.12563663721084595, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 235070 + }, + { + "epoch": 0.8947725006280307, + "grad_norm": 0.12658190727233887, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 235080 + }, + { + "epoch": 0.8948105630961534, + "grad_norm": 0.13006722927093506, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 235090 + }, + { + "epoch": 0.8948486255642761, + "grad_norm": 0.12455962598323822, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 235100 + }, + { + "epoch": 0.8948866880323988, + "grad_norm": 0.13397879898548126, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 235110 + }, + { + "epoch": 0.8949247505005214, + "grad_norm": 0.13239069283008575, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 235120 + }, + { + "epoch": 0.8949628129686441, + "grad_norm": 0.13646048307418823, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 235130 + }, + { + "epoch": 0.8950008754367669, + "grad_norm": 0.11922208964824677, + "learning_rate": 0.0005, + "loss": 2.0844, + "step": 235140 + }, + { + "epoch": 0.8950389379048895, + "grad_norm": 0.13137061893939972, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 235150 + }, + { + "epoch": 0.8950770003730122, + "grad_norm": 0.13950665295124054, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 235160 + }, + { + "epoch": 0.8951150628411348, + "grad_norm": 0.1411396861076355, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 235170 + }, + { + "epoch": 0.8951531253092575, + "grad_norm": 0.13766488432884216, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 235180 + }, + { + "epoch": 0.8951911877773803, + "grad_norm": 0.13621293008327484, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 235190 + }, + { + "epoch": 0.8952292502455029, + "grad_norm": 0.13348594307899475, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 235200 + }, + { + "epoch": 0.8952673127136256, + "grad_norm": 0.2628757357597351, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 235210 + }, + { + "epoch": 0.8953053751817482, + "grad_norm": 0.1402920037508011, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 235220 + }, + { + "epoch": 0.895343437649871, + "grad_norm": 0.11941851675510406, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 235230 + }, + { + "epoch": 0.8953815001179937, + "grad_norm": 0.1464409977197647, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 235240 + }, + { + "epoch": 0.8954195625861163, + "grad_norm": 0.11677876114845276, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 235250 + }, + { + "epoch": 0.895457625054239, + "grad_norm": 0.11923077702522278, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 235260 + }, + { + "epoch": 0.8954956875223617, + "grad_norm": 0.13116028904914856, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 235270 + }, + { + "epoch": 0.8955337499904844, + "grad_norm": 0.12417915463447571, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 235280 + }, + { + "epoch": 0.8955718124586071, + "grad_norm": 0.12923891842365265, + "learning_rate": 0.0005, + "loss": 2.1196, + "step": 235290 + }, + { + "epoch": 0.8956098749267297, + "grad_norm": 0.13769130408763885, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 235300 + }, + { + "epoch": 0.8956479373948524, + "grad_norm": 0.13752809166908264, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 235310 + }, + { + "epoch": 0.8956859998629751, + "grad_norm": 0.12434390932321548, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 235320 + }, + { + "epoch": 0.8957240623310978, + "grad_norm": 0.1192469522356987, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 235330 + }, + { + "epoch": 0.8957621247992205, + "grad_norm": 0.13500575721263885, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 235340 + }, + { + "epoch": 0.8958001872673431, + "grad_norm": 0.11191660910844803, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 235350 + }, + { + "epoch": 0.8958382497354659, + "grad_norm": 0.1157965213060379, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 235360 + }, + { + "epoch": 0.8958763122035885, + "grad_norm": 0.14252077043056488, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 235370 + }, + { + "epoch": 0.8959143746717112, + "grad_norm": 0.1264662891626358, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 235380 + }, + { + "epoch": 0.8959524371398339, + "grad_norm": 0.13862596452236176, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 235390 + }, + { + "epoch": 0.8959904996079566, + "grad_norm": 0.14109665155410767, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 235400 + }, + { + "epoch": 0.8960285620760793, + "grad_norm": 0.1267188936471939, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 235410 + }, + { + "epoch": 0.8960666245442019, + "grad_norm": 0.121976338326931, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 235420 + }, + { + "epoch": 0.8961046870123246, + "grad_norm": 0.12344034016132355, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 235430 + }, + { + "epoch": 0.8961427494804474, + "grad_norm": 0.12784519791603088, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 235440 + }, + { + "epoch": 0.89618081194857, + "grad_norm": 0.1280445009469986, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 235450 + }, + { + "epoch": 0.8962188744166927, + "grad_norm": 0.11634043604135513, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 235460 + }, + { + "epoch": 0.8962569368848153, + "grad_norm": 0.12300094217061996, + "learning_rate": 0.0005, + "loss": 2.0836, + "step": 235470 + }, + { + "epoch": 0.896294999352938, + "grad_norm": 0.1236441358923912, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 235480 + }, + { + "epoch": 0.8963330618210608, + "grad_norm": 0.14748676121234894, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 235490 + }, + { + "epoch": 0.8963711242891834, + "grad_norm": 0.11680328845977783, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 235500 + }, + { + "epoch": 0.8964091867573061, + "grad_norm": 0.12132323533296585, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 235510 + }, + { + "epoch": 0.8964472492254287, + "grad_norm": 0.1263308972120285, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 235520 + }, + { + "epoch": 0.8964853116935515, + "grad_norm": 0.13237321376800537, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 235530 + }, + { + "epoch": 0.8965233741616742, + "grad_norm": 0.1214015781879425, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 235540 + }, + { + "epoch": 0.8965614366297968, + "grad_norm": 0.11785169690847397, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 235550 + }, + { + "epoch": 0.8965994990979195, + "grad_norm": 0.13144147396087646, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 235560 + }, + { + "epoch": 0.8966375615660422, + "grad_norm": 0.12580817937850952, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 235570 + }, + { + "epoch": 0.8966756240341649, + "grad_norm": 0.12729611992835999, + "learning_rate": 0.0005, + "loss": 2.0839, + "step": 235580 + }, + { + "epoch": 0.8967136865022876, + "grad_norm": 0.13856258988380432, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 235590 + }, + { + "epoch": 0.8967517489704102, + "grad_norm": 0.13701985776424408, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 235600 + }, + { + "epoch": 0.8967898114385329, + "grad_norm": 0.12763841450214386, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 235610 + }, + { + "epoch": 0.8968278739066556, + "grad_norm": 0.12884309887886047, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 235620 + }, + { + "epoch": 0.8968659363747783, + "grad_norm": 0.10915088653564453, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 235630 + }, + { + "epoch": 0.896903998842901, + "grad_norm": 0.1206916868686676, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 235640 + }, + { + "epoch": 0.8969420613110236, + "grad_norm": 0.13930000364780426, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 235650 + }, + { + "epoch": 0.8969801237791464, + "grad_norm": 0.13301430642604828, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 235660 + }, + { + "epoch": 0.897018186247269, + "grad_norm": 0.12809635698795319, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 235670 + }, + { + "epoch": 0.8970562487153917, + "grad_norm": 0.13747407495975494, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 235680 + }, + { + "epoch": 0.8970943111835143, + "grad_norm": 0.12985946238040924, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 235690 + }, + { + "epoch": 0.8971323736516371, + "grad_norm": 0.12377568334341049, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 235700 + }, + { + "epoch": 0.8971704361197598, + "grad_norm": 0.14152388274669647, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 235710 + }, + { + "epoch": 0.8972084985878824, + "grad_norm": 0.11914193630218506, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 235720 + }, + { + "epoch": 0.8972465610560051, + "grad_norm": 0.12625829875469208, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 235730 + }, + { + "epoch": 0.8972846235241277, + "grad_norm": 0.11790535598993301, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 235740 + }, + { + "epoch": 0.8973226859922505, + "grad_norm": 0.11743942648172379, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 235750 + }, + { + "epoch": 0.8973607484603732, + "grad_norm": 0.1302875429391861, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 235760 + }, + { + "epoch": 0.8973988109284958, + "grad_norm": 0.1282423734664917, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 235770 + }, + { + "epoch": 0.8974368733966185, + "grad_norm": 0.13597077131271362, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 235780 + }, + { + "epoch": 0.8974749358647413, + "grad_norm": 0.13141882419586182, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 235790 + }, + { + "epoch": 0.8975129983328639, + "grad_norm": 0.12061822414398193, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 235800 + }, + { + "epoch": 0.8975510608009866, + "grad_norm": 0.138809934258461, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 235810 + }, + { + "epoch": 0.8975891232691092, + "grad_norm": 0.1366889476776123, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 235820 + }, + { + "epoch": 0.897627185737232, + "grad_norm": 0.12433522194623947, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 235830 + }, + { + "epoch": 0.8976652482053546, + "grad_norm": 0.12865471839904785, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 235840 + }, + { + "epoch": 0.8977033106734773, + "grad_norm": 0.12241709977388382, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 235850 + }, + { + "epoch": 0.8977413731416, + "grad_norm": 0.11950081586837769, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 235860 + }, + { + "epoch": 0.8977794356097227, + "grad_norm": 0.12112652510404587, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 235870 + }, + { + "epoch": 0.8978174980778454, + "grad_norm": 0.14162234961986542, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 235880 + }, + { + "epoch": 0.897855560545968, + "grad_norm": 0.12865613400936127, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 235890 + }, + { + "epoch": 0.8978936230140907, + "grad_norm": 0.1257365494966507, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 235900 + }, + { + "epoch": 0.8979316854822134, + "grad_norm": 0.11937166750431061, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 235910 + }, + { + "epoch": 0.8979697479503361, + "grad_norm": 0.12168044596910477, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 235920 + }, + { + "epoch": 0.8980078104184588, + "grad_norm": 0.1285332292318344, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 235930 + }, + { + "epoch": 0.8980458728865814, + "grad_norm": 0.11787950992584229, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 235940 + }, + { + "epoch": 0.8980839353547041, + "grad_norm": 0.13091526925563812, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 235950 + }, + { + "epoch": 0.8981219978228269, + "grad_norm": 0.12218698859214783, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 235960 + }, + { + "epoch": 0.8981600602909495, + "grad_norm": 0.12118630111217499, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 235970 + }, + { + "epoch": 0.8981981227590722, + "grad_norm": 0.12844808399677277, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 235980 + }, + { + "epoch": 0.8982361852271948, + "grad_norm": 0.11266063153743744, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 235990 + }, + { + "epoch": 0.8982742476953176, + "grad_norm": 0.13366252183914185, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 236000 + }, + { + "epoch": 0.8983123101634403, + "grad_norm": 0.12028734385967255, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 236010 + }, + { + "epoch": 0.8983503726315629, + "grad_norm": 0.14484402537345886, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 236020 + }, + { + "epoch": 0.8983884350996856, + "grad_norm": 0.13890573382377625, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 236030 + }, + { + "epoch": 0.8984264975678082, + "grad_norm": 0.12853313982486725, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 236040 + }, + { + "epoch": 0.898464560035931, + "grad_norm": 0.12683875858783722, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 236050 + }, + { + "epoch": 0.8985026225040537, + "grad_norm": 0.12236892431974411, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 236060 + }, + { + "epoch": 0.8985406849721763, + "grad_norm": 0.13218963146209717, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 236070 + }, + { + "epoch": 0.898578747440299, + "grad_norm": 0.12956587970256805, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 236080 + }, + { + "epoch": 0.8986168099084217, + "grad_norm": 0.12366285920143127, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 236090 + }, + { + "epoch": 0.8986548723765444, + "grad_norm": 0.12479374557733536, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 236100 + }, + { + "epoch": 0.8986929348446671, + "grad_norm": 0.11998150497674942, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 236110 + }, + { + "epoch": 0.8987309973127897, + "grad_norm": 0.12145683914422989, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 236120 + }, + { + "epoch": 0.8987690597809125, + "grad_norm": 0.13119207322597504, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 236130 + }, + { + "epoch": 0.8988071222490351, + "grad_norm": 0.12737643718719482, + "learning_rate": 0.0005, + "loss": 2.0835, + "step": 236140 + }, + { + "epoch": 0.8988451847171578, + "grad_norm": 0.13847488164901733, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 236150 + }, + { + "epoch": 0.8988832471852805, + "grad_norm": 0.12293814867734909, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 236160 + }, + { + "epoch": 0.8989213096534031, + "grad_norm": 0.10784052312374115, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 236170 + }, + { + "epoch": 0.8989593721215259, + "grad_norm": 0.13122645020484924, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 236180 + }, + { + "epoch": 0.8989974345896485, + "grad_norm": 0.12222135812044144, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 236190 + }, + { + "epoch": 0.8990354970577712, + "grad_norm": 0.1274719387292862, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 236200 + }, + { + "epoch": 0.8990735595258939, + "grad_norm": 0.12640856206417084, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 236210 + }, + { + "epoch": 0.8991116219940166, + "grad_norm": 0.13322730362415314, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 236220 + }, + { + "epoch": 0.8991496844621393, + "grad_norm": 0.11743305623531342, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 236230 + }, + { + "epoch": 0.8991877469302619, + "grad_norm": 0.14595948159694672, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 236240 + }, + { + "epoch": 0.8992258093983846, + "grad_norm": 0.12401731312274933, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 236250 + }, + { + "epoch": 0.8992638718665074, + "grad_norm": 0.11821137368679047, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 236260 + }, + { + "epoch": 0.89930193433463, + "grad_norm": 0.12861000001430511, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 236270 + }, + { + "epoch": 0.8993399968027527, + "grad_norm": 0.12914568185806274, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 236280 + }, + { + "epoch": 0.8993780592708753, + "grad_norm": 0.13119269907474518, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 236290 + }, + { + "epoch": 0.8994161217389981, + "grad_norm": 0.14866691827774048, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 236300 + }, + { + "epoch": 0.8994541842071208, + "grad_norm": 0.14841854572296143, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 236310 + }, + { + "epoch": 0.8994922466752434, + "grad_norm": 0.12997667491436005, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 236320 + }, + { + "epoch": 0.8995303091433661, + "grad_norm": 0.118070587515831, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 236330 + }, + { + "epoch": 0.8995683716114887, + "grad_norm": 0.12368416786193848, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 236340 + }, + { + "epoch": 0.8996064340796115, + "grad_norm": 0.1380939930677414, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 236350 + }, + { + "epoch": 0.8996444965477342, + "grad_norm": 0.1202736422419548, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 236360 + }, + { + "epoch": 0.8996825590158568, + "grad_norm": 0.12209721654653549, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 236370 + }, + { + "epoch": 0.8997206214839795, + "grad_norm": 0.12462722510099411, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 236380 + }, + { + "epoch": 0.8997586839521022, + "grad_norm": 0.12407328188419342, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 236390 + }, + { + "epoch": 0.8997967464202249, + "grad_norm": 0.11674037575721741, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 236400 + }, + { + "epoch": 0.8998348088883475, + "grad_norm": 0.12624992430210114, + "learning_rate": 0.0005, + "loss": 2.1199, + "step": 236410 + }, + { + "epoch": 0.8998728713564702, + "grad_norm": 0.12524190545082092, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 236420 + }, + { + "epoch": 0.899910933824593, + "grad_norm": 0.1276724636554718, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 236430 + }, + { + "epoch": 0.8999489962927156, + "grad_norm": 0.12621335685253143, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 236440 + }, + { + "epoch": 0.8999870587608383, + "grad_norm": 0.12861864268779755, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 236450 + }, + { + "epoch": 0.900025121228961, + "grad_norm": 0.13620729744434357, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 236460 + }, + { + "epoch": 0.9000631836970836, + "grad_norm": 0.13242807984352112, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 236470 + }, + { + "epoch": 0.9001012461652064, + "grad_norm": 0.12402066588401794, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 236480 + }, + { + "epoch": 0.900139308633329, + "grad_norm": 0.14862869679927826, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 236490 + }, + { + "epoch": 0.9001773711014517, + "grad_norm": 0.11755998432636261, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 236500 + }, + { + "epoch": 0.9002154335695743, + "grad_norm": 0.134490966796875, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 236510 + }, + { + "epoch": 0.9002534960376971, + "grad_norm": 0.13029280304908752, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 236520 + }, + { + "epoch": 0.9002915585058198, + "grad_norm": 0.13604643940925598, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 236530 + }, + { + "epoch": 0.9003296209739424, + "grad_norm": 0.11948616057634354, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 236540 + }, + { + "epoch": 0.9003676834420651, + "grad_norm": 0.1403426080942154, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 236550 + }, + { + "epoch": 0.9004057459101878, + "grad_norm": 0.12944534420967102, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 236560 + }, + { + "epoch": 0.9004438083783105, + "grad_norm": 0.13606569170951843, + "learning_rate": 0.0005, + "loss": 2.0843, + "step": 236570 + }, + { + "epoch": 0.9004818708464332, + "grad_norm": 0.1257597804069519, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 236580 + }, + { + "epoch": 0.9005199333145558, + "grad_norm": 0.13176938891410828, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 236590 + }, + { + "epoch": 0.9005579957826785, + "grad_norm": 0.14309963583946228, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 236600 + }, + { + "epoch": 0.9005960582508012, + "grad_norm": 0.12653017044067383, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 236610 + }, + { + "epoch": 0.9006341207189239, + "grad_norm": 0.13162977993488312, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 236620 + }, + { + "epoch": 0.9006721831870466, + "grad_norm": 0.13455240428447723, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 236630 + }, + { + "epoch": 0.9007102456551692, + "grad_norm": 0.13237057626247406, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 236640 + }, + { + "epoch": 0.900748308123292, + "grad_norm": 0.12127892673015594, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 236650 + }, + { + "epoch": 0.9007863705914146, + "grad_norm": 0.12416072934865952, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 236660 + }, + { + "epoch": 0.9008244330595373, + "grad_norm": 0.1350620687007904, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 236670 + }, + { + "epoch": 0.90086249552766, + "grad_norm": 0.1300247311592102, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 236680 + }, + { + "epoch": 0.9009005579957827, + "grad_norm": 0.12768010795116425, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 236690 + }, + { + "epoch": 0.9009386204639054, + "grad_norm": 0.1444215327501297, + "learning_rate": 0.0005, + "loss": 2.0875, + "step": 236700 + }, + { + "epoch": 0.900976682932028, + "grad_norm": 0.1194644421339035, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 236710 + }, + { + "epoch": 0.9010147454001507, + "grad_norm": 0.1391426920890808, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 236720 + }, + { + "epoch": 0.9010528078682735, + "grad_norm": 0.11720550805330276, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 236730 + }, + { + "epoch": 0.9010908703363961, + "grad_norm": 0.12966321408748627, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 236740 + }, + { + "epoch": 0.9011289328045188, + "grad_norm": 0.1279650330543518, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 236750 + }, + { + "epoch": 0.9011669952726414, + "grad_norm": 0.11716605722904205, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 236760 + }, + { + "epoch": 0.9012050577407641, + "grad_norm": 0.15291838347911835, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 236770 + }, + { + "epoch": 0.9012431202088869, + "grad_norm": 0.12723152339458466, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 236780 + }, + { + "epoch": 0.9012811826770095, + "grad_norm": 0.1147589385509491, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 236790 + }, + { + "epoch": 0.9013192451451322, + "grad_norm": 0.13322225213050842, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 236800 + }, + { + "epoch": 0.9013573076132548, + "grad_norm": 0.13659736514091492, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 236810 + }, + { + "epoch": 0.9013953700813776, + "grad_norm": 0.13309742510318756, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 236820 + }, + { + "epoch": 0.9014334325495003, + "grad_norm": 0.13266639411449432, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 236830 + }, + { + "epoch": 0.9014714950176229, + "grad_norm": 0.12524521350860596, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 236840 + }, + { + "epoch": 0.9015095574857456, + "grad_norm": 0.15074117481708527, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 236850 + }, + { + "epoch": 0.9015476199538683, + "grad_norm": 0.12030625343322754, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 236860 + }, + { + "epoch": 0.901585682421991, + "grad_norm": 0.13095256686210632, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 236870 + }, + { + "epoch": 0.9016237448901137, + "grad_norm": 0.11550428718328476, + "learning_rate": 0.0005, + "loss": 2.0883, + "step": 236880 + }, + { + "epoch": 0.9016618073582363, + "grad_norm": 0.11969823390245438, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 236890 + }, + { + "epoch": 0.901699869826359, + "grad_norm": 0.11549341678619385, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 236900 + }, + { + "epoch": 0.9017379322944817, + "grad_norm": 0.12132531404495239, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 236910 + }, + { + "epoch": 0.9017759947626044, + "grad_norm": 0.11665021628141403, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 236920 + }, + { + "epoch": 0.901814057230727, + "grad_norm": 0.1289794147014618, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 236930 + }, + { + "epoch": 0.9018521196988497, + "grad_norm": 0.14646443724632263, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 236940 + }, + { + "epoch": 0.9018901821669725, + "grad_norm": 0.1430707573890686, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 236950 + }, + { + "epoch": 0.9019282446350951, + "grad_norm": 0.13452860713005066, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 236960 + }, + { + "epoch": 0.9019663071032178, + "grad_norm": 0.12012603133916855, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 236970 + }, + { + "epoch": 0.9020043695713404, + "grad_norm": 0.11895259469747543, + "learning_rate": 0.0005, + "loss": 2.0845, + "step": 236980 + }, + { + "epoch": 0.9020424320394632, + "grad_norm": 0.1259036660194397, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 236990 + }, + { + "epoch": 0.9020804945075859, + "grad_norm": 0.14254164695739746, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 237000 + }, + { + "epoch": 0.9021185569757085, + "grad_norm": 0.1326291710138321, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 237010 + }, + { + "epoch": 0.9021566194438312, + "grad_norm": 0.12728415429592133, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 237020 + }, + { + "epoch": 0.9021946819119538, + "grad_norm": 0.13301852345466614, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 237030 + }, + { + "epoch": 0.9022327443800766, + "grad_norm": 0.14580120146274567, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 237040 + }, + { + "epoch": 0.9022708068481993, + "grad_norm": 0.12128158658742905, + "learning_rate": 0.0005, + "loss": 2.0766, + "step": 237050 + }, + { + "epoch": 0.9023088693163219, + "grad_norm": 0.14106401801109314, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 237060 + }, + { + "epoch": 0.9023469317844446, + "grad_norm": 0.1238996684551239, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 237070 + }, + { + "epoch": 0.9023849942525674, + "grad_norm": 0.12480529397726059, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 237080 + }, + { + "epoch": 0.90242305672069, + "grad_norm": 0.1321299970149994, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 237090 + }, + { + "epoch": 0.9024611191888127, + "grad_norm": 0.1273770034313202, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 237100 + }, + { + "epoch": 0.9024991816569353, + "grad_norm": 0.11618199944496155, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 237110 + }, + { + "epoch": 0.9025372441250581, + "grad_norm": 0.13283830881118774, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 237120 + }, + { + "epoch": 0.9025753065931807, + "grad_norm": 0.12719158828258514, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 237130 + }, + { + "epoch": 0.9026133690613034, + "grad_norm": 0.11212098598480225, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 237140 + }, + { + "epoch": 0.9026514315294261, + "grad_norm": 0.13038401305675507, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 237150 + }, + { + "epoch": 0.9026894939975488, + "grad_norm": 0.1276264190673828, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 237160 + }, + { + "epoch": 0.9027275564656715, + "grad_norm": 0.13317757844924927, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 237170 + }, + { + "epoch": 0.9027656189337941, + "grad_norm": 0.13702648878097534, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 237180 + }, + { + "epoch": 0.9028036814019168, + "grad_norm": 0.13050617277622223, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 237190 + }, + { + "epoch": 0.9028417438700395, + "grad_norm": 0.12711739540100098, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 237200 + }, + { + "epoch": 0.9028798063381622, + "grad_norm": 0.12161190062761307, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 237210 + }, + { + "epoch": 0.9029178688062849, + "grad_norm": 0.12170030176639557, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 237220 + }, + { + "epoch": 0.9029559312744075, + "grad_norm": 0.11940809339284897, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 237230 + }, + { + "epoch": 0.9029939937425302, + "grad_norm": 0.12572605907917023, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 237240 + }, + { + "epoch": 0.903032056210653, + "grad_norm": 0.14780136942863464, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 237250 + }, + { + "epoch": 0.9030701186787756, + "grad_norm": 0.13027071952819824, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 237260 + }, + { + "epoch": 0.9031081811468983, + "grad_norm": 0.14059773087501526, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 237270 + }, + { + "epoch": 0.9031462436150209, + "grad_norm": 0.13311892747879028, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 237280 + }, + { + "epoch": 0.9031843060831437, + "grad_norm": 0.1414981335401535, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 237290 + }, + { + "epoch": 0.9032223685512664, + "grad_norm": 0.13014385104179382, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 237300 + }, + { + "epoch": 0.903260431019389, + "grad_norm": 0.12036735564470291, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 237310 + }, + { + "epoch": 0.9032984934875117, + "grad_norm": 0.13092973828315735, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 237320 + }, + { + "epoch": 0.9033365559556343, + "grad_norm": 0.11806660890579224, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 237330 + }, + { + "epoch": 0.9033746184237571, + "grad_norm": 0.13571125268936157, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 237340 + }, + { + "epoch": 0.9034126808918798, + "grad_norm": 0.1258023977279663, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 237350 + }, + { + "epoch": 0.9034507433600024, + "grad_norm": 0.13625076413154602, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 237360 + }, + { + "epoch": 0.9034888058281251, + "grad_norm": 0.12940800189971924, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 237370 + }, + { + "epoch": 0.9035268682962478, + "grad_norm": 0.13219799101352692, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 237380 + }, + { + "epoch": 0.9035649307643705, + "grad_norm": 0.13905245065689087, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 237390 + }, + { + "epoch": 0.9036029932324932, + "grad_norm": 0.13367201387882233, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 237400 + }, + { + "epoch": 0.9036410557006158, + "grad_norm": 0.12304285168647766, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 237410 + }, + { + "epoch": 0.9036791181687386, + "grad_norm": 0.13298951089382172, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 237420 + }, + { + "epoch": 0.9037171806368612, + "grad_norm": 0.12021772563457489, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 237430 + }, + { + "epoch": 0.9037552431049839, + "grad_norm": 0.12623293697834015, + "learning_rate": 0.0005, + "loss": 2.1211, + "step": 237440 + }, + { + "epoch": 0.9037933055731066, + "grad_norm": 0.14011913537979126, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 237450 + }, + { + "epoch": 0.9038313680412292, + "grad_norm": 0.1234155222773552, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 237460 + }, + { + "epoch": 0.903869430509352, + "grad_norm": 0.12620636820793152, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 237470 + }, + { + "epoch": 0.9039074929774746, + "grad_norm": 0.12419416755437851, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 237480 + }, + { + "epoch": 0.9039455554455973, + "grad_norm": 0.13159045577049255, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 237490 + }, + { + "epoch": 0.90398361791372, + "grad_norm": 0.14370512962341309, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 237500 + }, + { + "epoch": 0.9040216803818427, + "grad_norm": 0.14260618388652802, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 237510 + }, + { + "epoch": 0.9040597428499654, + "grad_norm": 0.11814543604850769, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 237520 + }, + { + "epoch": 0.904097805318088, + "grad_norm": 0.1222439780831337, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 237530 + }, + { + "epoch": 0.9041358677862107, + "grad_norm": 0.140053853392601, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 237540 + }, + { + "epoch": 0.9041739302543335, + "grad_norm": 0.1309879869222641, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 237550 + }, + { + "epoch": 0.9042119927224561, + "grad_norm": 0.1283019781112671, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 237560 + }, + { + "epoch": 0.9042500551905788, + "grad_norm": 0.13409188389778137, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 237570 + }, + { + "epoch": 0.9042881176587014, + "grad_norm": 0.12143038213253021, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 237580 + }, + { + "epoch": 0.9043261801268242, + "grad_norm": 0.11686935275793076, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 237590 + }, + { + "epoch": 0.9043642425949469, + "grad_norm": 0.12011512368917465, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 237600 + }, + { + "epoch": 0.9044023050630695, + "grad_norm": 0.11764641851186752, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 237610 + }, + { + "epoch": 0.9044403675311922, + "grad_norm": 0.13327255845069885, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 237620 + }, + { + "epoch": 0.9044784299993148, + "grad_norm": 0.12383707612752914, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 237630 + }, + { + "epoch": 0.9045164924674376, + "grad_norm": 0.14364486932754517, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 237640 + }, + { + "epoch": 0.9045545549355603, + "grad_norm": 0.12070348113775253, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 237650 + }, + { + "epoch": 0.9045926174036829, + "grad_norm": 0.12772414088249207, + "learning_rate": 0.0005, + "loss": 2.0856, + "step": 237660 + }, + { + "epoch": 0.9046306798718056, + "grad_norm": 0.12472674250602722, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 237670 + }, + { + "epoch": 0.9046687423399283, + "grad_norm": 0.14813371002674103, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 237680 + }, + { + "epoch": 0.904706804808051, + "grad_norm": 0.1288592517375946, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 237690 + }, + { + "epoch": 0.9047448672761736, + "grad_norm": 0.1278732568025589, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 237700 + }, + { + "epoch": 0.9047829297442963, + "grad_norm": 0.12233841419219971, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 237710 + }, + { + "epoch": 0.9048209922124191, + "grad_norm": 0.12657716870307922, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 237720 + }, + { + "epoch": 0.9048590546805417, + "grad_norm": 0.1324680596590042, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 237730 + }, + { + "epoch": 0.9048971171486644, + "grad_norm": 0.12798066437244415, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 237740 + }, + { + "epoch": 0.904935179616787, + "grad_norm": 0.12066149711608887, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 237750 + }, + { + "epoch": 0.9049732420849097, + "grad_norm": 0.1185355931520462, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 237760 + }, + { + "epoch": 0.9050113045530325, + "grad_norm": 0.12513957917690277, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 237770 + }, + { + "epoch": 0.9050493670211551, + "grad_norm": 0.12734851241111755, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 237780 + }, + { + "epoch": 0.9050874294892778, + "grad_norm": 0.12355585396289825, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 237790 + }, + { + "epoch": 0.9051254919574004, + "grad_norm": 0.1301615983247757, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 237800 + }, + { + "epoch": 0.9051635544255232, + "grad_norm": 0.13630014657974243, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 237810 + }, + { + "epoch": 0.9052016168936459, + "grad_norm": 0.14217840135097504, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 237820 + }, + { + "epoch": 0.9052396793617685, + "grad_norm": 0.1299649029970169, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 237830 + }, + { + "epoch": 0.9052777418298912, + "grad_norm": 0.11644427478313446, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 237840 + }, + { + "epoch": 0.905315804298014, + "grad_norm": 0.12850260734558105, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 237850 + }, + { + "epoch": 0.9053538667661366, + "grad_norm": 0.2166110724210739, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 237860 + }, + { + "epoch": 0.9053919292342593, + "grad_norm": 0.12297400087118149, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 237870 + }, + { + "epoch": 0.9054299917023819, + "grad_norm": 0.12361843138933182, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 237880 + }, + { + "epoch": 0.9054680541705046, + "grad_norm": 0.12942881882190704, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 237890 + }, + { + "epoch": 0.9055061166386273, + "grad_norm": 0.12694664299488068, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 237900 + }, + { + "epoch": 0.90554417910675, + "grad_norm": 0.1355399787425995, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 237910 + }, + { + "epoch": 0.9055822415748727, + "grad_norm": 0.12172191590070724, + "learning_rate": 0.0005, + "loss": 2.082, + "step": 237920 + }, + { + "epoch": 0.9056203040429953, + "grad_norm": 0.12276661396026611, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 237930 + }, + { + "epoch": 0.9056583665111181, + "grad_norm": 0.12394405156373978, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 237940 + }, + { + "epoch": 0.9056964289792407, + "grad_norm": 0.12693005800247192, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 237950 + }, + { + "epoch": 0.9057344914473634, + "grad_norm": 0.12156771123409271, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 237960 + }, + { + "epoch": 0.9057725539154861, + "grad_norm": 0.2160923331975937, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 237970 + }, + { + "epoch": 0.9058106163836088, + "grad_norm": 0.127583846449852, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 237980 + }, + { + "epoch": 0.9058486788517315, + "grad_norm": 0.1311924308538437, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 237990 + }, + { + "epoch": 0.9058867413198541, + "grad_norm": 0.1340932697057724, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 238000 + }, + { + "epoch": 0.9059248037879768, + "grad_norm": 0.13224120438098907, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 238010 + }, + { + "epoch": 0.9059628662560996, + "grad_norm": 0.12841913104057312, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 238020 + }, + { + "epoch": 0.9060009287242222, + "grad_norm": 0.1256815791130066, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 238030 + }, + { + "epoch": 0.9060389911923449, + "grad_norm": 0.12731921672821045, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 238040 + }, + { + "epoch": 0.9060770536604675, + "grad_norm": 0.125631183385849, + "learning_rate": 0.0005, + "loss": 2.0843, + "step": 238050 + }, + { + "epoch": 0.9061151161285902, + "grad_norm": 0.1256406456232071, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 238060 + }, + { + "epoch": 0.906153178596713, + "grad_norm": 0.10923466831445694, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 238070 + }, + { + "epoch": 0.9061912410648356, + "grad_norm": 0.12627013027668, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 238080 + }, + { + "epoch": 0.9062293035329583, + "grad_norm": 0.11679789423942566, + "learning_rate": 0.0005, + "loss": 2.0896, + "step": 238090 + }, + { + "epoch": 0.9062673660010809, + "grad_norm": 0.12997809052467346, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 238100 + }, + { + "epoch": 0.9063054284692037, + "grad_norm": 0.13287369906902313, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 238110 + }, + { + "epoch": 0.9063434909373264, + "grad_norm": 0.12381280958652496, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 238120 + }, + { + "epoch": 0.906381553405449, + "grad_norm": 0.12879888713359833, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 238130 + }, + { + "epoch": 0.9064196158735717, + "grad_norm": 0.12576310336589813, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 238140 + }, + { + "epoch": 0.9064576783416944, + "grad_norm": 0.12751945853233337, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 238150 + }, + { + "epoch": 0.9064957408098171, + "grad_norm": 0.11907423287630081, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 238160 + }, + { + "epoch": 0.9065338032779398, + "grad_norm": 0.13061079382896423, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 238170 + }, + { + "epoch": 0.9065718657460624, + "grad_norm": 0.13230901956558228, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 238180 + }, + { + "epoch": 0.9066099282141851, + "grad_norm": 0.11777165532112122, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 238190 + }, + { + "epoch": 0.9066479906823078, + "grad_norm": 0.12985430657863617, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 238200 + }, + { + "epoch": 0.9066860531504305, + "grad_norm": 0.12661513686180115, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 238210 + }, + { + "epoch": 0.9067241156185532, + "grad_norm": 0.1334647685289383, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 238220 + }, + { + "epoch": 0.9067621780866758, + "grad_norm": 0.12028210610151291, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 238230 + }, + { + "epoch": 0.9068002405547986, + "grad_norm": 0.12762492895126343, + "learning_rate": 0.0005, + "loss": 2.1245, + "step": 238240 + }, + { + "epoch": 0.9068383030229212, + "grad_norm": 0.13555355370044708, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 238250 + }, + { + "epoch": 0.9068763654910439, + "grad_norm": 0.12440907210111618, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 238260 + }, + { + "epoch": 0.9069144279591665, + "grad_norm": 0.13013915717601776, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 238270 + }, + { + "epoch": 0.9069524904272893, + "grad_norm": 0.1273365169763565, + "learning_rate": 0.0005, + "loss": 2.0811, + "step": 238280 + }, + { + "epoch": 0.906990552895412, + "grad_norm": 0.13053053617477417, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 238290 + }, + { + "epoch": 0.9070286153635346, + "grad_norm": 0.12159951776266098, + "learning_rate": 0.0005, + "loss": 2.0798, + "step": 238300 + }, + { + "epoch": 0.9070666778316573, + "grad_norm": 0.13156504929065704, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 238310 + }, + { + "epoch": 0.9071047402997799, + "grad_norm": 0.12892203032970428, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 238320 + }, + { + "epoch": 0.9071428027679027, + "grad_norm": 0.1343413144350052, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 238330 + }, + { + "epoch": 0.9071808652360254, + "grad_norm": 0.12478955090045929, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 238340 + }, + { + "epoch": 0.907218927704148, + "grad_norm": 0.12486769258975983, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 238350 + }, + { + "epoch": 0.9072569901722707, + "grad_norm": 0.12930084764957428, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 238360 + }, + { + "epoch": 0.9072950526403935, + "grad_norm": 0.13934142887592316, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 238370 + }, + { + "epoch": 0.9073331151085161, + "grad_norm": 0.1408960372209549, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 238380 + }, + { + "epoch": 0.9073711775766388, + "grad_norm": 0.1512349545955658, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 238390 + }, + { + "epoch": 0.9074092400447614, + "grad_norm": 0.12842883169651031, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 238400 + }, + { + "epoch": 0.9074473025128842, + "grad_norm": 0.1381392478942871, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 238410 + }, + { + "epoch": 0.9074853649810068, + "grad_norm": 0.12094287574291229, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 238420 + }, + { + "epoch": 0.9075234274491295, + "grad_norm": 0.1263532191514969, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 238430 + }, + { + "epoch": 0.9075614899172522, + "grad_norm": 0.14776651561260223, + "learning_rate": 0.0005, + "loss": 2.0833, + "step": 238440 + }, + { + "epoch": 0.9075995523853749, + "grad_norm": 0.12728571891784668, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 238450 + }, + { + "epoch": 0.9076376148534976, + "grad_norm": 0.13688482344150543, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 238460 + }, + { + "epoch": 0.9076756773216202, + "grad_norm": 0.15688008069992065, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 238470 + }, + { + "epoch": 0.9077137397897429, + "grad_norm": 0.12313451617956161, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 238480 + }, + { + "epoch": 0.9077518022578656, + "grad_norm": 0.12344785779714584, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 238490 + }, + { + "epoch": 0.9077898647259883, + "grad_norm": 0.13766978681087494, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 238500 + }, + { + "epoch": 0.907827927194111, + "grad_norm": 0.14033202826976776, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 238510 + }, + { + "epoch": 0.9078659896622336, + "grad_norm": 0.12385424226522446, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 238520 + }, + { + "epoch": 0.9079040521303563, + "grad_norm": 0.12292386591434479, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 238530 + }, + { + "epoch": 0.9079421145984791, + "grad_norm": 0.11781243979930878, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 238540 + }, + { + "epoch": 0.9079801770666017, + "grad_norm": 0.12393909692764282, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 238550 + }, + { + "epoch": 0.9080182395347244, + "grad_norm": 0.14330510795116425, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 238560 + }, + { + "epoch": 0.908056302002847, + "grad_norm": 0.12844721972942352, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 238570 + }, + { + "epoch": 0.9080943644709698, + "grad_norm": 0.13164971768856049, + "learning_rate": 0.0005, + "loss": 2.0821, + "step": 238580 + }, + { + "epoch": 0.9081324269390925, + "grad_norm": 0.11971738189458847, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 238590 + }, + { + "epoch": 0.9081704894072151, + "grad_norm": 0.12422315776348114, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 238600 + }, + { + "epoch": 0.9082085518753378, + "grad_norm": 0.1429583877325058, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 238610 + }, + { + "epoch": 0.9082466143434604, + "grad_norm": 0.12299343943595886, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 238620 + }, + { + "epoch": 0.9082846768115832, + "grad_norm": 0.13192951679229736, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 238630 + }, + { + "epoch": 0.9083227392797059, + "grad_norm": 0.12766271829605103, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 238640 + }, + { + "epoch": 0.9083608017478285, + "grad_norm": 0.13206838071346283, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 238650 + }, + { + "epoch": 0.9083988642159512, + "grad_norm": 0.13054607808589935, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 238660 + }, + { + "epoch": 0.9084369266840739, + "grad_norm": 0.1256096065044403, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 238670 + }, + { + "epoch": 0.9084749891521966, + "grad_norm": 0.13709591329097748, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 238680 + }, + { + "epoch": 0.9085130516203193, + "grad_norm": 0.1344529390335083, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 238690 + }, + { + "epoch": 0.9085511140884419, + "grad_norm": 0.11625105142593384, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 238700 + }, + { + "epoch": 0.9085891765565647, + "grad_norm": 0.13110487163066864, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 238710 + }, + { + "epoch": 0.9086272390246873, + "grad_norm": 0.11854494363069534, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 238720 + }, + { + "epoch": 0.90866530149281, + "grad_norm": 0.1250758171081543, + "learning_rate": 0.0005, + "loss": 2.0843, + "step": 238730 + }, + { + "epoch": 0.9087033639609327, + "grad_norm": 0.11900221556425095, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 238740 + }, + { + "epoch": 0.9087414264290554, + "grad_norm": 0.12195499986410141, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 238750 + }, + { + "epoch": 0.9087794888971781, + "grad_norm": 0.14227333664894104, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 238760 + }, + { + "epoch": 0.9088175513653007, + "grad_norm": 0.12002543359994888, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 238770 + }, + { + "epoch": 0.9088556138334234, + "grad_norm": 0.11458992213010788, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 238780 + }, + { + "epoch": 0.908893676301546, + "grad_norm": 0.12165806442499161, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 238790 + }, + { + "epoch": 0.9089317387696688, + "grad_norm": 0.1254490166902542, + "learning_rate": 0.0005, + "loss": 2.0819, + "step": 238800 + }, + { + "epoch": 0.9089698012377915, + "grad_norm": 0.12584739923477173, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 238810 + }, + { + "epoch": 0.9090078637059141, + "grad_norm": 0.14241082966327667, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 238820 + }, + { + "epoch": 0.9090459261740368, + "grad_norm": 0.13195203244686127, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 238830 + }, + { + "epoch": 0.9090839886421596, + "grad_norm": 0.12545497715473175, + "learning_rate": 0.0005, + "loss": 2.1243, + "step": 238840 + }, + { + "epoch": 0.9091220511102822, + "grad_norm": 0.12716884911060333, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 238850 + }, + { + "epoch": 0.9091601135784049, + "grad_norm": 0.1222204640507698, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 238860 + }, + { + "epoch": 0.9091981760465275, + "grad_norm": 0.12676088511943817, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 238870 + }, + { + "epoch": 0.9092362385146503, + "grad_norm": 0.14713385701179504, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 238880 + }, + { + "epoch": 0.909274300982773, + "grad_norm": 0.1335698366165161, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 238890 + }, + { + "epoch": 0.9093123634508956, + "grad_norm": 0.13682667911052704, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 238900 + }, + { + "epoch": 0.9093504259190183, + "grad_norm": 0.1311189830303192, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 238910 + }, + { + "epoch": 0.9093884883871409, + "grad_norm": 0.13310740888118744, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 238920 + }, + { + "epoch": 0.9094265508552637, + "grad_norm": 0.1309422254562378, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 238930 + }, + { + "epoch": 0.9094646133233864, + "grad_norm": 0.12251695990562439, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 238940 + }, + { + "epoch": 0.909502675791509, + "grad_norm": 0.13915124535560608, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 238950 + }, + { + "epoch": 0.9095407382596317, + "grad_norm": 0.1315782219171524, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 238960 + }, + { + "epoch": 0.9095788007277544, + "grad_norm": 0.14065399765968323, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 238970 + }, + { + "epoch": 0.9096168631958771, + "grad_norm": 0.14490070939064026, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 238980 + }, + { + "epoch": 0.9096549256639997, + "grad_norm": 0.132430762052536, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 238990 + }, + { + "epoch": 0.9096929881321224, + "grad_norm": 0.1288212686777115, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 239000 + }, + { + "epoch": 0.9097310506002452, + "grad_norm": 0.1340613067150116, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 239010 + }, + { + "epoch": 0.9097691130683678, + "grad_norm": 0.13787075877189636, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 239020 + }, + { + "epoch": 0.9098071755364905, + "grad_norm": 0.13590595126152039, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 239030 + }, + { + "epoch": 0.9098452380046131, + "grad_norm": 0.12815529108047485, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 239040 + }, + { + "epoch": 0.9098833004727358, + "grad_norm": 0.12662175297737122, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 239050 + }, + { + "epoch": 0.9099213629408586, + "grad_norm": 0.13380499184131622, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 239060 + }, + { + "epoch": 0.9099594254089812, + "grad_norm": 0.11831115186214447, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 239070 + }, + { + "epoch": 0.9099974878771039, + "grad_norm": 0.12954314053058624, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 239080 + }, + { + "epoch": 0.9100355503452265, + "grad_norm": 0.12330293655395508, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 239090 + }, + { + "epoch": 0.9100736128133493, + "grad_norm": 0.12839968502521515, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 239100 + }, + { + "epoch": 0.910111675281472, + "grad_norm": 0.13956953585147858, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 239110 + }, + { + "epoch": 0.9101497377495946, + "grad_norm": 0.1369396448135376, + "learning_rate": 0.0005, + "loss": 2.079, + "step": 239120 + }, + { + "epoch": 0.9101878002177173, + "grad_norm": 0.1352585405111313, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 239130 + }, + { + "epoch": 0.91022586268584, + "grad_norm": 0.13882745802402496, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 239140 + }, + { + "epoch": 0.9102639251539627, + "grad_norm": 0.12504839897155762, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 239150 + }, + { + "epoch": 0.9103019876220854, + "grad_norm": 0.1293504387140274, + "learning_rate": 0.0005, + "loss": 2.0844, + "step": 239160 + }, + { + "epoch": 0.910340050090208, + "grad_norm": 0.11434295773506165, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 239170 + }, + { + "epoch": 0.9103781125583308, + "grad_norm": 0.12427566945552826, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 239180 + }, + { + "epoch": 0.9104161750264534, + "grad_norm": 0.12932242453098297, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 239190 + }, + { + "epoch": 0.9104542374945761, + "grad_norm": 0.13227252662181854, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 239200 + }, + { + "epoch": 0.9104922999626988, + "grad_norm": 0.13035479187965393, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 239210 + }, + { + "epoch": 0.9105303624308214, + "grad_norm": 0.12500816583633423, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 239220 + }, + { + "epoch": 0.9105684248989442, + "grad_norm": 0.1311212033033371, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 239230 + }, + { + "epoch": 0.9106064873670668, + "grad_norm": 0.14410890638828278, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 239240 + }, + { + "epoch": 0.9106445498351895, + "grad_norm": 0.1327977031469345, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 239250 + }, + { + "epoch": 0.9106826123033122, + "grad_norm": 0.12608478963375092, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 239260 + }, + { + "epoch": 0.9107206747714349, + "grad_norm": 0.1240050345659256, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 239270 + }, + { + "epoch": 0.9107587372395576, + "grad_norm": 0.1272159367799759, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 239280 + }, + { + "epoch": 0.9107967997076802, + "grad_norm": 0.12495238333940506, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 239290 + }, + { + "epoch": 0.9108348621758029, + "grad_norm": 0.13057975471019745, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 239300 + }, + { + "epoch": 0.9108729246439257, + "grad_norm": 0.14395946264266968, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 239310 + }, + { + "epoch": 0.9109109871120483, + "grad_norm": 0.1312178671360016, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 239320 + }, + { + "epoch": 0.910949049580171, + "grad_norm": 0.13741567730903625, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 239330 + }, + { + "epoch": 0.9109871120482936, + "grad_norm": 0.13207745552062988, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 239340 + }, + { + "epoch": 0.9110251745164163, + "grad_norm": 0.13339263200759888, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 239350 + }, + { + "epoch": 0.9110632369845391, + "grad_norm": 0.1387874186038971, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 239360 + }, + { + "epoch": 0.9111012994526617, + "grad_norm": 0.13634562492370605, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 239370 + }, + { + "epoch": 0.9111393619207844, + "grad_norm": 0.13025477528572083, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 239380 + }, + { + "epoch": 0.911177424388907, + "grad_norm": 0.12511643767356873, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 239390 + }, + { + "epoch": 0.9112154868570298, + "grad_norm": 0.13627535104751587, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 239400 + }, + { + "epoch": 0.9112535493251525, + "grad_norm": 0.1424589306116104, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 239410 + }, + { + "epoch": 0.9112916117932751, + "grad_norm": 0.12316667288541794, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 239420 + }, + { + "epoch": 0.9113296742613978, + "grad_norm": 0.12137805670499802, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 239430 + }, + { + "epoch": 0.9113677367295205, + "grad_norm": 0.12696003913879395, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 239440 + }, + { + "epoch": 0.9114057991976432, + "grad_norm": 0.11263412237167358, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 239450 + }, + { + "epoch": 0.9114438616657659, + "grad_norm": 0.12840403616428375, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 239460 + }, + { + "epoch": 0.9114819241338885, + "grad_norm": 0.13457255065441132, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 239470 + }, + { + "epoch": 0.9115199866020112, + "grad_norm": 0.12429932504892349, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 239480 + }, + { + "epoch": 0.9115580490701339, + "grad_norm": 0.12521515786647797, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 239490 + }, + { + "epoch": 0.9115961115382566, + "grad_norm": 0.12820985913276672, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 239500 + }, + { + "epoch": 0.9116341740063792, + "grad_norm": 0.12908533215522766, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 239510 + }, + { + "epoch": 0.9116722364745019, + "grad_norm": 0.12380233407020569, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 239520 + }, + { + "epoch": 0.9117102989426247, + "grad_norm": 0.14130914211273193, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 239530 + }, + { + "epoch": 0.9117483614107473, + "grad_norm": 0.132679283618927, + "learning_rate": 0.0005, + "loss": 2.088, + "step": 239540 + }, + { + "epoch": 0.91178642387887, + "grad_norm": 0.13694171607494354, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 239550 + }, + { + "epoch": 0.9118244863469926, + "grad_norm": 0.12591597437858582, + "learning_rate": 0.0005, + "loss": 2.0814, + "step": 239560 + }, + { + "epoch": 0.9118625488151154, + "grad_norm": 0.11692557483911514, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 239570 + }, + { + "epoch": 0.9119006112832381, + "grad_norm": 0.13992343842983246, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 239580 + }, + { + "epoch": 0.9119386737513607, + "grad_norm": 0.1163182258605957, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 239590 + }, + { + "epoch": 0.9119767362194834, + "grad_norm": 0.11891079694032669, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 239600 + }, + { + "epoch": 0.9120147986876062, + "grad_norm": 0.12675254046916962, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 239610 + }, + { + "epoch": 0.9120528611557288, + "grad_norm": 0.12357204407453537, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 239620 + }, + { + "epoch": 0.9120909236238515, + "grad_norm": 0.11951776593923569, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 239630 + }, + { + "epoch": 0.9121289860919741, + "grad_norm": 0.12621699273586273, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 239640 + }, + { + "epoch": 0.9121670485600968, + "grad_norm": 0.12371443957090378, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 239650 + }, + { + "epoch": 0.9122051110282196, + "grad_norm": 0.13789597153663635, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 239660 + }, + { + "epoch": 0.9122431734963422, + "grad_norm": 0.12520888447761536, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 239670 + }, + { + "epoch": 0.9122812359644649, + "grad_norm": 0.1457749903202057, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 239680 + }, + { + "epoch": 0.9123192984325875, + "grad_norm": 0.13121797144412994, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 239690 + }, + { + "epoch": 0.9123573609007103, + "grad_norm": 0.14107409119606018, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 239700 + }, + { + "epoch": 0.912395423368833, + "grad_norm": 0.12807787954807281, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 239710 + }, + { + "epoch": 0.9124334858369556, + "grad_norm": 0.13790073990821838, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 239720 + }, + { + "epoch": 0.9124715483050783, + "grad_norm": 0.13001273572444916, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 239730 + }, + { + "epoch": 0.912509610773201, + "grad_norm": 0.12587277591228485, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 239740 + }, + { + "epoch": 0.9125476732413237, + "grad_norm": 0.14173942804336548, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 239750 + }, + { + "epoch": 0.9125857357094463, + "grad_norm": 0.12748290598392487, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 239760 + }, + { + "epoch": 0.912623798177569, + "grad_norm": 0.12559473514556885, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 239770 + }, + { + "epoch": 0.9126618606456917, + "grad_norm": 0.1296173632144928, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 239780 + }, + { + "epoch": 0.9126999231138144, + "grad_norm": 0.12084129452705383, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 239790 + }, + { + "epoch": 0.9127379855819371, + "grad_norm": 0.1363648772239685, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 239800 + }, + { + "epoch": 0.9127760480500597, + "grad_norm": 0.13141706585884094, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 239810 + }, + { + "epoch": 0.9128141105181824, + "grad_norm": 0.1180909052491188, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 239820 + }, + { + "epoch": 0.9128521729863052, + "grad_norm": 0.1213419958949089, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 239830 + }, + { + "epoch": 0.9128902354544278, + "grad_norm": 0.13057322800159454, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 239840 + }, + { + "epoch": 0.9129282979225505, + "grad_norm": 0.14257821440696716, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 239850 + }, + { + "epoch": 0.9129663603906731, + "grad_norm": 0.1380070596933365, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 239860 + }, + { + "epoch": 0.9130044228587959, + "grad_norm": 0.1317712664604187, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 239870 + }, + { + "epoch": 0.9130424853269186, + "grad_norm": 0.14122363924980164, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 239880 + }, + { + "epoch": 0.9130805477950412, + "grad_norm": 0.11707179248332977, + "learning_rate": 0.0005, + "loss": 2.0888, + "step": 239890 + }, + { + "epoch": 0.9131186102631639, + "grad_norm": 0.13957612216472626, + "learning_rate": 0.0005, + "loss": 2.118, + "step": 239900 + }, + { + "epoch": 0.9131566727312865, + "grad_norm": 0.13124236464500427, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 239910 + }, + { + "epoch": 0.9131947351994093, + "grad_norm": 0.12704578042030334, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 239920 + }, + { + "epoch": 0.913232797667532, + "grad_norm": 0.1228540688753128, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 239930 + }, + { + "epoch": 0.9132708601356546, + "grad_norm": 0.11537936329841614, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 239940 + }, + { + "epoch": 0.9133089226037773, + "grad_norm": 0.12744903564453125, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 239950 + }, + { + "epoch": 0.9133469850719, + "grad_norm": 0.1313735693693161, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 239960 + }, + { + "epoch": 0.9133850475400227, + "grad_norm": 0.14830102026462555, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 239970 + }, + { + "epoch": 0.9134231100081454, + "grad_norm": 0.1338607221841812, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 239980 + }, + { + "epoch": 0.913461172476268, + "grad_norm": 0.1361265778541565, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 239990 + }, + { + "epoch": 0.9134992349443908, + "grad_norm": 0.11743402481079102, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 240000 + }, + { + "epoch": 0.9135372974125134, + "grad_norm": 0.12837345898151398, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 240010 + }, + { + "epoch": 0.9135753598806361, + "grad_norm": 0.13669757544994354, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 240020 + }, + { + "epoch": 0.9136134223487588, + "grad_norm": 0.155908465385437, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 240030 + }, + { + "epoch": 0.9136514848168815, + "grad_norm": 0.11326774209737778, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 240040 + }, + { + "epoch": 0.9136895472850042, + "grad_norm": 0.12513118982315063, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 240050 + }, + { + "epoch": 0.9137276097531268, + "grad_norm": 0.11165674775838852, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 240060 + }, + { + "epoch": 0.9137656722212495, + "grad_norm": 0.1229131743311882, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 240070 + }, + { + "epoch": 0.9138037346893721, + "grad_norm": 0.12102916091680527, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 240080 + }, + { + "epoch": 0.9138417971574949, + "grad_norm": 0.12489450722932816, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 240090 + }, + { + "epoch": 0.9138798596256176, + "grad_norm": 0.13946636021137238, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 240100 + }, + { + "epoch": 0.9139179220937402, + "grad_norm": 0.13643383979797363, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 240110 + }, + { + "epoch": 0.9139559845618629, + "grad_norm": 0.12476367503404617, + "learning_rate": 0.0005, + "loss": 2.0862, + "step": 240120 + }, + { + "epoch": 0.9139940470299857, + "grad_norm": 0.13304784893989563, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 240130 + }, + { + "epoch": 0.9140321094981083, + "grad_norm": 0.11753713339567184, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 240140 + }, + { + "epoch": 0.914070171966231, + "grad_norm": 0.13841630518436432, + "learning_rate": 0.0005, + "loss": 2.0843, + "step": 240150 + }, + { + "epoch": 0.9141082344343536, + "grad_norm": 0.1333719938993454, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 240160 + }, + { + "epoch": 0.9141462969024764, + "grad_norm": 0.13542665541172028, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 240170 + }, + { + "epoch": 0.914184359370599, + "grad_norm": 0.135142520070076, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 240180 + }, + { + "epoch": 0.9142224218387217, + "grad_norm": 0.1313934624195099, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 240190 + }, + { + "epoch": 0.9142604843068444, + "grad_norm": 0.12458037585020065, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 240200 + }, + { + "epoch": 0.914298546774967, + "grad_norm": 0.13040408492088318, + "learning_rate": 0.0005, + "loss": 2.1171, + "step": 240210 + }, + { + "epoch": 0.9143366092430898, + "grad_norm": 0.124188631772995, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 240220 + }, + { + "epoch": 0.9143746717112125, + "grad_norm": 0.14607562124729156, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 240230 + }, + { + "epoch": 0.9144127341793351, + "grad_norm": 0.13257095217704773, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 240240 + }, + { + "epoch": 0.9144507966474578, + "grad_norm": 0.12451108545064926, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 240250 + }, + { + "epoch": 0.9144888591155805, + "grad_norm": 0.12608541548252106, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 240260 + }, + { + "epoch": 0.9145269215837032, + "grad_norm": 0.13142694532871246, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 240270 + }, + { + "epoch": 0.9145649840518258, + "grad_norm": 0.13280390202999115, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 240280 + }, + { + "epoch": 0.9146030465199485, + "grad_norm": 0.13582828640937805, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 240290 + }, + { + "epoch": 0.9146411089880713, + "grad_norm": 0.1362244039773941, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 240300 + }, + { + "epoch": 0.9146791714561939, + "grad_norm": 0.14132159948349, + "learning_rate": 0.0005, + "loss": 2.086, + "step": 240310 + }, + { + "epoch": 0.9147172339243166, + "grad_norm": 0.124893419444561, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 240320 + }, + { + "epoch": 0.9147552963924392, + "grad_norm": 0.1291424036026001, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 240330 + }, + { + "epoch": 0.9147933588605619, + "grad_norm": 0.11848355829715729, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 240340 + }, + { + "epoch": 0.9148314213286847, + "grad_norm": 0.12111199647188187, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 240350 + }, + { + "epoch": 0.9148694837968073, + "grad_norm": 0.11707769334316254, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 240360 + }, + { + "epoch": 0.91490754626493, + "grad_norm": 0.12086135894060135, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 240370 + }, + { + "epoch": 0.9149456087330526, + "grad_norm": 0.13472595810890198, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 240380 + }, + { + "epoch": 0.9149836712011754, + "grad_norm": 0.12526558339595795, + "learning_rate": 0.0005, + "loss": 2.082, + "step": 240390 + }, + { + "epoch": 0.9150217336692981, + "grad_norm": 0.12476403266191483, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 240400 + }, + { + "epoch": 0.9150597961374207, + "grad_norm": 0.13596831262111664, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 240410 + }, + { + "epoch": 0.9150978586055434, + "grad_norm": 0.12879303097724915, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 240420 + }, + { + "epoch": 0.9151359210736661, + "grad_norm": 0.13720719516277313, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 240430 + }, + { + "epoch": 0.9151739835417888, + "grad_norm": 0.1436295062303543, + "learning_rate": 0.0005, + "loss": 2.1236, + "step": 240440 + }, + { + "epoch": 0.9152120460099115, + "grad_norm": 0.12874655425548553, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 240450 + }, + { + "epoch": 0.9152501084780341, + "grad_norm": 0.1262015402317047, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 240460 + }, + { + "epoch": 0.9152881709461569, + "grad_norm": 0.13029156625270844, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 240470 + }, + { + "epoch": 0.9153262334142795, + "grad_norm": 0.1307915300130844, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 240480 + }, + { + "epoch": 0.9153642958824022, + "grad_norm": 0.13881021738052368, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 240490 + }, + { + "epoch": 0.9154023583505249, + "grad_norm": 0.12643685936927795, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 240500 + }, + { + "epoch": 0.9154404208186475, + "grad_norm": 0.11630550026893616, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 240510 + }, + { + "epoch": 0.9154784832867703, + "grad_norm": 0.11585814505815506, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 240520 + }, + { + "epoch": 0.9155165457548929, + "grad_norm": 0.12080513685941696, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 240530 + }, + { + "epoch": 0.9155546082230156, + "grad_norm": 0.13465972244739532, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 240540 + }, + { + "epoch": 0.9155926706911383, + "grad_norm": 0.11828291416168213, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 240550 + }, + { + "epoch": 0.915630733159261, + "grad_norm": 0.13376320898532867, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 240560 + }, + { + "epoch": 0.9156687956273837, + "grad_norm": 0.12552687525749207, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 240570 + }, + { + "epoch": 0.9157068580955063, + "grad_norm": 0.13472148776054382, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 240580 + }, + { + "epoch": 0.915744920563629, + "grad_norm": 0.12127243727445602, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 240590 + }, + { + "epoch": 0.9157829830317518, + "grad_norm": 0.13831770420074463, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 240600 + }, + { + "epoch": 0.9158210454998744, + "grad_norm": 0.12685850262641907, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 240610 + }, + { + "epoch": 0.9158591079679971, + "grad_norm": 0.14404189586639404, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 240620 + }, + { + "epoch": 0.9158971704361197, + "grad_norm": 0.13158273696899414, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 240630 + }, + { + "epoch": 0.9159352329042424, + "grad_norm": 0.13201908767223358, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 240640 + }, + { + "epoch": 0.9159732953723652, + "grad_norm": 0.1278715282678604, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 240650 + }, + { + "epoch": 0.9160113578404878, + "grad_norm": 0.12094113230705261, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 240660 + }, + { + "epoch": 0.9160494203086105, + "grad_norm": 0.11523523926734924, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 240670 + }, + { + "epoch": 0.9160874827767331, + "grad_norm": 0.12744709849357605, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 240680 + }, + { + "epoch": 0.9161255452448559, + "grad_norm": 0.12252113968133926, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 240690 + }, + { + "epoch": 0.9161636077129786, + "grad_norm": 0.12514427304267883, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 240700 + }, + { + "epoch": 0.9162016701811012, + "grad_norm": 0.12158069014549255, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 240710 + }, + { + "epoch": 0.9162397326492239, + "grad_norm": 0.13026173412799835, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 240720 + }, + { + "epoch": 0.9162777951173466, + "grad_norm": 0.11686385422945023, + "learning_rate": 0.0005, + "loss": 2.1163, + "step": 240730 + }, + { + "epoch": 0.9163158575854693, + "grad_norm": 0.13855034112930298, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 240740 + }, + { + "epoch": 0.916353920053592, + "grad_norm": 0.1281408816576004, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 240750 + }, + { + "epoch": 0.9163919825217146, + "grad_norm": 0.13539083302021027, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 240760 + }, + { + "epoch": 0.9164300449898373, + "grad_norm": 0.11456126719713211, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 240770 + }, + { + "epoch": 0.91646810745796, + "grad_norm": 0.12172668427228928, + "learning_rate": 0.0005, + "loss": 2.0896, + "step": 240780 + }, + { + "epoch": 0.9165061699260827, + "grad_norm": 0.12095924466848373, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 240790 + }, + { + "epoch": 0.9165442323942053, + "grad_norm": 0.12311307340860367, + "learning_rate": 0.0005, + "loss": 2.1183, + "step": 240800 + }, + { + "epoch": 0.916582294862328, + "grad_norm": 0.14387722313404083, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 240810 + }, + { + "epoch": 0.9166203573304508, + "grad_norm": 0.1334153264760971, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 240820 + }, + { + "epoch": 0.9166584197985734, + "grad_norm": 0.12912805378437042, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 240830 + }, + { + "epoch": 0.9166964822666961, + "grad_norm": 0.11982877552509308, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 240840 + }, + { + "epoch": 0.9167345447348187, + "grad_norm": 0.12010152637958527, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 240850 + }, + { + "epoch": 0.9167726072029415, + "grad_norm": 0.1530761867761612, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 240860 + }, + { + "epoch": 0.9168106696710642, + "grad_norm": 0.13544851541519165, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 240870 + }, + { + "epoch": 0.9168487321391868, + "grad_norm": 0.13245892524719238, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 240880 + }, + { + "epoch": 0.9168867946073095, + "grad_norm": 0.13171501457691193, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 240890 + }, + { + "epoch": 0.9169248570754323, + "grad_norm": 0.12141535431146622, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 240900 + }, + { + "epoch": 0.9169629195435549, + "grad_norm": 0.12837472558021545, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 240910 + }, + { + "epoch": 0.9170009820116776, + "grad_norm": 0.12579694390296936, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 240920 + }, + { + "epoch": 0.9170390444798002, + "grad_norm": 0.12822788953781128, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 240930 + }, + { + "epoch": 0.9170771069479229, + "grad_norm": 0.12980876863002777, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 240940 + }, + { + "epoch": 0.9171151694160457, + "grad_norm": 0.11740733683109283, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 240950 + }, + { + "epoch": 0.9171532318841683, + "grad_norm": 0.1145625188946724, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 240960 + }, + { + "epoch": 0.917191294352291, + "grad_norm": 0.1304570436477661, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 240970 + }, + { + "epoch": 0.9172293568204136, + "grad_norm": 0.12784039974212646, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 240980 + }, + { + "epoch": 0.9172674192885364, + "grad_norm": 0.12327566742897034, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 240990 + }, + { + "epoch": 0.917305481756659, + "grad_norm": 0.1315973401069641, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 241000 + }, + { + "epoch": 0.9173435442247817, + "grad_norm": 0.15045495331287384, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 241010 + }, + { + "epoch": 0.9173816066929044, + "grad_norm": 0.11927295476198196, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 241020 + }, + { + "epoch": 0.9174196691610271, + "grad_norm": 0.1331934928894043, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 241030 + }, + { + "epoch": 0.9174577316291498, + "grad_norm": 0.12129730731248856, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 241040 + }, + { + "epoch": 0.9174957940972724, + "grad_norm": 0.13536980748176575, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 241050 + }, + { + "epoch": 0.9175338565653951, + "grad_norm": 0.13700516521930695, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 241060 + }, + { + "epoch": 0.9175719190335178, + "grad_norm": 0.13450057804584503, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 241070 + }, + { + "epoch": 0.9176099815016405, + "grad_norm": 0.13630062341690063, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 241080 + }, + { + "epoch": 0.9176480439697632, + "grad_norm": 0.1458582878112793, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 241090 + }, + { + "epoch": 0.9176861064378858, + "grad_norm": 0.1451157182455063, + "learning_rate": 0.0005, + "loss": 2.1206, + "step": 241100 + }, + { + "epoch": 0.9177241689060085, + "grad_norm": 0.11896733939647675, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 241110 + }, + { + "epoch": 0.9177622313741313, + "grad_norm": 0.1251809000968933, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 241120 + }, + { + "epoch": 0.9178002938422539, + "grad_norm": 0.11638472974300385, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 241130 + }, + { + "epoch": 0.9178383563103766, + "grad_norm": 0.1312682032585144, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 241140 + }, + { + "epoch": 0.9178764187784992, + "grad_norm": 0.11362048238515854, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 241150 + }, + { + "epoch": 0.917914481246622, + "grad_norm": 0.11174650490283966, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 241160 + }, + { + "epoch": 0.9179525437147447, + "grad_norm": 0.1195438802242279, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 241170 + }, + { + "epoch": 0.9179906061828673, + "grad_norm": 0.12362208217382431, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 241180 + }, + { + "epoch": 0.91802866865099, + "grad_norm": 0.1291234940290451, + "learning_rate": 0.0005, + "loss": 2.1161, + "step": 241190 + }, + { + "epoch": 0.9180667311191126, + "grad_norm": 0.1306326985359192, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 241200 + }, + { + "epoch": 0.9181047935872354, + "grad_norm": 0.11614658683538437, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 241210 + }, + { + "epoch": 0.9181428560553581, + "grad_norm": 0.13114012777805328, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 241220 + }, + { + "epoch": 0.9181809185234807, + "grad_norm": 0.12753938138484955, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 241230 + }, + { + "epoch": 0.9182189809916034, + "grad_norm": 0.1368921548128128, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 241240 + }, + { + "epoch": 0.9182570434597261, + "grad_norm": 0.11379440873861313, + "learning_rate": 0.0005, + "loss": 2.087, + "step": 241250 + }, + { + "epoch": 0.9182951059278488, + "grad_norm": 0.14151537418365479, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 241260 + }, + { + "epoch": 0.9183331683959715, + "grad_norm": 0.1276009976863861, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 241270 + }, + { + "epoch": 0.9183712308640941, + "grad_norm": 0.14686158299446106, + "learning_rate": 0.0005, + "loss": 2.0872, + "step": 241280 + }, + { + "epoch": 0.9184092933322169, + "grad_norm": 0.12174024432897568, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 241290 + }, + { + "epoch": 0.9184473558003395, + "grad_norm": 0.12808623909950256, + "learning_rate": 0.0005, + "loss": 2.0832, + "step": 241300 + }, + { + "epoch": 0.9184854182684622, + "grad_norm": 0.13373562693595886, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 241310 + }, + { + "epoch": 0.9185234807365849, + "grad_norm": 0.13873694837093353, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 241320 + }, + { + "epoch": 0.9185615432047076, + "grad_norm": 0.1596411168575287, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 241330 + }, + { + "epoch": 0.9185996056728303, + "grad_norm": 0.1393207609653473, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 241340 + }, + { + "epoch": 0.9186376681409529, + "grad_norm": 0.12455132603645325, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 241350 + }, + { + "epoch": 0.9186757306090756, + "grad_norm": 0.13242022693157196, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 241360 + }, + { + "epoch": 0.9187137930771982, + "grad_norm": 0.13528971374034882, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 241370 + }, + { + "epoch": 0.918751855545321, + "grad_norm": 0.12305892258882523, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 241380 + }, + { + "epoch": 0.9187899180134437, + "grad_norm": 0.11626999080181122, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 241390 + }, + { + "epoch": 0.9188279804815663, + "grad_norm": 0.11935063451528549, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 241400 + }, + { + "epoch": 0.918866042949689, + "grad_norm": 0.13918636739253998, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 241410 + }, + { + "epoch": 0.9189041054178118, + "grad_norm": 0.117560476064682, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 241420 + }, + { + "epoch": 0.9189421678859344, + "grad_norm": 0.12481194734573364, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 241430 + }, + { + "epoch": 0.9189802303540571, + "grad_norm": 0.12880441546440125, + "learning_rate": 0.0005, + "loss": 2.0838, + "step": 241440 + }, + { + "epoch": 0.9190182928221797, + "grad_norm": 0.14205487072467804, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 241450 + }, + { + "epoch": 0.9190563552903025, + "grad_norm": 0.13198022544384003, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 241460 + }, + { + "epoch": 0.9190944177584252, + "grad_norm": 0.12845546007156372, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 241470 + }, + { + "epoch": 0.9191324802265478, + "grad_norm": 0.22721713781356812, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 241480 + }, + { + "epoch": 0.9191705426946705, + "grad_norm": 0.12433764338493347, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 241490 + }, + { + "epoch": 0.9192086051627931, + "grad_norm": 0.12495458871126175, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 241500 + }, + { + "epoch": 0.9192466676309159, + "grad_norm": 0.13803954422473907, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 241510 + }, + { + "epoch": 0.9192847300990385, + "grad_norm": 0.11726602911949158, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 241520 + }, + { + "epoch": 0.9193227925671612, + "grad_norm": 0.12827108800411224, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 241530 + }, + { + "epoch": 0.9193608550352839, + "grad_norm": 0.12403757125139236, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 241540 + }, + { + "epoch": 0.9193989175034066, + "grad_norm": 0.12619732320308685, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 241550 + }, + { + "epoch": 0.9194369799715293, + "grad_norm": 0.13321739435195923, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 241560 + }, + { + "epoch": 0.919475042439652, + "grad_norm": 0.1475500762462616, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 241570 + }, + { + "epoch": 0.9195131049077746, + "grad_norm": 0.11906681209802628, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 241580 + }, + { + "epoch": 0.9195511673758974, + "grad_norm": 0.1269826889038086, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 241590 + }, + { + "epoch": 0.91958922984402, + "grad_norm": 0.12411966174840927, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 241600 + }, + { + "epoch": 0.9196272923121427, + "grad_norm": 0.12925204634666443, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 241610 + }, + { + "epoch": 0.9196653547802653, + "grad_norm": 0.12196630239486694, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 241620 + }, + { + "epoch": 0.919703417248388, + "grad_norm": 0.13332179188728333, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 241630 + }, + { + "epoch": 0.9197414797165108, + "grad_norm": 0.11703956127166748, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 241640 + }, + { + "epoch": 0.9197795421846334, + "grad_norm": 0.12794820964336395, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 241650 + }, + { + "epoch": 0.9198176046527561, + "grad_norm": 0.13128633797168732, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 241660 + }, + { + "epoch": 0.9198556671208787, + "grad_norm": 0.14611156284809113, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 241670 + }, + { + "epoch": 0.9198937295890015, + "grad_norm": 0.11457149684429169, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 241680 + }, + { + "epoch": 0.9199317920571242, + "grad_norm": 0.12531642615795135, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 241690 + }, + { + "epoch": 0.9199698545252468, + "grad_norm": 0.11403706669807434, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 241700 + }, + { + "epoch": 0.9200079169933695, + "grad_norm": 0.13887332379817963, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 241710 + }, + { + "epoch": 0.9200459794614922, + "grad_norm": 0.1301995813846588, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 241720 + }, + { + "epoch": 0.9200840419296149, + "grad_norm": 0.12377279996871948, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 241730 + }, + { + "epoch": 0.9201221043977376, + "grad_norm": 0.1265926957130432, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 241740 + }, + { + "epoch": 0.9201601668658602, + "grad_norm": 0.12687602639198303, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 241750 + }, + { + "epoch": 0.920198229333983, + "grad_norm": 0.12322045117616653, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 241760 + }, + { + "epoch": 0.9202362918021056, + "grad_norm": 0.12699291110038757, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 241770 + }, + { + "epoch": 0.9202743542702283, + "grad_norm": 0.13091620802879333, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 241780 + }, + { + "epoch": 0.920312416738351, + "grad_norm": 0.14797089993953705, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 241790 + }, + { + "epoch": 0.9203504792064736, + "grad_norm": 0.12071997672319412, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 241800 + }, + { + "epoch": 0.9203885416745964, + "grad_norm": 0.1170908659696579, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 241810 + }, + { + "epoch": 0.920426604142719, + "grad_norm": 0.12971200048923492, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 241820 + }, + { + "epoch": 0.9204646666108417, + "grad_norm": 0.13067524135112762, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 241830 + }, + { + "epoch": 0.9205027290789644, + "grad_norm": 0.11806374788284302, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 241840 + }, + { + "epoch": 0.9205407915470871, + "grad_norm": 0.12400566041469574, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 241850 + }, + { + "epoch": 0.9205788540152098, + "grad_norm": 0.11953874677419662, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 241860 + }, + { + "epoch": 0.9206169164833324, + "grad_norm": 0.1293313354253769, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 241870 + }, + { + "epoch": 0.9206549789514551, + "grad_norm": 0.12719066441059113, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 241880 + }, + { + "epoch": 0.9206930414195779, + "grad_norm": 0.13698673248291016, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 241890 + }, + { + "epoch": 0.9207311038877005, + "grad_norm": 0.12375357747077942, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 241900 + }, + { + "epoch": 0.9207691663558232, + "grad_norm": 0.12502874433994293, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 241910 + }, + { + "epoch": 0.9208072288239458, + "grad_norm": 0.1243850439786911, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 241920 + }, + { + "epoch": 0.9208452912920685, + "grad_norm": 0.1315883994102478, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 241930 + }, + { + "epoch": 0.9208833537601913, + "grad_norm": 0.140249103307724, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 241940 + }, + { + "epoch": 0.9209214162283139, + "grad_norm": 0.12236278504133224, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 241950 + }, + { + "epoch": 0.9209594786964366, + "grad_norm": 0.14159269630908966, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 241960 + }, + { + "epoch": 0.9209975411645592, + "grad_norm": 0.12036581337451935, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 241970 + }, + { + "epoch": 0.921035603632682, + "grad_norm": 0.13146193325519562, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 241980 + }, + { + "epoch": 0.9210736661008047, + "grad_norm": 0.12361620366573334, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 241990 + }, + { + "epoch": 0.9211117285689273, + "grad_norm": 0.13313716650009155, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 242000 + }, + { + "epoch": 0.92114979103705, + "grad_norm": 0.12712553143501282, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 242010 + }, + { + "epoch": 0.9211878535051727, + "grad_norm": 0.13036790490150452, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 242020 + }, + { + "epoch": 0.9212259159732954, + "grad_norm": 0.14863251149654388, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 242030 + }, + { + "epoch": 0.921263978441418, + "grad_norm": 0.13584734499454498, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 242040 + }, + { + "epoch": 0.9213020409095407, + "grad_norm": 0.12490679323673248, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 242050 + }, + { + "epoch": 0.9213401033776634, + "grad_norm": 0.1250891089439392, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 242060 + }, + { + "epoch": 0.9213781658457861, + "grad_norm": 0.12336582690477371, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 242070 + }, + { + "epoch": 0.9214162283139088, + "grad_norm": 0.12958140671253204, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 242080 + }, + { + "epoch": 0.9214542907820314, + "grad_norm": 0.1208835244178772, + "learning_rate": 0.0005, + "loss": 2.0879, + "step": 242090 + }, + { + "epoch": 0.9214923532501541, + "grad_norm": 0.12668830156326294, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 242100 + }, + { + "epoch": 0.9215304157182769, + "grad_norm": 0.13278093934059143, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 242110 + }, + { + "epoch": 0.9215684781863995, + "grad_norm": 0.13082022964954376, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 242120 + }, + { + "epoch": 0.9216065406545222, + "grad_norm": 0.12971073389053345, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 242130 + }, + { + "epoch": 0.9216446031226448, + "grad_norm": 0.13306653499603271, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 242140 + }, + { + "epoch": 0.9216826655907676, + "grad_norm": 0.123257115483284, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 242150 + }, + { + "epoch": 0.9217207280588903, + "grad_norm": 0.1386224329471588, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 242160 + }, + { + "epoch": 0.9217587905270129, + "grad_norm": 0.12598341703414917, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 242170 + }, + { + "epoch": 0.9217968529951356, + "grad_norm": 0.14487870037555695, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 242180 + }, + { + "epoch": 0.9218349154632584, + "grad_norm": 0.12394547462463379, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 242190 + }, + { + "epoch": 0.921872977931381, + "grad_norm": 0.12625393271446228, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 242200 + }, + { + "epoch": 0.9219110403995037, + "grad_norm": 0.12205489724874496, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 242210 + }, + { + "epoch": 0.9219491028676263, + "grad_norm": 0.12885427474975586, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 242220 + }, + { + "epoch": 0.921987165335749, + "grad_norm": 0.14043216407299042, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 242230 + }, + { + "epoch": 0.9220252278038717, + "grad_norm": 0.12551164627075195, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 242240 + }, + { + "epoch": 0.9220632902719944, + "grad_norm": 0.13114112615585327, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 242250 + }, + { + "epoch": 0.9221013527401171, + "grad_norm": 0.11854348331689835, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 242260 + }, + { + "epoch": 0.9221394152082397, + "grad_norm": 0.12824852764606476, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 242270 + }, + { + "epoch": 0.9221774776763625, + "grad_norm": 0.12649212777614594, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 242280 + }, + { + "epoch": 0.9222155401444851, + "grad_norm": 0.14085513353347778, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 242290 + }, + { + "epoch": 0.9222536026126078, + "grad_norm": 0.13265717029571533, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 242300 + }, + { + "epoch": 0.9222916650807305, + "grad_norm": 0.12744875252246857, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 242310 + }, + { + "epoch": 0.9223297275488532, + "grad_norm": 0.12851621210575104, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 242320 + }, + { + "epoch": 0.9223677900169759, + "grad_norm": 0.1234138086438179, + "learning_rate": 0.0005, + "loss": 2.0816, + "step": 242330 + }, + { + "epoch": 0.9224058524850985, + "grad_norm": 0.12517589330673218, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 242340 + }, + { + "epoch": 0.9224439149532212, + "grad_norm": 0.12155363708734512, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 242350 + }, + { + "epoch": 0.9224819774213439, + "grad_norm": 0.12556451559066772, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 242360 + }, + { + "epoch": 0.9225200398894666, + "grad_norm": 0.13065999746322632, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 242370 + }, + { + "epoch": 0.9225581023575893, + "grad_norm": 0.12490629404783249, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 242380 + }, + { + "epoch": 0.9225961648257119, + "grad_norm": 0.12175456434488297, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 242390 + }, + { + "epoch": 0.9226342272938346, + "grad_norm": 0.13581399619579315, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 242400 + }, + { + "epoch": 0.9226722897619574, + "grad_norm": 0.146930992603302, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 242410 + }, + { + "epoch": 0.92271035223008, + "grad_norm": 0.13098594546318054, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 242420 + }, + { + "epoch": 0.9227484146982027, + "grad_norm": 0.1277133822441101, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 242430 + }, + { + "epoch": 0.9227864771663253, + "grad_norm": 0.1202763095498085, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 242440 + }, + { + "epoch": 0.9228245396344481, + "grad_norm": 0.11052737385034561, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 242450 + }, + { + "epoch": 0.9228626021025708, + "grad_norm": 0.12411277741193771, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 242460 + }, + { + "epoch": 0.9229006645706934, + "grad_norm": 0.13022834062576294, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 242470 + }, + { + "epoch": 0.9229387270388161, + "grad_norm": 0.13648240268230438, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 242480 + }, + { + "epoch": 0.9229767895069387, + "grad_norm": 0.126776322722435, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 242490 + }, + { + "epoch": 0.9230148519750615, + "grad_norm": 0.13091644644737244, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 242500 + }, + { + "epoch": 0.9230529144431842, + "grad_norm": 0.14091360569000244, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 242510 + }, + { + "epoch": 0.9230909769113068, + "grad_norm": 0.13159774243831635, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 242520 + }, + { + "epoch": 0.9231290393794295, + "grad_norm": 0.1189807653427124, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 242530 + }, + { + "epoch": 0.9231671018475522, + "grad_norm": 0.1277497261762619, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 242540 + }, + { + "epoch": 0.9232051643156749, + "grad_norm": 0.13974037766456604, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 242550 + }, + { + "epoch": 0.9232432267837976, + "grad_norm": 0.11727729439735413, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 242560 + }, + { + "epoch": 0.9232812892519202, + "grad_norm": 0.1266697645187378, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 242570 + }, + { + "epoch": 0.923319351720043, + "grad_norm": 0.1267307996749878, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 242580 + }, + { + "epoch": 0.9233574141881656, + "grad_norm": 0.1328422725200653, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 242590 + }, + { + "epoch": 0.9233954766562883, + "grad_norm": 0.12920698523521423, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 242600 + }, + { + "epoch": 0.923433539124411, + "grad_norm": 0.135263592004776, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 242610 + }, + { + "epoch": 0.9234716015925337, + "grad_norm": 0.13382890820503235, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 242620 + }, + { + "epoch": 0.9235096640606564, + "grad_norm": 0.12548057734966278, + "learning_rate": 0.0005, + "loss": 2.0885, + "step": 242630 + }, + { + "epoch": 0.923547726528779, + "grad_norm": 0.13171319663524628, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 242640 + }, + { + "epoch": 0.9235857889969017, + "grad_norm": 0.1245994120836258, + "learning_rate": 0.0005, + "loss": 2.0864, + "step": 242650 + }, + { + "epoch": 0.9236238514650243, + "grad_norm": 0.1319652944803238, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 242660 + }, + { + "epoch": 0.9236619139331471, + "grad_norm": 0.12865465879440308, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 242670 + }, + { + "epoch": 0.9236999764012698, + "grad_norm": 0.12202879786491394, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 242680 + }, + { + "epoch": 0.9237380388693924, + "grad_norm": 0.11969945579767227, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 242690 + }, + { + "epoch": 0.9237761013375151, + "grad_norm": 0.12907131016254425, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 242700 + }, + { + "epoch": 0.9238141638056379, + "grad_norm": 0.13925276696681976, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 242710 + }, + { + "epoch": 0.9238522262737605, + "grad_norm": 0.12933896481990814, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 242720 + }, + { + "epoch": 0.9238902887418832, + "grad_norm": 0.12854160368442535, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 242730 + }, + { + "epoch": 0.9239283512100058, + "grad_norm": 0.12014547735452652, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 242740 + }, + { + "epoch": 0.9239664136781286, + "grad_norm": 0.11493667215108871, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 242750 + }, + { + "epoch": 0.9240044761462513, + "grad_norm": 0.12854570150375366, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 242760 + }, + { + "epoch": 0.9240425386143739, + "grad_norm": 0.1280025988817215, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 242770 + }, + { + "epoch": 0.9240806010824966, + "grad_norm": 0.12395115196704865, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 242780 + }, + { + "epoch": 0.9241186635506192, + "grad_norm": 0.1267663985490799, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 242790 + }, + { + "epoch": 0.924156726018742, + "grad_norm": 0.13480299711227417, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 242800 + }, + { + "epoch": 0.9241947884868646, + "grad_norm": 0.1318933069705963, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 242810 + }, + { + "epoch": 0.9242328509549873, + "grad_norm": 0.13908858597278595, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 242820 + }, + { + "epoch": 0.92427091342311, + "grad_norm": 0.11871346086263657, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 242830 + }, + { + "epoch": 0.9243089758912327, + "grad_norm": 0.12937457859516144, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 242840 + }, + { + "epoch": 0.9243470383593554, + "grad_norm": 0.14433366060256958, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 242850 + }, + { + "epoch": 0.924385100827478, + "grad_norm": 0.13290633261203766, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 242860 + }, + { + "epoch": 0.9244231632956007, + "grad_norm": 0.12644854187965393, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 242870 + }, + { + "epoch": 0.9244612257637235, + "grad_norm": 0.13102544844150543, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 242880 + }, + { + "epoch": 0.9244992882318461, + "grad_norm": 0.17648755013942719, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 242890 + }, + { + "epoch": 0.9245373506999688, + "grad_norm": 0.12615133821964264, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 242900 + }, + { + "epoch": 0.9245754131680914, + "grad_norm": 0.12318253517150879, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 242910 + }, + { + "epoch": 0.9246134756362141, + "grad_norm": 0.1262514740228653, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 242920 + }, + { + "epoch": 0.9246515381043369, + "grad_norm": 0.12124498188495636, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 242930 + }, + { + "epoch": 0.9246896005724595, + "grad_norm": 0.13532347977161407, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 242940 + }, + { + "epoch": 0.9247276630405822, + "grad_norm": 0.12648305296897888, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 242950 + }, + { + "epoch": 0.9247657255087048, + "grad_norm": 0.12291644513607025, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 242960 + }, + { + "epoch": 0.9248037879768276, + "grad_norm": 0.1294446587562561, + "learning_rate": 0.0005, + "loss": 2.0831, + "step": 242970 + }, + { + "epoch": 0.9248418504449503, + "grad_norm": 0.13656698167324066, + "learning_rate": 0.0005, + "loss": 2.0856, + "step": 242980 + }, + { + "epoch": 0.9248799129130729, + "grad_norm": 0.1236143708229065, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 242990 + }, + { + "epoch": 0.9249179753811956, + "grad_norm": 0.13238398730754852, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 243000 + }, + { + "epoch": 0.9249560378493183, + "grad_norm": 0.13070373237133026, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 243010 + }, + { + "epoch": 0.924994100317441, + "grad_norm": 0.13280506432056427, + "learning_rate": 0.0005, + "loss": 2.1173, + "step": 243020 + }, + { + "epoch": 0.9250321627855637, + "grad_norm": 0.12996715307235718, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 243030 + }, + { + "epoch": 0.9250702252536863, + "grad_norm": 0.12887771427631378, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 243040 + }, + { + "epoch": 0.9251082877218091, + "grad_norm": 0.13306356966495514, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 243050 + }, + { + "epoch": 0.9251463501899317, + "grad_norm": 0.14614702761173248, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 243060 + }, + { + "epoch": 0.9251844126580544, + "grad_norm": 0.12717987596988678, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 243070 + }, + { + "epoch": 0.9252224751261771, + "grad_norm": 0.12082240730524063, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 243080 + }, + { + "epoch": 0.9252605375942997, + "grad_norm": 0.12202537059783936, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 243090 + }, + { + "epoch": 0.9252986000624225, + "grad_norm": 0.13810782134532928, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 243100 + }, + { + "epoch": 0.9253366625305451, + "grad_norm": 0.1495743691921234, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 243110 + }, + { + "epoch": 0.9253747249986678, + "grad_norm": 0.19354461133480072, + "learning_rate": 0.0005, + "loss": 2.1216, + "step": 243120 + }, + { + "epoch": 0.9254127874667905, + "grad_norm": 0.12816660106182098, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 243130 + }, + { + "epoch": 0.9254508499349132, + "grad_norm": 0.1510034054517746, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 243140 + }, + { + "epoch": 0.9254889124030359, + "grad_norm": 0.12415878474712372, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 243150 + }, + { + "epoch": 0.9255269748711585, + "grad_norm": 0.11569990962743759, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 243160 + }, + { + "epoch": 0.9255650373392812, + "grad_norm": 0.1276220977306366, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 243170 + }, + { + "epoch": 0.925603099807404, + "grad_norm": 0.12693388760089874, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 243180 + }, + { + "epoch": 0.9256411622755266, + "grad_norm": 0.12227284908294678, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 243190 + }, + { + "epoch": 0.9256792247436493, + "grad_norm": 0.14652849733829498, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 243200 + }, + { + "epoch": 0.9257172872117719, + "grad_norm": 0.12039449065923691, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 243210 + }, + { + "epoch": 0.9257553496798946, + "grad_norm": 0.12962830066680908, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 243220 + }, + { + "epoch": 0.9257934121480174, + "grad_norm": 0.12831825017929077, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 243230 + }, + { + "epoch": 0.92583147461614, + "grad_norm": 0.12297516316175461, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 243240 + }, + { + "epoch": 0.9258695370842627, + "grad_norm": 0.11735675483942032, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 243250 + }, + { + "epoch": 0.9259075995523853, + "grad_norm": 0.1269374042749405, + "learning_rate": 0.0005, + "loss": 2.0746, + "step": 243260 + }, + { + "epoch": 0.9259456620205081, + "grad_norm": 0.11785628646612167, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 243270 + }, + { + "epoch": 0.9259837244886308, + "grad_norm": 0.12496186792850494, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 243280 + }, + { + "epoch": 0.9260217869567534, + "grad_norm": 0.13376538455486298, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 243290 + }, + { + "epoch": 0.9260598494248761, + "grad_norm": 0.1268036812543869, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 243300 + }, + { + "epoch": 0.9260979118929988, + "grad_norm": 0.12560303509235382, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 243310 + }, + { + "epoch": 0.9261359743611215, + "grad_norm": 0.12618377804756165, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 243320 + }, + { + "epoch": 0.9261740368292442, + "grad_norm": 0.12502415478229523, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 243330 + }, + { + "epoch": 0.9262120992973668, + "grad_norm": 0.1230950802564621, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 243340 + }, + { + "epoch": 0.9262501617654895, + "grad_norm": 0.12102053314447403, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 243350 + }, + { + "epoch": 0.9262882242336122, + "grad_norm": 0.1210583746433258, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 243360 + }, + { + "epoch": 0.9263262867017349, + "grad_norm": 0.13412946462631226, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 243370 + }, + { + "epoch": 0.9263643491698575, + "grad_norm": 0.15333864092826843, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 243380 + }, + { + "epoch": 0.9264024116379802, + "grad_norm": 0.13135433197021484, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 243390 + }, + { + "epoch": 0.926440474106103, + "grad_norm": 0.1341996043920517, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 243400 + }, + { + "epoch": 0.9264785365742256, + "grad_norm": 0.13238976895809174, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 243410 + }, + { + "epoch": 0.9265165990423483, + "grad_norm": 0.1404605507850647, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 243420 + }, + { + "epoch": 0.926554661510471, + "grad_norm": 0.12437047809362411, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 243430 + }, + { + "epoch": 0.9265927239785937, + "grad_norm": 0.12488163262605667, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 243440 + }, + { + "epoch": 0.9266307864467164, + "grad_norm": 0.12038839608430862, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 243450 + }, + { + "epoch": 0.926668848914839, + "grad_norm": 0.12095153331756592, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 243460 + }, + { + "epoch": 0.9267069113829617, + "grad_norm": 0.1299186497926712, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 243470 + }, + { + "epoch": 0.9267449738510845, + "grad_norm": 0.13669754564762115, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 243480 + }, + { + "epoch": 0.9267830363192071, + "grad_norm": 0.12292340397834778, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 243490 + }, + { + "epoch": 0.9268210987873298, + "grad_norm": 0.11784177273511887, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 243500 + }, + { + "epoch": 0.9268591612554524, + "grad_norm": 0.1215096265077591, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 243510 + }, + { + "epoch": 0.9268972237235751, + "grad_norm": 0.13446903228759766, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 243520 + }, + { + "epoch": 0.9269352861916978, + "grad_norm": 0.1293669492006302, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 243530 + }, + { + "epoch": 0.9269733486598205, + "grad_norm": 0.21987639367580414, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 243540 + }, + { + "epoch": 0.9270114111279432, + "grad_norm": 0.13799740374088287, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 243550 + }, + { + "epoch": 0.9270494735960658, + "grad_norm": 0.14946503937244415, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 243560 + }, + { + "epoch": 0.9270875360641886, + "grad_norm": 0.13505719602108002, + "learning_rate": 0.0005, + "loss": 2.1225, + "step": 243570 + }, + { + "epoch": 0.9271255985323112, + "grad_norm": 0.13582663238048553, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 243580 + }, + { + "epoch": 0.9271636610004339, + "grad_norm": 0.15584570169448853, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 243590 + }, + { + "epoch": 0.9272017234685566, + "grad_norm": 0.12088710069656372, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 243600 + }, + { + "epoch": 0.9272397859366793, + "grad_norm": 0.12905220687389374, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 243610 + }, + { + "epoch": 0.927277848404802, + "grad_norm": 0.12397059798240662, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 243620 + }, + { + "epoch": 0.9273159108729246, + "grad_norm": 0.11348934471607208, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 243630 + }, + { + "epoch": 0.9273539733410473, + "grad_norm": 0.12789826095104218, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 243640 + }, + { + "epoch": 0.92739203580917, + "grad_norm": 0.13802658021450043, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 243650 + }, + { + "epoch": 0.9274300982772927, + "grad_norm": 0.13512162864208221, + "learning_rate": 0.0005, + "loss": 2.0855, + "step": 243660 + }, + { + "epoch": 0.9274681607454154, + "grad_norm": 0.12698446214199066, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 243670 + }, + { + "epoch": 0.927506223213538, + "grad_norm": 0.13155756890773773, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 243680 + }, + { + "epoch": 0.9275442856816607, + "grad_norm": 0.1301504522562027, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 243690 + }, + { + "epoch": 0.9275823481497835, + "grad_norm": 0.12535198032855988, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 243700 + }, + { + "epoch": 0.9276204106179061, + "grad_norm": 0.12066397815942764, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 243710 + }, + { + "epoch": 0.9276584730860288, + "grad_norm": 0.13454478979110718, + "learning_rate": 0.0005, + "loss": 2.1228, + "step": 243720 + }, + { + "epoch": 0.9276965355541514, + "grad_norm": 0.12323904037475586, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 243730 + }, + { + "epoch": 0.9277345980222742, + "grad_norm": 0.14403332769870758, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 243740 + }, + { + "epoch": 0.9277726604903969, + "grad_norm": 0.1309439241886139, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 243750 + }, + { + "epoch": 0.9278107229585195, + "grad_norm": 0.1328670233488083, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 243760 + }, + { + "epoch": 0.9278487854266422, + "grad_norm": 0.13527549803256989, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 243770 + }, + { + "epoch": 0.9278868478947649, + "grad_norm": 0.13164310157299042, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 243780 + }, + { + "epoch": 0.9279249103628876, + "grad_norm": 0.1374911218881607, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 243790 + }, + { + "epoch": 0.9279629728310103, + "grad_norm": 0.12938323616981506, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 243800 + }, + { + "epoch": 0.9280010352991329, + "grad_norm": 0.13699007034301758, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 243810 + }, + { + "epoch": 0.9280390977672556, + "grad_norm": 0.12728573381900787, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 243820 + }, + { + "epoch": 0.9280771602353783, + "grad_norm": 0.13931600749492645, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 243830 + }, + { + "epoch": 0.928115222703501, + "grad_norm": 0.12266018986701965, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 243840 + }, + { + "epoch": 0.9281532851716237, + "grad_norm": 0.13104864954948425, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 243850 + }, + { + "epoch": 0.9281913476397463, + "grad_norm": 0.1316935271024704, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 243860 + }, + { + "epoch": 0.9282294101078691, + "grad_norm": 0.1339971125125885, + "learning_rate": 0.0005, + "loss": 2.1117, + "step": 243870 + }, + { + "epoch": 0.9282674725759917, + "grad_norm": 0.12672404944896698, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 243880 + }, + { + "epoch": 0.9283055350441144, + "grad_norm": 0.11675414443016052, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 243890 + }, + { + "epoch": 0.928343597512237, + "grad_norm": 0.12460727989673615, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 243900 + }, + { + "epoch": 0.9283816599803598, + "grad_norm": 0.12264706939458847, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 243910 + }, + { + "epoch": 0.9284197224484825, + "grad_norm": 0.13503718376159668, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 243920 + }, + { + "epoch": 0.9284577849166051, + "grad_norm": 0.128460094332695, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 243930 + }, + { + "epoch": 0.9284958473847278, + "grad_norm": 0.13557086884975433, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 243940 + }, + { + "epoch": 0.9285339098528504, + "grad_norm": 0.1326914131641388, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 243950 + }, + { + "epoch": 0.9285719723209732, + "grad_norm": 0.12578825652599335, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 243960 + }, + { + "epoch": 0.9286100347890959, + "grad_norm": 0.12709611654281616, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 243970 + }, + { + "epoch": 0.9286480972572185, + "grad_norm": 0.12961846590042114, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 243980 + }, + { + "epoch": 0.9286861597253412, + "grad_norm": 0.13466140627861023, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 243990 + }, + { + "epoch": 0.928724222193464, + "grad_norm": 0.12910573184490204, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 244000 + }, + { + "epoch": 0.9287622846615866, + "grad_norm": 0.12387394905090332, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 244010 + }, + { + "epoch": 0.9288003471297093, + "grad_norm": 0.1370897889137268, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 244020 + }, + { + "epoch": 0.9288384095978319, + "grad_norm": 0.1256018429994583, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 244030 + }, + { + "epoch": 0.9288764720659547, + "grad_norm": 0.13081581890583038, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 244040 + }, + { + "epoch": 0.9289145345340774, + "grad_norm": 0.14325900375843048, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 244050 + }, + { + "epoch": 0.9289525970022, + "grad_norm": 0.15604746341705322, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 244060 + }, + { + "epoch": 0.9289906594703227, + "grad_norm": 0.1184174194931984, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 244070 + }, + { + "epoch": 0.9290287219384453, + "grad_norm": 0.14096128940582275, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 244080 + }, + { + "epoch": 0.9290667844065681, + "grad_norm": 0.12300019711256027, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 244090 + }, + { + "epoch": 0.9291048468746907, + "grad_norm": 0.13574087619781494, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 244100 + }, + { + "epoch": 0.9291429093428134, + "grad_norm": 0.12608695030212402, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 244110 + }, + { + "epoch": 0.9291809718109361, + "grad_norm": 0.12948545813560486, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 244120 + }, + { + "epoch": 0.9292190342790588, + "grad_norm": 0.1463763415813446, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 244130 + }, + { + "epoch": 0.9292570967471815, + "grad_norm": 0.14562129974365234, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 244140 + }, + { + "epoch": 0.9292951592153041, + "grad_norm": 0.1424500048160553, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 244150 + }, + { + "epoch": 0.9293332216834268, + "grad_norm": 0.1314752846956253, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 244160 + }, + { + "epoch": 0.9293712841515496, + "grad_norm": 0.12602221965789795, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 244170 + }, + { + "epoch": 0.9294093466196722, + "grad_norm": 0.12631013989448547, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 244180 + }, + { + "epoch": 0.9294474090877949, + "grad_norm": 0.11970032751560211, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 244190 + }, + { + "epoch": 0.9294854715559175, + "grad_norm": 0.13537070155143738, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 244200 + }, + { + "epoch": 0.9295235340240403, + "grad_norm": 0.1548534780740738, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 244210 + }, + { + "epoch": 0.929561596492163, + "grad_norm": 0.13076388835906982, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 244220 + }, + { + "epoch": 0.9295996589602856, + "grad_norm": 0.13737253844738007, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 244230 + }, + { + "epoch": 0.9296377214284083, + "grad_norm": 0.13844993710517883, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 244240 + }, + { + "epoch": 0.9296757838965309, + "grad_norm": 0.13641414046287537, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 244250 + }, + { + "epoch": 0.9297138463646537, + "grad_norm": 0.13651876151561737, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 244260 + }, + { + "epoch": 0.9297519088327764, + "grad_norm": 0.12432459741830826, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 244270 + }, + { + "epoch": 0.929789971300899, + "grad_norm": 0.1276438683271408, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 244280 + }, + { + "epoch": 0.9298280337690217, + "grad_norm": 0.1557062566280365, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 244290 + }, + { + "epoch": 0.9298660962371444, + "grad_norm": 0.12883219122886658, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 244300 + }, + { + "epoch": 0.9299041587052671, + "grad_norm": 0.12083089351654053, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 244310 + }, + { + "epoch": 0.9299422211733898, + "grad_norm": 0.12065315246582031, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 244320 + }, + { + "epoch": 0.9299802836415124, + "grad_norm": 0.13488535583019257, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 244330 + }, + { + "epoch": 0.9300183461096352, + "grad_norm": 0.13125616312026978, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 244340 + }, + { + "epoch": 0.9300564085777578, + "grad_norm": 0.12402810901403427, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 244350 + }, + { + "epoch": 0.9300944710458805, + "grad_norm": 0.12106141448020935, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 244360 + }, + { + "epoch": 0.9301325335140032, + "grad_norm": 0.1306813508272171, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 244370 + }, + { + "epoch": 0.9301705959821258, + "grad_norm": 0.13408935070037842, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 244380 + }, + { + "epoch": 0.9302086584502486, + "grad_norm": 0.1249052956700325, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 244390 + }, + { + "epoch": 0.9302467209183712, + "grad_norm": 0.12542724609375, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 244400 + }, + { + "epoch": 0.9302847833864939, + "grad_norm": 0.144064798951149, + "learning_rate": 0.0005, + "loss": 2.0844, + "step": 244410 + }, + { + "epoch": 0.9303228458546166, + "grad_norm": 0.13257959485054016, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 244420 + }, + { + "epoch": 0.9303609083227393, + "grad_norm": 0.14355061948299408, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 244430 + }, + { + "epoch": 0.930398970790862, + "grad_norm": 0.1293869912624359, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 244440 + }, + { + "epoch": 0.9304370332589846, + "grad_norm": 0.11531233042478561, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 244450 + }, + { + "epoch": 0.9304750957271073, + "grad_norm": 0.12876319885253906, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 244460 + }, + { + "epoch": 0.9305131581952301, + "grad_norm": 0.1262495368719101, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 244470 + }, + { + "epoch": 0.9305512206633527, + "grad_norm": 0.12156631052494049, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 244480 + }, + { + "epoch": 0.9305892831314754, + "grad_norm": 0.13556714355945587, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 244490 + }, + { + "epoch": 0.930627345599598, + "grad_norm": 0.12696297466754913, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 244500 + }, + { + "epoch": 0.9306654080677207, + "grad_norm": 0.12120062112808228, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 244510 + }, + { + "epoch": 0.9307034705358435, + "grad_norm": 0.1332768201828003, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 244520 + }, + { + "epoch": 0.9307415330039661, + "grad_norm": 0.1333252191543579, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 244530 + }, + { + "epoch": 0.9307795954720888, + "grad_norm": 0.14268051087856293, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 244540 + }, + { + "epoch": 0.9308176579402114, + "grad_norm": 0.15190616250038147, + "learning_rate": 0.0005, + "loss": 2.1186, + "step": 244550 + }, + { + "epoch": 0.9308557204083342, + "grad_norm": 0.11583707481622696, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 244560 + }, + { + "epoch": 0.9308937828764569, + "grad_norm": 0.12261312454938889, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 244570 + }, + { + "epoch": 0.9309318453445795, + "grad_norm": 0.13171446323394775, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 244580 + }, + { + "epoch": 0.9309699078127022, + "grad_norm": 0.12392304092645645, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 244590 + }, + { + "epoch": 0.9310079702808249, + "grad_norm": 0.13120070099830627, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 244600 + }, + { + "epoch": 0.9310460327489476, + "grad_norm": 0.12526655197143555, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 244610 + }, + { + "epoch": 0.9310840952170703, + "grad_norm": 0.1256546527147293, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 244620 + }, + { + "epoch": 0.9311221576851929, + "grad_norm": 0.11751958727836609, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 244630 + }, + { + "epoch": 0.9311602201533157, + "grad_norm": 0.14713047444820404, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 244640 + }, + { + "epoch": 0.9311982826214383, + "grad_norm": 0.1289437860250473, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 244650 + }, + { + "epoch": 0.931236345089561, + "grad_norm": 0.12775853276252747, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 244660 + }, + { + "epoch": 0.9312744075576836, + "grad_norm": 0.128163143992424, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 244670 + }, + { + "epoch": 0.9313124700258063, + "grad_norm": 0.12197849154472351, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 244680 + }, + { + "epoch": 0.9313505324939291, + "grad_norm": 0.11555957794189453, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 244690 + }, + { + "epoch": 0.9313885949620517, + "grad_norm": 0.11189104616641998, + "learning_rate": 0.0005, + "loss": 2.077, + "step": 244700 + }, + { + "epoch": 0.9314266574301744, + "grad_norm": 0.126925989985466, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 244710 + }, + { + "epoch": 0.931464719898297, + "grad_norm": 0.14960403740406036, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 244720 + }, + { + "epoch": 0.9315027823664198, + "grad_norm": 0.12165334820747375, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 244730 + }, + { + "epoch": 0.9315408448345425, + "grad_norm": 0.1221255511045456, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 244740 + }, + { + "epoch": 0.9315789073026651, + "grad_norm": 0.12555082142353058, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 244750 + }, + { + "epoch": 0.9316169697707878, + "grad_norm": 0.1253214180469513, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 244760 + }, + { + "epoch": 0.9316550322389106, + "grad_norm": 0.12106679379940033, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 244770 + }, + { + "epoch": 0.9316930947070332, + "grad_norm": 0.1325124055147171, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 244780 + }, + { + "epoch": 0.9317311571751559, + "grad_norm": 0.1329493522644043, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 244790 + }, + { + "epoch": 0.9317692196432785, + "grad_norm": 0.12659507989883423, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 244800 + }, + { + "epoch": 0.9318072821114012, + "grad_norm": 0.12768709659576416, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 244810 + }, + { + "epoch": 0.931845344579524, + "grad_norm": 0.13004130125045776, + "learning_rate": 0.0005, + "loss": 2.088, + "step": 244820 + }, + { + "epoch": 0.9318834070476466, + "grad_norm": 0.13215667009353638, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 244830 + }, + { + "epoch": 0.9319214695157693, + "grad_norm": 0.13396494090557098, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 244840 + }, + { + "epoch": 0.9319595319838919, + "grad_norm": 0.12323446571826935, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 244850 + }, + { + "epoch": 0.9319975944520147, + "grad_norm": 0.1380215287208557, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 244860 + }, + { + "epoch": 0.9320356569201373, + "grad_norm": 0.13250230252742767, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 244870 + }, + { + "epoch": 0.93207371938826, + "grad_norm": 0.13689535856246948, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 244880 + }, + { + "epoch": 0.9321117818563827, + "grad_norm": 0.13904209434986115, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 244890 + }, + { + "epoch": 0.9321498443245054, + "grad_norm": 0.12622742354869843, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 244900 + }, + { + "epoch": 0.9321879067926281, + "grad_norm": 0.13061605393886566, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 244910 + }, + { + "epoch": 0.9322259692607507, + "grad_norm": 0.12408585846424103, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 244920 + }, + { + "epoch": 0.9322640317288734, + "grad_norm": 0.12326142191886902, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 244930 + }, + { + "epoch": 0.9323020941969961, + "grad_norm": 0.1347014605998993, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 244940 + }, + { + "epoch": 0.9323401566651188, + "grad_norm": 0.14028072357177734, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 244950 + }, + { + "epoch": 0.9323782191332415, + "grad_norm": 0.12555497884750366, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 244960 + }, + { + "epoch": 0.9324162816013641, + "grad_norm": 0.12269239872694016, + "learning_rate": 0.0005, + "loss": 2.0762, + "step": 244970 + }, + { + "epoch": 0.9324543440694868, + "grad_norm": 0.1264629364013672, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 244980 + }, + { + "epoch": 0.9324924065376096, + "grad_norm": 0.11486383527517319, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 244990 + }, + { + "epoch": 0.9325304690057322, + "grad_norm": 0.1295613944530487, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 245000 + }, + { + "epoch": 0.9325685314738549, + "grad_norm": 0.13714879751205444, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 245010 + }, + { + "epoch": 0.9326065939419775, + "grad_norm": 0.13334430754184723, + "learning_rate": 0.0005, + "loss": 2.0837, + "step": 245020 + }, + { + "epoch": 0.9326446564101003, + "grad_norm": 0.12343837320804596, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 245030 + }, + { + "epoch": 0.932682718878223, + "grad_norm": 0.12585949897766113, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 245040 + }, + { + "epoch": 0.9327207813463456, + "grad_norm": 0.13595852255821228, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 245050 + }, + { + "epoch": 0.9327588438144683, + "grad_norm": 0.1471097618341446, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 245060 + }, + { + "epoch": 0.932796906282591, + "grad_norm": 0.13127946853637695, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 245070 + }, + { + "epoch": 0.9328349687507137, + "grad_norm": 0.12792746722698212, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 245080 + }, + { + "epoch": 0.9328730312188364, + "grad_norm": 0.1280100792646408, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 245090 + }, + { + "epoch": 0.932911093686959, + "grad_norm": 0.13393692672252655, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 245100 + }, + { + "epoch": 0.9329491561550817, + "grad_norm": 0.13617435097694397, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 245110 + }, + { + "epoch": 0.9329872186232044, + "grad_norm": 0.13600575923919678, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 245120 + }, + { + "epoch": 0.9330252810913271, + "grad_norm": 0.14357805252075195, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 245130 + }, + { + "epoch": 0.9330633435594498, + "grad_norm": 0.12242817133665085, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 245140 + }, + { + "epoch": 0.9331014060275724, + "grad_norm": 0.11658231168985367, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 245150 + }, + { + "epoch": 0.9331394684956952, + "grad_norm": 0.11763457208871841, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 245160 + }, + { + "epoch": 0.9331775309638178, + "grad_norm": 0.12078958749771118, + "learning_rate": 0.0005, + "loss": 2.0855, + "step": 245170 + }, + { + "epoch": 0.9332155934319405, + "grad_norm": 0.12004182487726212, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 245180 + }, + { + "epoch": 0.9332536559000632, + "grad_norm": 0.13686971366405487, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 245190 + }, + { + "epoch": 0.9332917183681859, + "grad_norm": 0.12164989113807678, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 245200 + }, + { + "epoch": 0.9333297808363086, + "grad_norm": 0.11873283237218857, + "learning_rate": 0.0005, + "loss": 2.0832, + "step": 245210 + }, + { + "epoch": 0.9333678433044312, + "grad_norm": 0.13262200355529785, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 245220 + }, + { + "epoch": 0.9334059057725539, + "grad_norm": 0.13883377611637115, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 245230 + }, + { + "epoch": 0.9334439682406765, + "grad_norm": 0.12576153874397278, + "learning_rate": 0.0005, + "loss": 2.0878, + "step": 245240 + }, + { + "epoch": 0.9334820307087993, + "grad_norm": 0.14034013450145721, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 245250 + }, + { + "epoch": 0.933520093176922, + "grad_norm": 0.12679800391197205, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 245260 + }, + { + "epoch": 0.9335581556450446, + "grad_norm": 0.12937819957733154, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 245270 + }, + { + "epoch": 0.9335962181131673, + "grad_norm": 0.12601298093795776, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 245280 + }, + { + "epoch": 0.93363428058129, + "grad_norm": 0.14028529822826385, + "learning_rate": 0.0005, + "loss": 2.0837, + "step": 245290 + }, + { + "epoch": 0.9336723430494127, + "grad_norm": 0.12624330818653107, + "learning_rate": 0.0005, + "loss": 2.1175, + "step": 245300 + }, + { + "epoch": 0.9337104055175354, + "grad_norm": 0.11185381561517715, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 245310 + }, + { + "epoch": 0.933748467985658, + "grad_norm": 0.1357858031988144, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 245320 + }, + { + "epoch": 0.9337865304537808, + "grad_norm": 0.1280422806739807, + "learning_rate": 0.0005, + "loss": 2.0834, + "step": 245330 + }, + { + "epoch": 0.9338245929219035, + "grad_norm": 0.13474594056606293, + "learning_rate": 0.0005, + "loss": 2.0878, + "step": 245340 + }, + { + "epoch": 0.9338626553900261, + "grad_norm": 0.12183479964733124, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 245350 + }, + { + "epoch": 0.9339007178581488, + "grad_norm": 0.1369076818227768, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 245360 + }, + { + "epoch": 0.9339387803262714, + "grad_norm": 0.1256004273891449, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 245370 + }, + { + "epoch": 0.9339768427943942, + "grad_norm": 0.13013307750225067, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 245380 + }, + { + "epoch": 0.9340149052625168, + "grad_norm": 0.1307704746723175, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 245390 + }, + { + "epoch": 0.9340529677306395, + "grad_norm": 0.1378530114889145, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 245400 + }, + { + "epoch": 0.9340910301987622, + "grad_norm": 0.13222309947013855, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 245410 + }, + { + "epoch": 0.9341290926668849, + "grad_norm": 0.13338710367679596, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 245420 + }, + { + "epoch": 0.9341671551350076, + "grad_norm": 0.13438653945922852, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 245430 + }, + { + "epoch": 0.9342052176031302, + "grad_norm": 0.11549054831266403, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 245440 + }, + { + "epoch": 0.9342432800712529, + "grad_norm": 0.1245436742901802, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 245450 + }, + { + "epoch": 0.9342813425393757, + "grad_norm": 0.12763431668281555, + "learning_rate": 0.0005, + "loss": 2.0855, + "step": 245460 + }, + { + "epoch": 0.9343194050074983, + "grad_norm": 0.1399812549352646, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 245470 + }, + { + "epoch": 0.934357467475621, + "grad_norm": 0.12727563083171844, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 245480 + }, + { + "epoch": 0.9343955299437436, + "grad_norm": 0.1361941248178482, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 245490 + }, + { + "epoch": 0.9344335924118664, + "grad_norm": 0.13142044842243195, + "learning_rate": 0.0005, + "loss": 2.1259, + "step": 245500 + }, + { + "epoch": 0.9344716548799891, + "grad_norm": 0.11843173205852509, + "learning_rate": 0.0005, + "loss": 2.0804, + "step": 245510 + }, + { + "epoch": 0.9345097173481117, + "grad_norm": 0.13190241158008575, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 245520 + }, + { + "epoch": 0.9345477798162344, + "grad_norm": 0.12379549443721771, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 245530 + }, + { + "epoch": 0.934585842284357, + "grad_norm": 0.12975290417671204, + "learning_rate": 0.0005, + "loss": 2.0866, + "step": 245540 + }, + { + "epoch": 0.9346239047524798, + "grad_norm": 0.11416078358888626, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 245550 + }, + { + "epoch": 0.9346619672206025, + "grad_norm": 0.1281895637512207, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 245560 + }, + { + "epoch": 0.9347000296887251, + "grad_norm": 0.11358912289142609, + "learning_rate": 0.0005, + "loss": 2.1156, + "step": 245570 + }, + { + "epoch": 0.9347380921568478, + "grad_norm": 0.1254263073205948, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 245580 + }, + { + "epoch": 0.9347761546249705, + "grad_norm": 0.1187024861574173, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 245590 + }, + { + "epoch": 0.9348142170930932, + "grad_norm": 0.13790404796600342, + "learning_rate": 0.0005, + "loss": 2.1289, + "step": 245600 + }, + { + "epoch": 0.9348522795612159, + "grad_norm": 0.12254596501588821, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 245610 + }, + { + "epoch": 0.9348903420293385, + "grad_norm": 0.12026899307966232, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 245620 + }, + { + "epoch": 0.9349284044974613, + "grad_norm": 0.1398571878671646, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 245630 + }, + { + "epoch": 0.9349664669655839, + "grad_norm": 0.13355661928653717, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 245640 + }, + { + "epoch": 0.9350045294337066, + "grad_norm": 0.12633633613586426, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 245650 + }, + { + "epoch": 0.9350425919018293, + "grad_norm": 0.13872991502285004, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 245660 + }, + { + "epoch": 0.9350806543699519, + "grad_norm": 0.13830257952213287, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 245670 + }, + { + "epoch": 0.9351187168380747, + "grad_norm": 0.12482500821352005, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 245680 + }, + { + "epoch": 0.9351567793061973, + "grad_norm": 0.13927072286605835, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 245690 + }, + { + "epoch": 0.93519484177432, + "grad_norm": 0.1152929961681366, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 245700 + }, + { + "epoch": 0.9352329042424427, + "grad_norm": 0.14349882304668427, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 245710 + }, + { + "epoch": 0.9352709667105654, + "grad_norm": 0.2207951843738556, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 245720 + }, + { + "epoch": 0.9353090291786881, + "grad_norm": 0.1341167688369751, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 245730 + }, + { + "epoch": 0.9353470916468107, + "grad_norm": 0.12583181262016296, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 245740 + }, + { + "epoch": 0.9353851541149334, + "grad_norm": 0.1297006458044052, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 245750 + }, + { + "epoch": 0.9354232165830562, + "grad_norm": 0.13608314096927643, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 245760 + }, + { + "epoch": 0.9354612790511788, + "grad_norm": 0.15426835417747498, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 245770 + }, + { + "epoch": 0.9354993415193015, + "grad_norm": 0.133841872215271, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 245780 + }, + { + "epoch": 0.9355374039874241, + "grad_norm": 0.1368071734905243, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 245790 + }, + { + "epoch": 0.9355754664555468, + "grad_norm": 0.12981870770454407, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 245800 + }, + { + "epoch": 0.9356135289236696, + "grad_norm": 0.12528027594089508, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 245810 + }, + { + "epoch": 0.9356515913917922, + "grad_norm": 0.13445281982421875, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 245820 + }, + { + "epoch": 0.9356896538599149, + "grad_norm": 0.1206308901309967, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 245830 + }, + { + "epoch": 0.9357277163280375, + "grad_norm": 0.11260633170604706, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 245840 + }, + { + "epoch": 0.9357657787961603, + "grad_norm": 0.12294673174619675, + "learning_rate": 0.0005, + "loss": 2.0873, + "step": 245850 + }, + { + "epoch": 0.935803841264283, + "grad_norm": 0.11559704691171646, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 245860 + }, + { + "epoch": 0.9358419037324056, + "grad_norm": 0.12751439213752747, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 245870 + }, + { + "epoch": 0.9358799662005283, + "grad_norm": 0.12568078935146332, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 245880 + }, + { + "epoch": 0.935918028668651, + "grad_norm": 0.12613657116889954, + "learning_rate": 0.0005, + "loss": 2.0834, + "step": 245890 + }, + { + "epoch": 0.9359560911367737, + "grad_norm": 0.131540447473526, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 245900 + }, + { + "epoch": 0.9359941536048964, + "grad_norm": 0.13464267551898956, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 245910 + }, + { + "epoch": 0.936032216073019, + "grad_norm": 0.1384028196334839, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 245920 + }, + { + "epoch": 0.9360702785411418, + "grad_norm": 0.13131101429462433, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 245930 + }, + { + "epoch": 0.9361083410092644, + "grad_norm": 0.13290739059448242, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 245940 + }, + { + "epoch": 0.9361464034773871, + "grad_norm": 0.12560899555683136, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 245950 + }, + { + "epoch": 0.9361844659455097, + "grad_norm": 0.12775875627994537, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 245960 + }, + { + "epoch": 0.9362225284136324, + "grad_norm": 0.12522175908088684, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 245970 + }, + { + "epoch": 0.9362605908817552, + "grad_norm": 0.13588744401931763, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 245980 + }, + { + "epoch": 0.9362986533498778, + "grad_norm": 0.11728566884994507, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 245990 + }, + { + "epoch": 0.9363367158180005, + "grad_norm": 0.14255517721176147, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 246000 + }, + { + "epoch": 0.9363747782861231, + "grad_norm": 0.1172654777765274, + "learning_rate": 0.0005, + "loss": 2.0814, + "step": 246010 + }, + { + "epoch": 0.9364128407542459, + "grad_norm": 0.12044258415699005, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 246020 + }, + { + "epoch": 0.9364509032223686, + "grad_norm": 0.13443933427333832, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 246030 + }, + { + "epoch": 0.9364889656904912, + "grad_norm": 0.3673914968967438, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 246040 + }, + { + "epoch": 0.9365270281586139, + "grad_norm": 0.136433944106102, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 246050 + }, + { + "epoch": 0.9365650906267367, + "grad_norm": 0.12014338374137878, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 246060 + }, + { + "epoch": 0.9366031530948593, + "grad_norm": 0.12036501616239548, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 246070 + }, + { + "epoch": 0.936641215562982, + "grad_norm": 0.1268536001443863, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 246080 + }, + { + "epoch": 0.9366792780311046, + "grad_norm": 0.13351179659366608, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 246090 + }, + { + "epoch": 0.9367173404992273, + "grad_norm": 0.12870946526527405, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 246100 + }, + { + "epoch": 0.93675540296735, + "grad_norm": 0.13091915845870972, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 246110 + }, + { + "epoch": 0.9367934654354727, + "grad_norm": 0.12250076234340668, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 246120 + }, + { + "epoch": 0.9368315279035954, + "grad_norm": 0.1321459263563156, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 246130 + }, + { + "epoch": 0.936869590371718, + "grad_norm": 0.13747379183769226, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 246140 + }, + { + "epoch": 0.9369076528398408, + "grad_norm": 0.12448690831661224, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 246150 + }, + { + "epoch": 0.9369457153079634, + "grad_norm": 0.12248706817626953, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 246160 + }, + { + "epoch": 0.9369837777760861, + "grad_norm": 0.12774351239204407, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 246170 + }, + { + "epoch": 0.9370218402442088, + "grad_norm": 0.1347806453704834, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 246180 + }, + { + "epoch": 0.9370599027123315, + "grad_norm": 0.12406744062900543, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 246190 + }, + { + "epoch": 0.9370979651804542, + "grad_norm": 0.12361624836921692, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 246200 + }, + { + "epoch": 0.9371360276485768, + "grad_norm": 0.12326037883758545, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 246210 + }, + { + "epoch": 0.9371740901166995, + "grad_norm": 0.13250653445720673, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 246220 + }, + { + "epoch": 0.9372121525848222, + "grad_norm": 0.12099245190620422, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 246230 + }, + { + "epoch": 0.9372502150529449, + "grad_norm": 0.12234620004892349, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 246240 + }, + { + "epoch": 0.9372882775210676, + "grad_norm": 0.12898355722427368, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 246250 + }, + { + "epoch": 0.9373263399891902, + "grad_norm": 0.1290358155965805, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 246260 + }, + { + "epoch": 0.9373644024573129, + "grad_norm": 0.11929871886968613, + "learning_rate": 0.0005, + "loss": 2.0819, + "step": 246270 + }, + { + "epoch": 0.9374024649254357, + "grad_norm": 0.15042388439178467, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 246280 + }, + { + "epoch": 0.9374405273935583, + "grad_norm": 0.12484857439994812, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 246290 + }, + { + "epoch": 0.937478589861681, + "grad_norm": 0.14481140673160553, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 246300 + }, + { + "epoch": 0.9375166523298036, + "grad_norm": 0.13843390345573425, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 246310 + }, + { + "epoch": 0.9375547147979264, + "grad_norm": 0.12472894787788391, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 246320 + }, + { + "epoch": 0.9375927772660491, + "grad_norm": 0.12258918583393097, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 246330 + }, + { + "epoch": 0.9376308397341717, + "grad_norm": 0.12393265962600708, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 246340 + }, + { + "epoch": 0.9376689022022944, + "grad_norm": 0.13368739187717438, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 246350 + }, + { + "epoch": 0.9377069646704171, + "grad_norm": 0.1267513930797577, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 246360 + }, + { + "epoch": 0.9377450271385398, + "grad_norm": 0.13105252385139465, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 246370 + }, + { + "epoch": 0.9377830896066625, + "grad_norm": 0.1518404483795166, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 246380 + }, + { + "epoch": 0.9378211520747851, + "grad_norm": 0.14227744936943054, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 246390 + }, + { + "epoch": 0.9378592145429078, + "grad_norm": 0.12951421737670898, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 246400 + }, + { + "epoch": 0.9378972770110305, + "grad_norm": 0.12221680581569672, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 246410 + }, + { + "epoch": 0.9379353394791532, + "grad_norm": 0.1388002187013626, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 246420 + }, + { + "epoch": 0.9379734019472759, + "grad_norm": 0.11565013229846954, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 246430 + }, + { + "epoch": 0.9380114644153985, + "grad_norm": 0.12359726428985596, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 246440 + }, + { + "epoch": 0.9380495268835213, + "grad_norm": 0.12950290739536285, + "learning_rate": 0.0005, + "loss": 2.0792, + "step": 246450 + }, + { + "epoch": 0.9380875893516439, + "grad_norm": 0.12705442309379578, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 246460 + }, + { + "epoch": 0.9381256518197666, + "grad_norm": 0.12719959020614624, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 246470 + }, + { + "epoch": 0.9381637142878892, + "grad_norm": 0.11859887838363647, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 246480 + }, + { + "epoch": 0.938201776756012, + "grad_norm": 0.1301327794790268, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 246490 + }, + { + "epoch": 0.9382398392241347, + "grad_norm": 0.13133437931537628, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 246500 + }, + { + "epoch": 0.9382779016922573, + "grad_norm": 0.14098897576332092, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 246510 + }, + { + "epoch": 0.93831596416038, + "grad_norm": 0.12575797736644745, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 246520 + }, + { + "epoch": 0.9383540266285026, + "grad_norm": 0.1329401731491089, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 246530 + }, + { + "epoch": 0.9383920890966254, + "grad_norm": 0.11845256388187408, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 246540 + }, + { + "epoch": 0.9384301515647481, + "grad_norm": 0.12183622270822525, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 246550 + }, + { + "epoch": 0.9384682140328707, + "grad_norm": 0.13629251718521118, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 246560 + }, + { + "epoch": 0.9385062765009934, + "grad_norm": 0.13547387719154358, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 246570 + }, + { + "epoch": 0.9385443389691162, + "grad_norm": 0.12108206003904343, + "learning_rate": 0.0005, + "loss": 2.0825, + "step": 246580 + }, + { + "epoch": 0.9385824014372388, + "grad_norm": 0.12693358957767487, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 246590 + }, + { + "epoch": 0.9386204639053615, + "grad_norm": 0.12511876225471497, + "learning_rate": 0.0005, + "loss": 2.0898, + "step": 246600 + }, + { + "epoch": 0.9386585263734841, + "grad_norm": 0.12276551872491837, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 246610 + }, + { + "epoch": 0.9386965888416069, + "grad_norm": 0.12356232106685638, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 246620 + }, + { + "epoch": 0.9387346513097296, + "grad_norm": 0.1273866444826126, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 246630 + }, + { + "epoch": 0.9387727137778522, + "grad_norm": 0.1403125822544098, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 246640 + }, + { + "epoch": 0.9388107762459749, + "grad_norm": 0.11948072165250778, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 246650 + }, + { + "epoch": 0.9388488387140975, + "grad_norm": 0.1343478113412857, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 246660 + }, + { + "epoch": 0.9388869011822203, + "grad_norm": 0.1337791532278061, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 246670 + }, + { + "epoch": 0.938924963650343, + "grad_norm": 0.12476864457130432, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 246680 + }, + { + "epoch": 0.9389630261184656, + "grad_norm": 0.11873772740364075, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 246690 + }, + { + "epoch": 0.9390010885865883, + "grad_norm": 0.1170477494597435, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 246700 + }, + { + "epoch": 0.939039151054711, + "grad_norm": 0.1320660561323166, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 246710 + }, + { + "epoch": 0.9390772135228337, + "grad_norm": 0.11541859060525894, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 246720 + }, + { + "epoch": 0.9391152759909563, + "grad_norm": 0.1338876336812973, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 246730 + }, + { + "epoch": 0.939153338459079, + "grad_norm": 0.12336771190166473, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 246740 + }, + { + "epoch": 0.9391914009272018, + "grad_norm": 0.12288336455821991, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 246750 + }, + { + "epoch": 0.9392294633953244, + "grad_norm": 0.12184653431177139, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 246760 + }, + { + "epoch": 0.9392675258634471, + "grad_norm": 0.11762639880180359, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 246770 + }, + { + "epoch": 0.9393055883315697, + "grad_norm": 0.12614305317401886, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 246780 + }, + { + "epoch": 0.9393436507996925, + "grad_norm": 0.14764799177646637, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 246790 + }, + { + "epoch": 0.9393817132678152, + "grad_norm": 0.13260680437088013, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 246800 + }, + { + "epoch": 0.9394197757359378, + "grad_norm": 0.14004816114902496, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 246810 + }, + { + "epoch": 0.9394578382040605, + "grad_norm": 0.12658268213272095, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 246820 + }, + { + "epoch": 0.9394959006721831, + "grad_norm": 0.12620680034160614, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 246830 + }, + { + "epoch": 0.9395339631403059, + "grad_norm": 0.13516870141029358, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 246840 + }, + { + "epoch": 0.9395720256084286, + "grad_norm": 0.13516582548618317, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 246850 + }, + { + "epoch": 0.9396100880765512, + "grad_norm": 0.13392318785190582, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 246860 + }, + { + "epoch": 0.9396481505446739, + "grad_norm": 0.13238194584846497, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 246870 + }, + { + "epoch": 0.9396862130127966, + "grad_norm": 0.7064668536186218, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 246880 + }, + { + "epoch": 0.9397242754809193, + "grad_norm": 0.12507928907871246, + "learning_rate": 0.0005, + "loss": 2.0891, + "step": 246890 + }, + { + "epoch": 0.939762337949042, + "grad_norm": 0.13007816672325134, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 246900 + }, + { + "epoch": 0.9398004004171646, + "grad_norm": 0.15461665391921997, + "learning_rate": 0.0005, + "loss": 2.087, + "step": 246910 + }, + { + "epoch": 0.9398384628852874, + "grad_norm": 0.12466249614953995, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 246920 + }, + { + "epoch": 0.93987652535341, + "grad_norm": 0.11939510703086853, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 246930 + }, + { + "epoch": 0.9399145878215327, + "grad_norm": 0.13170850276947021, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 246940 + }, + { + "epoch": 0.9399526502896554, + "grad_norm": 0.13314881920814514, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 246950 + }, + { + "epoch": 0.939990712757778, + "grad_norm": 0.1443936675786972, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 246960 + }, + { + "epoch": 0.9400287752259008, + "grad_norm": 0.12503737211227417, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 246970 + }, + { + "epoch": 0.9400668376940234, + "grad_norm": 0.12784408032894135, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 246980 + }, + { + "epoch": 0.9401049001621461, + "grad_norm": 0.12824499607086182, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 246990 + }, + { + "epoch": 0.9401429626302688, + "grad_norm": 0.1239519789814949, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 247000 + }, + { + "epoch": 0.9401810250983915, + "grad_norm": 0.137156143784523, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 247010 + }, + { + "epoch": 0.9402190875665142, + "grad_norm": 0.751557469367981, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 247020 + }, + { + "epoch": 0.9402571500346368, + "grad_norm": 0.15649625658988953, + "learning_rate": 0.0005, + "loss": 2.1155, + "step": 247030 + }, + { + "epoch": 0.9402952125027595, + "grad_norm": 0.14045241475105286, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 247040 + }, + { + "epoch": 0.9403332749708823, + "grad_norm": 0.1222781240940094, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 247050 + }, + { + "epoch": 0.9403713374390049, + "grad_norm": 0.1385979801416397, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 247060 + }, + { + "epoch": 0.9404093999071276, + "grad_norm": 0.12437307089567184, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 247070 + }, + { + "epoch": 0.9404474623752502, + "grad_norm": 0.1382979303598404, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 247080 + }, + { + "epoch": 0.9404855248433729, + "grad_norm": 0.12030627578496933, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 247090 + }, + { + "epoch": 0.9405235873114957, + "grad_norm": 0.12918499112129211, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 247100 + }, + { + "epoch": 0.9405616497796183, + "grad_norm": 0.13981960713863373, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 247110 + }, + { + "epoch": 0.940599712247741, + "grad_norm": 0.12755261361598969, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 247120 + }, + { + "epoch": 0.9406377747158636, + "grad_norm": 0.12737154960632324, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 247130 + }, + { + "epoch": 0.9406758371839864, + "grad_norm": 0.11642778664827347, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 247140 + }, + { + "epoch": 0.940713899652109, + "grad_norm": 0.1331937313079834, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 247150 + }, + { + "epoch": 0.9407519621202317, + "grad_norm": 0.1338392198085785, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 247160 + }, + { + "epoch": 0.9407900245883544, + "grad_norm": 0.12696552276611328, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 247170 + }, + { + "epoch": 0.9408280870564771, + "grad_norm": 0.13532081246376038, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 247180 + }, + { + "epoch": 0.9408661495245998, + "grad_norm": 0.14050805568695068, + "learning_rate": 0.0005, + "loss": 2.1226, + "step": 247190 + }, + { + "epoch": 0.9409042119927225, + "grad_norm": 0.11687342077493668, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 247200 + }, + { + "epoch": 0.9409422744608451, + "grad_norm": 0.1321887969970703, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 247210 + }, + { + "epoch": 0.9409803369289679, + "grad_norm": 0.13208705186843872, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 247220 + }, + { + "epoch": 0.9410183993970905, + "grad_norm": 0.1374688446521759, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 247230 + }, + { + "epoch": 0.9410564618652132, + "grad_norm": 0.12851817905902863, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 247240 + }, + { + "epoch": 0.9410945243333358, + "grad_norm": 0.1238558441400528, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 247250 + }, + { + "epoch": 0.9411325868014585, + "grad_norm": 0.1303836703300476, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 247260 + }, + { + "epoch": 0.9411706492695813, + "grad_norm": 0.1266433149576187, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 247270 + }, + { + "epoch": 0.9412087117377039, + "grad_norm": 0.12886379659175873, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 247280 + }, + { + "epoch": 0.9412467742058266, + "grad_norm": 0.1281256079673767, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 247290 + }, + { + "epoch": 0.9412848366739492, + "grad_norm": 0.11581743508577347, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 247300 + }, + { + "epoch": 0.941322899142072, + "grad_norm": 0.12750853598117828, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 247310 + }, + { + "epoch": 0.9413609616101947, + "grad_norm": 0.1209491565823555, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 247320 + }, + { + "epoch": 0.9413990240783173, + "grad_norm": 0.13058196008205414, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 247330 + }, + { + "epoch": 0.94143708654644, + "grad_norm": 0.1260518729686737, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 247340 + }, + { + "epoch": 0.9414751490145628, + "grad_norm": 0.13261444866657257, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 247350 + }, + { + "epoch": 0.9415132114826854, + "grad_norm": 0.11897791922092438, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 247360 + }, + { + "epoch": 0.9415512739508081, + "grad_norm": 0.13250933587551117, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 247370 + }, + { + "epoch": 0.9415893364189307, + "grad_norm": 0.12644755840301514, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 247380 + }, + { + "epoch": 0.9416273988870534, + "grad_norm": 0.12750305235385895, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 247390 + }, + { + "epoch": 0.9416654613551761, + "grad_norm": 0.1279606968164444, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 247400 + }, + { + "epoch": 0.9417035238232988, + "grad_norm": 0.12689395248889923, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 247410 + }, + { + "epoch": 0.9417415862914215, + "grad_norm": 0.13141341507434845, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 247420 + }, + { + "epoch": 0.9417796487595441, + "grad_norm": 0.12511730194091797, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 247430 + }, + { + "epoch": 0.9418177112276669, + "grad_norm": 0.12846639752388, + "learning_rate": 0.0005, + "loss": 2.0834, + "step": 247440 + }, + { + "epoch": 0.9418557736957895, + "grad_norm": 0.14211717247962952, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 247450 + }, + { + "epoch": 0.9418938361639122, + "grad_norm": 0.5756912231445312, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 247460 + }, + { + "epoch": 0.9419318986320349, + "grad_norm": 0.12941479682922363, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 247470 + }, + { + "epoch": 0.9419699611001576, + "grad_norm": 0.12610690295696259, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 247480 + }, + { + "epoch": 0.9420080235682803, + "grad_norm": 0.13161595165729523, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 247490 + }, + { + "epoch": 0.9420460860364029, + "grad_norm": 0.13354597985744476, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 247500 + }, + { + "epoch": 0.9420841485045256, + "grad_norm": 0.13382618129253387, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 247510 + }, + { + "epoch": 0.9421222109726483, + "grad_norm": 0.18865372240543365, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 247520 + }, + { + "epoch": 0.942160273440771, + "grad_norm": 0.1534956693649292, + "learning_rate": 0.0005, + "loss": 2.0872, + "step": 247530 + }, + { + "epoch": 0.9421983359088937, + "grad_norm": 0.11790409684181213, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 247540 + }, + { + "epoch": 0.9422363983770163, + "grad_norm": 0.1351659893989563, + "learning_rate": 0.0005, + "loss": 2.088, + "step": 247550 + }, + { + "epoch": 0.942274460845139, + "grad_norm": 0.1758767068386078, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 247560 + }, + { + "epoch": 0.9423125233132618, + "grad_norm": 0.13484224677085876, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 247570 + }, + { + "epoch": 0.9423505857813844, + "grad_norm": 0.12165403366088867, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 247580 + }, + { + "epoch": 0.9423886482495071, + "grad_norm": 0.11824820935726166, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 247590 + }, + { + "epoch": 0.9424267107176297, + "grad_norm": 1.1216968297958374, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 247600 + }, + { + "epoch": 0.9424647731857525, + "grad_norm": 0.12063819169998169, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 247610 + }, + { + "epoch": 0.9425028356538752, + "grad_norm": 0.1480524092912674, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 247620 + }, + { + "epoch": 0.9425408981219978, + "grad_norm": 0.13503408432006836, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 247630 + }, + { + "epoch": 0.9425789605901205, + "grad_norm": 0.13576354086399078, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 247640 + }, + { + "epoch": 0.9426170230582432, + "grad_norm": 0.1289045363664627, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 247650 + }, + { + "epoch": 0.9426550855263659, + "grad_norm": 0.11602700501680374, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 247660 + }, + { + "epoch": 0.9426931479944886, + "grad_norm": 0.12578940391540527, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 247670 + }, + { + "epoch": 0.9427312104626112, + "grad_norm": 0.13036979734897614, + "learning_rate": 0.0005, + "loss": 2.0812, + "step": 247680 + }, + { + "epoch": 0.9427692729307339, + "grad_norm": 0.13717511296272278, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 247690 + }, + { + "epoch": 0.9428073353988566, + "grad_norm": 0.12567205727100372, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 247700 + }, + { + "epoch": 0.9428453978669793, + "grad_norm": 0.12764301896095276, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 247710 + }, + { + "epoch": 0.942883460335102, + "grad_norm": 0.11658774316310883, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 247720 + }, + { + "epoch": 0.9429215228032246, + "grad_norm": 0.12862013280391693, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 247730 + }, + { + "epoch": 0.9429595852713474, + "grad_norm": 0.13183024525642395, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 247740 + }, + { + "epoch": 0.94299764773947, + "grad_norm": 0.12923283874988556, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 247750 + }, + { + "epoch": 0.9430357102075927, + "grad_norm": 0.1428668349981308, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 247760 + }, + { + "epoch": 0.9430737726757153, + "grad_norm": 0.1358453780412674, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 247770 + }, + { + "epoch": 0.9431118351438381, + "grad_norm": 0.12302374839782715, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 247780 + }, + { + "epoch": 0.9431498976119608, + "grad_norm": 0.1263979822397232, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 247790 + }, + { + "epoch": 0.9431879600800834, + "grad_norm": 0.13652725517749786, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 247800 + }, + { + "epoch": 0.9432260225482061, + "grad_norm": 0.12320644408464432, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 247810 + }, + { + "epoch": 0.9432640850163287, + "grad_norm": 0.12279729545116425, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 247820 + }, + { + "epoch": 0.9433021474844515, + "grad_norm": 0.14760860800743103, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 247830 + }, + { + "epoch": 0.9433402099525742, + "grad_norm": 0.13388857245445251, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 247840 + }, + { + "epoch": 0.9433782724206968, + "grad_norm": 0.129547119140625, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 247850 + }, + { + "epoch": 0.9434163348888195, + "grad_norm": 0.129354789853096, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 247860 + }, + { + "epoch": 0.9434543973569423, + "grad_norm": 0.12934815883636475, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 247870 + }, + { + "epoch": 0.9434924598250649, + "grad_norm": 0.1424483209848404, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 247880 + }, + { + "epoch": 0.9435305222931876, + "grad_norm": 0.12889623641967773, + "learning_rate": 0.0005, + "loss": 2.0925, + "step": 247890 + }, + { + "epoch": 0.9435685847613102, + "grad_norm": 0.13118860125541687, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 247900 + }, + { + "epoch": 0.943606647229433, + "grad_norm": 0.13081367313861847, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 247910 + }, + { + "epoch": 0.9436447096975557, + "grad_norm": 0.12048673629760742, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 247920 + }, + { + "epoch": 0.9436827721656783, + "grad_norm": 0.12423050403594971, + "learning_rate": 0.0005, + "loss": 2.0875, + "step": 247930 + }, + { + "epoch": 0.943720834633801, + "grad_norm": 0.13558757305145264, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 247940 + }, + { + "epoch": 0.9437588971019236, + "grad_norm": 0.12062255293130875, + "learning_rate": 0.0005, + "loss": 2.1027, + "step": 247950 + }, + { + "epoch": 0.9437969595700464, + "grad_norm": 0.1254424899816513, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 247960 + }, + { + "epoch": 0.943835022038169, + "grad_norm": 0.1260339617729187, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 247970 + }, + { + "epoch": 0.9438730845062917, + "grad_norm": 0.1362634301185608, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 247980 + }, + { + "epoch": 0.9439111469744144, + "grad_norm": 0.12181158363819122, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 247990 + }, + { + "epoch": 0.9439492094425371, + "grad_norm": 0.12834899127483368, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 248000 + }, + { + "epoch": 0.9439872719106598, + "grad_norm": 0.13318775594234467, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 248010 + }, + { + "epoch": 0.9440253343787824, + "grad_norm": 0.11598256975412369, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 248020 + }, + { + "epoch": 0.9440633968469051, + "grad_norm": 0.12331517040729523, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 248030 + }, + { + "epoch": 0.9441014593150279, + "grad_norm": 0.13327628374099731, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 248040 + }, + { + "epoch": 0.9441395217831505, + "grad_norm": 0.127610981464386, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 248050 + }, + { + "epoch": 0.9441775842512732, + "grad_norm": 0.13593189418315887, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 248060 + }, + { + "epoch": 0.9442156467193958, + "grad_norm": 0.12497703731060028, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 248070 + }, + { + "epoch": 0.9442537091875186, + "grad_norm": 0.1429484635591507, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 248080 + }, + { + "epoch": 0.9442917716556413, + "grad_norm": 0.13125035166740417, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 248090 + }, + { + "epoch": 0.9443298341237639, + "grad_norm": 0.13275553286075592, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 248100 + }, + { + "epoch": 0.9443678965918866, + "grad_norm": 0.12504088878631592, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 248110 + }, + { + "epoch": 0.9444059590600092, + "grad_norm": 0.1175403743982315, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 248120 + }, + { + "epoch": 0.944444021528132, + "grad_norm": 0.1290964037179947, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 248130 + }, + { + "epoch": 0.9444820839962547, + "grad_norm": 0.13197334110736847, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 248140 + }, + { + "epoch": 0.9445201464643773, + "grad_norm": 0.12857557833194733, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 248150 + }, + { + "epoch": 0.9445582089325, + "grad_norm": 0.12041082978248596, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 248160 + }, + { + "epoch": 0.9445962714006227, + "grad_norm": 0.12345477193593979, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 248170 + }, + { + "epoch": 0.9446343338687454, + "grad_norm": 0.12356548011302948, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 248180 + }, + { + "epoch": 0.9446723963368681, + "grad_norm": 0.14413070678710938, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 248190 + }, + { + "epoch": 0.9447104588049907, + "grad_norm": 0.13794247806072235, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 248200 + }, + { + "epoch": 0.9447485212731135, + "grad_norm": 0.12572318315505981, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 248210 + }, + { + "epoch": 0.9447865837412361, + "grad_norm": 0.12373986095190048, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 248220 + }, + { + "epoch": 0.9448246462093588, + "grad_norm": 0.1262924075126648, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 248230 + }, + { + "epoch": 0.9448627086774815, + "grad_norm": 0.13063013553619385, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 248240 + }, + { + "epoch": 0.9449007711456041, + "grad_norm": 0.12661738693714142, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 248250 + }, + { + "epoch": 0.9449388336137269, + "grad_norm": 0.12877686321735382, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 248260 + }, + { + "epoch": 0.9449768960818495, + "grad_norm": 0.1472814381122589, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 248270 + }, + { + "epoch": 0.9450149585499722, + "grad_norm": 0.13327832520008087, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 248280 + }, + { + "epoch": 0.9450530210180949, + "grad_norm": 0.14059068262577057, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 248290 + }, + { + "epoch": 0.9450910834862176, + "grad_norm": 0.13199837505817413, + "learning_rate": 0.0005, + "loss": 2.1185, + "step": 248300 + }, + { + "epoch": 0.9451291459543403, + "grad_norm": 0.15628120303153992, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 248310 + }, + { + "epoch": 0.9451672084224629, + "grad_norm": 0.12141703814268112, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 248320 + }, + { + "epoch": 0.9452052708905856, + "grad_norm": 0.13831599056720734, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 248330 + }, + { + "epoch": 0.9452433333587084, + "grad_norm": 0.1294797658920288, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 248340 + }, + { + "epoch": 0.945281395826831, + "grad_norm": 0.1410902440547943, + "learning_rate": 0.0005, + "loss": 2.1107, + "step": 248350 + }, + { + "epoch": 0.9453194582949537, + "grad_norm": 0.1276555210351944, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 248360 + }, + { + "epoch": 0.9453575207630763, + "grad_norm": 0.12058349698781967, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 248370 + }, + { + "epoch": 0.9453955832311991, + "grad_norm": 0.12352532893419266, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 248380 + }, + { + "epoch": 0.9454336456993218, + "grad_norm": 0.13646119832992554, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 248390 + }, + { + "epoch": 0.9454717081674444, + "grad_norm": 0.13688063621520996, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 248400 + }, + { + "epoch": 0.9455097706355671, + "grad_norm": 0.12911947071552277, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 248410 + }, + { + "epoch": 0.9455478331036897, + "grad_norm": 0.12154773622751236, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 248420 + }, + { + "epoch": 0.9455858955718125, + "grad_norm": 0.12716318666934967, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 248430 + }, + { + "epoch": 0.9456239580399352, + "grad_norm": 0.13813437521457672, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 248440 + }, + { + "epoch": 0.9456620205080578, + "grad_norm": 0.14493019878864288, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 248450 + }, + { + "epoch": 0.9457000829761805, + "grad_norm": 0.1254408359527588, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 248460 + }, + { + "epoch": 0.9457381454443032, + "grad_norm": 0.11893883347511292, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 248470 + }, + { + "epoch": 0.9457762079124259, + "grad_norm": 0.12803934514522552, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 248480 + }, + { + "epoch": 0.9458142703805485, + "grad_norm": 0.1335584819316864, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 248490 + }, + { + "epoch": 0.9458523328486712, + "grad_norm": 0.12377738207578659, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 248500 + }, + { + "epoch": 0.945890395316794, + "grad_norm": 0.12790720164775848, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 248510 + }, + { + "epoch": 0.9459284577849166, + "grad_norm": 0.12881170213222504, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 248520 + }, + { + "epoch": 0.9459665202530393, + "grad_norm": 0.12704525887966156, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 248530 + }, + { + "epoch": 0.946004582721162, + "grad_norm": 0.1225334033370018, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 248540 + }, + { + "epoch": 0.9460426451892846, + "grad_norm": 0.1326608657836914, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 248550 + }, + { + "epoch": 0.9460807076574074, + "grad_norm": 0.13305629789829254, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 248560 + }, + { + "epoch": 0.94611877012553, + "grad_norm": 0.1200888454914093, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 248570 + }, + { + "epoch": 0.9461568325936527, + "grad_norm": 0.1219412088394165, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 248580 + }, + { + "epoch": 0.9461948950617753, + "grad_norm": 0.12386654317378998, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 248590 + }, + { + "epoch": 0.9462329575298981, + "grad_norm": 0.13242636620998383, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 248600 + }, + { + "epoch": 0.9462710199980208, + "grad_norm": 0.13440661132335663, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 248610 + }, + { + "epoch": 0.9463090824661434, + "grad_norm": 0.12407206743955612, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 248620 + }, + { + "epoch": 0.9463471449342661, + "grad_norm": 0.1191297248005867, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 248630 + }, + { + "epoch": 0.9463852074023889, + "grad_norm": 0.13571171462535858, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 248640 + }, + { + "epoch": 0.9464232698705115, + "grad_norm": 0.11383175104856491, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 248650 + }, + { + "epoch": 0.9464613323386342, + "grad_norm": 0.11983030289411545, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 248660 + }, + { + "epoch": 0.9464993948067568, + "grad_norm": 0.1278083771467209, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 248670 + }, + { + "epoch": 0.9465374572748795, + "grad_norm": 0.13592961430549622, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 248680 + }, + { + "epoch": 0.9465755197430022, + "grad_norm": 0.12375950068235397, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 248690 + }, + { + "epoch": 0.9466135822111249, + "grad_norm": 0.12705379724502563, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 248700 + }, + { + "epoch": 0.9466516446792476, + "grad_norm": 0.1328490674495697, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 248710 + }, + { + "epoch": 0.9466897071473702, + "grad_norm": 0.12585778534412384, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 248720 + }, + { + "epoch": 0.946727769615493, + "grad_norm": 0.12267450988292694, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 248730 + }, + { + "epoch": 0.9467658320836156, + "grad_norm": 0.12765011191368103, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 248740 + }, + { + "epoch": 0.9468038945517383, + "grad_norm": 0.1310986429452896, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 248750 + }, + { + "epoch": 0.946841957019861, + "grad_norm": 0.11241777241230011, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 248760 + }, + { + "epoch": 0.9468800194879837, + "grad_norm": 0.12467561662197113, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 248770 + }, + { + "epoch": 0.9469180819561064, + "grad_norm": 0.13519595563411713, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 248780 + }, + { + "epoch": 0.946956144424229, + "grad_norm": 0.12287317216396332, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 248790 + }, + { + "epoch": 0.9469942068923517, + "grad_norm": 0.324131041765213, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 248800 + }, + { + "epoch": 0.9470322693604745, + "grad_norm": 0.15612849593162537, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 248810 + }, + { + "epoch": 0.9470703318285971, + "grad_norm": 0.12411175668239594, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 248820 + }, + { + "epoch": 0.9471083942967198, + "grad_norm": 0.11235556751489639, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 248830 + }, + { + "epoch": 0.9471464567648424, + "grad_norm": 0.12063275277614594, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 248840 + }, + { + "epoch": 0.9471845192329651, + "grad_norm": 0.12278567999601364, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 248850 + }, + { + "epoch": 0.9472225817010879, + "grad_norm": 0.13061478734016418, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 248860 + }, + { + "epoch": 0.9472606441692105, + "grad_norm": 0.1256982833147049, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 248870 + }, + { + "epoch": 0.9472987066373332, + "grad_norm": 0.11956235766410828, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 248880 + }, + { + "epoch": 0.9473367691054558, + "grad_norm": 0.11729145050048828, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 248890 + }, + { + "epoch": 0.9473748315735786, + "grad_norm": 0.15000468492507935, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 248900 + }, + { + "epoch": 0.9474128940417013, + "grad_norm": 0.12541407346725464, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 248910 + }, + { + "epoch": 0.9474509565098239, + "grad_norm": 0.13246984779834747, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 248920 + }, + { + "epoch": 0.9474890189779466, + "grad_norm": 0.12549960613250732, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 248930 + }, + { + "epoch": 0.9475270814460693, + "grad_norm": 0.1670214980840683, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 248940 + }, + { + "epoch": 0.947565143914192, + "grad_norm": 0.1267949342727661, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 248950 + }, + { + "epoch": 0.9476032063823147, + "grad_norm": 0.13572907447814941, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 248960 + }, + { + "epoch": 0.9476412688504373, + "grad_norm": 0.1221279725432396, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 248970 + }, + { + "epoch": 0.94767933131856, + "grad_norm": 0.13833478093147278, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 248980 + }, + { + "epoch": 0.9477173937866827, + "grad_norm": 0.1366894692182541, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 248990 + }, + { + "epoch": 0.9477554562548054, + "grad_norm": 0.12488869577646255, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 249000 + }, + { + "epoch": 0.947793518722928, + "grad_norm": 0.12555605173110962, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 249010 + }, + { + "epoch": 0.9478315811910507, + "grad_norm": 0.12620453536510468, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 249020 + }, + { + "epoch": 0.9478696436591735, + "grad_norm": 0.11639034003019333, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 249030 + }, + { + "epoch": 0.9479077061272961, + "grad_norm": 0.1186845600605011, + "learning_rate": 0.0005, + "loss": 2.0902, + "step": 249040 + }, + { + "epoch": 0.9479457685954188, + "grad_norm": 0.1193389892578125, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 249050 + }, + { + "epoch": 0.9479838310635414, + "grad_norm": 0.12538054585456848, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 249060 + }, + { + "epoch": 0.9480218935316642, + "grad_norm": 0.14422617852687836, + "learning_rate": 0.0005, + "loss": 2.1313, + "step": 249070 + }, + { + "epoch": 0.9480599559997869, + "grad_norm": 0.12844279408454895, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 249080 + }, + { + "epoch": 0.9480980184679095, + "grad_norm": 0.14020352065563202, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 249090 + }, + { + "epoch": 0.9481360809360322, + "grad_norm": 0.134184330701828, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 249100 + }, + { + "epoch": 0.9481741434041548, + "grad_norm": 0.1266484558582306, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 249110 + }, + { + "epoch": 0.9482122058722776, + "grad_norm": 0.1303141862154007, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 249120 + }, + { + "epoch": 0.9482502683404003, + "grad_norm": 0.135605126619339, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 249130 + }, + { + "epoch": 0.9482883308085229, + "grad_norm": 0.1270579844713211, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 249140 + }, + { + "epoch": 0.9483263932766456, + "grad_norm": 0.13215014338493347, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 249150 + }, + { + "epoch": 0.9483644557447684, + "grad_norm": 0.1274470090866089, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 249160 + }, + { + "epoch": 0.948402518212891, + "grad_norm": 0.12177138775587082, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 249170 + }, + { + "epoch": 0.9484405806810137, + "grad_norm": 0.13077914714813232, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 249180 + }, + { + "epoch": 0.9484786431491363, + "grad_norm": 0.12208457291126251, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 249190 + }, + { + "epoch": 0.9485167056172591, + "grad_norm": 0.11963622272014618, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 249200 + }, + { + "epoch": 0.9485547680853817, + "grad_norm": 0.12783189117908478, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 249210 + }, + { + "epoch": 0.9485928305535044, + "grad_norm": 0.12605759501457214, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 249220 + }, + { + "epoch": 0.9486308930216271, + "grad_norm": 0.12036082148551941, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 249230 + }, + { + "epoch": 0.9486689554897498, + "grad_norm": 0.1330842226743698, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 249240 + }, + { + "epoch": 0.9487070179578725, + "grad_norm": 0.12136498838663101, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 249250 + }, + { + "epoch": 0.9487450804259951, + "grad_norm": 0.12630410492420197, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 249260 + }, + { + "epoch": 0.9487831428941178, + "grad_norm": 0.11599517613649368, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 249270 + }, + { + "epoch": 0.9488212053622405, + "grad_norm": 0.12922899425029755, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 249280 + }, + { + "epoch": 0.9488592678303632, + "grad_norm": 0.1108672022819519, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 249290 + }, + { + "epoch": 0.9488973302984859, + "grad_norm": 0.12226884067058563, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 249300 + }, + { + "epoch": 0.9489353927666085, + "grad_norm": 0.13119786977767944, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 249310 + }, + { + "epoch": 0.9489734552347312, + "grad_norm": 0.13422635197639465, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 249320 + }, + { + "epoch": 0.949011517702854, + "grad_norm": 0.13358595967292786, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 249330 + }, + { + "epoch": 0.9490495801709766, + "grad_norm": 0.1265745759010315, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 249340 + }, + { + "epoch": 0.9490876426390993, + "grad_norm": 0.12609122693538666, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 249350 + }, + { + "epoch": 0.9491257051072219, + "grad_norm": 0.13447849452495575, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 249360 + }, + { + "epoch": 0.9491637675753447, + "grad_norm": 0.12742824852466583, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 249370 + }, + { + "epoch": 0.9492018300434674, + "grad_norm": 0.13406914472579956, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 249380 + }, + { + "epoch": 0.94923989251159, + "grad_norm": 0.11564747989177704, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 249390 + }, + { + "epoch": 0.9492779549797127, + "grad_norm": 0.1280737817287445, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 249400 + }, + { + "epoch": 0.9493160174478353, + "grad_norm": 0.12426292151212692, + "learning_rate": 0.0005, + "loss": 2.0874, + "step": 249410 + }, + { + "epoch": 0.9493540799159581, + "grad_norm": 0.16510789096355438, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 249420 + }, + { + "epoch": 0.9493921423840808, + "grad_norm": 0.12697450816631317, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 249430 + }, + { + "epoch": 0.9494302048522034, + "grad_norm": 0.12335682660341263, + "learning_rate": 0.0005, + "loss": 2.0911, + "step": 249440 + }, + { + "epoch": 0.9494682673203261, + "grad_norm": 0.1332320123910904, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 249450 + }, + { + "epoch": 0.9495063297884488, + "grad_norm": 0.1257762312889099, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 249460 + }, + { + "epoch": 0.9495443922565715, + "grad_norm": 0.1309550553560257, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 249470 + }, + { + "epoch": 0.9495824547246942, + "grad_norm": 0.12939979135990143, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 249480 + }, + { + "epoch": 0.9496205171928168, + "grad_norm": 0.14679569005966187, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 249490 + }, + { + "epoch": 0.9496585796609396, + "grad_norm": 0.12528207898139954, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 249500 + }, + { + "epoch": 0.9496966421290622, + "grad_norm": 0.12247055023908615, + "learning_rate": 0.0005, + "loss": 2.1172, + "step": 249510 + }, + { + "epoch": 0.9497347045971849, + "grad_norm": 0.12854456901550293, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 249520 + }, + { + "epoch": 0.9497727670653076, + "grad_norm": 0.12855258584022522, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 249530 + }, + { + "epoch": 0.9498108295334302, + "grad_norm": 0.12317433953285217, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 249540 + }, + { + "epoch": 0.949848892001553, + "grad_norm": 0.12889666855335236, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 249550 + }, + { + "epoch": 0.9498869544696756, + "grad_norm": 0.14366105198860168, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 249560 + }, + { + "epoch": 0.9499250169377983, + "grad_norm": 0.12399886548519135, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 249570 + }, + { + "epoch": 0.949963079405921, + "grad_norm": 0.135155588388443, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 249580 + }, + { + "epoch": 0.9500011418740437, + "grad_norm": 0.1358819603919983, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 249590 + }, + { + "epoch": 0.9500392043421664, + "grad_norm": 0.14831727743148804, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 249600 + }, + { + "epoch": 0.950077266810289, + "grad_norm": 0.12076780200004578, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 249610 + }, + { + "epoch": 0.9501153292784117, + "grad_norm": 0.13320599496364594, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 249620 + }, + { + "epoch": 0.9501533917465345, + "grad_norm": 0.1212395578622818, + "learning_rate": 0.0005, + "loss": 2.0867, + "step": 249630 + }, + { + "epoch": 0.9501914542146571, + "grad_norm": 0.1279253512620926, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 249640 + }, + { + "epoch": 0.9502295166827798, + "grad_norm": 0.1288677155971527, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 249650 + }, + { + "epoch": 0.9502675791509024, + "grad_norm": 0.13300663232803345, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 249660 + }, + { + "epoch": 0.9503056416190252, + "grad_norm": 0.1199730783700943, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 249670 + }, + { + "epoch": 0.9503437040871479, + "grad_norm": 0.1486423760652542, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 249680 + }, + { + "epoch": 0.9503817665552705, + "grad_norm": 0.12857380509376526, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 249690 + }, + { + "epoch": 0.9504198290233932, + "grad_norm": 0.1329220086336136, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 249700 + }, + { + "epoch": 0.9504578914915158, + "grad_norm": 0.1408364474773407, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 249710 + }, + { + "epoch": 0.9504959539596386, + "grad_norm": 0.12741191685199738, + "learning_rate": 0.0005, + "loss": 2.0829, + "step": 249720 + }, + { + "epoch": 0.9505340164277613, + "grad_norm": 0.13314951956272125, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 249730 + }, + { + "epoch": 0.9505720788958839, + "grad_norm": 0.38605743646621704, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 249740 + }, + { + "epoch": 0.9506101413640066, + "grad_norm": 0.13181251287460327, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 249750 + }, + { + "epoch": 0.9506482038321293, + "grad_norm": 0.13556815683841705, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 249760 + }, + { + "epoch": 0.950686266300252, + "grad_norm": 0.12179868668317795, + "learning_rate": 0.0005, + "loss": 2.0837, + "step": 249770 + }, + { + "epoch": 0.9507243287683746, + "grad_norm": 0.14233249425888062, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 249780 + }, + { + "epoch": 0.9507623912364973, + "grad_norm": 0.13981717824935913, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 249790 + }, + { + "epoch": 0.9508004537046201, + "grad_norm": 0.14350582659244537, + "learning_rate": 0.0005, + "loss": 2.0909, + "step": 249800 + }, + { + "epoch": 0.9508385161727427, + "grad_norm": 0.14137966930866241, + "learning_rate": 0.0005, + "loss": 2.1323, + "step": 249810 + }, + { + "epoch": 0.9508765786408654, + "grad_norm": 0.1286575198173523, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 249820 + }, + { + "epoch": 0.950914641108988, + "grad_norm": 0.12449745088815689, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 249830 + }, + { + "epoch": 0.9509527035771107, + "grad_norm": 0.12966904044151306, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 249840 + }, + { + "epoch": 0.9509907660452335, + "grad_norm": 0.13560040295124054, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 249850 + }, + { + "epoch": 0.9510288285133561, + "grad_norm": 0.12174062430858612, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 249860 + }, + { + "epoch": 0.9510668909814788, + "grad_norm": 0.12325585633516312, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 249870 + }, + { + "epoch": 0.9511049534496014, + "grad_norm": 0.13168565928936005, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 249880 + }, + { + "epoch": 0.9511430159177242, + "grad_norm": 0.11944977194070816, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 249890 + }, + { + "epoch": 0.9511810783858469, + "grad_norm": 0.12736931443214417, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 249900 + }, + { + "epoch": 0.9512191408539695, + "grad_norm": 0.11464428901672363, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 249910 + }, + { + "epoch": 0.9512572033220922, + "grad_norm": 0.1317955106496811, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 249920 + }, + { + "epoch": 0.951295265790215, + "grad_norm": 0.1277564913034439, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 249930 + }, + { + "epoch": 0.9513333282583376, + "grad_norm": 0.13540315628051758, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 249940 + }, + { + "epoch": 0.9513713907264603, + "grad_norm": 0.13464035093784332, + "learning_rate": 0.0005, + "loss": 2.0946, + "step": 249950 + }, + { + "epoch": 0.9514094531945829, + "grad_norm": 0.1361413598060608, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 249960 + }, + { + "epoch": 0.9514475156627056, + "grad_norm": 0.1233205646276474, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 249970 + }, + { + "epoch": 0.9514855781308283, + "grad_norm": 0.13336284458637238, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 249980 + }, + { + "epoch": 0.951523640598951, + "grad_norm": 0.12346182018518448, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 249990 + }, + { + "epoch": 0.9515617030670737, + "grad_norm": 0.12522675096988678, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 250000 + }, + { + "epoch": 0.9515997655351963, + "grad_norm": 0.12237617373466492, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 250010 + }, + { + "epoch": 0.9516378280033191, + "grad_norm": 0.13073542714118958, + "learning_rate": 0.0005, + "loss": 2.085, + "step": 250020 + }, + { + "epoch": 0.9516758904714417, + "grad_norm": 0.13486787676811218, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 250030 + }, + { + "epoch": 0.9517139529395644, + "grad_norm": 0.11351048201322556, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 250040 + }, + { + "epoch": 0.9517520154076871, + "grad_norm": 0.13018272817134857, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 250050 + }, + { + "epoch": 0.9517900778758098, + "grad_norm": 0.12174280732870102, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 250060 + }, + { + "epoch": 0.9518281403439325, + "grad_norm": 0.12217634916305542, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 250070 + }, + { + "epoch": 0.9518662028120551, + "grad_norm": 0.11637187749147415, + "learning_rate": 0.0005, + "loss": 2.0821, + "step": 250080 + }, + { + "epoch": 0.9519042652801778, + "grad_norm": 0.13275794684886932, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 250090 + }, + { + "epoch": 0.9519423277483006, + "grad_norm": 0.12678100168704987, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 250100 + }, + { + "epoch": 0.9519803902164232, + "grad_norm": 0.12198396027088165, + "learning_rate": 0.0005, + "loss": 2.1168, + "step": 250110 + }, + { + "epoch": 0.9520184526845459, + "grad_norm": 0.13225769996643066, + "learning_rate": 0.0005, + "loss": 2.0756, + "step": 250120 + }, + { + "epoch": 0.9520565151526685, + "grad_norm": 0.12288650125265121, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 250130 + }, + { + "epoch": 0.9520945776207912, + "grad_norm": 0.1239933893084526, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 250140 + }, + { + "epoch": 0.952132640088914, + "grad_norm": 0.12927429378032684, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 250150 + }, + { + "epoch": 0.9521707025570366, + "grad_norm": 0.12449338287115097, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 250160 + }, + { + "epoch": 0.9522087650251593, + "grad_norm": 0.11477695405483246, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 250170 + }, + { + "epoch": 0.9522468274932819, + "grad_norm": 0.12830223143100739, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 250180 + }, + { + "epoch": 0.9522848899614047, + "grad_norm": 0.12702523171901703, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 250190 + }, + { + "epoch": 0.9523229524295274, + "grad_norm": 0.12919588387012482, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 250200 + }, + { + "epoch": 0.95236101489765, + "grad_norm": 0.127348393201828, + "learning_rate": 0.0005, + "loss": 2.0885, + "step": 250210 + }, + { + "epoch": 0.9523990773657727, + "grad_norm": 0.11610375344753265, + "learning_rate": 0.0005, + "loss": 2.0882, + "step": 250220 + }, + { + "epoch": 0.9524371398338954, + "grad_norm": 0.12946386635303497, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 250230 + }, + { + "epoch": 0.9524752023020181, + "grad_norm": 0.1378440260887146, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 250240 + }, + { + "epoch": 0.9525132647701408, + "grad_norm": 0.13060183823108673, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 250250 + }, + { + "epoch": 0.9525513272382634, + "grad_norm": 0.12115290760993958, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 250260 + }, + { + "epoch": 0.9525893897063861, + "grad_norm": 0.15255451202392578, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 250270 + }, + { + "epoch": 0.9526274521745088, + "grad_norm": 0.13809344172477722, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 250280 + }, + { + "epoch": 0.9526655146426315, + "grad_norm": 0.12919829785823822, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 250290 + }, + { + "epoch": 0.9527035771107542, + "grad_norm": 0.12677544355392456, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 250300 + }, + { + "epoch": 0.9527416395788768, + "grad_norm": 0.13873299956321716, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 250310 + }, + { + "epoch": 0.9527797020469996, + "grad_norm": 0.1411902904510498, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 250320 + }, + { + "epoch": 0.9528177645151222, + "grad_norm": 0.11741477996110916, + "learning_rate": 0.0005, + "loss": 2.0815, + "step": 250330 + }, + { + "epoch": 0.9528558269832449, + "grad_norm": 0.13320787250995636, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 250340 + }, + { + "epoch": 0.9528938894513675, + "grad_norm": 0.11807240545749664, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 250350 + }, + { + "epoch": 0.9529319519194903, + "grad_norm": 0.13958625495433807, + "learning_rate": 0.0005, + "loss": 2.0908, + "step": 250360 + }, + { + "epoch": 0.952970014387613, + "grad_norm": 0.12476134300231934, + "learning_rate": 0.0005, + "loss": 2.1189, + "step": 250370 + }, + { + "epoch": 0.9530080768557356, + "grad_norm": 0.13018640875816345, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 250380 + }, + { + "epoch": 0.9530461393238583, + "grad_norm": 0.12088967114686966, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 250390 + }, + { + "epoch": 0.953084201791981, + "grad_norm": 0.11502324044704437, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 250400 + }, + { + "epoch": 0.9531222642601037, + "grad_norm": 0.12260492146015167, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 250410 + }, + { + "epoch": 0.9531603267282264, + "grad_norm": 0.12876258790493011, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 250420 + }, + { + "epoch": 0.953198389196349, + "grad_norm": 0.13146209716796875, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 250430 + }, + { + "epoch": 0.9532364516644717, + "grad_norm": 0.1314568817615509, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 250440 + }, + { + "epoch": 0.9532745141325945, + "grad_norm": 0.12796203792095184, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 250450 + }, + { + "epoch": 0.9533125766007171, + "grad_norm": 0.14050385355949402, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 250460 + }, + { + "epoch": 0.9533506390688398, + "grad_norm": 0.12566983699798584, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 250470 + }, + { + "epoch": 0.9533887015369624, + "grad_norm": 0.13052697479724884, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 250480 + }, + { + "epoch": 0.9534267640050852, + "grad_norm": 0.1295281946659088, + "learning_rate": 0.0005, + "loss": 2.0874, + "step": 250490 + }, + { + "epoch": 0.9534648264732078, + "grad_norm": 0.12345805764198303, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 250500 + }, + { + "epoch": 0.9535028889413305, + "grad_norm": 0.14121460914611816, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 250510 + }, + { + "epoch": 0.9535409514094532, + "grad_norm": 0.12259361147880554, + "learning_rate": 0.0005, + "loss": 2.0837, + "step": 250520 + }, + { + "epoch": 0.9535790138775759, + "grad_norm": 0.1267411708831787, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 250530 + }, + { + "epoch": 0.9536170763456986, + "grad_norm": 0.12196764349937439, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 250540 + }, + { + "epoch": 0.9536551388138212, + "grad_norm": 0.1457517147064209, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 250550 + }, + { + "epoch": 0.9536932012819439, + "grad_norm": 0.1303631216287613, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 250560 + }, + { + "epoch": 0.9537312637500666, + "grad_norm": 0.12143102288246155, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 250570 + }, + { + "epoch": 0.9537693262181893, + "grad_norm": 0.14176031947135925, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 250580 + }, + { + "epoch": 0.953807388686312, + "grad_norm": 0.13000178337097168, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 250590 + }, + { + "epoch": 0.9538454511544346, + "grad_norm": 0.1253688484430313, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 250600 + }, + { + "epoch": 0.9538835136225573, + "grad_norm": 0.12691253423690796, + "learning_rate": 0.0005, + "loss": 2.0735, + "step": 250610 + }, + { + "epoch": 0.9539215760906801, + "grad_norm": 0.12398819625377655, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 250620 + }, + { + "epoch": 0.9539596385588027, + "grad_norm": 0.12237461656332016, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 250630 + }, + { + "epoch": 0.9539977010269254, + "grad_norm": 0.12592361867427826, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 250640 + }, + { + "epoch": 0.954035763495048, + "grad_norm": 0.1414032280445099, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 250650 + }, + { + "epoch": 0.9540738259631708, + "grad_norm": 0.13277390599250793, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 250660 + }, + { + "epoch": 0.9541118884312935, + "grad_norm": 0.12286174297332764, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 250670 + }, + { + "epoch": 0.9541499508994161, + "grad_norm": 0.11582817882299423, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 250680 + }, + { + "epoch": 0.9541880133675388, + "grad_norm": 0.13498404622077942, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 250690 + }, + { + "epoch": 0.9542260758356614, + "grad_norm": 0.14900074899196625, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 250700 + }, + { + "epoch": 0.9542641383037842, + "grad_norm": 0.12041672319173813, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 250710 + }, + { + "epoch": 0.9543022007719069, + "grad_norm": 0.13749726116657257, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 250720 + }, + { + "epoch": 0.9543402632400295, + "grad_norm": 0.12858223915100098, + "learning_rate": 0.0005, + "loss": 2.1169, + "step": 250730 + }, + { + "epoch": 0.9543783257081522, + "grad_norm": 0.14657242596149445, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 250740 + }, + { + "epoch": 0.9544163881762749, + "grad_norm": 0.12545934319496155, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 250750 + }, + { + "epoch": 0.9544544506443976, + "grad_norm": 0.1151088997721672, + "learning_rate": 0.0005, + "loss": 2.0965, + "step": 250760 + }, + { + "epoch": 0.9544925131125203, + "grad_norm": 0.13286983966827393, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 250770 + }, + { + "epoch": 0.9545305755806429, + "grad_norm": 0.1329016238451004, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 250780 + }, + { + "epoch": 0.9545686380487657, + "grad_norm": 0.1329929530620575, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 250790 + }, + { + "epoch": 0.9546067005168883, + "grad_norm": 0.1350843906402588, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 250800 + }, + { + "epoch": 0.954644762985011, + "grad_norm": 0.12170596420764923, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 250810 + }, + { + "epoch": 0.9546828254531337, + "grad_norm": 0.12434626370668411, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 250820 + }, + { + "epoch": 0.9547208879212563, + "grad_norm": 0.14334000647068024, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 250830 + }, + { + "epoch": 0.9547589503893791, + "grad_norm": 0.14487650990486145, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 250840 + }, + { + "epoch": 0.9547970128575017, + "grad_norm": 0.12593317031860352, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 250850 + }, + { + "epoch": 0.9548350753256244, + "grad_norm": 0.14340145885944366, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 250860 + }, + { + "epoch": 0.954873137793747, + "grad_norm": 0.13837972283363342, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 250870 + }, + { + "epoch": 0.9549112002618698, + "grad_norm": 0.12254208326339722, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 250880 + }, + { + "epoch": 0.9549492627299925, + "grad_norm": 0.12815473973751068, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 250890 + }, + { + "epoch": 0.9549873251981151, + "grad_norm": 0.12106409668922424, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 250900 + }, + { + "epoch": 0.9550253876662378, + "grad_norm": 0.12017683684825897, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 250910 + }, + { + "epoch": 0.9550634501343606, + "grad_norm": 0.12627241015434265, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 250920 + }, + { + "epoch": 0.9551015126024832, + "grad_norm": 0.1460886001586914, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 250930 + }, + { + "epoch": 0.9551395750706059, + "grad_norm": 0.11303062736988068, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 250940 + }, + { + "epoch": 0.9551776375387285, + "grad_norm": 0.1309528350830078, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 250950 + }, + { + "epoch": 0.9552157000068513, + "grad_norm": 0.12723702192306519, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 250960 + }, + { + "epoch": 0.955253762474974, + "grad_norm": 0.12411960959434509, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 250970 + }, + { + "epoch": 0.9552918249430966, + "grad_norm": 0.1408742070198059, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 250980 + }, + { + "epoch": 0.9553298874112193, + "grad_norm": 0.12967275083065033, + "learning_rate": 0.0005, + "loss": 2.1009, + "step": 250990 + }, + { + "epoch": 0.9553679498793419, + "grad_norm": 0.1268027126789093, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 251000 + }, + { + "epoch": 0.9554060123474647, + "grad_norm": 0.12056492269039154, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 251010 + }, + { + "epoch": 0.9554440748155874, + "grad_norm": 0.11946653574705124, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 251020 + }, + { + "epoch": 0.95548213728371, + "grad_norm": 0.12903720140457153, + "learning_rate": 0.0005, + "loss": 2.1178, + "step": 251030 + }, + { + "epoch": 0.9555201997518327, + "grad_norm": 0.1267389953136444, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 251040 + }, + { + "epoch": 0.9555582622199554, + "grad_norm": 0.13117201626300812, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 251050 + }, + { + "epoch": 0.9555963246880781, + "grad_norm": 0.1332862228155136, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 251060 + }, + { + "epoch": 0.9556343871562007, + "grad_norm": 0.12589438259601593, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 251070 + }, + { + "epoch": 0.9556724496243234, + "grad_norm": 0.1279170662164688, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 251080 + }, + { + "epoch": 0.9557105120924462, + "grad_norm": 0.14559973776340485, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 251090 + }, + { + "epoch": 0.9557485745605688, + "grad_norm": 0.13227277994155884, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 251100 + }, + { + "epoch": 0.9557866370286915, + "grad_norm": 0.11522812396287918, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 251110 + }, + { + "epoch": 0.9558246994968141, + "grad_norm": 0.12452112138271332, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 251120 + }, + { + "epoch": 0.9558627619649368, + "grad_norm": 0.12298731505870819, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 251130 + }, + { + "epoch": 0.9559008244330596, + "grad_norm": 0.1283821314573288, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 251140 + }, + { + "epoch": 0.9559388869011822, + "grad_norm": 0.13891930878162384, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 251150 + }, + { + "epoch": 0.9559769493693049, + "grad_norm": 0.13013017177581787, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 251160 + }, + { + "epoch": 0.9560150118374275, + "grad_norm": 0.1186857670545578, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 251170 + }, + { + "epoch": 0.9560530743055503, + "grad_norm": 0.12192530930042267, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 251180 + }, + { + "epoch": 0.956091136773673, + "grad_norm": 0.12259349972009659, + "learning_rate": 0.0005, + "loss": 2.1232, + "step": 251190 + }, + { + "epoch": 0.9561291992417956, + "grad_norm": 0.12459017336368561, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 251200 + }, + { + "epoch": 0.9561672617099183, + "grad_norm": 0.11912458389997482, + "learning_rate": 0.0005, + "loss": 2.0879, + "step": 251210 + }, + { + "epoch": 0.956205324178041, + "grad_norm": 0.12418442964553833, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 251220 + }, + { + "epoch": 0.9562433866461637, + "grad_norm": 0.13457264006137848, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 251230 + }, + { + "epoch": 0.9562814491142864, + "grad_norm": 0.1312561333179474, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 251240 + }, + { + "epoch": 0.956319511582409, + "grad_norm": 0.13382896780967712, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 251250 + }, + { + "epoch": 0.9563575740505317, + "grad_norm": 0.14410291612148285, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 251260 + }, + { + "epoch": 0.9563956365186544, + "grad_norm": 0.13317954540252686, + "learning_rate": 0.0005, + "loss": 2.1252, + "step": 251270 + }, + { + "epoch": 0.9564336989867771, + "grad_norm": 0.1316411942243576, + "learning_rate": 0.0005, + "loss": 2.0885, + "step": 251280 + }, + { + "epoch": 0.9564717614548998, + "grad_norm": 0.1131930723786354, + "learning_rate": 0.0005, + "loss": 2.0913, + "step": 251290 + }, + { + "epoch": 0.9565098239230224, + "grad_norm": 0.11551398038864136, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 251300 + }, + { + "epoch": 0.9565478863911452, + "grad_norm": 0.12596704065799713, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 251310 + }, + { + "epoch": 0.9565859488592678, + "grad_norm": 0.13516521453857422, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 251320 + }, + { + "epoch": 0.9566240113273905, + "grad_norm": 0.12555758655071259, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 251330 + }, + { + "epoch": 0.9566620737955132, + "grad_norm": 0.1379236876964569, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 251340 + }, + { + "epoch": 0.9567001362636359, + "grad_norm": 0.12100891023874283, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 251350 + }, + { + "epoch": 0.9567381987317586, + "grad_norm": 0.12747138738632202, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 251360 + }, + { + "epoch": 0.9567762611998812, + "grad_norm": 0.12468067556619644, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 251370 + }, + { + "epoch": 0.9568143236680039, + "grad_norm": 0.1281834840774536, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 251380 + }, + { + "epoch": 0.9568523861361267, + "grad_norm": 0.14125867187976837, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 251390 + }, + { + "epoch": 0.9568904486042493, + "grad_norm": 0.17697103321552277, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 251400 + }, + { + "epoch": 0.956928511072372, + "grad_norm": 0.12031698226928711, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 251410 + }, + { + "epoch": 0.9569665735404946, + "grad_norm": 0.11731946468353271, + "learning_rate": 0.0005, + "loss": 2.091, + "step": 251420 + }, + { + "epoch": 0.9570046360086173, + "grad_norm": 0.13509485125541687, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 251430 + }, + { + "epoch": 0.9570426984767401, + "grad_norm": 0.1525716781616211, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 251440 + }, + { + "epoch": 0.9570807609448627, + "grad_norm": 0.12425057590007782, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 251450 + }, + { + "epoch": 0.9571188234129854, + "grad_norm": 0.13297343254089355, + "learning_rate": 0.0005, + "loss": 2.0951, + "step": 251460 + }, + { + "epoch": 0.957156885881108, + "grad_norm": 0.13009850680828094, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 251470 + }, + { + "epoch": 0.9571949483492308, + "grad_norm": 0.1400795876979828, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 251480 + }, + { + "epoch": 0.9572330108173535, + "grad_norm": 0.12518252432346344, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 251490 + }, + { + "epoch": 0.9572710732854761, + "grad_norm": 0.1322515457868576, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 251500 + }, + { + "epoch": 0.9573091357535988, + "grad_norm": 0.12189028412103653, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 251510 + }, + { + "epoch": 0.9573471982217215, + "grad_norm": 0.12501998245716095, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 251520 + }, + { + "epoch": 0.9573852606898442, + "grad_norm": 0.13441771268844604, + "learning_rate": 0.0005, + "loss": 2.0733, + "step": 251530 + }, + { + "epoch": 0.9574233231579669, + "grad_norm": 0.13036592304706573, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 251540 + }, + { + "epoch": 0.9574613856260895, + "grad_norm": 0.12414134293794632, + "learning_rate": 0.0005, + "loss": 2.0805, + "step": 251550 + }, + { + "epoch": 0.9574994480942122, + "grad_norm": 0.12663614749908447, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 251560 + }, + { + "epoch": 0.9575375105623349, + "grad_norm": 0.13920724391937256, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 251570 + }, + { + "epoch": 0.9575755730304576, + "grad_norm": 0.3212215006351471, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 251580 + }, + { + "epoch": 0.9576136354985803, + "grad_norm": 0.1401003748178482, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 251590 + }, + { + "epoch": 0.9576516979667029, + "grad_norm": 0.1583040952682495, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 251600 + }, + { + "epoch": 0.9576897604348257, + "grad_norm": 0.13942931592464447, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 251610 + }, + { + "epoch": 0.9577278229029483, + "grad_norm": 0.12837275862693787, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 251620 + }, + { + "epoch": 0.957765885371071, + "grad_norm": 0.12775376439094543, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 251630 + }, + { + "epoch": 0.9578039478391936, + "grad_norm": 0.12998957931995392, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 251640 + }, + { + "epoch": 0.9578420103073164, + "grad_norm": 0.12760597467422485, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 251650 + }, + { + "epoch": 0.9578800727754391, + "grad_norm": 0.12280615419149399, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 251660 + }, + { + "epoch": 0.9579181352435617, + "grad_norm": 0.1304440051317215, + "learning_rate": 0.0005, + "loss": 2.0834, + "step": 251670 + }, + { + "epoch": 0.9579561977116844, + "grad_norm": 0.1276025027036667, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 251680 + }, + { + "epoch": 0.957994260179807, + "grad_norm": 0.13778327405452728, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 251690 + }, + { + "epoch": 0.9580323226479298, + "grad_norm": 0.13166651129722595, + "learning_rate": 0.0005, + "loss": 2.1152, + "step": 251700 + }, + { + "epoch": 0.9580703851160525, + "grad_norm": 0.12186914682388306, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 251710 + }, + { + "epoch": 0.9581084475841751, + "grad_norm": 0.13413405418395996, + "learning_rate": 0.0005, + "loss": 2.1076, + "step": 251720 + }, + { + "epoch": 0.9581465100522978, + "grad_norm": 0.1275538057088852, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 251730 + }, + { + "epoch": 0.9581845725204206, + "grad_norm": 0.14755627512931824, + "learning_rate": 0.0005, + "loss": 2.0927, + "step": 251740 + }, + { + "epoch": 0.9582226349885432, + "grad_norm": 0.12701916694641113, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 251750 + }, + { + "epoch": 0.9582606974566659, + "grad_norm": 0.12311451882123947, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 251760 + }, + { + "epoch": 0.9582987599247885, + "grad_norm": 0.1216728538274765, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 251770 + }, + { + "epoch": 0.9583368223929113, + "grad_norm": 0.1265268474817276, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 251780 + }, + { + "epoch": 0.958374884861034, + "grad_norm": 0.12738136947155, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 251790 + }, + { + "epoch": 0.9584129473291566, + "grad_norm": 0.1373567283153534, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 251800 + }, + { + "epoch": 0.9584510097972793, + "grad_norm": 0.12077868729829788, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 251810 + }, + { + "epoch": 0.958489072265402, + "grad_norm": 0.1335574835538864, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 251820 + }, + { + "epoch": 0.9585271347335247, + "grad_norm": 0.13100647926330566, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 251830 + }, + { + "epoch": 0.9585651972016473, + "grad_norm": 0.12595607340335846, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 251840 + }, + { + "epoch": 0.95860325966977, + "grad_norm": 0.1261446624994278, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 251850 + }, + { + "epoch": 0.9586413221378927, + "grad_norm": 0.12131789326667786, + "learning_rate": 0.0005, + "loss": 2.125, + "step": 251860 + }, + { + "epoch": 0.9586793846060154, + "grad_norm": 0.12954165041446686, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 251870 + }, + { + "epoch": 0.9587174470741381, + "grad_norm": 0.14390873908996582, + "learning_rate": 0.0005, + "loss": 2.1136, + "step": 251880 + }, + { + "epoch": 0.9587555095422607, + "grad_norm": 0.13217221200466156, + "learning_rate": 0.0005, + "loss": 2.0832, + "step": 251890 + }, + { + "epoch": 0.9587935720103834, + "grad_norm": 0.11833815276622772, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 251900 + }, + { + "epoch": 0.9588316344785062, + "grad_norm": 0.11794855445623398, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 251910 + }, + { + "epoch": 0.9588696969466288, + "grad_norm": 0.14406496286392212, + "learning_rate": 0.0005, + "loss": 2.095, + "step": 251920 + }, + { + "epoch": 0.9589077594147515, + "grad_norm": 0.14679089188575745, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 251930 + }, + { + "epoch": 0.9589458218828741, + "grad_norm": 0.13904447853565216, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 251940 + }, + { + "epoch": 0.9589838843509969, + "grad_norm": 0.12721426784992218, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 251950 + }, + { + "epoch": 0.9590219468191196, + "grad_norm": 0.12797985970973969, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 251960 + }, + { + "epoch": 0.9590600092872422, + "grad_norm": 0.12952707707881927, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 251970 + }, + { + "epoch": 0.9590980717553649, + "grad_norm": 0.1243777871131897, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 251980 + }, + { + "epoch": 0.9591361342234875, + "grad_norm": 0.12574772536754608, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 251990 + }, + { + "epoch": 0.9591741966916103, + "grad_norm": 0.1342536360025406, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 252000 + }, + { + "epoch": 0.959212259159733, + "grad_norm": 0.1257815659046173, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 252010 + }, + { + "epoch": 0.9592503216278556, + "grad_norm": 0.13442516326904297, + "learning_rate": 0.0005, + "loss": 2.0896, + "step": 252020 + }, + { + "epoch": 0.9592883840959783, + "grad_norm": 0.12347886711359024, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 252030 + }, + { + "epoch": 0.959326446564101, + "grad_norm": 0.14267504215240479, + "learning_rate": 0.0005, + "loss": 2.0846, + "step": 252040 + }, + { + "epoch": 0.9593645090322237, + "grad_norm": 0.13044388592243195, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 252050 + }, + { + "epoch": 0.9594025715003464, + "grad_norm": 0.1280493289232254, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 252060 + }, + { + "epoch": 0.959440633968469, + "grad_norm": 0.13191363215446472, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 252070 + }, + { + "epoch": 0.9594786964365918, + "grad_norm": 0.13211052119731903, + "learning_rate": 0.0005, + "loss": 2.0857, + "step": 252080 + }, + { + "epoch": 0.9595167589047144, + "grad_norm": 0.11889991164207458, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 252090 + }, + { + "epoch": 0.9595548213728371, + "grad_norm": 0.13696099817752838, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 252100 + }, + { + "epoch": 0.9595928838409598, + "grad_norm": 0.13332092761993408, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 252110 + }, + { + "epoch": 0.9596309463090824, + "grad_norm": 0.1378851681947708, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 252120 + }, + { + "epoch": 0.9596690087772052, + "grad_norm": 0.1309574693441391, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 252130 + }, + { + "epoch": 0.9597070712453278, + "grad_norm": 0.11649864166975021, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 252140 + }, + { + "epoch": 0.9597451337134505, + "grad_norm": 0.11125954240560532, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 252150 + }, + { + "epoch": 0.9597831961815732, + "grad_norm": 0.1262616515159607, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 252160 + }, + { + "epoch": 0.9598212586496959, + "grad_norm": 0.13262724876403809, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 252170 + }, + { + "epoch": 0.9598593211178186, + "grad_norm": 0.1290009468793869, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 252180 + }, + { + "epoch": 0.9598973835859412, + "grad_norm": 0.12596750259399414, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 252190 + }, + { + "epoch": 0.9599354460540639, + "grad_norm": 0.12755310535430908, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 252200 + }, + { + "epoch": 0.9599735085221867, + "grad_norm": 0.11970789730548859, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 252210 + }, + { + "epoch": 0.9600115709903093, + "grad_norm": 0.1304336041212082, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 252220 + }, + { + "epoch": 0.960049633458432, + "grad_norm": 0.12059587985277176, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 252230 + }, + { + "epoch": 0.9600876959265546, + "grad_norm": 0.13717228174209595, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 252240 + }, + { + "epoch": 0.9601257583946774, + "grad_norm": 0.1227838546037674, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 252250 + }, + { + "epoch": 0.9601638208628, + "grad_norm": 0.12704740464687347, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 252260 + }, + { + "epoch": 0.9602018833309227, + "grad_norm": 0.12922018766403198, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 252270 + }, + { + "epoch": 0.9602399457990454, + "grad_norm": 0.12147005647420883, + "learning_rate": 0.0005, + "loss": 2.0903, + "step": 252280 + }, + { + "epoch": 0.960278008267168, + "grad_norm": 0.14183133840560913, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 252290 + }, + { + "epoch": 0.9603160707352908, + "grad_norm": 0.1297820508480072, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 252300 + }, + { + "epoch": 0.9603541332034135, + "grad_norm": 0.12243583053350449, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 252310 + }, + { + "epoch": 0.9603921956715361, + "grad_norm": 0.1582963615655899, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 252320 + }, + { + "epoch": 0.9604302581396588, + "grad_norm": 0.11770734190940857, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 252330 + }, + { + "epoch": 0.9604683206077815, + "grad_norm": 0.14207595586776733, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 252340 + }, + { + "epoch": 0.9605063830759042, + "grad_norm": 0.13186196982860565, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 252350 + }, + { + "epoch": 0.9605444455440268, + "grad_norm": 0.1168905720114708, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 252360 + }, + { + "epoch": 0.9605825080121495, + "grad_norm": 0.12174654006958008, + "learning_rate": 0.0005, + "loss": 2.0896, + "step": 252370 + }, + { + "epoch": 0.9606205704802723, + "grad_norm": 0.11918246001005173, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 252380 + }, + { + "epoch": 0.9606586329483949, + "grad_norm": 0.13458192348480225, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 252390 + }, + { + "epoch": 0.9606966954165176, + "grad_norm": 0.14331522583961487, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 252400 + }, + { + "epoch": 0.9607347578846402, + "grad_norm": 0.13765034079551697, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 252410 + }, + { + "epoch": 0.9607728203527629, + "grad_norm": 0.12519164383411407, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 252420 + }, + { + "epoch": 0.9608108828208857, + "grad_norm": 0.1264149695634842, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 252430 + }, + { + "epoch": 0.9608489452890083, + "grad_norm": 0.122474804520607, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 252440 + }, + { + "epoch": 0.960887007757131, + "grad_norm": 0.1438497006893158, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 252450 + }, + { + "epoch": 0.9609250702252536, + "grad_norm": 0.14879588782787323, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 252460 + }, + { + "epoch": 0.9609631326933764, + "grad_norm": 0.12623494863510132, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 252470 + }, + { + "epoch": 0.9610011951614991, + "grad_norm": 0.123557910323143, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 252480 + }, + { + "epoch": 0.9610392576296217, + "grad_norm": 0.12479705363512039, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 252490 + }, + { + "epoch": 0.9610773200977444, + "grad_norm": 0.12509945034980774, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 252500 + }, + { + "epoch": 0.9611153825658671, + "grad_norm": 0.13864733278751373, + "learning_rate": 0.0005, + "loss": 2.1095, + "step": 252510 + }, + { + "epoch": 0.9611534450339898, + "grad_norm": 0.13315652310848236, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 252520 + }, + { + "epoch": 0.9611915075021125, + "grad_norm": 0.12884598970413208, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 252530 + }, + { + "epoch": 0.9612295699702351, + "grad_norm": 0.12486692517995834, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 252540 + }, + { + "epoch": 0.9612676324383578, + "grad_norm": 0.14396265149116516, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 252550 + }, + { + "epoch": 0.9613056949064805, + "grad_norm": 0.13405755162239075, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 252560 + }, + { + "epoch": 0.9613437573746032, + "grad_norm": 0.12350235134363174, + "learning_rate": 0.0005, + "loss": 2.1074, + "step": 252570 + }, + { + "epoch": 0.9613818198427259, + "grad_norm": 0.12725189328193665, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 252580 + }, + { + "epoch": 0.9614198823108485, + "grad_norm": 0.13886277377605438, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 252590 + }, + { + "epoch": 0.9614579447789713, + "grad_norm": 0.14337188005447388, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 252600 + }, + { + "epoch": 0.9614960072470939, + "grad_norm": 0.14677828550338745, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 252610 + }, + { + "epoch": 0.9615340697152166, + "grad_norm": 0.12397907674312592, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 252620 + }, + { + "epoch": 0.9615721321833393, + "grad_norm": 0.13534249365329742, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 252630 + }, + { + "epoch": 0.961610194651462, + "grad_norm": 0.12256189435720444, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 252640 + }, + { + "epoch": 0.9616482571195847, + "grad_norm": 0.13429589569568634, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 252650 + }, + { + "epoch": 0.9616863195877073, + "grad_norm": 0.13483218848705292, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 252660 + }, + { + "epoch": 0.96172438205583, + "grad_norm": 0.13158905506134033, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 252670 + }, + { + "epoch": 0.9617624445239528, + "grad_norm": 0.12446098774671555, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 252680 + }, + { + "epoch": 0.9618005069920754, + "grad_norm": 0.14150193333625793, + "learning_rate": 0.0005, + "loss": 2.1104, + "step": 252690 + }, + { + "epoch": 0.9618385694601981, + "grad_norm": 0.13725963234901428, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 252700 + }, + { + "epoch": 0.9618766319283207, + "grad_norm": 0.12844370305538177, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 252710 + }, + { + "epoch": 0.9619146943964434, + "grad_norm": 0.12074138969182968, + "learning_rate": 0.0005, + "loss": 2.1057, + "step": 252720 + }, + { + "epoch": 0.9619527568645662, + "grad_norm": 0.13764441013336182, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 252730 + }, + { + "epoch": 0.9619908193326888, + "grad_norm": 0.11331465095281601, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 252740 + }, + { + "epoch": 0.9620288818008115, + "grad_norm": 0.1203673928976059, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 252750 + }, + { + "epoch": 0.9620669442689341, + "grad_norm": 0.13739249110221863, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 252760 + }, + { + "epoch": 0.9621050067370569, + "grad_norm": 0.1379999965429306, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 252770 + }, + { + "epoch": 0.9621430692051796, + "grad_norm": 0.13139371573925018, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 252780 + }, + { + "epoch": 0.9621811316733022, + "grad_norm": 0.12953905761241913, + "learning_rate": 0.0005, + "loss": 2.0802, + "step": 252790 + }, + { + "epoch": 0.9622191941414249, + "grad_norm": 0.11608421057462692, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 252800 + }, + { + "epoch": 0.9622572566095476, + "grad_norm": 0.14513848721981049, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 252810 + }, + { + "epoch": 0.9622953190776703, + "grad_norm": 0.13420742750167847, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 252820 + }, + { + "epoch": 0.962333381545793, + "grad_norm": 0.11324869096279144, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 252830 + }, + { + "epoch": 0.9623714440139156, + "grad_norm": 0.12924343347549438, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 252840 + }, + { + "epoch": 0.9624095064820383, + "grad_norm": 0.14046475291252136, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 252850 + }, + { + "epoch": 0.962447568950161, + "grad_norm": 0.13469430804252625, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 252860 + }, + { + "epoch": 0.9624856314182837, + "grad_norm": 0.1296015977859497, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 252870 + }, + { + "epoch": 0.9625236938864064, + "grad_norm": 0.12833398580551147, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 252880 + }, + { + "epoch": 0.962561756354529, + "grad_norm": 0.13369536399841309, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 252890 + }, + { + "epoch": 0.9625998188226518, + "grad_norm": 0.14273881912231445, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 252900 + }, + { + "epoch": 0.9626378812907744, + "grad_norm": 0.1325225681066513, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 252910 + }, + { + "epoch": 0.9626759437588971, + "grad_norm": 0.13713519275188446, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 252920 + }, + { + "epoch": 0.9627140062270197, + "grad_norm": 0.12612666189670563, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 252930 + }, + { + "epoch": 0.9627520686951425, + "grad_norm": 0.11918192356824875, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 252940 + }, + { + "epoch": 0.9627901311632652, + "grad_norm": 0.12291216105222702, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 252950 + }, + { + "epoch": 0.9628281936313878, + "grad_norm": 0.13093940913677216, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 252960 + }, + { + "epoch": 0.9628662560995105, + "grad_norm": 0.11661577969789505, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 252970 + }, + { + "epoch": 0.9629043185676333, + "grad_norm": 0.13057559728622437, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 252980 + }, + { + "epoch": 0.9629423810357559, + "grad_norm": 0.13503484427928925, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 252990 + }, + { + "epoch": 0.9629804435038786, + "grad_norm": 0.14270184934139252, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 253000 + }, + { + "epoch": 0.9630185059720012, + "grad_norm": 0.13299335539340973, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 253010 + }, + { + "epoch": 0.9630565684401239, + "grad_norm": 0.12235192954540253, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 253020 + }, + { + "epoch": 0.9630946309082467, + "grad_norm": 0.125530406832695, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 253030 + }, + { + "epoch": 0.9631326933763693, + "grad_norm": 0.15567998588085175, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 253040 + }, + { + "epoch": 0.963170755844492, + "grad_norm": 0.13265834748744965, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 253050 + }, + { + "epoch": 0.9632088183126146, + "grad_norm": 0.1267433613538742, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 253060 + }, + { + "epoch": 0.9632468807807374, + "grad_norm": 0.24809212982654572, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 253070 + }, + { + "epoch": 0.96328494324886, + "grad_norm": 0.13036325573921204, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 253080 + }, + { + "epoch": 0.9633230057169827, + "grad_norm": 0.12346664816141129, + "learning_rate": 0.0005, + "loss": 2.1207, + "step": 253090 + }, + { + "epoch": 0.9633610681851054, + "grad_norm": 0.13149957358837128, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 253100 + }, + { + "epoch": 0.9633991306532281, + "grad_norm": 0.12425676733255386, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 253110 + }, + { + "epoch": 0.9634371931213508, + "grad_norm": 0.12317558377981186, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 253120 + }, + { + "epoch": 0.9634752555894734, + "grad_norm": 0.126312255859375, + "learning_rate": 0.0005, + "loss": 2.0861, + "step": 253130 + }, + { + "epoch": 0.9635133180575961, + "grad_norm": 0.13709893822669983, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 253140 + }, + { + "epoch": 0.9635513805257188, + "grad_norm": 0.13229143619537354, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 253150 + }, + { + "epoch": 0.9635894429938415, + "grad_norm": 0.13252483308315277, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 253160 + }, + { + "epoch": 0.9636275054619642, + "grad_norm": 0.12838731706142426, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 253170 + }, + { + "epoch": 0.9636655679300868, + "grad_norm": 0.13725703954696655, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 253180 + }, + { + "epoch": 0.9637036303982095, + "grad_norm": 0.12590914964675903, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 253190 + }, + { + "epoch": 0.9637416928663323, + "grad_norm": 0.1262999176979065, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 253200 + }, + { + "epoch": 0.9637797553344549, + "grad_norm": 0.1354227513074875, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 253210 + }, + { + "epoch": 0.9638178178025776, + "grad_norm": 0.13120831549167633, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 253220 + }, + { + "epoch": 0.9638558802707002, + "grad_norm": 0.12905097007751465, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 253230 + }, + { + "epoch": 0.963893942738823, + "grad_norm": 0.12586815655231476, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 253240 + }, + { + "epoch": 0.9639320052069457, + "grad_norm": 0.12380609661340714, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 253250 + }, + { + "epoch": 0.9639700676750683, + "grad_norm": 0.13315235078334808, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 253260 + }, + { + "epoch": 0.964008130143191, + "grad_norm": 0.12807630002498627, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 253270 + }, + { + "epoch": 0.9640461926113136, + "grad_norm": 0.12301483005285263, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 253280 + }, + { + "epoch": 0.9640842550794364, + "grad_norm": 0.12575186789035797, + "learning_rate": 0.0005, + "loss": 2.1132, + "step": 253290 + }, + { + "epoch": 0.9641223175475591, + "grad_norm": 0.1165456622838974, + "learning_rate": 0.0005, + "loss": 2.1141, + "step": 253300 + }, + { + "epoch": 0.9641603800156817, + "grad_norm": 0.13741955161094666, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 253310 + }, + { + "epoch": 0.9641984424838044, + "grad_norm": 0.12666748464107513, + "learning_rate": 0.0005, + "loss": 2.1145, + "step": 253320 + }, + { + "epoch": 0.9642365049519271, + "grad_norm": 0.12276309728622437, + "learning_rate": 0.0005, + "loss": 2.1224, + "step": 253330 + }, + { + "epoch": 0.9642745674200498, + "grad_norm": 0.13581430912017822, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 253340 + }, + { + "epoch": 0.9643126298881725, + "grad_norm": 0.7123417854309082, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 253350 + }, + { + "epoch": 0.9643506923562951, + "grad_norm": 0.1316785216331482, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 253360 + }, + { + "epoch": 0.9643887548244179, + "grad_norm": 0.11734849959611893, + "learning_rate": 0.0005, + "loss": 2.1039, + "step": 253370 + }, + { + "epoch": 0.9644268172925405, + "grad_norm": 0.1331537514925003, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 253380 + }, + { + "epoch": 0.9644648797606632, + "grad_norm": 0.1352957934141159, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 253390 + }, + { + "epoch": 0.9645029422287859, + "grad_norm": 0.12489691376686096, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 253400 + }, + { + "epoch": 0.9645410046969086, + "grad_norm": 0.12582498788833618, + "learning_rate": 0.0005, + "loss": 2.1205, + "step": 253410 + }, + { + "epoch": 0.9645790671650313, + "grad_norm": 0.11681034415960312, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 253420 + }, + { + "epoch": 0.9646171296331539, + "grad_norm": 0.12178188562393188, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 253430 + }, + { + "epoch": 0.9646551921012766, + "grad_norm": 0.13658106327056885, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 253440 + }, + { + "epoch": 0.9646932545693993, + "grad_norm": 0.13959579169750214, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 253450 + }, + { + "epoch": 0.964731317037522, + "grad_norm": 0.1278458833694458, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 253460 + }, + { + "epoch": 0.9647693795056447, + "grad_norm": 0.13757051527500153, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 253470 + }, + { + "epoch": 0.9648074419737673, + "grad_norm": 0.12881222367286682, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 253480 + }, + { + "epoch": 0.96484550444189, + "grad_norm": 0.1218414306640625, + "learning_rate": 0.0005, + "loss": 2.0919, + "step": 253490 + }, + { + "epoch": 0.9648835669100128, + "grad_norm": 0.12052089720964432, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 253500 + }, + { + "epoch": 0.9649216293781354, + "grad_norm": 0.14013579487800598, + "learning_rate": 0.0005, + "loss": 2.0872, + "step": 253510 + }, + { + "epoch": 0.9649596918462581, + "grad_norm": 0.1209578886628151, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 253520 + }, + { + "epoch": 0.9649977543143807, + "grad_norm": 0.12182780355215073, + "learning_rate": 0.0005, + "loss": 2.114, + "step": 253530 + }, + { + "epoch": 0.9650358167825035, + "grad_norm": 0.13168083131313324, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 253540 + }, + { + "epoch": 0.9650738792506262, + "grad_norm": 0.13472476601600647, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 253550 + }, + { + "epoch": 0.9651119417187488, + "grad_norm": 0.1423880010843277, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 253560 + }, + { + "epoch": 0.9651500041868715, + "grad_norm": 0.13923941552639008, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 253570 + }, + { + "epoch": 0.9651880666549941, + "grad_norm": 0.12023203074932098, + "learning_rate": 0.0005, + "loss": 2.0855, + "step": 253580 + }, + { + "epoch": 0.9652261291231169, + "grad_norm": 0.12728914618492126, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 253590 + }, + { + "epoch": 0.9652641915912396, + "grad_norm": 0.13072176277637482, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 253600 + }, + { + "epoch": 0.9653022540593622, + "grad_norm": 0.12164273858070374, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 253610 + }, + { + "epoch": 0.9653403165274849, + "grad_norm": 0.14329518377780914, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 253620 + }, + { + "epoch": 0.9653783789956076, + "grad_norm": 0.13620802760124207, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 253630 + }, + { + "epoch": 0.9654164414637303, + "grad_norm": 0.13148976862430573, + "learning_rate": 0.0005, + "loss": 2.1179, + "step": 253640 + }, + { + "epoch": 0.965454503931853, + "grad_norm": 0.19821810722351074, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 253650 + }, + { + "epoch": 0.9654925663999756, + "grad_norm": 0.1319485753774643, + "learning_rate": 0.0005, + "loss": 2.1143, + "step": 253660 + }, + { + "epoch": 0.9655306288680984, + "grad_norm": 0.1330292969942093, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 253670 + }, + { + "epoch": 0.965568691336221, + "grad_norm": 0.12223409116268158, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 253680 + }, + { + "epoch": 0.9656067538043437, + "grad_norm": 0.12201818823814392, + "learning_rate": 0.0005, + "loss": 2.0748, + "step": 253690 + }, + { + "epoch": 0.9656448162724663, + "grad_norm": 0.1322978287935257, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 253700 + }, + { + "epoch": 0.965682878740589, + "grad_norm": 0.1483135223388672, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 253710 + }, + { + "epoch": 0.9657209412087118, + "grad_norm": 0.1406850814819336, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 253720 + }, + { + "epoch": 0.9657590036768344, + "grad_norm": 0.13997170329093933, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 253730 + }, + { + "epoch": 0.9657970661449571, + "grad_norm": 0.12795639038085938, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 253740 + }, + { + "epoch": 0.9658351286130797, + "grad_norm": 0.1439167857170105, + "learning_rate": 0.0005, + "loss": 2.0977, + "step": 253750 + }, + { + "epoch": 0.9658731910812025, + "grad_norm": 0.12860262393951416, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 253760 + }, + { + "epoch": 0.9659112535493252, + "grad_norm": 0.11895686388015747, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 253770 + }, + { + "epoch": 0.9659493160174478, + "grad_norm": 0.1427023708820343, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 253780 + }, + { + "epoch": 0.9659873784855705, + "grad_norm": 0.13326308131217957, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 253790 + }, + { + "epoch": 0.9660254409536932, + "grad_norm": 0.12903240323066711, + "learning_rate": 0.0005, + "loss": 2.1192, + "step": 253800 + }, + { + "epoch": 0.9660635034218159, + "grad_norm": 0.1381160020828247, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 253810 + }, + { + "epoch": 0.9661015658899386, + "grad_norm": 0.12535077333450317, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 253820 + }, + { + "epoch": 0.9661396283580612, + "grad_norm": 0.1265016347169876, + "learning_rate": 0.0005, + "loss": 2.1151, + "step": 253830 + }, + { + "epoch": 0.966177690826184, + "grad_norm": 0.12111052125692368, + "learning_rate": 0.0005, + "loss": 2.1203, + "step": 253840 + }, + { + "epoch": 0.9662157532943066, + "grad_norm": 0.13855448365211487, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 253850 + }, + { + "epoch": 0.9662538157624293, + "grad_norm": 0.1279250979423523, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 253860 + }, + { + "epoch": 0.966291878230552, + "grad_norm": 0.1239514946937561, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 253870 + }, + { + "epoch": 0.9663299406986746, + "grad_norm": 0.11943173408508301, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 253880 + }, + { + "epoch": 0.9663680031667974, + "grad_norm": 0.1287466436624527, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 253890 + }, + { + "epoch": 0.96640606563492, + "grad_norm": 0.11794771999120712, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 253900 + }, + { + "epoch": 0.9664441281030427, + "grad_norm": 0.1384105086326599, + "learning_rate": 0.0005, + "loss": 2.0973, + "step": 253910 + }, + { + "epoch": 0.9664821905711654, + "grad_norm": 0.12583598494529724, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 253920 + }, + { + "epoch": 0.9665202530392881, + "grad_norm": 0.1272657811641693, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 253930 + }, + { + "epoch": 0.9665583155074108, + "grad_norm": 0.1401505172252655, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 253940 + }, + { + "epoch": 0.9665963779755334, + "grad_norm": 0.13038720190525055, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 253950 + }, + { + "epoch": 0.9666344404436561, + "grad_norm": 0.12374894320964813, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 253960 + }, + { + "epoch": 0.9666725029117789, + "grad_norm": 0.12792405486106873, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 253970 + }, + { + "epoch": 0.9667105653799015, + "grad_norm": 0.1282413899898529, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 253980 + }, + { + "epoch": 0.9667486278480242, + "grad_norm": 0.1365712434053421, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 253990 + }, + { + "epoch": 0.9667866903161468, + "grad_norm": 0.1373814046382904, + "learning_rate": 0.0005, + "loss": 2.0883, + "step": 254000 + }, + { + "epoch": 0.9668247527842695, + "grad_norm": 0.1197550892829895, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 254010 + }, + { + "epoch": 0.9668628152523923, + "grad_norm": 0.13073822855949402, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 254020 + }, + { + "epoch": 0.9669008777205149, + "grad_norm": 0.13818205893039703, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 254030 + }, + { + "epoch": 0.9669389401886376, + "grad_norm": 0.12917038798332214, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 254040 + }, + { + "epoch": 0.9669770026567602, + "grad_norm": 0.13809436559677124, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 254050 + }, + { + "epoch": 0.967015065124883, + "grad_norm": 0.13827873766422272, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 254060 + }, + { + "epoch": 0.9670531275930057, + "grad_norm": 0.12173813581466675, + "learning_rate": 0.0005, + "loss": 2.1217, + "step": 254070 + }, + { + "epoch": 0.9670911900611283, + "grad_norm": 0.12423427402973175, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 254080 + }, + { + "epoch": 0.967129252529251, + "grad_norm": 0.12945401668548584, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 254090 + }, + { + "epoch": 0.9671673149973737, + "grad_norm": 0.14030809700489044, + "learning_rate": 0.0005, + "loss": 2.0967, + "step": 254100 + }, + { + "epoch": 0.9672053774654964, + "grad_norm": 0.14602228999137878, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 254110 + }, + { + "epoch": 0.967243439933619, + "grad_norm": 0.11788609623908997, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 254120 + }, + { + "epoch": 0.9672815024017417, + "grad_norm": 0.1414944976568222, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 254130 + }, + { + "epoch": 0.9673195648698644, + "grad_norm": 0.13184891641139984, + "learning_rate": 0.0005, + "loss": 2.1069, + "step": 254140 + }, + { + "epoch": 0.9673576273379871, + "grad_norm": 0.1220046728849411, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 254150 + }, + { + "epoch": 0.9673956898061098, + "grad_norm": 0.12934859097003937, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 254160 + }, + { + "epoch": 0.9674337522742325, + "grad_norm": 0.13587023317813873, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 254170 + }, + { + "epoch": 0.9674718147423551, + "grad_norm": 0.13337966799736023, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 254180 + }, + { + "epoch": 0.9675098772104779, + "grad_norm": 0.11864124983549118, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 254190 + }, + { + "epoch": 0.9675479396786005, + "grad_norm": 0.1444726586341858, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 254200 + }, + { + "epoch": 0.9675860021467232, + "grad_norm": 0.1322479099035263, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 254210 + }, + { + "epoch": 0.9676240646148458, + "grad_norm": 0.13444893062114716, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 254220 + }, + { + "epoch": 0.9676621270829686, + "grad_norm": 0.14414022862911224, + "learning_rate": 0.0005, + "loss": 2.1184, + "step": 254230 + }, + { + "epoch": 0.9677001895510913, + "grad_norm": 0.12338683009147644, + "learning_rate": 0.0005, + "loss": 2.0885, + "step": 254240 + }, + { + "epoch": 0.9677382520192139, + "grad_norm": 0.12411805987358093, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 254250 + }, + { + "epoch": 0.9677763144873366, + "grad_norm": 0.14127685129642487, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 254260 + }, + { + "epoch": 0.9678143769554594, + "grad_norm": 0.12266780436038971, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 254270 + }, + { + "epoch": 0.967852439423582, + "grad_norm": 0.12482955306768417, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 254280 + }, + { + "epoch": 0.9678905018917047, + "grad_norm": 0.12805862724781036, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 254290 + }, + { + "epoch": 0.9679285643598273, + "grad_norm": 0.15698951482772827, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 254300 + }, + { + "epoch": 0.96796662682795, + "grad_norm": 0.13735942542552948, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 254310 + }, + { + "epoch": 0.9680046892960728, + "grad_norm": 0.12623661756515503, + "learning_rate": 0.0005, + "loss": 2.0795, + "step": 254320 + }, + { + "epoch": 0.9680427517641954, + "grad_norm": 0.13633745908737183, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 254330 + }, + { + "epoch": 0.9680808142323181, + "grad_norm": 0.12903860211372375, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 254340 + }, + { + "epoch": 0.9681188767004407, + "grad_norm": 0.12975889444351196, + "learning_rate": 0.0005, + "loss": 2.1125, + "step": 254350 + }, + { + "epoch": 0.9681569391685635, + "grad_norm": 0.12580707669258118, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 254360 + }, + { + "epoch": 0.9681950016366861, + "grad_norm": 0.13266442716121674, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 254370 + }, + { + "epoch": 0.9682330641048088, + "grad_norm": 0.13872191309928894, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 254380 + }, + { + "epoch": 0.9682711265729315, + "grad_norm": 0.12552133202552795, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 254390 + }, + { + "epoch": 0.9683091890410542, + "grad_norm": 0.12063095718622208, + "learning_rate": 0.0005, + "loss": 2.0861, + "step": 254400 + }, + { + "epoch": 0.9683472515091769, + "grad_norm": 0.13410405814647675, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 254410 + }, + { + "epoch": 0.9683853139772995, + "grad_norm": 0.12568983435630798, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 254420 + }, + { + "epoch": 0.9684233764454222, + "grad_norm": 0.12332507967948914, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 254430 + }, + { + "epoch": 0.9684614389135449, + "grad_norm": 0.11628450453281403, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 254440 + }, + { + "epoch": 0.9684995013816676, + "grad_norm": 0.13699153065681458, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 254450 + }, + { + "epoch": 0.9685375638497903, + "grad_norm": 0.1462022215127945, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 254460 + }, + { + "epoch": 0.9685756263179129, + "grad_norm": 0.1329510509967804, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 254470 + }, + { + "epoch": 0.9686136887860356, + "grad_norm": 0.1387140452861786, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 254480 + }, + { + "epoch": 0.9686517512541584, + "grad_norm": 0.11960545927286148, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 254490 + }, + { + "epoch": 0.968689813722281, + "grad_norm": 0.1454916149377823, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 254500 + }, + { + "epoch": 0.9687278761904037, + "grad_norm": 0.12710386514663696, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 254510 + }, + { + "epoch": 0.9687659386585263, + "grad_norm": 0.13227415084838867, + "learning_rate": 0.0005, + "loss": 2.1251, + "step": 254520 + }, + { + "epoch": 0.9688040011266491, + "grad_norm": 0.12211448699235916, + "learning_rate": 0.0005, + "loss": 2.0956, + "step": 254530 + }, + { + "epoch": 0.9688420635947718, + "grad_norm": 0.12017636746168137, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 254540 + }, + { + "epoch": 0.9688801260628944, + "grad_norm": 0.12308043986558914, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 254550 + }, + { + "epoch": 0.9689181885310171, + "grad_norm": 0.1376669704914093, + "learning_rate": 0.0005, + "loss": 2.0883, + "step": 254560 + }, + { + "epoch": 0.9689562509991397, + "grad_norm": 0.13200579583644867, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 254570 + }, + { + "epoch": 0.9689943134672625, + "grad_norm": 0.13734455406665802, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 254580 + }, + { + "epoch": 0.9690323759353852, + "grad_norm": 0.12204215675592422, + "learning_rate": 0.0005, + "loss": 2.0854, + "step": 254590 + }, + { + "epoch": 0.9690704384035078, + "grad_norm": 0.13001269102096558, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 254600 + }, + { + "epoch": 0.9691085008716305, + "grad_norm": 0.11727369576692581, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 254610 + }, + { + "epoch": 0.9691465633397532, + "grad_norm": 0.13287939131259918, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 254620 + }, + { + "epoch": 0.9691846258078759, + "grad_norm": 0.11994440853595734, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 254630 + }, + { + "epoch": 0.9692226882759986, + "grad_norm": 0.13018877804279327, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 254640 + }, + { + "epoch": 0.9692607507441212, + "grad_norm": 0.12774427235126495, + "learning_rate": 0.0005, + "loss": 2.1035, + "step": 254650 + }, + { + "epoch": 0.969298813212244, + "grad_norm": 0.11620493233203888, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 254660 + }, + { + "epoch": 0.9693368756803666, + "grad_norm": 0.13221842050552368, + "learning_rate": 0.0005, + "loss": 2.1146, + "step": 254670 + }, + { + "epoch": 0.9693749381484893, + "grad_norm": 0.1154850572347641, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 254680 + }, + { + "epoch": 0.969413000616612, + "grad_norm": 0.1304425150156021, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 254690 + }, + { + "epoch": 0.9694510630847347, + "grad_norm": 0.11422496289014816, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 254700 + }, + { + "epoch": 0.9694891255528574, + "grad_norm": 0.12208788841962814, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 254710 + }, + { + "epoch": 0.96952718802098, + "grad_norm": 0.14281286299228668, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 254720 + }, + { + "epoch": 0.9695652504891027, + "grad_norm": 0.12920963764190674, + "learning_rate": 0.0005, + "loss": 2.0878, + "step": 254730 + }, + { + "epoch": 0.9696033129572253, + "grad_norm": 0.139565572142601, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 254740 + }, + { + "epoch": 0.9696413754253481, + "grad_norm": 0.14187435805797577, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 254750 + }, + { + "epoch": 0.9696794378934708, + "grad_norm": 0.1298711597919464, + "learning_rate": 0.0005, + "loss": 2.1269, + "step": 254760 + }, + { + "epoch": 0.9697175003615934, + "grad_norm": 0.12016897648572922, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 254770 + }, + { + "epoch": 0.9697555628297161, + "grad_norm": 0.13304503262043, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 254780 + }, + { + "epoch": 0.9697936252978389, + "grad_norm": 0.13235604763031006, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 254790 + }, + { + "epoch": 0.9698316877659615, + "grad_norm": 0.1423034816980362, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 254800 + }, + { + "epoch": 0.9698697502340842, + "grad_norm": 0.13728663325309753, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 254810 + }, + { + "epoch": 0.9699078127022068, + "grad_norm": 0.12065032124519348, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 254820 + }, + { + "epoch": 0.9699458751703296, + "grad_norm": 0.11827119439840317, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 254830 + }, + { + "epoch": 0.9699839376384523, + "grad_norm": 0.1292085349559784, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 254840 + }, + { + "epoch": 0.9700220001065749, + "grad_norm": 0.12470462918281555, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 254850 + }, + { + "epoch": 0.9700600625746976, + "grad_norm": 0.1269422173500061, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 254860 + }, + { + "epoch": 0.9700981250428202, + "grad_norm": 0.1338520497083664, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 254870 + }, + { + "epoch": 0.970136187510943, + "grad_norm": 0.11972904205322266, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 254880 + }, + { + "epoch": 0.9701742499790657, + "grad_norm": 0.1291525512933731, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 254890 + }, + { + "epoch": 0.9702123124471883, + "grad_norm": 0.14123444259166718, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 254900 + }, + { + "epoch": 0.970250374915311, + "grad_norm": 0.1271352916955948, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 254910 + }, + { + "epoch": 0.9702884373834337, + "grad_norm": 0.12444078922271729, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 254920 + }, + { + "epoch": 0.9703264998515564, + "grad_norm": 0.11825793236494064, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 254930 + }, + { + "epoch": 0.970364562319679, + "grad_norm": 0.21717515587806702, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 254940 + }, + { + "epoch": 0.9704026247878017, + "grad_norm": 0.12200090289115906, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 254950 + }, + { + "epoch": 0.9704406872559245, + "grad_norm": 0.13350056111812592, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 254960 + }, + { + "epoch": 0.9704787497240471, + "grad_norm": 0.13846023380756378, + "learning_rate": 0.0005, + "loss": 2.079, + "step": 254970 + }, + { + "epoch": 0.9705168121921698, + "grad_norm": 0.13522903621196747, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 254980 + }, + { + "epoch": 0.9705548746602924, + "grad_norm": 0.12254560738801956, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 254990 + }, + { + "epoch": 0.9705929371284151, + "grad_norm": 0.12815439701080322, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 255000 + }, + { + "epoch": 0.9706309995965379, + "grad_norm": 0.13445810973644257, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 255010 + }, + { + "epoch": 0.9706690620646605, + "grad_norm": 0.13216426968574524, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 255020 + }, + { + "epoch": 0.9707071245327832, + "grad_norm": 0.12756191194057465, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 255030 + }, + { + "epoch": 0.9707451870009058, + "grad_norm": 0.1667054444551468, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 255040 + }, + { + "epoch": 0.9707832494690286, + "grad_norm": 0.12336678802967072, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 255050 + }, + { + "epoch": 0.9708213119371513, + "grad_norm": 0.12533093988895416, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 255060 + }, + { + "epoch": 0.9708593744052739, + "grad_norm": 0.13144513964653015, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 255070 + }, + { + "epoch": 0.9708974368733966, + "grad_norm": 0.12879325449466705, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 255080 + }, + { + "epoch": 0.9709354993415193, + "grad_norm": 0.12825438380241394, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 255090 + }, + { + "epoch": 0.970973561809642, + "grad_norm": 0.13372738659381866, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 255100 + }, + { + "epoch": 0.9710116242777647, + "grad_norm": 0.1324717253446579, + "learning_rate": 0.0005, + "loss": 2.1106, + "step": 255110 + }, + { + "epoch": 0.9710496867458873, + "grad_norm": 0.14835681021213531, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 255120 + }, + { + "epoch": 0.9710877492140101, + "grad_norm": 0.13331232964992523, + "learning_rate": 0.0005, + "loss": 2.1067, + "step": 255130 + }, + { + "epoch": 0.9711258116821327, + "grad_norm": 0.12525902688503265, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 255140 + }, + { + "epoch": 0.9711638741502554, + "grad_norm": 0.125664621591568, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 255150 + }, + { + "epoch": 0.9712019366183781, + "grad_norm": 0.12486566603183746, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 255160 + }, + { + "epoch": 0.9712399990865007, + "grad_norm": 0.14651164412498474, + "learning_rate": 0.0005, + "loss": 2.0772, + "step": 255170 + }, + { + "epoch": 0.9712780615546235, + "grad_norm": 0.13397106528282166, + "learning_rate": 0.0005, + "loss": 2.0997, + "step": 255180 + }, + { + "epoch": 0.9713161240227461, + "grad_norm": 0.12492938339710236, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 255190 + }, + { + "epoch": 0.9713541864908688, + "grad_norm": 0.1352192461490631, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 255200 + }, + { + "epoch": 0.9713922489589915, + "grad_norm": 0.1237134337425232, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 255210 + }, + { + "epoch": 0.9714303114271142, + "grad_norm": 0.12343886494636536, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 255220 + }, + { + "epoch": 0.9714683738952369, + "grad_norm": 0.1212475448846817, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 255230 + }, + { + "epoch": 0.9715064363633595, + "grad_norm": 0.13539819419384003, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 255240 + }, + { + "epoch": 0.9715444988314822, + "grad_norm": 0.13426490128040314, + "learning_rate": 0.0005, + "loss": 2.1194, + "step": 255250 + }, + { + "epoch": 0.971582561299605, + "grad_norm": 0.13717344403266907, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 255260 + }, + { + "epoch": 0.9716206237677276, + "grad_norm": 0.12221307307481766, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 255270 + }, + { + "epoch": 0.9716586862358503, + "grad_norm": 0.13117346167564392, + "learning_rate": 0.0005, + "loss": 2.0914, + "step": 255280 + }, + { + "epoch": 0.9716967487039729, + "grad_norm": 0.13155390322208405, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 255290 + }, + { + "epoch": 0.9717348111720956, + "grad_norm": 0.1376398205757141, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 255300 + }, + { + "epoch": 0.9717728736402184, + "grad_norm": 0.12305990606546402, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 255310 + }, + { + "epoch": 0.971810936108341, + "grad_norm": 0.11314533650875092, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 255320 + }, + { + "epoch": 0.9718489985764637, + "grad_norm": 0.13132306933403015, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 255330 + }, + { + "epoch": 0.9718870610445863, + "grad_norm": 0.11744492501020432, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 255340 + }, + { + "epoch": 0.9719251235127091, + "grad_norm": 0.12644986808300018, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 255350 + }, + { + "epoch": 0.9719631859808318, + "grad_norm": 0.1316252052783966, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 255360 + }, + { + "epoch": 0.9720012484489544, + "grad_norm": 0.12514221668243408, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 255370 + }, + { + "epoch": 0.9720393109170771, + "grad_norm": 0.14039510488510132, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 255380 + }, + { + "epoch": 0.9720773733851998, + "grad_norm": 0.11962796747684479, + "learning_rate": 0.0005, + "loss": 2.0862, + "step": 255390 + }, + { + "epoch": 0.9721154358533225, + "grad_norm": 0.15293170511722565, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 255400 + }, + { + "epoch": 0.9721534983214452, + "grad_norm": 0.15546253323554993, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 255410 + }, + { + "epoch": 0.9721915607895678, + "grad_norm": 0.1338612586259842, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 255420 + }, + { + "epoch": 0.9722296232576905, + "grad_norm": 0.1300361603498459, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 255430 + }, + { + "epoch": 0.9722676857258132, + "grad_norm": 0.12052548676729202, + "learning_rate": 0.0005, + "loss": 2.1214, + "step": 255440 + }, + { + "epoch": 0.9723057481939359, + "grad_norm": 0.1294412463903427, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 255450 + }, + { + "epoch": 0.9723438106620585, + "grad_norm": 0.12222734093666077, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 255460 + }, + { + "epoch": 0.9723818731301812, + "grad_norm": 0.12707626819610596, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 255470 + }, + { + "epoch": 0.972419935598304, + "grad_norm": 0.13399982452392578, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 255480 + }, + { + "epoch": 0.9724579980664266, + "grad_norm": 0.13506804406642914, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 255490 + }, + { + "epoch": 0.9724960605345493, + "grad_norm": 0.12390710413455963, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 255500 + }, + { + "epoch": 0.972534123002672, + "grad_norm": 0.12795017659664154, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 255510 + }, + { + "epoch": 0.9725721854707947, + "grad_norm": 0.12792456150054932, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 255520 + }, + { + "epoch": 0.9726102479389174, + "grad_norm": 0.13905619084835052, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 255530 + }, + { + "epoch": 0.97264831040704, + "grad_norm": 0.13615982234477997, + "learning_rate": 0.0005, + "loss": 2.1096, + "step": 255540 + }, + { + "epoch": 0.9726863728751627, + "grad_norm": 0.13537728786468506, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 255550 + }, + { + "epoch": 0.9727244353432855, + "grad_norm": 0.1334761083126068, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 255560 + }, + { + "epoch": 0.9727624978114081, + "grad_norm": 0.12943799793720245, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 255570 + }, + { + "epoch": 0.9728005602795308, + "grad_norm": 0.13745969533920288, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 255580 + }, + { + "epoch": 0.9728386227476534, + "grad_norm": 0.12578512728214264, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 255590 + }, + { + "epoch": 0.9728766852157761, + "grad_norm": 0.12996302545070648, + "learning_rate": 0.0005, + "loss": 2.1121, + "step": 255600 + }, + { + "epoch": 0.9729147476838989, + "grad_norm": 0.1376398652791977, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 255610 + }, + { + "epoch": 0.9729528101520215, + "grad_norm": 0.12311550974845886, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 255620 + }, + { + "epoch": 0.9729908726201442, + "grad_norm": 0.12581251561641693, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 255630 + }, + { + "epoch": 0.9730289350882668, + "grad_norm": 0.13663913309574127, + "learning_rate": 0.0005, + "loss": 2.1153, + "step": 255640 + }, + { + "epoch": 0.9730669975563896, + "grad_norm": 0.11359215527772903, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 255650 + }, + { + "epoch": 0.9731050600245122, + "grad_norm": 0.12716831266880035, + "learning_rate": 0.0005, + "loss": 2.1028, + "step": 255660 + }, + { + "epoch": 0.9731431224926349, + "grad_norm": 0.1292915642261505, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 255670 + }, + { + "epoch": 0.9731811849607576, + "grad_norm": 0.13148799538612366, + "learning_rate": 0.0005, + "loss": 2.1084, + "step": 255680 + }, + { + "epoch": 0.9732192474288803, + "grad_norm": 0.1377020627260208, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 255690 + }, + { + "epoch": 0.973257309897003, + "grad_norm": 0.1219463124871254, + "learning_rate": 0.0005, + "loss": 2.0958, + "step": 255700 + }, + { + "epoch": 0.9732953723651256, + "grad_norm": 0.12844008207321167, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 255710 + }, + { + "epoch": 0.9733334348332483, + "grad_norm": 0.13267263770103455, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 255720 + }, + { + "epoch": 0.973371497301371, + "grad_norm": 0.1315184086561203, + "learning_rate": 0.0005, + "loss": 2.1089, + "step": 255730 + }, + { + "epoch": 0.9734095597694937, + "grad_norm": 0.2425239086151123, + "learning_rate": 0.0005, + "loss": 2.109, + "step": 255740 + }, + { + "epoch": 0.9734476222376164, + "grad_norm": 0.12007585912942886, + "learning_rate": 0.0005, + "loss": 2.113, + "step": 255750 + }, + { + "epoch": 0.973485684705739, + "grad_norm": 0.1381920874118805, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 255760 + }, + { + "epoch": 0.9735237471738617, + "grad_norm": 0.1297314167022705, + "learning_rate": 0.0005, + "loss": 2.0955, + "step": 255770 + }, + { + "epoch": 0.9735618096419845, + "grad_norm": 0.13172182440757751, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 255780 + }, + { + "epoch": 0.9735998721101071, + "grad_norm": 0.12290968745946884, + "learning_rate": 0.0005, + "loss": 2.0875, + "step": 255790 + }, + { + "epoch": 0.9736379345782298, + "grad_norm": 0.13406983017921448, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 255800 + }, + { + "epoch": 0.9736759970463524, + "grad_norm": 0.6703738570213318, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 255810 + }, + { + "epoch": 0.9737140595144752, + "grad_norm": 0.13410161435604095, + "learning_rate": 0.0005, + "loss": 2.1113, + "step": 255820 + }, + { + "epoch": 0.9737521219825979, + "grad_norm": 0.13230681419372559, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 255830 + }, + { + "epoch": 0.9737901844507205, + "grad_norm": 0.13530078530311584, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 255840 + }, + { + "epoch": 0.9738282469188432, + "grad_norm": 0.12839284539222717, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 255850 + }, + { + "epoch": 0.9738663093869658, + "grad_norm": 0.11974814534187317, + "learning_rate": 0.0005, + "loss": 2.0928, + "step": 255860 + }, + { + "epoch": 0.9739043718550886, + "grad_norm": 0.15206706523895264, + "learning_rate": 0.0005, + "loss": 2.0834, + "step": 255870 + }, + { + "epoch": 0.9739424343232113, + "grad_norm": 0.12297140806913376, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 255880 + }, + { + "epoch": 0.9739804967913339, + "grad_norm": 0.12922076880931854, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 255890 + }, + { + "epoch": 0.9740185592594566, + "grad_norm": 0.12766559422016144, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 255900 + }, + { + "epoch": 0.9740566217275793, + "grad_norm": 0.13466320931911469, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 255910 + }, + { + "epoch": 0.974094684195702, + "grad_norm": 0.1484542340040207, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 255920 + }, + { + "epoch": 0.9741327466638247, + "grad_norm": 0.12738987803459167, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 255930 + }, + { + "epoch": 0.9741708091319473, + "grad_norm": 0.12110450118780136, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 255940 + }, + { + "epoch": 0.9742088716000701, + "grad_norm": 0.12763312458992004, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 255950 + }, + { + "epoch": 0.9742469340681927, + "grad_norm": 0.12843506038188934, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 255960 + }, + { + "epoch": 0.9742849965363154, + "grad_norm": 0.13136166334152222, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 255970 + }, + { + "epoch": 0.974323059004438, + "grad_norm": 0.13471366465091705, + "learning_rate": 0.0005, + "loss": 2.1052, + "step": 255980 + }, + { + "epoch": 0.9743611214725608, + "grad_norm": 0.12480809539556503, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 255990 + }, + { + "epoch": 0.9743991839406835, + "grad_norm": 0.1274518221616745, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 256000 + }, + { + "epoch": 0.9744372464088061, + "grad_norm": 0.12159202992916107, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 256010 + }, + { + "epoch": 0.9744753088769288, + "grad_norm": 0.12531977891921997, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 256020 + }, + { + "epoch": 0.9745133713450514, + "grad_norm": 0.15594400465488434, + "learning_rate": 0.0005, + "loss": 2.1049, + "step": 256030 + }, + { + "epoch": 0.9745514338131742, + "grad_norm": 0.16119563579559326, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 256040 + }, + { + "epoch": 0.9745894962812969, + "grad_norm": 0.1301833689212799, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 256050 + }, + { + "epoch": 0.9746275587494195, + "grad_norm": 0.12218587845563889, + "learning_rate": 0.0005, + "loss": 2.0993, + "step": 256060 + }, + { + "epoch": 0.9746656212175422, + "grad_norm": 0.12528090178966522, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 256070 + }, + { + "epoch": 0.974703683685665, + "grad_norm": 0.1265835464000702, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 256080 + }, + { + "epoch": 0.9747417461537876, + "grad_norm": 0.13827918469905853, + "learning_rate": 0.0005, + "loss": 2.1101, + "step": 256090 + }, + { + "epoch": 0.9747798086219103, + "grad_norm": 0.11425582319498062, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 256100 + }, + { + "epoch": 0.9748178710900329, + "grad_norm": 0.14054735004901886, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 256110 + }, + { + "epoch": 0.9748559335581557, + "grad_norm": 0.13398562371730804, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 256120 + }, + { + "epoch": 0.9748939960262784, + "grad_norm": 0.13601486384868622, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 256130 + }, + { + "epoch": 0.974932058494401, + "grad_norm": 0.13452279567718506, + "learning_rate": 0.0005, + "loss": 2.112, + "step": 256140 + }, + { + "epoch": 0.9749701209625237, + "grad_norm": 0.12645120918750763, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 256150 + }, + { + "epoch": 0.9750081834306463, + "grad_norm": 0.15241266787052155, + "learning_rate": 0.0005, + "loss": 2.0933, + "step": 256160 + }, + { + "epoch": 0.9750462458987691, + "grad_norm": 0.12823283672332764, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 256170 + }, + { + "epoch": 0.9750843083668917, + "grad_norm": 0.14209270477294922, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 256180 + }, + { + "epoch": 0.9751223708350144, + "grad_norm": 0.12420807033777237, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 256190 + }, + { + "epoch": 0.9751604333031371, + "grad_norm": 0.12384863197803497, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 256200 + }, + { + "epoch": 0.9751984957712598, + "grad_norm": 0.12487807869911194, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 256210 + }, + { + "epoch": 0.9752365582393825, + "grad_norm": 0.14678291976451874, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 256220 + }, + { + "epoch": 0.9752746207075051, + "grad_norm": 0.12178803980350494, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 256230 + }, + { + "epoch": 0.9753126831756278, + "grad_norm": 0.12519116699695587, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 256240 + }, + { + "epoch": 0.9753507456437506, + "grad_norm": 0.13480907678604126, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 256250 + }, + { + "epoch": 0.9753888081118732, + "grad_norm": 0.13024158775806427, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 256260 + }, + { + "epoch": 0.9754268705799959, + "grad_norm": 0.13576485216617584, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 256270 + }, + { + "epoch": 0.9754649330481185, + "grad_norm": 0.12468088418245316, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 256280 + }, + { + "epoch": 0.9755029955162412, + "grad_norm": 0.15353313088417053, + "learning_rate": 0.0005, + "loss": 2.0962, + "step": 256290 + }, + { + "epoch": 0.975541057984364, + "grad_norm": 0.13093069195747375, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 256300 + }, + { + "epoch": 0.9755791204524866, + "grad_norm": 0.1228041797876358, + "learning_rate": 0.0005, + "loss": 2.1167, + "step": 256310 + }, + { + "epoch": 0.9756171829206093, + "grad_norm": 0.13330857455730438, + "learning_rate": 0.0005, + "loss": 2.0943, + "step": 256320 + }, + { + "epoch": 0.9756552453887319, + "grad_norm": 0.13814891874790192, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 256330 + }, + { + "epoch": 0.9756933078568547, + "grad_norm": 0.14975720643997192, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 256340 + }, + { + "epoch": 0.9757313703249774, + "grad_norm": 0.12255514413118362, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 256350 + }, + { + "epoch": 0.9757694327931, + "grad_norm": 0.12346848845481873, + "learning_rate": 0.0005, + "loss": 2.0936, + "step": 256360 + }, + { + "epoch": 0.9758074952612227, + "grad_norm": 0.12416546046733856, + "learning_rate": 0.0005, + "loss": 2.0857, + "step": 256370 + }, + { + "epoch": 0.9758455577293454, + "grad_norm": 0.12955905497074127, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 256380 + }, + { + "epoch": 0.9758836201974681, + "grad_norm": 0.12957718968391418, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 256390 + }, + { + "epoch": 0.9759216826655908, + "grad_norm": 0.13651643693447113, + "learning_rate": 0.0005, + "loss": 2.117, + "step": 256400 + }, + { + "epoch": 0.9759597451337134, + "grad_norm": 0.1301625818014145, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 256410 + }, + { + "epoch": 0.9759978076018362, + "grad_norm": 0.12493173032999039, + "learning_rate": 0.0005, + "loss": 2.1075, + "step": 256420 + }, + { + "epoch": 0.9760358700699588, + "grad_norm": 0.131301611661911, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 256430 + }, + { + "epoch": 0.9760739325380815, + "grad_norm": 0.12295681983232498, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 256440 + }, + { + "epoch": 0.9761119950062042, + "grad_norm": 0.1482170820236206, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 256450 + }, + { + "epoch": 0.9761500574743268, + "grad_norm": 0.1427377164363861, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 256460 + }, + { + "epoch": 0.9761881199424496, + "grad_norm": 0.135903999209404, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 256470 + }, + { + "epoch": 0.9762261824105722, + "grad_norm": 0.13628506660461426, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 256480 + }, + { + "epoch": 0.9762642448786949, + "grad_norm": 0.12419886887073517, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 256490 + }, + { + "epoch": 0.9763023073468176, + "grad_norm": 0.11160758882761002, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 256500 + }, + { + "epoch": 0.9763403698149403, + "grad_norm": 0.1205335259437561, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 256510 + }, + { + "epoch": 0.976378432283063, + "grad_norm": 0.12622901797294617, + "learning_rate": 0.0005, + "loss": 2.0983, + "step": 256520 + }, + { + "epoch": 0.9764164947511856, + "grad_norm": 0.1198456808924675, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 256530 + }, + { + "epoch": 0.9764545572193083, + "grad_norm": 0.12420783191919327, + "learning_rate": 0.0005, + "loss": 2.0867, + "step": 256540 + }, + { + "epoch": 0.9764926196874311, + "grad_norm": 0.12044156342744827, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 256550 + }, + { + "epoch": 0.9765306821555537, + "grad_norm": 0.12951701879501343, + "learning_rate": 0.0005, + "loss": 2.0966, + "step": 256560 + }, + { + "epoch": 0.9765687446236764, + "grad_norm": 0.1298401802778244, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 256570 + }, + { + "epoch": 0.976606807091799, + "grad_norm": 0.15672166645526886, + "learning_rate": 0.0005, + "loss": 2.0813, + "step": 256580 + }, + { + "epoch": 0.9766448695599217, + "grad_norm": 0.1251695454120636, + "learning_rate": 0.0005, + "loss": 2.1108, + "step": 256590 + }, + { + "epoch": 0.9766829320280445, + "grad_norm": 0.1194787546992302, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 256600 + }, + { + "epoch": 0.9767209944961671, + "grad_norm": 0.13885675370693207, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 256610 + }, + { + "epoch": 0.9767590569642898, + "grad_norm": 0.12232273817062378, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 256620 + }, + { + "epoch": 0.9767971194324124, + "grad_norm": 0.12373682111501694, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 256630 + }, + { + "epoch": 0.9768351819005352, + "grad_norm": 0.13903123140335083, + "learning_rate": 0.0005, + "loss": 2.1126, + "step": 256640 + }, + { + "epoch": 0.9768732443686579, + "grad_norm": 0.11475900560617447, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 256650 + }, + { + "epoch": 0.9769113068367805, + "grad_norm": 0.12367875128984451, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 256660 + }, + { + "epoch": 0.9769493693049032, + "grad_norm": 0.12255427241325378, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 256670 + }, + { + "epoch": 0.9769874317730259, + "grad_norm": 0.1237168237566948, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 256680 + }, + { + "epoch": 0.9770254942411486, + "grad_norm": 0.12001582235097885, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 256690 + }, + { + "epoch": 0.9770635567092713, + "grad_norm": 0.13157282769680023, + "learning_rate": 0.0005, + "loss": 2.1116, + "step": 256700 + }, + { + "epoch": 0.9771016191773939, + "grad_norm": 0.11652835458517075, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 256710 + }, + { + "epoch": 0.9771396816455166, + "grad_norm": 0.12160100042819977, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 256720 + }, + { + "epoch": 0.9771777441136393, + "grad_norm": 0.1366545855998993, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 256730 + }, + { + "epoch": 0.977215806581762, + "grad_norm": 0.12709447741508484, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 256740 + }, + { + "epoch": 0.9772538690498846, + "grad_norm": 0.11968369036912918, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 256750 + }, + { + "epoch": 0.9772919315180073, + "grad_norm": 0.14591091871261597, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 256760 + }, + { + "epoch": 0.9773299939861301, + "grad_norm": 0.12609747052192688, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 256770 + }, + { + "epoch": 0.9773680564542527, + "grad_norm": 0.12567102909088135, + "learning_rate": 0.0005, + "loss": 2.11, + "step": 256780 + }, + { + "epoch": 0.9774061189223754, + "grad_norm": 0.13073763251304626, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 256790 + }, + { + "epoch": 0.977444181390498, + "grad_norm": 0.12760724127292633, + "learning_rate": 0.0005, + "loss": 2.0813, + "step": 256800 + }, + { + "epoch": 0.9774822438586208, + "grad_norm": 0.1319473534822464, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 256810 + }, + { + "epoch": 0.9775203063267435, + "grad_norm": 0.12671951949596405, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 256820 + }, + { + "epoch": 0.9775583687948661, + "grad_norm": 0.11461569368839264, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 256830 + }, + { + "epoch": 0.9775964312629888, + "grad_norm": 0.12155576050281525, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 256840 + }, + { + "epoch": 0.9776344937311116, + "grad_norm": 0.12291843444108963, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 256850 + }, + { + "epoch": 0.9776725561992342, + "grad_norm": 0.12945066392421722, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 256860 + }, + { + "epoch": 0.9777106186673569, + "grad_norm": 0.12481513619422913, + "learning_rate": 0.0005, + "loss": 2.0899, + "step": 256870 + }, + { + "epoch": 0.9777486811354795, + "grad_norm": 0.12574784457683563, + "learning_rate": 0.0005, + "loss": 2.1112, + "step": 256880 + }, + { + "epoch": 0.9777867436036022, + "grad_norm": 0.1317308396100998, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 256890 + }, + { + "epoch": 0.977824806071725, + "grad_norm": 0.1266586184501648, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 256900 + }, + { + "epoch": 0.9778628685398476, + "grad_norm": 0.1240110993385315, + "learning_rate": 0.0005, + "loss": 2.1187, + "step": 256910 + }, + { + "epoch": 0.9779009310079703, + "grad_norm": 0.1257583349943161, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 256920 + }, + { + "epoch": 0.9779389934760929, + "grad_norm": 0.1170019805431366, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 256930 + }, + { + "epoch": 0.9779770559442157, + "grad_norm": 0.11175654083490372, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 256940 + }, + { + "epoch": 0.9780151184123383, + "grad_norm": 0.14043356478214264, + "learning_rate": 0.0005, + "loss": 2.1071, + "step": 256950 + }, + { + "epoch": 0.978053180880461, + "grad_norm": 0.11840981245040894, + "learning_rate": 0.0005, + "loss": 2.0961, + "step": 256960 + }, + { + "epoch": 0.9780912433485837, + "grad_norm": 0.1315300315618515, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 256970 + }, + { + "epoch": 0.9781293058167064, + "grad_norm": 0.1293240189552307, + "learning_rate": 0.0005, + "loss": 2.1022, + "step": 256980 + }, + { + "epoch": 0.9781673682848291, + "grad_norm": 0.13710515201091766, + "learning_rate": 0.0005, + "loss": 2.0878, + "step": 256990 + }, + { + "epoch": 0.9782054307529517, + "grad_norm": 0.1184496060013771, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 257000 + }, + { + "epoch": 0.9782434932210744, + "grad_norm": 0.13085511326789856, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 257010 + }, + { + "epoch": 0.9782815556891971, + "grad_norm": 0.11874709278345108, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 257020 + }, + { + "epoch": 0.9783196181573198, + "grad_norm": 0.14223746955394745, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 257030 + }, + { + "epoch": 0.9783576806254425, + "grad_norm": 0.12502065300941467, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 257040 + }, + { + "epoch": 0.9783957430935651, + "grad_norm": 0.12484754621982574, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 257050 + }, + { + "epoch": 0.9784338055616878, + "grad_norm": 0.11882039904594421, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 257060 + }, + { + "epoch": 0.9784718680298106, + "grad_norm": 0.15266261994838715, + "learning_rate": 0.0005, + "loss": 2.1046, + "step": 257070 + }, + { + "epoch": 0.9785099304979332, + "grad_norm": 0.1286851167678833, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 257080 + }, + { + "epoch": 0.9785479929660559, + "grad_norm": 0.1353611946105957, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 257090 + }, + { + "epoch": 0.9785860554341785, + "grad_norm": 0.12660717964172363, + "learning_rate": 0.0005, + "loss": 2.123, + "step": 257100 + }, + { + "epoch": 0.9786241179023013, + "grad_norm": 0.12552636861801147, + "learning_rate": 0.0005, + "loss": 2.1144, + "step": 257110 + }, + { + "epoch": 0.978662180370424, + "grad_norm": 0.12433915585279465, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 257120 + }, + { + "epoch": 0.9787002428385466, + "grad_norm": 0.12207551300525665, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 257130 + }, + { + "epoch": 0.9787383053066693, + "grad_norm": 0.12603680789470673, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 257140 + }, + { + "epoch": 0.9787763677747919, + "grad_norm": 0.12861515581607819, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 257150 + }, + { + "epoch": 0.9788144302429147, + "grad_norm": 0.1411086916923523, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 257160 + }, + { + "epoch": 0.9788524927110374, + "grad_norm": 0.14067700505256653, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 257170 + }, + { + "epoch": 0.97889055517916, + "grad_norm": 0.12675721943378448, + "learning_rate": 0.0005, + "loss": 2.0871, + "step": 257180 + }, + { + "epoch": 0.9789286176472827, + "grad_norm": 0.12993498146533966, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 257190 + }, + { + "epoch": 0.9789666801154054, + "grad_norm": 0.1242540255188942, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 257200 + }, + { + "epoch": 0.9790047425835281, + "grad_norm": 0.12612493336200714, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 257210 + }, + { + "epoch": 0.9790428050516508, + "grad_norm": 0.11633577197790146, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 257220 + }, + { + "epoch": 0.9790808675197734, + "grad_norm": 0.1153443455696106, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 257230 + }, + { + "epoch": 0.9791189299878962, + "grad_norm": 0.1141563132405281, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 257240 + }, + { + "epoch": 0.9791569924560188, + "grad_norm": 0.12455535680055618, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 257250 + }, + { + "epoch": 0.9791950549241415, + "grad_norm": 0.1222526803612709, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 257260 + }, + { + "epoch": 0.9792331173922642, + "grad_norm": 0.12650750577449799, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 257270 + }, + { + "epoch": 0.9792711798603869, + "grad_norm": 0.1303875595331192, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 257280 + }, + { + "epoch": 0.9793092423285096, + "grad_norm": 0.12938658893108368, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 257290 + }, + { + "epoch": 0.9793473047966322, + "grad_norm": 0.12311175465583801, + "learning_rate": 0.0005, + "loss": 2.0959, + "step": 257300 + }, + { + "epoch": 0.9793853672647549, + "grad_norm": 0.1328628957271576, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 257310 + }, + { + "epoch": 0.9794234297328775, + "grad_norm": 0.1298210620880127, + "learning_rate": 0.0005, + "loss": 2.0907, + "step": 257320 + }, + { + "epoch": 0.9794614922010003, + "grad_norm": 0.13311082124710083, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 257330 + }, + { + "epoch": 0.979499554669123, + "grad_norm": 0.1271749883890152, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 257340 + }, + { + "epoch": 0.9795376171372456, + "grad_norm": 0.1434793770313263, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 257350 + }, + { + "epoch": 0.9795756796053683, + "grad_norm": 0.1282554268836975, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 257360 + }, + { + "epoch": 0.9796137420734911, + "grad_norm": 0.14303486049175262, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 257370 + }, + { + "epoch": 0.9796518045416137, + "grad_norm": 0.13568031787872314, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 257380 + }, + { + "epoch": 0.9796898670097364, + "grad_norm": 0.1360870599746704, + "learning_rate": 0.0005, + "loss": 2.0992, + "step": 257390 + }, + { + "epoch": 0.979727929477859, + "grad_norm": 0.12494190782308578, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 257400 + }, + { + "epoch": 0.9797659919459818, + "grad_norm": 0.11892944574356079, + "learning_rate": 0.0005, + "loss": 2.0945, + "step": 257410 + }, + { + "epoch": 0.9798040544141045, + "grad_norm": 0.1417090743780136, + "learning_rate": 0.0005, + "loss": 2.1158, + "step": 257420 + }, + { + "epoch": 0.9798421168822271, + "grad_norm": 0.1306687444448471, + "learning_rate": 0.0005, + "loss": 2.1011, + "step": 257430 + }, + { + "epoch": 0.9798801793503498, + "grad_norm": 0.14193464815616608, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 257440 + }, + { + "epoch": 0.9799182418184724, + "grad_norm": 0.12267079204320908, + "learning_rate": 0.0005, + "loss": 2.1065, + "step": 257450 + }, + { + "epoch": 0.9799563042865952, + "grad_norm": 0.12777526676654816, + "learning_rate": 0.0005, + "loss": 2.1042, + "step": 257460 + }, + { + "epoch": 0.9799943667547178, + "grad_norm": 0.13828790187835693, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 257470 + }, + { + "epoch": 0.9800324292228405, + "grad_norm": 0.12572868168354034, + "learning_rate": 0.0005, + "loss": 2.1016, + "step": 257480 + }, + { + "epoch": 0.9800704916909632, + "grad_norm": 0.14129850268363953, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 257490 + }, + { + "epoch": 0.9801085541590859, + "grad_norm": 0.12947334349155426, + "learning_rate": 0.0005, + "loss": 2.0897, + "step": 257500 + }, + { + "epoch": 0.9801466166272086, + "grad_norm": 0.1196305900812149, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 257510 + }, + { + "epoch": 0.9801846790953312, + "grad_norm": 0.13480442762374878, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 257520 + }, + { + "epoch": 0.9802227415634539, + "grad_norm": 0.14180704951286316, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 257530 + }, + { + "epoch": 0.9802608040315767, + "grad_norm": 0.12781913578510284, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 257540 + }, + { + "epoch": 0.9802988664996993, + "grad_norm": 0.13860957324504852, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 257550 + }, + { + "epoch": 0.980336928967822, + "grad_norm": 0.13281327486038208, + "learning_rate": 0.0005, + "loss": 2.1025, + "step": 257560 + }, + { + "epoch": 0.9803749914359446, + "grad_norm": 0.1327611654996872, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 257570 + }, + { + "epoch": 0.9804130539040674, + "grad_norm": 0.11932291090488434, + "learning_rate": 0.0005, + "loss": 2.0989, + "step": 257580 + }, + { + "epoch": 0.9804511163721901, + "grad_norm": 0.1157454401254654, + "learning_rate": 0.0005, + "loss": 2.0863, + "step": 257590 + }, + { + "epoch": 0.9804891788403127, + "grad_norm": 0.12068181484937668, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 257600 + }, + { + "epoch": 0.9805272413084354, + "grad_norm": 0.11915294080972672, + "learning_rate": 0.0005, + "loss": 2.1013, + "step": 257610 + }, + { + "epoch": 0.980565303776558, + "grad_norm": 0.11446240544319153, + "learning_rate": 0.0005, + "loss": 2.1077, + "step": 257620 + }, + { + "epoch": 0.9806033662446808, + "grad_norm": 0.1247885599732399, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 257630 + }, + { + "epoch": 0.9806414287128035, + "grad_norm": 0.12949500977993011, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 257640 + }, + { + "epoch": 0.9806794911809261, + "grad_norm": 0.11171896755695343, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 257650 + }, + { + "epoch": 0.9807175536490488, + "grad_norm": 0.13205240666866302, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 257660 + }, + { + "epoch": 0.9807556161171715, + "grad_norm": 0.15095172822475433, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 257670 + }, + { + "epoch": 0.9807936785852942, + "grad_norm": 0.14866958558559418, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 257680 + }, + { + "epoch": 0.9808317410534169, + "grad_norm": 0.13984258472919464, + "learning_rate": 0.0005, + "loss": 2.1007, + "step": 257690 + }, + { + "epoch": 0.9808698035215395, + "grad_norm": 0.12085743993520737, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 257700 + }, + { + "epoch": 0.9809078659896623, + "grad_norm": 0.11848316341638565, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 257710 + }, + { + "epoch": 0.9809459284577849, + "grad_norm": 0.12194575369358063, + "learning_rate": 0.0005, + "loss": 2.0837, + "step": 257720 + }, + { + "epoch": 0.9809839909259076, + "grad_norm": 0.12809818983078003, + "learning_rate": 0.0005, + "loss": 2.092, + "step": 257730 + }, + { + "epoch": 0.9810220533940303, + "grad_norm": 0.13447466492652893, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 257740 + }, + { + "epoch": 0.9810601158621529, + "grad_norm": 0.1250292956829071, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 257750 + }, + { + "epoch": 0.9810981783302757, + "grad_norm": 0.14565639197826385, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 257760 + }, + { + "epoch": 0.9811362407983983, + "grad_norm": 0.11911211162805557, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 257770 + }, + { + "epoch": 0.981174303266521, + "grad_norm": 0.12898415327072144, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 257780 + }, + { + "epoch": 0.9812123657346437, + "grad_norm": 0.11979299783706665, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 257790 + }, + { + "epoch": 0.9812504282027664, + "grad_norm": 0.13223201036453247, + "learning_rate": 0.0005, + "loss": 2.0912, + "step": 257800 + }, + { + "epoch": 0.9812884906708891, + "grad_norm": 0.13081419467926025, + "learning_rate": 0.0005, + "loss": 2.1162, + "step": 257810 + }, + { + "epoch": 0.9813265531390117, + "grad_norm": 0.1428884118795395, + "learning_rate": 0.0005, + "loss": 2.1024, + "step": 257820 + }, + { + "epoch": 0.9813646156071344, + "grad_norm": 0.13086241483688354, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 257830 + }, + { + "epoch": 0.9814026780752572, + "grad_norm": 0.1303388625383377, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 257840 + }, + { + "epoch": 0.9814407405433798, + "grad_norm": 0.1248994842171669, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 257850 + }, + { + "epoch": 0.9814788030115025, + "grad_norm": 0.12460754811763763, + "learning_rate": 0.0005, + "loss": 2.104, + "step": 257860 + }, + { + "epoch": 0.9815168654796251, + "grad_norm": 0.11974826455116272, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 257870 + }, + { + "epoch": 0.9815549279477478, + "grad_norm": 0.1300465613603592, + "learning_rate": 0.0005, + "loss": 2.0892, + "step": 257880 + }, + { + "epoch": 0.9815929904158706, + "grad_norm": 0.14067713916301727, + "learning_rate": 0.0005, + "loss": 2.099, + "step": 257890 + }, + { + "epoch": 0.9816310528839932, + "grad_norm": 0.1349792778491974, + "learning_rate": 0.0005, + "loss": 2.0985, + "step": 257900 + }, + { + "epoch": 0.9816691153521159, + "grad_norm": 0.13326580822467804, + "learning_rate": 0.0005, + "loss": 2.0921, + "step": 257910 + }, + { + "epoch": 0.9817071778202385, + "grad_norm": 0.2773866057395935, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 257920 + }, + { + "epoch": 0.9817452402883613, + "grad_norm": 0.12216406315565109, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 257930 + }, + { + "epoch": 0.981783302756484, + "grad_norm": 0.14116701483726501, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 257940 + }, + { + "epoch": 0.9818213652246066, + "grad_norm": 0.12974095344543457, + "learning_rate": 0.0005, + "loss": 2.1021, + "step": 257950 + }, + { + "epoch": 0.9818594276927293, + "grad_norm": 0.11997047066688538, + "learning_rate": 0.0005, + "loss": 2.0941, + "step": 257960 + }, + { + "epoch": 0.981897490160852, + "grad_norm": 0.13372445106506348, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 257970 + }, + { + "epoch": 0.9819355526289747, + "grad_norm": 0.13414129614830017, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 257980 + }, + { + "epoch": 0.9819736150970974, + "grad_norm": 0.12741310894489288, + "learning_rate": 0.0005, + "loss": 2.0884, + "step": 257990 + }, + { + "epoch": 0.98201167756522, + "grad_norm": 0.12601037323474884, + "learning_rate": 0.0005, + "loss": 2.106, + "step": 258000 + }, + { + "epoch": 0.9820497400333428, + "grad_norm": 0.1299891471862793, + "learning_rate": 0.0005, + "loss": 2.1115, + "step": 258010 + }, + { + "epoch": 0.9820878025014654, + "grad_norm": 0.13461840152740479, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 258020 + }, + { + "epoch": 0.9821258649695881, + "grad_norm": 0.12584570050239563, + "learning_rate": 0.0005, + "loss": 2.0969, + "step": 258030 + }, + { + "epoch": 0.9821639274377107, + "grad_norm": 0.13351255655288696, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 258040 + }, + { + "epoch": 0.9822019899058334, + "grad_norm": 0.12079761922359467, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 258050 + }, + { + "epoch": 0.9822400523739562, + "grad_norm": 0.12505701184272766, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 258060 + }, + { + "epoch": 0.9822781148420788, + "grad_norm": 0.13210898637771606, + "learning_rate": 0.0005, + "loss": 2.0924, + "step": 258070 + }, + { + "epoch": 0.9823161773102015, + "grad_norm": 0.1253563016653061, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 258080 + }, + { + "epoch": 0.9823542397783241, + "grad_norm": 0.12380914390087128, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 258090 + }, + { + "epoch": 0.9823923022464469, + "grad_norm": 0.1306920349597931, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 258100 + }, + { + "epoch": 0.9824303647145696, + "grad_norm": 0.13246990740299225, + "learning_rate": 0.0005, + "loss": 2.1017, + "step": 258110 + }, + { + "epoch": 0.9824684271826922, + "grad_norm": 0.1187557801604271, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 258120 + }, + { + "epoch": 0.9825064896508149, + "grad_norm": 0.13110314309597015, + "learning_rate": 0.0005, + "loss": 2.0881, + "step": 258130 + }, + { + "epoch": 0.9825445521189377, + "grad_norm": 0.14428871870040894, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 258140 + }, + { + "epoch": 0.9825826145870603, + "grad_norm": 0.13296005129814148, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 258150 + }, + { + "epoch": 0.982620677055183, + "grad_norm": 0.11815083771944046, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 258160 + }, + { + "epoch": 0.9826587395233056, + "grad_norm": 0.12260029464960098, + "learning_rate": 0.0005, + "loss": 2.1093, + "step": 258170 + }, + { + "epoch": 0.9826968019914283, + "grad_norm": 0.13770967721939087, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 258180 + }, + { + "epoch": 0.982734864459551, + "grad_norm": 0.12869694828987122, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 258190 + }, + { + "epoch": 0.9827729269276737, + "grad_norm": 0.14327187836170197, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 258200 + }, + { + "epoch": 0.9828109893957964, + "grad_norm": 0.12928041815757751, + "learning_rate": 0.0005, + "loss": 2.0949, + "step": 258210 + }, + { + "epoch": 0.982849051863919, + "grad_norm": 0.11430586129426956, + "learning_rate": 0.0005, + "loss": 2.0952, + "step": 258220 + }, + { + "epoch": 0.9828871143320418, + "grad_norm": 0.12899106740951538, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 258230 + }, + { + "epoch": 0.9829251768001644, + "grad_norm": 0.12435305863618851, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 258240 + }, + { + "epoch": 0.9829632392682871, + "grad_norm": 0.13388924300670624, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 258250 + }, + { + "epoch": 0.9830013017364098, + "grad_norm": 0.12655743956565857, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 258260 + }, + { + "epoch": 0.9830393642045325, + "grad_norm": 0.13059300184249878, + "learning_rate": 0.0005, + "loss": 2.1105, + "step": 258270 + }, + { + "epoch": 0.9830774266726552, + "grad_norm": 0.12760014832019806, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 258280 + }, + { + "epoch": 0.9831154891407778, + "grad_norm": 0.12506812810897827, + "learning_rate": 0.0005, + "loss": 2.0887, + "step": 258290 + }, + { + "epoch": 0.9831535516089005, + "grad_norm": 0.1192302331328392, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 258300 + }, + { + "epoch": 0.9831916140770232, + "grad_norm": 0.12790679931640625, + "learning_rate": 0.0005, + "loss": 2.1012, + "step": 258310 + }, + { + "epoch": 0.9832296765451459, + "grad_norm": 0.11708279699087143, + "learning_rate": 0.0005, + "loss": 2.083, + "step": 258320 + }, + { + "epoch": 0.9832677390132686, + "grad_norm": 0.1344541311264038, + "learning_rate": 0.0005, + "loss": 2.1002, + "step": 258330 + }, + { + "epoch": 0.9833058014813912, + "grad_norm": 0.11863990128040314, + "learning_rate": 0.0005, + "loss": 2.1129, + "step": 258340 + }, + { + "epoch": 0.9833438639495139, + "grad_norm": 0.1262895166873932, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 258350 + }, + { + "epoch": 0.9833819264176367, + "grad_norm": 0.12119850516319275, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 258360 + }, + { + "epoch": 0.9834199888857593, + "grad_norm": 0.14586344361305237, + "learning_rate": 0.0005, + "loss": 2.0944, + "step": 258370 + }, + { + "epoch": 0.983458051353882, + "grad_norm": 0.131086528301239, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 258380 + }, + { + "epoch": 0.9834961138220046, + "grad_norm": 0.12114156782627106, + "learning_rate": 0.0005, + "loss": 2.1026, + "step": 258390 + }, + { + "epoch": 0.9835341762901274, + "grad_norm": 0.11951122432947159, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 258400 + }, + { + "epoch": 0.9835722387582501, + "grad_norm": 0.1404489427804947, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 258410 + }, + { + "epoch": 0.9836103012263727, + "grad_norm": 0.12836606800556183, + "learning_rate": 0.0005, + "loss": 2.1051, + "step": 258420 + }, + { + "epoch": 0.9836483636944954, + "grad_norm": 0.1190219521522522, + "learning_rate": 0.0005, + "loss": 2.1099, + "step": 258430 + }, + { + "epoch": 0.9836864261626181, + "grad_norm": 0.11856423318386078, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 258440 + }, + { + "epoch": 0.9837244886307408, + "grad_norm": 0.11622331291437149, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 258450 + }, + { + "epoch": 0.9837625510988635, + "grad_norm": 0.1283189356327057, + "learning_rate": 0.0005, + "loss": 2.09, + "step": 258460 + }, + { + "epoch": 0.9838006135669861, + "grad_norm": 0.1327824890613556, + "learning_rate": 0.0005, + "loss": 2.1166, + "step": 258470 + }, + { + "epoch": 0.9838386760351088, + "grad_norm": 0.13173063099384308, + "learning_rate": 0.0005, + "loss": 2.1182, + "step": 258480 + }, + { + "epoch": 0.9838767385032315, + "grad_norm": 0.1208709105849266, + "learning_rate": 0.0005, + "loss": 2.0835, + "step": 258490 + }, + { + "epoch": 0.9839148009713542, + "grad_norm": 0.13490121066570282, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 258500 + }, + { + "epoch": 0.9839528634394769, + "grad_norm": 0.12017489224672318, + "learning_rate": 0.0005, + "loss": 2.0999, + "step": 258510 + }, + { + "epoch": 0.9839909259075995, + "grad_norm": 0.12940183281898499, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 258520 + }, + { + "epoch": 0.9840289883757223, + "grad_norm": 0.11870049685239792, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 258530 + }, + { + "epoch": 0.9840670508438449, + "grad_norm": 0.1260661631822586, + "learning_rate": 0.0005, + "loss": 2.1086, + "step": 258540 + }, + { + "epoch": 0.9841051133119676, + "grad_norm": 0.11173855513334274, + "learning_rate": 0.0005, + "loss": 2.1008, + "step": 258550 + }, + { + "epoch": 0.9841431757800903, + "grad_norm": 0.11546964198350906, + "learning_rate": 0.0005, + "loss": 2.0851, + "step": 258560 + }, + { + "epoch": 0.984181238248213, + "grad_norm": 0.12497804313898087, + "learning_rate": 0.0005, + "loss": 2.0978, + "step": 258570 + }, + { + "epoch": 0.9842193007163357, + "grad_norm": 0.14110919833183289, + "learning_rate": 0.0005, + "loss": 2.1159, + "step": 258580 + }, + { + "epoch": 0.9842573631844583, + "grad_norm": 0.12901490926742554, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 258590 + }, + { + "epoch": 0.984295425652581, + "grad_norm": 0.1393149495124817, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 258600 + }, + { + "epoch": 0.9843334881207036, + "grad_norm": 0.14474362134933472, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 258610 + }, + { + "epoch": 0.9843715505888264, + "grad_norm": 0.1445140540599823, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 258620 + }, + { + "epoch": 0.9844096130569491, + "grad_norm": 0.1387968510389328, + "learning_rate": 0.0005, + "loss": 2.0849, + "step": 258630 + }, + { + "epoch": 0.9844476755250717, + "grad_norm": 0.1392017900943756, + "learning_rate": 0.0005, + "loss": 2.1015, + "step": 258640 + }, + { + "epoch": 0.9844857379931944, + "grad_norm": 0.11866472661495209, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 258650 + }, + { + "epoch": 0.9845238004613172, + "grad_norm": 0.1393352746963501, + "learning_rate": 0.0005, + "loss": 2.1037, + "step": 258660 + }, + { + "epoch": 0.9845618629294398, + "grad_norm": 0.12474837154150009, + "learning_rate": 0.0005, + "loss": 2.1053, + "step": 258670 + }, + { + "epoch": 0.9845999253975625, + "grad_norm": 0.12675200402736664, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 258680 + }, + { + "epoch": 0.9846379878656851, + "grad_norm": 0.11997582763433456, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 258690 + }, + { + "epoch": 0.9846760503338079, + "grad_norm": 0.12846367061138153, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 258700 + }, + { + "epoch": 0.9847141128019306, + "grad_norm": 0.12774696946144104, + "learning_rate": 0.0005, + "loss": 2.108, + "step": 258710 + }, + { + "epoch": 0.9847521752700532, + "grad_norm": 0.22226040065288544, + "learning_rate": 0.0005, + "loss": 2.1133, + "step": 258720 + }, + { + "epoch": 0.9847902377381759, + "grad_norm": 0.12668132781982422, + "learning_rate": 0.0005, + "loss": 2.0915, + "step": 258730 + }, + { + "epoch": 0.9848283002062985, + "grad_norm": 0.1319456696510315, + "learning_rate": 0.0005, + "loss": 2.082, + "step": 258740 + }, + { + "epoch": 0.9848663626744213, + "grad_norm": 0.12583576142787933, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 258750 + }, + { + "epoch": 0.984904425142544, + "grad_norm": 0.12175232172012329, + "learning_rate": 0.0005, + "loss": 2.1044, + "step": 258760 + }, + { + "epoch": 0.9849424876106666, + "grad_norm": 0.12503385543823242, + "learning_rate": 0.0005, + "loss": 2.0931, + "step": 258770 + }, + { + "epoch": 0.9849805500787893, + "grad_norm": 0.13150545954704285, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 258780 + }, + { + "epoch": 0.985018612546912, + "grad_norm": 0.1303718388080597, + "learning_rate": 0.0005, + "loss": 2.1056, + "step": 258790 + }, + { + "epoch": 0.9850566750150347, + "grad_norm": 0.1266319453716278, + "learning_rate": 0.0005, + "loss": 2.0984, + "step": 258800 + }, + { + "epoch": 0.9850947374831573, + "grad_norm": 0.13897955417633057, + "learning_rate": 0.0005, + "loss": 2.0937, + "step": 258810 + }, + { + "epoch": 0.98513279995128, + "grad_norm": 0.13643626868724823, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 258820 + }, + { + "epoch": 0.9851708624194028, + "grad_norm": 0.129782572388649, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 258830 + }, + { + "epoch": 0.9852089248875254, + "grad_norm": 0.11904452741146088, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 258840 + }, + { + "epoch": 0.9852469873556481, + "grad_norm": 0.13108906149864197, + "learning_rate": 0.0005, + "loss": 2.107, + "step": 258850 + }, + { + "epoch": 0.9852850498237707, + "grad_norm": 0.12430950999259949, + "learning_rate": 0.0005, + "loss": 2.0938, + "step": 258860 + }, + { + "epoch": 0.9853231122918935, + "grad_norm": 0.1274895966053009, + "learning_rate": 0.0005, + "loss": 2.0906, + "step": 258870 + }, + { + "epoch": 0.9853611747600162, + "grad_norm": 0.11865878850221634, + "learning_rate": 0.0005, + "loss": 2.077, + "step": 258880 + }, + { + "epoch": 0.9853992372281388, + "grad_norm": 0.11411301791667938, + "learning_rate": 0.0005, + "loss": 2.1038, + "step": 258890 + }, + { + "epoch": 0.9854372996962615, + "grad_norm": 0.13757802546024323, + "learning_rate": 0.0005, + "loss": 2.1019, + "step": 258900 + }, + { + "epoch": 0.9854753621643841, + "grad_norm": 0.15388959646224976, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 258910 + }, + { + "epoch": 0.9855134246325069, + "grad_norm": 0.13236352801322937, + "learning_rate": 0.0005, + "loss": 2.1124, + "step": 258920 + }, + { + "epoch": 0.9855514871006296, + "grad_norm": 0.11857990175485611, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 258930 + }, + { + "epoch": 0.9855895495687522, + "grad_norm": 0.1266779899597168, + "learning_rate": 0.0005, + "loss": 2.102, + "step": 258940 + }, + { + "epoch": 0.9856276120368749, + "grad_norm": 0.14129385352134705, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 258950 + }, + { + "epoch": 0.9856656745049976, + "grad_norm": 0.12670108675956726, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 258960 + }, + { + "epoch": 0.9857037369731203, + "grad_norm": 0.1306104212999344, + "learning_rate": 0.0005, + "loss": 2.1031, + "step": 258970 + }, + { + "epoch": 0.985741799441243, + "grad_norm": 0.6689227223396301, + "learning_rate": 0.0005, + "loss": 2.0893, + "step": 258980 + }, + { + "epoch": 0.9857798619093656, + "grad_norm": 0.1370919644832611, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 258990 + }, + { + "epoch": 0.9858179243774884, + "grad_norm": 0.12262406200170517, + "learning_rate": 0.0005, + "loss": 2.0986, + "step": 259000 + }, + { + "epoch": 0.985855986845611, + "grad_norm": 0.1291884481906891, + "learning_rate": 0.0005, + "loss": 2.1055, + "step": 259010 + }, + { + "epoch": 0.9858940493137337, + "grad_norm": 0.13689911365509033, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 259020 + }, + { + "epoch": 0.9859321117818564, + "grad_norm": 0.1361452043056488, + "learning_rate": 0.0005, + "loss": 2.0923, + "step": 259030 + }, + { + "epoch": 0.985970174249979, + "grad_norm": 0.12106503546237946, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 259040 + }, + { + "epoch": 0.9860082367181018, + "grad_norm": 0.13545392453670502, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 259050 + }, + { + "epoch": 0.9860462991862244, + "grad_norm": 0.12301244586706161, + "learning_rate": 0.0005, + "loss": 2.0975, + "step": 259060 + }, + { + "epoch": 0.9860843616543471, + "grad_norm": 0.11876979470252991, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 259070 + }, + { + "epoch": 0.9861224241224698, + "grad_norm": 0.11995098739862442, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 259080 + }, + { + "epoch": 0.9861604865905925, + "grad_norm": 0.13658028841018677, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 259090 + }, + { + "epoch": 0.9861985490587152, + "grad_norm": 0.12490151822566986, + "learning_rate": 0.0005, + "loss": 2.105, + "step": 259100 + }, + { + "epoch": 0.9862366115268378, + "grad_norm": 0.11956927925348282, + "learning_rate": 0.0005, + "loss": 2.0995, + "step": 259110 + }, + { + "epoch": 0.9862746739949605, + "grad_norm": 0.14448417723178864, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 259120 + }, + { + "epoch": 0.9863127364630833, + "grad_norm": 0.13080665469169617, + "learning_rate": 0.0005, + "loss": 2.116, + "step": 259130 + }, + { + "epoch": 0.9863507989312059, + "grad_norm": 0.116084985435009, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 259140 + }, + { + "epoch": 0.9863888613993286, + "grad_norm": 0.13065792620182037, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 259150 + }, + { + "epoch": 0.9864269238674512, + "grad_norm": 0.1385491043329239, + "learning_rate": 0.0005, + "loss": 2.1058, + "step": 259160 + }, + { + "epoch": 0.9864649863355739, + "grad_norm": 0.11696495860815048, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 259170 + }, + { + "epoch": 0.9865030488036967, + "grad_norm": 0.1375739574432373, + "learning_rate": 0.0005, + "loss": 2.0982, + "step": 259180 + }, + { + "epoch": 0.9865411112718193, + "grad_norm": 0.13284343481063843, + "learning_rate": 0.0005, + "loss": 2.1221, + "step": 259190 + }, + { + "epoch": 0.986579173739942, + "grad_norm": 0.12562021613121033, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 259200 + }, + { + "epoch": 0.9866172362080646, + "grad_norm": 0.11927879601716995, + "learning_rate": 0.0005, + "loss": 2.0876, + "step": 259210 + }, + { + "epoch": 0.9866552986761874, + "grad_norm": 0.12254306674003601, + "learning_rate": 0.0005, + "loss": 2.1014, + "step": 259220 + }, + { + "epoch": 0.98669336114431, + "grad_norm": 0.1201302781701088, + "learning_rate": 0.0005, + "loss": 2.0942, + "step": 259230 + }, + { + "epoch": 0.9867314236124327, + "grad_norm": 0.12187743186950684, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 259240 + }, + { + "epoch": 0.9867694860805554, + "grad_norm": 0.11292102187871933, + "learning_rate": 0.0005, + "loss": 2.0807, + "step": 259250 + }, + { + "epoch": 0.9868075485486781, + "grad_norm": 0.12742412090301514, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 259260 + }, + { + "epoch": 0.9868456110168008, + "grad_norm": 0.1279342919588089, + "learning_rate": 0.0005, + "loss": 2.1088, + "step": 259270 + }, + { + "epoch": 0.9868836734849235, + "grad_norm": 0.12992903590202332, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 259280 + }, + { + "epoch": 0.9869217359530461, + "grad_norm": 0.14792388677597046, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 259290 + }, + { + "epoch": 0.9869597984211689, + "grad_norm": 0.13319194316864014, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 259300 + }, + { + "epoch": 0.9869978608892915, + "grad_norm": 0.1406574845314026, + "learning_rate": 0.0005, + "loss": 2.1123, + "step": 259310 + }, + { + "epoch": 0.9870359233574142, + "grad_norm": 0.12122943997383118, + "learning_rate": 0.0005, + "loss": 2.094, + "step": 259320 + }, + { + "epoch": 0.9870739858255368, + "grad_norm": 0.12128177285194397, + "learning_rate": 0.0005, + "loss": 2.111, + "step": 259330 + }, + { + "epoch": 0.9871120482936595, + "grad_norm": 0.1400650441646576, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 259340 + }, + { + "epoch": 0.9871501107617823, + "grad_norm": 0.1302204579114914, + "learning_rate": 0.0005, + "loss": 2.1148, + "step": 259350 + }, + { + "epoch": 0.9871881732299049, + "grad_norm": 0.12166402488946915, + "learning_rate": 0.0005, + "loss": 2.0991, + "step": 259360 + }, + { + "epoch": 0.9872262356980276, + "grad_norm": 0.1307765245437622, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 259370 + }, + { + "epoch": 0.9872642981661502, + "grad_norm": 0.1271083652973175, + "learning_rate": 0.0005, + "loss": 2.0974, + "step": 259380 + }, + { + "epoch": 0.987302360634273, + "grad_norm": 0.12263017147779465, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 259390 + }, + { + "epoch": 0.9873404231023957, + "grad_norm": 0.13138532638549805, + "learning_rate": 0.0005, + "loss": 2.0947, + "step": 259400 + }, + { + "epoch": 0.9873784855705183, + "grad_norm": 0.1262940764427185, + "learning_rate": 0.0005, + "loss": 2.0935, + "step": 259410 + }, + { + "epoch": 0.987416548038641, + "grad_norm": 0.12203992903232574, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 259420 + }, + { + "epoch": 0.9874546105067638, + "grad_norm": 0.13928186893463135, + "learning_rate": 0.0005, + "loss": 2.1157, + "step": 259430 + }, + { + "epoch": 0.9874926729748864, + "grad_norm": 0.13313859701156616, + "learning_rate": 0.0005, + "loss": 2.0918, + "step": 259440 + }, + { + "epoch": 0.9875307354430091, + "grad_norm": 0.1350223273038864, + "learning_rate": 0.0005, + "loss": 2.1118, + "step": 259450 + }, + { + "epoch": 0.9875687979111317, + "grad_norm": 0.12552247941493988, + "learning_rate": 0.0005, + "loss": 2.0922, + "step": 259460 + }, + { + "epoch": 0.9876068603792544, + "grad_norm": 0.1323108673095703, + "learning_rate": 0.0005, + "loss": 2.083, + "step": 259470 + }, + { + "epoch": 0.9876449228473771, + "grad_norm": 0.12774093449115753, + "learning_rate": 0.0005, + "loss": 2.1119, + "step": 259480 + }, + { + "epoch": 0.9876829853154998, + "grad_norm": 0.14221549034118652, + "learning_rate": 0.0005, + "loss": 2.0833, + "step": 259490 + }, + { + "epoch": 0.9877210477836225, + "grad_norm": 0.13134565949440002, + "learning_rate": 0.0005, + "loss": 2.0932, + "step": 259500 + }, + { + "epoch": 0.9877591102517451, + "grad_norm": 0.12980684638023376, + "learning_rate": 0.0005, + "loss": 2.0976, + "step": 259510 + }, + { + "epoch": 0.9877971727198679, + "grad_norm": 0.13035202026367188, + "learning_rate": 0.0005, + "loss": 2.1068, + "step": 259520 + }, + { + "epoch": 0.9878352351879905, + "grad_norm": 0.13434964418411255, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 259530 + }, + { + "epoch": 0.9878732976561132, + "grad_norm": 0.12856483459472656, + "learning_rate": 0.0005, + "loss": 2.1048, + "step": 259540 + }, + { + "epoch": 0.9879113601242359, + "grad_norm": 0.1277189999818802, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 259550 + }, + { + "epoch": 0.9879494225923586, + "grad_norm": 0.1284840852022171, + "learning_rate": 0.0005, + "loss": 2.1032, + "step": 259560 + }, + { + "epoch": 0.9879874850604813, + "grad_norm": 0.11731751263141632, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 259570 + }, + { + "epoch": 0.9880255475286039, + "grad_norm": 0.13612335920333862, + "learning_rate": 0.0005, + "loss": 2.0886, + "step": 259580 + }, + { + "epoch": 0.9880636099967266, + "grad_norm": 0.12836602330207825, + "learning_rate": 0.0005, + "loss": 2.0996, + "step": 259590 + }, + { + "epoch": 0.9881016724648493, + "grad_norm": 0.13179896771907806, + "learning_rate": 0.0005, + "loss": 2.1122, + "step": 259600 + }, + { + "epoch": 0.988139734932972, + "grad_norm": 0.1250675916671753, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 259610 + }, + { + "epoch": 0.9881777974010947, + "grad_norm": 0.12391552329063416, + "learning_rate": 0.0005, + "loss": 2.1109, + "step": 259620 + }, + { + "epoch": 0.9882158598692173, + "grad_norm": 0.1251387745141983, + "learning_rate": 0.0005, + "loss": 2.1006, + "step": 259630 + }, + { + "epoch": 0.98825392233734, + "grad_norm": 0.13607516884803772, + "learning_rate": 0.0005, + "loss": 2.0868, + "step": 259640 + }, + { + "epoch": 0.9882919848054628, + "grad_norm": 0.1332910805940628, + "learning_rate": 0.0005, + "loss": 2.0971, + "step": 259650 + }, + { + "epoch": 0.9883300472735854, + "grad_norm": 0.12459276616573334, + "learning_rate": 0.0005, + "loss": 2.0953, + "step": 259660 + }, + { + "epoch": 0.9883681097417081, + "grad_norm": 0.14937591552734375, + "learning_rate": 0.0005, + "loss": 2.1177, + "step": 259670 + }, + { + "epoch": 0.9884061722098307, + "grad_norm": 0.11843692511320114, + "learning_rate": 0.0005, + "loss": 2.0845, + "step": 259680 + }, + { + "epoch": 0.9884442346779535, + "grad_norm": 0.13042514026165009, + "learning_rate": 0.0005, + "loss": 2.1059, + "step": 259690 + }, + { + "epoch": 0.9884822971460762, + "grad_norm": 0.12332228571176529, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 259700 + }, + { + "epoch": 0.9885203596141988, + "grad_norm": 0.12637607753276825, + "learning_rate": 0.0005, + "loss": 2.0998, + "step": 259710 + }, + { + "epoch": 0.9885584220823215, + "grad_norm": 0.11896184086799622, + "learning_rate": 0.0005, + "loss": 2.0866, + "step": 259720 + }, + { + "epoch": 0.9885964845504442, + "grad_norm": 0.118443563580513, + "learning_rate": 0.0005, + "loss": 2.1033, + "step": 259730 + }, + { + "epoch": 0.9886345470185669, + "grad_norm": 0.12883390486240387, + "learning_rate": 0.0005, + "loss": 2.1018, + "step": 259740 + }, + { + "epoch": 0.9886726094866896, + "grad_norm": 0.12184642255306244, + "learning_rate": 0.0005, + "loss": 2.093, + "step": 259750 + }, + { + "epoch": 0.9887106719548122, + "grad_norm": 0.12203714996576309, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 259760 + }, + { + "epoch": 0.9887487344229349, + "grad_norm": 0.13342252373695374, + "learning_rate": 0.0005, + "loss": 2.1098, + "step": 259770 + }, + { + "epoch": 0.9887867968910576, + "grad_norm": 0.12597885727882385, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 259780 + }, + { + "epoch": 0.9888248593591803, + "grad_norm": 0.13244442641735077, + "learning_rate": 0.0005, + "loss": 2.0948, + "step": 259790 + }, + { + "epoch": 0.988862921827303, + "grad_norm": 0.1264996975660324, + "learning_rate": 0.0005, + "loss": 2.1062, + "step": 259800 + }, + { + "epoch": 0.9889009842954256, + "grad_norm": 0.12919080257415771, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 259810 + }, + { + "epoch": 0.9889390467635484, + "grad_norm": 0.11465083807706833, + "learning_rate": 0.0005, + "loss": 2.1043, + "step": 259820 + }, + { + "epoch": 0.988977109231671, + "grad_norm": 0.14477968215942383, + "learning_rate": 0.0005, + "loss": 2.0895, + "step": 259830 + }, + { + "epoch": 0.9890151716997937, + "grad_norm": 0.13007590174674988, + "learning_rate": 0.0005, + "loss": 2.0981, + "step": 259840 + }, + { + "epoch": 0.9890532341679164, + "grad_norm": 0.1438094526529312, + "learning_rate": 0.0005, + "loss": 2.1073, + "step": 259850 + }, + { + "epoch": 0.9890912966360391, + "grad_norm": 0.1194855123758316, + "learning_rate": 0.0005, + "loss": 2.1134, + "step": 259860 + }, + { + "epoch": 0.9891293591041618, + "grad_norm": 0.13489076495170593, + "learning_rate": 0.0005, + "loss": 2.1001, + "step": 259870 + }, + { + "epoch": 0.9891674215722844, + "grad_norm": 0.12890882790088654, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 259880 + }, + { + "epoch": 0.9892054840404071, + "grad_norm": 0.12260407209396362, + "learning_rate": 0.0005, + "loss": 2.1036, + "step": 259890 + }, + { + "epoch": 0.9892435465085297, + "grad_norm": 0.13415499031543732, + "learning_rate": 0.0005, + "loss": 2.0939, + "step": 259900 + }, + { + "epoch": 0.9892816089766525, + "grad_norm": 0.13491657376289368, + "learning_rate": 0.0005, + "loss": 2.0905, + "step": 259910 + }, + { + "epoch": 0.9893196714447752, + "grad_norm": 0.14124566316604614, + "learning_rate": 0.0005, + "loss": 2.1005, + "step": 259920 + }, + { + "epoch": 0.9893577339128978, + "grad_norm": 0.12449941784143448, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 259930 + }, + { + "epoch": 0.9893957963810205, + "grad_norm": 0.1191040500998497, + "learning_rate": 0.0005, + "loss": 2.1082, + "step": 259940 + }, + { + "epoch": 0.9894338588491433, + "grad_norm": 0.14282147586345673, + "learning_rate": 0.0005, + "loss": 2.0846, + "step": 259950 + }, + { + "epoch": 0.9894719213172659, + "grad_norm": 0.11729595065116882, + "learning_rate": 0.0005, + "loss": 2.089, + "step": 259960 + }, + { + "epoch": 0.9895099837853886, + "grad_norm": 0.12624691426753998, + "learning_rate": 0.0005, + "loss": 2.1154, + "step": 259970 + }, + { + "epoch": 0.9895480462535112, + "grad_norm": 0.1268104761838913, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 259980 + }, + { + "epoch": 0.989586108721634, + "grad_norm": 0.12314214557409286, + "learning_rate": 0.0005, + "loss": 2.0865, + "step": 259990 + }, + { + "epoch": 0.9896241711897567, + "grad_norm": 0.12357240170240402, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 260000 + }, + { + "epoch": 0.9896622336578793, + "grad_norm": 0.1274331957101822, + "learning_rate": 0.000493545027756321, + "loss": 2.0947, + "step": 260010 + }, + { + "epoch": 0.989700296126002, + "grad_norm": 0.13679982721805573, + "learning_rate": 0.0004908712907082472, + "loss": 2.0972, + "step": 260020 + }, + { + "epoch": 0.9897383585941246, + "grad_norm": 0.12808233499526978, + "learning_rate": 0.000488819660112501, + "loss": 2.1004, + "step": 260030 + }, + { + "epoch": 0.9897764210622474, + "grad_norm": 0.1215348020195961, + "learning_rate": 0.000487090055512642, + "loss": 2.0961, + "step": 260040 + }, + { + "epoch": 0.98981448353037, + "grad_norm": 0.11865545064210892, + "learning_rate": 0.0004855662432702594, + "loss": 2.0994, + "step": 260050 + }, + { + "epoch": 0.9898525459984927, + "grad_norm": 0.12360881268978119, + "learning_rate": 0.0004841886116991581, + "loss": 2.103, + "step": 260060 + }, + { + "epoch": 0.9898906084666154, + "grad_norm": 0.13874591886997223, + "learning_rate": 0.00048292174872340064, + "loss": 2.0962, + "step": 260070 + }, + { + "epoch": 0.9899286709347381, + "grad_norm": 0.12478182464838028, + "learning_rate": 0.00048174258141649444, + "loss": 2.084, + "step": 260080 + }, + { + "epoch": 0.9899667334028608, + "grad_norm": 0.1325121968984604, + "learning_rate": 0.00048063508326896294, + "loss": 2.0916, + "step": 260090 + }, + { + "epoch": 0.9900047958709834, + "grad_norm": 0.13993875682353973, + "learning_rate": 0.00047958758547680686, + "loss": 2.0808, + "step": 260100 + }, + { + "epoch": 0.9900428583391061, + "grad_norm": 0.13443487882614136, + "learning_rate": 0.0004785912790355581, + "loss": 2.0752, + "step": 260110 + }, + { + "epoch": 0.9900809208072289, + "grad_norm": 0.1270710825920105, + "learning_rate": 0.00047763932022500214, + "loss": 2.0884, + "step": 260120 + }, + { + "epoch": 0.9901189832753515, + "grad_norm": 0.12298639118671417, + "learning_rate": 0.0004767262665937184, + "loss": 2.0913, + "step": 260130 + }, + { + "epoch": 0.9901570457434742, + "grad_norm": 0.13086600601673126, + "learning_rate": 0.0004758477054230176, + "loss": 2.1023, + "step": 260140 + }, + { + "epoch": 0.9901951082115968, + "grad_norm": 0.1266835331916809, + "learning_rate": 0.000475, + "loss": 2.0924, + "step": 260150 + }, + { + "epoch": 0.9902331706797196, + "grad_norm": 0.13983656466007233, + "learning_rate": 0.0004741801110252839, + "loss": 2.0947, + "step": 260160 + }, + { + "epoch": 0.9902712331478423, + "grad_norm": 0.12102605402469635, + "learning_rate": 0.00047338546762888114, + "loss": 2.0982, + "step": 260170 + }, + { + "epoch": 0.9903092956159649, + "grad_norm": 0.13553886115550995, + "learning_rate": 0.0004726138721247417, + "loss": 2.0925, + "step": 260180 + }, + { + "epoch": 0.9903473580840876, + "grad_norm": 0.11878236383199692, + "learning_rate": 0.00047186342830644315, + "loss": 2.086, + "step": 260190 + }, + { + "epoch": 0.9903854205522102, + "grad_norm": 0.12370821088552475, + "learning_rate": 0.00047113248654051873, + "loss": 2.0944, + "step": 260200 + }, + { + "epoch": 0.990423483020333, + "grad_norm": 0.12732823193073273, + "learning_rate": 0.0004704196010845019, + "loss": 2.1089, + "step": 260210 + }, + { + "epoch": 0.9904615454884557, + "grad_norm": 0.1242508515715599, + "learning_rate": 0.00046972349645902506, + "loss": 2.1049, + "step": 260220 + }, + { + "epoch": 0.9904996079565783, + "grad_norm": 0.12015186250209808, + "learning_rate": 0.00046904304063165547, + "loss": 2.0988, + "step": 260230 + }, + { + "epoch": 0.990537670424701, + "grad_norm": 0.11646655201911926, + "learning_rate": 0.00046837722339831625, + "loss": 2.0995, + "step": 260240 + }, + { + "epoch": 0.9905757328928237, + "grad_norm": 0.12747345864772797, + "learning_rate": 0.0004677251387816049, + "loss": 2.0825, + "step": 260250 + }, + { + "epoch": 0.9906137953609464, + "grad_norm": 0.11547389626502991, + "learning_rate": 0.00046708597056978085, + "loss": 2.0941, + "step": 260260 + }, + { + "epoch": 0.9906518578290691, + "grad_norm": 0.13355474174022675, + "learning_rate": 0.0004664589803375032, + "loss": 2.0938, + "step": 260270 + }, + { + "epoch": 0.9906899202971917, + "grad_norm": 0.10946942865848541, + "learning_rate": 0.0004658434974468013, + "loss": 2.1177, + "step": 260280 + }, + { + "epoch": 0.9907279827653145, + "grad_norm": 0.13152863085269928, + "learning_rate": 0.00046523891064230964, + "loss": 2.0944, + "step": 260290 + }, + { + "epoch": 0.9907660452334371, + "grad_norm": 0.1293293684720993, + "learning_rate": 0.00046464466094067264, + "loss": 2.0836, + "step": 260300 + }, + { + "epoch": 0.9908041077015598, + "grad_norm": 0.15848124027252197, + "learning_rate": 0.00046406023557858694, + "loss": 2.1005, + "step": 260310 + }, + { + "epoch": 0.9908421701696825, + "grad_norm": 0.11891598254442215, + "learning_rate": 0.0004634851628329889, + "loss": 2.085, + "step": 260320 + }, + { + "epoch": 0.9908802326378051, + "grad_norm": 0.13395057618618011, + "learning_rate": 0.0004629190075645217, + "loss": 2.0884, + "step": 260330 + }, + { + "epoch": 0.9909182951059279, + "grad_norm": 0.1368226706981659, + "learning_rate": 0.00046236136736454596, + "loss": 2.0924, + "step": 260340 + }, + { + "epoch": 0.9909563575740505, + "grad_norm": 0.12232238054275513, + "learning_rate": 0.00046181186920870134, + "loss": 2.0994, + "step": 260350 + }, + { + "epoch": 0.9909944200421732, + "grad_norm": 0.12242230027914047, + "learning_rate": 0.00046127016653792587, + "loss": 2.1126, + "step": 260360 + }, + { + "epoch": 0.9910324825102959, + "grad_norm": 0.13534030318260193, + "learning_rate": 0.0004607359367020342, + "loss": 2.0776, + "step": 260370 + }, + { + "epoch": 0.9910705449784186, + "grad_norm": 0.12158862501382828, + "learning_rate": 0.00046020887871228895, + "loss": 2.09, + "step": 260380 + }, + { + "epoch": 0.9911086074465413, + "grad_norm": 0.12573935091495514, + "learning_rate": 0.00045968871125850726, + "loss": 2.0939, + "step": 260390 + }, + { + "epoch": 0.9911466699146639, + "grad_norm": 0.13777689635753632, + "learning_rate": 0.0004591751709536137, + "loss": 2.0933, + "step": 260400 + }, + { + "epoch": 0.9911847323827866, + "grad_norm": 0.14061543345451355, + "learning_rate": 0.0004586680107745425, + "loss": 2.0882, + "step": 260410 + }, + { + "epoch": 0.9912227948509094, + "grad_norm": 0.12106756120920181, + "learning_rate": 0.00045816699867329627, + "loss": 2.0974, + "step": 260420 + }, + { + "epoch": 0.991260857319032, + "grad_norm": 0.11237644404172897, + "learning_rate": 0.00045767191633599904, + "loss": 2.0765, + "step": 260430 + }, + { + "epoch": 0.9912989197871547, + "grad_norm": 0.11515557765960693, + "learning_rate": 0.00045718255807111627, + "loss": 2.0915, + "step": 260440 + }, + { + "epoch": 0.9913369822552773, + "grad_norm": 0.13092699646949768, + "learning_rate": 0.00045669872981077806, + "loss": 2.1002, + "step": 260450 + }, + { + "epoch": 0.9913750447234, + "grad_norm": 0.1275516003370285, + "learning_rate": 0.00045622024821145434, + "loss": 2.087, + "step": 260460 + }, + { + "epoch": 0.9914131071915228, + "grad_norm": 0.11951445043087006, + "learning_rate": 0.00045574693984216084, + "loss": 2.0958, + "step": 260470 + }, + { + "epoch": 0.9914511696596454, + "grad_norm": 0.11712721735239029, + "learning_rate": 0.0004552786404500042, + "loss": 2.1043, + "step": 260480 + }, + { + "epoch": 0.9914892321277681, + "grad_norm": 0.1637088805437088, + "learning_rate": 0.00045481519429424683, + "loss": 2.0944, + "step": 260490 + }, + { + "epoch": 0.9915272945958907, + "grad_norm": 0.12922069430351257, + "learning_rate": 0.0004543564535412362, + "loss": 2.0902, + "step": 260500 + }, + { + "epoch": 0.9915653570640135, + "grad_norm": 0.12950067222118378, + "learning_rate": 0.0004539022777135356, + "loss": 2.083, + "step": 260510 + }, + { + "epoch": 0.9916034195321362, + "grad_norm": 0.1143832728266716, + "learning_rate": 0.0004534525331874368, + "loss": 2.0961, + "step": 260520 + }, + { + "epoch": 0.9916414820002588, + "grad_norm": 0.12358095496892929, + "learning_rate": 0.00045300709273376105, + "loss": 2.0873, + "step": 260530 + }, + { + "epoch": 0.9916795444683815, + "grad_norm": 0.12123238295316696, + "learning_rate": 0.00045256583509747434, + "loss": 2.0912, + "step": 260540 + }, + { + "epoch": 0.9917176069365042, + "grad_norm": 0.1274598389863968, + "learning_rate": 0.0004521286446121831, + "loss": 2.0981, + "step": 260550 + }, + { + "epoch": 0.9917556694046269, + "grad_norm": 0.1331852376461029, + "learning_rate": 0.0004516954108460352, + "loss": 2.0915, + "step": 260560 + }, + { + "epoch": 0.9917937318727496, + "grad_norm": 0.12474968284368515, + "learning_rate": 0.0004512660282759552, + "loss": 2.0815, + "step": 260570 + }, + { + "epoch": 0.9918317943408722, + "grad_norm": 0.12475012987852097, + "learning_rate": 0.00045084039598749126, + "loss": 2.0903, + "step": 260580 + }, + { + "epoch": 0.991869856808995, + "grad_norm": 0.3353778123855591, + "learning_rate": 0.00045041841739785495, + "loss": 2.0809, + "step": 260590 + }, + { + "epoch": 0.9919079192771176, + "grad_norm": 0.13950788974761963, + "learning_rate": 0.00045000000000000004, + "loss": 2.0717, + "step": 260600 + }, + { + "epoch": 0.9919459817452403, + "grad_norm": 0.12284799665212631, + "learning_rate": 0.00044958505512581945, + "loss": 2.0936, + "step": 260610 + }, + { + "epoch": 0.991984044213363, + "grad_norm": 0.12896114587783813, + "learning_rate": 0.00044917349772674364, + "loss": 2.0884, + "step": 260620 + }, + { + "epoch": 0.9920221066814856, + "grad_norm": 0.12631112337112427, + "learning_rate": 0.000448765246170202, + "loss": 2.0751, + "step": 260630 + }, + { + "epoch": 0.9920601691496084, + "grad_norm": 0.11359766870737076, + "learning_rate": 0.0004483602220505678, + "loss": 2.0827, + "step": 260640 + }, + { + "epoch": 0.992098231617731, + "grad_norm": 0.11313661932945251, + "learning_rate": 0.0004479583500133467, + "loss": 2.0941, + "step": 260650 + }, + { + "epoch": 0.9921362940858537, + "grad_norm": 0.13781104981899261, + "learning_rate": 0.0004475595575914924, + "loss": 2.0975, + "step": 260660 + }, + { + "epoch": 0.9921743565539763, + "grad_norm": 0.11699791997671127, + "learning_rate": 0.00044716377505284215, + "loss": 2.0763, + "step": 260670 + }, + { + "epoch": 0.9922124190220991, + "grad_norm": 0.12874886393547058, + "learning_rate": 0.0004467709352577623, + "loss": 2.1004, + "step": 260680 + }, + { + "epoch": 0.9922504814902218, + "grad_norm": 0.1220022514462471, + "learning_rate": 0.00044638097352618194, + "loss": 2.0787, + "step": 260690 + }, + { + "epoch": 0.9922885439583444, + "grad_norm": 0.13112017512321472, + "learning_rate": 0.0004459938275132678, + "loss": 2.0947, + "step": 260700 + }, + { + "epoch": 0.9923266064264671, + "grad_norm": 0.1277540922164917, + "learning_rate": 0.0004456094370930643, + "loss": 2.0734, + "step": 260710 + }, + { + "epoch": 0.9923646688945899, + "grad_norm": 0.13177043199539185, + "learning_rate": 0.0004452277442494834, + "loss": 2.0818, + "step": 260720 + }, + { + "epoch": 0.9924027313627125, + "grad_norm": 0.1327768862247467, + "learning_rate": 0.0004448486929740857, + "loss": 2.0947, + "step": 260730 + }, + { + "epoch": 0.9924407938308352, + "grad_norm": 0.13766483962535858, + "learning_rate": 0.0004444722291701411, + "loss": 2.0992, + "step": 260740 + }, + { + "epoch": 0.9924788562989578, + "grad_norm": 0.12571793794631958, + "learning_rate": 0.00044409830056250525, + "loss": 2.1074, + "step": 260750 + }, + { + "epoch": 0.9925169187670805, + "grad_norm": 0.13008525967597961, + "learning_rate": 0.00044372685661288623, + "loss": 2.0903, + "step": 260760 + }, + { + "epoch": 0.9925549812352032, + "grad_norm": 0.12241663038730621, + "learning_rate": 0.0004433578484401119, + "loss": 2.1075, + "step": 260770 + }, + { + "epoch": 0.9925930437033259, + "grad_norm": 0.1442648470401764, + "learning_rate": 0.0004429912287450431, + "loss": 2.0914, + "step": 260780 + }, + { + "epoch": 0.9926311061714486, + "grad_norm": 0.14241662621498108, + "learning_rate": 0.00044262695173980497, + "loss": 2.0961, + "step": 260790 + }, + { + "epoch": 0.9926691686395712, + "grad_norm": 0.12129341065883636, + "learning_rate": 0.0004422649730810374, + "loss": 2.083, + "step": 260800 + }, + { + "epoch": 0.992707231107694, + "grad_norm": 0.11631777882575989, + "learning_rate": 0.00044190524980688875, + "loss": 2.085, + "step": 260810 + }, + { + "epoch": 0.9927452935758166, + "grad_norm": 0.1259375363588333, + "learning_rate": 0.0004415477402774994, + "loss": 2.0885, + "step": 260820 + }, + { + "epoch": 0.9927833560439393, + "grad_norm": 0.13364864885807037, + "learning_rate": 0.0004411924041187422, + "loss": 2.0886, + "step": 260830 + }, + { + "epoch": 0.992821418512062, + "grad_norm": 0.11769169569015503, + "learning_rate": 0.00044083920216900387, + "loss": 2.0932, + "step": 260840 + }, + { + "epoch": 0.9928594809801847, + "grad_norm": 0.12409477680921555, + "learning_rate": 0.0004404880964288096, + "loss": 2.0867, + "step": 260850 + }, + { + "epoch": 0.9928975434483074, + "grad_norm": 0.12123719602823257, + "learning_rate": 0.00044013905001310676, + "loss": 2.1011, + "step": 260860 + }, + { + "epoch": 0.99293560591643, + "grad_norm": 0.1280791163444519, + "learning_rate": 0.00043979202710603854, + "loss": 2.0903, + "step": 260870 + }, + { + "epoch": 0.9929736683845527, + "grad_norm": 0.11044704169034958, + "learning_rate": 0.00043944699291805016, + "loss": 2.0946, + "step": 260880 + }, + { + "epoch": 0.9930117308526754, + "grad_norm": 0.12645240128040314, + "learning_rate": 0.000439103913645183, + "loss": 2.1006, + "step": 260890 + }, + { + "epoch": 0.9930497933207981, + "grad_norm": 0.12837836146354675, + "learning_rate": 0.00043876275643042056, + "loss": 2.0736, + "step": 260900 + }, + { + "epoch": 0.9930878557889208, + "grad_norm": 0.12500976026058197, + "learning_rate": 0.0004384234893269628, + "loss": 2.0912, + "step": 260910 + }, + { + "epoch": 0.9931259182570434, + "grad_norm": 0.13092707097530365, + "learning_rate": 0.00043808608126331093, + "loss": 2.0816, + "step": 260920 + }, + { + "epoch": 0.9931639807251661, + "grad_norm": 0.14441804587841034, + "learning_rate": 0.00043775050201005634, + "loss": 2.0976, + "step": 260930 + }, + { + "epoch": 0.9932020431932889, + "grad_norm": 0.13155049085617065, + "learning_rate": 0.0004374167221482714, + "loss": 2.0869, + "step": 260940 + }, + { + "epoch": 0.9932401056614115, + "grad_norm": 0.15008226037025452, + "learning_rate": 0.00043708471303941047, + "loss": 2.0841, + "step": 260950 + }, + { + "epoch": 0.9932781681295342, + "grad_norm": 0.1321364790201187, + "learning_rate": 0.00043675444679663243, + "loss": 2.1137, + "step": 260960 + }, + { + "epoch": 0.9933162305976568, + "grad_norm": 0.11087088286876678, + "learning_rate": 0.00043642589625746456, + "loss": 2.102, + "step": 260970 + }, + { + "epoch": 0.9933542930657796, + "grad_norm": 0.12107894569635391, + "learning_rate": 0.0004360990349577306, + "loss": 2.1037, + "step": 260980 + }, + { + "epoch": 0.9933923555339023, + "grad_norm": 0.11820726096630096, + "learning_rate": 0.0004357738371066743, + "loss": 2.0826, + "step": 260990 + }, + { + "epoch": 0.9934304180020249, + "grad_norm": 0.12801015377044678, + "learning_rate": 0.00043545027756320976, + "loss": 2.0897, + "step": 261000 + }, + { + "epoch": 0.9934684804701476, + "grad_norm": 0.12365680187940598, + "learning_rate": 0.00043512833181323814, + "loss": 2.072, + "step": 261010 + }, + { + "epoch": 0.9935065429382703, + "grad_norm": 0.12400777637958527, + "learning_rate": 0.0004348079759479735, + "loss": 2.0791, + "step": 261020 + }, + { + "epoch": 0.993544605406393, + "grad_norm": 0.11780333518981934, + "learning_rate": 0.00043448918664322153, + "loss": 2.0863, + "step": 261030 + }, + { + "epoch": 0.9935826678745157, + "grad_norm": 0.12443245947360992, + "learning_rate": 0.0004341719411395617, + "loss": 2.0896, + "step": 261040 + }, + { + "epoch": 0.9936207303426383, + "grad_norm": 0.13305029273033142, + "learning_rate": 0.00043385621722338526, + "loss": 2.0816, + "step": 261050 + }, + { + "epoch": 0.993658792810761, + "grad_norm": 0.123264841735363, + "learning_rate": 0.0004335419932087437, + "loss": 2.0981, + "step": 261060 + }, + { + "epoch": 0.9936968552788837, + "grad_norm": 0.17386004328727722, + "learning_rate": 0.0004332292479199663, + "loss": 2.0907, + "step": 261070 + }, + { + "epoch": 0.9937349177470064, + "grad_norm": 0.13856405019760132, + "learning_rate": 0.00043291796067500634, + "loss": 2.0982, + "step": 261080 + }, + { + "epoch": 0.993772980215129, + "grad_norm": 0.11440404504537582, + "learning_rate": 0.0004326081112694809, + "loss": 2.0841, + "step": 261090 + }, + { + "epoch": 0.9938110426832517, + "grad_norm": 0.11984309554100037, + "learning_rate": 0.00043229967996136697, + "loss": 2.0697, + "step": 261100 + }, + { + "epoch": 0.9938491051513745, + "grad_norm": 0.13008153438568115, + "learning_rate": 0.0004319926474563228, + "loss": 2.0884, + "step": 261110 + }, + { + "epoch": 0.9938871676194971, + "grad_norm": 0.11998965591192245, + "learning_rate": 0.0004316869948936027, + "loss": 2.0961, + "step": 261120 + }, + { + "epoch": 0.9939252300876198, + "grad_norm": 0.12517978250980377, + "learning_rate": 0.00043138270383253697, + "loss": 2.0747, + "step": 261130 + }, + { + "epoch": 0.9939632925557425, + "grad_norm": 0.12591873109340668, + "learning_rate": 0.00043107975623954886, + "loss": 2.0745, + "step": 261140 + }, + { + "epoch": 0.9940013550238652, + "grad_norm": 0.12345649302005768, + "learning_rate": 0.0004307781344756827, + "loss": 2.0917, + "step": 261150 + }, + { + "epoch": 0.9940394174919879, + "grad_norm": 0.14185993373394012, + "learning_rate": 0.0004304778212846193, + "loss": 2.0787, + "step": 261160 + }, + { + "epoch": 0.9940774799601105, + "grad_norm": 0.10899876803159714, + "learning_rate": 0.00043017879978115534, + "loss": 2.0792, + "step": 261170 + }, + { + "epoch": 0.9941155424282332, + "grad_norm": 0.11979634314775467, + "learning_rate": 0.00042988105344012457, + "loss": 2.0861, + "step": 261180 + }, + { + "epoch": 0.9941536048963558, + "grad_norm": 0.1269412636756897, + "learning_rate": 0.0004295845660857413, + "loss": 2.0928, + "step": 261190 + }, + { + "epoch": 0.9941916673644786, + "grad_norm": 0.12314164638519287, + "learning_rate": 0.00042928932188134527, + "loss": 2.0722, + "step": 261200 + }, + { + "epoch": 0.9942297298326013, + "grad_norm": 0.13209030032157898, + "learning_rate": 0.00042899530531953067, + "loss": 2.0971, + "step": 261210 + }, + { + "epoch": 0.9942677923007239, + "grad_norm": 0.12487448006868362, + "learning_rate": 0.00042870250121264185, + "loss": 2.082, + "step": 261220 + }, + { + "epoch": 0.9943058547688466, + "grad_norm": 0.1182442456483841, + "learning_rate": 0.00042841089468361826, + "loss": 2.0888, + "step": 261230 + }, + { + "epoch": 0.9943439172369694, + "grad_norm": 0.11452000588178635, + "learning_rate": 0.0004281204711571739, + "loss": 2.0861, + "step": 261240 + }, + { + "epoch": 0.994381979705092, + "grad_norm": 0.13320042192935944, + "learning_rate": 0.0004278312163512968, + "loss": 2.0745, + "step": 261250 + }, + { + "epoch": 0.9944200421732147, + "grad_norm": 0.12851102650165558, + "learning_rate": 0.0004275431162690528, + "loss": 2.0891, + "step": 261260 + }, + { + "epoch": 0.9944581046413373, + "grad_norm": 0.13266323506832123, + "learning_rate": 0.00042725615719068266, + "loss": 2.0889, + "step": 261270 + }, + { + "epoch": 0.9944961671094601, + "grad_norm": 0.1309056431055069, + "learning_rate": 0.0004269703256659779, + "loss": 2.0873, + "step": 261280 + }, + { + "epoch": 0.9945342295775828, + "grad_norm": 0.12090447545051575, + "learning_rate": 0.0004266856085069241, + "loss": 2.0863, + "step": 261290 + }, + { + "epoch": 0.9945722920457054, + "grad_norm": 0.12500663101673126, + "learning_rate": 0.00042640199278060124, + "loss": 2.0944, + "step": 261300 + }, + { + "epoch": 0.9946103545138281, + "grad_norm": 0.11978045105934143, + "learning_rate": 0.0004261194658023284, + "loss": 2.0696, + "step": 261310 + }, + { + "epoch": 0.9946484169819507, + "grad_norm": 0.1339166760444641, + "learning_rate": 0.0004258380151290434, + "loss": 2.0732, + "step": 261320 + }, + { + "epoch": 0.9946864794500735, + "grad_norm": 0.12607313692569733, + "learning_rate": 0.000425557628552909, + "loss": 2.09, + "step": 261330 + }, + { + "epoch": 0.9947245419181961, + "grad_norm": 0.12124264240264893, + "learning_rate": 0.0004252782940951337, + "loss": 2.0862, + "step": 261340 + }, + { + "epoch": 0.9947626043863188, + "grad_norm": 0.12462054193019867, + "learning_rate": 0.000425, + "loss": 2.1053, + "step": 261350 + }, + { + "epoch": 0.9948006668544415, + "grad_norm": 0.1448441445827484, + "learning_rate": 0.0004247227347290919, + "loss": 2.0945, + "step": 261360 + }, + { + "epoch": 0.9948387293225642, + "grad_norm": 0.11749083548784256, + "learning_rate": 0.00042444648695571244, + "loss": 2.0738, + "step": 261370 + }, + { + "epoch": 0.9948767917906869, + "grad_norm": 0.129529669880867, + "learning_rate": 0.0004241712455594845, + "loss": 2.0758, + "step": 261380 + }, + { + "epoch": 0.9949148542588095, + "grad_norm": 0.11888234317302704, + "learning_rate": 0.0004238969996211275, + "loss": 2.094, + "step": 261390 + }, + { + "epoch": 0.9949529167269322, + "grad_norm": 0.1276368796825409, + "learning_rate": 0.00042362373841740267, + "loss": 2.0911, + "step": 261400 + }, + { + "epoch": 0.994990979195055, + "grad_norm": 0.11603518575429916, + "learning_rate": 0.00042335145141622057, + "loss": 2.0785, + "step": 261410 + }, + { + "epoch": 0.9950290416631776, + "grad_norm": 0.12183357775211334, + "learning_rate": 0.0004230801282719045, + "loss": 2.101, + "step": 261420 + }, + { + "epoch": 0.9950671041313003, + "grad_norm": 0.12915317714214325, + "learning_rate": 0.0004228097588206039, + "loss": 2.0822, + "step": 261430 + }, + { + "epoch": 0.9951051665994229, + "grad_norm": 0.12496720254421234, + "learning_rate": 0.0004225403330758517, + "loss": 2.0768, + "step": 261440 + }, + { + "epoch": 0.9951432290675457, + "grad_norm": 0.13041459023952484, + "learning_rate": 0.0004222718412242599, + "loss": 2.0899, + "step": 261450 + }, + { + "epoch": 0.9951812915356684, + "grad_norm": 0.1264573186635971, + "learning_rate": 0.0004220042736213494, + "loss": 2.0879, + "step": 261460 + }, + { + "epoch": 0.995219354003791, + "grad_norm": 0.1156756654381752, + "learning_rate": 0.0004217376207875074, + "loss": 2.09, + "step": 261470 + }, + { + "epoch": 0.9952574164719137, + "grad_norm": 0.11675167083740234, + "learning_rate": 0.00042147187340406834, + "loss": 2.0849, + "step": 261480 + }, + { + "epoch": 0.9952954789400363, + "grad_norm": 0.13087327778339386, + "learning_rate": 0.0004212070223095146, + "loss": 2.0705, + "step": 261490 + }, + { + "epoch": 0.9953335414081591, + "grad_norm": 0.12341038882732391, + "learning_rate": 0.0004209430584957905, + "loss": 2.0994, + "step": 261500 + }, + { + "epoch": 0.9953716038762818, + "grad_norm": 0.1165153756737709, + "learning_rate": 0.00042067997310472807, + "loss": 2.0832, + "step": 261510 + }, + { + "epoch": 0.9954096663444044, + "grad_norm": 0.12436539679765701, + "learning_rate": 0.0004204177574245779, + "loss": 2.0767, + "step": 261520 + }, + { + "epoch": 0.9954477288125271, + "grad_norm": 0.13395121693611145, + "learning_rate": 0.00042015640288664346, + "loss": 2.0783, + "step": 261530 + }, + { + "epoch": 0.9954857912806498, + "grad_norm": 0.13121996819972992, + "learning_rate": 0.0004198959010620139, + "loss": 2.0866, + "step": 261540 + }, + { + "epoch": 0.9955238537487725, + "grad_norm": 0.11080297082662582, + "learning_rate": 0.00041963624365839205, + "loss": 2.0746, + "step": 261550 + }, + { + "epoch": 0.9955619162168952, + "grad_norm": 0.12134183198213577, + "learning_rate": 0.0004193774225170145, + "loss": 2.0965, + "step": 261560 + }, + { + "epoch": 0.9955999786850178, + "grad_norm": 0.13009509444236755, + "learning_rate": 0.0004191194296096605, + "loss": 2.0738, + "step": 261570 + }, + { + "epoch": 0.9956380411531406, + "grad_norm": 0.1238146498799324, + "learning_rate": 0.0004188622570357461, + "loss": 2.081, + "step": 261580 + }, + { + "epoch": 0.9956761036212632, + "grad_norm": 0.11727666854858398, + "learning_rate": 0.0004186058970195015, + "loss": 2.0848, + "step": 261590 + }, + { + "epoch": 0.9957141660893859, + "grad_norm": 0.1214786097407341, + "learning_rate": 0.0004183503419072274, + "loss": 2.0727, + "step": 261600 + }, + { + "epoch": 0.9957522285575086, + "grad_norm": 0.12340279668569565, + "learning_rate": 0.0004180955841646292, + "loss": 2.0892, + "step": 261610 + }, + { + "epoch": 0.9957902910256312, + "grad_norm": 0.1235736683011055, + "learning_rate": 0.00041784161637422506, + "loss": 2.0726, + "step": 261620 + }, + { + "epoch": 0.995828353493754, + "grad_norm": 0.11563219130039215, + "learning_rate": 0.0004175884312328259, + "loss": 2.0813, + "step": 261630 + }, + { + "epoch": 0.9958664159618766, + "grad_norm": 0.12072248756885529, + "learning_rate": 0.00041733602154908503, + "loss": 2.0853, + "step": 261640 + }, + { + "epoch": 0.9959044784299993, + "grad_norm": 0.13266229629516602, + "learning_rate": 0.000417084380241115, + "loss": 2.0867, + "step": 261650 + }, + { + "epoch": 0.995942540898122, + "grad_norm": 0.11815812438726425, + "learning_rate": 0.000416833500334169, + "loss": 2.0762, + "step": 261660 + }, + { + "epoch": 0.9959806033662447, + "grad_norm": 0.11936049163341522, + "learning_rate": 0.00041658337495838535, + "loss": 2.0797, + "step": 261670 + }, + { + "epoch": 0.9960186658343674, + "grad_norm": 0.13057257235050201, + "learning_rate": 0.0004163339973465924, + "loss": 2.0801, + "step": 261680 + }, + { + "epoch": 0.99605672830249, + "grad_norm": 0.12162986397743225, + "learning_rate": 0.00041608536083217264, + "loss": 2.0789, + "step": 261690 + }, + { + "epoch": 0.9960947907706127, + "grad_norm": 0.13624535501003265, + "learning_rate": 0.0004158374588469827, + "loss": 2.0855, + "step": 261700 + }, + { + "epoch": 0.9961328532387355, + "grad_norm": 0.1518181413412094, + "learning_rate": 0.00041559028491932936, + "loss": 2.0836, + "step": 261710 + }, + { + "epoch": 0.9961709157068581, + "grad_norm": 0.11926338076591492, + "learning_rate": 0.00041534383267199807, + "loss": 2.0806, + "step": 261720 + }, + { + "epoch": 0.9962089781749808, + "grad_norm": 0.12259402126073837, + "learning_rate": 0.00041509809582033314, + "loss": 2.0924, + "step": 261730 + }, + { + "epoch": 0.9962470406431034, + "grad_norm": 0.11852733790874481, + "learning_rate": 0.00041485306817036797, + "loss": 2.0745, + "step": 261740 + }, + { + "epoch": 0.9962851031112261, + "grad_norm": 0.1294088065624237, + "learning_rate": 0.0004146087436170033, + "loss": 2.0817, + "step": 261750 + }, + { + "epoch": 0.9963231655793489, + "grad_norm": 0.12300239503383636, + "learning_rate": 0.0004143651161422325, + "loss": 2.0908, + "step": 261760 + }, + { + "epoch": 0.9963612280474715, + "grad_norm": 0.1162174716591835, + "learning_rate": 0.00041412217981341167, + "loss": 2.1025, + "step": 261770 + }, + { + "epoch": 0.9963992905155942, + "grad_norm": 0.1360090672969818, + "learning_rate": 0.0004138799287815746, + "loss": 2.0813, + "step": 261780 + }, + { + "epoch": 0.9964373529837168, + "grad_norm": 0.1395946741104126, + "learning_rate": 0.0004136383572797893, + "loss": 2.0956, + "step": 261790 + }, + { + "epoch": 0.9964754154518396, + "grad_norm": 0.11686591058969498, + "learning_rate": 0.0004133974596215562, + "loss": 2.092, + "step": 261800 + }, + { + "epoch": 0.9965134779199623, + "grad_norm": 0.11687017232179642, + "learning_rate": 0.0004131572301992465, + "loss": 2.0956, + "step": 261810 + }, + { + "epoch": 0.9965515403880849, + "grad_norm": 0.12350569665431976, + "learning_rate": 0.00041291766348257916, + "loss": 2.0794, + "step": 261820 + }, + { + "epoch": 0.9965896028562076, + "grad_norm": 0.13105247914791107, + "learning_rate": 0.0004126787540171351, + "loss": 2.0609, + "step": 261830 + }, + { + "epoch": 0.9966276653243303, + "grad_norm": 0.11866418272256851, + "learning_rate": 0.0004124404964229087, + "loss": 2.0755, + "step": 261840 + }, + { + "epoch": 0.996665727792453, + "grad_norm": 0.11236187070608139, + "learning_rate": 0.00041220288539289384, + "loss": 2.0822, + "step": 261850 + }, + { + "epoch": 0.9967037902605757, + "grad_norm": 0.11838629096746445, + "learning_rate": 0.00041196591569170495, + "loss": 2.0929, + "step": 261860 + }, + { + "epoch": 0.9967418527286983, + "grad_norm": 0.12502582371234894, + "learning_rate": 0.00041172958215423096, + "loss": 2.0859, + "step": 261870 + }, + { + "epoch": 0.9967799151968211, + "grad_norm": 0.14400994777679443, + "learning_rate": 0.00041149387968432167, + "loss": 2.0881, + "step": 261880 + }, + { + "epoch": 0.9968179776649437, + "grad_norm": 0.12822756171226501, + "learning_rate": 0.00041125880325350577, + "loss": 2.0717, + "step": 261890 + }, + { + "epoch": 0.9968560401330664, + "grad_norm": 0.1273794323205948, + "learning_rate": 0.0004110243478997391, + "loss": 2.0873, + "step": 261900 + }, + { + "epoch": 0.996894102601189, + "grad_norm": 0.11934802681207657, + "learning_rate": 0.00041079050872618245, + "loss": 2.0776, + "step": 261910 + }, + { + "epoch": 0.9969321650693117, + "grad_norm": 0.13123826682567596, + "learning_rate": 0.0004105572809000084, + "loss": 2.0772, + "step": 261920 + }, + { + "epoch": 0.9969702275374345, + "grad_norm": 0.13744977116584778, + "learning_rate": 0.000410324659651236, + "loss": 2.0683, + "step": 261930 + }, + { + "epoch": 0.9970082900055571, + "grad_norm": 0.11932683736085892, + "learning_rate": 0.0004100926402715922, + "loss": 2.0658, + "step": 261940 + }, + { + "epoch": 0.9970463524736798, + "grad_norm": 0.1288817971944809, + "learning_rate": 0.00040986121811340025, + "loss": 2.079, + "step": 261950 + }, + { + "epoch": 0.9970844149418024, + "grad_norm": 0.13244900107383728, + "learning_rate": 0.0004096303885884936, + "loss": 2.0807, + "step": 261960 + }, + { + "epoch": 0.9971224774099252, + "grad_norm": 0.12538810074329376, + "learning_rate": 0.00040940014716715414, + "loss": 2.065, + "step": 261970 + }, + { + "epoch": 0.9971605398780479, + "grad_norm": 0.12961310148239136, + "learning_rate": 0.00040917048937707525, + "loss": 2.0796, + "step": 261980 + }, + { + "epoch": 0.9971986023461705, + "grad_norm": 0.1247786357998848, + "learning_rate": 0.0004089414108023485, + "loss": 2.0708, + "step": 261990 + }, + { + "epoch": 0.9972366648142932, + "grad_norm": 0.11900941282510757, + "learning_rate": 0.0004087129070824723, + "loss": 2.0948, + "step": 262000 + }, + { + "epoch": 0.997274727282416, + "grad_norm": 0.1165849044919014, + "learning_rate": 0.0004084849739113844, + "loss": 2.0735, + "step": 262010 + }, + { + "epoch": 0.9973127897505386, + "grad_norm": 0.12156182527542114, + "learning_rate": 0.0004082576070365141, + "loss": 2.0774, + "step": 262020 + }, + { + "epoch": 0.9973508522186613, + "grad_norm": 0.13087128102779388, + "learning_rate": 0.0004080308022578574, + "loss": 2.0855, + "step": 262030 + }, + { + "epoch": 0.9973889146867839, + "grad_norm": 0.1311582624912262, + "learning_rate": 0.00040780455542707114, + "loss": 2.0852, + "step": 262040 + }, + { + "epoch": 0.9974269771549066, + "grad_norm": 0.12414788454771042, + "learning_rate": 0.00040757886244658823, + "loss": 2.0563, + "step": 262050 + }, + { + "epoch": 0.9974650396230293, + "grad_norm": 0.10987682640552521, + "learning_rate": 0.0004073537192687514, + "loss": 2.0742, + "step": 262060 + }, + { + "epoch": 0.997503102091152, + "grad_norm": 0.12739349901676178, + "learning_rate": 0.00040712912189496645, + "loss": 2.0616, + "step": 262070 + }, + { + "epoch": 0.9975411645592747, + "grad_norm": 0.12246967107057571, + "learning_rate": 0.00040690506637487375, + "loss": 2.066, + "step": 262080 + }, + { + "epoch": 0.9975792270273973, + "grad_norm": 0.12437837570905685, + "learning_rate": 0.0004066815488055372, + "loss": 2.0731, + "step": 262090 + }, + { + "epoch": 0.9976172894955201, + "grad_norm": 0.12276354432106018, + "learning_rate": 0.00040645856533065144, + "loss": 2.0863, + "step": 262100 + }, + { + "epoch": 0.9976553519636427, + "grad_norm": 0.14230865240097046, + "learning_rate": 0.0004062361121397653, + "loss": 2.0718, + "step": 262110 + }, + { + "epoch": 0.9976934144317654, + "grad_norm": 0.11511580646038055, + "learning_rate": 0.0004060141854675221, + "loss": 2.0909, + "step": 262120 + }, + { + "epoch": 0.9977314768998881, + "grad_norm": 0.12784437835216522, + "learning_rate": 0.0004057927815929162, + "loss": 2.068, + "step": 262130 + }, + { + "epoch": 0.9977695393680108, + "grad_norm": 0.12329355627298355, + "learning_rate": 0.0004055718968385647, + "loss": 2.0667, + "step": 262140 + }, + { + "epoch": 0.9978076018361335, + "grad_norm": 0.12663282454013824, + "learning_rate": 0.0004053515275699954, + "loss": 2.06, + "step": 262150 + }, + { + "epoch": 0.9978456643042561, + "grad_norm": 0.14040499925613403, + "learning_rate": 0.0004051316701949486, + "loss": 2.0831, + "step": 262160 + }, + { + "epoch": 0.9978837267723788, + "grad_norm": 0.11923239380121231, + "learning_rate": 0.00040491232116269393, + "loss": 2.0734, + "step": 262170 + }, + { + "epoch": 0.9979217892405015, + "grad_norm": 0.11212129890918732, + "learning_rate": 0.00040469347696336135, + "loss": 2.0828, + "step": 262180 + }, + { + "epoch": 0.9979598517086242, + "grad_norm": 0.11588910222053528, + "learning_rate": 0.000404475134127286, + "loss": 2.0721, + "step": 262190 + }, + { + "epoch": 0.9979979141767469, + "grad_norm": 0.11533407121896744, + "learning_rate": 0.00040425728922436623, + "loss": 2.0772, + "step": 262200 + }, + { + "epoch": 0.9980359766448695, + "grad_norm": 0.12063222378492355, + "learning_rate": 0.00040403993886343483, + "loss": 2.0653, + "step": 262210 + }, + { + "epoch": 0.9980740391129922, + "grad_norm": 0.13424226641654968, + "learning_rate": 0.0004038230796916433, + "loss": 2.0903, + "step": 262220 + }, + { + "epoch": 0.998112101581115, + "grad_norm": 0.12373346090316772, + "learning_rate": 0.0004036067083938583, + "loss": 2.0967, + "step": 262230 + }, + { + "epoch": 0.9981501640492376, + "grad_norm": 0.13985063135623932, + "learning_rate": 0.0004033908216920704, + "loss": 2.0968, + "step": 262240 + }, + { + "epoch": 0.9981882265173603, + "grad_norm": 0.12289280444383621, + "learning_rate": 0.00040317541634481456, + "loss": 2.0731, + "step": 262250 + }, + { + "epoch": 0.9982262889854829, + "grad_norm": 0.11922875046730042, + "learning_rate": 0.00040296048914660246, + "loss": 2.0713, + "step": 262260 + }, + { + "epoch": 0.9982643514536057, + "grad_norm": 0.1151062399148941, + "learning_rate": 0.0004027460369273656, + "loss": 2.079, + "step": 262270 + }, + { + "epoch": 0.9983024139217284, + "grad_norm": 0.13483203947544098, + "learning_rate": 0.0004025320565519104, + "loss": 2.0612, + "step": 262280 + }, + { + "epoch": 0.998340476389851, + "grad_norm": 0.12044581025838852, + "learning_rate": 0.00040231854491938265, + "loss": 2.0863, + "step": 262290 + }, + { + "epoch": 0.9983785388579737, + "grad_norm": 0.13184158504009247, + "learning_rate": 0.00040210549896274394, + "loss": 2.0738, + "step": 262300 + }, + { + "epoch": 0.9984166013260964, + "grad_norm": 0.12094981968402863, + "learning_rate": 0.0004018929156482571, + "loss": 2.0625, + "step": 262310 + }, + { + "epoch": 0.9984546637942191, + "grad_norm": 0.47059765458106995, + "learning_rate": 0.0004016807919749825, + "loss": 2.0765, + "step": 262320 + }, + { + "epoch": 0.9984927262623418, + "grad_norm": 0.12888529896736145, + "learning_rate": 0.0004014691249742837, + "loss": 2.0696, + "step": 262330 + }, + { + "epoch": 0.9985307887304644, + "grad_norm": 0.1215779036283493, + "learning_rate": 0.0004012579117093425, + "loss": 2.0789, + "step": 262340 + }, + { + "epoch": 0.9985688511985871, + "grad_norm": 0.11985091120004654, + "learning_rate": 0.000401047149274684, + "loss": 2.0806, + "step": 262350 + }, + { + "epoch": 0.9986069136667098, + "grad_norm": 0.13111738860607147, + "learning_rate": 0.0004008368347957099, + "loss": 2.0949, + "step": 262360 + }, + { + "epoch": 0.9986449761348325, + "grad_norm": 0.12248831242322922, + "learning_rate": 0.00040062696542824105, + "loss": 2.0751, + "step": 262370 + }, + { + "epoch": 0.9986830386029552, + "grad_norm": 0.13073061406612396, + "learning_rate": 0.0004004175383580689, + "loss": 2.0818, + "step": 262380 + }, + { + "epoch": 0.9987211010710778, + "grad_norm": 0.11265187710523605, + "learning_rate": 0.0004002085508005153, + "loss": 2.0791, + "step": 262390 + }, + { + "epoch": 0.9987591635392006, + "grad_norm": 0.11508375406265259, + "learning_rate": 0.0004, + "loss": 2.0742, + "step": 262400 + }, + { + "epoch": 0.9987972260073232, + "grad_norm": 0.1140381395816803, + "learning_rate": 0.00039979188322961725, + "loss": 2.065, + "step": 262410 + }, + { + "epoch": 0.9988352884754459, + "grad_norm": 0.1270381361246109, + "learning_rate": 0.00039958419779071954, + "loss": 2.0798, + "step": 262420 + }, + { + "epoch": 0.9988733509435685, + "grad_norm": 0.12487317621707916, + "learning_rate": 0.0003993769410125095, + "loss": 2.0717, + "step": 262430 + }, + { + "epoch": 0.9989114134116913, + "grad_norm": 0.12228159606456757, + "learning_rate": 0.00039917011025163883, + "loss": 2.0825, + "step": 262440 + }, + { + "epoch": 0.998949475879814, + "grad_norm": 0.11730682849884033, + "learning_rate": 0.0003989637028918155, + "loss": 2.0792, + "step": 262450 + }, + { + "epoch": 0.9989875383479366, + "grad_norm": 0.127616748213768, + "learning_rate": 0.00039875771634341706, + "loss": 2.0759, + "step": 262460 + }, + { + "epoch": 0.9990256008160593, + "grad_norm": 0.12070652842521667, + "learning_rate": 0.000398552148043112, + "loss": 2.0684, + "step": 262470 + }, + { + "epoch": 0.999063663284182, + "grad_norm": 0.12831643223762512, + "learning_rate": 0.00039834699545348727, + "loss": 2.0886, + "step": 262480 + }, + { + "epoch": 0.9991017257523047, + "grad_norm": 0.13065242767333984, + "learning_rate": 0.0003981422560626832, + "loss": 2.0678, + "step": 262490 + }, + { + "epoch": 0.9991397882204274, + "grad_norm": 0.12046606093645096, + "learning_rate": 0.0003979379273840342, + "loss": 2.0794, + "step": 262500 + }, + { + "epoch": 0.99917785068855, + "grad_norm": 0.12492360919713974, + "learning_rate": 0.00039773400695571705, + "loss": 2.0612, + "step": 262510 + }, + { + "epoch": 0.9992159131566727, + "grad_norm": 0.11388891935348511, + "learning_rate": 0.000397530492340404, + "loss": 2.0642, + "step": 262520 + }, + { + "epoch": 0.9992539756247955, + "grad_norm": 0.11626210063695908, + "learning_rate": 0.0003973273811249237, + "loss": 2.0722, + "step": 262530 + }, + { + "epoch": 0.9992920380929181, + "grad_norm": 0.12329540401697159, + "learning_rate": 0.0003971246709199269, + "loss": 2.0744, + "step": 262540 + }, + { + "epoch": 0.9993301005610408, + "grad_norm": 0.1349315345287323, + "learning_rate": 0.0003969223593595585, + "loss": 2.0731, + "step": 262550 + }, + { + "epoch": 0.9993681630291634, + "grad_norm": 0.1360684037208557, + "learning_rate": 0.00039672044410113557, + "loss": 2.0867, + "step": 262560 + }, + { + "epoch": 0.9994062254972862, + "grad_norm": 0.1352628469467163, + "learning_rate": 0.00039651892282483077, + "loss": 2.0611, + "step": 262570 + }, + { + "epoch": 0.9994442879654089, + "grad_norm": 0.13114619255065918, + "learning_rate": 0.0003963177932333614, + "loss": 2.0862, + "step": 262580 + }, + { + "epoch": 0.9994823504335315, + "grad_norm": 0.11713041365146637, + "learning_rate": 0.0003961170530516839, + "loss": 2.0724, + "step": 262590 + }, + { + "epoch": 0.9995204129016542, + "grad_norm": 0.11450465768575668, + "learning_rate": 0.0003959167000266934, + "loss": 2.0709, + "step": 262600 + }, + { + "epoch": 0.9995584753697769, + "grad_norm": 0.12338771671056747, + "learning_rate": 0.00039571673192692895, + "loss": 2.0782, + "step": 262610 + }, + { + "epoch": 0.9995965378378996, + "grad_norm": 0.13126511871814728, + "learning_rate": 0.0003955171465422835, + "loss": 2.0789, + "step": 262620 + }, + { + "epoch": 0.9996346003060222, + "grad_norm": 0.12571443617343903, + "learning_rate": 0.00039531794168371864, + "loss": 2.087, + "step": 262630 + }, + { + "epoch": 0.9996726627741449, + "grad_norm": 0.12128579616546631, + "learning_rate": 0.0003951191151829848, + "loss": 2.0793, + "step": 262640 + }, + { + "epoch": 0.9997107252422676, + "grad_norm": 0.13705646991729736, + "learning_rate": 0.0003949206648923459, + "loss": 2.0662, + "step": 262650 + }, + { + "epoch": 0.9997487877103903, + "grad_norm": 0.12217232584953308, + "learning_rate": 0.00039472258868430835, + "loss": 2.0781, + "step": 262660 + }, + { + "epoch": 0.999786850178513, + "grad_norm": 0.11280722916126251, + "learning_rate": 0.0003945248844513551, + "loss": 2.0688, + "step": 262670 + }, + { + "epoch": 0.9998249126466356, + "grad_norm": 0.11979371309280396, + "learning_rate": 0.0003943275501056843, + "loss": 2.0698, + "step": 262680 + }, + { + "epoch": 0.9998629751147583, + "grad_norm": 0.13274313509464264, + "learning_rate": 0.00039413058357895173, + "loss": 2.0747, + "step": 262690 + }, + { + "epoch": 0.9999010375828811, + "grad_norm": 0.13018228113651276, + "learning_rate": 0.00039393398282201785, + "loss": 2.0636, + "step": 262700 + }, + { + "epoch": 0.9999391000510037, + "grad_norm": 0.12832440435886383, + "learning_rate": 0.0003937377458046995, + "loss": 2.0743, + "step": 262710 + }, + { + "epoch": 0.9999771625191264, + "grad_norm": 0.11964524537324905, + "learning_rate": 0.0003935418705155246, + "loss": 2.0809, + "step": 262720 + }, + { + "epoch": 1.0000152249872492, + "grad_norm": 0.12813405692577362, + "learning_rate": 0.0003933463549614923, + "loss": 2.0832, + "step": 262730 + }, + { + "epoch": 1.0000532874553718, + "grad_norm": 0.122567318379879, + "learning_rate": 0.000393151197167836, + "loss": 2.0679, + "step": 262740 + }, + { + "epoch": 1.0000913499234945, + "grad_norm": 0.12381791323423386, + "learning_rate": 0.0003929563951777906, + "loss": 2.0814, + "step": 262750 + }, + { + "epoch": 1.0001294123916171, + "grad_norm": 0.12520988285541534, + "learning_rate": 0.0003927619470523639, + "loss": 2.0771, + "step": 262760 + }, + { + "epoch": 1.0001674748597398, + "grad_norm": 0.12787935137748718, + "learning_rate": 0.0003925678508701112, + "loss": 2.0707, + "step": 262770 + }, + { + "epoch": 1.0002055373278624, + "grad_norm": 0.12769973278045654, + "learning_rate": 0.00039237410472691374, + "loss": 2.074, + "step": 262780 + }, + { + "epoch": 1.000243599795985, + "grad_norm": 0.13190263509750366, + "learning_rate": 0.0003921807067357609, + "loss": 2.0792, + "step": 262790 + }, + { + "epoch": 1.0002816622641078, + "grad_norm": 0.11785663664340973, + "learning_rate": 0.00039198765502653567, + "loss": 2.0698, + "step": 262800 + }, + { + "epoch": 1.0003197247322306, + "grad_norm": 0.13956955075263977, + "learning_rate": 0.0003917949477458038, + "loss": 2.0618, + "step": 262810 + }, + { + "epoch": 1.0003577872003533, + "grad_norm": 0.1239355057477951, + "learning_rate": 0.000391602583056606, + "loss": 2.0765, + "step": 262820 + }, + { + "epoch": 1.000395849668476, + "grad_norm": 0.11955326795578003, + "learning_rate": 0.00039141055913825384, + "loss": 2.0675, + "step": 262830 + }, + { + "epoch": 1.0004339121365986, + "grad_norm": 0.10961072146892548, + "learning_rate": 0.0003912188741861286, + "loss": 2.0729, + "step": 262840 + }, + { + "epoch": 1.0004719746047213, + "grad_norm": 0.131142720580101, + "learning_rate": 0.0003910275264114832, + "loss": 2.0704, + "step": 262850 + }, + { + "epoch": 1.000510037072844, + "grad_norm": 0.12614300847053528, + "learning_rate": 0.0003908365140412479, + "loss": 2.0757, + "step": 262860 + }, + { + "epoch": 1.0005480995409666, + "grad_norm": 0.13212086260318756, + "learning_rate": 0.00039064583531783835, + "loss": 2.077, + "step": 262870 + }, + { + "epoch": 1.0005861620090892, + "grad_norm": 0.11505197733640671, + "learning_rate": 0.00039045548849896676, + "loss": 2.0816, + "step": 262880 + }, + { + "epoch": 1.000624224477212, + "grad_norm": 0.11376156657934189, + "learning_rate": 0.00039026547185745653, + "loss": 2.0682, + "step": 262890 + }, + { + "epoch": 1.0006622869453348, + "grad_norm": 0.14367571473121643, + "learning_rate": 0.000390075783681059, + "loss": 2.0899, + "step": 262900 + }, + { + "epoch": 1.0007003494134574, + "grad_norm": 0.11860551685094833, + "learning_rate": 0.0003898864222722738, + "loss": 2.0721, + "step": 262910 + }, + { + "epoch": 1.00073841188158, + "grad_norm": 0.12184567004442215, + "learning_rate": 0.00038969738594817136, + "loss": 2.0688, + "step": 262920 + }, + { + "epoch": 1.0007764743497027, + "grad_norm": 0.12217500805854797, + "learning_rate": 0.00038950867304021855, + "loss": 2.0592, + "step": 262930 + }, + { + "epoch": 1.0008145368178254, + "grad_norm": 0.11617803573608398, + "learning_rate": 0.0003893202818941067, + "loss": 2.0612, + "step": 262940 + }, + { + "epoch": 1.000852599285948, + "grad_norm": 0.13129088282585144, + "learning_rate": 0.00038913221086958274, + "loss": 2.0635, + "step": 262950 + }, + { + "epoch": 1.0008906617540707, + "grad_norm": 0.1188531294465065, + "learning_rate": 0.00038894445834028213, + "loss": 2.0795, + "step": 262960 + }, + { + "epoch": 1.0009287242221934, + "grad_norm": 0.12933498620986938, + "learning_rate": 0.00038875702269356506, + "loss": 2.0678, + "step": 262970 + }, + { + "epoch": 1.0009667866903162, + "grad_norm": 0.12279139459133148, + "learning_rate": 0.0003885699023303548, + "loss": 2.0711, + "step": 262980 + }, + { + "epoch": 1.001004849158439, + "grad_norm": 0.12984012067317963, + "learning_rate": 0.00038838309566497855, + "loss": 2.0839, + "step": 262990 + }, + { + "epoch": 1.0010429116265616, + "grad_norm": 0.12132301926612854, + "learning_rate": 0.00038819660112501055, + "loss": 2.0716, + "step": 263000 + }, + { + "epoch": 1.0010809740946842, + "grad_norm": 0.2595154941082001, + "learning_rate": 0.00038801041715111773, + "loss": 2.0623, + "step": 263010 + }, + { + "epoch": 1.0011190365628069, + "grad_norm": 0.12713661789894104, + "learning_rate": 0.00038782454219690775, + "loss": 2.0847, + "step": 263020 + }, + { + "epoch": 1.0011570990309295, + "grad_norm": 0.12192989140748978, + "learning_rate": 0.00038763897472877886, + "loss": 2.0625, + "step": 263030 + }, + { + "epoch": 1.0011951614990522, + "grad_norm": 0.11912639439105988, + "learning_rate": 0.00038745371322577245, + "loss": 2.0742, + "step": 263040 + }, + { + "epoch": 1.0012332239671748, + "grad_norm": 0.11651023477315903, + "learning_rate": 0.00038726875617942763, + "loss": 2.0721, + "step": 263050 + }, + { + "epoch": 1.0012712864352975, + "grad_norm": 0.12285809963941574, + "learning_rate": 0.00038708410209363785, + "loss": 2.0678, + "step": 263060 + }, + { + "epoch": 1.0013093489034204, + "grad_norm": 0.14245565235614777, + "learning_rate": 0.0003868997494845097, + "loss": 2.0897, + "step": 263070 + }, + { + "epoch": 1.001347411371543, + "grad_norm": 0.14112943410873413, + "learning_rate": 0.0003867156968802238, + "loss": 2.0726, + "step": 263080 + }, + { + "epoch": 1.0013854738396657, + "grad_norm": 0.1136462390422821, + "learning_rate": 0.0003865319428208978, + "loss": 2.0654, + "step": 263090 + }, + { + "epoch": 1.0014235363077884, + "grad_norm": 0.11797811836004257, + "learning_rate": 0.00038634848585845124, + "loss": 2.0743, + "step": 263100 + }, + { + "epoch": 1.001461598775911, + "grad_norm": 0.12332088500261307, + "learning_rate": 0.00038616532455647214, + "loss": 2.0581, + "step": 263110 + }, + { + "epoch": 1.0014996612440337, + "grad_norm": 0.12641309201717377, + "learning_rate": 0.0003859824574900862, + "loss": 2.0921, + "step": 263120 + }, + { + "epoch": 1.0015377237121563, + "grad_norm": 0.12153881043195724, + "learning_rate": 0.00038579988324582737, + "loss": 2.0746, + "step": 263130 + }, + { + "epoch": 1.001575786180279, + "grad_norm": 0.1199311763048172, + "learning_rate": 0.0003856176004215101, + "loss": 2.0535, + "step": 263140 + }, + { + "epoch": 1.0016138486484019, + "grad_norm": 0.11625245958566666, + "learning_rate": 0.000385435607626104, + "loss": 2.074, + "step": 263150 + }, + { + "epoch": 1.0016519111165245, + "grad_norm": 0.11807789653539658, + "learning_rate": 0.00038525390347961, + "loss": 2.0766, + "step": 263160 + }, + { + "epoch": 1.0016899735846472, + "grad_norm": 0.11367712914943695, + "learning_rate": 0.0003850724866129379, + "loss": 2.0647, + "step": 263170 + }, + { + "epoch": 1.0017280360527698, + "grad_norm": 0.12399783730506897, + "learning_rate": 0.00038489135566778667, + "loss": 2.0563, + "step": 263180 + }, + { + "epoch": 1.0017660985208925, + "grad_norm": 0.11463885754346848, + "learning_rate": 0.00038471050929652495, + "loss": 2.0593, + "step": 263190 + }, + { + "epoch": 1.0018041609890151, + "grad_norm": 0.1467050015926361, + "learning_rate": 0.00038452994616207484, + "loss": 2.0702, + "step": 263200 + }, + { + "epoch": 1.0018422234571378, + "grad_norm": 0.12973052263259888, + "learning_rate": 0.0003843496649377962, + "loss": 2.0552, + "step": 263210 + }, + { + "epoch": 1.0018802859252605, + "grad_norm": 0.1213889867067337, + "learning_rate": 0.000384169664307373, + "loss": 2.0743, + "step": 263220 + }, + { + "epoch": 1.0019183483933831, + "grad_norm": 0.1346946805715561, + "learning_rate": 0.00038398994296470095, + "loss": 2.081, + "step": 263230 + }, + { + "epoch": 1.001956410861506, + "grad_norm": 0.13012921810150146, + "learning_rate": 0.0003838104996137775, + "loss": 2.065, + "step": 263240 + }, + { + "epoch": 1.0019944733296287, + "grad_norm": 0.11883129179477692, + "learning_rate": 0.00038363133296859215, + "loss": 2.0656, + "step": 263250 + }, + { + "epoch": 1.0020325357977513, + "grad_norm": 0.14053906500339508, + "learning_rate": 0.0003834524417530194, + "loss": 2.0638, + "step": 263260 + }, + { + "epoch": 1.002070598265874, + "grad_norm": 0.12595908343791962, + "learning_rate": 0.0003832738247007125, + "loss": 2.074, + "step": 263270 + }, + { + "epoch": 1.0021086607339966, + "grad_norm": 0.11965731531381607, + "learning_rate": 0.00038309548055499875, + "loss": 2.0691, + "step": 263280 + }, + { + "epoch": 1.0021467232021193, + "grad_norm": 0.11948921531438828, + "learning_rate": 0.0003829174080687768, + "loss": 2.0721, + "step": 263290 + }, + { + "epoch": 1.002184785670242, + "grad_norm": 0.11565563082695007, + "learning_rate": 0.0003827396060044143, + "loss": 2.0749, + "step": 263300 + }, + { + "epoch": 1.0022228481383646, + "grad_norm": 0.13267754018306732, + "learning_rate": 0.0003825620731336479, + "loss": 2.0906, + "step": 263310 + }, + { + "epoch": 1.0022609106064875, + "grad_norm": 0.12567409873008728, + "learning_rate": 0.00038238480823748436, + "loss": 2.0804, + "step": 263320 + }, + { + "epoch": 1.0022989730746101, + "grad_norm": 0.11261701583862305, + "learning_rate": 0.00038220781010610256, + "loss": 2.0572, + "step": 263330 + }, + { + "epoch": 1.0023370355427328, + "grad_norm": 0.1253342479467392, + "learning_rate": 0.00038203107753875744, + "loss": 2.0854, + "step": 263340 + }, + { + "epoch": 1.0023750980108554, + "grad_norm": 0.12061937898397446, + "learning_rate": 0.00038185460934368477, + "loss": 2.0691, + "step": 263350 + }, + { + "epoch": 1.002413160478978, + "grad_norm": 0.12016843259334564, + "learning_rate": 0.0003816784043380077, + "loss": 2.069, + "step": 263360 + }, + { + "epoch": 1.0024512229471008, + "grad_norm": 0.12179528176784515, + "learning_rate": 0.00038150246134764377, + "loss": 2.085, + "step": 263370 + }, + { + "epoch": 1.0024892854152234, + "grad_norm": 0.11890345811843872, + "learning_rate": 0.000381326779207214, + "loss": 2.0874, + "step": 263380 + }, + { + "epoch": 1.002527347883346, + "grad_norm": 0.139007106423378, + "learning_rate": 0.0003811513567599529, + "loss": 2.0827, + "step": 263390 + }, + { + "epoch": 1.0025654103514687, + "grad_norm": 0.11820542812347412, + "learning_rate": 0.0003809761928576192, + "loss": 2.0754, + "step": 263400 + }, + { + "epoch": 1.0026034728195916, + "grad_norm": 0.13151657581329346, + "learning_rate": 0.00038080128636040853, + "loss": 2.0838, + "step": 263410 + }, + { + "epoch": 1.0026415352877143, + "grad_norm": 0.12178993970155716, + "learning_rate": 0.00038062663613686677, + "loss": 2.0577, + "step": 263420 + }, + { + "epoch": 1.002679597755837, + "grad_norm": 0.12232571840286255, + "learning_rate": 0.0003804522410638047, + "loss": 2.0789, + "step": 263430 + }, + { + "epoch": 1.0027176602239596, + "grad_norm": 0.13086816668510437, + "learning_rate": 0.00038027810002621356, + "loss": 2.0751, + "step": 263440 + }, + { + "epoch": 1.0027557226920822, + "grad_norm": 0.12952834367752075, + "learning_rate": 0.000380104211917182, + "loss": 2.0804, + "step": 263450 + }, + { + "epoch": 1.002793785160205, + "grad_norm": 0.1318865418434143, + "learning_rate": 0.00037993057563781417, + "loss": 2.0713, + "step": 263460 + }, + { + "epoch": 1.0028318476283276, + "grad_norm": 0.1240028366446495, + "learning_rate": 0.00037975719009714824, + "loss": 2.0767, + "step": 263470 + }, + { + "epoch": 1.0028699100964502, + "grad_norm": 0.11515330523252487, + "learning_rate": 0.000379584054212077, + "loss": 2.0785, + "step": 263480 + }, + { + "epoch": 1.0029079725645729, + "grad_norm": 0.15111832320690155, + "learning_rate": 0.00037941116690726846, + "loss": 2.0584, + "step": 263490 + }, + { + "epoch": 1.0029460350326957, + "grad_norm": 0.1152157112956047, + "learning_rate": 0.000379238527115088, + "loss": 2.0668, + "step": 263500 + }, + { + "epoch": 1.0029840975008184, + "grad_norm": 0.12826646864414215, + "learning_rate": 0.0003790661337755218, + "loss": 2.0696, + "step": 263510 + }, + { + "epoch": 1.003022159968941, + "grad_norm": 0.12478494644165039, + "learning_rate": 0.00037889398583610035, + "loss": 2.0901, + "step": 263520 + }, + { + "epoch": 1.0030602224370637, + "grad_norm": 0.12773947417736053, + "learning_rate": 0.000378722082251824, + "loss": 2.0644, + "step": 263530 + }, + { + "epoch": 1.0030982849051864, + "grad_norm": 0.11997218430042267, + "learning_rate": 0.0003785504219850888, + "loss": 2.0726, + "step": 263540 + }, + { + "epoch": 1.003136347373309, + "grad_norm": 0.12420439720153809, + "learning_rate": 0.00037837900400561315, + "loss": 2.0677, + "step": 263550 + }, + { + "epoch": 1.0031744098414317, + "grad_norm": 0.12196481972932816, + "learning_rate": 0.00037820782729036597, + "loss": 2.0567, + "step": 263560 + }, + { + "epoch": 1.0032124723095543, + "grad_norm": 0.11405656486749649, + "learning_rate": 0.00037803689082349537, + "loss": 2.0673, + "step": 263570 + }, + { + "epoch": 1.0032505347776772, + "grad_norm": 0.12951435148715973, + "learning_rate": 0.0003778661935962583, + "loss": 2.0634, + "step": 263580 + }, + { + "epoch": 1.0032885972457999, + "grad_norm": 0.12632913887500763, + "learning_rate": 0.0003776957346069511, + "loss": 2.0728, + "step": 263590 + }, + { + "epoch": 1.0033266597139225, + "grad_norm": 0.13721810281276703, + "learning_rate": 0.0003775255128608411, + "loss": 2.0819, + "step": 263600 + }, + { + "epoch": 1.0033647221820452, + "grad_norm": 0.1207350492477417, + "learning_rate": 0.0003773555273700985, + "loss": 2.0599, + "step": 263610 + }, + { + "epoch": 1.0034027846501679, + "grad_norm": 0.1209789514541626, + "learning_rate": 0.00037718577715372976, + "loss": 2.0713, + "step": 263620 + }, + { + "epoch": 1.0034408471182905, + "grad_norm": 0.12487935274839401, + "learning_rate": 0.0003770162612375116, + "loss": 2.0793, + "step": 263630 + }, + { + "epoch": 1.0034789095864132, + "grad_norm": 0.13212476670742035, + "learning_rate": 0.0003768469786539256, + "loss": 2.0634, + "step": 263640 + }, + { + "epoch": 1.0035169720545358, + "grad_norm": 0.11858946830034256, + "learning_rate": 0.00037667792844209383, + "loss": 2.0764, + "step": 263650 + }, + { + "epoch": 1.0035550345226585, + "grad_norm": 0.12068924307823181, + "learning_rate": 0.00037650910964771536, + "loss": 2.0705, + "step": 263660 + }, + { + "epoch": 1.0035930969907814, + "grad_norm": 0.11545553803443909, + "learning_rate": 0.00037634052132300307, + "loss": 2.0713, + "step": 263670 + }, + { + "epoch": 1.003631159458904, + "grad_norm": 0.11419456452131271, + "learning_rate": 0.00037617216252662196, + "loss": 2.0601, + "step": 263680 + }, + { + "epoch": 1.0036692219270267, + "grad_norm": 0.12380310148000717, + "learning_rate": 0.00037600403232362754, + "loss": 2.0698, + "step": 263690 + }, + { + "epoch": 1.0037072843951493, + "grad_norm": 0.12769466638565063, + "learning_rate": 0.0003758361297854055, + "loss": 2.0653, + "step": 263700 + }, + { + "epoch": 1.003745346863272, + "grad_norm": 0.11498839408159256, + "learning_rate": 0.0003756684539896116, + "loss": 2.0702, + "step": 263710 + }, + { + "epoch": 1.0037834093313946, + "grad_norm": 0.1188734918832779, + "learning_rate": 0.0003755010040201127, + "loss": 2.0584, + "step": 263720 + }, + { + "epoch": 1.0038214717995173, + "grad_norm": 0.12653295695781708, + "learning_rate": 0.0003753337789669284, + "loss": 2.0629, + "step": 263730 + }, + { + "epoch": 1.00385953426764, + "grad_norm": 0.11432266980409622, + "learning_rate": 0.0003751667779261733, + "loss": 2.0694, + "step": 263740 + }, + { + "epoch": 1.0038975967357628, + "grad_norm": 0.13446052372455597, + "learning_rate": 0.000375, + "loss": 2.0673, + "step": 263750 + }, + { + "epoch": 1.0039356592038855, + "grad_norm": 0.20220239460468292, + "learning_rate": 0.00037483344429654273, + "loss": 2.0727, + "step": 263760 + }, + { + "epoch": 1.0039737216720082, + "grad_norm": 0.12345918267965317, + "learning_rate": 0.00037466710992986184, + "loss": 2.0663, + "step": 263770 + }, + { + "epoch": 1.0040117841401308, + "grad_norm": 0.11362280696630478, + "learning_rate": 0.0003745009960198887, + "loss": 2.0641, + "step": 263780 + }, + { + "epoch": 1.0040498466082535, + "grad_norm": 0.12851089239120483, + "learning_rate": 0.00037433510169237127, + "loss": 2.0735, + "step": 263790 + }, + { + "epoch": 1.0040879090763761, + "grad_norm": 0.13473722338676453, + "learning_rate": 0.00037416942607882087, + "loss": 2.0698, + "step": 263800 + }, + { + "epoch": 1.0041259715444988, + "grad_norm": 0.13479141891002655, + "learning_rate": 0.00037400396831645846, + "loss": 2.082, + "step": 263810 + }, + { + "epoch": 1.0041640340126214, + "grad_norm": 0.11503574252128601, + "learning_rate": 0.00037383872754816297, + "loss": 2.0694, + "step": 263820 + }, + { + "epoch": 1.004202096480744, + "grad_norm": 0.1254061758518219, + "learning_rate": 0.00037367370292241866, + "loss": 2.0849, + "step": 263830 + }, + { + "epoch": 1.004240158948867, + "grad_norm": 0.12476842105388641, + "learning_rate": 0.00037350889359326486, + "loss": 2.0783, + "step": 263840 + }, + { + "epoch": 1.0042782214169896, + "grad_norm": 0.12398224323987961, + "learning_rate": 0.00037334429872024446, + "loss": 2.0698, + "step": 263850 + }, + { + "epoch": 1.0043162838851123, + "grad_norm": 0.10987059772014618, + "learning_rate": 0.00037317991746835467, + "loss": 2.0737, + "step": 263860 + }, + { + "epoch": 1.004354346353235, + "grad_norm": 0.1181088387966156, + "learning_rate": 0.00037301574900799704, + "loss": 2.0674, + "step": 263870 + }, + { + "epoch": 1.0043924088213576, + "grad_norm": 0.1426362544298172, + "learning_rate": 0.000372851792514929, + "loss": 2.0892, + "step": 263880 + }, + { + "epoch": 1.0044304712894803, + "grad_norm": 0.12462259083986282, + "learning_rate": 0.00037268804717021525, + "loss": 2.0735, + "step": 263890 + }, + { + "epoch": 1.004468533757603, + "grad_norm": 0.12202735990285873, + "learning_rate": 0.00037252451216018037, + "loss": 2.0563, + "step": 263900 + }, + { + "epoch": 1.0045065962257256, + "grad_norm": 0.12221315503120422, + "learning_rate": 0.0003723611866763614, + "loss": 2.0689, + "step": 263910 + }, + { + "epoch": 1.0045446586938482, + "grad_norm": 0.11692935228347778, + "learning_rate": 0.00037219806991546124, + "loss": 2.0791, + "step": 263920 + }, + { + "epoch": 1.0045827211619711, + "grad_norm": 0.13614268600940704, + "learning_rate": 0.0003720351610793027, + "loss": 2.0521, + "step": 263930 + }, + { + "epoch": 1.0046207836300938, + "grad_norm": 0.1747211217880249, + "learning_rate": 0.0003718724593747829, + "loss": 2.0717, + "step": 263940 + }, + { + "epoch": 1.0046588460982164, + "grad_norm": 0.12883198261260986, + "learning_rate": 0.0003717099640138279, + "loss": 2.0805, + "step": 263950 + }, + { + "epoch": 1.004696908566339, + "grad_norm": 0.11266852915287018, + "learning_rate": 0.0003715476742133487, + "loss": 2.0634, + "step": 263960 + }, + { + "epoch": 1.0047349710344617, + "grad_norm": 0.11975182592868805, + "learning_rate": 0.0003713855891951968, + "loss": 2.0719, + "step": 263970 + }, + { + "epoch": 1.0047730335025844, + "grad_norm": 0.13118694722652435, + "learning_rate": 0.00037122370818612097, + "loss": 2.0715, + "step": 263980 + }, + { + "epoch": 1.004811095970707, + "grad_norm": 0.1224631518125534, + "learning_rate": 0.00037106203041772375, + "loss": 2.0583, + "step": 263990 + }, + { + "epoch": 1.0048491584388297, + "grad_norm": 0.12530510127544403, + "learning_rate": 0.00037090055512641946, + "loss": 2.0691, + "step": 264000 + }, + { + "epoch": 1.0048872209069526, + "grad_norm": 0.12490088492631912, + "learning_rate": 0.00037073928155339174, + "loss": 2.0633, + "step": 264010 + }, + { + "epoch": 1.0049252833750753, + "grad_norm": 0.13902606070041656, + "learning_rate": 0.0003705782089445522, + "loss": 2.0725, + "step": 264020 + }, + { + "epoch": 1.004963345843198, + "grad_norm": 0.12821468710899353, + "learning_rate": 0.0003704173365504989, + "loss": 2.0639, + "step": 264030 + }, + { + "epoch": 1.0050014083113206, + "grad_norm": 0.11437909305095673, + "learning_rate": 0.00037025666362647627, + "loss": 2.0664, + "step": 264040 + }, + { + "epoch": 1.0050394707794432, + "grad_norm": 0.13727909326553345, + "learning_rate": 0.0003700961894323342, + "loss": 2.0568, + "step": 264050 + }, + { + "epoch": 1.0050775332475659, + "grad_norm": 0.11755349487066269, + "learning_rate": 0.00036993591323248887, + "loss": 2.0766, + "step": 264060 + }, + { + "epoch": 1.0051155957156885, + "grad_norm": 0.11817418038845062, + "learning_rate": 0.00036977583429588295, + "loss": 2.084, + "step": 264070 + }, + { + "epoch": 1.0051536581838112, + "grad_norm": 0.11482264846563339, + "learning_rate": 0.000369615951895947, + "loss": 2.0624, + "step": 264080 + }, + { + "epoch": 1.0051917206519339, + "grad_norm": 0.11733140796422958, + "learning_rate": 0.00036945626531056084, + "loss": 2.0585, + "step": 264090 + }, + { + "epoch": 1.0052297831200567, + "grad_norm": 0.12599030137062073, + "learning_rate": 0.00036929677382201566, + "loss": 2.0602, + "step": 264100 + }, + { + "epoch": 1.0052678455881794, + "grad_norm": 0.12617869675159454, + "learning_rate": 0.000369137476716976, + "loss": 2.0603, + "step": 264110 + }, + { + "epoch": 1.005305908056302, + "grad_norm": 0.1278885155916214, + "learning_rate": 0.000368978373286443, + "loss": 2.0668, + "step": 264120 + }, + { + "epoch": 1.0053439705244247, + "grad_norm": 0.12624938786029816, + "learning_rate": 0.0003688194628257174, + "loss": 2.0669, + "step": 264130 + }, + { + "epoch": 1.0053820329925474, + "grad_norm": 0.12271690368652344, + "learning_rate": 0.00036866074463436305, + "loss": 2.0706, + "step": 264140 + }, + { + "epoch": 1.00542009546067, + "grad_norm": 0.11715830862522125, + "learning_rate": 0.00036850221801617086, + "loss": 2.0778, + "step": 264150 + }, + { + "epoch": 1.0054581579287927, + "grad_norm": 0.11676019430160522, + "learning_rate": 0.00036834388227912333, + "loss": 2.0479, + "step": 264160 + }, + { + "epoch": 1.0054962203969153, + "grad_norm": 0.13122425973415375, + "learning_rate": 0.00036818573673535934, + "loss": 2.0753, + "step": 264170 + }, + { + "epoch": 1.0055342828650382, + "grad_norm": 0.12575967609882355, + "learning_rate": 0.000368027780701139, + "loss": 2.0607, + "step": 264180 + }, + { + "epoch": 1.0055723453331609, + "grad_norm": 0.1159675344824791, + "learning_rate": 0.00036787001349680937, + "loss": 2.0737, + "step": 264190 + }, + { + "epoch": 1.0056104078012835, + "grad_norm": 0.11694355309009552, + "learning_rate": 0.0003677124344467705, + "loss": 2.0636, + "step": 264200 + }, + { + "epoch": 1.0056484702694062, + "grad_norm": 0.12231706827878952, + "learning_rate": 0.0003675550428794412, + "loss": 2.0793, + "step": 264210 + }, + { + "epoch": 1.0056865327375288, + "grad_norm": 0.12539350986480713, + "learning_rate": 0.0003673978381272261, + "loss": 2.0681, + "step": 264220 + }, + { + "epoch": 1.0057245952056515, + "grad_norm": 0.1346427947282791, + "learning_rate": 0.0003672408195264825, + "loss": 2.0652, + "step": 264230 + }, + { + "epoch": 1.0057626576737742, + "grad_norm": 0.11903837323188782, + "learning_rate": 0.00036708398641748744, + "loss": 2.0749, + "step": 264240 + }, + { + "epoch": 1.0058007201418968, + "grad_norm": 0.12069795280694962, + "learning_rate": 0.0003669273381444057, + "loss": 2.0741, + "step": 264250 + }, + { + "epoch": 1.0058387826100195, + "grad_norm": 0.129713237285614, + "learning_rate": 0.0003667708740552578, + "loss": 2.0621, + "step": 264260 + }, + { + "epoch": 1.0058768450781423, + "grad_norm": 0.11903487145900726, + "learning_rate": 0.00036661459350188763, + "loss": 2.0669, + "step": 264270 + }, + { + "epoch": 1.005914907546265, + "grad_norm": 0.10933295637369156, + "learning_rate": 0.00036645849583993254, + "loss": 2.0737, + "step": 264280 + }, + { + "epoch": 1.0059529700143877, + "grad_norm": 0.11205419898033142, + "learning_rate": 0.00036630258042879063, + "loss": 2.0696, + "step": 264290 + }, + { + "epoch": 1.0059910324825103, + "grad_norm": 0.11133591085672379, + "learning_rate": 0.0003661468466315916, + "loss": 2.0711, + "step": 264300 + }, + { + "epoch": 1.006029094950633, + "grad_norm": 0.12142396718263626, + "learning_rate": 0.00036599129381516537, + "loss": 2.0675, + "step": 264310 + }, + { + "epoch": 1.0060671574187556, + "grad_norm": 0.13197079300880432, + "learning_rate": 0.0003658359213500126, + "loss": 2.0718, + "step": 264320 + }, + { + "epoch": 1.0061052198868783, + "grad_norm": 0.1480017453432083, + "learning_rate": 0.0003656807286102747, + "loss": 2.0737, + "step": 264330 + }, + { + "epoch": 1.006143282355001, + "grad_norm": 0.12461571395397186, + "learning_rate": 0.00036552571497370465, + "loss": 2.0583, + "step": 264340 + }, + { + "epoch": 1.0061813448231236, + "grad_norm": 0.12747173011302948, + "learning_rate": 0.0003653708798216374, + "loss": 2.0611, + "step": 264350 + }, + { + "epoch": 1.0062194072912465, + "grad_norm": 0.13145940005779266, + "learning_rate": 0.0003652162225389618, + "loss": 2.0746, + "step": 264360 + }, + { + "epoch": 1.0062574697593691, + "grad_norm": 0.12994110584259033, + "learning_rate": 0.0003650617425140915, + "loss": 2.0586, + "step": 264370 + }, + { + "epoch": 1.0062955322274918, + "grad_norm": 0.12092719972133636, + "learning_rate": 0.0003649074391389371, + "loss": 2.0539, + "step": 264380 + }, + { + "epoch": 1.0063335946956145, + "grad_norm": 0.12784551084041595, + "learning_rate": 0.0003647533118088777, + "loss": 2.0727, + "step": 264390 + }, + { + "epoch": 1.006371657163737, + "grad_norm": 0.12413742393255234, + "learning_rate": 0.000364599359922734, + "loss": 2.0712, + "step": 264400 + }, + { + "epoch": 1.0064097196318598, + "grad_norm": 0.12628783285617828, + "learning_rate": 0.0003644455828827404, + "loss": 2.058, + "step": 264410 + }, + { + "epoch": 1.0064477820999824, + "grad_norm": 0.11597968637943268, + "learning_rate": 0.0003642919800945181, + "loss": 2.0514, + "step": 264420 + }, + { + "epoch": 1.006485844568105, + "grad_norm": 0.12907037138938904, + "learning_rate": 0.0003641385509670483, + "loss": 2.0661, + "step": 264430 + }, + { + "epoch": 1.006523907036228, + "grad_norm": 0.12897427380084991, + "learning_rate": 0.0003639852949126455, + "loss": 2.0635, + "step": 264440 + }, + { + "epoch": 1.0065619695043506, + "grad_norm": 0.1267291009426117, + "learning_rate": 0.00036383221134693173, + "loss": 2.0646, + "step": 264450 + }, + { + "epoch": 1.0066000319724733, + "grad_norm": 0.12274293601512909, + "learning_rate": 0.0003636792996888098, + "loss": 2.0733, + "step": 264460 + }, + { + "epoch": 1.006638094440596, + "grad_norm": 0.1234775260090828, + "learning_rate": 0.0003635265593604382, + "loss": 2.0722, + "step": 264470 + }, + { + "epoch": 1.0066761569087186, + "grad_norm": 0.1398056596517563, + "learning_rate": 0.0003633739897872054, + "loss": 2.0682, + "step": 264480 + }, + { + "epoch": 1.0067142193768412, + "grad_norm": 0.13379579782485962, + "learning_rate": 0.00036322159039770446, + "loss": 2.0686, + "step": 264490 + }, + { + "epoch": 1.006752281844964, + "grad_norm": 0.11642500758171082, + "learning_rate": 0.0003630693606237085, + "loss": 2.0736, + "step": 264500 + }, + { + "epoch": 1.0067903443130866, + "grad_norm": 0.12083044648170471, + "learning_rate": 0.00036291729990014544, + "loss": 2.0572, + "step": 264510 + }, + { + "epoch": 1.0068284067812092, + "grad_norm": 0.12776164710521698, + "learning_rate": 0.000362765407665074, + "loss": 2.0651, + "step": 264520 + }, + { + "epoch": 1.006866469249332, + "grad_norm": 0.1178087666630745, + "learning_rate": 0.0003626136833596592, + "loss": 2.0825, + "step": 264530 + }, + { + "epoch": 1.0069045317174548, + "grad_norm": 0.14784130454063416, + "learning_rate": 0.0003624621264281483, + "loss": 2.0755, + "step": 264540 + }, + { + "epoch": 1.0069425941855774, + "grad_norm": 0.1876007318496704, + "learning_rate": 0.0003623107363178474, + "loss": 2.0711, + "step": 264550 + }, + { + "epoch": 1.0069806566537, + "grad_norm": 0.13013619184494019, + "learning_rate": 0.00036215951247909776, + "loss": 2.052, + "step": 264560 + }, + { + "epoch": 1.0070187191218227, + "grad_norm": 0.1125163584947586, + "learning_rate": 0.0003620084543652523, + "loss": 2.057, + "step": 264570 + }, + { + "epoch": 1.0070567815899454, + "grad_norm": 0.12135955691337585, + "learning_rate": 0.0003618575614326527, + "loss": 2.0687, + "step": 264580 + }, + { + "epoch": 1.007094844058068, + "grad_norm": 0.1208009198307991, + "learning_rate": 0.0003617068331406067, + "loss": 2.0622, + "step": 264590 + }, + { + "epoch": 1.0071329065261907, + "grad_norm": 0.11748412996530533, + "learning_rate": 0.0003615562689513654, + "loss": 2.0786, + "step": 264600 + }, + { + "epoch": 1.0071709689943136, + "grad_norm": 0.140289306640625, + "learning_rate": 0.00036140586833010094, + "loss": 2.0799, + "step": 264610 + }, + { + "epoch": 1.0072090314624362, + "grad_norm": 0.12493662536144257, + "learning_rate": 0.00036125563074488394, + "loss": 2.0628, + "step": 264620 + }, + { + "epoch": 1.007247093930559, + "grad_norm": 0.11624481528997421, + "learning_rate": 0.0003611055556666623, + "loss": 2.0709, + "step": 264630 + }, + { + "epoch": 1.0072851563986815, + "grad_norm": 0.11657288670539856, + "learning_rate": 0.00036095564256923864, + "loss": 2.0628, + "step": 264640 + }, + { + "epoch": 1.0073232188668042, + "grad_norm": 0.1291431337594986, + "learning_rate": 0.00036080589092924944, + "loss": 2.0778, + "step": 264650 + }, + { + "epoch": 1.0073612813349269, + "grad_norm": 0.11379469186067581, + "learning_rate": 0.0003606563002261435, + "loss": 2.0554, + "step": 264660 + }, + { + "epoch": 1.0073993438030495, + "grad_norm": 0.12691283226013184, + "learning_rate": 0.0003605068699421605, + "loss": 2.0661, + "step": 264670 + }, + { + "epoch": 1.0074374062711722, + "grad_norm": 0.12331678718328476, + "learning_rate": 0.0003603575995623106, + "loss": 2.0747, + "step": 264680 + }, + { + "epoch": 1.0074754687392948, + "grad_norm": 0.11112986505031586, + "learning_rate": 0.0003602084885743535, + "loss": 2.0633, + "step": 264690 + }, + { + "epoch": 1.0075135312074177, + "grad_norm": 0.11814521253108978, + "learning_rate": 0.00036005953646877774, + "loss": 2.0625, + "step": 264700 + }, + { + "epoch": 1.0075515936755404, + "grad_norm": 0.12072381377220154, + "learning_rate": 0.000359910742738781, + "loss": 2.0715, + "step": 264710 + }, + { + "epoch": 1.007589656143663, + "grad_norm": 0.11832285672426224, + "learning_rate": 0.00035976210688024913, + "loss": 2.0724, + "step": 264720 + }, + { + "epoch": 1.0076277186117857, + "grad_norm": 0.12225547432899475, + "learning_rate": 0.0003596136283917369, + "loss": 2.0707, + "step": 264730 + }, + { + "epoch": 1.0076657810799083, + "grad_norm": 0.12879660725593567, + "learning_rate": 0.0003594653067744481, + "loss": 2.0624, + "step": 264740 + }, + { + "epoch": 1.007703843548031, + "grad_norm": 0.11869513988494873, + "learning_rate": 0.0003593171415322156, + "loss": 2.0778, + "step": 264750 + }, + { + "epoch": 1.0077419060161537, + "grad_norm": 0.1178370863199234, + "learning_rate": 0.00035916913217148264, + "loss": 2.0569, + "step": 264760 + }, + { + "epoch": 1.0077799684842763, + "grad_norm": 0.12137988209724426, + "learning_rate": 0.00035902127820128314, + "loss": 2.0602, + "step": 264770 + }, + { + "epoch": 1.007818030952399, + "grad_norm": 0.1318652629852295, + "learning_rate": 0.00035887357913322304, + "loss": 2.0584, + "step": 264780 + }, + { + "epoch": 1.0078560934205218, + "grad_norm": 0.1255052536725998, + "learning_rate": 0.000358726034481461, + "loss": 2.0753, + "step": 264790 + }, + { + "epoch": 1.0078941558886445, + "grad_norm": 0.11904001235961914, + "learning_rate": 0.00035857864376269053, + "loss": 2.0701, + "step": 264800 + }, + { + "epoch": 1.0079322183567672, + "grad_norm": 0.1347653716802597, + "learning_rate": 0.0003584314064961205, + "loss": 2.0608, + "step": 264810 + }, + { + "epoch": 1.0079702808248898, + "grad_norm": 0.12691181898117065, + "learning_rate": 0.0003582843222034579, + "loss": 2.067, + "step": 264820 + }, + { + "epoch": 1.0080083432930125, + "grad_norm": 0.11998774111270905, + "learning_rate": 0.00035813739040888894, + "loss": 2.0675, + "step": 264830 + }, + { + "epoch": 1.0080464057611351, + "grad_norm": 0.1235920637845993, + "learning_rate": 0.0003579906106390614, + "loss": 2.0609, + "step": 264840 + }, + { + "epoch": 1.0080844682292578, + "grad_norm": 0.16122466325759888, + "learning_rate": 0.00035784398242306683, + "loss": 2.0666, + "step": 264850 + }, + { + "epoch": 1.0081225306973804, + "grad_norm": 0.12416666746139526, + "learning_rate": 0.00035769750529242295, + "loss": 2.0639, + "step": 264860 + }, + { + "epoch": 1.0081605931655033, + "grad_norm": 0.12113657593727112, + "learning_rate": 0.00035755117878105605, + "loss": 2.0689, + "step": 264870 + }, + { + "epoch": 1.008198655633626, + "grad_norm": 0.12859567999839783, + "learning_rate": 0.00035740500242528375, + "loss": 2.0651, + "step": 264880 + }, + { + "epoch": 1.0082367181017486, + "grad_norm": 0.12713578343391418, + "learning_rate": 0.00035725897576379805, + "loss": 2.0582, + "step": 264890 + }, + { + "epoch": 1.0082747805698713, + "grad_norm": 0.11723849922418594, + "learning_rate": 0.0003571130983376479, + "loss": 2.0713, + "step": 264900 + }, + { + "epoch": 1.008312843037994, + "grad_norm": 0.12310074269771576, + "learning_rate": 0.000356967369690223, + "loss": 2.0527, + "step": 264910 + }, + { + "epoch": 1.0083509055061166, + "grad_norm": 0.12018231302499771, + "learning_rate": 0.00035682178936723646, + "loss": 2.0476, + "step": 264920 + }, + { + "epoch": 1.0083889679742393, + "grad_norm": 0.1254473626613617, + "learning_rate": 0.00035667635691670875, + "loss": 2.0584, + "step": 264930 + }, + { + "epoch": 1.008427030442362, + "grad_norm": 0.1365075260400772, + "learning_rate": 0.00035653107188895106, + "loss": 2.0541, + "step": 264940 + }, + { + "epoch": 1.0084650929104846, + "grad_norm": 0.12408968806266785, + "learning_rate": 0.0003563859338365493, + "loss": 2.0747, + "step": 264950 + }, + { + "epoch": 1.0085031553786075, + "grad_norm": 0.1151510551571846, + "learning_rate": 0.0003562409423143478, + "loss": 2.0481, + "step": 264960 + }, + { + "epoch": 1.0085412178467301, + "grad_norm": 0.12300057709217072, + "learning_rate": 0.00035609609687943365, + "loss": 2.06, + "step": 264970 + }, + { + "epoch": 1.0085792803148528, + "grad_norm": 0.11640718579292297, + "learning_rate": 0.0003559513970911207, + "loss": 2.0609, + "step": 264980 + }, + { + "epoch": 1.0086173427829754, + "grad_norm": 0.13439495861530304, + "learning_rate": 0.00035580684251093375, + "loss": 2.0642, + "step": 264990 + }, + { + "epoch": 1.008655405251098, + "grad_norm": 0.13347728550434113, + "learning_rate": 0.00035566243270259356, + "loss": 2.0612, + "step": 265000 + }, + { + "epoch": 1.0086934677192207, + "grad_norm": 0.12582042813301086, + "learning_rate": 0.0003555181672320011, + "loss": 2.0582, + "step": 265010 + }, + { + "epoch": 1.0087315301873434, + "grad_norm": 0.1276628077030182, + "learning_rate": 0.00035537404566722243, + "loss": 2.0508, + "step": 265020 + }, + { + "epoch": 1.008769592655466, + "grad_norm": 0.13432751595973969, + "learning_rate": 0.0003552300675784735, + "loss": 2.0717, + "step": 265030 + }, + { + "epoch": 1.008807655123589, + "grad_norm": 0.11870171129703522, + "learning_rate": 0.00035508623253810564, + "loss": 2.0571, + "step": 265040 + }, + { + "epoch": 1.0088457175917116, + "grad_norm": 0.12191476672887802, + "learning_rate": 0.00035494254012058993, + "loss": 2.0698, + "step": 265050 + }, + { + "epoch": 1.0088837800598343, + "grad_norm": 0.12333951145410538, + "learning_rate": 0.00035479898990250337, + "loss": 2.0618, + "step": 265060 + }, + { + "epoch": 1.008921842527957, + "grad_norm": 0.11955705285072327, + "learning_rate": 0.00035465558146251366, + "loss": 2.0592, + "step": 265070 + }, + { + "epoch": 1.0089599049960796, + "grad_norm": 0.12724429368972778, + "learning_rate": 0.00035451231438136537, + "loss": 2.0613, + "step": 265080 + }, + { + "epoch": 1.0089979674642022, + "grad_norm": 0.13674019277095795, + "learning_rate": 0.000354369188241865, + "loss": 2.0668, + "step": 265090 + }, + { + "epoch": 1.0090360299323249, + "grad_norm": 0.12613962590694427, + "learning_rate": 0.00035422620262886754, + "loss": 2.0556, + "step": 265100 + }, + { + "epoch": 1.0090740924004475, + "grad_norm": 0.12345962971448898, + "learning_rate": 0.00035408335712926143, + "loss": 2.0559, + "step": 265110 + }, + { + "epoch": 1.0091121548685702, + "grad_norm": 0.11877019703388214, + "learning_rate": 0.0003539406513319557, + "loss": 2.0419, + "step": 265120 + }, + { + "epoch": 1.009150217336693, + "grad_norm": 0.11650692671537399, + "learning_rate": 0.0003537980848278656, + "loss": 2.0547, + "step": 265130 + }, + { + "epoch": 1.0091882798048157, + "grad_norm": 0.12157123535871506, + "learning_rate": 0.0003536556572098987, + "loss": 2.0465, + "step": 265140 + }, + { + "epoch": 1.0092263422729384, + "grad_norm": 0.13357307016849518, + "learning_rate": 0.0003535133680729421, + "loss": 2.0592, + "step": 265150 + }, + { + "epoch": 1.009264404741061, + "grad_norm": 0.12342700362205505, + "learning_rate": 0.00035337121701384825, + "loss": 2.0605, + "step": 265160 + }, + { + "epoch": 1.0093024672091837, + "grad_norm": 0.1354524791240692, + "learning_rate": 0.00035322920363142177, + "loss": 2.0726, + "step": 265170 + }, + { + "epoch": 1.0093405296773064, + "grad_norm": 0.13181686401367188, + "learning_rate": 0.0003530873275264066, + "loss": 2.0707, + "step": 265180 + }, + { + "epoch": 1.009378592145429, + "grad_norm": 0.1218135878443718, + "learning_rate": 0.0003529455883014726, + "loss": 2.0449, + "step": 265190 + }, + { + "epoch": 1.0094166546135517, + "grad_norm": 0.11534344404935837, + "learning_rate": 0.00035280398556120253, + "loss": 2.0425, + "step": 265200 + }, + { + "epoch": 1.0094547170816743, + "grad_norm": 0.12393413484096527, + "learning_rate": 0.0003526625189120795, + "loss": 2.0799, + "step": 265210 + }, + { + "epoch": 1.0094927795497972, + "grad_norm": 0.13263042271137238, + "learning_rate": 0.0003525211879624738, + "loss": 2.0631, + "step": 265220 + }, + { + "epoch": 1.0095308420179199, + "grad_norm": 0.12168634682893753, + "learning_rate": 0.0003523799923226304, + "loss": 2.0658, + "step": 265230 + }, + { + "epoch": 1.0095689044860425, + "grad_norm": 0.1166045144200325, + "learning_rate": 0.0003522389316046567, + "loss": 2.065, + "step": 265240 + }, + { + "epoch": 1.0096069669541652, + "grad_norm": 0.12050196528434753, + "learning_rate": 0.00035209800542250957, + "loss": 2.0715, + "step": 265250 + }, + { + "epoch": 1.0096450294222878, + "grad_norm": 0.13119080662727356, + "learning_rate": 0.0003519572133919836, + "loss": 2.0592, + "step": 265260 + }, + { + "epoch": 1.0096830918904105, + "grad_norm": 0.13546597957611084, + "learning_rate": 0.00035181655513069843, + "loss": 2.0598, + "step": 265270 + }, + { + "epoch": 1.0097211543585332, + "grad_norm": 0.12136995792388916, + "learning_rate": 0.00035167603025808677, + "loss": 2.0529, + "step": 265280 + }, + { + "epoch": 1.0097592168266558, + "grad_norm": 0.13856296241283417, + "learning_rate": 0.0003515356383953824, + "loss": 2.0558, + "step": 265290 + }, + { + "epoch": 1.0097972792947787, + "grad_norm": 0.12482566386461258, + "learning_rate": 0.00035139537916560826, + "loss": 2.0644, + "step": 265300 + }, + { + "epoch": 1.0098353417629014, + "grad_norm": 0.12852446734905243, + "learning_rate": 0.00035125525219356483, + "loss": 2.0676, + "step": 265310 + }, + { + "epoch": 1.009873404231024, + "grad_norm": 0.1314767599105835, + "learning_rate": 0.00035111525710581807, + "loss": 2.0602, + "step": 265320 + }, + { + "epoch": 1.0099114666991467, + "grad_norm": 0.12297448515892029, + "learning_rate": 0.0003509753935306879, + "loss": 2.0586, + "step": 265330 + }, + { + "epoch": 1.0099495291672693, + "grad_norm": 0.1343231052160263, + "learning_rate": 0.00035083566109823705, + "loss": 2.0591, + "step": 265340 + }, + { + "epoch": 1.009987591635392, + "grad_norm": 0.11657698452472687, + "learning_rate": 0.0003506960594402591, + "loss": 2.0508, + "step": 265350 + }, + { + "epoch": 1.0100256541035146, + "grad_norm": 0.13249176740646362, + "learning_rate": 0.00035055658819026735, + "loss": 2.0565, + "step": 265360 + }, + { + "epoch": 1.0100637165716373, + "grad_norm": 0.1254306137561798, + "learning_rate": 0.00035041724698348407, + "loss": 2.0648, + "step": 265370 + }, + { + "epoch": 1.01010177903976, + "grad_norm": 0.12727105617523193, + "learning_rate": 0.00035027803545682864, + "loss": 2.054, + "step": 265380 + }, + { + "epoch": 1.0101398415078828, + "grad_norm": 0.11955145001411438, + "learning_rate": 0.00035013895324890684, + "loss": 2.0738, + "step": 265390 + }, + { + "epoch": 1.0101779039760055, + "grad_norm": 0.11762211471796036, + "learning_rate": 0.00035, + "loss": 2.0612, + "step": 265400 + }, + { + "epoch": 1.0102159664441281, + "grad_norm": 0.13332591950893402, + "learning_rate": 0.00034986117535205403, + "loss": 2.0627, + "step": 265410 + }, + { + "epoch": 1.0102540289122508, + "grad_norm": 0.1313796192407608, + "learning_rate": 0.0003497224789486687, + "loss": 2.0571, + "step": 265420 + }, + { + "epoch": 1.0102920913803735, + "grad_norm": 0.11726871132850647, + "learning_rate": 0.00034958391043508674, + "loss": 2.069, + "step": 265430 + }, + { + "epoch": 1.0103301538484961, + "grad_norm": 0.1298063099384308, + "learning_rate": 0.0003494454694581838, + "loss": 2.0624, + "step": 265440 + }, + { + "epoch": 1.0103682163166188, + "grad_norm": 0.12552876770496368, + "learning_rate": 0.0003493071556664573, + "loss": 2.0478, + "step": 265450 + }, + { + "epoch": 1.0104062787847414, + "grad_norm": 0.12347956746816635, + "learning_rate": 0.00034916896871001646, + "loss": 2.0487, + "step": 265460 + }, + { + "epoch": 1.0104443412528643, + "grad_norm": 0.11969196796417236, + "learning_rate": 0.0003490309082405718, + "loss": 2.0575, + "step": 265470 + }, + { + "epoch": 1.010482403720987, + "grad_norm": 0.12199029326438904, + "learning_rate": 0.0003488929739114248, + "loss": 2.0509, + "step": 265480 + }, + { + "epoch": 1.0105204661891096, + "grad_norm": 0.12357623130083084, + "learning_rate": 0.0003487551653774583, + "loss": 2.0611, + "step": 265490 + }, + { + "epoch": 1.0105585286572323, + "grad_norm": 0.1269357055425644, + "learning_rate": 0.00034861748229512545, + "loss": 2.0546, + "step": 265500 + }, + { + "epoch": 1.010596591125355, + "grad_norm": 0.12839782238006592, + "learning_rate": 0.0003484799243224406, + "loss": 2.0637, + "step": 265510 + }, + { + "epoch": 1.0106346535934776, + "grad_norm": 0.12106683105230331, + "learning_rate": 0.00034834249111896897, + "loss": 2.0545, + "step": 265520 + }, + { + "epoch": 1.0106727160616003, + "grad_norm": 0.11588679999113083, + "learning_rate": 0.00034820518234581697, + "loss": 2.0674, + "step": 265530 + }, + { + "epoch": 1.010710778529723, + "grad_norm": 0.12074842303991318, + "learning_rate": 0.00034806799766562234, + "loss": 2.0695, + "step": 265540 + }, + { + "epoch": 1.0107488409978456, + "grad_norm": 0.12576799094676971, + "learning_rate": 0.0003479309367425445, + "loss": 2.0669, + "step": 265550 + }, + { + "epoch": 1.0107869034659684, + "grad_norm": 0.12758374214172363, + "learning_rate": 0.000347793999242255, + "loss": 2.0525, + "step": 265560 + }, + { + "epoch": 1.010824965934091, + "grad_norm": 0.13564392924308777, + "learning_rate": 0.00034765718483192805, + "loss": 2.0669, + "step": 265570 + }, + { + "epoch": 1.0108630284022138, + "grad_norm": 0.12913645803928375, + "learning_rate": 0.00034752049318023095, + "loss": 2.0469, + "step": 265580 + }, + { + "epoch": 1.0109010908703364, + "grad_norm": 0.1334034502506256, + "learning_rate": 0.00034738392395731484, + "loss": 2.0645, + "step": 265590 + }, + { + "epoch": 1.010939153338459, + "grad_norm": 0.12395340949296951, + "learning_rate": 0.00034724747683480533, + "loss": 2.059, + "step": 265600 + }, + { + "epoch": 1.0109772158065817, + "grad_norm": 0.1262947916984558, + "learning_rate": 0.0003471111514857934, + "loss": 2.0786, + "step": 265610 + }, + { + "epoch": 1.0110152782747044, + "grad_norm": 0.11491633951663971, + "learning_rate": 0.0003469749475848263, + "loss": 2.0532, + "step": 265620 + }, + { + "epoch": 1.011053340742827, + "grad_norm": 0.12061280012130737, + "learning_rate": 0.00034683886480789807, + "loss": 2.0567, + "step": 265630 + }, + { + "epoch": 1.0110914032109497, + "grad_norm": 0.11542093008756638, + "learning_rate": 0.0003467029028324411, + "loss": 2.0645, + "step": 265640 + }, + { + "epoch": 1.0111294656790726, + "grad_norm": 0.11799933016300201, + "learning_rate": 0.00034656706133731694, + "loss": 2.0585, + "step": 265650 + }, + { + "epoch": 1.0111675281471952, + "grad_norm": 0.13193176686763763, + "learning_rate": 0.0003464313400028074, + "loss": 2.0518, + "step": 265660 + }, + { + "epoch": 1.011205590615318, + "grad_norm": 0.12344823032617569, + "learning_rate": 0.000346295738510606, + "loss": 2.0678, + "step": 265670 + }, + { + "epoch": 1.0112436530834406, + "grad_norm": 0.1263018101453781, + "learning_rate": 0.000346160256543809, + "loss": 2.0649, + "step": 265680 + }, + { + "epoch": 1.0112817155515632, + "grad_norm": 0.12341112643480301, + "learning_rate": 0.00034602489378690685, + "loss": 2.0566, + "step": 265690 + }, + { + "epoch": 1.0113197780196859, + "grad_norm": 0.1236085444688797, + "learning_rate": 0.0003458896499257756, + "loss": 2.0558, + "step": 265700 + }, + { + "epoch": 1.0113578404878085, + "grad_norm": 0.13375934958457947, + "learning_rate": 0.00034575452464766866, + "loss": 2.0586, + "step": 265710 + }, + { + "epoch": 1.0113959029559312, + "grad_norm": 0.11553890258073807, + "learning_rate": 0.00034561951764120786, + "loss": 2.042, + "step": 265720 + }, + { + "epoch": 1.011433965424054, + "grad_norm": 0.15427514910697937, + "learning_rate": 0.0003454846285963756, + "loss": 2.0556, + "step": 265730 + }, + { + "epoch": 1.0114720278921767, + "grad_norm": 0.12771137058734894, + "learning_rate": 0.00034534985720450606, + "loss": 2.0808, + "step": 265740 + }, + { + "epoch": 1.0115100903602994, + "grad_norm": 0.12867282330989838, + "learning_rate": 0.0003452152031582774, + "loss": 2.0604, + "step": 265750 + }, + { + "epoch": 1.011548152828422, + "grad_norm": 0.12850438058376312, + "learning_rate": 0.0003450806661517033, + "loss": 2.0593, + "step": 265760 + }, + { + "epoch": 1.0115862152965447, + "grad_norm": 0.12304188311100006, + "learning_rate": 0.00034494624588012496, + "loss": 2.0701, + "step": 265770 + }, + { + "epoch": 1.0116242777646673, + "grad_norm": 0.1225685402750969, + "learning_rate": 0.00034481194204020294, + "loss": 2.0595, + "step": 265780 + }, + { + "epoch": 1.01166234023279, + "grad_norm": 0.12346023321151733, + "learning_rate": 0.00034467775432990935, + "loss": 2.0654, + "step": 265790 + }, + { + "epoch": 1.0117004027009127, + "grad_norm": 0.12504765391349792, + "learning_rate": 0.00034454368244851973, + "loss": 2.0584, + "step": 265800 + }, + { + "epoch": 1.0117384651690353, + "grad_norm": 0.1463267207145691, + "learning_rate": 0.0003444097260966055, + "loss": 2.0563, + "step": 265810 + }, + { + "epoch": 1.0117765276371582, + "grad_norm": 0.13947755098342896, + "learning_rate": 0.00034427588497602563, + "loss": 2.0643, + "step": 265820 + }, + { + "epoch": 1.0118145901052809, + "grad_norm": 0.1254192292690277, + "learning_rate": 0.0003441421587899195, + "loss": 2.0549, + "step": 265830 + }, + { + "epoch": 1.0118526525734035, + "grad_norm": 0.12350041419267654, + "learning_rate": 0.00034400854724269883, + "loss": 2.0662, + "step": 265840 + }, + { + "epoch": 1.0118907150415262, + "grad_norm": 0.1425914466381073, + "learning_rate": 0.00034387505004004003, + "loss": 2.063, + "step": 265850 + }, + { + "epoch": 1.0119287775096488, + "grad_norm": 0.12194148451089859, + "learning_rate": 0.000343741666888877, + "loss": 2.0575, + "step": 265860 + }, + { + "epoch": 1.0119668399777715, + "grad_norm": 0.1255568414926529, + "learning_rate": 0.00034360839749739337, + "loss": 2.0679, + "step": 265870 + }, + { + "epoch": 1.0120049024458941, + "grad_norm": 0.12367769330739975, + "learning_rate": 0.00034347524157501475, + "loss": 2.0739, + "step": 265880 + }, + { + "epoch": 1.0120429649140168, + "grad_norm": 0.11774411052465439, + "learning_rate": 0.00034334219883240205, + "loss": 2.0546, + "step": 265890 + }, + { + "epoch": 1.0120810273821397, + "grad_norm": 0.1194257140159607, + "learning_rate": 0.00034320926898144354, + "loss": 2.0595, + "step": 265900 + }, + { + "epoch": 1.0121190898502623, + "grad_norm": 0.11727624386548996, + "learning_rate": 0.00034307645173524783, + "loss": 2.073, + "step": 265910 + }, + { + "epoch": 1.012157152318385, + "grad_norm": 0.23917624354362488, + "learning_rate": 0.0003429437468081367, + "loss": 2.0695, + "step": 265920 + }, + { + "epoch": 1.0121952147865076, + "grad_norm": 0.11985159665346146, + "learning_rate": 0.00034281115391563777, + "loss": 2.0403, + "step": 265930 + }, + { + "epoch": 1.0122332772546303, + "grad_norm": 0.11827631294727325, + "learning_rate": 0.0003426786727744773, + "loss": 2.0505, + "step": 265940 + }, + { + "epoch": 1.012271339722753, + "grad_norm": 0.1171039417386055, + "learning_rate": 0.0003425463031025735, + "loss": 2.0589, + "step": 265950 + }, + { + "epoch": 1.0123094021908756, + "grad_norm": 0.12298347800970078, + "learning_rate": 0.00034241404461902915, + "loss": 2.0669, + "step": 265960 + }, + { + "epoch": 1.0123474646589983, + "grad_norm": 0.12044963985681534, + "learning_rate": 0.00034228189704412496, + "loss": 2.0533, + "step": 265970 + }, + { + "epoch": 1.012385527127121, + "grad_norm": 0.12134046852588654, + "learning_rate": 0.0003421498600993124, + "loss": 2.0674, + "step": 265980 + }, + { + "epoch": 1.0124235895952438, + "grad_norm": 0.11874759942293167, + "learning_rate": 0.00034201793350720677, + "loss": 2.0597, + "step": 265990 + }, + { + "epoch": 1.0124616520633665, + "grad_norm": 0.1224139928817749, + "learning_rate": 0.000341886116991581, + "loss": 2.0592, + "step": 266000 + }, + { + "epoch": 1.0124997145314891, + "grad_norm": 0.1219983920454979, + "learning_rate": 0.0003417544102773582, + "loss": 2.0589, + "step": 266010 + }, + { + "epoch": 1.0125377769996118, + "grad_norm": 0.14656014740467072, + "learning_rate": 0.00034162281309060535, + "loss": 2.0629, + "step": 266020 + }, + { + "epoch": 1.0125758394677344, + "grad_norm": 0.13034473359584808, + "learning_rate": 0.00034149132515852643, + "loss": 2.0674, + "step": 266030 + }, + { + "epoch": 1.012613901935857, + "grad_norm": 0.12056338042020798, + "learning_rate": 0.0003413599462094561, + "loss": 2.0456, + "step": 266040 + }, + { + "epoch": 1.0126519644039798, + "grad_norm": 0.12062523514032364, + "learning_rate": 0.0003412286759728529, + "loss": 2.0658, + "step": 266050 + }, + { + "epoch": 1.0126900268721024, + "grad_norm": 0.11885876953601837, + "learning_rate": 0.00034109751417929295, + "loss": 2.0566, + "step": 266060 + }, + { + "epoch": 1.012728089340225, + "grad_norm": 0.12024195492267609, + "learning_rate": 0.0003409664605604633, + "loss": 2.0683, + "step": 266070 + }, + { + "epoch": 1.012766151808348, + "grad_norm": 0.12108021229505539, + "learning_rate": 0.0003408355148491557, + "loss": 2.0484, + "step": 266080 + }, + { + "epoch": 1.0128042142764706, + "grad_norm": 0.11807679384946823, + "learning_rate": 0.00034070467677926007, + "loss": 2.0572, + "step": 266090 + }, + { + "epoch": 1.0128422767445933, + "grad_norm": 0.11726764589548111, + "learning_rate": 0.00034057394608575845, + "loss": 2.0616, + "step": 266100 + }, + { + "epoch": 1.012880339212716, + "grad_norm": 0.12831568717956543, + "learning_rate": 0.0003404433225047183, + "loss": 2.0764, + "step": 266110 + }, + { + "epoch": 1.0129184016808386, + "grad_norm": 0.12672363221645355, + "learning_rate": 0.00034031280577328686, + "loss": 2.051, + "step": 266120 + }, + { + "epoch": 1.0129564641489612, + "grad_norm": 0.15040802955627441, + "learning_rate": 0.0003401823956296846, + "loss": 2.0577, + "step": 266130 + }, + { + "epoch": 1.012994526617084, + "grad_norm": 0.1106763556599617, + "learning_rate": 0.00034005209181319895, + "loss": 2.0587, + "step": 266140 + }, + { + "epoch": 1.0130325890852065, + "grad_norm": 0.12348024547100067, + "learning_rate": 0.0003399218940641788, + "loss": 2.0643, + "step": 266150 + }, + { + "epoch": 1.0130706515533294, + "grad_norm": 0.14651146531105042, + "learning_rate": 0.00033979180212402783, + "loss": 2.0612, + "step": 266160 + }, + { + "epoch": 1.013108714021452, + "grad_norm": 0.11821554601192474, + "learning_rate": 0.00033966181573519884, + "loss": 2.0595, + "step": 266170 + }, + { + "epoch": 1.0131467764895747, + "grad_norm": 0.12168850004673004, + "learning_rate": 0.0003395319346411879, + "loss": 2.0767, + "step": 266180 + }, + { + "epoch": 1.0131848389576974, + "grad_norm": 0.1250876486301422, + "learning_rate": 0.0003394021585865281, + "loss": 2.0556, + "step": 266190 + }, + { + "epoch": 1.01322290142582, + "grad_norm": 0.12602892518043518, + "learning_rate": 0.00033927248731678404, + "loss": 2.0792, + "step": 266200 + }, + { + "epoch": 1.0132609638939427, + "grad_norm": 0.11486831307411194, + "learning_rate": 0.0003391429205785459, + "loss": 2.0466, + "step": 266210 + }, + { + "epoch": 1.0132990263620654, + "grad_norm": 0.11743859201669693, + "learning_rate": 0.0003390134581194233, + "loss": 2.0593, + "step": 266220 + }, + { + "epoch": 1.013337088830188, + "grad_norm": 0.12538020312786102, + "learning_rate": 0.00033888409968804034, + "loss": 2.0717, + "step": 266230 + }, + { + "epoch": 1.0133751512983107, + "grad_norm": 0.12342677265405655, + "learning_rate": 0.000338754845034029, + "loss": 2.0695, + "step": 266240 + }, + { + "epoch": 1.0134132137664336, + "grad_norm": 0.12480781972408295, + "learning_rate": 0.00033862569390802436, + "loss": 2.0609, + "step": 266250 + }, + { + "epoch": 1.0134512762345562, + "grad_norm": 0.1464034616947174, + "learning_rate": 0.0003384966460616581, + "loss": 2.0619, + "step": 266260 + }, + { + "epoch": 1.0134893387026789, + "grad_norm": 0.12034554779529572, + "learning_rate": 0.0003383677012475539, + "loss": 2.0665, + "step": 266270 + }, + { + "epoch": 1.0135274011708015, + "grad_norm": 0.12253714352846146, + "learning_rate": 0.00033823885921932095, + "loss": 2.0644, + "step": 266280 + }, + { + "epoch": 1.0135654636389242, + "grad_norm": 0.14051516354084015, + "learning_rate": 0.0003381101197315492, + "loss": 2.0513, + "step": 266290 + }, + { + "epoch": 1.0136035261070468, + "grad_norm": 0.12799084186553955, + "learning_rate": 0.0003379814825398035, + "loss": 2.0541, + "step": 266300 + }, + { + "epoch": 1.0136415885751695, + "grad_norm": 0.12607944011688232, + "learning_rate": 0.0003378529474006183, + "loss": 2.057, + "step": 266310 + }, + { + "epoch": 1.0136796510432922, + "grad_norm": 0.12739719450473785, + "learning_rate": 0.0003377245140714922, + "loss": 2.046, + "step": 266320 + }, + { + "epoch": 1.013717713511415, + "grad_norm": 0.13314710557460785, + "learning_rate": 0.0003375961823108829, + "loss": 2.0547, + "step": 266330 + }, + { + "epoch": 1.0137557759795377, + "grad_norm": 0.13084451854228973, + "learning_rate": 0.00033746795187820136, + "loss": 2.0629, + "step": 266340 + }, + { + "epoch": 1.0137938384476604, + "grad_norm": 0.12994296848773956, + "learning_rate": 0.00033733982253380726, + "loss": 2.0678, + "step": 266350 + }, + { + "epoch": 1.013831900915783, + "grad_norm": 0.12707634270191193, + "learning_rate": 0.0003372117940390029, + "loss": 2.0555, + "step": 266360 + }, + { + "epoch": 1.0138699633839057, + "grad_norm": 0.1259993463754654, + "learning_rate": 0.000337083866156029, + "loss": 2.0404, + "step": 266370 + }, + { + "epoch": 1.0139080258520283, + "grad_norm": 0.15101008117198944, + "learning_rate": 0.0003369560386480587, + "loss": 2.0687, + "step": 266380 + }, + { + "epoch": 1.013946088320151, + "grad_norm": 0.12252911180257797, + "learning_rate": 0.0003368283112791928, + "loss": 2.0619, + "step": 266390 + }, + { + "epoch": 1.0139841507882736, + "grad_norm": 0.11849822103977203, + "learning_rate": 0.00033670068381445477, + "loss": 2.0519, + "step": 266400 + }, + { + "epoch": 1.0140222132563963, + "grad_norm": 0.11857569962739944, + "learning_rate": 0.0003365731560197857, + "loss": 2.0449, + "step": 266410 + }, + { + "epoch": 1.0140602757245192, + "grad_norm": 0.12244977056980133, + "learning_rate": 0.00033644572766203875, + "loss": 2.0471, + "step": 266420 + }, + { + "epoch": 1.0140983381926418, + "grad_norm": 0.128976970911026, + "learning_rate": 0.00033631839850897517, + "loss": 2.0452, + "step": 266430 + }, + { + "epoch": 1.0141364006607645, + "grad_norm": 0.11401844769716263, + "learning_rate": 0.00033619116832925847, + "loss": 2.0556, + "step": 266440 + }, + { + "epoch": 1.0141744631288871, + "grad_norm": 0.12518161535263062, + "learning_rate": 0.00033606403689245005, + "loss": 2.061, + "step": 266450 + }, + { + "epoch": 1.0142125255970098, + "grad_norm": 0.12827928364276886, + "learning_rate": 0.00033593700396900386, + "loss": 2.0657, + "step": 266460 + }, + { + "epoch": 1.0142505880651325, + "grad_norm": 0.12486431747674942, + "learning_rate": 0.00033581006933026215, + "loss": 2.0553, + "step": 266470 + }, + { + "epoch": 1.0142886505332551, + "grad_norm": 0.13143974542617798, + "learning_rate": 0.00033568323274845016, + "loss": 2.0454, + "step": 266480 + }, + { + "epoch": 1.0143267130013778, + "grad_norm": 0.133738175034523, + "learning_rate": 0.0003355564939966718, + "loss": 2.0383, + "step": 266490 + }, + { + "epoch": 1.0143647754695004, + "grad_norm": 0.12220674008131027, + "learning_rate": 0.00033542985284890417, + "loss": 2.0568, + "step": 266500 + }, + { + "epoch": 1.0144028379376233, + "grad_norm": 0.12594294548034668, + "learning_rate": 0.000335303309079994, + "loss": 2.0652, + "step": 266510 + }, + { + "epoch": 1.014440900405746, + "grad_norm": 0.1356634497642517, + "learning_rate": 0.0003351768624656518, + "loss": 2.0414, + "step": 266520 + }, + { + "epoch": 1.0144789628738686, + "grad_norm": 0.11889128386974335, + "learning_rate": 0.000335050512782448, + "loss": 2.0487, + "step": 266530 + }, + { + "epoch": 1.0145170253419913, + "grad_norm": 0.13497433066368103, + "learning_rate": 0.0003349242598078082, + "loss": 2.0658, + "step": 266540 + }, + { + "epoch": 1.014555087810114, + "grad_norm": 0.11696867644786835, + "learning_rate": 0.0003347981033200083, + "loss": 2.0619, + "step": 266550 + }, + { + "epoch": 1.0145931502782366, + "grad_norm": 0.11523982882499695, + "learning_rate": 0.00033467204309817005, + "loss": 2.0731, + "step": 266560 + }, + { + "epoch": 1.0146312127463593, + "grad_norm": 0.1279909610748291, + "learning_rate": 0.00033454607892225704, + "loss": 2.0666, + "step": 266570 + }, + { + "epoch": 1.014669275214482, + "grad_norm": 0.11702031642198563, + "learning_rate": 0.0003344202105730695, + "loss": 2.0524, + "step": 266580 + }, + { + "epoch": 1.0147073376826048, + "grad_norm": 0.12712937593460083, + "learning_rate": 0.00033429443783224014, + "loss": 2.0589, + "step": 266590 + }, + { + "epoch": 1.0147454001507275, + "grad_norm": 0.12439191341400146, + "learning_rate": 0.00033416876048223, + "loss": 2.047, + "step": 266600 + }, + { + "epoch": 1.01478346261885, + "grad_norm": 0.1319584995508194, + "learning_rate": 0.0003340431783063237, + "loss": 2.0527, + "step": 266610 + }, + { + "epoch": 1.0148215250869728, + "grad_norm": 0.1275530457496643, + "learning_rate": 0.00033391769108862516, + "loss": 2.066, + "step": 266620 + }, + { + "epoch": 1.0148595875550954, + "grad_norm": 0.11523158103227615, + "learning_rate": 0.00033379229861405337, + "loss": 2.0584, + "step": 266630 + }, + { + "epoch": 1.014897650023218, + "grad_norm": 0.1223595067858696, + "learning_rate": 0.000333667000668338, + "loss": 2.0693, + "step": 266640 + }, + { + "epoch": 1.0149357124913407, + "grad_norm": 0.12558513879776, + "learning_rate": 0.0003335417970380152, + "loss": 2.0495, + "step": 266650 + }, + { + "epoch": 1.0149737749594634, + "grad_norm": 0.14692170917987823, + "learning_rate": 0.0003334166875104232, + "loss": 2.0554, + "step": 266660 + }, + { + "epoch": 1.015011837427586, + "grad_norm": 0.12919218838214874, + "learning_rate": 0.0003332916718736984, + "loss": 2.0621, + "step": 266670 + }, + { + "epoch": 1.015049899895709, + "grad_norm": 0.14028730988502502, + "learning_rate": 0.00033316674991677064, + "loss": 2.0557, + "step": 266680 + }, + { + "epoch": 1.0150879623638316, + "grad_norm": 0.1568068265914917, + "learning_rate": 0.00033304192142936, + "loss": 2.0638, + "step": 266690 + }, + { + "epoch": 1.0151260248319542, + "grad_norm": 0.13906286656856537, + "learning_rate": 0.00033291718620197155, + "loss": 2.0654, + "step": 266700 + }, + { + "epoch": 1.015164087300077, + "grad_norm": 0.1218264028429985, + "learning_rate": 0.00033279254402589177, + "loss": 2.0528, + "step": 266710 + }, + { + "epoch": 1.0152021497681996, + "grad_norm": 0.1341378092765808, + "learning_rate": 0.0003326679946931849, + "loss": 2.064, + "step": 266720 + }, + { + "epoch": 1.0152402122363222, + "grad_norm": 0.13618212938308716, + "learning_rate": 0.00033254353799668803, + "loss": 2.0526, + "step": 266730 + }, + { + "epoch": 1.0152782747044449, + "grad_norm": 0.1174771636724472, + "learning_rate": 0.00033241917373000773, + "loss": 2.068, + "step": 266740 + }, + { + "epoch": 1.0153163371725675, + "grad_norm": 0.12650097906589508, + "learning_rate": 0.00033229490168751574, + "loss": 2.0551, + "step": 266750 + }, + { + "epoch": 1.0153543996406904, + "grad_norm": 0.12127122282981873, + "learning_rate": 0.00033217072166434527, + "loss": 2.0677, + "step": 266760 + }, + { + "epoch": 1.015392462108813, + "grad_norm": 0.12797364592552185, + "learning_rate": 0.0003320466334563867, + "loss": 2.0677, + "step": 266770 + }, + { + "epoch": 1.0154305245769357, + "grad_norm": 0.12370993942022324, + "learning_rate": 0.0003319226368602839, + "loss": 2.0511, + "step": 266780 + }, + { + "epoch": 1.0154685870450584, + "grad_norm": 0.1199609562754631, + "learning_rate": 0.00033179873167343046, + "loss": 2.058, + "step": 266790 + }, + { + "epoch": 1.015506649513181, + "grad_norm": 0.12218517065048218, + "learning_rate": 0.0003316749176939654, + "loss": 2.0799, + "step": 266800 + }, + { + "epoch": 1.0155447119813037, + "grad_norm": 0.12803561985492706, + "learning_rate": 0.0003315511947207698, + "loss": 2.0366, + "step": 266810 + }, + { + "epoch": 1.0155827744494264, + "grad_norm": 0.1239086464047432, + "learning_rate": 0.00033142756255346293, + "loss": 2.0491, + "step": 266820 + }, + { + "epoch": 1.015620836917549, + "grad_norm": 0.11244252324104309, + "learning_rate": 0.00033130402099239785, + "loss": 2.0671, + "step": 266830 + }, + { + "epoch": 1.0156588993856717, + "grad_norm": 0.13067172467708588, + "learning_rate": 0.00033118056983865866, + "loss": 2.0574, + "step": 266840 + }, + { + "epoch": 1.0156969618537945, + "grad_norm": 0.1313253790140152, + "learning_rate": 0.00033105720889405586, + "loss": 2.0725, + "step": 266850 + }, + { + "epoch": 1.0157350243219172, + "grad_norm": 0.12428645044565201, + "learning_rate": 0.00033093393796112326, + "loss": 2.0476, + "step": 266860 + }, + { + "epoch": 1.0157730867900399, + "grad_norm": 0.1148483082652092, + "learning_rate": 0.0003308107568431137, + "loss": 2.056, + "step": 266870 + }, + { + "epoch": 1.0158111492581625, + "grad_norm": 0.11680980026721954, + "learning_rate": 0.00033068766534399607, + "loss": 2.0547, + "step": 266880 + }, + { + "epoch": 1.0158492117262852, + "grad_norm": 0.13104647397994995, + "learning_rate": 0.0003305646632684512, + "loss": 2.0578, + "step": 266890 + }, + { + "epoch": 1.0158872741944078, + "grad_norm": 0.1296451985836029, + "learning_rate": 0.0003304417504218683, + "loss": 2.0579, + "step": 266900 + }, + { + "epoch": 1.0159253366625305, + "grad_norm": 0.13351930677890778, + "learning_rate": 0.00033031892661034153, + "loss": 2.0502, + "step": 266910 + }, + { + "epoch": 1.0159633991306531, + "grad_norm": 0.12565377354621887, + "learning_rate": 0.0003301961916406663, + "loss": 2.0472, + "step": 266920 + }, + { + "epoch": 1.0160014615987758, + "grad_norm": 0.12520720064640045, + "learning_rate": 0.0003300735453203357, + "loss": 2.0581, + "step": 266930 + }, + { + "epoch": 1.0160395240668987, + "grad_norm": 0.11860103160142899, + "learning_rate": 0.0003299509874575372, + "loss": 2.077, + "step": 266940 + }, + { + "epoch": 1.0160775865350213, + "grad_norm": 0.11802423000335693, + "learning_rate": 0.0003298285178611489, + "loss": 2.0563, + "step": 266950 + }, + { + "epoch": 1.016115649003144, + "grad_norm": 0.1367066502571106, + "learning_rate": 0.000329706136340736, + "loss": 2.0517, + "step": 266960 + }, + { + "epoch": 1.0161537114712667, + "grad_norm": 0.7919990420341492, + "learning_rate": 0.0003295838427065477, + "loss": 2.0558, + "step": 266970 + }, + { + "epoch": 1.0161917739393893, + "grad_norm": 0.12654846906661987, + "learning_rate": 0.00032946163676951355, + "loss": 2.0564, + "step": 266980 + }, + { + "epoch": 1.016229836407512, + "grad_norm": 0.12668249011039734, + "learning_rate": 0.0003293395183412399, + "loss": 2.0684, + "step": 266990 + }, + { + "epoch": 1.0162678988756346, + "grad_norm": 0.12429069727659225, + "learning_rate": 0.0003292174872340067, + "loss": 2.0495, + "step": 267000 + }, + { + "epoch": 1.0163059613437573, + "grad_norm": 0.12904322147369385, + "learning_rate": 0.0003290955432607642, + "loss": 2.0571, + "step": 267010 + }, + { + "epoch": 1.0163440238118802, + "grad_norm": 0.11880702525377274, + "learning_rate": 0.0003289736862351293, + "loss": 2.0493, + "step": 267020 + }, + { + "epoch": 1.0163820862800028, + "grad_norm": 0.12380634993314743, + "learning_rate": 0.00032885191597138263, + "loss": 2.0683, + "step": 267030 + }, + { + "epoch": 1.0164201487481255, + "grad_norm": 0.1211773157119751, + "learning_rate": 0.00032873023228446494, + "loss": 2.0607, + "step": 267040 + }, + { + "epoch": 1.0164582112162481, + "grad_norm": 0.12402321398258209, + "learning_rate": 0.0003286086349899739, + "loss": 2.054, + "step": 267050 + }, + { + "epoch": 1.0164962736843708, + "grad_norm": 0.13763384521007538, + "learning_rate": 0.0003284871239041609, + "loss": 2.0655, + "step": 267060 + }, + { + "epoch": 1.0165343361524934, + "grad_norm": 0.12396842241287231, + "learning_rate": 0.00032836569884392766, + "loss": 2.0471, + "step": 267070 + }, + { + "epoch": 1.016572398620616, + "grad_norm": 0.1238761842250824, + "learning_rate": 0.0003282443596268233, + "loss": 2.0527, + "step": 267080 + }, + { + "epoch": 1.0166104610887388, + "grad_norm": 0.13287590444087982, + "learning_rate": 0.0003281231060710409, + "loss": 2.042, + "step": 267090 + }, + { + "epoch": 1.0166485235568614, + "grad_norm": 0.12432575225830078, + "learning_rate": 0.0003280019379954142, + "loss": 2.0563, + "step": 267100 + }, + { + "epoch": 1.0166865860249843, + "grad_norm": 0.12866751849651337, + "learning_rate": 0.00032788085521941494, + "loss": 2.0383, + "step": 267110 + }, + { + "epoch": 1.016724648493107, + "grad_norm": 0.1300543248653412, + "learning_rate": 0.0003277598575631492, + "loss": 2.0679, + "step": 267120 + }, + { + "epoch": 1.0167627109612296, + "grad_norm": 0.12244408577680588, + "learning_rate": 0.00032763894484735443, + "loss": 2.0551, + "step": 267130 + }, + { + "epoch": 1.0168007734293523, + "grad_norm": 0.14718212187290192, + "learning_rate": 0.0003275181168933966, + "loss": 2.0612, + "step": 267140 + }, + { + "epoch": 1.016838835897475, + "grad_norm": 0.1293575018644333, + "learning_rate": 0.00032739737352326684, + "loss": 2.0561, + "step": 267150 + }, + { + "epoch": 1.0168768983655976, + "grad_norm": 0.11455338448286057, + "learning_rate": 0.0003272767145595785, + "loss": 2.0612, + "step": 267160 + }, + { + "epoch": 1.0169149608337202, + "grad_norm": 0.13592086732387543, + "learning_rate": 0.000327156139825564, + "loss": 2.0513, + "step": 267170 + }, + { + "epoch": 1.016953023301843, + "grad_norm": 0.12359960377216339, + "learning_rate": 0.0003270356491450719, + "loss": 2.0394, + "step": 267180 + }, + { + "epoch": 1.0169910857699658, + "grad_norm": 0.1179828867316246, + "learning_rate": 0.0003269152423425641, + "loss": 2.0547, + "step": 267190 + }, + { + "epoch": 1.0170291482380884, + "grad_norm": 0.12695840001106262, + "learning_rate": 0.0003267949192431123, + "loss": 2.0474, + "step": 267200 + }, + { + "epoch": 1.017067210706211, + "grad_norm": 0.12281975150108337, + "learning_rate": 0.0003266746796723956, + "loss": 2.0541, + "step": 267210 + }, + { + "epoch": 1.0171052731743337, + "grad_norm": 0.1289357990026474, + "learning_rate": 0.00032655452345669737, + "loss": 2.0818, + "step": 267220 + }, + { + "epoch": 1.0171433356424564, + "grad_norm": 0.12075571715831757, + "learning_rate": 0.0003264344504229022, + "loss": 2.0671, + "step": 267230 + }, + { + "epoch": 1.017181398110579, + "grad_norm": 0.12263869494199753, + "learning_rate": 0.000326314460398493, + "loss": 2.0484, + "step": 267240 + }, + { + "epoch": 1.0172194605787017, + "grad_norm": 0.13677935302257538, + "learning_rate": 0.0003261945532115482, + "loss": 2.0737, + "step": 267250 + }, + { + "epoch": 1.0172575230468244, + "grad_norm": 0.13722999393939972, + "learning_rate": 0.0003260747286907392, + "loss": 2.0591, + "step": 267260 + }, + { + "epoch": 1.017295585514947, + "grad_norm": 0.12648960947990417, + "learning_rate": 0.0003259549866653265, + "loss": 2.0347, + "step": 267270 + }, + { + "epoch": 1.01733364798307, + "grad_norm": 0.12108505517244339, + "learning_rate": 0.00032583532696515825, + "loss": 2.0592, + "step": 267280 + }, + { + "epoch": 1.0173717104511926, + "grad_norm": 0.13847528398036957, + "learning_rate": 0.0003257157494206663, + "loss": 2.0518, + "step": 267290 + }, + { + "epoch": 1.0174097729193152, + "grad_norm": 0.13215801119804382, + "learning_rate": 0.0003255962538628637, + "loss": 2.0478, + "step": 267300 + }, + { + "epoch": 1.0174478353874379, + "grad_norm": 0.12544967234134674, + "learning_rate": 0.00032547684012334253, + "loss": 2.0576, + "step": 267310 + }, + { + "epoch": 1.0174858978555605, + "grad_norm": 0.1270923614501953, + "learning_rate": 0.0003253575080342702, + "loss": 2.0461, + "step": 267320 + }, + { + "epoch": 1.0175239603236832, + "grad_norm": 0.13393916189670563, + "learning_rate": 0.0003252382574283872, + "loss": 2.0528, + "step": 267330 + }, + { + "epoch": 1.0175620227918059, + "grad_norm": 0.13391059637069702, + "learning_rate": 0.00032511908813900436, + "loss": 2.0443, + "step": 267340 + }, + { + "epoch": 1.0176000852599285, + "grad_norm": 0.11034025996923447, + "learning_rate": 0.00032500000000000004, + "loss": 2.0617, + "step": 267350 + }, + { + "epoch": 1.0176381477280514, + "grad_norm": 0.13497096300125122, + "learning_rate": 0.0003248809928458174, + "loss": 2.0606, + "step": 267360 + }, + { + "epoch": 1.017676210196174, + "grad_norm": 0.12526626884937286, + "learning_rate": 0.0003247620665114618, + "loss": 2.0575, + "step": 267370 + }, + { + "epoch": 1.0177142726642967, + "grad_norm": 0.13385437428951263, + "learning_rate": 0.0003246432208324982, + "loss": 2.0567, + "step": 267380 + }, + { + "epoch": 1.0177523351324194, + "grad_norm": 0.1284913271665573, + "learning_rate": 0.0003245244556450482, + "loss": 2.054, + "step": 267390 + }, + { + "epoch": 1.017790397600542, + "grad_norm": 0.11587168276309967, + "learning_rate": 0.0003244057707857877, + "loss": 2.0541, + "step": 267400 + }, + { + "epoch": 1.0178284600686647, + "grad_norm": 0.11677401512861252, + "learning_rate": 0.0003242871660919442, + "loss": 2.0539, + "step": 267410 + }, + { + "epoch": 1.0178665225367873, + "grad_norm": 0.12091653048992157, + "learning_rate": 0.0003241686414012942, + "loss": 2.0556, + "step": 267420 + }, + { + "epoch": 1.01790458500491, + "grad_norm": 0.11467602103948593, + "learning_rate": 0.00032405019655216055, + "loss": 2.0558, + "step": 267430 + }, + { + "epoch": 1.0179426474730326, + "grad_norm": 0.13561949133872986, + "learning_rate": 0.00032393183138340994, + "loss": 2.0645, + "step": 267440 + }, + { + "epoch": 1.0179807099411555, + "grad_norm": 0.14254984259605408, + "learning_rate": 0.0003238135457344502, + "loss": 2.0544, + "step": 267450 + }, + { + "epoch": 1.0180187724092782, + "grad_norm": 0.13436760008335114, + "learning_rate": 0.0003236953394452281, + "loss": 2.0375, + "step": 267460 + }, + { + "epoch": 1.0180568348774008, + "grad_norm": 0.136689230799675, + "learning_rate": 0.0003235772123562265, + "loss": 2.0658, + "step": 267470 + }, + { + "epoch": 1.0180948973455235, + "grad_norm": 0.13175417482852936, + "learning_rate": 0.0003234591643084619, + "loss": 2.0571, + "step": 267480 + }, + { + "epoch": 1.0181329598136462, + "grad_norm": 0.13128870725631714, + "learning_rate": 0.00032334119514348193, + "loss": 2.0514, + "step": 267490 + }, + { + "epoch": 1.0181710222817688, + "grad_norm": 0.12385053932666779, + "learning_rate": 0.00032322330470336316, + "loss": 2.0423, + "step": 267500 + }, + { + "epoch": 1.0182090847498915, + "grad_norm": 0.13810376822948456, + "learning_rate": 0.0003231054928307081, + "loss": 2.0527, + "step": 267510 + }, + { + "epoch": 1.0182471472180141, + "grad_norm": 0.18634332716464996, + "learning_rate": 0.0003229877593686433, + "loss": 2.0546, + "step": 267520 + }, + { + "epoch": 1.0182852096861368, + "grad_norm": 0.1266355961561203, + "learning_rate": 0.00032287010416081653, + "loss": 2.0672, + "step": 267530 + }, + { + "epoch": 1.0183232721542597, + "grad_norm": 0.13006629049777985, + "learning_rate": 0.00032275252705139443, + "loss": 2.0619, + "step": 267540 + }, + { + "epoch": 1.0183613346223823, + "grad_norm": 0.13841281831264496, + "learning_rate": 0.00032263502788506035, + "loss": 2.0476, + "step": 267550 + }, + { + "epoch": 1.018399397090505, + "grad_norm": 0.13692769408226013, + "learning_rate": 0.0003225176065070116, + "loss": 2.0531, + "step": 267560 + }, + { + "epoch": 1.0184374595586276, + "grad_norm": 0.13610826432704926, + "learning_rate": 0.00032240026276295715, + "loss": 2.0712, + "step": 267570 + }, + { + "epoch": 1.0184755220267503, + "grad_norm": 0.12330146878957748, + "learning_rate": 0.00032228299649911566, + "loss": 2.0588, + "step": 267580 + }, + { + "epoch": 1.018513584494873, + "grad_norm": 0.12359149754047394, + "learning_rate": 0.0003221658075622126, + "loss": 2.0514, + "step": 267590 + }, + { + "epoch": 1.0185516469629956, + "grad_norm": 0.12202282249927521, + "learning_rate": 0.00032204869579947816, + "loss": 2.0551, + "step": 267600 + }, + { + "epoch": 1.0185897094311183, + "grad_norm": 0.12366171926259995, + "learning_rate": 0.0003219316610586449, + "loss": 2.0434, + "step": 267610 + }, + { + "epoch": 1.0186277718992411, + "grad_norm": 0.13653773069381714, + "learning_rate": 0.0003218147031879454, + "loss": 2.0518, + "step": 267620 + }, + { + "epoch": 1.0186658343673638, + "grad_norm": 0.12109905481338501, + "learning_rate": 0.0003216978220361101, + "loss": 2.0403, + "step": 267630 + }, + { + "epoch": 1.0187038968354865, + "grad_norm": 0.13771206140518188, + "learning_rate": 0.0003215810174523649, + "loss": 2.056, + "step": 267640 + }, + { + "epoch": 1.0187419593036091, + "grad_norm": 0.119480662047863, + "learning_rate": 0.0003214642892864288, + "loss": 2.0594, + "step": 267650 + }, + { + "epoch": 1.0187800217717318, + "grad_norm": 0.11807338893413544, + "learning_rate": 0.0003213476373885118, + "loss": 2.0479, + "step": 267660 + }, + { + "epoch": 1.0188180842398544, + "grad_norm": 0.1199382096529007, + "learning_rate": 0.00032123106160931276, + "loss": 2.0501, + "step": 267670 + }, + { + "epoch": 1.018856146707977, + "grad_norm": 0.12022703140974045, + "learning_rate": 0.0003211145618000169, + "loss": 2.0536, + "step": 267680 + }, + { + "epoch": 1.0188942091760997, + "grad_norm": 0.12680114805698395, + "learning_rate": 0.0003209981378122935, + "loss": 2.0466, + "step": 267690 + }, + { + "epoch": 1.0189322716442224, + "grad_norm": 0.12346377223730087, + "learning_rate": 0.0003208817894982944, + "loss": 2.0414, + "step": 267700 + }, + { + "epoch": 1.0189703341123453, + "grad_norm": 0.12502339482307434, + "learning_rate": 0.0003207655167106508, + "loss": 2.0502, + "step": 267710 + }, + { + "epoch": 1.019008396580468, + "grad_norm": 0.12736190855503082, + "learning_rate": 0.00032064931930247194, + "loss": 2.0404, + "step": 267720 + }, + { + "epoch": 1.0190464590485906, + "grad_norm": 0.11968687921762466, + "learning_rate": 0.0003205331971273424, + "loss": 2.055, + "step": 267730 + }, + { + "epoch": 1.0190845215167132, + "grad_norm": 0.11527033895254135, + "learning_rate": 0.00032041715003932025, + "loss": 2.0296, + "step": 267740 + }, + { + "epoch": 1.019122583984836, + "grad_norm": 0.128132626414299, + "learning_rate": 0.0003203011778929348, + "loss": 2.0428, + "step": 267750 + }, + { + "epoch": 1.0191606464529586, + "grad_norm": 0.1341097503900528, + "learning_rate": 0.0003201852805431843, + "loss": 2.0479, + "step": 267760 + }, + { + "epoch": 1.0191987089210812, + "grad_norm": 0.14120911061763763, + "learning_rate": 0.0003200694578455342, + "loss": 2.056, + "step": 267770 + }, + { + "epoch": 1.0192367713892039, + "grad_norm": 0.12796425819396973, + "learning_rate": 0.0003199537096559147, + "loss": 2.0521, + "step": 267780 + }, + { + "epoch": 1.0192748338573265, + "grad_norm": 0.12128780037164688, + "learning_rate": 0.0003198380358307189, + "loss": 2.0576, + "step": 267790 + }, + { + "epoch": 1.0193128963254494, + "grad_norm": 0.12644854187965393, + "learning_rate": 0.00031972243622680054, + "loss": 2.063, + "step": 267800 + }, + { + "epoch": 1.019350958793572, + "grad_norm": 0.13088761270046234, + "learning_rate": 0.0003196069107014721, + "loss": 2.0624, + "step": 267810 + }, + { + "epoch": 1.0193890212616947, + "grad_norm": 0.13620150089263916, + "learning_rate": 0.00031949145911250255, + "loss": 2.0609, + "step": 267820 + }, + { + "epoch": 1.0194270837298174, + "grad_norm": 0.11924911290407181, + "learning_rate": 0.00031937608131811556, + "loss": 2.0535, + "step": 267830 + }, + { + "epoch": 1.01946514619794, + "grad_norm": 0.13811926543712616, + "learning_rate": 0.0003192607771769872, + "loss": 2.0487, + "step": 267840 + }, + { + "epoch": 1.0195032086660627, + "grad_norm": 0.11828891932964325, + "learning_rate": 0.0003191455465482441, + "loss": 2.0518, + "step": 267850 + }, + { + "epoch": 1.0195412711341854, + "grad_norm": 0.11766720563173294, + "learning_rate": 0.00031903038929146147, + "loss": 2.0627, + "step": 267860 + }, + { + "epoch": 1.019579333602308, + "grad_norm": 0.12382876127958298, + "learning_rate": 0.0003189153052666608, + "loss": 2.0566, + "step": 267870 + }, + { + "epoch": 1.019617396070431, + "grad_norm": 0.1257692575454712, + "learning_rate": 0.0003188002943343082, + "loss": 2.0413, + "step": 267880 + }, + { + "epoch": 1.0196554585385535, + "grad_norm": 0.12685583531856537, + "learning_rate": 0.00031868535635531253, + "loss": 2.0734, + "step": 267890 + }, + { + "epoch": 1.0196935210066762, + "grad_norm": 0.12163806706666946, + "learning_rate": 0.000318570491191023, + "loss": 2.0426, + "step": 267900 + }, + { + "epoch": 1.0197315834747989, + "grad_norm": 0.1382589042186737, + "learning_rate": 0.0003184556987032274, + "loss": 2.0553, + "step": 267910 + }, + { + "epoch": 1.0197696459429215, + "grad_norm": 0.13633142411708832, + "learning_rate": 0.00031834097875415044, + "loss": 2.0386, + "step": 267920 + }, + { + "epoch": 1.0198077084110442, + "grad_norm": 0.12466693669557571, + "learning_rate": 0.0003182263312064515, + "loss": 2.0495, + "step": 267930 + }, + { + "epoch": 1.0198457708791668, + "grad_norm": 0.13681867718696594, + "learning_rate": 0.0003181117559232226, + "loss": 2.0497, + "step": 267940 + }, + { + "epoch": 1.0198838333472895, + "grad_norm": 0.1384352594614029, + "learning_rate": 0.0003179972527679871, + "loss": 2.0599, + "step": 267950 + }, + { + "epoch": 1.0199218958154121, + "grad_norm": 0.11396428197622299, + "learning_rate": 0.0003178828216046969, + "loss": 2.0436, + "step": 267960 + }, + { + "epoch": 1.019959958283535, + "grad_norm": 0.13938863575458527, + "learning_rate": 0.0003177684622977314, + "loss": 2.0478, + "step": 267970 + }, + { + "epoch": 1.0199980207516577, + "grad_norm": 0.12432560324668884, + "learning_rate": 0.0003176541747118953, + "loss": 2.0578, + "step": 267980 + }, + { + "epoch": 1.0200360832197803, + "grad_norm": 0.12672552466392517, + "learning_rate": 0.0003175399587124165, + "loss": 2.0483, + "step": 267990 + }, + { + "epoch": 1.020074145687903, + "grad_norm": 0.1234738752245903, + "learning_rate": 0.00031742581416494464, + "loss": 2.0588, + "step": 268000 + }, + { + "epoch": 1.0201122081560257, + "grad_norm": 0.12561601400375366, + "learning_rate": 0.00031731174093554894, + "loss": 2.0552, + "step": 268010 + }, + { + "epoch": 1.0201502706241483, + "grad_norm": 0.13383440673351288, + "learning_rate": 0.00031719773889071653, + "loss": 2.0625, + "step": 268020 + }, + { + "epoch": 1.020188333092271, + "grad_norm": 0.12872928380966187, + "learning_rate": 0.0003170838078973506, + "loss": 2.0599, + "step": 268030 + }, + { + "epoch": 1.0202263955603936, + "grad_norm": 0.17568981647491455, + "learning_rate": 0.00031696994782276874, + "loss": 2.0451, + "step": 268040 + }, + { + "epoch": 1.0202644580285165, + "grad_norm": 0.13012796640396118, + "learning_rate": 0.00031685615853470074, + "loss": 2.0433, + "step": 268050 + }, + { + "epoch": 1.0203025204966392, + "grad_norm": 0.13532185554504395, + "learning_rate": 0.00031674243990128723, + "loss": 2.0476, + "step": 268060 + }, + { + "epoch": 1.0203405829647618, + "grad_norm": 0.1131984293460846, + "learning_rate": 0.00031662879179107753, + "loss": 2.0518, + "step": 268070 + }, + { + "epoch": 1.0203786454328845, + "grad_norm": 0.11749204248189926, + "learning_rate": 0.0003165152140730282, + "loss": 2.0495, + "step": 268080 + }, + { + "epoch": 1.0204167079010071, + "grad_norm": 0.12241828441619873, + "learning_rate": 0.00031640170661650116, + "loss": 2.0469, + "step": 268090 + }, + { + "epoch": 1.0204547703691298, + "grad_norm": 0.14096224308013916, + "learning_rate": 0.0003162882692912616, + "loss": 2.0591, + "step": 268100 + }, + { + "epoch": 1.0204928328372525, + "grad_norm": 0.12495024502277374, + "learning_rate": 0.0003161749019674771, + "loss": 2.0499, + "step": 268110 + }, + { + "epoch": 1.020530895305375, + "grad_norm": 0.12601830065250397, + "learning_rate": 0.00031606160451571474, + "loss": 2.0624, + "step": 268120 + }, + { + "epoch": 1.0205689577734978, + "grad_norm": 0.12339954823255539, + "learning_rate": 0.0003159483768069404, + "loss": 2.0632, + "step": 268130 + }, + { + "epoch": 1.0206070202416206, + "grad_norm": 0.13249126076698303, + "learning_rate": 0.0003158352187125164, + "loss": 2.0613, + "step": 268140 + }, + { + "epoch": 1.0206450827097433, + "grad_norm": 0.14813245832920074, + "learning_rate": 0.00031572213010420014, + "loss": 2.0554, + "step": 268150 + }, + { + "epoch": 1.020683145177866, + "grad_norm": 0.12110047787427902, + "learning_rate": 0.0003156091108541422, + "loss": 2.0641, + "step": 268160 + }, + { + "epoch": 1.0207212076459886, + "grad_norm": 0.12983626127243042, + "learning_rate": 0.00031549616083488487, + "loss": 2.0557, + "step": 268170 + }, + { + "epoch": 1.0207592701141113, + "grad_norm": 0.12175390124320984, + "learning_rate": 0.0003153832799193601, + "loss": 2.0587, + "step": 268180 + }, + { + "epoch": 1.020797332582234, + "grad_norm": 0.12431416660547256, + "learning_rate": 0.00031527046798088834, + "loss": 2.0439, + "step": 268190 + }, + { + "epoch": 1.0208353950503566, + "grad_norm": 0.12754175066947937, + "learning_rate": 0.0003151577248931764, + "loss": 2.0825, + "step": 268200 + }, + { + "epoch": 1.0208734575184792, + "grad_norm": 0.12287028878927231, + "learning_rate": 0.00031504505053031606, + "loss": 2.0501, + "step": 268210 + }, + { + "epoch": 1.0209115199866021, + "grad_norm": 0.14124400913715363, + "learning_rate": 0.0003149324447667825, + "loss": 2.0659, + "step": 268220 + }, + { + "epoch": 1.0209495824547248, + "grad_norm": 0.1486729383468628, + "learning_rate": 0.0003148199074774325, + "loss": 2.06, + "step": 268230 + }, + { + "epoch": 1.0209876449228474, + "grad_norm": 0.12180312722921371, + "learning_rate": 0.0003147074385375027, + "loss": 2.0547, + "step": 268240 + }, + { + "epoch": 1.02102570739097, + "grad_norm": 0.12472087144851685, + "learning_rate": 0.00031459503782260847, + "loss": 2.06, + "step": 268250 + }, + { + "epoch": 1.0210637698590928, + "grad_norm": 0.12739062309265137, + "learning_rate": 0.0003144827052087416, + "loss": 2.0566, + "step": 268260 + }, + { + "epoch": 1.0211018323272154, + "grad_norm": 0.1196855679154396, + "learning_rate": 0.0003143704405722695, + "loss": 2.0511, + "step": 268270 + }, + { + "epoch": 1.021139894795338, + "grad_norm": 0.13298091292381287, + "learning_rate": 0.0003142582437899329, + "loss": 2.049, + "step": 268280 + }, + { + "epoch": 1.0211779572634607, + "grad_norm": 0.11433979868888855, + "learning_rate": 0.00031414611473884474, + "loss": 2.0521, + "step": 268290 + }, + { + "epoch": 1.0212160197315834, + "grad_norm": 0.15914879739284515, + "learning_rate": 0.00031403405329648835, + "loss": 2.0452, + "step": 268300 + }, + { + "epoch": 1.0212540821997063, + "grad_norm": 0.13394173979759216, + "learning_rate": 0.00031392205934071605, + "loss": 2.0617, + "step": 268310 + }, + { + "epoch": 1.021292144667829, + "grad_norm": 0.1315068155527115, + "learning_rate": 0.0003138101327497475, + "loss": 2.0559, + "step": 268320 + }, + { + "epoch": 1.0213302071359516, + "grad_norm": 0.1253783404827118, + "learning_rate": 0.00031369827340216804, + "loss": 2.0615, + "step": 268330 + }, + { + "epoch": 1.0213682696040742, + "grad_norm": 0.12846381962299347, + "learning_rate": 0.0003135864811769275, + "loss": 2.0415, + "step": 268340 + }, + { + "epoch": 1.0214063320721969, + "grad_norm": 0.13440275192260742, + "learning_rate": 0.0003134747559533384, + "loss": 2.0616, + "step": 268350 + }, + { + "epoch": 1.0214443945403195, + "grad_norm": 0.12881481647491455, + "learning_rate": 0.0003133630976110744, + "loss": 2.0386, + "step": 268360 + }, + { + "epoch": 1.0214824570084422, + "grad_norm": 0.12021521478891373, + "learning_rate": 0.00031325150603016906, + "loss": 2.0555, + "step": 268370 + }, + { + "epoch": 1.0215205194765649, + "grad_norm": 0.11945555359125137, + "learning_rate": 0.0003131399810910138, + "loss": 2.0573, + "step": 268380 + }, + { + "epoch": 1.0215585819446875, + "grad_norm": 0.12708795070648193, + "learning_rate": 0.00031302852267435727, + "loss": 2.047, + "step": 268390 + }, + { + "epoch": 1.0215966444128104, + "grad_norm": 0.12525077164173126, + "learning_rate": 0.0003129171306613029, + "loss": 2.0467, + "step": 268400 + }, + { + "epoch": 1.021634706880933, + "grad_norm": 0.1318259835243225, + "learning_rate": 0.0003128058049333082, + "loss": 2.0442, + "step": 268410 + }, + { + "epoch": 1.0216727693490557, + "grad_norm": 0.1215236485004425, + "learning_rate": 0.00031269454537218266, + "loss": 2.0592, + "step": 268420 + }, + { + "epoch": 1.0217108318171784, + "grad_norm": 0.1245192289352417, + "learning_rate": 0.0003125833518600869, + "loss": 2.0556, + "step": 268430 + }, + { + "epoch": 1.021748894285301, + "grad_norm": 0.12638451159000397, + "learning_rate": 0.0003124722242795306, + "loss": 2.0435, + "step": 268440 + }, + { + "epoch": 1.0217869567534237, + "grad_norm": 0.12292831391096115, + "learning_rate": 0.00031236116251337167, + "loss": 2.0565, + "step": 268450 + }, + { + "epoch": 1.0218250192215463, + "grad_norm": 0.12036208808422089, + "learning_rate": 0.0003122501664448141, + "loss": 2.0577, + "step": 268460 + }, + { + "epoch": 1.021863081689669, + "grad_norm": 0.12013652175664902, + "learning_rate": 0.00031213923595740733, + "loss": 2.0472, + "step": 268470 + }, + { + "epoch": 1.0219011441577919, + "grad_norm": 0.13374583423137665, + "learning_rate": 0.00031202837093504424, + "loss": 2.0341, + "step": 268480 + }, + { + "epoch": 1.0219392066259145, + "grad_norm": 0.11965156346559525, + "learning_rate": 0.00031191757126195976, + "loss": 2.0475, + "step": 268490 + }, + { + "epoch": 1.0219772690940372, + "grad_norm": 0.12487906217575073, + "learning_rate": 0.00031180683682272974, + "loss": 2.0673, + "step": 268500 + }, + { + "epoch": 1.0220153315621598, + "grad_norm": 0.13901683688163757, + "learning_rate": 0.0003116961675022695, + "loss": 2.0512, + "step": 268510 + }, + { + "epoch": 1.0220533940302825, + "grad_norm": 0.13149023056030273, + "learning_rate": 0.0003115855631858323, + "loss": 2.0504, + "step": 268520 + }, + { + "epoch": 1.0220914564984052, + "grad_norm": 0.12233823537826538, + "learning_rate": 0.0003114750237590078, + "loss": 2.0701, + "step": 268530 + }, + { + "epoch": 1.0221295189665278, + "grad_norm": 0.11628257483243942, + "learning_rate": 0.00031136454910772116, + "loss": 2.0632, + "step": 268540 + }, + { + "epoch": 1.0221675814346505, + "grad_norm": 0.1403857320547104, + "learning_rate": 0.0003112541391182313, + "loss": 2.0637, + "step": 268550 + }, + { + "epoch": 1.0222056439027731, + "grad_norm": 0.11599292606115341, + "learning_rate": 0.00031114379367712944, + "loss": 2.0597, + "step": 268560 + }, + { + "epoch": 1.022243706370896, + "grad_norm": 0.1288028508424759, + "learning_rate": 0.0003110335126713381, + "loss": 2.0502, + "step": 268570 + }, + { + "epoch": 1.0222817688390187, + "grad_norm": 0.1404590755701065, + "learning_rate": 0.0003109232959881096, + "loss": 2.0545, + "step": 268580 + }, + { + "epoch": 1.0223198313071413, + "grad_norm": 0.12103798240423203, + "learning_rate": 0.00031081314351502463, + "loss": 2.0361, + "step": 268590 + }, + { + "epoch": 1.022357893775264, + "grad_norm": 0.12138662487268448, + "learning_rate": 0.0003107030551399909, + "loss": 2.0593, + "step": 268600 + }, + { + "epoch": 1.0223959562433866, + "grad_norm": 0.12390505522489548, + "learning_rate": 0.0003105930307512419, + "loss": 2.0392, + "step": 268610 + }, + { + "epoch": 1.0224340187115093, + "grad_norm": 0.12961971759796143, + "learning_rate": 0.0003104830702373356, + "loss": 2.0532, + "step": 268620 + }, + { + "epoch": 1.022472081179632, + "grad_norm": 0.1259758621454239, + "learning_rate": 0.0003103731734871531, + "loss": 2.0452, + "step": 268630 + }, + { + "epoch": 1.0225101436477546, + "grad_norm": 0.14060895144939423, + "learning_rate": 0.0003102633403898972, + "loss": 2.0581, + "step": 268640 + }, + { + "epoch": 1.0225482061158773, + "grad_norm": 0.1207043007016182, + "learning_rate": 0.0003101535708350914, + "loss": 2.0508, + "step": 268650 + }, + { + "epoch": 1.0225862685840001, + "grad_norm": 0.13988932967185974, + "learning_rate": 0.00031004386471257804, + "loss": 2.0532, + "step": 268660 + }, + { + "epoch": 1.0226243310521228, + "grad_norm": 0.14313188195228577, + "learning_rate": 0.00030993422191251787, + "loss": 2.0454, + "step": 268670 + }, + { + "epoch": 1.0226623935202455, + "grad_norm": 0.12709325551986694, + "learning_rate": 0.00030982464232538786, + "loss": 2.0561, + "step": 268680 + }, + { + "epoch": 1.0227004559883681, + "grad_norm": 0.13226522505283356, + "learning_rate": 0.00030971512584198046, + "loss": 2.057, + "step": 268690 + }, + { + "epoch": 1.0227385184564908, + "grad_norm": 0.14007730782032013, + "learning_rate": 0.0003096056723534023, + "loss": 2.0546, + "step": 268700 + }, + { + "epoch": 1.0227765809246134, + "grad_norm": 0.12856709957122803, + "learning_rate": 0.0003094962817510727, + "loss": 2.044, + "step": 268710 + }, + { + "epoch": 1.022814643392736, + "grad_norm": 0.13431373238563538, + "learning_rate": 0.00030938695392672264, + "loss": 2.0507, + "step": 268720 + }, + { + "epoch": 1.0228527058608587, + "grad_norm": 0.13312764465808868, + "learning_rate": 0.0003092776887723935, + "loss": 2.0521, + "step": 268730 + }, + { + "epoch": 1.0228907683289816, + "grad_norm": 0.1246262639760971, + "learning_rate": 0.0003091684861804354, + "loss": 2.0738, + "step": 268740 + }, + { + "epoch": 1.0229288307971043, + "grad_norm": 0.1405113935470581, + "learning_rate": 0.00030905934604350666, + "loss": 2.0422, + "step": 268750 + }, + { + "epoch": 1.022966893265227, + "grad_norm": 0.12192755937576294, + "learning_rate": 0.000308950268254572, + "loss": 2.0503, + "step": 268760 + }, + { + "epoch": 1.0230049557333496, + "grad_norm": 0.13036781549453735, + "learning_rate": 0.00030884125270690154, + "loss": 2.0533, + "step": 268770 + }, + { + "epoch": 1.0230430182014723, + "grad_norm": 0.16155710816383362, + "learning_rate": 0.00030873229929406966, + "loss": 2.0446, + "step": 268780 + }, + { + "epoch": 1.023081080669595, + "grad_norm": 0.14020535349845886, + "learning_rate": 0.00030862340790995365, + "loss": 2.0495, + "step": 268790 + }, + { + "epoch": 1.0231191431377176, + "grad_norm": 0.11833903193473816, + "learning_rate": 0.00030851457844873235, + "loss": 2.05, + "step": 268800 + }, + { + "epoch": 1.0231572056058402, + "grad_norm": 0.1218896210193634, + "learning_rate": 0.0003084058108048855, + "loss": 2.0512, + "step": 268810 + }, + { + "epoch": 1.0231952680739629, + "grad_norm": 0.12483804672956467, + "learning_rate": 0.00030829710487319186, + "loss": 2.0453, + "step": 268820 + }, + { + "epoch": 1.0232333305420858, + "grad_norm": 0.1135217696428299, + "learning_rate": 0.00030818846054872853, + "loss": 2.0549, + "step": 268830 + }, + { + "epoch": 1.0232713930102084, + "grad_norm": 0.12844489514827728, + "learning_rate": 0.0003080798777268696, + "loss": 2.0584, + "step": 268840 + }, + { + "epoch": 1.023309455478331, + "grad_norm": 0.11942926049232483, + "learning_rate": 0.00030797135630328476, + "loss": 2.0411, + "step": 268850 + }, + { + "epoch": 1.0233475179464537, + "grad_norm": 0.13348643481731415, + "learning_rate": 0.0003078628961739387, + "loss": 2.0697, + "step": 268860 + }, + { + "epoch": 1.0233855804145764, + "grad_norm": 0.12040871381759644, + "learning_rate": 0.0003077544972350892, + "loss": 2.0597, + "step": 268870 + }, + { + "epoch": 1.023423642882699, + "grad_norm": 0.12042777240276337, + "learning_rate": 0.00030764615938328653, + "loss": 2.0513, + "step": 268880 + }, + { + "epoch": 1.0234617053508217, + "grad_norm": 0.11985976994037628, + "learning_rate": 0.00030753788251537223, + "loss": 2.0532, + "step": 268890 + }, + { + "epoch": 1.0234997678189444, + "grad_norm": 0.12266091257333755, + "learning_rate": 0.0003074296665284776, + "loss": 2.0559, + "step": 268900 + }, + { + "epoch": 1.0235378302870672, + "grad_norm": 0.11272238940000534, + "learning_rate": 0.00030732151132002305, + "loss": 2.0522, + "step": 268910 + }, + { + "epoch": 1.02357589275519, + "grad_norm": 0.1326189637184143, + "learning_rate": 0.0003072134167877166, + "loss": 2.0474, + "step": 268920 + }, + { + "epoch": 1.0236139552233126, + "grad_norm": 0.1255561113357544, + "learning_rate": 0.0003071053828295529, + "loss": 2.0353, + "step": 268930 + }, + { + "epoch": 1.0236520176914352, + "grad_norm": 0.12174849957227707, + "learning_rate": 0.00030699740934381214, + "loss": 2.0544, + "step": 268940 + }, + { + "epoch": 1.0236900801595579, + "grad_norm": 0.1409405916929245, + "learning_rate": 0.0003068894962290589, + "loss": 2.0487, + "step": 268950 + }, + { + "epoch": 1.0237281426276805, + "grad_norm": 0.1327158361673355, + "learning_rate": 0.00030678164338414083, + "loss": 2.0471, + "step": 268960 + }, + { + "epoch": 1.0237662050958032, + "grad_norm": 0.12210732698440552, + "learning_rate": 0.00030667385070818796, + "loss": 2.0532, + "step": 268970 + }, + { + "epoch": 1.0238042675639258, + "grad_norm": 0.11677435040473938, + "learning_rate": 0.0003065661181006113, + "loss": 2.0624, + "step": 268980 + }, + { + "epoch": 1.0238423300320485, + "grad_norm": 0.13230466842651367, + "learning_rate": 0.0003064584454611017, + "loss": 2.0562, + "step": 268990 + }, + { + "epoch": 1.0238803925001714, + "grad_norm": 0.12184811383485794, + "learning_rate": 0.00030635083268962916, + "loss": 2.0595, + "step": 269000 + }, + { + "epoch": 1.023918454968294, + "grad_norm": 0.12902513146400452, + "learning_rate": 0.00030624327968644115, + "loss": 2.0547, + "step": 269010 + }, + { + "epoch": 1.0239565174364167, + "grad_norm": 0.12603649497032166, + "learning_rate": 0.00030613578635206205, + "loss": 2.0609, + "step": 269020 + }, + { + "epoch": 1.0239945799045393, + "grad_norm": 0.1304672509431839, + "learning_rate": 0.0003060283525872918, + "loss": 2.0491, + "step": 269030 + }, + { + "epoch": 1.024032642372662, + "grad_norm": 0.11761519312858582, + "learning_rate": 0.00030592097829320485, + "loss": 2.0537, + "step": 269040 + }, + { + "epoch": 1.0240707048407847, + "grad_norm": 0.11814765632152557, + "learning_rate": 0.00030581366337114924, + "loss": 2.0416, + "step": 269050 + }, + { + "epoch": 1.0241087673089073, + "grad_norm": 0.11363788694143295, + "learning_rate": 0.0003057064077227455, + "loss": 2.0402, + "step": 269060 + }, + { + "epoch": 1.02414682977703, + "grad_norm": 0.1189923882484436, + "learning_rate": 0.0003055992112498854, + "loss": 2.0461, + "step": 269070 + }, + { + "epoch": 1.0241848922451529, + "grad_norm": 0.12008653581142426, + "learning_rate": 0.00030549207385473123, + "loss": 2.0411, + "step": 269080 + }, + { + "epoch": 1.0242229547132755, + "grad_norm": 0.1371253877878189, + "learning_rate": 0.0003053849954397144, + "loss": 2.0486, + "step": 269090 + }, + { + "epoch": 1.0242610171813982, + "grad_norm": 0.13268887996673584, + "learning_rate": 0.0003052779759075346, + "loss": 2.0425, + "step": 269100 + }, + { + "epoch": 1.0242990796495208, + "grad_norm": 0.11739718168973923, + "learning_rate": 0.000305171015161159, + "loss": 2.0396, + "step": 269110 + }, + { + "epoch": 1.0243371421176435, + "grad_norm": 0.13256852328777313, + "learning_rate": 0.00030506411310382076, + "loss": 2.0447, + "step": 269120 + }, + { + "epoch": 1.0243752045857661, + "grad_norm": 0.12280917912721634, + "learning_rate": 0.00030495726963901816, + "loss": 2.045, + "step": 269130 + }, + { + "epoch": 1.0244132670538888, + "grad_norm": 0.11900188028812408, + "learning_rate": 0.0003048504846705139, + "loss": 2.0466, + "step": 269140 + }, + { + "epoch": 1.0244513295220115, + "grad_norm": 0.12554916739463806, + "learning_rate": 0.0003047437581023337, + "loss": 2.0548, + "step": 269150 + }, + { + "epoch": 1.0244893919901341, + "grad_norm": 0.1171054095029831, + "learning_rate": 0.00030463708983876524, + "loss": 2.0488, + "step": 269160 + }, + { + "epoch": 1.024527454458257, + "grad_norm": 0.11716967821121216, + "learning_rate": 0.00030453047978435787, + "loss": 2.0453, + "step": 269170 + }, + { + "epoch": 1.0245655169263796, + "grad_norm": 0.1312137097120285, + "learning_rate": 0.0003044239278439205, + "loss": 2.0319, + "step": 269180 + }, + { + "epoch": 1.0246035793945023, + "grad_norm": 0.1252538561820984, + "learning_rate": 0.00030431743392252163, + "loss": 2.0537, + "step": 269190 + }, + { + "epoch": 1.024641641862625, + "grad_norm": 0.1490531712770462, + "learning_rate": 0.00030421099792548787, + "loss": 2.0492, + "step": 269200 + }, + { + "epoch": 1.0246797043307476, + "grad_norm": 0.14222683012485504, + "learning_rate": 0.00030410461975840267, + "loss": 2.0459, + "step": 269210 + }, + { + "epoch": 1.0247177667988703, + "grad_norm": 0.1411609947681427, + "learning_rate": 0.0003039982993271062, + "loss": 2.0302, + "step": 269220 + }, + { + "epoch": 1.024755829266993, + "grad_norm": 0.14139287173748016, + "learning_rate": 0.00030389203653769355, + "loss": 2.0311, + "step": 269230 + }, + { + "epoch": 1.0247938917351156, + "grad_norm": 0.12095063179731369, + "learning_rate": 0.00030378583129651415, + "loss": 2.0512, + "step": 269240 + }, + { + "epoch": 1.0248319542032382, + "grad_norm": 0.14292798936367035, + "learning_rate": 0.0003036796835101709, + "loss": 2.0592, + "step": 269250 + }, + { + "epoch": 1.0248700166713611, + "grad_norm": 0.12329915165901184, + "learning_rate": 0.00030357359308551886, + "loss": 2.0566, + "step": 269260 + }, + { + "epoch": 1.0249080791394838, + "grad_norm": 0.12383721023797989, + "learning_rate": 0.00030346755992966456, + "loss": 2.0421, + "step": 269270 + }, + { + "epoch": 1.0249461416076064, + "grad_norm": 0.12567129731178284, + "learning_rate": 0.000303361583949965, + "loss": 2.0447, + "step": 269280 + }, + { + "epoch": 1.024984204075729, + "grad_norm": 0.13148026168346405, + "learning_rate": 0.0003032556650540267, + "loss": 2.0577, + "step": 269290 + }, + { + "epoch": 1.0250222665438518, + "grad_norm": 0.14086686074733734, + "learning_rate": 0.00030314980314970474, + "loss": 2.039, + "step": 269300 + }, + { + "epoch": 1.0250603290119744, + "grad_norm": 0.12691082060337067, + "learning_rate": 0.00030304399814510175, + "loss": 2.0465, + "step": 269310 + }, + { + "epoch": 1.025098391480097, + "grad_norm": 0.11918236315250397, + "learning_rate": 0.0003029382499485673, + "loss": 2.0489, + "step": 269320 + }, + { + "epoch": 1.0251364539482197, + "grad_norm": 0.11919201165437698, + "learning_rate": 0.00030283255846869653, + "loss": 2.0438, + "step": 269330 + }, + { + "epoch": 1.0251745164163426, + "grad_norm": 0.12480544298887253, + "learning_rate": 0.0003027269236143293, + "loss": 2.0322, + "step": 269340 + }, + { + "epoch": 1.0252125788844653, + "grad_norm": 0.12979908287525177, + "learning_rate": 0.00030262134529454983, + "loss": 2.0557, + "step": 269350 + }, + { + "epoch": 1.025250641352588, + "grad_norm": 0.13761655986309052, + "learning_rate": 0.000302515823418685, + "loss": 2.0328, + "step": 269360 + }, + { + "epoch": 1.0252887038207106, + "grad_norm": 0.13578085601329803, + "learning_rate": 0.000302410357896304, + "loss": 2.0563, + "step": 269370 + }, + { + "epoch": 1.0253267662888332, + "grad_norm": 0.12312494218349457, + "learning_rate": 0.0003023049486372172, + "loss": 2.0461, + "step": 269380 + }, + { + "epoch": 1.025364828756956, + "grad_norm": 0.11766970902681351, + "learning_rate": 0.00030219959555147517, + "loss": 2.0526, + "step": 269390 + }, + { + "epoch": 1.0254028912250785, + "grad_norm": 0.12069400399923325, + "learning_rate": 0.00030209429854936805, + "loss": 2.0549, + "step": 269400 + }, + { + "epoch": 1.0254409536932012, + "grad_norm": 0.16372714936733246, + "learning_rate": 0.00030198905754142444, + "loss": 2.0425, + "step": 269410 + }, + { + "epoch": 1.0254790161613239, + "grad_norm": 0.1190953329205513, + "learning_rate": 0.00030188387243841055, + "loss": 2.0513, + "step": 269420 + }, + { + "epoch": 1.0255170786294467, + "grad_norm": 0.13131879270076752, + "learning_rate": 0.00030177874315132937, + "loss": 2.0602, + "step": 269430 + }, + { + "epoch": 1.0255551410975694, + "grad_norm": 0.11880412697792053, + "learning_rate": 0.00030167366959141977, + "loss": 2.0469, + "step": 269440 + }, + { + "epoch": 1.025593203565692, + "grad_norm": 0.1298285871744156, + "learning_rate": 0.00030156865167015566, + "loss": 2.0728, + "step": 269450 + }, + { + "epoch": 1.0256312660338147, + "grad_norm": 0.13389140367507935, + "learning_rate": 0.00030146368929924515, + "loss": 2.0458, + "step": 269460 + }, + { + "epoch": 1.0256693285019374, + "grad_norm": 0.12634499371051788, + "learning_rate": 0.00030135878239062944, + "loss": 2.0379, + "step": 269470 + }, + { + "epoch": 1.02570739097006, + "grad_norm": 0.1331862211227417, + "learning_rate": 0.00030125393085648214, + "loss": 2.0434, + "step": 269480 + }, + { + "epoch": 1.0257454534381827, + "grad_norm": 0.11369920521974564, + "learning_rate": 0.00030114913460920856, + "loss": 2.0404, + "step": 269490 + }, + { + "epoch": 1.0257835159063053, + "grad_norm": 0.14334441721439362, + "learning_rate": 0.00030104439356144465, + "loss": 2.0642, + "step": 269500 + }, + { + "epoch": 1.025821578374428, + "grad_norm": 0.1409229189157486, + "learning_rate": 0.0003009397076260562, + "loss": 2.0557, + "step": 269510 + }, + { + "epoch": 1.0258596408425509, + "grad_norm": 0.11479076743125916, + "learning_rate": 0.0003008350767161379, + "loss": 2.0506, + "step": 269520 + }, + { + "epoch": 1.0258977033106735, + "grad_norm": 0.12338791787624359, + "learning_rate": 0.00030073050074501287, + "loss": 2.0564, + "step": 269530 + }, + { + "epoch": 1.0259357657787962, + "grad_norm": 0.12768040597438812, + "learning_rate": 0.00030062597962623116, + "loss": 2.0389, + "step": 269540 + }, + { + "epoch": 1.0259738282469189, + "grad_norm": 0.1327030509710312, + "learning_rate": 0.0003005215132735696, + "loss": 2.0432, + "step": 269550 + }, + { + "epoch": 1.0260118907150415, + "grad_norm": 0.1372380554676056, + "learning_rate": 0.00030041710160103063, + "loss": 2.0446, + "step": 269560 + }, + { + "epoch": 1.0260499531831642, + "grad_norm": 0.12993136048316956, + "learning_rate": 0.0003003127445228414, + "loss": 2.0497, + "step": 269570 + }, + { + "epoch": 1.0260880156512868, + "grad_norm": 0.12132129818201065, + "learning_rate": 0.0003002084419534532, + "loss": 2.047, + "step": 269580 + }, + { + "epoch": 1.0261260781194095, + "grad_norm": 0.1340913325548172, + "learning_rate": 0.00030010419380754053, + "loss": 2.0438, + "step": 269590 + }, + { + "epoch": 1.0261641405875324, + "grad_norm": 0.12921398878097534, + "learning_rate": 0.0003, + "loss": 2.0546, + "step": 269600 + }, + { + "epoch": 1.026202203055655, + "grad_norm": 0.13172978162765503, + "learning_rate": 0.0002998958604459501, + "loss": 2.0324, + "step": 269610 + }, + { + "epoch": 1.0262402655237777, + "grad_norm": 0.1243109479546547, + "learning_rate": 0.00029979177506073, + "loss": 2.0618, + "step": 269620 + }, + { + "epoch": 1.0262783279919003, + "grad_norm": 0.11869452893733978, + "learning_rate": 0.0002996877437598987, + "loss": 2.0386, + "step": 269630 + }, + { + "epoch": 1.026316390460023, + "grad_norm": 0.13170160353183746, + "learning_rate": 0.0002995837664592345, + "loss": 2.0502, + "step": 269640 + }, + { + "epoch": 1.0263544529281456, + "grad_norm": 0.1317114382982254, + "learning_rate": 0.00029947984307473395, + "loss": 2.025, + "step": 269650 + }, + { + "epoch": 1.0263925153962683, + "grad_norm": 0.12583796679973602, + "learning_rate": 0.0002993759735226112, + "loss": 2.0529, + "step": 269660 + }, + { + "epoch": 1.026430577864391, + "grad_norm": 0.13131083548069, + "learning_rate": 0.00029927215771929726, + "loss": 2.0412, + "step": 269670 + }, + { + "epoch": 1.0264686403325136, + "grad_norm": 0.12807200849056244, + "learning_rate": 0.0002991683955814391, + "loss": 2.0481, + "step": 269680 + }, + { + "epoch": 1.0265067028006365, + "grad_norm": 0.14543797075748444, + "learning_rate": 0.0002990646870258988, + "loss": 2.0281, + "step": 269690 + }, + { + "epoch": 1.0265447652687592, + "grad_norm": 0.12976425886154175, + "learning_rate": 0.00029896103196975306, + "loss": 2.0369, + "step": 269700 + }, + { + "epoch": 1.0265828277368818, + "grad_norm": 0.13158537447452545, + "learning_rate": 0.000298857430330292, + "loss": 2.0489, + "step": 269710 + }, + { + "epoch": 1.0266208902050045, + "grad_norm": 0.1256389617919922, + "learning_rate": 0.00029875388202501896, + "loss": 2.0485, + "step": 269720 + }, + { + "epoch": 1.0266589526731271, + "grad_norm": 0.1335773915052414, + "learning_rate": 0.0002986503869716491, + "loss": 2.0394, + "step": 269730 + }, + { + "epoch": 1.0266970151412498, + "grad_norm": 0.12555572390556335, + "learning_rate": 0.0002985469450881091, + "loss": 2.0512, + "step": 269740 + }, + { + "epoch": 1.0267350776093724, + "grad_norm": 0.13962522149085999, + "learning_rate": 0.0002984435562925363, + "loss": 2.0377, + "step": 269750 + }, + { + "epoch": 1.026773140077495, + "grad_norm": 0.14196039736270905, + "learning_rate": 0.0002983402205032777, + "loss": 2.0521, + "step": 269760 + }, + { + "epoch": 1.026811202545618, + "grad_norm": 0.1314079761505127, + "learning_rate": 0.0002982369376388896, + "loss": 2.0545, + "step": 269770 + }, + { + "epoch": 1.0268492650137406, + "grad_norm": 0.12030822783708572, + "learning_rate": 0.0002981337076181365, + "loss": 2.0389, + "step": 269780 + }, + { + "epoch": 1.0268873274818633, + "grad_norm": 0.13099026679992676, + "learning_rate": 0.0002980305303599906, + "loss": 2.0451, + "step": 269790 + }, + { + "epoch": 1.026925389949986, + "grad_norm": 0.128982275724411, + "learning_rate": 0.000297927405783631, + "loss": 2.0383, + "step": 269800 + }, + { + "epoch": 1.0269634524181086, + "grad_norm": 0.142899751663208, + "learning_rate": 0.0002978243338084427, + "loss": 2.0751, + "step": 269810 + }, + { + "epoch": 1.0270015148862313, + "grad_norm": 0.1250215768814087, + "learning_rate": 0.0002977213143540164, + "loss": 2.0571, + "step": 269820 + }, + { + "epoch": 1.027039577354354, + "grad_norm": 0.1249474361538887, + "learning_rate": 0.00029761834734014713, + "loss": 2.0465, + "step": 269830 + }, + { + "epoch": 1.0270776398224766, + "grad_norm": 0.13094857335090637, + "learning_rate": 0.0002975154326868341, + "loss": 2.0414, + "step": 269840 + }, + { + "epoch": 1.0271157022905992, + "grad_norm": 0.13219231367111206, + "learning_rate": 0.0002974125703142797, + "loss": 2.0326, + "step": 269850 + }, + { + "epoch": 1.027153764758722, + "grad_norm": 0.12404201924800873, + "learning_rate": 0.0002973097601428887, + "loss": 2.0403, + "step": 269860 + }, + { + "epoch": 1.0271918272268448, + "grad_norm": 0.1259431391954422, + "learning_rate": 0.0002972070020932676, + "loss": 2.0463, + "step": 269870 + }, + { + "epoch": 1.0272298896949674, + "grad_norm": 0.1325017213821411, + "learning_rate": 0.000297104296086224, + "loss": 2.0418, + "step": 269880 + }, + { + "epoch": 1.02726795216309, + "grad_norm": 0.13986846804618835, + "learning_rate": 0.00029700164204276593, + "loss": 2.051, + "step": 269890 + }, + { + "epoch": 1.0273060146312127, + "grad_norm": 0.13025562465190887, + "learning_rate": 0.000296899039884101, + "loss": 2.0497, + "step": 269900 + }, + { + "epoch": 1.0273440770993354, + "grad_norm": 0.11536869406700134, + "learning_rate": 0.0002967964895316356, + "loss": 2.0686, + "step": 269910 + }, + { + "epoch": 1.027382139567458, + "grad_norm": 0.12382342666387558, + "learning_rate": 0.0002966939909069746, + "loss": 2.0553, + "step": 269920 + }, + { + "epoch": 1.0274202020355807, + "grad_norm": 0.12849363684654236, + "learning_rate": 0.00029659154393192014, + "loss": 2.0469, + "step": 269930 + }, + { + "epoch": 1.0274582645037036, + "grad_norm": 0.123623326420784, + "learning_rate": 0.0002964891485284712, + "loss": 2.0419, + "step": 269940 + }, + { + "epoch": 1.0274963269718262, + "grad_norm": 0.12088244408369064, + "learning_rate": 0.00029638680461882306, + "loss": 2.0494, + "step": 269950 + }, + { + "epoch": 1.027534389439949, + "grad_norm": 0.1202690601348877, + "learning_rate": 0.0002962845121253663, + "loss": 2.0575, + "step": 269960 + }, + { + "epoch": 1.0275724519080716, + "grad_norm": 0.1362772434949875, + "learning_rate": 0.0002961822709706865, + "loss": 2.0621, + "step": 269970 + }, + { + "epoch": 1.0276105143761942, + "grad_norm": 0.13600513339042664, + "learning_rate": 0.0002960800810775629, + "loss": 2.0292, + "step": 269980 + }, + { + "epoch": 1.0276485768443169, + "grad_norm": 0.11714157462120056, + "learning_rate": 0.00029597794236896834, + "loss": 2.0388, + "step": 269990 + }, + { + "epoch": 1.0276866393124395, + "grad_norm": 0.11797063052654266, + "learning_rate": 0.0002958758547680685, + "loss": 2.0347, + "step": 270000 + }, + { + "epoch": 1.0277247017805622, + "grad_norm": 0.1323215812444687, + "learning_rate": 0.0002957738181982209, + "loss": 2.0464, + "step": 270010 + }, + { + "epoch": 1.0277627642486848, + "grad_norm": 0.13525962829589844, + "learning_rate": 0.00029567183258297447, + "loss": 2.0588, + "step": 270020 + }, + { + "epoch": 1.0278008267168077, + "grad_norm": 0.13091787695884705, + "learning_rate": 0.000295569897846069, + "loss": 2.0575, + "step": 270030 + }, + { + "epoch": 1.0278388891849304, + "grad_norm": 0.13230513036251068, + "learning_rate": 0.0002954680139114341, + "loss": 2.0416, + "step": 270040 + }, + { + "epoch": 1.027876951653053, + "grad_norm": 0.12935812771320343, + "learning_rate": 0.00029536618070318877, + "loss": 2.0577, + "step": 270050 + }, + { + "epoch": 1.0279150141211757, + "grad_norm": 0.12722337245941162, + "learning_rate": 0.00029526439814564086, + "loss": 2.0404, + "step": 270060 + }, + { + "epoch": 1.0279530765892984, + "grad_norm": 0.14798662066459656, + "learning_rate": 0.0002951626661632862, + "loss": 2.0445, + "step": 270070 + }, + { + "epoch": 1.027991139057421, + "grad_norm": 0.1255602240562439, + "learning_rate": 0.000295060984680808, + "loss": 2.0257, + "step": 270080 + }, + { + "epoch": 1.0280292015255437, + "grad_norm": 0.1310020089149475, + "learning_rate": 0.00029495935362307637, + "loss": 2.0485, + "step": 270090 + }, + { + "epoch": 1.0280672639936663, + "grad_norm": 0.12309551239013672, + "learning_rate": 0.0002948577729151471, + "loss": 2.0537, + "step": 270100 + }, + { + "epoch": 1.028105326461789, + "grad_norm": 0.13925056159496307, + "learning_rate": 0.00029475624248226205, + "loss": 2.049, + "step": 270110 + }, + { + "epoch": 1.0281433889299119, + "grad_norm": 0.13675113022327423, + "learning_rate": 0.0002946547622498475, + "loss": 2.0443, + "step": 270120 + }, + { + "epoch": 1.0281814513980345, + "grad_norm": 0.13782602548599243, + "learning_rate": 0.0002945533321435139, + "loss": 2.042, + "step": 270130 + }, + { + "epoch": 1.0282195138661572, + "grad_norm": 0.12527993321418762, + "learning_rate": 0.0002944519520890553, + "loss": 2.0517, + "step": 270140 + }, + { + "epoch": 1.0282575763342798, + "grad_norm": 0.12390842288732529, + "learning_rate": 0.0002943506220124489, + "loss": 2.0416, + "step": 270150 + }, + { + "epoch": 1.0282956388024025, + "grad_norm": 0.12301144003868103, + "learning_rate": 0.0002942493418398538, + "loss": 2.0467, + "step": 270160 + }, + { + "epoch": 1.0283337012705251, + "grad_norm": 0.12911370396614075, + "learning_rate": 0.000294148111497611, + "loss": 2.0395, + "step": 270170 + }, + { + "epoch": 1.0283717637386478, + "grad_norm": 0.12177471816539764, + "learning_rate": 0.0002940469309122424, + "loss": 2.0481, + "step": 270180 + }, + { + "epoch": 1.0284098262067705, + "grad_norm": 0.12141739577054977, + "learning_rate": 0.00029394580001045035, + "loss": 2.0451, + "step": 270190 + }, + { + "epoch": 1.0284478886748933, + "grad_norm": 0.1493520736694336, + "learning_rate": 0.000293844718719117, + "loss": 2.0475, + "step": 270200 + }, + { + "epoch": 1.028485951143016, + "grad_norm": 0.11416112631559372, + "learning_rate": 0.0002937436869653036, + "loss": 2.0371, + "step": 270210 + }, + { + "epoch": 1.0285240136111387, + "grad_norm": 0.1214093267917633, + "learning_rate": 0.00029364270467625007, + "loss": 2.05, + "step": 270220 + }, + { + "epoch": 1.0285620760792613, + "grad_norm": 0.1179608702659607, + "learning_rate": 0.00029354177177937427, + "loss": 2.0268, + "step": 270230 + }, + { + "epoch": 1.028600138547384, + "grad_norm": 0.13988269865512848, + "learning_rate": 0.00029344088820227113, + "loss": 2.0496, + "step": 270240 + }, + { + "epoch": 1.0286382010155066, + "grad_norm": 0.14148426055908203, + "learning_rate": 0.00029334005387271256, + "loss": 2.0278, + "step": 270250 + }, + { + "epoch": 1.0286762634836293, + "grad_norm": 0.1297960877418518, + "learning_rate": 0.0002932392687186467, + "loss": 2.0563, + "step": 270260 + }, + { + "epoch": 1.028714325951752, + "grad_norm": 0.12837962806224823, + "learning_rate": 0.00029313853266819685, + "loss": 2.0566, + "step": 270270 + }, + { + "epoch": 1.0287523884198746, + "grad_norm": 0.11966746300458908, + "learning_rate": 0.0002930378456496615, + "loss": 2.0314, + "step": 270280 + }, + { + "epoch": 1.0287904508879975, + "grad_norm": 0.1341601312160492, + "learning_rate": 0.00029293720759151343, + "loss": 2.0373, + "step": 270290 + }, + { + "epoch": 1.0288285133561201, + "grad_norm": 0.12876875698566437, + "learning_rate": 0.0002928366184223991, + "loss": 2.0385, + "step": 270300 + }, + { + "epoch": 1.0288665758242428, + "grad_norm": 0.1265905350446701, + "learning_rate": 0.0002927360780711382, + "loss": 2.0351, + "step": 270310 + }, + { + "epoch": 1.0289046382923654, + "grad_norm": 0.13416585326194763, + "learning_rate": 0.00029263558646672286, + "loss": 2.0451, + "step": 270320 + }, + { + "epoch": 1.028942700760488, + "grad_norm": 0.12282083928585052, + "learning_rate": 0.00029253514353831715, + "loss": 2.0508, + "step": 270330 + }, + { + "epoch": 1.0289807632286108, + "grad_norm": 0.1517842561006546, + "learning_rate": 0.00029243474921525684, + "loss": 2.0515, + "step": 270340 + }, + { + "epoch": 1.0290188256967334, + "grad_norm": 0.16193543374538422, + "learning_rate": 0.00029233440342704817, + "loss": 2.0535, + "step": 270350 + }, + { + "epoch": 1.029056888164856, + "grad_norm": 0.12937527894973755, + "learning_rate": 0.0002922341061033678, + "loss": 2.0328, + "step": 270360 + }, + { + "epoch": 1.0290949506329787, + "grad_norm": 0.12712572515010834, + "learning_rate": 0.00029213385717406185, + "loss": 2.0375, + "step": 270370 + }, + { + "epoch": 1.0291330131011016, + "grad_norm": 0.12684941291809082, + "learning_rate": 0.0002920336565691458, + "loss": 2.0545, + "step": 270380 + }, + { + "epoch": 1.0291710755692243, + "grad_norm": 0.13399958610534668, + "learning_rate": 0.0002919335042188035, + "loss": 2.0469, + "step": 270390 + }, + { + "epoch": 1.029209138037347, + "grad_norm": 0.13286960124969482, + "learning_rate": 0.00029183340005338676, + "loss": 2.0511, + "step": 270400 + }, + { + "epoch": 1.0292472005054696, + "grad_norm": 0.1161641776561737, + "learning_rate": 0.00029173334400341475, + "loss": 2.0366, + "step": 270410 + }, + { + "epoch": 1.0292852629735922, + "grad_norm": 0.12700442969799042, + "learning_rate": 0.0002916333359995734, + "loss": 2.0338, + "step": 270420 + }, + { + "epoch": 1.029323325441715, + "grad_norm": 0.1438145488500595, + "learning_rate": 0.0002915333759727151, + "loss": 2.0537, + "step": 270430 + }, + { + "epoch": 1.0293613879098376, + "grad_norm": 0.13152122497558594, + "learning_rate": 0.00029143346385385795, + "loss": 2.0535, + "step": 270440 + }, + { + "epoch": 1.0293994503779602, + "grad_norm": 0.11739667505025864, + "learning_rate": 0.00029133359957418475, + "loss": 2.0367, + "step": 270450 + }, + { + "epoch": 1.029437512846083, + "grad_norm": 0.12094622105360031, + "learning_rate": 0.0002912337830650434, + "loss": 2.0609, + "step": 270460 + }, + { + "epoch": 1.0294755753142057, + "grad_norm": 0.1305398941040039, + "learning_rate": 0.00029113401425794576, + "loss": 2.0372, + "step": 270470 + }, + { + "epoch": 1.0295136377823284, + "grad_norm": 0.13244089484214783, + "learning_rate": 0.00029103429308456694, + "loss": 2.0342, + "step": 270480 + }, + { + "epoch": 1.029551700250451, + "grad_norm": 0.13122345507144928, + "learning_rate": 0.00029093461947674515, + "loss": 2.0369, + "step": 270490 + }, + { + "epoch": 1.0295897627185737, + "grad_norm": 0.12100960314273834, + "learning_rate": 0.0002908349933664811, + "loss": 2.0455, + "step": 270500 + }, + { + "epoch": 1.0296278251866964, + "grad_norm": 0.1490957885980606, + "learning_rate": 0.0002907354146859372, + "loss": 2.0532, + "step": 270510 + }, + { + "epoch": 1.029665887654819, + "grad_norm": 0.13087870180606842, + "learning_rate": 0.0002906358833674373, + "loss": 2.0472, + "step": 270520 + }, + { + "epoch": 1.0297039501229417, + "grad_norm": 0.13519656658172607, + "learning_rate": 0.0002905363993434659, + "loss": 2.0577, + "step": 270530 + }, + { + "epoch": 1.0297420125910643, + "grad_norm": 0.13375785946846008, + "learning_rate": 0.00029043696254666795, + "loss": 2.036, + "step": 270540 + }, + { + "epoch": 1.0297800750591872, + "grad_norm": 0.12649527192115784, + "learning_rate": 0.000290337572909848, + "loss": 2.0623, + "step": 270550 + }, + { + "epoch": 1.0298181375273099, + "grad_norm": 0.14169186353683472, + "learning_rate": 0.0002902382303659697, + "loss": 2.0439, + "step": 270560 + }, + { + "epoch": 1.0298561999954325, + "grad_norm": 0.14085915684700012, + "learning_rate": 0.00029013893484815565, + "loss": 2.0355, + "step": 270570 + }, + { + "epoch": 1.0298942624635552, + "grad_norm": 0.13160523772239685, + "learning_rate": 0.00029003968628968634, + "loss": 2.0297, + "step": 270580 + }, + { + "epoch": 1.0299323249316779, + "grad_norm": 0.1466887891292572, + "learning_rate": 0.000289940484624, + "loss": 2.0551, + "step": 270590 + }, + { + "epoch": 1.0299703873998005, + "grad_norm": 0.15212509036064148, + "learning_rate": 0.0002898413297846918, + "loss": 2.0437, + "step": 270600 + }, + { + "epoch": 1.0300084498679232, + "grad_norm": 0.1452605426311493, + "learning_rate": 0.00028974222170551376, + "loss": 2.056, + "step": 270610 + }, + { + "epoch": 1.0300465123360458, + "grad_norm": 0.1186077743768692, + "learning_rate": 0.00028964316032037375, + "loss": 2.066, + "step": 270620 + }, + { + "epoch": 1.0300845748041687, + "grad_norm": 0.1270798295736313, + "learning_rate": 0.0002895441455633351, + "loss": 2.0402, + "step": 270630 + }, + { + "epoch": 1.0301226372722914, + "grad_norm": 0.1419951617717743, + "learning_rate": 0.0002894451773686166, + "loss": 2.0533, + "step": 270640 + }, + { + "epoch": 1.030160699740414, + "grad_norm": 0.13650859892368317, + "learning_rate": 0.00028934625567059104, + "loss": 2.0297, + "step": 270650 + }, + { + "epoch": 1.0301987622085367, + "grad_norm": 0.11910858750343323, + "learning_rate": 0.0002892473804037856, + "loss": 2.0631, + "step": 270660 + }, + { + "epoch": 1.0302368246766593, + "grad_norm": 0.12136068940162659, + "learning_rate": 0.0002891485515028808, + "loss": 2.0589, + "step": 270670 + }, + { + "epoch": 1.030274887144782, + "grad_norm": 0.12515807151794434, + "learning_rate": 0.00028904976890271016, + "loss": 2.0337, + "step": 270680 + }, + { + "epoch": 1.0303129496129046, + "grad_norm": 0.13482242822647095, + "learning_rate": 0.0002889510325382598, + "loss": 2.0463, + "step": 270690 + }, + { + "epoch": 1.0303510120810273, + "grad_norm": 0.14589546620845795, + "learning_rate": 0.00028885234234466784, + "loss": 2.0644, + "step": 270700 + }, + { + "epoch": 1.03038907454915, + "grad_norm": 0.1272658407688141, + "learning_rate": 0.0002887536982572239, + "loss": 2.0505, + "step": 270710 + }, + { + "epoch": 1.0304271370172728, + "grad_norm": 0.15014109015464783, + "learning_rate": 0.0002886551002113685, + "loss": 2.0567, + "step": 270720 + }, + { + "epoch": 1.0304651994853955, + "grad_norm": 0.11785303056240082, + "learning_rate": 0.00028855654814269294, + "loss": 2.0389, + "step": 270730 + }, + { + "epoch": 1.0305032619535182, + "grad_norm": 0.12073080986738205, + "learning_rate": 0.0002884580419869382, + "loss": 2.0508, + "step": 270740 + }, + { + "epoch": 1.0305413244216408, + "grad_norm": 0.14707377552986145, + "learning_rate": 0.0002883595816799951, + "loss": 2.0511, + "step": 270750 + }, + { + "epoch": 1.0305793868897635, + "grad_norm": 0.13029642403125763, + "learning_rate": 0.0002882611671579034, + "loss": 2.033, + "step": 270760 + }, + { + "epoch": 1.0306174493578861, + "grad_norm": 0.13907068967819214, + "learning_rate": 0.00028816279835685145, + "loss": 2.0579, + "step": 270770 + }, + { + "epoch": 1.0306555118260088, + "grad_norm": 0.14463141560554504, + "learning_rate": 0.0002880644752131756, + "loss": 2.0451, + "step": 270780 + }, + { + "epoch": 1.0306935742941314, + "grad_norm": 0.1304018646478653, + "learning_rate": 0.00028796619766336, + "loss": 2.0389, + "step": 270790 + }, + { + "epoch": 1.0307316367622543, + "grad_norm": 0.14264269173145294, + "learning_rate": 0.00028786796564403573, + "loss": 2.0434, + "step": 270800 + }, + { + "epoch": 1.030769699230377, + "grad_norm": 0.13232994079589844, + "learning_rate": 0.0002877697790919807, + "loss": 2.0544, + "step": 270810 + }, + { + "epoch": 1.0308077616984996, + "grad_norm": 0.12084189802408218, + "learning_rate": 0.0002876716379441189, + "loss": 2.0471, + "step": 270820 + }, + { + "epoch": 1.0308458241666223, + "grad_norm": 0.13656097650527954, + "learning_rate": 0.00028757354213751994, + "loss": 2.0594, + "step": 270830 + }, + { + "epoch": 1.030883886634745, + "grad_norm": 0.12707364559173584, + "learning_rate": 0.0002874754916093989, + "loss": 2.047, + "step": 270840 + }, + { + "epoch": 1.0309219491028676, + "grad_norm": 0.13456393778324127, + "learning_rate": 0.0002873774862971155, + "loss": 2.0666, + "step": 270850 + }, + { + "epoch": 1.0309600115709903, + "grad_norm": 0.11755143851041794, + "learning_rate": 0.00028727952613817354, + "loss": 2.0423, + "step": 270860 + }, + { + "epoch": 1.030998074039113, + "grad_norm": 0.12134380638599396, + "learning_rate": 0.000287181611070221, + "loss": 2.0474, + "step": 270870 + }, + { + "epoch": 1.0310361365072356, + "grad_norm": 0.14861641824245453, + "learning_rate": 0.0002870837410310492, + "loss": 2.0432, + "step": 270880 + }, + { + "epoch": 1.0310741989753585, + "grad_norm": 0.13673312962055206, + "learning_rate": 0.0002869859159585921, + "loss": 2.0353, + "step": 270890 + }, + { + "epoch": 1.0311122614434811, + "grad_norm": 0.12540844082832336, + "learning_rate": 0.0002868881357909263, + "loss": 2.0423, + "step": 270900 + }, + { + "epoch": 1.0311503239116038, + "grad_norm": 0.13241389393806458, + "learning_rate": 0.0002867904004662705, + "loss": 2.0427, + "step": 270910 + }, + { + "epoch": 1.0311883863797264, + "grad_norm": 0.1488330215215683, + "learning_rate": 0.00028669270992298456, + "loss": 2.0439, + "step": 270920 + }, + { + "epoch": 1.031226448847849, + "grad_norm": 0.13117197155952454, + "learning_rate": 0.00028659506409956995, + "loss": 2.0439, + "step": 270930 + }, + { + "epoch": 1.0312645113159717, + "grad_norm": 0.12436238676309586, + "learning_rate": 0.00028649746293466833, + "loss": 2.0582, + "step": 270940 + }, + { + "epoch": 1.0313025737840944, + "grad_norm": 0.134196937084198, + "learning_rate": 0.0002863999063670617, + "loss": 2.0517, + "step": 270950 + }, + { + "epoch": 1.031340636252217, + "grad_norm": 0.12425374239683151, + "learning_rate": 0.00028630239433567197, + "loss": 2.0472, + "step": 270960 + }, + { + "epoch": 1.0313786987203397, + "grad_norm": 0.12356572598218918, + "learning_rate": 0.00028620492677955997, + "loss": 2.0396, + "step": 270970 + }, + { + "epoch": 1.0314167611884626, + "grad_norm": 0.1313382238149643, + "learning_rate": 0.0002861075036379256, + "loss": 2.0492, + "step": 270980 + }, + { + "epoch": 1.0314548236565853, + "grad_norm": 0.13148029148578644, + "learning_rate": 0.0002860101248501073, + "loss": 2.0416, + "step": 270990 + }, + { + "epoch": 1.031492886124708, + "grad_norm": 0.12867027521133423, + "learning_rate": 0.0002859127903555812, + "loss": 2.0288, + "step": 271000 + }, + { + "epoch": 1.0315309485928306, + "grad_norm": 0.12998327612876892, + "learning_rate": 0.00028581550009396106, + "loss": 2.0544, + "step": 271010 + }, + { + "epoch": 1.0315690110609532, + "grad_norm": 0.1239062026143074, + "learning_rate": 0.0002857182540049977, + "loss": 2.033, + "step": 271020 + }, + { + "epoch": 1.0316070735290759, + "grad_norm": 0.1187189444899559, + "learning_rate": 0.0002856210520285788, + "loss": 2.0454, + "step": 271030 + }, + { + "epoch": 1.0316451359971985, + "grad_norm": 0.14411544799804688, + "learning_rate": 0.00028552389410472785, + "loss": 2.0359, + "step": 271040 + }, + { + "epoch": 1.0316831984653212, + "grad_norm": 0.12118139117956161, + "learning_rate": 0.00028542678017360446, + "loss": 2.0341, + "step": 271050 + }, + { + "epoch": 1.031721260933444, + "grad_norm": 0.13053321838378906, + "learning_rate": 0.00028532971017550344, + "loss": 2.0394, + "step": 271060 + }, + { + "epoch": 1.0317593234015667, + "grad_norm": 0.1759636551141739, + "learning_rate": 0.00028523268405085465, + "loss": 2.0489, + "step": 271070 + }, + { + "epoch": 1.0317973858696894, + "grad_norm": 0.14842788875102997, + "learning_rate": 0.0002851357017402224, + "loss": 2.0419, + "step": 271080 + }, + { + "epoch": 1.031835448337812, + "grad_norm": 0.12284272164106369, + "learning_rate": 0.0002850387631843049, + "loss": 2.0399, + "step": 271090 + }, + { + "epoch": 1.0318735108059347, + "grad_norm": 0.1309538185596466, + "learning_rate": 0.00028494186832393433, + "loss": 2.0462, + "step": 271100 + }, + { + "epoch": 1.0319115732740574, + "grad_norm": 0.14089371263980865, + "learning_rate": 0.00028484501710007585, + "loss": 2.0324, + "step": 271110 + }, + { + "epoch": 1.03194963574218, + "grad_norm": 0.1265069842338562, + "learning_rate": 0.0002847482094538275, + "loss": 2.0399, + "step": 271120 + }, + { + "epoch": 1.0319876982103027, + "grad_norm": 0.1347149908542633, + "learning_rate": 0.00028465144532641975, + "loss": 2.0483, + "step": 271130 + }, + { + "epoch": 1.0320257606784253, + "grad_norm": 0.13016733527183533, + "learning_rate": 0.00028455472465921497, + "loss": 2.0477, + "step": 271140 + }, + { + "epoch": 1.0320638231465482, + "grad_norm": 0.14013062417507172, + "learning_rate": 0.0002844580473937073, + "loss": 2.0515, + "step": 271150 + }, + { + "epoch": 1.0321018856146709, + "grad_norm": 0.1346028596162796, + "learning_rate": 0.0002843614134715218, + "loss": 2.0354, + "step": 271160 + }, + { + "epoch": 1.0321399480827935, + "grad_norm": 0.1217736080288887, + "learning_rate": 0.0002842648228344142, + "loss": 2.0338, + "step": 271170 + }, + { + "epoch": 1.0321780105509162, + "grad_norm": 0.11644167453050613, + "learning_rate": 0.000284168275424271, + "loss": 2.0353, + "step": 271180 + }, + { + "epoch": 1.0322160730190388, + "grad_norm": 0.11918290704488754, + "learning_rate": 0.00028407177118310815, + "loss": 2.0449, + "step": 271190 + }, + { + "epoch": 1.0322541354871615, + "grad_norm": 0.12893658876419067, + "learning_rate": 0.00028397531005307133, + "loss": 2.0265, + "step": 271200 + }, + { + "epoch": 1.0322921979552842, + "grad_norm": 0.1387089639902115, + "learning_rate": 0.0002838788919764353, + "loss": 2.0351, + "step": 271210 + }, + { + "epoch": 1.0323302604234068, + "grad_norm": 0.12826283276081085, + "learning_rate": 0.00028378251689560347, + "loss": 2.0457, + "step": 271220 + }, + { + "epoch": 1.0323683228915295, + "grad_norm": 0.11565086990594864, + "learning_rate": 0.0002836861847531077, + "loss": 2.0441, + "step": 271230 + }, + { + "epoch": 1.0324063853596523, + "grad_norm": 0.1372179388999939, + "learning_rate": 0.00028358989549160757, + "loss": 2.051, + "step": 271240 + }, + { + "epoch": 1.032444447827775, + "grad_norm": 0.16050764918327332, + "learning_rate": 0.00028349364905389034, + "loss": 2.0405, + "step": 271250 + }, + { + "epoch": 1.0324825102958977, + "grad_norm": 0.12388958036899567, + "learning_rate": 0.00028339744538287024, + "loss": 2.0409, + "step": 271260 + }, + { + "epoch": 1.0325205727640203, + "grad_norm": 0.13892515003681183, + "learning_rate": 0.0002833012844215884, + "loss": 2.0442, + "step": 271270 + }, + { + "epoch": 1.032558635232143, + "grad_norm": 0.12876741588115692, + "learning_rate": 0.000283205166113212, + "loss": 2.0392, + "step": 271280 + }, + { + "epoch": 1.0325966977002656, + "grad_norm": 0.13448983430862427, + "learning_rate": 0.0002831090904010344, + "loss": 2.0362, + "step": 271290 + }, + { + "epoch": 1.0326347601683883, + "grad_norm": 0.13363835215568542, + "learning_rate": 0.0002830130572284744, + "loss": 2.0282, + "step": 271300 + }, + { + "epoch": 1.032672822636511, + "grad_norm": 0.1334499567747116, + "learning_rate": 0.0002829170665390759, + "loss": 2.038, + "step": 271310 + }, + { + "epoch": 1.0327108851046338, + "grad_norm": 0.1251898854970932, + "learning_rate": 0.00028282111827650767, + "loss": 2.0388, + "step": 271320 + }, + { + "epoch": 1.0327489475727565, + "grad_norm": 0.14659687876701355, + "learning_rate": 0.00028272521238456285, + "loss": 2.0515, + "step": 271330 + }, + { + "epoch": 1.0327870100408791, + "grad_norm": 0.12558038532733917, + "learning_rate": 0.0002826293488071584, + "loss": 2.0342, + "step": 271340 + }, + { + "epoch": 1.0328250725090018, + "grad_norm": 0.1283547431230545, + "learning_rate": 0.00028253352748833514, + "loss": 2.0362, + "step": 271350 + }, + { + "epoch": 1.0328631349771245, + "grad_norm": 0.12782080471515656, + "learning_rate": 0.0002824377483722571, + "loss": 2.0587, + "step": 271360 + }, + { + "epoch": 1.032901197445247, + "grad_norm": 0.12600749731063843, + "learning_rate": 0.00028234201140321083, + "loss": 2.041, + "step": 271370 + }, + { + "epoch": 1.0329392599133698, + "grad_norm": 0.13655886054039001, + "learning_rate": 0.0002822463165256058, + "loss": 2.0494, + "step": 271380 + }, + { + "epoch": 1.0329773223814924, + "grad_norm": 0.1363477259874344, + "learning_rate": 0.00028215066368397323, + "loss": 2.0404, + "step": 271390 + }, + { + "epoch": 1.033015384849615, + "grad_norm": 0.1418614387512207, + "learning_rate": 0.00028205505282296635, + "loss": 2.0365, + "step": 271400 + }, + { + "epoch": 1.033053447317738, + "grad_norm": 0.12292768061161041, + "learning_rate": 0.00028195948388735946, + "loss": 2.037, + "step": 271410 + }, + { + "epoch": 1.0330915097858606, + "grad_norm": 0.11791270226240158, + "learning_rate": 0.000281863956822048, + "loss": 2.0417, + "step": 271420 + }, + { + "epoch": 1.0331295722539833, + "grad_norm": 0.11475887894630432, + "learning_rate": 0.00028176847157204807, + "loss": 2.0327, + "step": 271430 + }, + { + "epoch": 1.033167634722106, + "grad_norm": 0.1630665808916092, + "learning_rate": 0.0002816730280824958, + "loss": 2.0361, + "step": 271440 + }, + { + "epoch": 1.0332056971902286, + "grad_norm": 0.13486617803573608, + "learning_rate": 0.00028157762629864736, + "loss": 2.0385, + "step": 271450 + }, + { + "epoch": 1.0332437596583512, + "grad_norm": 0.1664140820503235, + "learning_rate": 0.00028148226616587845, + "loss": 2.0249, + "step": 271460 + }, + { + "epoch": 1.033281822126474, + "grad_norm": 0.13183072209358215, + "learning_rate": 0.00028138694762968366, + "loss": 2.0455, + "step": 271470 + }, + { + "epoch": 1.0333198845945966, + "grad_norm": 0.12325718253850937, + "learning_rate": 0.0002812916706356767, + "loss": 2.0388, + "step": 271480 + }, + { + "epoch": 1.0333579470627194, + "grad_norm": 0.1310417354106903, + "learning_rate": 0.00028119643512958937, + "loss": 2.0444, + "step": 271490 + }, + { + "epoch": 1.033396009530842, + "grad_norm": 0.12681756913661957, + "learning_rate": 0.00028110124105727176, + "loss": 2.0375, + "step": 271500 + }, + { + "epoch": 1.0334340719989648, + "grad_norm": 0.12703047692775726, + "learning_rate": 0.0002810060883646914, + "loss": 2.0323, + "step": 271510 + }, + { + "epoch": 1.0334721344670874, + "grad_norm": 0.1270838975906372, + "learning_rate": 0.00028091097699793356, + "loss": 2.039, + "step": 271520 + }, + { + "epoch": 1.03351019693521, + "grad_norm": 0.13786223530769348, + "learning_rate": 0.0002808159069032, + "loss": 2.0454, + "step": 271530 + }, + { + "epoch": 1.0335482594033327, + "grad_norm": 0.12944789230823517, + "learning_rate": 0.00028072087802680955, + "loss": 2.0484, + "step": 271540 + }, + { + "epoch": 1.0335863218714554, + "grad_norm": 0.12100300192832947, + "learning_rate": 0.0002806258903151969, + "loss": 2.0474, + "step": 271550 + }, + { + "epoch": 1.033624384339578, + "grad_norm": 0.160098597407341, + "learning_rate": 0.00028053094371491305, + "loss": 2.0456, + "step": 271560 + }, + { + "epoch": 1.0336624468077007, + "grad_norm": 0.1333358883857727, + "learning_rate": 0.0002804360381726242, + "loss": 2.0491, + "step": 271570 + }, + { + "epoch": 1.0337005092758236, + "grad_norm": 0.1314467042684555, + "learning_rate": 0.000280341173635112, + "loss": 2.0408, + "step": 271580 + }, + { + "epoch": 1.0337385717439462, + "grad_norm": 0.13625569641590118, + "learning_rate": 0.000280246350049273, + "loss": 2.0622, + "step": 271590 + }, + { + "epoch": 1.033776634212069, + "grad_norm": 0.12897242605686188, + "learning_rate": 0.000280151567362118, + "loss": 2.0384, + "step": 271600 + }, + { + "epoch": 1.0338146966801915, + "grad_norm": 0.12388896942138672, + "learning_rate": 0.0002800568255207723, + "loss": 2.02, + "step": 271610 + }, + { + "epoch": 1.0338527591483142, + "grad_norm": 0.13515430688858032, + "learning_rate": 0.00027996212447247484, + "loss": 2.0463, + "step": 271620 + }, + { + "epoch": 1.0338908216164369, + "grad_norm": 0.13997548818588257, + "learning_rate": 0.0002798674641645781, + "loss": 2.0293, + "step": 271630 + }, + { + "epoch": 1.0339288840845595, + "grad_norm": 0.1346750557422638, + "learning_rate": 0.0002797728445445476, + "loss": 2.0568, + "step": 271640 + }, + { + "epoch": 1.0339669465526822, + "grad_norm": 0.12908905744552612, + "learning_rate": 0.0002796782655599619, + "loss": 2.0483, + "step": 271650 + }, + { + "epoch": 1.034005009020805, + "grad_norm": 0.12039399892091751, + "learning_rate": 0.00027958372715851187, + "loss": 2.0488, + "step": 271660 + }, + { + "epoch": 1.0340430714889277, + "grad_norm": 0.13701388239860535, + "learning_rate": 0.0002794892292880006, + "loss": 2.042, + "step": 271670 + }, + { + "epoch": 1.0340811339570504, + "grad_norm": 0.1296730488538742, + "learning_rate": 0.0002793947718963427, + "loss": 2.048, + "step": 271680 + }, + { + "epoch": 1.034119196425173, + "grad_norm": 0.14297781884670258, + "learning_rate": 0.0002793003549315647, + "loss": 2.0565, + "step": 271690 + }, + { + "epoch": 1.0341572588932957, + "grad_norm": 0.1596149206161499, + "learning_rate": 0.0002792059783418038, + "loss": 2.0322, + "step": 271700 + }, + { + "epoch": 1.0341953213614183, + "grad_norm": 0.13431543111801147, + "learning_rate": 0.0002791116420753084, + "loss": 2.0342, + "step": 271710 + }, + { + "epoch": 1.034233383829541, + "grad_norm": 0.11944739520549774, + "learning_rate": 0.00027901734608043704, + "loss": 2.0376, + "step": 271720 + }, + { + "epoch": 1.0342714462976637, + "grad_norm": 0.13216999173164368, + "learning_rate": 0.00027892309030565855, + "loss": 2.0317, + "step": 271730 + }, + { + "epoch": 1.0343095087657863, + "grad_norm": 0.13707774877548218, + "learning_rate": 0.0002788288746995515, + "loss": 2.0322, + "step": 271740 + }, + { + "epoch": 1.0343475712339092, + "grad_norm": 0.13045634329319, + "learning_rate": 0.0002787346992108041, + "loss": 2.0405, + "step": 271750 + }, + { + "epoch": 1.0343856337020318, + "grad_norm": 0.13780523836612701, + "learning_rate": 0.00027864056378821346, + "loss": 2.0402, + "step": 271760 + }, + { + "epoch": 1.0344236961701545, + "grad_norm": 0.11843425035476685, + "learning_rate": 0.00027854646838068565, + "loss": 2.0361, + "step": 271770 + }, + { + "epoch": 1.0344617586382772, + "grad_norm": 0.1744195818901062, + "learning_rate": 0.00027845241293723524, + "loss": 2.0439, + "step": 271780 + }, + { + "epoch": 1.0344998211063998, + "grad_norm": 0.13265101611614227, + "learning_rate": 0.000278358397406985, + "loss": 2.0341, + "step": 271790 + }, + { + "epoch": 1.0345378835745225, + "grad_norm": 0.1773340404033661, + "learning_rate": 0.00027826442173916547, + "loss": 2.0429, + "step": 271800 + }, + { + "epoch": 1.0345759460426451, + "grad_norm": 0.12701316177845, + "learning_rate": 0.0002781704858831149, + "loss": 2.0415, + "step": 271810 + }, + { + "epoch": 1.0346140085107678, + "grad_norm": 0.14962537586688995, + "learning_rate": 0.00027807658978827853, + "loss": 2.0328, + "step": 271820 + }, + { + "epoch": 1.0346520709788904, + "grad_norm": 0.12321054190397263, + "learning_rate": 0.0002779827334042087, + "loss": 2.0429, + "step": 271830 + }, + { + "epoch": 1.0346901334470133, + "grad_norm": 0.4225773215293884, + "learning_rate": 0.00027788891668056425, + "loss": 2.045, + "step": 271840 + }, + { + "epoch": 1.034728195915136, + "grad_norm": 0.14254769682884216, + "learning_rate": 0.00027779513956711026, + "loss": 2.0565, + "step": 271850 + }, + { + "epoch": 1.0347662583832586, + "grad_norm": 0.13923725485801697, + "learning_rate": 0.00027770140201371793, + "loss": 2.0555, + "step": 271860 + }, + { + "epoch": 1.0348043208513813, + "grad_norm": 0.13378120958805084, + "learning_rate": 0.00027760770397036385, + "loss": 2.0425, + "step": 271870 + }, + { + "epoch": 1.034842383319504, + "grad_norm": 0.14077191054821014, + "learning_rate": 0.0002775140453871301, + "loss": 2.0569, + "step": 271880 + }, + { + "epoch": 1.0348804457876266, + "grad_norm": 0.13247641921043396, + "learning_rate": 0.00027742042621420386, + "loss": 2.0413, + "step": 271890 + }, + { + "epoch": 1.0349185082557493, + "grad_norm": 0.1335470974445343, + "learning_rate": 0.0002773268464018769, + "loss": 2.0605, + "step": 271900 + }, + { + "epoch": 1.034956570723872, + "grad_norm": 0.12615036964416504, + "learning_rate": 0.00027723330590054534, + "loss": 2.041, + "step": 271910 + }, + { + "epoch": 1.0349946331919948, + "grad_norm": 0.12654559314250946, + "learning_rate": 0.0002771398046607096, + "loss": 2.0372, + "step": 271920 + }, + { + "epoch": 1.0350326956601175, + "grad_norm": 0.14396308362483978, + "learning_rate": 0.0002770463426329738, + "loss": 2.0384, + "step": 271930 + }, + { + "epoch": 1.0350707581282401, + "grad_norm": 0.15201355516910553, + "learning_rate": 0.0002769529197680454, + "loss": 2.034, + "step": 271940 + }, + { + "epoch": 1.0351088205963628, + "grad_norm": 0.1299097239971161, + "learning_rate": 0.0002768595360167353, + "loss": 2.0445, + "step": 271950 + }, + { + "epoch": 1.0351468830644854, + "grad_norm": 0.12313937395811081, + "learning_rate": 0.00027676619132995704, + "loss": 2.0524, + "step": 271960 + }, + { + "epoch": 1.035184945532608, + "grad_norm": 0.14119631052017212, + "learning_rate": 0.00027667288565872707, + "loss": 2.0346, + "step": 271970 + }, + { + "epoch": 1.0352230080007307, + "grad_norm": 0.13589946925640106, + "learning_rate": 0.00027657961895416374, + "loss": 2.0463, + "step": 271980 + }, + { + "epoch": 1.0352610704688534, + "grad_norm": 0.1256546825170517, + "learning_rate": 0.00027648639116748766, + "loss": 2.0517, + "step": 271990 + }, + { + "epoch": 1.035299132936976, + "grad_norm": 0.13876435160636902, + "learning_rate": 0.0002763932022500211, + "loss": 2.0387, + "step": 272000 + }, + { + "epoch": 1.035337195405099, + "grad_norm": 0.13768640160560608, + "learning_rate": 0.0002763000521531874, + "loss": 2.0265, + "step": 272010 + }, + { + "epoch": 1.0353752578732216, + "grad_norm": 0.13261787593364716, + "learning_rate": 0.00027620694082851154, + "loss": 2.0189, + "step": 272020 + }, + { + "epoch": 1.0354133203413443, + "grad_norm": 0.11902239173650742, + "learning_rate": 0.00027611386822761885, + "loss": 2.036, + "step": 272030 + }, + { + "epoch": 1.035451382809467, + "grad_norm": 0.1425875723361969, + "learning_rate": 0.00027602083430223546, + "loss": 2.0414, + "step": 272040 + }, + { + "epoch": 1.0354894452775896, + "grad_norm": 0.14108528196811676, + "learning_rate": 0.0002759278390041875, + "loss": 2.0214, + "step": 272050 + }, + { + "epoch": 1.0355275077457122, + "grad_norm": 0.17860999703407288, + "learning_rate": 0.00027583488228540107, + "loss": 2.0408, + "step": 272060 + }, + { + "epoch": 1.0355655702138349, + "grad_norm": 0.14166538417339325, + "learning_rate": 0.000275741964097902, + "loss": 2.0608, + "step": 272070 + }, + { + "epoch": 1.0356036326819575, + "grad_norm": 0.13897563517093658, + "learning_rate": 0.0002756490843938155, + "loss": 2.0373, + "step": 272080 + }, + { + "epoch": 1.0356416951500802, + "grad_norm": 0.12821650505065918, + "learning_rate": 0.0002755562431253656, + "loss": 2.0291, + "step": 272090 + }, + { + "epoch": 1.035679757618203, + "grad_norm": 0.1173110380768776, + "learning_rate": 0.0002754634402448754, + "loss": 2.0345, + "step": 272100 + }, + { + "epoch": 1.0357178200863257, + "grad_norm": 0.1263628900051117, + "learning_rate": 0.00027537067570476616, + "loss": 2.0261, + "step": 272110 + }, + { + "epoch": 1.0357558825544484, + "grad_norm": 0.13653993606567383, + "learning_rate": 0.0002752779494575577, + "loss": 2.0366, + "step": 272120 + }, + { + "epoch": 1.035793945022571, + "grad_norm": 0.12434863299131393, + "learning_rate": 0.0002751852614558675, + "loss": 2.0579, + "step": 272130 + }, + { + "epoch": 1.0358320074906937, + "grad_norm": 0.12659211456775665, + "learning_rate": 0.00027509261165241074, + "loss": 2.0289, + "step": 272140 + }, + { + "epoch": 1.0358700699588164, + "grad_norm": 0.12357845157384872, + "learning_rate": 0.000275, + "loss": 2.0485, + "step": 272150 + }, + { + "epoch": 1.035908132426939, + "grad_norm": 0.12654438614845276, + "learning_rate": 0.0002749074264515449, + "loss": 2.045, + "step": 272160 + }, + { + "epoch": 1.0359461948950617, + "grad_norm": 0.11802220344543457, + "learning_rate": 0.00027481489096005184, + "loss": 2.0446, + "step": 272170 + }, + { + "epoch": 1.0359842573631846, + "grad_norm": 0.14644622802734375, + "learning_rate": 0.00027472239347862385, + "loss": 2.0423, + "step": 272180 + }, + { + "epoch": 1.0360223198313072, + "grad_norm": 0.13627606630325317, + "learning_rate": 0.0002746299339604599, + "loss": 2.0408, + "step": 272190 + }, + { + "epoch": 1.0360603822994299, + "grad_norm": 0.14931724965572357, + "learning_rate": 0.00027453751235885526, + "loss": 2.0406, + "step": 272200 + }, + { + "epoch": 1.0360984447675525, + "grad_norm": 0.17233909666538239, + "learning_rate": 0.00027444512862720086, + "loss": 2.0531, + "step": 272210 + }, + { + "epoch": 1.0361365072356752, + "grad_norm": 0.13387896120548248, + "learning_rate": 0.0002743527827189826, + "loss": 2.0429, + "step": 272220 + }, + { + "epoch": 1.0361745697037978, + "grad_norm": 0.14065547287464142, + "learning_rate": 0.0002742604745877822, + "loss": 2.0369, + "step": 272230 + }, + { + "epoch": 1.0362126321719205, + "grad_norm": 0.1187807098031044, + "learning_rate": 0.00027416820418727574, + "loss": 2.0478, + "step": 272240 + }, + { + "epoch": 1.0362506946400432, + "grad_norm": 0.1296232044696808, + "learning_rate": 0.00027407597147123406, + "loss": 2.0303, + "step": 272250 + }, + { + "epoch": 1.0362887571081658, + "grad_norm": 0.12152990698814392, + "learning_rate": 0.0002739837763935223, + "loss": 2.0536, + "step": 272260 + }, + { + "epoch": 1.0363268195762887, + "grad_norm": 0.14488112926483154, + "learning_rate": 0.00027389161890809975, + "loss": 2.0202, + "step": 272270 + }, + { + "epoch": 1.0363648820444114, + "grad_norm": 0.13850657641887665, + "learning_rate": 0.00027379949896901934, + "loss": 2.0382, + "step": 272280 + }, + { + "epoch": 1.036402944512534, + "grad_norm": 0.1515287607908249, + "learning_rate": 0.00027370741653042776, + "loss": 2.0255, + "step": 272290 + }, + { + "epoch": 1.0364410069806567, + "grad_norm": 0.13836178183555603, + "learning_rate": 0.00027361537154656457, + "loss": 2.0439, + "step": 272300 + }, + { + "epoch": 1.0364790694487793, + "grad_norm": 0.13903303444385529, + "learning_rate": 0.0002735233639717627, + "loss": 2.0438, + "step": 272310 + }, + { + "epoch": 1.036517131916902, + "grad_norm": 0.1336706131696701, + "learning_rate": 0.0002734313937604476, + "loss": 2.04, + "step": 272320 + }, + { + "epoch": 1.0365551943850246, + "grad_norm": 0.12238036096096039, + "learning_rate": 0.00027333946086713725, + "loss": 2.0453, + "step": 272330 + }, + { + "epoch": 1.0365932568531473, + "grad_norm": 0.12669286131858826, + "learning_rate": 0.0002732475652464418, + "loss": 2.0288, + "step": 272340 + }, + { + "epoch": 1.0366313193212702, + "grad_norm": 0.15388906002044678, + "learning_rate": 0.00027315570685306336, + "loss": 2.0514, + "step": 272350 + }, + { + "epoch": 1.0366693817893928, + "grad_norm": 0.12682406604290009, + "learning_rate": 0.0002730638856417956, + "loss": 2.0594, + "step": 272360 + }, + { + "epoch": 1.0367074442575155, + "grad_norm": 0.12043902277946472, + "learning_rate": 0.00027297210156752394, + "loss": 2.026, + "step": 272370 + }, + { + "epoch": 1.0367455067256381, + "grad_norm": 0.12616755068302155, + "learning_rate": 0.00027288035458522455, + "loss": 2.0319, + "step": 272380 + }, + { + "epoch": 1.0367835691937608, + "grad_norm": 0.13175073266029358, + "learning_rate": 0.0002727886446499648, + "loss": 2.0323, + "step": 272390 + }, + { + "epoch": 1.0368216316618835, + "grad_norm": 0.1313619464635849, + "learning_rate": 0.0002726969717169024, + "loss": 2.0324, + "step": 272400 + }, + { + "epoch": 1.0368596941300061, + "grad_norm": 0.15082502365112305, + "learning_rate": 0.0002726053357412859, + "loss": 2.0228, + "step": 272410 + }, + { + "epoch": 1.0368977565981288, + "grad_norm": 0.12719838321208954, + "learning_rate": 0.0002725137366784535, + "loss": 2.0329, + "step": 272420 + }, + { + "epoch": 1.0369358190662514, + "grad_norm": 0.11831434071063995, + "learning_rate": 0.00027242217448383366, + "loss": 2.0315, + "step": 272430 + }, + { + "epoch": 1.0369738815343743, + "grad_norm": 0.15007321536540985, + "learning_rate": 0.0002723306491129442, + "loss": 2.0404, + "step": 272440 + }, + { + "epoch": 1.037011944002497, + "grad_norm": 0.1263130009174347, + "learning_rate": 0.00027223916052139253, + "loss": 2.0303, + "step": 272450 + }, + { + "epoch": 1.0370500064706196, + "grad_norm": 0.14173053205013275, + "learning_rate": 0.00027214770866487503, + "loss": 2.0309, + "step": 272460 + }, + { + "epoch": 1.0370880689387423, + "grad_norm": 0.13739918172359467, + "learning_rate": 0.00027205629349917707, + "loss": 2.0275, + "step": 272470 + }, + { + "epoch": 1.037126131406865, + "grad_norm": 0.1491842418909073, + "learning_rate": 0.0002719649149801724, + "loss": 2.0511, + "step": 272480 + }, + { + "epoch": 1.0371641938749876, + "grad_norm": 0.13553743064403534, + "learning_rate": 0.00027187357306382356, + "loss": 2.0328, + "step": 272490 + }, + { + "epoch": 1.0372022563431103, + "grad_norm": 0.13617704808712006, + "learning_rate": 0.00027178226770618077, + "loss": 2.0338, + "step": 272500 + }, + { + "epoch": 1.037240318811233, + "grad_norm": 0.13658751547336578, + "learning_rate": 0.00027169099886338257, + "loss": 2.0559, + "step": 272510 + }, + { + "epoch": 1.0372783812793558, + "grad_norm": 0.122580386698246, + "learning_rate": 0.0002715997664916547, + "loss": 2.0365, + "step": 272520 + }, + { + "epoch": 1.0373164437474784, + "grad_norm": 0.13053767383098602, + "learning_rate": 0.00027150857054731057, + "loss": 2.0461, + "step": 272530 + }, + { + "epoch": 1.037354506215601, + "grad_norm": 0.15087807178497314, + "learning_rate": 0.00027141741098675074, + "loss": 2.0341, + "step": 272540 + }, + { + "epoch": 1.0373925686837238, + "grad_norm": 0.14107906818389893, + "learning_rate": 0.00027132628776646263, + "loss": 2.0347, + "step": 272550 + }, + { + "epoch": 1.0374306311518464, + "grad_norm": 0.1257706731557846, + "learning_rate": 0.0002712352008430201, + "loss": 2.0359, + "step": 272560 + }, + { + "epoch": 1.037468693619969, + "grad_norm": 0.14777903258800507, + "learning_rate": 0.00027114415017308384, + "loss": 2.0242, + "step": 272570 + }, + { + "epoch": 1.0375067560880917, + "grad_norm": 0.1389368772506714, + "learning_rate": 0.0002710531357134004, + "loss": 2.0403, + "step": 272580 + }, + { + "epoch": 1.0375448185562144, + "grad_norm": 0.13209329545497894, + "learning_rate": 0.00027096215742080233, + "loss": 2.0348, + "step": 272590 + }, + { + "epoch": 1.037582881024337, + "grad_norm": 0.1406853199005127, + "learning_rate": 0.00027087121525220804, + "loss": 2.0433, + "step": 272600 + }, + { + "epoch": 1.03762094349246, + "grad_norm": 0.1262650340795517, + "learning_rate": 0.00027078030916462113, + "loss": 2.0485, + "step": 272610 + }, + { + "epoch": 1.0376590059605826, + "grad_norm": 0.14530049264431, + "learning_rate": 0.00027068943911513077, + "loss": 2.0375, + "step": 272620 + }, + { + "epoch": 1.0376970684287052, + "grad_norm": 0.14594286680221558, + "learning_rate": 0.00027059860506091076, + "loss": 2.0299, + "step": 272630 + }, + { + "epoch": 1.037735130896828, + "grad_norm": 0.14979077875614166, + "learning_rate": 0.00027050780695921995, + "loss": 2.0371, + "step": 272640 + }, + { + "epoch": 1.0377731933649506, + "grad_norm": 0.1400003582239151, + "learning_rate": 0.0002704170447674015, + "loss": 2.0479, + "step": 272650 + }, + { + "epoch": 1.0378112558330732, + "grad_norm": 0.13226519525051117, + "learning_rate": 0.00027032631844288294, + "loss": 2.0343, + "step": 272660 + }, + { + "epoch": 1.0378493183011959, + "grad_norm": 0.1601780354976654, + "learning_rate": 0.000270235627943176, + "loss": 2.0328, + "step": 272670 + }, + { + "epoch": 1.0378873807693185, + "grad_norm": 0.12874449789524078, + "learning_rate": 0.00027014497322587586, + "loss": 2.0652, + "step": 272680 + }, + { + "epoch": 1.0379254432374414, + "grad_norm": 0.15249601006507874, + "learning_rate": 0.00027005435424866164, + "loss": 2.0322, + "step": 272690 + }, + { + "epoch": 1.037963505705564, + "grad_norm": 0.13022339344024658, + "learning_rate": 0.00026996377096929564, + "loss": 2.0343, + "step": 272700 + }, + { + "epoch": 1.0380015681736867, + "grad_norm": 0.12486381083726883, + "learning_rate": 0.00026987322334562336, + "loss": 2.0522, + "step": 272710 + }, + { + "epoch": 1.0380396306418094, + "grad_norm": 0.12613098323345184, + "learning_rate": 0.0002697827113355733, + "loss": 2.0299, + "step": 272720 + }, + { + "epoch": 1.038077693109932, + "grad_norm": 0.14147455990314484, + "learning_rate": 0.00026969223489715623, + "loss": 2.0421, + "step": 272730 + }, + { + "epoch": 1.0381157555780547, + "grad_norm": 0.14256823062896729, + "learning_rate": 0.00026960179398846586, + "loss": 2.0428, + "step": 272740 + }, + { + "epoch": 1.0381538180461773, + "grad_norm": 0.15050435066223145, + "learning_rate": 0.0002695113885676778, + "loss": 2.036, + "step": 272750 + }, + { + "epoch": 1.0381918805143, + "grad_norm": 0.1313105970621109, + "learning_rate": 0.00026942101859304984, + "loss": 2.0243, + "step": 272760 + }, + { + "epoch": 1.0382299429824227, + "grad_norm": 0.1230410784482956, + "learning_rate": 0.0002693306840229214, + "loss": 2.039, + "step": 272770 + }, + { + "epoch": 1.0382680054505455, + "grad_norm": 0.13287854194641113, + "learning_rate": 0.00026924038481571344, + "loss": 2.0566, + "step": 272780 + }, + { + "epoch": 1.0383060679186682, + "grad_norm": 0.13118687272071838, + "learning_rate": 0.0002691501209299284, + "loss": 2.0441, + "step": 272790 + }, + { + "epoch": 1.0383441303867909, + "grad_norm": 0.14854682981967926, + "learning_rate": 0.00026905989232414967, + "loss": 2.0348, + "step": 272800 + }, + { + "epoch": 1.0383821928549135, + "grad_norm": 0.12359170615673065, + "learning_rate": 0.0002689696989570416, + "loss": 2.0315, + "step": 272810 + }, + { + "epoch": 1.0384202553230362, + "grad_norm": 0.13356876373291016, + "learning_rate": 0.000268879540787349, + "loss": 2.0433, + "step": 272820 + }, + { + "epoch": 1.0384583177911588, + "grad_norm": 0.1332310438156128, + "learning_rate": 0.0002687894177738974, + "loss": 2.0274, + "step": 272830 + }, + { + "epoch": 1.0384963802592815, + "grad_norm": 0.1265527904033661, + "learning_rate": 0.0002686993298755924, + "loss": 2.0395, + "step": 272840 + }, + { + "epoch": 1.0385344427274041, + "grad_norm": 0.14267970621585846, + "learning_rate": 0.0002686092770514197, + "loss": 2.024, + "step": 272850 + }, + { + "epoch": 1.0385725051955268, + "grad_norm": 0.14452096819877625, + "learning_rate": 0.00026851925926044443, + "loss": 2.0316, + "step": 272860 + }, + { + "epoch": 1.0386105676636497, + "grad_norm": 0.15038394927978516, + "learning_rate": 0.0002684292764618118, + "loss": 2.0601, + "step": 272870 + }, + { + "epoch": 1.0386486301317723, + "grad_norm": 0.16037602722644806, + "learning_rate": 0.00026833932861474596, + "loss": 2.0338, + "step": 272880 + }, + { + "epoch": 1.038686692599895, + "grad_norm": 0.13514462113380432, + "learning_rate": 0.00026824941567855036, + "loss": 2.0381, + "step": 272890 + }, + { + "epoch": 1.0387247550680176, + "grad_norm": 0.12547416985034943, + "learning_rate": 0.0002681595376126074, + "loss": 2.0307, + "step": 272900 + }, + { + "epoch": 1.0387628175361403, + "grad_norm": 0.14363206923007965, + "learning_rate": 0.0002680696943763781, + "loss": 2.0256, + "step": 272910 + }, + { + "epoch": 1.038800880004263, + "grad_norm": 0.1408882886171341, + "learning_rate": 0.00026797988592940195, + "loss": 2.0426, + "step": 272920 + }, + { + "epoch": 1.0388389424723856, + "grad_norm": 0.13903652131557465, + "learning_rate": 0.0002678901122312968, + "loss": 2.0399, + "step": 272930 + }, + { + "epoch": 1.0388770049405083, + "grad_norm": 0.17271898686885834, + "learning_rate": 0.00026780037324175847, + "loss": 2.0265, + "step": 272940 + }, + { + "epoch": 1.038915067408631, + "grad_norm": 0.1400110274553299, + "learning_rate": 0.0002677106689205608, + "loss": 2.0324, + "step": 272950 + }, + { + "epoch": 1.0389531298767538, + "grad_norm": 0.13623112440109253, + "learning_rate": 0.000267620999227555, + "loss": 2.0388, + "step": 272960 + }, + { + "epoch": 1.0389911923448765, + "grad_norm": 0.1342500001192093, + "learning_rate": 0.00026753136412267, + "loss": 2.0245, + "step": 272970 + }, + { + "epoch": 1.0390292548129991, + "grad_norm": 0.13029596209526062, + "learning_rate": 0.00026744176356591166, + "loss": 2.0379, + "step": 272980 + }, + { + "epoch": 1.0390673172811218, + "grad_norm": 0.13370917737483978, + "learning_rate": 0.0002673521975173632, + "loss": 2.0379, + "step": 272990 + }, + { + "epoch": 1.0391053797492444, + "grad_norm": 0.12633267045021057, + "learning_rate": 0.0002672626659371843, + "loss": 2.0312, + "step": 273000 + }, + { + "epoch": 1.039143442217367, + "grad_norm": 0.140619158744812, + "learning_rate": 0.0002671731687856115, + "loss": 2.0352, + "step": 273010 + }, + { + "epoch": 1.0391815046854898, + "grad_norm": 0.13824288547039032, + "learning_rate": 0.00026708370602295775, + "loss": 2.0447, + "step": 273020 + }, + { + "epoch": 1.0392195671536124, + "grad_norm": 0.13469131290912628, + "learning_rate": 0.00026699427760961184, + "loss": 2.0596, + "step": 273030 + }, + { + "epoch": 1.0392576296217353, + "grad_norm": 0.12450501322746277, + "learning_rate": 0.00026690488350603883, + "loss": 2.0334, + "step": 273040 + }, + { + "epoch": 1.039295692089858, + "grad_norm": 0.13858427107334137, + "learning_rate": 0.0002668155236727796, + "loss": 2.0428, + "step": 273050 + }, + { + "epoch": 1.0393337545579806, + "grad_norm": 0.15618272125720978, + "learning_rate": 0.00026672619807045056, + "loss": 2.0179, + "step": 273060 + }, + { + "epoch": 1.0393718170261033, + "grad_norm": 0.14906159043312073, + "learning_rate": 0.0002666369066597434, + "loss": 2.0539, + "step": 273070 + }, + { + "epoch": 1.039409879494226, + "grad_norm": 0.147879496216774, + "learning_rate": 0.00026654764940142495, + "loss": 2.0518, + "step": 273080 + }, + { + "epoch": 1.0394479419623486, + "grad_norm": 0.13153234124183655, + "learning_rate": 0.00026645842625633726, + "loss": 2.0236, + "step": 273090 + }, + { + "epoch": 1.0394860044304712, + "grad_norm": 0.13339228928089142, + "learning_rate": 0.0002663692371853969, + "loss": 2.0288, + "step": 273100 + }, + { + "epoch": 1.039524066898594, + "grad_norm": 0.14011885225772858, + "learning_rate": 0.00026628008214959516, + "loss": 2.0262, + "step": 273110 + }, + { + "epoch": 1.0395621293667165, + "grad_norm": 0.12014271318912506, + "learning_rate": 0.0002661909611099976, + "loss": 2.0378, + "step": 273120 + }, + { + "epoch": 1.0396001918348394, + "grad_norm": 0.12502440810203552, + "learning_rate": 0.000266101874027744, + "loss": 2.038, + "step": 273130 + }, + { + "epoch": 1.039638254302962, + "grad_norm": 0.13825981318950653, + "learning_rate": 0.0002660128208640482, + "loss": 2.0349, + "step": 273140 + }, + { + "epoch": 1.0396763167710847, + "grad_norm": 0.13164453208446503, + "learning_rate": 0.00026592380158019767, + "loss": 2.0223, + "step": 273150 + }, + { + "epoch": 1.0397143792392074, + "grad_norm": 0.133603036403656, + "learning_rate": 0.00026583481613755354, + "loss": 2.0272, + "step": 273160 + }, + { + "epoch": 1.03975244170733, + "grad_norm": 0.16143639385700226, + "learning_rate": 0.00026574586449755043, + "loss": 2.0446, + "step": 273170 + }, + { + "epoch": 1.0397905041754527, + "grad_norm": 0.1411830633878708, + "learning_rate": 0.0002656569466216959, + "loss": 2.0311, + "step": 273180 + }, + { + "epoch": 1.0398285666435754, + "grad_norm": 0.1281070113182068, + "learning_rate": 0.0002655680624715708, + "loss": 2.0435, + "step": 273190 + }, + { + "epoch": 1.039866629111698, + "grad_norm": 0.13342636823654175, + "learning_rate": 0.0002654792120088285, + "loss": 2.0375, + "step": 273200 + }, + { + "epoch": 1.039904691579821, + "grad_norm": 0.1617501974105835, + "learning_rate": 0.00026539039519519526, + "loss": 2.0263, + "step": 273210 + }, + { + "epoch": 1.0399427540479436, + "grad_norm": 0.1403946727514267, + "learning_rate": 0.0002653016119924695, + "loss": 2.0484, + "step": 273220 + }, + { + "epoch": 1.0399808165160662, + "grad_norm": 0.1406700313091278, + "learning_rate": 0.00026521286236252207, + "loss": 2.0434, + "step": 273230 + }, + { + "epoch": 1.0400188789841889, + "grad_norm": 0.14084210991859436, + "learning_rate": 0.00026512414626729574, + "loss": 2.0388, + "step": 273240 + }, + { + "epoch": 1.0400569414523115, + "grad_norm": 0.1333620548248291, + "learning_rate": 0.0002650354636688053, + "loss": 2.0467, + "step": 273250 + }, + { + "epoch": 1.0400950039204342, + "grad_norm": 0.13422276079654694, + "learning_rate": 0.0002649468145291368, + "loss": 2.0377, + "step": 273260 + }, + { + "epoch": 1.0401330663885568, + "grad_norm": 0.23148223757743835, + "learning_rate": 0.0002648581988104483, + "loss": 2.0431, + "step": 273270 + }, + { + "epoch": 1.0401711288566795, + "grad_norm": 0.1293201595544815, + "learning_rate": 0.00026476961647496866, + "loss": 2.0378, + "step": 273280 + }, + { + "epoch": 1.0402091913248022, + "grad_norm": 0.13805337250232697, + "learning_rate": 0.0002646810674849982, + "loss": 2.0464, + "step": 273290 + }, + { + "epoch": 1.040247253792925, + "grad_norm": 0.12398272007703781, + "learning_rate": 0.00026459255180290775, + "loss": 2.0379, + "step": 273300 + }, + { + "epoch": 1.0402853162610477, + "grad_norm": 0.14543074369430542, + "learning_rate": 0.0002645040693911393, + "loss": 2.0313, + "step": 273310 + }, + { + "epoch": 1.0403233787291704, + "grad_norm": 0.13031961023807526, + "learning_rate": 0.0002644156202122051, + "loss": 2.0453, + "step": 273320 + }, + { + "epoch": 1.040361441197293, + "grad_norm": 0.13371948897838593, + "learning_rate": 0.00026432720422868775, + "loss": 2.0357, + "step": 273330 + }, + { + "epoch": 1.0403995036654157, + "grad_norm": 0.13025528192520142, + "learning_rate": 0.00026423882140324005, + "loss": 2.0281, + "step": 273340 + }, + { + "epoch": 1.0404375661335383, + "grad_norm": 0.14672107994556427, + "learning_rate": 0.0002641504716985849, + "loss": 2.0361, + "step": 273350 + }, + { + "epoch": 1.040475628601661, + "grad_norm": 0.13892264664173126, + "learning_rate": 0.00026406215507751487, + "loss": 2.0315, + "step": 273360 + }, + { + "epoch": 1.0405136910697836, + "grad_norm": 0.12774324417114258, + "learning_rate": 0.00026397387150289196, + "loss": 2.0449, + "step": 273370 + }, + { + "epoch": 1.0405517535379065, + "grad_norm": 0.13852806389331818, + "learning_rate": 0.00026388562093764814, + "loss": 2.033, + "step": 273380 + }, + { + "epoch": 1.0405898160060292, + "grad_norm": 0.13382737338542938, + "learning_rate": 0.00026379740334478397, + "loss": 2.0342, + "step": 273390 + }, + { + "epoch": 1.0406278784741518, + "grad_norm": 0.1346932351589203, + "learning_rate": 0.0002637092186873696, + "loss": 2.0376, + "step": 273400 + }, + { + "epoch": 1.0406659409422745, + "grad_norm": 0.1198713481426239, + "learning_rate": 0.0002636210669285437, + "loss": 2.0221, + "step": 273410 + }, + { + "epoch": 1.0407040034103971, + "grad_norm": 0.13439087569713593, + "learning_rate": 0.00026353294803151394, + "loss": 2.0116, + "step": 273420 + }, + { + "epoch": 1.0407420658785198, + "grad_norm": 0.1452450007200241, + "learning_rate": 0.0002634448619595564, + "loss": 2.0341, + "step": 273430 + }, + { + "epoch": 1.0407801283466425, + "grad_norm": 0.1251915544271469, + "learning_rate": 0.0002633568086760154, + "loss": 2.0497, + "step": 273440 + }, + { + "epoch": 1.0408181908147651, + "grad_norm": 0.14364951848983765, + "learning_rate": 0.0002632687881443035, + "loss": 2.0435, + "step": 273450 + }, + { + "epoch": 1.0408562532828878, + "grad_norm": 0.13844889402389526, + "learning_rate": 0.00026318080032790135, + "loss": 2.0302, + "step": 273460 + }, + { + "epoch": 1.0408943157510107, + "grad_norm": 0.14765089750289917, + "learning_rate": 0.00026309284519035735, + "loss": 2.0338, + "step": 273470 + }, + { + "epoch": 1.0409323782191333, + "grad_norm": 0.13139985501766205, + "learning_rate": 0.0002630049226952875, + "loss": 2.0472, + "step": 273480 + }, + { + "epoch": 1.040970440687256, + "grad_norm": 0.14244161546230316, + "learning_rate": 0.0002629170328063753, + "loss": 2.0378, + "step": 273490 + }, + { + "epoch": 1.0410085031553786, + "grad_norm": 0.13524176180362701, + "learning_rate": 0.00026282917548737156, + "loss": 2.0261, + "step": 273500 + }, + { + "epoch": 1.0410465656235013, + "grad_norm": 0.12593677639961243, + "learning_rate": 0.00026274135070209414, + "loss": 2.0361, + "step": 273510 + }, + { + "epoch": 1.041084628091624, + "grad_norm": 0.12570716440677643, + "learning_rate": 0.00026265355841442804, + "loss": 2.0419, + "step": 273520 + }, + { + "epoch": 1.0411226905597466, + "grad_norm": 0.1178411990404129, + "learning_rate": 0.0002625657985883247, + "loss": 2.0258, + "step": 273530 + }, + { + "epoch": 1.0411607530278693, + "grad_norm": 0.12717683613300323, + "learning_rate": 0.0002624780711878024, + "loss": 2.0434, + "step": 273540 + }, + { + "epoch": 1.0411988154959921, + "grad_norm": 0.14389148354530334, + "learning_rate": 0.0002623903761769458, + "loss": 2.0383, + "step": 273550 + }, + { + "epoch": 1.0412368779641148, + "grad_norm": 0.11844097077846527, + "learning_rate": 0.00026230271351990577, + "loss": 2.03, + "step": 273560 + }, + { + "epoch": 1.0412749404322375, + "grad_norm": 0.13370178639888763, + "learning_rate": 0.0002622150831808993, + "loss": 2.0407, + "step": 273570 + }, + { + "epoch": 1.04131300290036, + "grad_norm": 0.13318881392478943, + "learning_rate": 0.00026212748512420916, + "loss": 2.0337, + "step": 273580 + }, + { + "epoch": 1.0413510653684828, + "grad_norm": 0.152360200881958, + "learning_rate": 0.0002620399193141841, + "loss": 2.0212, + "step": 273590 + }, + { + "epoch": 1.0413891278366054, + "grad_norm": 0.14211878180503845, + "learning_rate": 0.00026195238571523837, + "loss": 2.0308, + "step": 273600 + }, + { + "epoch": 1.041427190304728, + "grad_norm": 0.14196383953094482, + "learning_rate": 0.0002618648842918514, + "loss": 2.0319, + "step": 273610 + }, + { + "epoch": 1.0414652527728507, + "grad_norm": 0.12660767138004303, + "learning_rate": 0.00026177741500856813, + "loss": 2.0417, + "step": 273620 + }, + { + "epoch": 1.0415033152409734, + "grad_norm": 0.13070152699947357, + "learning_rate": 0.00026168997782999835, + "loss": 2.0413, + "step": 273630 + }, + { + "epoch": 1.0415413777090963, + "grad_norm": 0.13289307057857513, + "learning_rate": 0.000261602572720817, + "loss": 2.0326, + "step": 273640 + }, + { + "epoch": 1.041579440177219, + "grad_norm": 0.1299850195646286, + "learning_rate": 0.0002615151996457636, + "loss": 2.037, + "step": 273650 + }, + { + "epoch": 1.0416175026453416, + "grad_norm": 0.1389370858669281, + "learning_rate": 0.00026142785856964214, + "loss": 2.032, + "step": 273660 + }, + { + "epoch": 1.0416555651134642, + "grad_norm": 0.1402798444032669, + "learning_rate": 0.0002613405494573212, + "loss": 2.0354, + "step": 273670 + }, + { + "epoch": 1.041693627581587, + "grad_norm": 0.12128802388906479, + "learning_rate": 0.00026125327227373353, + "loss": 2.0256, + "step": 273680 + }, + { + "epoch": 1.0417316900497096, + "grad_norm": 0.13357381522655487, + "learning_rate": 0.000261166026983876, + "loss": 2.0463, + "step": 273690 + }, + { + "epoch": 1.0417697525178322, + "grad_norm": 0.1297796219587326, + "learning_rate": 0.00026107881355280914, + "loss": 2.0438, + "step": 273700 + }, + { + "epoch": 1.0418078149859549, + "grad_norm": 0.1287699043750763, + "learning_rate": 0.0002609916319456576, + "loss": 2.033, + "step": 273710 + }, + { + "epoch": 1.0418458774540775, + "grad_norm": 0.14555631577968597, + "learning_rate": 0.00026090448212760944, + "loss": 2.0287, + "step": 273720 + }, + { + "epoch": 1.0418839399222004, + "grad_norm": 0.12862415611743927, + "learning_rate": 0.00026081736406391594, + "loss": 2.0335, + "step": 273730 + }, + { + "epoch": 1.041922002390323, + "grad_norm": 0.12859775125980377, + "learning_rate": 0.000260730277719892, + "loss": 2.0239, + "step": 273740 + }, + { + "epoch": 1.0419600648584457, + "grad_norm": 0.12264101952314377, + "learning_rate": 0.00026064322306091546, + "loss": 2.0345, + "step": 273750 + }, + { + "epoch": 1.0419981273265684, + "grad_norm": 0.13000193238258362, + "learning_rate": 0.0002605562000524271, + "loss": 2.025, + "step": 273760 + }, + { + "epoch": 1.042036189794691, + "grad_norm": 0.1287056803703308, + "learning_rate": 0.0002604692086599303, + "loss": 2.0306, + "step": 273770 + }, + { + "epoch": 1.0420742522628137, + "grad_norm": 0.14237751066684723, + "learning_rate": 0.0002603822488489914, + "loss": 2.0388, + "step": 273780 + }, + { + "epoch": 1.0421123147309364, + "grad_norm": 0.1337154060602188, + "learning_rate": 0.00026029532058523903, + "loss": 2.0257, + "step": 273790 + }, + { + "epoch": 1.042150377199059, + "grad_norm": 0.11729374527931213, + "learning_rate": 0.00026020842383436403, + "loss": 2.0373, + "step": 273800 + }, + { + "epoch": 1.0421884396671817, + "grad_norm": 0.1402532309293747, + "learning_rate": 0.0002601215585621195, + "loss": 2.0214, + "step": 273810 + }, + { + "epoch": 1.0422265021353045, + "grad_norm": 0.1256987452507019, + "learning_rate": 0.0002600347247343205, + "loss": 2.0273, + "step": 273820 + }, + { + "epoch": 1.0422645646034272, + "grad_norm": 0.13618886470794678, + "learning_rate": 0.00025994792231684395, + "loss": 2.0352, + "step": 273830 + }, + { + "epoch": 1.0423026270715499, + "grad_norm": 0.1377623826265335, + "learning_rate": 0.0002598611512756283, + "loss": 2.0506, + "step": 273840 + }, + { + "epoch": 1.0423406895396725, + "grad_norm": 0.122389055788517, + "learning_rate": 0.0002597744115766737, + "loss": 2.0467, + "step": 273850 + }, + { + "epoch": 1.0423787520077952, + "grad_norm": 0.12941765785217285, + "learning_rate": 0.00025968770318604166, + "loss": 2.0364, + "step": 273860 + }, + { + "epoch": 1.0424168144759178, + "grad_norm": 0.13198088109493256, + "learning_rate": 0.0002596010260698547, + "loss": 2.0416, + "step": 273870 + }, + { + "epoch": 1.0424548769440405, + "grad_norm": 0.1301337331533432, + "learning_rate": 0.0002595143801942966, + "loss": 2.0274, + "step": 273880 + }, + { + "epoch": 1.0424929394121631, + "grad_norm": 0.12306160479784012, + "learning_rate": 0.0002594277655256119, + "loss": 2.033, + "step": 273890 + }, + { + "epoch": 1.042531001880286, + "grad_norm": 0.13080520927906036, + "learning_rate": 0.00025934118203010584, + "loss": 2.0339, + "step": 273900 + }, + { + "epoch": 1.0425690643484087, + "grad_norm": 0.13939636945724487, + "learning_rate": 0.00025925462967414447, + "loss": 2.0369, + "step": 273910 + }, + { + "epoch": 1.0426071268165313, + "grad_norm": 0.12812650203704834, + "learning_rate": 0.00025916810842415404, + "loss": 2.0381, + "step": 273920 + }, + { + "epoch": 1.042645189284654, + "grad_norm": 0.13757014274597168, + "learning_rate": 0.0002590816182466214, + "loss": 2.0457, + "step": 273930 + }, + { + "epoch": 1.0426832517527767, + "grad_norm": 0.11760999262332916, + "learning_rate": 0.0002589951591080931, + "loss": 2.0266, + "step": 273940 + }, + { + "epoch": 1.0427213142208993, + "grad_norm": 0.125470831990242, + "learning_rate": 0.0002589087309751761, + "loss": 2.0354, + "step": 273950 + }, + { + "epoch": 1.042759376689022, + "grad_norm": 0.15457892417907715, + "learning_rate": 0.0002588223338145369, + "loss": 2.0321, + "step": 273960 + }, + { + "epoch": 1.0427974391571446, + "grad_norm": 0.12411262840032578, + "learning_rate": 0.0002587359675929018, + "loss": 2.0329, + "step": 273970 + }, + { + "epoch": 1.0428355016252673, + "grad_norm": 0.12908662855625153, + "learning_rate": 0.0002586496322770566, + "loss": 2.037, + "step": 273980 + }, + { + "epoch": 1.0428735640933902, + "grad_norm": 0.15328560769557953, + "learning_rate": 0.0002585633278338465, + "loss": 2.0478, + "step": 273990 + }, + { + "epoch": 1.0429116265615128, + "grad_norm": 0.1366276890039444, + "learning_rate": 0.000258477054230176, + "loss": 2.0388, + "step": 274000 + }, + { + "epoch": 1.0429496890296355, + "grad_norm": 0.1253184974193573, + "learning_rate": 0.00025839081143300856, + "loss": 2.0429, + "step": 274010 + }, + { + "epoch": 1.0429877514977581, + "grad_norm": 0.12240269780158997, + "learning_rate": 0.0002583045994093668, + "loss": 2.0464, + "step": 274020 + }, + { + "epoch": 1.0430258139658808, + "grad_norm": 0.17729179561138153, + "learning_rate": 0.00025821841812633174, + "loss": 2.0296, + "step": 274030 + }, + { + "epoch": 1.0430638764340034, + "grad_norm": 0.14219191670417786, + "learning_rate": 0.0002581322675510435, + "loss": 2.0277, + "step": 274040 + }, + { + "epoch": 1.043101938902126, + "grad_norm": 0.1508656144142151, + "learning_rate": 0.0002580461476507004, + "loss": 2.0166, + "step": 274050 + }, + { + "epoch": 1.0431400013702488, + "grad_norm": 0.13650383055210114, + "learning_rate": 0.0002579600583925592, + "loss": 2.0246, + "step": 274060 + }, + { + "epoch": 1.0431780638383716, + "grad_norm": 0.12071377784013748, + "learning_rate": 0.000257873999743935, + "loss": 2.0229, + "step": 274070 + }, + { + "epoch": 1.0432161263064943, + "grad_norm": 0.13448010385036469, + "learning_rate": 0.0002577879716722007, + "loss": 2.0491, + "step": 274080 + }, + { + "epoch": 1.043254188774617, + "grad_norm": 0.14792564511299133, + "learning_rate": 0.00025770197414478723, + "loss": 2.0428, + "step": 274090 + }, + { + "epoch": 1.0432922512427396, + "grad_norm": 0.13969039916992188, + "learning_rate": 0.00025761600712918354, + "loss": 2.0426, + "step": 274100 + }, + { + "epoch": 1.0433303137108623, + "grad_norm": 0.1547282338142395, + "learning_rate": 0.00025753007059293586, + "loss": 2.0579, + "step": 274110 + }, + { + "epoch": 1.043368376178985, + "grad_norm": 0.1221390813589096, + "learning_rate": 0.0002574441645036481, + "loss": 2.0368, + "step": 274120 + }, + { + "epoch": 1.0434064386471076, + "grad_norm": 0.13443832099437714, + "learning_rate": 0.0002573582888289814, + "loss": 2.0221, + "step": 274130 + }, + { + "epoch": 1.0434445011152302, + "grad_norm": 0.12936048209667206, + "learning_rate": 0.00025727244353665436, + "loss": 2.0312, + "step": 274140 + }, + { + "epoch": 1.043482563583353, + "grad_norm": 0.1321532279253006, + "learning_rate": 0.00025718662859444225, + "loss": 2.0479, + "step": 274150 + }, + { + "epoch": 1.0435206260514758, + "grad_norm": 0.12923943996429443, + "learning_rate": 0.0002571008439701776, + "loss": 2.0438, + "step": 274160 + }, + { + "epoch": 1.0435586885195984, + "grad_norm": 0.1430782824754715, + "learning_rate": 0.00025701508963174965, + "loss": 2.0335, + "step": 274170 + }, + { + "epoch": 1.043596750987721, + "grad_norm": 0.13641083240509033, + "learning_rate": 0.0002569293655471041, + "loss": 2.0469, + "step": 274180 + }, + { + "epoch": 1.0436348134558437, + "grad_norm": 0.14894016087055206, + "learning_rate": 0.0002568436716842434, + "loss": 2.0388, + "step": 274190 + }, + { + "epoch": 1.0436728759239664, + "grad_norm": 0.1385067105293274, + "learning_rate": 0.0002567580080112263, + "loss": 2.031, + "step": 274200 + }, + { + "epoch": 1.043710938392089, + "grad_norm": 0.14582498371601105, + "learning_rate": 0.00025667237449616753, + "loss": 2.0342, + "step": 274210 + }, + { + "epoch": 1.0437490008602117, + "grad_norm": 0.1438489854335785, + "learning_rate": 0.0002565867711072383, + "loss": 2.0439, + "step": 274220 + }, + { + "epoch": 1.0437870633283344, + "grad_norm": 0.13223670423030853, + "learning_rate": 0.00025650119781266546, + "loss": 2.0338, + "step": 274230 + }, + { + "epoch": 1.0438251257964573, + "grad_norm": 0.14658309519290924, + "learning_rate": 0.0002564156545807319, + "loss": 2.0453, + "step": 274240 + }, + { + "epoch": 1.04386318826458, + "grad_norm": 0.13508301973342896, + "learning_rate": 0.000256330141379776, + "loss": 2.0511, + "step": 274250 + }, + { + "epoch": 1.0439012507327026, + "grad_norm": 0.13011658191680908, + "learning_rate": 0.0002562446581781916, + "loss": 2.032, + "step": 274260 + }, + { + "epoch": 1.0439393132008252, + "grad_norm": 0.13241124153137207, + "learning_rate": 0.00025615920494442823, + "loss": 2.0309, + "step": 274270 + }, + { + "epoch": 1.0439773756689479, + "grad_norm": 0.13608020544052124, + "learning_rate": 0.0002560737816469907, + "loss": 2.0371, + "step": 274280 + }, + { + "epoch": 1.0440154381370705, + "grad_norm": 0.32768458127975464, + "learning_rate": 0.0002559883882544385, + "loss": 2.0252, + "step": 274290 + }, + { + "epoch": 1.0440535006051932, + "grad_norm": 0.12332502007484436, + "learning_rate": 0.0002559030247353865, + "loss": 2.0292, + "step": 274300 + }, + { + "epoch": 1.0440915630733159, + "grad_norm": 0.14246225357055664, + "learning_rate": 0.0002558176910585044, + "loss": 2.0181, + "step": 274310 + }, + { + "epoch": 1.0441296255414385, + "grad_norm": 0.1291736364364624, + "learning_rate": 0.0002557323871925165, + "loss": 2.0446, + "step": 274320 + }, + { + "epoch": 1.0441676880095614, + "grad_norm": 0.1337514966726303, + "learning_rate": 0.00025564711310620183, + "loss": 2.0262, + "step": 274330 + }, + { + "epoch": 1.044205750477684, + "grad_norm": 0.15136483311653137, + "learning_rate": 0.0002555618687683937, + "loss": 2.0272, + "step": 274340 + }, + { + "epoch": 1.0442438129458067, + "grad_norm": 0.1356852501630783, + "learning_rate": 0.00025547665414797985, + "loss": 2.0434, + "step": 274350 + }, + { + "epoch": 1.0442818754139294, + "grad_norm": 0.14751608669757843, + "learning_rate": 0.0002553914692139022, + "loss": 2.0278, + "step": 274360 + }, + { + "epoch": 1.044319937882052, + "grad_norm": 0.12296520173549652, + "learning_rate": 0.0002553063139351569, + "loss": 2.0389, + "step": 274370 + }, + { + "epoch": 1.0443580003501747, + "grad_norm": 0.15096762776374817, + "learning_rate": 0.00025522118828079365, + "loss": 2.0292, + "step": 274380 + }, + { + "epoch": 1.0443960628182973, + "grad_norm": 0.14363795518875122, + "learning_rate": 0.0002551360922199163, + "loss": 2.0455, + "step": 274390 + }, + { + "epoch": 1.04443412528642, + "grad_norm": 0.15220364928245544, + "learning_rate": 0.0002550510257216822, + "loss": 2.0414, + "step": 274400 + }, + { + "epoch": 1.0444721877545429, + "grad_norm": 0.1270347386598587, + "learning_rate": 0.00025496598875530225, + "loss": 2.0379, + "step": 274410 + }, + { + "epoch": 1.0445102502226655, + "grad_norm": 0.14191339910030365, + "learning_rate": 0.0002548809812900408, + "loss": 2.0224, + "step": 274420 + }, + { + "epoch": 1.0445483126907882, + "grad_norm": 0.13303406536579132, + "learning_rate": 0.00025479600329521545, + "loss": 2.0361, + "step": 274430 + }, + { + "epoch": 1.0445863751589108, + "grad_norm": 0.1290275901556015, + "learning_rate": 0.00025471105474019693, + "loss": 2.0192, + "step": 274440 + }, + { + "epoch": 1.0446244376270335, + "grad_norm": 0.1343204528093338, + "learning_rate": 0.00025462613559440906, + "loss": 2.0309, + "step": 274450 + }, + { + "epoch": 1.0446625000951562, + "grad_norm": 0.13578853011131287, + "learning_rate": 0.00025454124582732844, + "loss": 2.048, + "step": 274460 + }, + { + "epoch": 1.0447005625632788, + "grad_norm": 0.13064728677272797, + "learning_rate": 0.0002544563854084846, + "loss": 2.0372, + "step": 274470 + }, + { + "epoch": 1.0447386250314015, + "grad_norm": 0.15233071148395538, + "learning_rate": 0.0002543715543074595, + "loss": 2.032, + "step": 274480 + }, + { + "epoch": 1.0447766874995241, + "grad_norm": 0.14285831153392792, + "learning_rate": 0.00025428675249388773, + "loss": 2.0326, + "step": 274490 + }, + { + "epoch": 1.044814749967647, + "grad_norm": 0.1405845433473587, + "learning_rate": 0.00025420197993745623, + "loss": 2.0417, + "step": 274500 + }, + { + "epoch": 1.0448528124357697, + "grad_norm": 0.12189560383558273, + "learning_rate": 0.00025411723660790427, + "loss": 2.0389, + "step": 274510 + }, + { + "epoch": 1.0448908749038923, + "grad_norm": 0.12811321020126343, + "learning_rate": 0.00025403252247502316, + "loss": 2.023, + "step": 274520 + }, + { + "epoch": 1.044928937372015, + "grad_norm": 0.16220244765281677, + "learning_rate": 0.0002539478375086562, + "loss": 2.037, + "step": 274530 + }, + { + "epoch": 1.0449669998401376, + "grad_norm": 0.13246352970600128, + "learning_rate": 0.0002538631816786986, + "loss": 2.0438, + "step": 274540 + }, + { + "epoch": 1.0450050623082603, + "grad_norm": 0.12363100796937943, + "learning_rate": 0.00025377855495509736, + "loss": 2.0369, + "step": 274550 + }, + { + "epoch": 1.045043124776383, + "grad_norm": 0.13489417731761932, + "learning_rate": 0.00025369395730785114, + "loss": 2.0226, + "step": 274560 + }, + { + "epoch": 1.0450811872445056, + "grad_norm": 0.13529403507709503, + "learning_rate": 0.0002536093887070099, + "loss": 2.0421, + "step": 274570 + }, + { + "epoch": 1.0451192497126283, + "grad_norm": 0.13258054852485657, + "learning_rate": 0.00025352484912267526, + "loss": 2.0218, + "step": 274580 + }, + { + "epoch": 1.0451573121807511, + "grad_norm": 0.14793935418128967, + "learning_rate": 0.0002534403385249999, + "loss": 2.0355, + "step": 274590 + }, + { + "epoch": 1.0451953746488738, + "grad_norm": 0.1330820918083191, + "learning_rate": 0.0002533558568841876, + "loss": 2.0364, + "step": 274600 + }, + { + "epoch": 1.0452334371169965, + "grad_norm": 0.13027338683605194, + "learning_rate": 0.0002532714041704934, + "loss": 2.0392, + "step": 274610 + }, + { + "epoch": 1.0452714995851191, + "grad_norm": 0.13190694153308868, + "learning_rate": 0.00025318698035422307, + "loss": 2.0415, + "step": 274620 + }, + { + "epoch": 1.0453095620532418, + "grad_norm": 0.1548970639705658, + "learning_rate": 0.00025310258540573306, + "loss": 2.0412, + "step": 274630 + }, + { + "epoch": 1.0453476245213644, + "grad_norm": 0.16039757430553436, + "learning_rate": 0.0002530182192954306, + "loss": 2.041, + "step": 274640 + }, + { + "epoch": 1.045385686989487, + "grad_norm": 0.13856640458106995, + "learning_rate": 0.0002529338819937734, + "loss": 2.0256, + "step": 274650 + }, + { + "epoch": 1.0454237494576097, + "grad_norm": 0.13383691012859344, + "learning_rate": 0.0002528495734712697, + "loss": 2.0324, + "step": 274660 + }, + { + "epoch": 1.0454618119257324, + "grad_norm": 0.12692676484584808, + "learning_rate": 0.00025276529369847766, + "loss": 2.0259, + "step": 274670 + }, + { + "epoch": 1.0454998743938553, + "grad_norm": 0.1338866949081421, + "learning_rate": 0.0002526810426460061, + "loss": 2.0271, + "step": 274680 + }, + { + "epoch": 1.045537936861978, + "grad_norm": 0.14236962795257568, + "learning_rate": 0.0002525968202845135, + "loss": 2.0362, + "step": 274690 + }, + { + "epoch": 1.0455759993301006, + "grad_norm": 0.14252808690071106, + "learning_rate": 0.00025251262658470836, + "loss": 2.0381, + "step": 274700 + }, + { + "epoch": 1.0456140617982232, + "grad_norm": 0.14136750996112823, + "learning_rate": 0.00025242846151734915, + "loss": 2.031, + "step": 274710 + }, + { + "epoch": 1.045652124266346, + "grad_norm": 0.12346253544092178, + "learning_rate": 0.0002523443250532439, + "loss": 2.0303, + "step": 274720 + }, + { + "epoch": 1.0456901867344686, + "grad_norm": 0.17243121564388275, + "learning_rate": 0.0002522602171632501, + "loss": 2.0304, + "step": 274730 + }, + { + "epoch": 1.0457282492025912, + "grad_norm": 0.14957967400550842, + "learning_rate": 0.0002521761378182749, + "loss": 2.0196, + "step": 274740 + }, + { + "epoch": 1.0457663116707139, + "grad_norm": 0.1367514431476593, + "learning_rate": 0.00025209208698927476, + "loss": 2.0339, + "step": 274750 + }, + { + "epoch": 1.0458043741388368, + "grad_norm": 0.13444387912750244, + "learning_rate": 0.0002520080646472551, + "loss": 2.0379, + "step": 274760 + }, + { + "epoch": 1.0458424366069594, + "grad_norm": 0.12551210820674896, + "learning_rate": 0.00025192407076327084, + "loss": 2.0408, + "step": 274770 + }, + { + "epoch": 1.045880499075082, + "grad_norm": 0.13665346801280975, + "learning_rate": 0.00025184010530842553, + "loss": 2.0374, + "step": 274780 + }, + { + "epoch": 1.0459185615432047, + "grad_norm": 0.13541540503501892, + "learning_rate": 0.00025175616825387183, + "loss": 2.0345, + "step": 274790 + }, + { + "epoch": 1.0459566240113274, + "grad_norm": 0.12491797655820847, + "learning_rate": 0.00025167225957081096, + "loss": 2.0243, + "step": 274800 + }, + { + "epoch": 1.04599468647945, + "grad_norm": 0.13685432076454163, + "learning_rate": 0.00025158837923049305, + "loss": 2.0278, + "step": 274810 + }, + { + "epoch": 1.0460327489475727, + "grad_norm": 0.14867368340492249, + "learning_rate": 0.0002515045272042165, + "loss": 2.0173, + "step": 274820 + }, + { + "epoch": 1.0460708114156954, + "grad_norm": 0.15146611630916595, + "learning_rate": 0.00025142070346332815, + "loss": 2.0342, + "step": 274830 + }, + { + "epoch": 1.046108873883818, + "grad_norm": 0.1487848460674286, + "learning_rate": 0.0002513369079792232, + "loss": 2.0382, + "step": 274840 + }, + { + "epoch": 1.046146936351941, + "grad_norm": 0.13378088176250458, + "learning_rate": 0.00025125314072334503, + "loss": 2.0527, + "step": 274850 + }, + { + "epoch": 1.0461849988200635, + "grad_norm": 0.14431817829608917, + "learning_rate": 0.0002511694016671851, + "loss": 2.0422, + "step": 274860 + }, + { + "epoch": 1.0462230612881862, + "grad_norm": 0.1392163783311844, + "learning_rate": 0.00025108569078228284, + "loss": 2.0454, + "step": 274870 + }, + { + "epoch": 1.0462611237563089, + "grad_norm": 0.15907195210456848, + "learning_rate": 0.00025100200804022534, + "loss": 2.0303, + "step": 274880 + }, + { + "epoch": 1.0462991862244315, + "grad_norm": 0.14727604389190674, + "learning_rate": 0.00025091835341264766, + "loss": 2.0254, + "step": 274890 + }, + { + "epoch": 1.0463372486925542, + "grad_norm": 0.14289073646068573, + "learning_rate": 0.00025083472687123244, + "loss": 2.04, + "step": 274900 + }, + { + "epoch": 1.0463753111606768, + "grad_norm": 0.12701305747032166, + "learning_rate": 0.00025075112838770966, + "loss": 2.03, + "step": 274910 + }, + { + "epoch": 1.0464133736287995, + "grad_norm": 0.1313587874174118, + "learning_rate": 0.00025066755793385677, + "loss": 2.0391, + "step": 274920 + }, + { + "epoch": 1.0464514360969224, + "grad_norm": 0.16116932034492493, + "learning_rate": 0.0002505840154814986, + "loss": 2.033, + "step": 274930 + }, + { + "epoch": 1.046489498565045, + "grad_norm": 0.14866022765636444, + "learning_rate": 0.00025050050100250697, + "loss": 2.047, + "step": 274940 + }, + { + "epoch": 1.0465275610331677, + "grad_norm": 0.13143713772296906, + "learning_rate": 0.00025041701446880106, + "loss": 2.017, + "step": 274950 + }, + { + "epoch": 1.0465656235012903, + "grad_norm": 0.13434137403964996, + "learning_rate": 0.0002503335558523466, + "loss": 2.025, + "step": 274960 + }, + { + "epoch": 1.046603685969413, + "grad_norm": 0.13187500834465027, + "learning_rate": 0.0002502501251251564, + "loss": 2.017, + "step": 274970 + }, + { + "epoch": 1.0466417484375357, + "grad_norm": 0.15085522830486298, + "learning_rate": 0.00025016672225929015, + "loss": 2.029, + "step": 274980 + }, + { + "epoch": 1.0466798109056583, + "grad_norm": 0.1359572410583496, + "learning_rate": 0.0002500833472268538, + "loss": 2.0563, + "step": 274990 + }, + { + "epoch": 1.046717873373781, + "grad_norm": 0.13730421662330627, + "learning_rate": 0.00025, + "loss": 2.0221, + "step": 275000 + }, + { + "epoch": 1.0467559358419036, + "grad_norm": 0.15225379168987274, + "learning_rate": 0.0002499166805509279, + "loss": 2.0296, + "step": 275010 + }, + { + "epoch": 1.0467939983100265, + "grad_norm": 0.1194249764084816, + "learning_rate": 0.0002498333888518827, + "loss": 2.0216, + "step": 275020 + }, + { + "epoch": 1.0468320607781492, + "grad_norm": 0.1379227489233017, + "learning_rate": 0.00024975012487515603, + "loss": 2.0371, + "step": 275030 + }, + { + "epoch": 1.0468701232462718, + "grad_norm": 0.27223625779151917, + "learning_rate": 0.0002496668885930855, + "loss": 2.0251, + "step": 275040 + }, + { + "epoch": 1.0469081857143945, + "grad_norm": 0.1424230933189392, + "learning_rate": 0.0002495836799780547, + "loss": 2.0391, + "step": 275050 + }, + { + "epoch": 1.0469462481825171, + "grad_norm": 0.13158190250396729, + "learning_rate": 0.0002495004990024931, + "loss": 2.0478, + "step": 275060 + }, + { + "epoch": 1.0469843106506398, + "grad_norm": 0.1202026754617691, + "learning_rate": 0.00024941734563887574, + "loss": 2.0332, + "step": 275070 + }, + { + "epoch": 1.0470223731187625, + "grad_norm": 0.12430751323699951, + "learning_rate": 0.0002493342198597237, + "loss": 2.0147, + "step": 275080 + }, + { + "epoch": 1.047060435586885, + "grad_norm": 0.13746048510074615, + "learning_rate": 0.00024925112163760334, + "loss": 2.0338, + "step": 275090 + }, + { + "epoch": 1.047098498055008, + "grad_norm": 0.12649886310100555, + "learning_rate": 0.0002491680509451265, + "loss": 2.0366, + "step": 275100 + }, + { + "epoch": 1.0471365605231306, + "grad_norm": 0.14034858345985413, + "learning_rate": 0.00024908500775495035, + "loss": 2.034, + "step": 275110 + }, + { + "epoch": 1.0471746229912533, + "grad_norm": 0.14112162590026855, + "learning_rate": 0.00024900199203977736, + "loss": 2.0321, + "step": 275120 + }, + { + "epoch": 1.047212685459376, + "grad_norm": 0.1548360139131546, + "learning_rate": 0.00024891900377235505, + "loss": 2.0299, + "step": 275130 + }, + { + "epoch": 1.0472507479274986, + "grad_norm": 0.13118277490139008, + "learning_rate": 0.000248836042925476, + "loss": 2.0324, + "step": 275140 + }, + { + "epoch": 1.0472888103956213, + "grad_norm": 0.14753767848014832, + "learning_rate": 0.00024875310947197776, + "loss": 2.029, + "step": 275150 + }, + { + "epoch": 1.047326872863744, + "grad_norm": 0.14345189929008484, + "learning_rate": 0.0002486702033847426, + "loss": 2.0378, + "step": 275160 + }, + { + "epoch": 1.0473649353318666, + "grad_norm": 0.12945662438869476, + "learning_rate": 0.0002485873246366975, + "loss": 2.0304, + "step": 275170 + }, + { + "epoch": 1.0474029977999892, + "grad_norm": 0.1253436952829361, + "learning_rate": 0.0002485044732008142, + "loss": 2.0398, + "step": 275180 + }, + { + "epoch": 1.0474410602681121, + "grad_norm": 0.13015608489513397, + "learning_rate": 0.0002484216490501087, + "loss": 2.0305, + "step": 275190 + }, + { + "epoch": 1.0474791227362348, + "grad_norm": 0.14374291896820068, + "learning_rate": 0.00024833885215764167, + "loss": 2.0542, + "step": 275200 + }, + { + "epoch": 1.0475171852043574, + "grad_norm": 0.12700749933719635, + "learning_rate": 0.00024825608249651795, + "loss": 2.0357, + "step": 275210 + }, + { + "epoch": 1.04755524767248, + "grad_norm": 0.1461777687072754, + "learning_rate": 0.00024817334003988646, + "loss": 2.0085, + "step": 275220 + }, + { + "epoch": 1.0475933101406028, + "grad_norm": 0.14648723602294922, + "learning_rate": 0.00024809062476094043, + "loss": 2.0371, + "step": 275230 + }, + { + "epoch": 1.0476313726087254, + "grad_norm": 0.1811038702726364, + "learning_rate": 0.0002480079366329169, + "loss": 2.0408, + "step": 275240 + }, + { + "epoch": 1.047669435076848, + "grad_norm": 0.14561040699481964, + "learning_rate": 0.0002479252756290971, + "loss": 2.0397, + "step": 275250 + }, + { + "epoch": 1.0477074975449707, + "grad_norm": 0.15173044800758362, + "learning_rate": 0.00024784264172280566, + "loss": 2.0314, + "step": 275260 + }, + { + "epoch": 1.0477455600130936, + "grad_norm": 0.13430675864219666, + "learning_rate": 0.00024776003488741124, + "loss": 2.037, + "step": 275270 + }, + { + "epoch": 1.0477836224812163, + "grad_norm": 0.13322807848453522, + "learning_rate": 0.0002476774550963258, + "loss": 2.0252, + "step": 275280 + }, + { + "epoch": 1.047821684949339, + "grad_norm": 0.13812817633152008, + "learning_rate": 0.00024759490232300507, + "loss": 2.047, + "step": 275290 + }, + { + "epoch": 1.0478597474174616, + "grad_norm": 0.12764814496040344, + "learning_rate": 0.000247512376540948, + "loss": 2.043, + "step": 275300 + }, + { + "epoch": 1.0478978098855842, + "grad_norm": 0.1589909940958023, + "learning_rate": 0.00024742987772369697, + "loss": 2.0366, + "step": 275310 + }, + { + "epoch": 1.0479358723537069, + "grad_norm": 0.14342698454856873, + "learning_rate": 0.0002473474058448373, + "loss": 2.014, + "step": 275320 + }, + { + "epoch": 1.0479739348218295, + "grad_norm": 0.14049477875232697, + "learning_rate": 0.00024726496087799774, + "loss": 2.0314, + "step": 275330 + }, + { + "epoch": 1.0480119972899522, + "grad_norm": 0.14000898599624634, + "learning_rate": 0.0002471825427968498, + "loss": 2.0284, + "step": 275340 + }, + { + "epoch": 1.0480500597580749, + "grad_norm": 0.12767021358013153, + "learning_rate": 0.00024710015157510803, + "loss": 2.0357, + "step": 275350 + }, + { + "epoch": 1.0480881222261977, + "grad_norm": 0.13420067727565765, + "learning_rate": 0.00024701778718652965, + "loss": 2.028, + "step": 275360 + }, + { + "epoch": 1.0481261846943204, + "grad_norm": 0.13943490386009216, + "learning_rate": 0.0002469354496049147, + "loss": 2.0407, + "step": 275370 + }, + { + "epoch": 1.048164247162443, + "grad_norm": 0.1490197330713272, + "learning_rate": 0.00024685313880410574, + "loss": 2.0161, + "step": 275380 + }, + { + "epoch": 1.0482023096305657, + "grad_norm": 0.14449959993362427, + "learning_rate": 0.000246770854757988, + "loss": 2.026, + "step": 275390 + }, + { + "epoch": 1.0482403720986884, + "grad_norm": 0.13023877143859863, + "learning_rate": 0.00024668859744048896, + "loss": 2.0419, + "step": 275400 + }, + { + "epoch": 1.048278434566811, + "grad_norm": 0.1355513632297516, + "learning_rate": 0.0002466063668255784, + "loss": 2.0382, + "step": 275410 + }, + { + "epoch": 1.0483164970349337, + "grad_norm": 0.12848785519599915, + "learning_rate": 0.0002465241628872685, + "loss": 2.0306, + "step": 275420 + }, + { + "epoch": 1.0483545595030563, + "grad_norm": 0.14631952345371246, + "learning_rate": 0.0002464419855996134, + "loss": 2.0328, + "step": 275430 + }, + { + "epoch": 1.048392621971179, + "grad_norm": 0.13640548288822174, + "learning_rate": 0.0002463598349367093, + "loss": 2.0235, + "step": 275440 + }, + { + "epoch": 1.0484306844393019, + "grad_norm": 0.13725143671035767, + "learning_rate": 0.0002462777108726945, + "loss": 2.0416, + "step": 275450 + }, + { + "epoch": 1.0484687469074245, + "grad_norm": 0.14505118131637573, + "learning_rate": 0.0002461956133817489, + "loss": 2.0331, + "step": 275460 + }, + { + "epoch": 1.0485068093755472, + "grad_norm": 0.1433112770318985, + "learning_rate": 0.0002461135424380943, + "loss": 2.039, + "step": 275470 + }, + { + "epoch": 1.0485448718436698, + "grad_norm": 0.13896414637565613, + "learning_rate": 0.00024603149801599413, + "loss": 2.0261, + "step": 275480 + }, + { + "epoch": 1.0485829343117925, + "grad_norm": 0.15077711641788483, + "learning_rate": 0.00024594948008975324, + "loss": 2.0315, + "step": 275490 + }, + { + "epoch": 1.0486209967799152, + "grad_norm": 0.11837725341320038, + "learning_rate": 0.0002458674886337182, + "loss": 2.0328, + "step": 275500 + }, + { + "epoch": 1.0486590592480378, + "grad_norm": 0.14062966406345367, + "learning_rate": 0.0002457855236222768, + "loss": 2.0226, + "step": 275510 + }, + { + "epoch": 1.0486971217161605, + "grad_norm": 0.16681206226348877, + "learning_rate": 0.000245703585029858, + "loss": 2.0255, + "step": 275520 + }, + { + "epoch": 1.0487351841842831, + "grad_norm": 0.12906672060489655, + "learning_rate": 0.0002456216728309321, + "loss": 2.0141, + "step": 275530 + }, + { + "epoch": 1.048773246652406, + "grad_norm": 0.13691098988056183, + "learning_rate": 0.00024553978700001054, + "loss": 2.0251, + "step": 275540 + }, + { + "epoch": 1.0488113091205287, + "grad_norm": 0.13565468788146973, + "learning_rate": 0.0002454579275116456, + "loss": 2.0432, + "step": 275550 + }, + { + "epoch": 1.0488493715886513, + "grad_norm": 0.13613511621952057, + "learning_rate": 0.0002453760943404305, + "loss": 2.0194, + "step": 275560 + }, + { + "epoch": 1.048887434056774, + "grad_norm": 0.16795943677425385, + "learning_rate": 0.0002452942874609994, + "loss": 2.0425, + "step": 275570 + }, + { + "epoch": 1.0489254965248966, + "grad_norm": 0.14237631857395172, + "learning_rate": 0.00024521250684802706, + "loss": 2.0397, + "step": 275580 + }, + { + "epoch": 1.0489635589930193, + "grad_norm": 0.14784595370292664, + "learning_rate": 0.0002451307524762288, + "loss": 2.0284, + "step": 275590 + }, + { + "epoch": 1.049001621461142, + "grad_norm": 0.1270940750837326, + "learning_rate": 0.0002450490243203607, + "loss": 2.0278, + "step": 275600 + }, + { + "epoch": 1.0490396839292646, + "grad_norm": 0.12831611931324005, + "learning_rate": 0.00024496732235521925, + "loss": 2.0134, + "step": 275610 + }, + { + "epoch": 1.0490777463973875, + "grad_norm": 0.1486426740884781, + "learning_rate": 0.000244885646555641, + "loss": 2.0376, + "step": 275620 + }, + { + "epoch": 1.0491158088655101, + "grad_norm": 0.12868672609329224, + "learning_rate": 0.0002448039968965031, + "loss": 2.0269, + "step": 275630 + }, + { + "epoch": 1.0491538713336328, + "grad_norm": 0.1494320034980774, + "learning_rate": 0.00024472237335272277, + "loss": 2.0179, + "step": 275640 + }, + { + "epoch": 1.0491919338017555, + "grad_norm": 0.14237873256206512, + "learning_rate": 0.0002446407758992573, + "loss": 2.0128, + "step": 275650 + }, + { + "epoch": 1.0492299962698781, + "grad_norm": 0.14436769485473633, + "learning_rate": 0.00024455920451110395, + "loss": 2.0428, + "step": 275660 + }, + { + "epoch": 1.0492680587380008, + "grad_norm": 0.13775093853473663, + "learning_rate": 0.00024447765916330006, + "loss": 2.0403, + "step": 275670 + }, + { + "epoch": 1.0493061212061234, + "grad_norm": 0.12932050228118896, + "learning_rate": 0.0002443961398309225, + "loss": 2.0173, + "step": 275680 + }, + { + "epoch": 1.049344183674246, + "grad_norm": 0.12811337411403656, + "learning_rate": 0.0002443146464890881, + "loss": 2.0213, + "step": 275690 + }, + { + "epoch": 1.0493822461423687, + "grad_norm": 0.13555051386356354, + "learning_rate": 0.0002442331791129532, + "loss": 2.0079, + "step": 275700 + }, + { + "epoch": 1.0494203086104916, + "grad_norm": 0.1272488683462143, + "learning_rate": 0.00024415173767771385, + "loss": 2.0245, + "step": 275710 + }, + { + "epoch": 1.0494583710786143, + "grad_norm": 0.14617857336997986, + "learning_rate": 0.00024407032215860547, + "loss": 2.0303, + "step": 275720 + }, + { + "epoch": 1.049496433546737, + "grad_norm": 0.13590756058692932, + "learning_rate": 0.00024398893253090275, + "loss": 2.035, + "step": 275730 + }, + { + "epoch": 1.0495344960148596, + "grad_norm": 0.1549926996231079, + "learning_rate": 0.00024390756876991988, + "loss": 2.0305, + "step": 275740 + }, + { + "epoch": 1.0495725584829823, + "grad_norm": 0.13491860032081604, + "learning_rate": 0.00024382623085101, + "loss": 2.034, + "step": 275750 + }, + { + "epoch": 1.049610620951105, + "grad_norm": 0.14095671474933624, + "learning_rate": 0.00024374491874956571, + "loss": 2.0196, + "step": 275760 + }, + { + "epoch": 1.0496486834192276, + "grad_norm": 0.15967071056365967, + "learning_rate": 0.00024366363244101837, + "loss": 2.0319, + "step": 275770 + }, + { + "epoch": 1.0496867458873502, + "grad_norm": 0.16383443772792816, + "learning_rate": 0.0002435823719008383, + "loss": 2.027, + "step": 275780 + }, + { + "epoch": 1.049724808355473, + "grad_norm": 0.1587381362915039, + "learning_rate": 0.0002435011371045348, + "loss": 2.0313, + "step": 275790 + }, + { + "epoch": 1.0497628708235958, + "grad_norm": 0.13336656987667084, + "learning_rate": 0.00024341992802765584, + "loss": 2.023, + "step": 275800 + }, + { + "epoch": 1.0498009332917184, + "grad_norm": 0.14063481986522675, + "learning_rate": 0.00024333874464578804, + "loss": 2.0407, + "step": 275810 + }, + { + "epoch": 1.049838995759841, + "grad_norm": 0.134406179189682, + "learning_rate": 0.0002432575869345568, + "loss": 2.0231, + "step": 275820 + }, + { + "epoch": 1.0498770582279637, + "grad_norm": 0.12986144423484802, + "learning_rate": 0.00024317645486962586, + "loss": 2.0294, + "step": 275830 + }, + { + "epoch": 1.0499151206960864, + "grad_norm": 0.13074316084384918, + "learning_rate": 0.00024309534842669738, + "loss": 2.0335, + "step": 275840 + }, + { + "epoch": 1.049953183164209, + "grad_norm": 0.11960646510124207, + "learning_rate": 0.00024301426758151212, + "loss": 2.0371, + "step": 275850 + }, + { + "epoch": 1.0499912456323317, + "grad_norm": 0.15164650976657867, + "learning_rate": 0.0002429332123098487, + "loss": 2.04, + "step": 275860 + }, + { + "epoch": 1.0500293081004544, + "grad_norm": 0.14507368206977844, + "learning_rate": 0.00024285218258752417, + "loss": 2.0258, + "step": 275870 + }, + { + "epoch": 1.0500673705685772, + "grad_norm": 0.13732139766216278, + "learning_rate": 0.0002427711783903937, + "loss": 2.0295, + "step": 275880 + }, + { + "epoch": 1.0501054330367, + "grad_norm": 0.13564445078372955, + "learning_rate": 0.0002426901996943503, + "loss": 2.0415, + "step": 275890 + }, + { + "epoch": 1.0501434955048226, + "grad_norm": 0.14474515616893768, + "learning_rate": 0.000242609246475325, + "loss": 2.0391, + "step": 275900 + }, + { + "epoch": 1.0501815579729452, + "grad_norm": 0.176593616604805, + "learning_rate": 0.00024252831870928666, + "loss": 2.0354, + "step": 275910 + }, + { + "epoch": 1.0502196204410679, + "grad_norm": 0.1409319043159485, + "learning_rate": 0.00024244741637224193, + "loss": 2.0167, + "step": 275920 + }, + { + "epoch": 1.0502576829091905, + "grad_norm": 0.14716550707817078, + "learning_rate": 0.00024236653944023496, + "loss": 2.0202, + "step": 275930 + }, + { + "epoch": 1.0502957453773132, + "grad_norm": 0.1268341839313507, + "learning_rate": 0.0002422856878893478, + "loss": 2.0114, + "step": 275940 + }, + { + "epoch": 1.0503338078454358, + "grad_norm": 0.12487328052520752, + "learning_rate": 0.00024220486169569967, + "loss": 2.0437, + "step": 275950 + }, + { + "epoch": 1.0503718703135587, + "grad_norm": 0.13905835151672363, + "learning_rate": 0.00024212406083544747, + "loss": 2.0203, + "step": 275960 + }, + { + "epoch": 1.0504099327816814, + "grad_norm": 0.13599219918251038, + "learning_rate": 0.0002420432852847853, + "loss": 2.021, + "step": 275970 + }, + { + "epoch": 1.050447995249804, + "grad_norm": 0.13935434818267822, + "learning_rate": 0.00024196253501994458, + "loss": 2.028, + "step": 275980 + }, + { + "epoch": 1.0504860577179267, + "grad_norm": 0.15049731731414795, + "learning_rate": 0.0002418818100171939, + "loss": 2.0315, + "step": 275990 + }, + { + "epoch": 1.0505241201860493, + "grad_norm": 0.13765917718410492, + "learning_rate": 0.0002418011102528389, + "loss": 2.0183, + "step": 276000 + }, + { + "epoch": 1.050562182654172, + "grad_norm": 0.13514918088912964, + "learning_rate": 0.00024172043570322226, + "loss": 2.0324, + "step": 276010 + }, + { + "epoch": 1.0506002451222947, + "grad_norm": 0.12284824997186661, + "learning_rate": 0.00024163978634472372, + "loss": 2.037, + "step": 276020 + }, + { + "epoch": 1.0506383075904173, + "grad_norm": 0.13643833994865417, + "learning_rate": 0.00024155916215375973, + "loss": 2.0215, + "step": 276030 + }, + { + "epoch": 1.05067637005854, + "grad_norm": 0.1364985704421997, + "learning_rate": 0.0002414785631067835, + "loss": 2.0274, + "step": 276040 + }, + { + "epoch": 1.0507144325266629, + "grad_norm": 0.13563884794712067, + "learning_rate": 0.00024139798918028498, + "loss": 2.0269, + "step": 276050 + }, + { + "epoch": 1.0507524949947855, + "grad_norm": 0.15985845029354095, + "learning_rate": 0.00024131744035079083, + "loss": 2.0319, + "step": 276060 + }, + { + "epoch": 1.0507905574629082, + "grad_norm": 0.13042674958705902, + "learning_rate": 0.0002412369165948641, + "loss": 2.0382, + "step": 276070 + }, + { + "epoch": 1.0508286199310308, + "grad_norm": 0.13331478834152222, + "learning_rate": 0.0002411564178891043, + "loss": 2.0234, + "step": 276080 + }, + { + "epoch": 1.0508666823991535, + "grad_norm": 0.13373886048793793, + "learning_rate": 0.00024107594421014745, + "loss": 2.0215, + "step": 276090 + }, + { + "epoch": 1.0509047448672761, + "grad_norm": 0.1378992199897766, + "learning_rate": 0.00024099549553466582, + "loss": 2.0323, + "step": 276100 + }, + { + "epoch": 1.0509428073353988, + "grad_norm": 0.14647714793682098, + "learning_rate": 0.00024091507183936767, + "loss": 2.0383, + "step": 276110 + }, + { + "epoch": 1.0509808698035215, + "grad_norm": 0.1651303470134735, + "learning_rate": 0.00024083467310099784, + "loss": 2.0249, + "step": 276120 + }, + { + "epoch": 1.0510189322716443, + "grad_norm": 0.12446706742048264, + "learning_rate": 0.0002407542992963368, + "loss": 2.0287, + "step": 276130 + }, + { + "epoch": 1.051056994739767, + "grad_norm": 0.1194441169500351, + "learning_rate": 0.0002406739504022012, + "loss": 2.0217, + "step": 276140 + }, + { + "epoch": 1.0510950572078896, + "grad_norm": 0.132504403591156, + "learning_rate": 0.0002405936263954437, + "loss": 2.0266, + "step": 276150 + }, + { + "epoch": 1.0511331196760123, + "grad_norm": 0.11552822589874268, + "learning_rate": 0.0002405133272529525, + "loss": 2.0301, + "step": 276160 + }, + { + "epoch": 1.051171182144135, + "grad_norm": 0.13374902307987213, + "learning_rate": 0.0002404330529516518, + "loss": 2.0188, + "step": 276170 + }, + { + "epoch": 1.0512092446122576, + "grad_norm": 0.1289547234773636, + "learning_rate": 0.0002403528034685014, + "loss": 2.0263, + "step": 276180 + }, + { + "epoch": 1.0512473070803803, + "grad_norm": 0.1595178246498108, + "learning_rate": 0.00024027257878049663, + "loss": 2.0196, + "step": 276190 + }, + { + "epoch": 1.051285369548503, + "grad_norm": 0.13148796558380127, + "learning_rate": 0.0002401923788646684, + "loss": 2.0193, + "step": 276200 + }, + { + "epoch": 1.0513234320166256, + "grad_norm": 0.1301811933517456, + "learning_rate": 0.00024011220369808307, + "loss": 2.0175, + "step": 276210 + }, + { + "epoch": 1.0513614944847485, + "grad_norm": 0.17643029987812042, + "learning_rate": 0.00024003205325784233, + "loss": 2.0483, + "step": 276220 + }, + { + "epoch": 1.0513995569528711, + "grad_norm": 0.14449796080589294, + "learning_rate": 0.00023995192752108313, + "loss": 2.0255, + "step": 276230 + }, + { + "epoch": 1.0514376194209938, + "grad_norm": 0.1294335275888443, + "learning_rate": 0.00023987182646497773, + "loss": 2.0206, + "step": 276240 + }, + { + "epoch": 1.0514756818891164, + "grad_norm": 0.1255398392677307, + "learning_rate": 0.00023979175006673344, + "loss": 2.0332, + "step": 276250 + }, + { + "epoch": 1.051513744357239, + "grad_norm": 0.12768849730491638, + "learning_rate": 0.00023971169830359259, + "loss": 2.0397, + "step": 276260 + }, + { + "epoch": 1.0515518068253618, + "grad_norm": 0.13748249411582947, + "learning_rate": 0.0002396316711528327, + "loss": 2.0474, + "step": 276270 + }, + { + "epoch": 1.0515898692934844, + "grad_norm": 0.13064044713974, + "learning_rate": 0.00023955166859176592, + "loss": 2.0314, + "step": 276280 + }, + { + "epoch": 1.051627931761607, + "grad_norm": 0.1798001527786255, + "learning_rate": 0.00023947169059773942, + "loss": 2.0249, + "step": 276290 + }, + { + "epoch": 1.0516659942297297, + "grad_norm": 0.14949892461299896, + "learning_rate": 0.00023939173714813516, + "loss": 2.027, + "step": 276300 + }, + { + "epoch": 1.0517040566978526, + "grad_norm": 0.13493582606315613, + "learning_rate": 0.0002393118082203697, + "loss": 2.0383, + "step": 276310 + }, + { + "epoch": 1.0517421191659753, + "grad_norm": 0.1302517056465149, + "learning_rate": 0.000239231903791894, + "loss": 2.0331, + "step": 276320 + }, + { + "epoch": 1.051780181634098, + "grad_norm": 0.12950988113880157, + "learning_rate": 0.00023915202384019413, + "loss": 2.0274, + "step": 276330 + }, + { + "epoch": 1.0518182441022206, + "grad_norm": 0.1417592167854309, + "learning_rate": 0.00023907216834279, + "loss": 2.032, + "step": 276340 + }, + { + "epoch": 1.0518563065703432, + "grad_norm": 0.17708420753479004, + "learning_rate": 0.00023899233727723624, + "loss": 2.0362, + "step": 276350 + }, + { + "epoch": 1.051894369038466, + "grad_norm": 0.15068641304969788, + "learning_rate": 0.00023891253062112178, + "loss": 2.0568, + "step": 276360 + }, + { + "epoch": 1.0519324315065886, + "grad_norm": 0.14147871732711792, + "learning_rate": 0.00023883274835206976, + "loss": 2.0339, + "step": 276370 + }, + { + "epoch": 1.0519704939747112, + "grad_norm": 0.14862096309661865, + "learning_rate": 0.00023875299044773734, + "loss": 2.0263, + "step": 276380 + }, + { + "epoch": 1.052008556442834, + "grad_norm": 0.15561573207378387, + "learning_rate": 0.0002386732568858161, + "loss": 2.0253, + "step": 276390 + }, + { + "epoch": 1.0520466189109567, + "grad_norm": 0.14745838940143585, + "learning_rate": 0.0002385935476440313, + "loss": 2.0103, + "step": 276400 + }, + { + "epoch": 1.0520846813790794, + "grad_norm": 0.1444399207830429, + "learning_rate": 0.00023851386270014235, + "loss": 2.0236, + "step": 276410 + }, + { + "epoch": 1.052122743847202, + "grad_norm": 0.156083881855011, + "learning_rate": 0.0002384342020319425, + "loss": 2.0307, + "step": 276420 + }, + { + "epoch": 1.0521608063153247, + "grad_norm": 0.13538765907287598, + "learning_rate": 0.00023835456561725882, + "loss": 2.0299, + "step": 276430 + }, + { + "epoch": 1.0521988687834474, + "grad_norm": 0.13774365186691284, + "learning_rate": 0.000238274953433952, + "loss": 2.0312, + "step": 276440 + }, + { + "epoch": 1.05223693125157, + "grad_norm": 0.17226563394069672, + "learning_rate": 0.00023819536545991655, + "loss": 2.0228, + "step": 276450 + }, + { + "epoch": 1.0522749937196927, + "grad_norm": 0.14496228098869324, + "learning_rate": 0.00023811580167308044, + "loss": 2.0265, + "step": 276460 + }, + { + "epoch": 1.0523130561878153, + "grad_norm": 0.13886311650276184, + "learning_rate": 0.00023803626205140527, + "loss": 2.0403, + "step": 276470 + }, + { + "epoch": 1.0523511186559382, + "grad_norm": 0.13826139271259308, + "learning_rate": 0.00023795674657288608, + "loss": 2.0228, + "step": 276480 + }, + { + "epoch": 1.0523891811240609, + "grad_norm": 0.12250851094722748, + "learning_rate": 0.00023787725521555108, + "loss": 2.037, + "step": 276490 + }, + { + "epoch": 1.0524272435921835, + "grad_norm": 0.1252516806125641, + "learning_rate": 0.0002377977879574621, + "loss": 2.0248, + "step": 276500 + }, + { + "epoch": 1.0524653060603062, + "grad_norm": 0.1385105699300766, + "learning_rate": 0.000237718344776714, + "loss": 2.0175, + "step": 276510 + }, + { + "epoch": 1.0525033685284289, + "grad_norm": 0.15018312633037567, + "learning_rate": 0.00023763892565143485, + "loss": 2.0349, + "step": 276520 + }, + { + "epoch": 1.0525414309965515, + "grad_norm": 0.12454996258020401, + "learning_rate": 0.0002375595305597858, + "loss": 2.0264, + "step": 276530 + }, + { + "epoch": 1.0525794934646742, + "grad_norm": 0.13958518207073212, + "learning_rate": 0.0002374801594799611, + "loss": 2.0332, + "step": 276540 + }, + { + "epoch": 1.0526175559327968, + "grad_norm": 0.16262899339199066, + "learning_rate": 0.00023740081239018786, + "loss": 2.0252, + "step": 276550 + }, + { + "epoch": 1.0526556184009195, + "grad_norm": 0.14138750731945038, + "learning_rate": 0.00023732148926872604, + "loss": 2.0178, + "step": 276560 + }, + { + "epoch": 1.0526936808690424, + "grad_norm": 0.1345914602279663, + "learning_rate": 0.00023724219009386865, + "loss": 2.0228, + "step": 276570 + }, + { + "epoch": 1.052731743337165, + "grad_norm": 0.12766501307487488, + "learning_rate": 0.00023716291484394115, + "loss": 2.0277, + "step": 276580 + }, + { + "epoch": 1.0527698058052877, + "grad_norm": 0.12269017845392227, + "learning_rate": 0.00023708366349730188, + "loss": 2.0324, + "step": 276590 + }, + { + "epoch": 1.0528078682734103, + "grad_norm": 0.14488330483436584, + "learning_rate": 0.00023700443603234169, + "loss": 2.0332, + "step": 276600 + }, + { + "epoch": 1.052845930741533, + "grad_norm": 0.15273307263851166, + "learning_rate": 0.000236925232427484, + "loss": 2.0324, + "step": 276610 + }, + { + "epoch": 1.0528839932096556, + "grad_norm": 0.14215412735939026, + "learning_rate": 0.00023684605266118463, + "loss": 2.0301, + "step": 276620 + }, + { + "epoch": 1.0529220556777783, + "grad_norm": 0.1337355077266693, + "learning_rate": 0.00023676689671193202, + "loss": 2.0284, + "step": 276630 + }, + { + "epoch": 1.052960118145901, + "grad_norm": 0.14778606593608856, + "learning_rate": 0.0002366877645582467, + "loss": 2.0232, + "step": 276640 + }, + { + "epoch": 1.0529981806140238, + "grad_norm": 0.14975306391716003, + "learning_rate": 0.00023660865617868154, + "loss": 2.0143, + "step": 276650 + }, + { + "epoch": 1.0530362430821465, + "grad_norm": 0.12948152422904968, + "learning_rate": 0.00023652957155182163, + "loss": 2.0306, + "step": 276660 + }, + { + "epoch": 1.0530743055502692, + "grad_norm": 0.1450575739145279, + "learning_rate": 0.00023645051065628427, + "loss": 2.0324, + "step": 276670 + }, + { + "epoch": 1.0531123680183918, + "grad_norm": 0.14224261045455933, + "learning_rate": 0.00023637147347071864, + "loss": 2.0158, + "step": 276680 + }, + { + "epoch": 1.0531504304865145, + "grad_norm": 0.12313112616539001, + "learning_rate": 0.0002362924599738061, + "loss": 2.026, + "step": 276690 + }, + { + "epoch": 1.0531884929546371, + "grad_norm": 0.13612818717956543, + "learning_rate": 0.00023621347014425977, + "loss": 2.0447, + "step": 276700 + }, + { + "epoch": 1.0532265554227598, + "grad_norm": 0.12820477783679962, + "learning_rate": 0.00023613450396082474, + "loss": 2.0325, + "step": 276710 + }, + { + "epoch": 1.0532646178908824, + "grad_norm": 0.1335642784833908, + "learning_rate": 0.00023605556140227795, + "loss": 2.0345, + "step": 276720 + }, + { + "epoch": 1.053302680359005, + "grad_norm": 0.1384914219379425, + "learning_rate": 0.0002359766424474279, + "loss": 2.0255, + "step": 276730 + }, + { + "epoch": 1.053340742827128, + "grad_norm": 0.12948639690876007, + "learning_rate": 0.00023589774707511485, + "loss": 2.015, + "step": 276740 + }, + { + "epoch": 1.0533788052952506, + "grad_norm": 0.16162152588367462, + "learning_rate": 0.0002358188752642107, + "loss": 2.021, + "step": 276750 + }, + { + "epoch": 1.0534168677633733, + "grad_norm": 0.12111581861972809, + "learning_rate": 0.00023574002699361878, + "loss": 2.0229, + "step": 276760 + }, + { + "epoch": 1.053454930231496, + "grad_norm": 0.1492808610200882, + "learning_rate": 0.00023566120224227395, + "loss": 2.0413, + "step": 276770 + }, + { + "epoch": 1.0534929926996186, + "grad_norm": 0.14243555068969727, + "learning_rate": 0.0002355824009891424, + "loss": 2.016, + "step": 276780 + }, + { + "epoch": 1.0535310551677413, + "grad_norm": 0.13206037878990173, + "learning_rate": 0.00023550362321322183, + "loss": 2.0301, + "step": 276790 + }, + { + "epoch": 1.053569117635864, + "grad_norm": 0.133035346865654, + "learning_rate": 0.0002354248688935409, + "loss": 2.0395, + "step": 276800 + }, + { + "epoch": 1.0536071801039866, + "grad_norm": 0.14485926926136017, + "learning_rate": 0.00023534613800915984, + "loss": 2.0231, + "step": 276810 + }, + { + "epoch": 1.0536452425721095, + "grad_norm": 0.12455225735902786, + "learning_rate": 0.00023526743053916971, + "loss": 2.0061, + "step": 276820 + }, + { + "epoch": 1.053683305040232, + "grad_norm": 0.15578719973564148, + "learning_rate": 0.0002351887464626928, + "loss": 2.0313, + "step": 276830 + }, + { + "epoch": 1.0537213675083548, + "grad_norm": 0.14091117680072784, + "learning_rate": 0.00023511008575888233, + "loss": 2.0345, + "step": 276840 + }, + { + "epoch": 1.0537594299764774, + "grad_norm": 0.14255601167678833, + "learning_rate": 0.0002350314484069226, + "loss": 2.0358, + "step": 276850 + }, + { + "epoch": 1.0537974924446, + "grad_norm": 0.13675963878631592, + "learning_rate": 0.00023495283438602855, + "loss": 2.0202, + "step": 276860 + }, + { + "epoch": 1.0538355549127227, + "grad_norm": 0.1342182606458664, + "learning_rate": 0.00023487424367544624, + "loss": 2.0126, + "step": 276870 + }, + { + "epoch": 1.0538736173808454, + "grad_norm": 0.12606917321681976, + "learning_rate": 0.0002347956762544522, + "loss": 2.037, + "step": 276880 + }, + { + "epoch": 1.053911679848968, + "grad_norm": 0.13598474860191345, + "learning_rate": 0.00023471713210235386, + "loss": 2.02, + "step": 276890 + }, + { + "epoch": 1.0539497423170907, + "grad_norm": 0.13164277374744415, + "learning_rate": 0.00023463861119848905, + "loss": 2.0234, + "step": 276900 + }, + { + "epoch": 1.0539878047852136, + "grad_norm": 0.1325596272945404, + "learning_rate": 0.0002345601135222264, + "loss": 2.036, + "step": 276910 + }, + { + "epoch": 1.0540258672533362, + "grad_norm": 0.1430538147687912, + "learning_rate": 0.00023448163905296493, + "loss": 2.0429, + "step": 276920 + }, + { + "epoch": 1.054063929721459, + "grad_norm": 0.1567510962486267, + "learning_rate": 0.00023440318777013408, + "loss": 2.0223, + "step": 276930 + }, + { + "epoch": 1.0541019921895816, + "grad_norm": 0.16093851625919342, + "learning_rate": 0.0002343247596531936, + "loss": 2.0205, + "step": 276940 + }, + { + "epoch": 1.0541400546577042, + "grad_norm": 0.13100646436214447, + "learning_rate": 0.0002342463546816338, + "loss": 2.027, + "step": 276950 + }, + { + "epoch": 1.0541781171258269, + "grad_norm": 0.14721788465976715, + "learning_rate": 0.00023416797283497487, + "loss": 2.0404, + "step": 276960 + }, + { + "epoch": 1.0542161795939495, + "grad_norm": 0.13433021306991577, + "learning_rate": 0.00023408961409276746, + "loss": 2.033, + "step": 276970 + }, + { + "epoch": 1.0542542420620722, + "grad_norm": 0.13327881693840027, + "learning_rate": 0.0002340112784345923, + "loss": 2.0492, + "step": 276980 + }, + { + "epoch": 1.054292304530195, + "grad_norm": 0.13001398742198944, + "learning_rate": 0.00023393296584006, + "loss": 2.0264, + "step": 276990 + }, + { + "epoch": 1.0543303669983177, + "grad_norm": 0.15639451146125793, + "learning_rate": 0.00023385467628881147, + "loss": 2.036, + "step": 277000 + }, + { + "epoch": 1.0543684294664404, + "grad_norm": 0.1349909007549286, + "learning_rate": 0.00023377640976051728, + "loss": 2.0342, + "step": 277010 + }, + { + "epoch": 1.054406491934563, + "grad_norm": 0.13593465089797974, + "learning_rate": 0.000233698166234878, + "loss": 2.0304, + "step": 277020 + }, + { + "epoch": 1.0544445544026857, + "grad_norm": 0.13852593302726746, + "learning_rate": 0.00023361994569162404, + "loss": 2.0327, + "step": 277030 + }, + { + "epoch": 1.0544826168708084, + "grad_norm": 0.1467350423336029, + "learning_rate": 0.0002335417481105155, + "loss": 2.0256, + "step": 277040 + }, + { + "epoch": 1.054520679338931, + "grad_norm": 0.1416371613740921, + "learning_rate": 0.00023346357347134205, + "loss": 2.0358, + "step": 277050 + }, + { + "epoch": 1.0545587418070537, + "grad_norm": 0.14200444519519806, + "learning_rate": 0.00023338542175392336, + "loss": 2.0335, + "step": 277060 + }, + { + "epoch": 1.0545968042751763, + "grad_norm": 0.13185830414295197, + "learning_rate": 0.00023330729293810827, + "loss": 2.0144, + "step": 277070 + }, + { + "epoch": 1.0546348667432992, + "grad_norm": 0.1465560793876648, + "learning_rate": 0.0002332291870037753, + "loss": 2.0302, + "step": 277080 + }, + { + "epoch": 1.0546729292114219, + "grad_norm": 0.12864398956298828, + "learning_rate": 0.00023315110393083255, + "loss": 2.0304, + "step": 277090 + }, + { + "epoch": 1.0547109916795445, + "grad_norm": 0.14548641443252563, + "learning_rate": 0.00023307304369921721, + "loss": 2.0303, + "step": 277100 + }, + { + "epoch": 1.0547490541476672, + "grad_norm": 0.14383628964424133, + "learning_rate": 0.00023299500628889603, + "loss": 2.0268, + "step": 277110 + }, + { + "epoch": 1.0547871166157898, + "grad_norm": 0.15194641053676605, + "learning_rate": 0.00023291699167986497, + "loss": 2.0306, + "step": 277120 + }, + { + "epoch": 1.0548251790839125, + "grad_norm": 0.13964703679084778, + "learning_rate": 0.00023283899985214907, + "loss": 2.0204, + "step": 277130 + }, + { + "epoch": 1.0548632415520351, + "grad_norm": 0.135025292634964, + "learning_rate": 0.00023276103078580273, + "loss": 2.0271, + "step": 277140 + }, + { + "epoch": 1.0549013040201578, + "grad_norm": 0.14040318131446838, + "learning_rate": 0.00023268308446090934, + "loss": 2.0271, + "step": 277150 + }, + { + "epoch": 1.0549393664882805, + "grad_norm": 0.7150856256484985, + "learning_rate": 0.00023260516085758126, + "loss": 2.0461, + "step": 277160 + }, + { + "epoch": 1.0549774289564033, + "grad_norm": 0.1341201215982437, + "learning_rate": 0.00023252725995595987, + "loss": 2.0262, + "step": 277170 + }, + { + "epoch": 1.055015491424526, + "grad_norm": 0.1424829214811325, + "learning_rate": 0.0002324493817362155, + "loss": 2.0297, + "step": 277180 + }, + { + "epoch": 1.0550535538926487, + "grad_norm": 0.16039422154426575, + "learning_rate": 0.00023237152617854728, + "loss": 2.0216, + "step": 277190 + }, + { + "epoch": 1.0550916163607713, + "grad_norm": 0.1337854564189911, + "learning_rate": 0.00023229369326318317, + "loss": 2.0255, + "step": 277200 + }, + { + "epoch": 1.055129678828894, + "grad_norm": 0.1322571486234665, + "learning_rate": 0.00023221588297037983, + "loss": 2.0318, + "step": 277210 + }, + { + "epoch": 1.0551677412970166, + "grad_norm": 0.13549922406673431, + "learning_rate": 0.00023213809528042255, + "loss": 2.0272, + "step": 277220 + }, + { + "epoch": 1.0552058037651393, + "grad_norm": 0.13208764791488647, + "learning_rate": 0.00023206033017362534, + "loss": 2.0349, + "step": 277230 + }, + { + "epoch": 1.055243866233262, + "grad_norm": 0.1279682070016861, + "learning_rate": 0.00023198258763033075, + "loss": 2.0268, + "step": 277240 + }, + { + "epoch": 1.0552819287013848, + "grad_norm": 0.13463786244392395, + "learning_rate": 0.00023190486763090978, + "loss": 2.0357, + "step": 277250 + }, + { + "epoch": 1.0553199911695075, + "grad_norm": 0.13447913527488708, + "learning_rate": 0.00023182717015576195, + "loss": 2.0129, + "step": 277260 + }, + { + "epoch": 1.0553580536376301, + "grad_norm": 0.13115116953849792, + "learning_rate": 0.00023174949518531501, + "loss": 2.04, + "step": 277270 + }, + { + "epoch": 1.0553961161057528, + "grad_norm": 0.12983772158622742, + "learning_rate": 0.00023167184270002524, + "loss": 2.0406, + "step": 277280 + }, + { + "epoch": 1.0554341785738754, + "grad_norm": 0.15062974393367767, + "learning_rate": 0.00023159421268037706, + "loss": 2.0226, + "step": 277290 + }, + { + "epoch": 1.055472241041998, + "grad_norm": 0.15623483061790466, + "learning_rate": 0.00023151660510688316, + "loss": 2.023, + "step": 277300 + }, + { + "epoch": 1.0555103035101208, + "grad_norm": 0.1440696120262146, + "learning_rate": 0.00023143901996008432, + "loss": 2.0077, + "step": 277310 + }, + { + "epoch": 1.0555483659782434, + "grad_norm": 0.15217626094818115, + "learning_rate": 0.00023136145722054948, + "loss": 2.0199, + "step": 277320 + }, + { + "epoch": 1.055586428446366, + "grad_norm": 0.12807799875736237, + "learning_rate": 0.00023128391686887568, + "loss": 2.0236, + "step": 277330 + }, + { + "epoch": 1.055624490914489, + "grad_norm": 0.1437712162733078, + "learning_rate": 0.0002312063988856878, + "loss": 2.0185, + "step": 277340 + }, + { + "epoch": 1.0556625533826116, + "grad_norm": 0.13763098418712616, + "learning_rate": 0.0002311289032516387, + "loss": 2.015, + "step": 277350 + }, + { + "epoch": 1.0557006158507343, + "grad_norm": 0.13203462958335876, + "learning_rate": 0.00023105142994740924, + "loss": 2.0194, + "step": 277360 + }, + { + "epoch": 1.055738678318857, + "grad_norm": 0.1410224586725235, + "learning_rate": 0.00023097397895370793, + "loss": 2.0335, + "step": 277370 + }, + { + "epoch": 1.0557767407869796, + "grad_norm": 0.1614682972431183, + "learning_rate": 0.0002308965502512711, + "loss": 2.008, + "step": 277380 + }, + { + "epoch": 1.0558148032551022, + "grad_norm": 0.13247433304786682, + "learning_rate": 0.00023081914382086283, + "loss": 2.0175, + "step": 277390 + }, + { + "epoch": 1.055852865723225, + "grad_norm": 0.14368799328804016, + "learning_rate": 0.00023074175964327482, + "loss": 2.0175, + "step": 277400 + }, + { + "epoch": 1.0558909281913476, + "grad_norm": 0.14326095581054688, + "learning_rate": 0.0002306643976993263, + "loss": 2.0171, + "step": 277410 + }, + { + "epoch": 1.0559289906594702, + "grad_norm": 0.16751118004322052, + "learning_rate": 0.00023058705796986419, + "loss": 2.0364, + "step": 277420 + }, + { + "epoch": 1.055967053127593, + "grad_norm": 0.13226917386054993, + "learning_rate": 0.00023050974043576272, + "loss": 2.0149, + "step": 277430 + }, + { + "epoch": 1.0560051155957157, + "grad_norm": 0.13986603915691376, + "learning_rate": 0.0002304324450779236, + "loss": 2.0374, + "step": 277440 + }, + { + "epoch": 1.0560431780638384, + "grad_norm": 0.14492225646972656, + "learning_rate": 0.00023035517187727607, + "loss": 2.0419, + "step": 277450 + }, + { + "epoch": 1.056081240531961, + "grad_norm": 0.13461005687713623, + "learning_rate": 0.00023027792081477643, + "loss": 2.0392, + "step": 277460 + }, + { + "epoch": 1.0561193030000837, + "grad_norm": 0.17020630836486816, + "learning_rate": 0.00023020069187140846, + "loss": 2.02, + "step": 277470 + }, + { + "epoch": 1.0561573654682064, + "grad_norm": 0.13969464600086212, + "learning_rate": 0.00023012348502818308, + "loss": 2.0317, + "step": 277480 + }, + { + "epoch": 1.056195427936329, + "grad_norm": 0.1441717892885208, + "learning_rate": 0.00023004630026613826, + "loss": 2.0206, + "step": 277490 + }, + { + "epoch": 1.0562334904044517, + "grad_norm": 0.12981465458869934, + "learning_rate": 0.00022996913756633913, + "loss": 2.0178, + "step": 277500 + }, + { + "epoch": 1.0562715528725746, + "grad_norm": 0.16938796639442444, + "learning_rate": 0.00022989199690987805, + "loss": 2.0176, + "step": 277510 + }, + { + "epoch": 1.0563096153406972, + "grad_norm": 0.15160812437534332, + "learning_rate": 0.00022981487827787413, + "loss": 2.0262, + "step": 277520 + }, + { + "epoch": 1.0563476778088199, + "grad_norm": 0.12253037095069885, + "learning_rate": 0.0002297377816514734, + "loss": 2.0199, + "step": 277530 + }, + { + "epoch": 1.0563857402769425, + "grad_norm": 0.1367233842611313, + "learning_rate": 0.00022966070701184904, + "loss": 2.0395, + "step": 277540 + }, + { + "epoch": 1.0564238027450652, + "grad_norm": 0.13288183510303497, + "learning_rate": 0.00022958365434020084, + "loss": 2.0296, + "step": 277550 + }, + { + "epoch": 1.0564618652131879, + "grad_norm": 0.11826697736978531, + "learning_rate": 0.00022950662361775537, + "loss": 2.0217, + "step": 277560 + }, + { + "epoch": 1.0564999276813105, + "grad_norm": 0.12652520835399628, + "learning_rate": 0.00022942961482576602, + "loss": 2.0218, + "step": 277570 + }, + { + "epoch": 1.0565379901494332, + "grad_norm": 0.1527426689863205, + "learning_rate": 0.00022935262794551286, + "loss": 2.0275, + "step": 277580 + }, + { + "epoch": 1.0565760526175558, + "grad_norm": 0.13205711543560028, + "learning_rate": 0.00022927566295830243, + "loss": 2.0192, + "step": 277590 + }, + { + "epoch": 1.0566141150856787, + "grad_norm": 0.13617442548274994, + "learning_rate": 0.00022919871984546797, + "loss": 2.038, + "step": 277600 + }, + { + "epoch": 1.0566521775538014, + "grad_norm": 0.13269749283790588, + "learning_rate": 0.00022912179858836922, + "loss": 2.0294, + "step": 277610 + }, + { + "epoch": 1.056690240021924, + "grad_norm": 0.1346275359392166, + "learning_rate": 0.00022904489916839233, + "loss": 2.0185, + "step": 277620 + }, + { + "epoch": 1.0567283024900467, + "grad_norm": 0.14745649695396423, + "learning_rate": 0.0002289680215669499, + "loss": 2.0402, + "step": 277630 + }, + { + "epoch": 1.0567663649581693, + "grad_norm": 0.1354425996541977, + "learning_rate": 0.00022889116576548085, + "loss": 2.0331, + "step": 277640 + }, + { + "epoch": 1.056804427426292, + "grad_norm": 0.13665327429771423, + "learning_rate": 0.00022881433174545035, + "loss": 2.0087, + "step": 277650 + }, + { + "epoch": 1.0568424898944146, + "grad_norm": 0.1539125293493271, + "learning_rate": 0.00022873751948834992, + "loss": 2.0321, + "step": 277660 + }, + { + "epoch": 1.0568805523625373, + "grad_norm": 0.13223835825920105, + "learning_rate": 0.0002286607289756973, + "loss": 2.0371, + "step": 277670 + }, + { + "epoch": 1.0569186148306602, + "grad_norm": 0.1493256539106369, + "learning_rate": 0.00022858396018903622, + "loss": 2.023, + "step": 277680 + }, + { + "epoch": 1.0569566772987828, + "grad_norm": 0.13801084458827972, + "learning_rate": 0.0002285072131099367, + "loss": 2.0171, + "step": 277690 + }, + { + "epoch": 1.0569947397669055, + "grad_norm": 0.14287050068378448, + "learning_rate": 0.00022843048771999463, + "loss": 2.0375, + "step": 277700 + }, + { + "epoch": 1.0570328022350282, + "grad_norm": 0.15522553026676178, + "learning_rate": 0.00022835378400083196, + "loss": 2.015, + "step": 277710 + }, + { + "epoch": 1.0570708647031508, + "grad_norm": 0.13880471885204315, + "learning_rate": 0.00022827710193409662, + "loss": 2.0132, + "step": 277720 + }, + { + "epoch": 1.0571089271712735, + "grad_norm": 0.13956932723522186, + "learning_rate": 0.00022820044150146236, + "loss": 2.0146, + "step": 277730 + }, + { + "epoch": 1.0571469896393961, + "grad_norm": 0.12712182104587555, + "learning_rate": 0.00022812380268462878, + "loss": 2.0262, + "step": 277740 + }, + { + "epoch": 1.0571850521075188, + "grad_norm": 0.18996763229370117, + "learning_rate": 0.00022804718546532132, + "loss": 2.027, + "step": 277750 + }, + { + "epoch": 1.0572231145756414, + "grad_norm": 0.16294504702091217, + "learning_rate": 0.00022797058982529112, + "loss": 2.0087, + "step": 277760 + }, + { + "epoch": 1.0572611770437643, + "grad_norm": 0.12692804634571075, + "learning_rate": 0.00022789401574631503, + "loss": 2.0337, + "step": 277770 + }, + { + "epoch": 1.057299239511887, + "grad_norm": 0.1276542842388153, + "learning_rate": 0.0002278174632101954, + "loss": 2.0268, + "step": 277780 + }, + { + "epoch": 1.0573373019800096, + "grad_norm": 0.14247260987758636, + "learning_rate": 0.00022774093219876036, + "loss": 2.0021, + "step": 277790 + }, + { + "epoch": 1.0573753644481323, + "grad_norm": 0.13470324873924255, + "learning_rate": 0.00022766442269386345, + "loss": 2.0288, + "step": 277800 + }, + { + "epoch": 1.057413426916255, + "grad_norm": 0.13499245047569275, + "learning_rate": 0.00022758793467738376, + "loss": 2.0175, + "step": 277810 + }, + { + "epoch": 1.0574514893843776, + "grad_norm": 0.13271304965019226, + "learning_rate": 0.00022751146813122574, + "loss": 2.032, + "step": 277820 + }, + { + "epoch": 1.0574895518525003, + "grad_norm": 0.12108422815799713, + "learning_rate": 0.00022743502303731927, + "loss": 2.0218, + "step": 277830 + }, + { + "epoch": 1.057527614320623, + "grad_norm": 0.15185122191905975, + "learning_rate": 0.0002273585993776196, + "loss": 2.0304, + "step": 277840 + }, + { + "epoch": 1.0575656767887458, + "grad_norm": 0.13970841467380524, + "learning_rate": 0.00022728219713410714, + "loss": 2.0184, + "step": 277850 + }, + { + "epoch": 1.0576037392568685, + "grad_norm": 0.15076260268688202, + "learning_rate": 0.00022720581628878767, + "loss": 2.0196, + "step": 277860 + }, + { + "epoch": 1.0576418017249911, + "grad_norm": 0.14045163989067078, + "learning_rate": 0.00022712945682369206, + "loss": 2.0299, + "step": 277870 + }, + { + "epoch": 1.0576798641931138, + "grad_norm": 0.14571724832057953, + "learning_rate": 0.00022705311872087643, + "loss": 2.0213, + "step": 277880 + }, + { + "epoch": 1.0577179266612364, + "grad_norm": 0.14794188737869263, + "learning_rate": 0.00022697680196242175, + "loss": 2.0362, + "step": 277890 + }, + { + "epoch": 1.057755989129359, + "grad_norm": 0.13847602903842926, + "learning_rate": 0.00022690050653043436, + "loss": 2.0325, + "step": 277900 + }, + { + "epoch": 1.0577940515974817, + "grad_norm": 0.13841918110847473, + "learning_rate": 0.0002268242324070453, + "loss": 2.034, + "step": 277910 + }, + { + "epoch": 1.0578321140656044, + "grad_norm": 0.1277417689561844, + "learning_rate": 0.00022674797957441067, + "loss": 2.0145, + "step": 277920 + }, + { + "epoch": 1.057870176533727, + "grad_norm": 0.1296491026878357, + "learning_rate": 0.00022667174801471157, + "loss": 2.0274, + "step": 277930 + }, + { + "epoch": 1.05790823900185, + "grad_norm": 0.15428663790225983, + "learning_rate": 0.00022659553771015367, + "loss": 2.0266, + "step": 277940 + }, + { + "epoch": 1.0579463014699726, + "grad_norm": 0.15810316801071167, + "learning_rate": 0.0002265193486429677, + "loss": 2.0248, + "step": 277950 + }, + { + "epoch": 1.0579843639380953, + "grad_norm": 0.12902303040027618, + "learning_rate": 0.00022644318079540894, + "loss": 2.0204, + "step": 277960 + }, + { + "epoch": 1.058022426406218, + "grad_norm": 0.13099555671215057, + "learning_rate": 0.00022636703414975745, + "loss": 2.0227, + "step": 277970 + }, + { + "epoch": 1.0580604888743406, + "grad_norm": 0.14858795702457428, + "learning_rate": 0.00022629090868831804, + "loss": 2.0268, + "step": 277980 + }, + { + "epoch": 1.0580985513424632, + "grad_norm": 0.13814105093479156, + "learning_rate": 0.0002262148043934199, + "loss": 2.0149, + "step": 277990 + }, + { + "epoch": 1.0581366138105859, + "grad_norm": 0.13135714828968048, + "learning_rate": 0.00022613872124741696, + "loss": 2.0325, + "step": 278000 + }, + { + "epoch": 1.0581746762787085, + "grad_norm": 0.11998411267995834, + "learning_rate": 0.00022606265923268753, + "loss": 2.0323, + "step": 278010 + }, + { + "epoch": 1.0582127387468312, + "grad_norm": 0.13203677535057068, + "learning_rate": 0.00022598661833163453, + "loss": 2.0301, + "step": 278020 + }, + { + "epoch": 1.058250801214954, + "grad_norm": 0.15732504427433014, + "learning_rate": 0.00022591059852668512, + "loss": 2.0235, + "step": 278030 + }, + { + "epoch": 1.0582888636830767, + "grad_norm": 0.13207469880580902, + "learning_rate": 0.00022583459980029088, + "loss": 2.027, + "step": 278040 + }, + { + "epoch": 1.0583269261511994, + "grad_norm": 0.16607141494750977, + "learning_rate": 0.00022575862213492775, + "loss": 2.0105, + "step": 278050 + }, + { + "epoch": 1.058364988619322, + "grad_norm": 0.13722389936447144, + "learning_rate": 0.00022568266551309597, + "loss": 2.0212, + "step": 278060 + }, + { + "epoch": 1.0584030510874447, + "grad_norm": 0.13708582520484924, + "learning_rate": 0.00022560672991731983, + "loss": 2.0176, + "step": 278070 + }, + { + "epoch": 1.0584411135555674, + "grad_norm": 0.12867718935012817, + "learning_rate": 0.00022553081533014798, + "loss": 2.024, + "step": 278080 + }, + { + "epoch": 1.05847917602369, + "grad_norm": 0.12874118983745575, + "learning_rate": 0.0002254549217341531, + "loss": 2.0304, + "step": 278090 + }, + { + "epoch": 1.0585172384918127, + "grad_norm": 0.14795273542404175, + "learning_rate": 0.00022537904911193197, + "loss": 2.0174, + "step": 278100 + }, + { + "epoch": 1.0585553009599356, + "grad_norm": 0.1385265588760376, + "learning_rate": 0.00022530319744610544, + "loss": 2.0182, + "step": 278110 + }, + { + "epoch": 1.0585933634280582, + "grad_norm": 0.1351693719625473, + "learning_rate": 0.00022522736671931826, + "loss": 2.0282, + "step": 278120 + }, + { + "epoch": 1.0586314258961809, + "grad_norm": 0.1506417840719223, + "learning_rate": 0.00022515155691423927, + "loss": 2.0175, + "step": 278130 + }, + { + "epoch": 1.0586694883643035, + "grad_norm": 0.15787288546562195, + "learning_rate": 0.000225075768013561, + "loss": 2.017, + "step": 278140 + }, + { + "epoch": 1.0587075508324262, + "grad_norm": 0.16656358540058136, + "learning_rate": 0.000225, + "loss": 2.0143, + "step": 278150 + }, + { + "epoch": 1.0587456133005488, + "grad_norm": 0.14569364488124847, + "learning_rate": 0.00022492425285629657, + "loss": 2.027, + "step": 278160 + }, + { + "epoch": 1.0587836757686715, + "grad_norm": 0.15189801156520844, + "learning_rate": 0.00022484852656521476, + "loss": 2.0172, + "step": 278170 + }, + { + "epoch": 1.0588217382367942, + "grad_norm": 0.13380587100982666, + "learning_rate": 0.00022477282110954235, + "loss": 2.0282, + "step": 278180 + }, + { + "epoch": 1.0588598007049168, + "grad_norm": 0.13768541812896729, + "learning_rate": 0.00022469713647209067, + "loss": 2.0224, + "step": 278190 + }, + { + "epoch": 1.0588978631730397, + "grad_norm": 0.1390744000673294, + "learning_rate": 0.00022462147263569487, + "loss": 2.0329, + "step": 278200 + }, + { + "epoch": 1.0589359256411623, + "grad_norm": 0.16329453885555267, + "learning_rate": 0.00022454582958321363, + "loss": 2.0321, + "step": 278210 + }, + { + "epoch": 1.058973988109285, + "grad_norm": 0.16884244978427887, + "learning_rate": 0.000224470207297529, + "loss": 2.0298, + "step": 278220 + }, + { + "epoch": 1.0590120505774077, + "grad_norm": 0.12581200897693634, + "learning_rate": 0.00022439460576154658, + "loss": 2.0398, + "step": 278230 + }, + { + "epoch": 1.0590501130455303, + "grad_norm": 1.0800994634628296, + "learning_rate": 0.00022431902495819556, + "loss": 2.02, + "step": 278240 + }, + { + "epoch": 1.059088175513653, + "grad_norm": 0.13833831250667572, + "learning_rate": 0.0002242434648704284, + "loss": 2.0356, + "step": 278250 + }, + { + "epoch": 1.0591262379817756, + "grad_norm": 0.1597747802734375, + "learning_rate": 0.00022416792548122084, + "loss": 2.0125, + "step": 278260 + }, + { + "epoch": 1.0591643004498983, + "grad_norm": 0.16858337819576263, + "learning_rate": 0.0002240924067735721, + "loss": 2.027, + "step": 278270 + }, + { + "epoch": 1.059202362918021, + "grad_norm": 0.1446356326341629, + "learning_rate": 0.00022401690873050456, + "loss": 2.0293, + "step": 278280 + }, + { + "epoch": 1.0592404253861438, + "grad_norm": 0.14237236976623535, + "learning_rate": 0.00022394143133506374, + "loss": 2.0253, + "step": 278290 + }, + { + "epoch": 1.0592784878542665, + "grad_norm": 0.14626847207546234, + "learning_rate": 0.00022386597457031847, + "loss": 2.0134, + "step": 278300 + }, + { + "epoch": 1.0593165503223891, + "grad_norm": 0.12890249490737915, + "learning_rate": 0.0002237905384193607, + "loss": 2.0212, + "step": 278310 + }, + { + "epoch": 1.0593546127905118, + "grad_norm": 0.14841724932193756, + "learning_rate": 0.00022371512286530531, + "loss": 2.0129, + "step": 278320 + }, + { + "epoch": 1.0593926752586345, + "grad_norm": 0.145298570394516, + "learning_rate": 0.00022363972789129043, + "loss": 2.0183, + "step": 278330 + }, + { + "epoch": 1.059430737726757, + "grad_norm": 0.15068456530570984, + "learning_rate": 0.00022356435348047706, + "loss": 2.0285, + "step": 278340 + }, + { + "epoch": 1.0594688001948798, + "grad_norm": 0.1366945505142212, + "learning_rate": 0.00022348899961604902, + "loss": 2.0372, + "step": 278350 + }, + { + "epoch": 1.0595068626630024, + "grad_norm": 0.13482408225536346, + "learning_rate": 0.0002234136662812134, + "loss": 2.0077, + "step": 278360 + }, + { + "epoch": 1.0595449251311253, + "grad_norm": 0.13060268759727478, + "learning_rate": 0.00022333835345919977, + "loss": 2.0194, + "step": 278370 + }, + { + "epoch": 1.059582987599248, + "grad_norm": 0.16523779928684235, + "learning_rate": 0.00022326306113326083, + "loss": 2.0315, + "step": 278380 + }, + { + "epoch": 1.0596210500673706, + "grad_norm": 0.13515925407409668, + "learning_rate": 0.00022318778928667182, + "loss": 2.0185, + "step": 278390 + }, + { + "epoch": 1.0596591125354933, + "grad_norm": 0.13461363315582275, + "learning_rate": 0.00022311253790273085, + "loss": 2.0273, + "step": 278400 + }, + { + "epoch": 1.059697175003616, + "grad_norm": 0.12945568561553955, + "learning_rate": 0.00022303730696475866, + "loss": 2.0309, + "step": 278410 + }, + { + "epoch": 1.0597352374717386, + "grad_norm": 0.13417372107505798, + "learning_rate": 0.00022296209645609867, + "loss": 2.0159, + "step": 278420 + }, + { + "epoch": 1.0597732999398612, + "grad_norm": 0.14353212714195251, + "learning_rate": 0.00022288690636011684, + "loss": 2.0281, + "step": 278430 + }, + { + "epoch": 1.059811362407984, + "grad_norm": 0.1466580629348755, + "learning_rate": 0.00022281173666020176, + "loss": 2.0147, + "step": 278440 + }, + { + "epoch": 1.0598494248761066, + "grad_norm": 0.15297137200832367, + "learning_rate": 0.0002227365873397646, + "loss": 2.031, + "step": 278450 + }, + { + "epoch": 1.0598874873442294, + "grad_norm": 0.1253899484872818, + "learning_rate": 0.00022266145838223877, + "loss": 2.0232, + "step": 278460 + }, + { + "epoch": 1.059925549812352, + "grad_norm": 0.14415845274925232, + "learning_rate": 0.00022258634977108044, + "loss": 2.0253, + "step": 278470 + }, + { + "epoch": 1.0599636122804748, + "grad_norm": 0.14856302738189697, + "learning_rate": 0.00022251126148976785, + "loss": 2.0118, + "step": 278480 + }, + { + "epoch": 1.0600016747485974, + "grad_norm": 0.13965021073818207, + "learning_rate": 0.00022243619352180178, + "loss": 2.0216, + "step": 278490 + }, + { + "epoch": 1.06003973721672, + "grad_norm": 0.13246068358421326, + "learning_rate": 0.00022236114585070531, + "loss": 2.0192, + "step": 278500 + }, + { + "epoch": 1.0600777996848427, + "grad_norm": 0.16952328383922577, + "learning_rate": 0.00022228611846002367, + "loss": 2.0101, + "step": 278510 + }, + { + "epoch": 1.0601158621529654, + "grad_norm": 0.1641233265399933, + "learning_rate": 0.00022221111133332444, + "loss": 2.0194, + "step": 278520 + }, + { + "epoch": 1.060153924621088, + "grad_norm": 0.14600826799869537, + "learning_rate": 0.00022213612445419728, + "loss": 2.0246, + "step": 278530 + }, + { + "epoch": 1.060191987089211, + "grad_norm": 0.1428350955247879, + "learning_rate": 0.00022206115780625408, + "loss": 2.0211, + "step": 278540 + }, + { + "epoch": 1.0602300495573336, + "grad_norm": 0.13691508769989014, + "learning_rate": 0.00022198621137312874, + "loss": 2.0261, + "step": 278550 + }, + { + "epoch": 1.0602681120254562, + "grad_norm": 0.17456191778182983, + "learning_rate": 0.00022191128513847718, + "loss": 2.0111, + "step": 278560 + }, + { + "epoch": 1.060306174493579, + "grad_norm": 0.15665015578269958, + "learning_rate": 0.00022183637908597753, + "loss": 2.0345, + "step": 278570 + }, + { + "epoch": 1.0603442369617015, + "grad_norm": 0.14208164811134338, + "learning_rate": 0.0002217614931993297, + "loss": 2.0317, + "step": 278580 + }, + { + "epoch": 1.0603822994298242, + "grad_norm": 0.15451577305793762, + "learning_rate": 0.00022168662746225554, + "loss": 2.0137, + "step": 278590 + }, + { + "epoch": 1.0604203618979469, + "grad_norm": 0.1741928905248642, + "learning_rate": 0.0002216117818584989, + "loss": 2.0278, + "step": 278600 + }, + { + "epoch": 1.0604584243660695, + "grad_norm": 0.16310615837574005, + "learning_rate": 0.00022153695637182542, + "loss": 2.0301, + "step": 278610 + }, + { + "epoch": 1.0604964868341922, + "grad_norm": 0.13577622175216675, + "learning_rate": 0.00022146215098602245, + "loss": 2.0235, + "step": 278620 + }, + { + "epoch": 1.060534549302315, + "grad_norm": 0.13898788392543793, + "learning_rate": 0.00022138736568489935, + "loss": 2.0281, + "step": 278630 + }, + { + "epoch": 1.0605726117704377, + "grad_norm": 0.12679430842399597, + "learning_rate": 0.00022131260045228697, + "loss": 2.0204, + "step": 278640 + }, + { + "epoch": 1.0606106742385604, + "grad_norm": 0.14395207166671753, + "learning_rate": 0.0002212378552720378, + "loss": 2.0266, + "step": 278650 + }, + { + "epoch": 1.060648736706683, + "grad_norm": 0.13999640941619873, + "learning_rate": 0.0002211631301280263, + "loss": 2.0246, + "step": 278660 + }, + { + "epoch": 1.0606867991748057, + "grad_norm": 0.19303567707538605, + "learning_rate": 0.00022108842500414823, + "loss": 2.0222, + "step": 278670 + }, + { + "epoch": 1.0607248616429283, + "grad_norm": 0.13559174537658691, + "learning_rate": 0.00022101373988432093, + "loss": 2.0236, + "step": 278680 + }, + { + "epoch": 1.060762924111051, + "grad_norm": 0.13610365986824036, + "learning_rate": 0.0002209390747524835, + "loss": 2.0226, + "step": 278690 + }, + { + "epoch": 1.0608009865791737, + "grad_norm": 0.12796823680400848, + "learning_rate": 0.00022086442959259623, + "loss": 2.0216, + "step": 278700 + }, + { + "epoch": 1.0608390490472965, + "grad_norm": 0.14899617433547974, + "learning_rate": 0.00022078980438864105, + "loss": 2.0291, + "step": 278710 + }, + { + "epoch": 1.0608771115154192, + "grad_norm": 0.13698174059391022, + "learning_rate": 0.0002207151991246212, + "loss": 2.0272, + "step": 278720 + }, + { + "epoch": 1.0609151739835418, + "grad_norm": 0.14914453029632568, + "learning_rate": 0.0002206406137845613, + "loss": 2.0174, + "step": 278730 + }, + { + "epoch": 1.0609532364516645, + "grad_norm": 0.14008817076683044, + "learning_rate": 0.00022056604835250722, + "loss": 2.0165, + "step": 278740 + }, + { + "epoch": 1.0609912989197872, + "grad_norm": 0.12607118487358093, + "learning_rate": 0.00022049150281252628, + "loss": 2.0137, + "step": 278750 + }, + { + "epoch": 1.0610293613879098, + "grad_norm": 0.1694176197052002, + "learning_rate": 0.00022041697714870695, + "loss": 2.0284, + "step": 278760 + }, + { + "epoch": 1.0610674238560325, + "grad_norm": 0.17194348573684692, + "learning_rate": 0.00022034247134515878, + "loss": 2.0313, + "step": 278770 + }, + { + "epoch": 1.0611054863241551, + "grad_norm": 0.14293989539146423, + "learning_rate": 0.00022026798538601273, + "loss": 2.0262, + "step": 278780 + }, + { + "epoch": 1.0611435487922778, + "grad_norm": 0.16035741567611694, + "learning_rate": 0.00022019351925542063, + "loss": 2.0306, + "step": 278790 + }, + { + "epoch": 1.0611816112604007, + "grad_norm": 0.1362650990486145, + "learning_rate": 0.00022011907293755556, + "loss": 2.0145, + "step": 278800 + }, + { + "epoch": 1.0612196737285233, + "grad_norm": 0.14739075303077698, + "learning_rate": 0.00022004464641661164, + "loss": 2.024, + "step": 278810 + }, + { + "epoch": 1.061257736196646, + "grad_norm": 0.13756489753723145, + "learning_rate": 0.0002199702396768039, + "loss": 2.0093, + "step": 278820 + }, + { + "epoch": 1.0612957986647686, + "grad_norm": 0.1316682994365692, + "learning_rate": 0.00021989585270236833, + "loss": 2.0122, + "step": 278830 + }, + { + "epoch": 1.0613338611328913, + "grad_norm": 0.15358810126781464, + "learning_rate": 0.00021982148547756202, + "loss": 2.0323, + "step": 278840 + }, + { + "epoch": 1.061371923601014, + "grad_norm": 0.12710179388523102, + "learning_rate": 0.00021974713798666274, + "loss": 2.0295, + "step": 278850 + }, + { + "epoch": 1.0614099860691366, + "grad_norm": 0.12713980674743652, + "learning_rate": 0.00021967281021396918, + "loss": 2.0232, + "step": 278860 + }, + { + "epoch": 1.0614480485372593, + "grad_norm": 0.16448360681533813, + "learning_rate": 0.00021959850214380095, + "loss": 2.0301, + "step": 278870 + }, + { + "epoch": 1.061486111005382, + "grad_norm": 0.13466374576091766, + "learning_rate": 0.0002195242137604983, + "loss": 2.0257, + "step": 278880 + }, + { + "epoch": 1.0615241734735048, + "grad_norm": 0.13751991093158722, + "learning_rate": 0.0002194499450484222, + "loss": 2.008, + "step": 278890 + }, + { + "epoch": 1.0615622359416275, + "grad_norm": 0.13438063859939575, + "learning_rate": 0.0002193756959919544, + "loss": 2.0221, + "step": 278900 + }, + { + "epoch": 1.0616002984097501, + "grad_norm": 0.13424231112003326, + "learning_rate": 0.00021930146657549732, + "loss": 2.0294, + "step": 278910 + }, + { + "epoch": 1.0616383608778728, + "grad_norm": 0.13700388371944427, + "learning_rate": 0.00021922725678347383, + "loss": 2.0323, + "step": 278920 + }, + { + "epoch": 1.0616764233459954, + "grad_norm": 0.13141176104545593, + "learning_rate": 0.00021915306660032764, + "loss": 2.0182, + "step": 278930 + }, + { + "epoch": 1.061714485814118, + "grad_norm": 0.1494986116886139, + "learning_rate": 0.00021907889601052282, + "loss": 2.0313, + "step": 278940 + }, + { + "epoch": 1.0617525482822407, + "grad_norm": 0.1353265792131424, + "learning_rate": 0.00021900474499854389, + "loss": 2.0058, + "step": 278950 + }, + { + "epoch": 1.0617906107503634, + "grad_norm": 0.1495896428823471, + "learning_rate": 0.0002189306135488961, + "loss": 2.0244, + "step": 278960 + }, + { + "epoch": 1.0618286732184863, + "grad_norm": 0.12826865911483765, + "learning_rate": 0.00021885650164610483, + "loss": 2.0158, + "step": 278970 + }, + { + "epoch": 1.061866735686609, + "grad_norm": 0.13081242144107819, + "learning_rate": 0.000218782409274716, + "loss": 2.0146, + "step": 278980 + }, + { + "epoch": 1.0619047981547316, + "grad_norm": 0.16037730872631073, + "learning_rate": 0.00021870833641929595, + "loss": 2.0065, + "step": 278990 + }, + { + "epoch": 1.0619428606228543, + "grad_norm": 0.16274122893810272, + "learning_rate": 0.00021863428306443113, + "loss": 2.0105, + "step": 279000 + }, + { + "epoch": 1.061980923090977, + "grad_norm": 0.1254984587430954, + "learning_rate": 0.00021856024919472843, + "loss": 2.0208, + "step": 279010 + }, + { + "epoch": 1.0620189855590996, + "grad_norm": 0.13238771259784698, + "learning_rate": 0.00021848623479481504, + "loss": 2.0118, + "step": 279020 + }, + { + "epoch": 1.0620570480272222, + "grad_norm": 0.12015458941459656, + "learning_rate": 0.00021841223984933817, + "loss": 2.023, + "step": 279030 + }, + { + "epoch": 1.0620951104953449, + "grad_norm": 0.1459268033504486, + "learning_rate": 0.0002183382643429652, + "loss": 2.0236, + "step": 279040 + }, + { + "epoch": 1.0621331729634675, + "grad_norm": 0.13737080991268158, + "learning_rate": 0.0002182643082603839, + "loss": 2.0123, + "step": 279050 + }, + { + "epoch": 1.0621712354315904, + "grad_norm": 0.14364588260650635, + "learning_rate": 0.0002181903715863018, + "loss": 2.0331, + "step": 279060 + }, + { + "epoch": 1.062209297899713, + "grad_norm": 0.133913055062294, + "learning_rate": 0.0002181164543054467, + "loss": 2.016, + "step": 279070 + }, + { + "epoch": 1.0622473603678357, + "grad_norm": 0.14843854308128357, + "learning_rate": 0.00021804255640256632, + "loss": 2.0346, + "step": 279080 + }, + { + "epoch": 1.0622854228359584, + "grad_norm": 0.12832611799240112, + "learning_rate": 0.00021796867786242846, + "loss": 2.0163, + "step": 279090 + }, + { + "epoch": 1.062323485304081, + "grad_norm": 0.13493861258029938, + "learning_rate": 0.00021789481866982068, + "loss": 2.0199, + "step": 279100 + }, + { + "epoch": 1.0623615477722037, + "grad_norm": 0.1524580419063568, + "learning_rate": 0.00021782097880955077, + "loss": 2.0283, + "step": 279110 + }, + { + "epoch": 1.0623996102403264, + "grad_norm": 0.13833408057689667, + "learning_rate": 0.00021774715826644602, + "loss": 2.0288, + "step": 279120 + }, + { + "epoch": 1.062437672708449, + "grad_norm": 0.14383922517299652, + "learning_rate": 0.0002176733570253538, + "loss": 2.0212, + "step": 279130 + }, + { + "epoch": 1.0624757351765717, + "grad_norm": 0.13017255067825317, + "learning_rate": 0.00021759957507114125, + "loss": 2.0184, + "step": 279140 + }, + { + "epoch": 1.0625137976446946, + "grad_norm": 0.1488547921180725, + "learning_rate": 0.00021752581238869512, + "loss": 2.026, + "step": 279150 + }, + { + "epoch": 1.0625518601128172, + "grad_norm": 0.1583673655986786, + "learning_rate": 0.00021745206896292207, + "loss": 2.0161, + "step": 279160 + }, + { + "epoch": 1.0625899225809399, + "grad_norm": 0.13907520473003387, + "learning_rate": 0.00021737834477874842, + "loss": 2.0199, + "step": 279170 + }, + { + "epoch": 1.0626279850490625, + "grad_norm": 0.1313203126192093, + "learning_rate": 0.00021730463982112003, + "loss": 2.0093, + "step": 279180 + }, + { + "epoch": 1.0626660475171852, + "grad_norm": 0.18097400665283203, + "learning_rate": 0.00021723095407500255, + "loss": 2.0272, + "step": 279190 + }, + { + "epoch": 1.0627041099853078, + "grad_norm": 0.16231577098369598, + "learning_rate": 0.000217157287525381, + "loss": 2.0285, + "step": 279200 + }, + { + "epoch": 1.0627421724534305, + "grad_norm": 0.1830163151025772, + "learning_rate": 0.00021708364015726018, + "loss": 2.0193, + "step": 279210 + }, + { + "epoch": 1.0627802349215532, + "grad_norm": 0.14548464119434357, + "learning_rate": 0.00021701001195566417, + "loss": 2.0151, + "step": 279220 + }, + { + "epoch": 1.062818297389676, + "grad_norm": 0.14360500872135162, + "learning_rate": 0.00021693640290563676, + "loss": 2.0163, + "step": 279230 + }, + { + "epoch": 1.0628563598577987, + "grad_norm": 0.1566280722618103, + "learning_rate": 0.00021686281299224103, + "loss": 2.0032, + "step": 279240 + }, + { + "epoch": 1.0628944223259214, + "grad_norm": 0.13956668972969055, + "learning_rate": 0.00021678924220055952, + "loss": 2.0294, + "step": 279250 + }, + { + "epoch": 1.062932484794044, + "grad_norm": 0.13794146478176117, + "learning_rate": 0.00021671569051569413, + "loss": 2.0223, + "step": 279260 + }, + { + "epoch": 1.0629705472621667, + "grad_norm": 0.1345396786928177, + "learning_rate": 0.00021664215792276603, + "loss": 2.0239, + "step": 279270 + }, + { + "epoch": 1.0630086097302893, + "grad_norm": 0.13856376707553864, + "learning_rate": 0.0002165686444069158, + "loss": 2.0214, + "step": 279280 + }, + { + "epoch": 1.063046672198412, + "grad_norm": 0.13800005614757538, + "learning_rate": 0.00021649514995330328, + "loss": 2.0078, + "step": 279290 + }, + { + "epoch": 1.0630847346665346, + "grad_norm": 0.14292772114276886, + "learning_rate": 0.00021642167454710744, + "loss": 2.0175, + "step": 279300 + }, + { + "epoch": 1.0631227971346573, + "grad_norm": 0.14993226528167725, + "learning_rate": 0.00021634821817352647, + "loss": 2.0193, + "step": 279310 + }, + { + "epoch": 1.0631608596027802, + "grad_norm": 0.14130374789237976, + "learning_rate": 0.00021627478081777786, + "loss": 2.0215, + "step": 279320 + }, + { + "epoch": 1.0631989220709028, + "grad_norm": 0.15642014145851135, + "learning_rate": 0.00021620136246509801, + "loss": 2.0208, + "step": 279330 + }, + { + "epoch": 1.0632369845390255, + "grad_norm": 0.14840982854366302, + "learning_rate": 0.00021612796310074262, + "loss": 2.0143, + "step": 279340 + }, + { + "epoch": 1.0632750470071481, + "grad_norm": 0.12628307938575745, + "learning_rate": 0.00021605458270998636, + "loss": 2.0154, + "step": 279350 + }, + { + "epoch": 1.0633131094752708, + "grad_norm": 0.16346469521522522, + "learning_rate": 0.00021598122127812274, + "loss": 2.0042, + "step": 279360 + }, + { + "epoch": 1.0633511719433935, + "grad_norm": 0.13518983125686646, + "learning_rate": 0.00021590787879046465, + "loss": 2.0242, + "step": 279370 + }, + { + "epoch": 1.0633892344115161, + "grad_norm": 0.14704202115535736, + "learning_rate": 0.0002158345552323436, + "loss": 2.0159, + "step": 279380 + }, + { + "epoch": 1.0634272968796388, + "grad_norm": 0.15335479378700256, + "learning_rate": 0.00021576125058911011, + "loss": 2.0225, + "step": 279390 + }, + { + "epoch": 1.0634653593477617, + "grad_norm": 0.14868423342704773, + "learning_rate": 0.00021568796484613368, + "loss": 2.0191, + "step": 279400 + }, + { + "epoch": 1.0635034218158843, + "grad_norm": 0.13510699570178986, + "learning_rate": 0.0002156146979888025, + "loss": 2.0252, + "step": 279410 + }, + { + "epoch": 1.063541484284007, + "grad_norm": 0.14692704379558563, + "learning_rate": 0.0002155414500025238, + "loss": 2.0288, + "step": 279420 + }, + { + "epoch": 1.0635795467521296, + "grad_norm": 0.16837383806705475, + "learning_rate": 0.0002154682208727234, + "loss": 1.9996, + "step": 279430 + }, + { + "epoch": 1.0636176092202523, + "grad_norm": 0.15059910714626312, + "learning_rate": 0.00021539501058484583, + "loss": 2.0368, + "step": 279440 + }, + { + "epoch": 1.063655671688375, + "grad_norm": 0.16362057626247406, + "learning_rate": 0.00021532181912435462, + "loss": 2.02, + "step": 279450 + }, + { + "epoch": 1.0636937341564976, + "grad_norm": 0.1484278440475464, + "learning_rate": 0.0002152486464767317, + "loss": 2.0244, + "step": 279460 + }, + { + "epoch": 1.0637317966246203, + "grad_norm": 0.17187005281448364, + "learning_rate": 0.0002151754926274777, + "loss": 2.015, + "step": 279470 + }, + { + "epoch": 1.063769859092743, + "grad_norm": 0.14839616417884827, + "learning_rate": 0.00021510235756211205, + "loss": 2.0169, + "step": 279480 + }, + { + "epoch": 1.0638079215608658, + "grad_norm": 0.16944752633571625, + "learning_rate": 0.0002150292412661725, + "loss": 2.0368, + "step": 279490 + }, + { + "epoch": 1.0638459840289884, + "grad_norm": 0.1417141556739807, + "learning_rate": 0.0002149561437252155, + "loss": 2.0282, + "step": 279500 + }, + { + "epoch": 1.063884046497111, + "grad_norm": 0.15089070796966553, + "learning_rate": 0.000214883064924816, + "loss": 2.0045, + "step": 279510 + }, + { + "epoch": 1.0639221089652338, + "grad_norm": 0.146227166056633, + "learning_rate": 0.0002148100048505675, + "loss": 2.0196, + "step": 279520 + }, + { + "epoch": 1.0639601714333564, + "grad_norm": 0.1489729881286621, + "learning_rate": 0.00021473696348808176, + "loss": 2.0222, + "step": 279530 + }, + { + "epoch": 1.063998233901479, + "grad_norm": 0.16424356400966644, + "learning_rate": 0.00021466394082298912, + "loss": 2.0284, + "step": 279540 + }, + { + "epoch": 1.0640362963696017, + "grad_norm": 0.15235692262649536, + "learning_rate": 0.00021459093684093823, + "loss": 2.0131, + "step": 279550 + }, + { + "epoch": 1.0640743588377244, + "grad_norm": 0.13210387527942657, + "learning_rate": 0.00021451795152759606, + "loss": 2.0264, + "step": 279560 + }, + { + "epoch": 1.0641124213058473, + "grad_norm": 0.14658477902412415, + "learning_rate": 0.000214444984868648, + "loss": 2.0255, + "step": 279570 + }, + { + "epoch": 1.06415048377397, + "grad_norm": 0.14071625471115112, + "learning_rate": 0.00021437203684979774, + "loss": 2.0228, + "step": 279580 + }, + { + "epoch": 1.0641885462420926, + "grad_norm": 0.1429414302110672, + "learning_rate": 0.00021429910745676694, + "loss": 2.0403, + "step": 279590 + }, + { + "epoch": 1.0642266087102152, + "grad_norm": 0.13399270176887512, + "learning_rate": 0.00021422619667529593, + "loss": 2.0104, + "step": 279600 + }, + { + "epoch": 1.064264671178338, + "grad_norm": 0.13192670047283173, + "learning_rate": 0.0002141533044911428, + "loss": 2.0224, + "step": 279610 + }, + { + "epoch": 1.0643027336464606, + "grad_norm": 0.12973575294017792, + "learning_rate": 0.00021408043089008407, + "loss": 2.0116, + "step": 279620 + }, + { + "epoch": 1.0643407961145832, + "grad_norm": 0.14499998092651367, + "learning_rate": 0.00021400757585791425, + "loss": 2.0223, + "step": 279630 + }, + { + "epoch": 1.0643788585827059, + "grad_norm": 0.1375758945941925, + "learning_rate": 0.00021393473938044604, + "loss": 2.0265, + "step": 279640 + }, + { + "epoch": 1.0644169210508285, + "grad_norm": 0.15999175608158112, + "learning_rate": 0.00021386192144351007, + "loss": 2.013, + "step": 279650 + }, + { + "epoch": 1.0644549835189514, + "grad_norm": 0.12279332429170609, + "learning_rate": 0.0002137891220329551, + "loss": 2.0267, + "step": 279660 + }, + { + "epoch": 1.064493045987074, + "grad_norm": 0.15905717015266418, + "learning_rate": 0.00021371634113464782, + "loss": 2.0063, + "step": 279670 + }, + { + "epoch": 1.0645311084551967, + "grad_norm": 0.1357671469449997, + "learning_rate": 0.00021364357873447293, + "loss": 2.0415, + "step": 279680 + }, + { + "epoch": 1.0645691709233194, + "grad_norm": 0.1571875363588333, + "learning_rate": 0.000213570834818333, + "loss": 2.0407, + "step": 279690 + }, + { + "epoch": 1.064607233391442, + "grad_norm": 0.14225240051746368, + "learning_rate": 0.00021349810937214858, + "loss": 2.0186, + "step": 279700 + }, + { + "epoch": 1.0646452958595647, + "grad_norm": 0.13177137076854706, + "learning_rate": 0.00021342540238185797, + "loss": 2.0141, + "step": 279710 + }, + { + "epoch": 1.0646833583276873, + "grad_norm": 0.19607017934322357, + "learning_rate": 0.00021335271383341747, + "loss": 2.0255, + "step": 279720 + }, + { + "epoch": 1.06472142079581, + "grad_norm": 0.12749847769737244, + "learning_rate": 0.00021328004371280098, + "loss": 2.024, + "step": 279730 + }, + { + "epoch": 1.0647594832639329, + "grad_norm": 0.1525043100118637, + "learning_rate": 0.00021320739200600026, + "loss": 2.023, + "step": 279740 + }, + { + "epoch": 1.0647975457320555, + "grad_norm": 0.1349322646856308, + "learning_rate": 0.00021313475869902493, + "loss": 2.0142, + "step": 279750 + }, + { + "epoch": 1.0648356082001782, + "grad_norm": 0.13257881999015808, + "learning_rate": 0.00021306214377790213, + "loss": 2.0357, + "step": 279760 + }, + { + "epoch": 1.0648736706683009, + "grad_norm": 0.16428206861019135, + "learning_rate": 0.0002129895472286767, + "loss": 2.0216, + "step": 279770 + }, + { + "epoch": 1.0649117331364235, + "grad_norm": 0.14325910806655884, + "learning_rate": 0.0002129169690374113, + "loss": 2.018, + "step": 279780 + }, + { + "epoch": 1.0649497956045462, + "grad_norm": 0.1777109056711197, + "learning_rate": 0.000212844409190186, + "loss": 2.0177, + "step": 279790 + }, + { + "epoch": 1.0649878580726688, + "grad_norm": 0.13226689398288727, + "learning_rate": 0.00021277186767309857, + "loss": 2.0101, + "step": 279800 + }, + { + "epoch": 1.0650259205407915, + "grad_norm": 0.15519066154956818, + "learning_rate": 0.0002126993444722643, + "loss": 2.0159, + "step": 279810 + }, + { + "epoch": 1.0650639830089141, + "grad_norm": 0.13406550884246826, + "learning_rate": 0.00021262683957381595, + "loss": 2.0193, + "step": 279820 + }, + { + "epoch": 1.0651020454770368, + "grad_norm": 0.14426659047603607, + "learning_rate": 0.0002125543529639038, + "loss": 2.0187, + "step": 279830 + }, + { + "epoch": 1.0651401079451597, + "grad_norm": 0.12878841161727905, + "learning_rate": 0.00021248188462869567, + "loss": 2.012, + "step": 279840 + }, + { + "epoch": 1.0651781704132823, + "grad_norm": 0.1430235654115677, + "learning_rate": 0.00021240943455437671, + "loss": 2.0205, + "step": 279850 + }, + { + "epoch": 1.065216232881405, + "grad_norm": 0.16837656497955322, + "learning_rate": 0.0002123370027271495, + "loss": 2.0245, + "step": 279860 + }, + { + "epoch": 1.0652542953495276, + "grad_norm": 0.17101293802261353, + "learning_rate": 0.000212264589133234, + "loss": 2.007, + "step": 279870 + }, + { + "epoch": 1.0652923578176503, + "grad_norm": 0.14210744202136993, + "learning_rate": 0.0002121921937588674, + "loss": 2.0122, + "step": 279880 + }, + { + "epoch": 1.065330420285773, + "grad_norm": 0.13525697588920593, + "learning_rate": 0.0002121198165903044, + "loss": 2.0233, + "step": 279890 + }, + { + "epoch": 1.0653684827538956, + "grad_norm": 0.15535393357276917, + "learning_rate": 0.00021204745761381672, + "loss": 2.0213, + "step": 279900 + }, + { + "epoch": 1.0654065452220183, + "grad_norm": 0.13685598969459534, + "learning_rate": 0.00021197511681569364, + "loss": 2.0258, + "step": 279910 + }, + { + "epoch": 1.0654446076901412, + "grad_norm": 0.14752687513828278, + "learning_rate": 0.00021190279418224135, + "loss": 2.0152, + "step": 279920 + }, + { + "epoch": 1.0654826701582638, + "grad_norm": 0.15611529350280762, + "learning_rate": 0.00021183048969978336, + "loss": 2.012, + "step": 279930 + }, + { + "epoch": 1.0655207326263865, + "grad_norm": 0.15216854214668274, + "learning_rate": 0.00021175820335466038, + "loss": 2.022, + "step": 279940 + }, + { + "epoch": 1.0655587950945091, + "grad_norm": 0.14078018069267273, + "learning_rate": 0.00021168593513323008, + "loss": 2.0187, + "step": 279950 + }, + { + "epoch": 1.0655968575626318, + "grad_norm": 0.14143924415111542, + "learning_rate": 0.00021161368502186745, + "loss": 2.0227, + "step": 279960 + }, + { + "epoch": 1.0656349200307544, + "grad_norm": 0.1464422047138214, + "learning_rate": 0.00021154145300696442, + "loss": 2.0351, + "step": 279970 + }, + { + "epoch": 1.065672982498877, + "grad_norm": 0.16325105726718903, + "learning_rate": 0.00021146923907492982, + "loss": 2.023, + "step": 279980 + }, + { + "epoch": 1.0657110449669998, + "grad_norm": 0.1634446084499359, + "learning_rate": 0.0002113970432121898, + "loss": 2.0239, + "step": 279990 + }, + { + "epoch": 1.0657491074351224, + "grad_norm": 0.1478370875120163, + "learning_rate": 0.00021132486540518714, + "loss": 2.0105, + "step": 280000 + }, + { + "epoch": 1.0657871699032453, + "grad_norm": 0.17000077664852142, + "learning_rate": 0.00021125270564038178, + "loss": 2.0199, + "step": 280010 + }, + { + "epoch": 1.065825232371368, + "grad_norm": 0.17442534863948822, + "learning_rate": 0.0002111805639042506, + "loss": 2.0216, + "step": 280020 + }, + { + "epoch": 1.0658632948394906, + "grad_norm": 0.15885937213897705, + "learning_rate": 0.00021110844018328723, + "loss": 2.0169, + "step": 280030 + }, + { + "epoch": 1.0659013573076133, + "grad_norm": 0.13275925815105438, + "learning_rate": 0.00021103633446400223, + "loss": 2.0261, + "step": 280040 + }, + { + "epoch": 1.065939419775736, + "grad_norm": 0.13854673504829407, + "learning_rate": 0.00021096424673292292, + "loss": 2.0376, + "step": 280050 + }, + { + "epoch": 1.0659774822438586, + "grad_norm": 0.1444002389907837, + "learning_rate": 0.00021089217697659352, + "loss": 2.0045, + "step": 280060 + }, + { + "epoch": 1.0660155447119812, + "grad_norm": 0.1577683538198471, + "learning_rate": 0.0002108201251815749, + "loss": 2.0062, + "step": 280070 + }, + { + "epoch": 1.066053607180104, + "grad_norm": 0.21680620312690735, + "learning_rate": 0.00021074809133444483, + "loss": 2.0092, + "step": 280080 + }, + { + "epoch": 1.0660916696482268, + "grad_norm": 0.134336456656456, + "learning_rate": 0.00021067607542179763, + "loss": 2.0295, + "step": 280090 + }, + { + "epoch": 1.0661297321163494, + "grad_norm": 0.13651613891124725, + "learning_rate": 0.00021060407743024435, + "loss": 2.0166, + "step": 280100 + }, + { + "epoch": 1.066167794584472, + "grad_norm": 0.1491308957338333, + "learning_rate": 0.00021053209734641276, + "loss": 2.0164, + "step": 280110 + }, + { + "epoch": 1.0662058570525947, + "grad_norm": 0.1435723751783371, + "learning_rate": 0.00021046013515694712, + "loss": 2.0196, + "step": 280120 + }, + { + "epoch": 1.0662439195207174, + "grad_norm": 0.16801053285598755, + "learning_rate": 0.0002103881908485084, + "loss": 2.0368, + "step": 280130 + }, + { + "epoch": 1.06628198198884, + "grad_norm": 0.15588593482971191, + "learning_rate": 0.00021031626440777407, + "loss": 2.0134, + "step": 280140 + }, + { + "epoch": 1.0663200444569627, + "grad_norm": 0.14381839334964752, + "learning_rate": 0.0002102443558214382, + "loss": 2.0138, + "step": 280150 + }, + { + "epoch": 1.0663581069250854, + "grad_norm": 0.14546221494674683, + "learning_rate": 0.00021017246507621125, + "loss": 2.0079, + "step": 280160 + }, + { + "epoch": 1.066396169393208, + "grad_norm": 0.13942362368106842, + "learning_rate": 0.00021010059215882028, + "loss": 2.0184, + "step": 280170 + }, + { + "epoch": 1.066434231861331, + "grad_norm": 0.13316209614276886, + "learning_rate": 0.00021002873705600872, + "loss": 2.0044, + "step": 280180 + }, + { + "epoch": 1.0664722943294536, + "grad_norm": 0.12746325135231018, + "learning_rate": 0.00020995689975453647, + "loss": 2.0146, + "step": 280190 + }, + { + "epoch": 1.0665103567975762, + "grad_norm": 0.136347234249115, + "learning_rate": 0.00020988508024117982, + "loss": 2.0213, + "step": 280200 + }, + { + "epoch": 1.0665484192656989, + "grad_norm": 0.13976691663265228, + "learning_rate": 0.00020981327850273142, + "loss": 2.024, + "step": 280210 + }, + { + "epoch": 1.0665864817338215, + "grad_norm": 0.1449359655380249, + "learning_rate": 0.00020974149452600017, + "loss": 2.0065, + "step": 280220 + }, + { + "epoch": 1.0666245442019442, + "grad_norm": 0.14353162050247192, + "learning_rate": 0.00020966972829781138, + "loss": 2.0173, + "step": 280230 + }, + { + "epoch": 1.0666626066700668, + "grad_norm": 0.14832115173339844, + "learning_rate": 0.00020959797980500667, + "loss": 2.0146, + "step": 280240 + }, + { + "epoch": 1.0667006691381895, + "grad_norm": 0.16132640838623047, + "learning_rate": 0.00020952624903444373, + "loss": 2.0172, + "step": 280250 + }, + { + "epoch": 1.0667387316063124, + "grad_norm": 0.16305841505527496, + "learning_rate": 0.00020945453597299667, + "loss": 2.0124, + "step": 280260 + }, + { + "epoch": 1.066776794074435, + "grad_norm": 0.15650595724582672, + "learning_rate": 0.0002093828406075558, + "loss": 2.0242, + "step": 280270 + }, + { + "epoch": 1.0668148565425577, + "grad_norm": 0.13507379591464996, + "learning_rate": 0.00020931116292502734, + "loss": 2.0231, + "step": 280280 + }, + { + "epoch": 1.0668529190106804, + "grad_norm": 0.12661010026931763, + "learning_rate": 0.00020923950291233396, + "loss": 2.0292, + "step": 280290 + }, + { + "epoch": 1.066890981478803, + "grad_norm": 0.18024198710918427, + "learning_rate": 0.00020916786055641423, + "loss": 2.0196, + "step": 280300 + }, + { + "epoch": 1.0669290439469257, + "grad_norm": 0.1324780285358429, + "learning_rate": 0.00020909623584422287, + "loss": 2.0223, + "step": 280310 + }, + { + "epoch": 1.0669671064150483, + "grad_norm": 0.16044573485851288, + "learning_rate": 0.00020902462876273077, + "loss": 2.0313, + "step": 280320 + }, + { + "epoch": 1.067005168883171, + "grad_norm": 0.1606072187423706, + "learning_rate": 0.00020895303929892463, + "loss": 2.0144, + "step": 280330 + }, + { + "epoch": 1.0670432313512936, + "grad_norm": 0.14759883284568787, + "learning_rate": 0.00020888146743980725, + "loss": 2.0261, + "step": 280340 + }, + { + "epoch": 1.0670812938194165, + "grad_norm": 0.1489844173192978, + "learning_rate": 0.0002088099131723975, + "loss": 2.0158, + "step": 280350 + }, + { + "epoch": 1.0671193562875392, + "grad_norm": 0.1644773781299591, + "learning_rate": 0.00020873837648373013, + "loss": 2.0165, + "step": 280360 + }, + { + "epoch": 1.0671574187556618, + "grad_norm": 0.14012792706489563, + "learning_rate": 0.00020866685736085566, + "loss": 2.014, + "step": 280370 + }, + { + "epoch": 1.0671954812237845, + "grad_norm": 0.14115259051322937, + "learning_rate": 0.00020859535579084078, + "loss": 2.0088, + "step": 280380 + }, + { + "epoch": 1.0672335436919071, + "grad_norm": 0.1442381888628006, + "learning_rate": 0.0002085238717607678, + "loss": 2.0138, + "step": 280390 + }, + { + "epoch": 1.0672716061600298, + "grad_norm": 0.1616717129945755, + "learning_rate": 0.00020845240525773495, + "loss": 2.0148, + "step": 280400 + }, + { + "epoch": 1.0673096686281525, + "grad_norm": 0.15273220837116241, + "learning_rate": 0.00020838095626885634, + "loss": 2.0208, + "step": 280410 + }, + { + "epoch": 1.0673477310962751, + "grad_norm": 0.13198786973953247, + "learning_rate": 0.0002083095247812618, + "loss": 2.017, + "step": 280420 + }, + { + "epoch": 1.067385793564398, + "grad_norm": 0.16148214042186737, + "learning_rate": 0.00020823811078209681, + "loss": 2.0236, + "step": 280430 + }, + { + "epoch": 1.0674238560325207, + "grad_norm": 0.1520000398159027, + "learning_rate": 0.00020816671425852285, + "loss": 2.016, + "step": 280440 + }, + { + "epoch": 1.0674619185006433, + "grad_norm": 0.1314682811498642, + "learning_rate": 0.0002080953351977168, + "loss": 2.0105, + "step": 280450 + }, + { + "epoch": 1.067499980968766, + "grad_norm": 0.15124158561229706, + "learning_rate": 0.00020802397358687136, + "loss": 2.0239, + "step": 280460 + }, + { + "epoch": 1.0675380434368886, + "grad_norm": 0.1314118504524231, + "learning_rate": 0.00020795262941319492, + "loss": 2.0265, + "step": 280470 + }, + { + "epoch": 1.0675761059050113, + "grad_norm": 0.14328762888908386, + "learning_rate": 0.00020788130266391141, + "loss": 2.0218, + "step": 280480 + }, + { + "epoch": 1.067614168373134, + "grad_norm": 0.14062193036079407, + "learning_rate": 0.00020780999332626037, + "loss": 2.0277, + "step": 280490 + }, + { + "epoch": 1.0676522308412566, + "grad_norm": 0.16746476292610168, + "learning_rate": 0.00020773870138749695, + "loss": 2.0281, + "step": 280500 + }, + { + "epoch": 1.0676902933093793, + "grad_norm": 0.1793404072523117, + "learning_rate": 0.00020766742683489182, + "loss": 2.0281, + "step": 280510 + }, + { + "epoch": 1.0677283557775021, + "grad_norm": 0.13390576839447021, + "learning_rate": 0.00020759616965573108, + "loss": 2.0243, + "step": 280520 + }, + { + "epoch": 1.0677664182456248, + "grad_norm": 0.1407162994146347, + "learning_rate": 0.00020752492983731646, + "loss": 2.0249, + "step": 280530 + }, + { + "epoch": 1.0678044807137475, + "grad_norm": 0.1637965440750122, + "learning_rate": 0.00020745370736696507, + "loss": 2.0256, + "step": 280540 + }, + { + "epoch": 1.06784254318187, + "grad_norm": 0.1503283828496933, + "learning_rate": 0.00020738250223200939, + "loss": 2.0117, + "step": 280550 + }, + { + "epoch": 1.0678806056499928, + "grad_norm": 0.1429990977048874, + "learning_rate": 0.00020731131441979744, + "loss": 2.0321, + "step": 280560 + }, + { + "epoch": 1.0679186681181154, + "grad_norm": 0.15503156185150146, + "learning_rate": 0.00020724014391769262, + "loss": 2.0142, + "step": 280570 + }, + { + "epoch": 1.067956730586238, + "grad_norm": 0.1364317238330841, + "learning_rate": 0.00020716899071307354, + "loss": 2.0115, + "step": 280580 + }, + { + "epoch": 1.0679947930543607, + "grad_norm": 0.17652592062950134, + "learning_rate": 0.0002070978547933343, + "loss": 2.0062, + "step": 280590 + }, + { + "epoch": 1.0680328555224836, + "grad_norm": 0.15647253394126892, + "learning_rate": 0.00020702673614588423, + "loss": 2.0302, + "step": 280600 + }, + { + "epoch": 1.0680709179906063, + "grad_norm": 0.14261257648468018, + "learning_rate": 0.00020695563475814794, + "loss": 2.0226, + "step": 280610 + }, + { + "epoch": 1.068108980458729, + "grad_norm": 0.1298782229423523, + "learning_rate": 0.00020688455061756522, + "loss": 2.0295, + "step": 280620 + }, + { + "epoch": 1.0681470429268516, + "grad_norm": 0.18369466066360474, + "learning_rate": 0.00020681348371159132, + "loss": 2.013, + "step": 280630 + }, + { + "epoch": 1.0681851053949742, + "grad_norm": 0.14287227392196655, + "learning_rate": 0.00020674243402769643, + "loss": 2.0163, + "step": 280640 + }, + { + "epoch": 1.068223167863097, + "grad_norm": 0.1325698047876358, + "learning_rate": 0.000206671401553366, + "loss": 2.0251, + "step": 280650 + }, + { + "epoch": 1.0682612303312196, + "grad_norm": 0.1377692073583603, + "learning_rate": 0.00020660038627610066, + "loss": 2.0164, + "step": 280660 + }, + { + "epoch": 1.0682992927993422, + "grad_norm": 0.14473956823349, + "learning_rate": 0.00020652938818341627, + "loss": 2.0188, + "step": 280670 + }, + { + "epoch": 1.0683373552674649, + "grad_norm": 0.14381441473960876, + "learning_rate": 0.00020645840726284348, + "loss": 2.0155, + "step": 280680 + }, + { + "epoch": 1.0683754177355875, + "grad_norm": 0.18615978956222534, + "learning_rate": 0.00020638744350192834, + "loss": 2.0263, + "step": 280690 + }, + { + "epoch": 1.0684134802037104, + "grad_norm": 0.14097805321216583, + "learning_rate": 0.00020631649688823174, + "loss": 2.0228, + "step": 280700 + }, + { + "epoch": 1.068451542671833, + "grad_norm": 0.14481240510940552, + "learning_rate": 0.0002062455674093297, + "loss": 2.025, + "step": 280710 + }, + { + "epoch": 1.0684896051399557, + "grad_norm": 0.13994210958480835, + "learning_rate": 0.00020617465505281318, + "loss": 2.0091, + "step": 280720 + }, + { + "epoch": 1.0685276676080784, + "grad_norm": 0.17684991657733917, + "learning_rate": 0.0002061037598062881, + "loss": 2.0189, + "step": 280730 + }, + { + "epoch": 1.068565730076201, + "grad_norm": 0.16578590869903564, + "learning_rate": 0.00020603288165737537, + "loss": 2.0175, + "step": 280740 + }, + { + "epoch": 1.0686037925443237, + "grad_norm": 0.1451704055070877, + "learning_rate": 0.00020596202059371084, + "loss": 2.0248, + "step": 280750 + }, + { + "epoch": 1.0686418550124464, + "grad_norm": 0.15710540115833282, + "learning_rate": 0.00020589117660294514, + "loss": 2.0152, + "step": 280760 + }, + { + "epoch": 1.068679917480569, + "grad_norm": 0.16066260635852814, + "learning_rate": 0.0002058203496727439, + "loss": 2.028, + "step": 280770 + }, + { + "epoch": 1.0687179799486919, + "grad_norm": 0.13257549703121185, + "learning_rate": 0.00020574953979078753, + "loss": 2.0165, + "step": 280780 + }, + { + "epoch": 1.0687560424168145, + "grad_norm": 0.14641395211219788, + "learning_rate": 0.0002056787469447713, + "loss": 2.0122, + "step": 280790 + }, + { + "epoch": 1.0687941048849372, + "grad_norm": 0.13742470741271973, + "learning_rate": 0.00020560797112240508, + "loss": 2.0188, + "step": 280800 + }, + { + "epoch": 1.0688321673530599, + "grad_norm": 0.18226590752601624, + "learning_rate": 0.0002055372123114139, + "loss": 2.0338, + "step": 280810 + }, + { + "epoch": 1.0688702298211825, + "grad_norm": 0.13812443614006042, + "learning_rate": 0.0002054664704995372, + "loss": 2.0412, + "step": 280820 + }, + { + "epoch": 1.0689082922893052, + "grad_norm": 0.20717187225818634, + "learning_rate": 0.00020539574567452924, + "loss": 2.018, + "step": 280830 + }, + { + "epoch": 1.0689463547574278, + "grad_norm": 0.14729878306388855, + "learning_rate": 0.00020532503782415902, + "loss": 2.0214, + "step": 280840 + }, + { + "epoch": 1.0689844172255505, + "grad_norm": 0.14876174926757812, + "learning_rate": 0.00020525434693621008, + "loss": 2.001, + "step": 280850 + }, + { + "epoch": 1.0690224796936731, + "grad_norm": 0.14767611026763916, + "learning_rate": 0.00020518367299848084, + "loss": 2.0185, + "step": 280860 + }, + { + "epoch": 1.069060542161796, + "grad_norm": 0.13235405087471008, + "learning_rate": 0.00020511301599878418, + "loss": 2.0158, + "step": 280870 + }, + { + "epoch": 1.0690986046299187, + "grad_norm": 0.13786683976650238, + "learning_rate": 0.0002050423759249475, + "loss": 2.0356, + "step": 280880 + }, + { + "epoch": 1.0691366670980413, + "grad_norm": 0.13260993361473083, + "learning_rate": 0.00020497175276481294, + "loss": 2.0064, + "step": 280890 + }, + { + "epoch": 1.069174729566164, + "grad_norm": 0.13280153274536133, + "learning_rate": 0.00020490114650623708, + "loss": 2.0152, + "step": 280900 + }, + { + "epoch": 1.0692127920342867, + "grad_norm": 0.15440459549427032, + "learning_rate": 0.0002048305571370912, + "loss": 2.0104, + "step": 280910 + }, + { + "epoch": 1.0692508545024093, + "grad_norm": 0.13955621421337128, + "learning_rate": 0.00020475998464526075, + "loss": 2.0256, + "step": 280920 + }, + { + "epoch": 1.069288916970532, + "grad_norm": 0.15699417889118195, + "learning_rate": 0.00020468942901864594, + "loss": 2.0169, + "step": 280930 + }, + { + "epoch": 1.0693269794386546, + "grad_norm": 0.1380460411310196, + "learning_rate": 0.00020461889024516144, + "loss": 2.0085, + "step": 280940 + }, + { + "epoch": 1.0693650419067775, + "grad_norm": 0.13854175806045532, + "learning_rate": 0.00020454836831273606, + "loss": 2.0236, + "step": 280950 + }, + { + "epoch": 1.0694031043749002, + "grad_norm": 0.17974607646465302, + "learning_rate": 0.00020447786320931333, + "loss": 2.0237, + "step": 280960 + }, + { + "epoch": 1.0694411668430228, + "grad_norm": 0.13895480334758759, + "learning_rate": 0.000204407374922851, + "loss": 2.0189, + "step": 280970 + }, + { + "epoch": 1.0694792293111455, + "grad_norm": 0.12747804820537567, + "learning_rate": 0.0002043369034413211, + "loss": 2.0098, + "step": 280980 + }, + { + "epoch": 1.0695172917792681, + "grad_norm": 0.1594598889350891, + "learning_rate": 0.00020426644875271027, + "loss": 2.0271, + "step": 280990 + }, + { + "epoch": 1.0695553542473908, + "grad_norm": 0.15190567076206207, + "learning_rate": 0.0002041960108450192, + "loss": 2.0107, + "step": 281000 + }, + { + "epoch": 1.0695934167155134, + "grad_norm": 0.19501842558383942, + "learning_rate": 0.00020412558970626295, + "loss": 2.0048, + "step": 281010 + }, + { + "epoch": 1.069631479183636, + "grad_norm": 0.12260035425424576, + "learning_rate": 0.00020405518532447083, + "loss": 2.009, + "step": 281020 + }, + { + "epoch": 1.0696695416517588, + "grad_norm": 0.13547752797603607, + "learning_rate": 0.00020398479768768635, + "loss": 2.0087, + "step": 281030 + }, + { + "epoch": 1.0697076041198816, + "grad_norm": 0.13921548426151276, + "learning_rate": 0.00020391442678396727, + "loss": 2.0026, + "step": 281040 + }, + { + "epoch": 1.0697456665880043, + "grad_norm": 0.14413045346736908, + "learning_rate": 0.00020384407260138566, + "loss": 2.0115, + "step": 281050 + }, + { + "epoch": 1.069783729056127, + "grad_norm": 0.13830119371414185, + "learning_rate": 0.00020377373512802755, + "loss": 2.0323, + "step": 281060 + }, + { + "epoch": 1.0698217915242496, + "grad_norm": 0.1393330693244934, + "learning_rate": 0.00020370341435199313, + "loss": 2.0317, + "step": 281070 + }, + { + "epoch": 1.0698598539923723, + "grad_norm": 0.13055545091629028, + "learning_rate": 0.0002036331102613969, + "loss": 2.0284, + "step": 281080 + }, + { + "epoch": 1.069897916460495, + "grad_norm": 0.14802104234695435, + "learning_rate": 0.00020356282284436728, + "loss": 2.013, + "step": 281090 + }, + { + "epoch": 1.0699359789286176, + "grad_norm": 0.1766086220741272, + "learning_rate": 0.0002034925520890467, + "loss": 2.034, + "step": 281100 + }, + { + "epoch": 1.0699740413967402, + "grad_norm": 0.13774430751800537, + "learning_rate": 0.00020342229798359196, + "loss": 2.0077, + "step": 281110 + }, + { + "epoch": 1.0700121038648631, + "grad_norm": 0.14388753473758698, + "learning_rate": 0.0002033520605161735, + "loss": 2.0194, + "step": 281120 + }, + { + "epoch": 1.0700501663329858, + "grad_norm": 0.16754965484142303, + "learning_rate": 0.000203281839674976, + "loss": 2.0278, + "step": 281130 + }, + { + "epoch": 1.0700882288011084, + "grad_norm": 0.1566379964351654, + "learning_rate": 0.00020321163544819798, + "loss": 2.02, + "step": 281140 + }, + { + "epoch": 1.070126291269231, + "grad_norm": 0.13745974004268646, + "learning_rate": 0.0002031414478240521, + "loss": 2.0253, + "step": 281150 + }, + { + "epoch": 1.0701643537373537, + "grad_norm": 0.1512146145105362, + "learning_rate": 0.00020307127679076471, + "loss": 2.0087, + "step": 281160 + }, + { + "epoch": 1.0702024162054764, + "grad_norm": 0.16319917142391205, + "learning_rate": 0.0002030011223365763, + "loss": 2.0168, + "step": 281170 + }, + { + "epoch": 1.070240478673599, + "grad_norm": 0.1592712253332138, + "learning_rate": 0.00020293098444974106, + "loss": 2.0189, + "step": 281180 + }, + { + "epoch": 1.0702785411417217, + "grad_norm": 0.189766988158226, + "learning_rate": 0.00020286086311852714, + "loss": 2.0047, + "step": 281190 + }, + { + "epoch": 1.0703166036098444, + "grad_norm": 0.13883762061595917, + "learning_rate": 0.0002027907583312165, + "loss": 2.0139, + "step": 281200 + }, + { + "epoch": 1.0703546660779673, + "grad_norm": 0.15338963270187378, + "learning_rate": 0.00020272067007610502, + "loss": 2.012, + "step": 281210 + }, + { + "epoch": 1.07039272854609, + "grad_norm": 0.14497676491737366, + "learning_rate": 0.00020265059834150218, + "loss": 2.0159, + "step": 281220 + }, + { + "epoch": 1.0704307910142126, + "grad_norm": 0.15736877918243408, + "learning_rate": 0.00020258054311573137, + "loss": 2.0107, + "step": 281230 + }, + { + "epoch": 1.0704688534823352, + "grad_norm": 0.15212887525558472, + "learning_rate": 0.0002025105043871297, + "loss": 2.0035, + "step": 281240 + }, + { + "epoch": 1.0705069159504579, + "grad_norm": 0.15456640720367432, + "learning_rate": 0.00020244048214404793, + "loss": 2.0036, + "step": 281250 + }, + { + "epoch": 1.0705449784185805, + "grad_norm": 0.13884684443473816, + "learning_rate": 0.00020237047637485067, + "loss": 2.0216, + "step": 281260 + }, + { + "epoch": 1.0705830408867032, + "grad_norm": 0.18412266671657562, + "learning_rate": 0.0002023004870679161, + "loss": 2.0221, + "step": 281270 + }, + { + "epoch": 1.0706211033548259, + "grad_norm": 0.14824584126472473, + "learning_rate": 0.00020223051421163607, + "loss": 1.9981, + "step": 281280 + }, + { + "epoch": 1.0706591658229487, + "grad_norm": 0.13820265233516693, + "learning_rate": 0.0002021605577944161, + "loss": 2.0035, + "step": 281290 + }, + { + "epoch": 1.0706972282910714, + "grad_norm": 0.14500044286251068, + "learning_rate": 0.0002020906178046754, + "loss": 2.0038, + "step": 281300 + }, + { + "epoch": 1.070735290759194, + "grad_norm": 0.14148664474487305, + "learning_rate": 0.00020202069423084652, + "loss": 2.0051, + "step": 281310 + }, + { + "epoch": 1.0707733532273167, + "grad_norm": 0.1406242847442627, + "learning_rate": 0.00020195078706137582, + "loss": 2.0071, + "step": 281320 + }, + { + "epoch": 1.0708114156954394, + "grad_norm": 0.12793296575546265, + "learning_rate": 0.00020188089628472318, + "loss": 2.0039, + "step": 281330 + }, + { + "epoch": 1.070849478163562, + "grad_norm": 0.13845571875572205, + "learning_rate": 0.00020181102188936178, + "loss": 2.0057, + "step": 281340 + }, + { + "epoch": 1.0708875406316847, + "grad_norm": 0.1830500364303589, + "learning_rate": 0.00020174116386377866, + "loss": 2.0158, + "step": 281350 + }, + { + "epoch": 1.0709256030998073, + "grad_norm": 0.15033838152885437, + "learning_rate": 0.00020167132219647404, + "loss": 2.0125, + "step": 281360 + }, + { + "epoch": 1.07096366556793, + "grad_norm": 0.1378006786108017, + "learning_rate": 0.00020160149687596174, + "loss": 2.0167, + "step": 281370 + }, + { + "epoch": 1.0710017280360529, + "grad_norm": 0.15929892659187317, + "learning_rate": 0.00020153168789076897, + "loss": 2.0034, + "step": 281380 + }, + { + "epoch": 1.0710397905041755, + "grad_norm": 0.14839771389961243, + "learning_rate": 0.00020146189522943643, + "loss": 2.0286, + "step": 281390 + }, + { + "epoch": 1.0710778529722982, + "grad_norm": 0.13341379165649414, + "learning_rate": 0.00020139211888051806, + "loss": 2.0248, + "step": 281400 + }, + { + "epoch": 1.0711159154404208, + "grad_norm": 0.1480301469564438, + "learning_rate": 0.00020132235883258128, + "loss": 2.0037, + "step": 281410 + }, + { + "epoch": 1.0711539779085435, + "grad_norm": 0.15032023191452026, + "learning_rate": 0.0002012526150742069, + "loss": 2.0286, + "step": 281420 + }, + { + "epoch": 1.0711920403766662, + "grad_norm": 0.17209644615650177, + "learning_rate": 0.0002011828875939889, + "loss": 2.0223, + "step": 281430 + }, + { + "epoch": 1.0712301028447888, + "grad_norm": 0.1403706818819046, + "learning_rate": 0.00020111317638053471, + "loss": 2.0093, + "step": 281440 + }, + { + "epoch": 1.0712681653129115, + "grad_norm": 0.13635705411434174, + "learning_rate": 0.00020104348142246502, + "loss": 2.0087, + "step": 281450 + }, + { + "epoch": 1.0713062277810343, + "grad_norm": 0.12491488456726074, + "learning_rate": 0.00020097380270841375, + "loss": 2.0211, + "step": 281460 + }, + { + "epoch": 1.071344290249157, + "grad_norm": 0.1397937834262848, + "learning_rate": 0.00020090414022702803, + "loss": 2.0289, + "step": 281470 + }, + { + "epoch": 1.0713823527172797, + "grad_norm": 0.1508331000804901, + "learning_rate": 0.00020083449396696818, + "loss": 2.02, + "step": 281480 + }, + { + "epoch": 1.0714204151854023, + "grad_norm": 0.15565991401672363, + "learning_rate": 0.00020076486391690785, + "loss": 2.0113, + "step": 281490 + }, + { + "epoch": 1.071458477653525, + "grad_norm": 0.13104164600372314, + "learning_rate": 0.0002006952500655338, + "loss": 2.0017, + "step": 281500 + }, + { + "epoch": 1.0714965401216476, + "grad_norm": 0.17238375544548035, + "learning_rate": 0.00020062565240154596, + "loss": 2.0247, + "step": 281510 + }, + { + "epoch": 1.0715346025897703, + "grad_norm": 0.1550111025571823, + "learning_rate": 0.00020055607091365725, + "loss": 2.0301, + "step": 281520 + }, + { + "epoch": 1.071572665057893, + "grad_norm": 0.13278517127037048, + "learning_rate": 0.0002004865055905939, + "loss": 2.0215, + "step": 281530 + }, + { + "epoch": 1.0716107275260156, + "grad_norm": 0.16525842249393463, + "learning_rate": 0.00020041695642109515, + "loss": 2.0218, + "step": 281540 + }, + { + "epoch": 1.0716487899941383, + "grad_norm": 0.13423489034175873, + "learning_rate": 0.00020034742339391327, + "loss": 2.0172, + "step": 281550 + }, + { + "epoch": 1.0716868524622611, + "grad_norm": 0.1912301927804947, + "learning_rate": 0.00020027790649781364, + "loss": 2.0127, + "step": 281560 + }, + { + "epoch": 1.0717249149303838, + "grad_norm": 0.13126081228256226, + "learning_rate": 0.00020020840572157467, + "loss": 2.0082, + "step": 281570 + }, + { + "epoch": 1.0717629773985065, + "grad_norm": 0.1419760137796402, + "learning_rate": 0.00020013892105398762, + "loss": 2.0213, + "step": 281580 + }, + { + "epoch": 1.0718010398666291, + "grad_norm": 0.182196706533432, + "learning_rate": 0.000200069452483857, + "loss": 2.0223, + "step": 281590 + }, + { + "epoch": 1.0718391023347518, + "grad_norm": 0.14496959745883942, + "learning_rate": 0.0002, + "loss": 2.03, + "step": 281600 + }, + { + "epoch": 1.0718771648028744, + "grad_norm": 0.14605028927326202, + "learning_rate": 0.00019993056359124701, + "loss": 2.0057, + "step": 281610 + }, + { + "epoch": 1.071915227270997, + "grad_norm": 0.13911451399326324, + "learning_rate": 0.00019986114324644112, + "loss": 2.0121, + "step": 281620 + }, + { + "epoch": 1.0719532897391197, + "grad_norm": 0.15131564438343048, + "learning_rate": 0.00019979173895443852, + "loss": 2.0049, + "step": 281630 + }, + { + "epoch": 1.0719913522072426, + "grad_norm": 0.140620157122612, + "learning_rate": 0.00019972235070410806, + "loss": 2.0164, + "step": 281640 + }, + { + "epoch": 1.0720294146753653, + "grad_norm": 0.15846246480941772, + "learning_rate": 0.00019965297848433168, + "loss": 2.0048, + "step": 281650 + }, + { + "epoch": 1.072067477143488, + "grad_norm": 0.14672231674194336, + "learning_rate": 0.00019958362228400395, + "loss": 2.0157, + "step": 281660 + }, + { + "epoch": 1.0721055396116106, + "grad_norm": 0.163814976811409, + "learning_rate": 0.00019951428209203243, + "loss": 2.0156, + "step": 281670 + }, + { + "epoch": 1.0721436020797332, + "grad_norm": 0.14089249074459076, + "learning_rate": 0.00019944495789733736, + "loss": 2.0251, + "step": 281680 + }, + { + "epoch": 1.072181664547856, + "grad_norm": 0.14137700200080872, + "learning_rate": 0.00019937564968885174, + "loss": 2.0075, + "step": 281690 + }, + { + "epoch": 1.0722197270159786, + "grad_norm": 0.15704695880413055, + "learning_rate": 0.00019930635745552138, + "loss": 2.003, + "step": 281700 + }, + { + "epoch": 1.0722577894841012, + "grad_norm": 0.16056190431118011, + "learning_rate": 0.00019923708118630492, + "loss": 2.0112, + "step": 281710 + }, + { + "epoch": 1.0722958519522239, + "grad_norm": 0.15353207290172577, + "learning_rate": 0.00019916782087017355, + "loss": 2.0058, + "step": 281720 + }, + { + "epoch": 1.0723339144203468, + "grad_norm": 0.15010952949523926, + "learning_rate": 0.00019909857649611114, + "loss": 2.0187, + "step": 281730 + }, + { + "epoch": 1.0723719768884694, + "grad_norm": 0.146628737449646, + "learning_rate": 0.00019902934805311447, + "loss": 2.011, + "step": 281740 + }, + { + "epoch": 1.072410039356592, + "grad_norm": 0.17401348054409027, + "learning_rate": 0.0001989601355301926, + "loss": 2.0135, + "step": 281750 + }, + { + "epoch": 1.0724481018247147, + "grad_norm": 0.1571890264749527, + "learning_rate": 0.0001988909389163676, + "loss": 2.005, + "step": 281760 + }, + { + "epoch": 1.0724861642928374, + "grad_norm": 0.13389791548252106, + "learning_rate": 0.0001988217582006739, + "loss": 2.0151, + "step": 281770 + }, + { + "epoch": 1.07252422676096, + "grad_norm": 0.14010360836982727, + "learning_rate": 0.00019875259337215863, + "loss": 2.0271, + "step": 281780 + }, + { + "epoch": 1.0725622892290827, + "grad_norm": 0.14111493527889252, + "learning_rate": 0.00019868344441988147, + "loss": 2.0033, + "step": 281790 + }, + { + "epoch": 1.0726003516972054, + "grad_norm": 0.13658498227596283, + "learning_rate": 0.0001986143113329146, + "loss": 2.0278, + "step": 281800 + }, + { + "epoch": 1.0726384141653282, + "grad_norm": 0.15208940207958221, + "learning_rate": 0.00019854519410034282, + "loss": 2.0158, + "step": 281810 + }, + { + "epoch": 1.072676476633451, + "grad_norm": 0.14470238983631134, + "learning_rate": 0.0001984760927112633, + "loss": 2.0137, + "step": 281820 + }, + { + "epoch": 1.0727145391015735, + "grad_norm": 0.13096201419830322, + "learning_rate": 0.00019840700715478593, + "loss": 2.0075, + "step": 281830 + }, + { + "epoch": 1.0727526015696962, + "grad_norm": 0.1504952311515808, + "learning_rate": 0.00019833793742003286, + "loss": 2.0197, + "step": 281840 + }, + { + "epoch": 1.0727906640378189, + "grad_norm": 0.14142200350761414, + "learning_rate": 0.0001982688834961388, + "loss": 2.0145, + "step": 281850 + }, + { + "epoch": 1.0728287265059415, + "grad_norm": 0.13171540200710297, + "learning_rate": 0.00019819984537225078, + "loss": 2.0184, + "step": 281860 + }, + { + "epoch": 1.0728667889740642, + "grad_norm": 0.12692947685718536, + "learning_rate": 0.00019813082303752838, + "loss": 2.0127, + "step": 281870 + }, + { + "epoch": 1.0729048514421868, + "grad_norm": 0.16185876727104187, + "learning_rate": 0.00019806181648114352, + "loss": 2.0118, + "step": 281880 + }, + { + "epoch": 1.0729429139103095, + "grad_norm": 0.15910275280475616, + "learning_rate": 0.0001979928256922804, + "loss": 2.008, + "step": 281890 + }, + { + "epoch": 1.0729809763784324, + "grad_norm": 0.13678741455078125, + "learning_rate": 0.0001979238506601357, + "loss": 2.0275, + "step": 281900 + }, + { + "epoch": 1.073019038846555, + "grad_norm": 0.17118968069553375, + "learning_rate": 0.00019785489137391838, + "loss": 2.0161, + "step": 281910 + }, + { + "epoch": 1.0730571013146777, + "grad_norm": 0.12950493395328522, + "learning_rate": 0.0001977859478228497, + "loss": 2.0096, + "step": 281920 + }, + { + "epoch": 1.0730951637828003, + "grad_norm": 0.13670261204242706, + "learning_rate": 0.0001977170199961632, + "loss": 2.0194, + "step": 281930 + }, + { + "epoch": 1.073133226250923, + "grad_norm": 0.1519925445318222, + "learning_rate": 0.00019764810788310477, + "loss": 2.006, + "step": 281940 + }, + { + "epoch": 1.0731712887190457, + "grad_norm": 0.16093097627162933, + "learning_rate": 0.0001975792114729324, + "loss": 1.9946, + "step": 281950 + }, + { + "epoch": 1.0732093511871683, + "grad_norm": 0.14991694688796997, + "learning_rate": 0.00019751033075491654, + "loss": 2.0128, + "step": 281960 + }, + { + "epoch": 1.073247413655291, + "grad_norm": 0.14340434968471527, + "learning_rate": 0.0001974414657183396, + "loss": 2.0325, + "step": 281970 + }, + { + "epoch": 1.0732854761234139, + "grad_norm": 0.14055612683296204, + "learning_rate": 0.00019737261635249638, + "loss": 2.0073, + "step": 281980 + }, + { + "epoch": 1.0733235385915365, + "grad_norm": 0.1411522477865219, + "learning_rate": 0.0001973037826466938, + "loss": 2.005, + "step": 281990 + }, + { + "epoch": 1.0733616010596592, + "grad_norm": 0.17120565474033356, + "learning_rate": 0.00019723496459025087, + "loss": 2.0129, + "step": 282000 + }, + { + "epoch": 1.0733996635277818, + "grad_norm": 0.13336136937141418, + "learning_rate": 0.0001971661621724987, + "loss": 2.0178, + "step": 282010 + }, + { + "epoch": 1.0734377259959045, + "grad_norm": 0.16977474093437195, + "learning_rate": 0.00019709737538278082, + "loss": 2.0231, + "step": 282020 + }, + { + "epoch": 1.0734757884640271, + "grad_norm": 0.14384149014949799, + "learning_rate": 0.0001970286042104525, + "loss": 2.0212, + "step": 282030 + }, + { + "epoch": 1.0735138509321498, + "grad_norm": 0.1479279100894928, + "learning_rate": 0.00019695984864488114, + "loss": 1.9961, + "step": 282040 + }, + { + "epoch": 1.0735519134002725, + "grad_norm": 0.1393909901380539, + "learning_rate": 0.00019689110867544647, + "loss": 2.0178, + "step": 282050 + }, + { + "epoch": 1.073589975868395, + "grad_norm": 0.1390264630317688, + "learning_rate": 0.00019682238429153998, + "loss": 2.0209, + "step": 282060 + }, + { + "epoch": 1.073628038336518, + "grad_norm": 0.17246313393115997, + "learning_rate": 0.00019675367548256529, + "loss": 2.004, + "step": 282070 + }, + { + "epoch": 1.0736661008046406, + "grad_norm": 0.1607477217912674, + "learning_rate": 0.00019668498223793796, + "loss": 2.0053, + "step": 282080 + }, + { + "epoch": 1.0737041632727633, + "grad_norm": 0.14993655681610107, + "learning_rate": 0.00019661630454708567, + "loss": 2.013, + "step": 282090 + }, + { + "epoch": 1.073742225740886, + "grad_norm": 0.16224703192710876, + "learning_rate": 0.00019654764239944794, + "loss": 2.0218, + "step": 282100 + }, + { + "epoch": 1.0737802882090086, + "grad_norm": 0.16363425552845, + "learning_rate": 0.00019647899578447626, + "loss": 2.0246, + "step": 282110 + }, + { + "epoch": 1.0738183506771313, + "grad_norm": 0.15580902993679047, + "learning_rate": 0.00019641036469163404, + "loss": 2.0254, + "step": 282120 + }, + { + "epoch": 1.073856413145254, + "grad_norm": 0.1360892504453659, + "learning_rate": 0.0001963417491103966, + "loss": 2.0202, + "step": 282130 + }, + { + "epoch": 1.0738944756133766, + "grad_norm": 0.14252018928527832, + "learning_rate": 0.00019627314903025118, + "loss": 2.0144, + "step": 282140 + }, + { + "epoch": 1.0739325380814995, + "grad_norm": 0.13783758878707886, + "learning_rate": 0.00019620456444069696, + "loss": 2.0257, + "step": 282150 + }, + { + "epoch": 1.0739706005496221, + "grad_norm": 0.16759249567985535, + "learning_rate": 0.0001961359953312447, + "loss": 2.0279, + "step": 282160 + }, + { + "epoch": 1.0740086630177448, + "grad_norm": 0.14761881530284882, + "learning_rate": 0.00019606744169141732, + "loss": 2.0225, + "step": 282170 + }, + { + "epoch": 1.0740467254858674, + "grad_norm": 0.15070804953575134, + "learning_rate": 0.0001959989035107494, + "loss": 2.0072, + "step": 282180 + }, + { + "epoch": 1.07408478795399, + "grad_norm": 0.1455344706773758, + "learning_rate": 0.00019593038077878722, + "loss": 2.0093, + "step": 282190 + }, + { + "epoch": 1.0741228504221128, + "grad_norm": 0.14780759811401367, + "learning_rate": 0.00019586187348508906, + "loss": 2.0141, + "step": 282200 + }, + { + "epoch": 1.0741609128902354, + "grad_norm": 0.1355655938386917, + "learning_rate": 0.00019579338161922473, + "loss": 2.0128, + "step": 282210 + }, + { + "epoch": 1.074198975358358, + "grad_norm": 0.13392916321754456, + "learning_rate": 0.00019572490517077586, + "loss": 2.0172, + "step": 282220 + }, + { + "epoch": 1.0742370378264807, + "grad_norm": 0.13204143941402435, + "learning_rate": 0.00019565644412933598, + "loss": 2.0208, + "step": 282230 + }, + { + "epoch": 1.0742751002946036, + "grad_norm": 0.15992946922779083, + "learning_rate": 0.00019558799848451002, + "loss": 2.022, + "step": 282240 + }, + { + "epoch": 1.0743131627627263, + "grad_norm": 0.15534865856170654, + "learning_rate": 0.00019551956822591482, + "loss": 2.0222, + "step": 282250 + }, + { + "epoch": 1.074351225230849, + "grad_norm": 0.16686315834522247, + "learning_rate": 0.0001954511533431788, + "loss": 2.0182, + "step": 282260 + }, + { + "epoch": 1.0743892876989716, + "grad_norm": 0.12937520444393158, + "learning_rate": 0.000195382753825942, + "loss": 2.0121, + "step": 282270 + }, + { + "epoch": 1.0744273501670942, + "grad_norm": 0.15132172405719757, + "learning_rate": 0.00019531436966385607, + "loss": 2.0174, + "step": 282280 + }, + { + "epoch": 1.0744654126352169, + "grad_norm": 0.1341206282377243, + "learning_rate": 0.00019524600084658446, + "loss": 2.0161, + "step": 282290 + }, + { + "epoch": 1.0745034751033395, + "grad_norm": 0.13307468593120575, + "learning_rate": 0.00019517764736380201, + "loss": 2.0128, + "step": 282300 + }, + { + "epoch": 1.0745415375714622, + "grad_norm": 0.13326209783554077, + "learning_rate": 0.0001951093092051951, + "loss": 2.0153, + "step": 282310 + }, + { + "epoch": 1.074579600039585, + "grad_norm": 0.1382444202899933, + "learning_rate": 0.00019504098636046187, + "loss": 2.0198, + "step": 282320 + }, + { + "epoch": 1.0746176625077077, + "grad_norm": 0.1539696604013443, + "learning_rate": 0.00019497267881931192, + "loss": 2.0149, + "step": 282330 + }, + { + "epoch": 1.0746557249758304, + "grad_norm": 0.14765378832817078, + "learning_rate": 0.0001949043865714662, + "loss": 2.0126, + "step": 282340 + }, + { + "epoch": 1.074693787443953, + "grad_norm": 0.1588527411222458, + "learning_rate": 0.00019483610960665743, + "loss": 2.0209, + "step": 282350 + }, + { + "epoch": 1.0747318499120757, + "grad_norm": 0.1634843647480011, + "learning_rate": 0.00019476784791462964, + "loss": 2.0097, + "step": 282360 + }, + { + "epoch": 1.0747699123801984, + "grad_norm": 0.1361679583787918, + "learning_rate": 0.0001946996014851384, + "loss": 2.008, + "step": 282370 + }, + { + "epoch": 1.074807974848321, + "grad_norm": 0.14792568981647491, + "learning_rate": 0.00019463137030795058, + "loss": 2.025, + "step": 282380 + }, + { + "epoch": 1.0748460373164437, + "grad_norm": 0.135822132229805, + "learning_rate": 0.00019456315437284478, + "loss": 2.0182, + "step": 282390 + }, + { + "epoch": 1.0748840997845663, + "grad_norm": 0.17058981955051422, + "learning_rate": 0.00019449495366961067, + "loss": 2.0061, + "step": 282400 + }, + { + "epoch": 1.0749221622526892, + "grad_norm": 0.16827578842639923, + "learning_rate": 0.00019442676818804956, + "loss": 2.0218, + "step": 282410 + }, + { + "epoch": 1.0749602247208119, + "grad_norm": 0.1305459439754486, + "learning_rate": 0.00019435859791797406, + "loss": 2.0085, + "step": 282420 + }, + { + "epoch": 1.0749982871889345, + "grad_norm": 0.17050224542617798, + "learning_rate": 0.00019429044284920805, + "loss": 2.0145, + "step": 282430 + }, + { + "epoch": 1.0750363496570572, + "grad_norm": 0.15739241242408752, + "learning_rate": 0.0001942223029715869, + "loss": 2.0225, + "step": 282440 + }, + { + "epoch": 1.0750744121251798, + "grad_norm": 0.15171968936920166, + "learning_rate": 0.00019415417827495717, + "loss": 2.0333, + "step": 282450 + }, + { + "epoch": 1.0751124745933025, + "grad_norm": 0.1353382021188736, + "learning_rate": 0.0001940860687491769, + "loss": 2.0259, + "step": 282460 + }, + { + "epoch": 1.0751505370614252, + "grad_norm": 0.15228162705898285, + "learning_rate": 0.00019401797438411516, + "loss": 2.0067, + "step": 282470 + }, + { + "epoch": 1.0751885995295478, + "grad_norm": 0.17352250218391418, + "learning_rate": 0.00019394989516965255, + "loss": 2.0035, + "step": 282480 + }, + { + "epoch": 1.0752266619976705, + "grad_norm": 0.1305777132511139, + "learning_rate": 0.00019388183109568074, + "loss": 2.0188, + "step": 282490 + }, + { + "epoch": 1.0752647244657934, + "grad_norm": 0.1270778477191925, + "learning_rate": 0.00019381378215210278, + "loss": 2.0177, + "step": 282500 + }, + { + "epoch": 1.075302786933916, + "grad_norm": 0.1552201211452484, + "learning_rate": 0.00019374574832883273, + "loss": 2.0108, + "step": 282510 + }, + { + "epoch": 1.0753408494020387, + "grad_norm": 0.1422547698020935, + "learning_rate": 0.00019367772961579604, + "loss": 2.0139, + "step": 282520 + }, + { + "epoch": 1.0753789118701613, + "grad_norm": 0.18762005865573883, + "learning_rate": 0.00019360972600292943, + "loss": 2.0168, + "step": 282530 + }, + { + "epoch": 1.075416974338284, + "grad_norm": 0.17483144998550415, + "learning_rate": 0.0001935417374801804, + "loss": 2.0074, + "step": 282540 + }, + { + "epoch": 1.0754550368064066, + "grad_norm": 0.14014564454555511, + "learning_rate": 0.00019347376403750798, + "loss": 2.0011, + "step": 282550 + }, + { + "epoch": 1.0754930992745293, + "grad_norm": 0.14157618582248688, + "learning_rate": 0.0001934058056648822, + "loss": 2.0045, + "step": 282560 + }, + { + "epoch": 1.075531161742652, + "grad_norm": 0.17108173668384552, + "learning_rate": 0.00019333786235228406, + "loss": 2.0105, + "step": 282570 + }, + { + "epoch": 1.0755692242107746, + "grad_norm": 0.1399330347776413, + "learning_rate": 0.00019326993408970596, + "loss": 2.0253, + "step": 282580 + }, + { + "epoch": 1.0756072866788975, + "grad_norm": 0.2349340170621872, + "learning_rate": 0.00019320202086715112, + "loss": 2.0085, + "step": 282590 + }, + { + "epoch": 1.0756453491470201, + "grad_norm": 0.20484134554862976, + "learning_rate": 0.00019313412267463388, + "loss": 1.9964, + "step": 282600 + }, + { + "epoch": 1.0756834116151428, + "grad_norm": 0.1534414142370224, + "learning_rate": 0.00019306623950217966, + "loss": 2.0127, + "step": 282610 + }, + { + "epoch": 1.0757214740832655, + "grad_norm": 0.15662942826747894, + "learning_rate": 0.00019299837133982496, + "loss": 2.0064, + "step": 282620 + }, + { + "epoch": 1.0757595365513881, + "grad_norm": 0.16546347737312317, + "learning_rate": 0.00019293051817761725, + "loss": 2.0043, + "step": 282630 + }, + { + "epoch": 1.0757975990195108, + "grad_norm": 0.16139911115169525, + "learning_rate": 0.00019286268000561486, + "loss": 2.0198, + "step": 282640 + }, + { + "epoch": 1.0758356614876334, + "grad_norm": 0.16876624524593353, + "learning_rate": 0.00019279485681388732, + "loss": 2.0223, + "step": 282650 + }, + { + "epoch": 1.075873723955756, + "grad_norm": 0.15508447587490082, + "learning_rate": 0.00019272704859251495, + "loss": 2.0048, + "step": 282660 + }, + { + "epoch": 1.075911786423879, + "grad_norm": 0.16199934482574463, + "learning_rate": 0.00019265925533158912, + "loss": 2.012, + "step": 282670 + }, + { + "epoch": 1.0759498488920016, + "grad_norm": 0.1381034553050995, + "learning_rate": 0.00019259147702121204, + "loss": 1.9981, + "step": 282680 + }, + { + "epoch": 1.0759879113601243, + "grad_norm": 0.18051622807979584, + "learning_rate": 0.0001925237136514969, + "loss": 2.0134, + "step": 282690 + }, + { + "epoch": 1.076025973828247, + "grad_norm": 0.1292513757944107, + "learning_rate": 0.00019245596521256776, + "loss": 2.0076, + "step": 282700 + }, + { + "epoch": 1.0760640362963696, + "grad_norm": 0.14846284687519073, + "learning_rate": 0.0001923882316945595, + "loss": 2.017, + "step": 282710 + }, + { + "epoch": 1.0761020987644923, + "grad_norm": 0.15135519206523895, + "learning_rate": 0.000192320513087618, + "loss": 2.0073, + "step": 282720 + }, + { + "epoch": 1.076140161232615, + "grad_norm": 0.15053410828113556, + "learning_rate": 0.00019225280938189976, + "loss": 2.0245, + "step": 282730 + }, + { + "epoch": 1.0761782237007376, + "grad_norm": 0.1461486518383026, + "learning_rate": 0.0001921851205675723, + "loss": 2.0046, + "step": 282740 + }, + { + "epoch": 1.0762162861688602, + "grad_norm": 0.15739676356315613, + "learning_rate": 0.0001921174466348139, + "loss": 2.0302, + "step": 282750 + }, + { + "epoch": 1.076254348636983, + "grad_norm": 0.14662247896194458, + "learning_rate": 0.00019204978757381358, + "loss": 2.0034, + "step": 282760 + }, + { + "epoch": 1.0762924111051058, + "grad_norm": 0.15804801881313324, + "learning_rate": 0.00019198214337477122, + "loss": 2.0074, + "step": 282770 + }, + { + "epoch": 1.0763304735732284, + "grad_norm": 0.15608061850070953, + "learning_rate": 0.00019191451402789735, + "loss": 2.006, + "step": 282780 + }, + { + "epoch": 1.076368536041351, + "grad_norm": 0.13967221975326538, + "learning_rate": 0.00019184689952341327, + "loss": 2.0196, + "step": 282790 + }, + { + "epoch": 1.0764065985094737, + "grad_norm": 0.16726204752922058, + "learning_rate": 0.0001917792998515512, + "loss": 2.0119, + "step": 282800 + }, + { + "epoch": 1.0764446609775964, + "grad_norm": 0.1628492772579193, + "learning_rate": 0.00019171171500255375, + "loss": 2.0018, + "step": 282810 + }, + { + "epoch": 1.076482723445719, + "grad_norm": 0.14668413996696472, + "learning_rate": 0.00019164414496667436, + "loss": 2.0048, + "step": 282820 + }, + { + "epoch": 1.0765207859138417, + "grad_norm": 0.13558825850486755, + "learning_rate": 0.0001915765897341773, + "loss": 2.0183, + "step": 282830 + }, + { + "epoch": 1.0765588483819646, + "grad_norm": 0.14365750551223755, + "learning_rate": 0.00019150904929533723, + "loss": 2.0051, + "step": 282840 + }, + { + "epoch": 1.0765969108500872, + "grad_norm": 0.14168040454387665, + "learning_rate": 0.0001914415236404397, + "loss": 2.0082, + "step": 282850 + }, + { + "epoch": 1.07663497331821, + "grad_norm": 0.14769423007965088, + "learning_rate": 0.00019137401275978071, + "loss": 2.0199, + "step": 282860 + }, + { + "epoch": 1.0766730357863326, + "grad_norm": 0.1618385761976242, + "learning_rate": 0.00019130651664366695, + "loss": 2.01, + "step": 282870 + }, + { + "epoch": 1.0767110982544552, + "grad_norm": 0.15950889885425568, + "learning_rate": 0.00019123903528241572, + "loss": 1.999, + "step": 282880 + }, + { + "epoch": 1.0767491607225779, + "grad_norm": 0.14730483293533325, + "learning_rate": 0.00019117156866635483, + "loss": 2.0012, + "step": 282890 + }, + { + "epoch": 1.0767872231907005, + "grad_norm": 0.14688549935817719, + "learning_rate": 0.00019110411678582268, + "loss": 2.0245, + "step": 282900 + }, + { + "epoch": 1.0768252856588232, + "grad_norm": 0.1415330469608307, + "learning_rate": 0.00019103667963116827, + "loss": 2.0161, + "step": 282910 + }, + { + "epoch": 1.0768633481269458, + "grad_norm": 0.13632331788539886, + "learning_rate": 0.00019096925719275115, + "loss": 2.0101, + "step": 282920 + }, + { + "epoch": 1.0769014105950687, + "grad_norm": 0.19452516734600067, + "learning_rate": 0.0001909018494609412, + "loss": 2.0235, + "step": 282930 + }, + { + "epoch": 1.0769394730631914, + "grad_norm": 0.17027851939201355, + "learning_rate": 0.00019083445642611896, + "loss": 2.0223, + "step": 282940 + }, + { + "epoch": 1.076977535531314, + "grad_norm": 0.1464688926935196, + "learning_rate": 0.00019076707807867544, + "loss": 2.0188, + "step": 282950 + }, + { + "epoch": 1.0770155979994367, + "grad_norm": 0.16679330170154572, + "learning_rate": 0.00019069971440901213, + "loss": 2.0091, + "step": 282960 + }, + { + "epoch": 1.0770536604675593, + "grad_norm": 0.13822433352470398, + "learning_rate": 0.0001906323654075408, + "loss": 2.0132, + "step": 282970 + }, + { + "epoch": 1.077091722935682, + "grad_norm": 0.15693537890911102, + "learning_rate": 0.000190565031064684, + "loss": 2.0065, + "step": 282980 + }, + { + "epoch": 1.0771297854038047, + "grad_norm": 0.13129134476184845, + "learning_rate": 0.00019049771137087424, + "loss": 1.9964, + "step": 282990 + }, + { + "epoch": 1.0771678478719273, + "grad_norm": 0.1458442658185959, + "learning_rate": 0.00019043040631655484, + "loss": 2.0102, + "step": 283000 + }, + { + "epoch": 1.0772059103400502, + "grad_norm": 0.14534065127372742, + "learning_rate": 0.0001903631158921793, + "loss": 2.0007, + "step": 283010 + }, + { + "epoch": 1.0772439728081729, + "grad_norm": 0.1374642699956894, + "learning_rate": 0.0001902958400882115, + "loss": 2.0108, + "step": 283020 + }, + { + "epoch": 1.0772820352762955, + "grad_norm": 0.1382344365119934, + "learning_rate": 0.00019022857889512573, + "loss": 2.0193, + "step": 283030 + }, + { + "epoch": 1.0773200977444182, + "grad_norm": 0.14922475814819336, + "learning_rate": 0.00019016133230340666, + "loss": 2.0249, + "step": 283040 + }, + { + "epoch": 1.0773581602125408, + "grad_norm": 0.16759616136550903, + "learning_rate": 0.00019009410030354913, + "loss": 2.0091, + "step": 283050 + }, + { + "epoch": 1.0773962226806635, + "grad_norm": 0.13289323449134827, + "learning_rate": 0.0001900268828860584, + "loss": 2.0075, + "step": 283060 + }, + { + "epoch": 1.0774342851487861, + "grad_norm": 0.13219304382801056, + "learning_rate": 0.0001899596800414501, + "loss": 2.0166, + "step": 283070 + }, + { + "epoch": 1.0774723476169088, + "grad_norm": 0.1477508693933487, + "learning_rate": 0.00018989249176024992, + "loss": 2.012, + "step": 283080 + }, + { + "epoch": 1.0775104100850315, + "grad_norm": 0.17055627703666687, + "learning_rate": 0.00018982531803299398, + "loss": 1.992, + "step": 283090 + }, + { + "epoch": 1.0775484725531543, + "grad_norm": 0.16017602384090424, + "learning_rate": 0.00018975815885022857, + "loss": 2.0025, + "step": 283100 + }, + { + "epoch": 1.077586535021277, + "grad_norm": 0.1373511552810669, + "learning_rate": 0.00018969101420251029, + "loss": 1.9972, + "step": 283110 + }, + { + "epoch": 1.0776245974893996, + "grad_norm": 0.13665539026260376, + "learning_rate": 0.00018962388408040587, + "loss": 1.993, + "step": 283120 + }, + { + "epoch": 1.0776626599575223, + "grad_norm": 0.1449371576309204, + "learning_rate": 0.00018955676847449228, + "loss": 2.0158, + "step": 283130 + }, + { + "epoch": 1.077700722425645, + "grad_norm": 0.16415664553642273, + "learning_rate": 0.0001894896673753566, + "loss": 2.0146, + "step": 283140 + }, + { + "epoch": 1.0777387848937676, + "grad_norm": 0.16769972443580627, + "learning_rate": 0.0001894225807735963, + "loss": 2.0202, + "step": 283150 + }, + { + "epoch": 1.0777768473618903, + "grad_norm": 0.15793916583061218, + "learning_rate": 0.00018935550865981866, + "loss": 2.0132, + "step": 283160 + }, + { + "epoch": 1.077814909830013, + "grad_norm": 0.1298457831144333, + "learning_rate": 0.00018928845102464143, + "loss": 2.0068, + "step": 283170 + }, + { + "epoch": 1.0778529722981358, + "grad_norm": 0.13816504180431366, + "learning_rate": 0.00018922140785869224, + "loss": 2.0034, + "step": 283180 + }, + { + "epoch": 1.0778910347662585, + "grad_norm": 0.137596994638443, + "learning_rate": 0.0001891543791526089, + "loss": 2.0118, + "step": 283190 + }, + { + "epoch": 1.0779290972343811, + "grad_norm": 0.166671484708786, + "learning_rate": 0.00018908736489703948, + "loss": 2.0186, + "step": 283200 + }, + { + "epoch": 1.0779671597025038, + "grad_norm": 0.1365715116262436, + "learning_rate": 0.0001890203650826419, + "loss": 2.0167, + "step": 283210 + }, + { + "epoch": 1.0780052221706264, + "grad_norm": 0.13674066960811615, + "learning_rate": 0.0001889533797000842, + "loss": 2.0134, + "step": 283220 + }, + { + "epoch": 1.078043284638749, + "grad_norm": 0.13761483132839203, + "learning_rate": 0.00018888640874004448, + "loss": 2.0246, + "step": 283230 + }, + { + "epoch": 1.0780813471068718, + "grad_norm": 0.18757186830043793, + "learning_rate": 0.00018881945219321095, + "loss": 2.0121, + "step": 283240 + }, + { + "epoch": 1.0781194095749944, + "grad_norm": 0.2052200734615326, + "learning_rate": 0.0001887525100502817, + "loss": 2.0133, + "step": 283250 + }, + { + "epoch": 1.078157472043117, + "grad_norm": 0.14256851375102997, + "learning_rate": 0.0001886855823019649, + "loss": 2.0108, + "step": 283260 + }, + { + "epoch": 1.07819553451124, + "grad_norm": 0.13533790409564972, + "learning_rate": 0.00018861866893897872, + "loss": 2.0102, + "step": 283270 + }, + { + "epoch": 1.0782335969793626, + "grad_norm": 0.1639997363090515, + "learning_rate": 0.00018855176995205124, + "loss": 2.0092, + "step": 283280 + }, + { + "epoch": 1.0782716594474853, + "grad_norm": 0.16645225882530212, + "learning_rate": 0.0001884848853319206, + "loss": 2.0475, + "step": 283290 + }, + { + "epoch": 1.078309721915608, + "grad_norm": 0.1505880504846573, + "learning_rate": 0.0001884180150693347, + "loss": 2.0073, + "step": 283300 + }, + { + "epoch": 1.0783477843837306, + "grad_norm": 0.15273131430149078, + "learning_rate": 0.00018835115915505156, + "loss": 2.0154, + "step": 283310 + }, + { + "epoch": 1.0783858468518532, + "grad_norm": 0.1675952672958374, + "learning_rate": 0.000188284317579839, + "loss": 2.0151, + "step": 283320 + }, + { + "epoch": 1.078423909319976, + "grad_norm": 0.14620567858219147, + "learning_rate": 0.00018821749033447478, + "loss": 2.0074, + "step": 283330 + }, + { + "epoch": 1.0784619717880986, + "grad_norm": 0.1587236523628235, + "learning_rate": 0.0001881506774097465, + "loss": 2.0229, + "step": 283340 + }, + { + "epoch": 1.0785000342562212, + "grad_norm": 0.16149382293224335, + "learning_rate": 0.00018808387879645167, + "loss": 2.016, + "step": 283350 + }, + { + "epoch": 1.078538096724344, + "grad_norm": 0.19399163126945496, + "learning_rate": 0.0001880170944853976, + "loss": 2.0156, + "step": 283360 + }, + { + "epoch": 1.0785761591924667, + "grad_norm": 0.15907388925552368, + "learning_rate": 0.0001879503244674015, + "loss": 1.9963, + "step": 283370 + }, + { + "epoch": 1.0786142216605894, + "grad_norm": 0.14165343344211578, + "learning_rate": 0.00018788356873329038, + "loss": 2.0276, + "step": 283380 + }, + { + "epoch": 1.078652284128712, + "grad_norm": 0.1441272646188736, + "learning_rate": 0.00018781682727390104, + "loss": 2.0069, + "step": 283390 + }, + { + "epoch": 1.0786903465968347, + "grad_norm": 0.1678933948278427, + "learning_rate": 0.00018775010008008008, + "loss": 2.0281, + "step": 283400 + }, + { + "epoch": 1.0787284090649574, + "grad_norm": 0.1660948246717453, + "learning_rate": 0.00018768338714268392, + "loss": 2.0149, + "step": 283410 + }, + { + "epoch": 1.07876647153308, + "grad_norm": 0.1400216817855835, + "learning_rate": 0.00018761668845257862, + "loss": 2.0268, + "step": 283420 + }, + { + "epoch": 1.0788045340012027, + "grad_norm": 0.15131667256355286, + "learning_rate": 0.00018755000400064016, + "loss": 2.0161, + "step": 283430 + }, + { + "epoch": 1.0788425964693253, + "grad_norm": 0.1365083009004593, + "learning_rate": 0.00018748333377775406, + "loss": 2.0132, + "step": 283440 + }, + { + "epoch": 1.0788806589374482, + "grad_norm": 0.1457868367433548, + "learning_rate": 0.0001874166777748158, + "loss": 2.0171, + "step": 283450 + }, + { + "epoch": 1.0789187214055709, + "grad_norm": 0.1760193258523941, + "learning_rate": 0.00018735003598273036, + "loss": 2.0208, + "step": 283460 + }, + { + "epoch": 1.0789567838736935, + "grad_norm": 0.14603100717067719, + "learning_rate": 0.00018728340839241247, + "loss": 2.0066, + "step": 283470 + }, + { + "epoch": 1.0789948463418162, + "grad_norm": 0.16044391691684723, + "learning_rate": 0.00018721679499478667, + "loss": 2.0208, + "step": 283480 + }, + { + "epoch": 1.0790329088099389, + "grad_norm": 0.1581258922815323, + "learning_rate": 0.00018715019578078684, + "loss": 2.0036, + "step": 283490 + }, + { + "epoch": 1.0790709712780615, + "grad_norm": 0.1435764580965042, + "learning_rate": 0.00018708361074135687, + "loss": 2.0245, + "step": 283500 + }, + { + "epoch": 1.0791090337461842, + "grad_norm": 0.13865844905376434, + "learning_rate": 0.00018701703986745012, + "loss": 2.0091, + "step": 283510 + }, + { + "epoch": 1.0791470962143068, + "grad_norm": 0.1498934030532837, + "learning_rate": 0.00018695048315002944, + "loss": 2.0307, + "step": 283520 + }, + { + "epoch": 1.0791851586824297, + "grad_norm": 0.1492132842540741, + "learning_rate": 0.00018688394058006757, + "loss": 2.0035, + "step": 283530 + }, + { + "epoch": 1.0792232211505524, + "grad_norm": 0.16242045164108276, + "learning_rate": 0.00018681741214854658, + "loss": 2.0155, + "step": 283540 + }, + { + "epoch": 1.079261283618675, + "grad_norm": 0.16527853906154633, + "learning_rate": 0.00018675089784645833, + "loss": 2.0191, + "step": 283550 + }, + { + "epoch": 1.0792993460867977, + "grad_norm": 0.178872749209404, + "learning_rate": 0.000186684397664804, + "loss": 2.0106, + "step": 283560 + }, + { + "epoch": 1.0793374085549203, + "grad_norm": 0.18619264662265778, + "learning_rate": 0.00018661791159459463, + "loss": 2.012, + "step": 283570 + }, + { + "epoch": 1.079375471023043, + "grad_norm": 0.16307014226913452, + "learning_rate": 0.00018655143962685038, + "loss": 2.0204, + "step": 283580 + }, + { + "epoch": 1.0794135334911656, + "grad_norm": 0.17658159136772156, + "learning_rate": 0.00018648498175260141, + "loss": 2.0227, + "step": 283590 + }, + { + "epoch": 1.0794515959592883, + "grad_norm": 0.14216390252113342, + "learning_rate": 0.00018641853796288704, + "loss": 1.9927, + "step": 283600 + }, + { + "epoch": 1.079489658427411, + "grad_norm": 0.1339612454175949, + "learning_rate": 0.0001863521082487561, + "loss": 2.0187, + "step": 283610 + }, + { + "epoch": 1.0795277208955338, + "grad_norm": 0.15501002967357635, + "learning_rate": 0.0001862856926012671, + "loss": 2.0255, + "step": 283620 + }, + { + "epoch": 1.0795657833636565, + "grad_norm": 0.17483502626419067, + "learning_rate": 0.00018621929101148788, + "loss": 1.9948, + "step": 283630 + }, + { + "epoch": 1.0796038458317792, + "grad_norm": 0.1788007915019989, + "learning_rate": 0.00018615290347049568, + "loss": 2.0124, + "step": 283640 + }, + { + "epoch": 1.0796419082999018, + "grad_norm": 0.16997750103473663, + "learning_rate": 0.0001860865299693773, + "loss": 2.013, + "step": 283650 + }, + { + "epoch": 1.0796799707680245, + "grad_norm": 0.2105778157711029, + "learning_rate": 0.00018602017049922887, + "loss": 2.0133, + "step": 283660 + }, + { + "epoch": 1.0797180332361471, + "grad_norm": 0.16479893028736115, + "learning_rate": 0.00018595382505115588, + "loss": 2.018, + "step": 283670 + }, + { + "epoch": 1.0797560957042698, + "grad_norm": 0.1657109558582306, + "learning_rate": 0.00018588749361627343, + "loss": 2.017, + "step": 283680 + }, + { + "epoch": 1.0797941581723924, + "grad_norm": 0.1969013214111328, + "learning_rate": 0.00018582117618570578, + "loss": 2.0011, + "step": 283690 + }, + { + "epoch": 1.0798322206405153, + "grad_norm": 0.13585196435451508, + "learning_rate": 0.00018575487275058661, + "loss": 2.0124, + "step": 283700 + }, + { + "epoch": 1.079870283108638, + "grad_norm": 0.13131742179393768, + "learning_rate": 0.00018568858330205907, + "loss": 2.0013, + "step": 283710 + }, + { + "epoch": 1.0799083455767606, + "grad_norm": 0.1606842577457428, + "learning_rate": 0.00018562230783127548, + "loss": 2.0115, + "step": 283720 + }, + { + "epoch": 1.0799464080448833, + "grad_norm": 0.14883503317832947, + "learning_rate": 0.00018555604632939748, + "loss": 2.0099, + "step": 283730 + }, + { + "epoch": 1.079984470513006, + "grad_norm": 0.1689402461051941, + "learning_rate": 0.00018548979878759632, + "loss": 2.0121, + "step": 283740 + }, + { + "epoch": 1.0800225329811286, + "grad_norm": 0.16839559376239777, + "learning_rate": 0.0001854235651970521, + "loss": 2.01, + "step": 283750 + }, + { + "epoch": 1.0800605954492513, + "grad_norm": 0.1578279286623001, + "learning_rate": 0.00018535734554895456, + "loss": 2.0087, + "step": 283760 + }, + { + "epoch": 1.080098657917374, + "grad_norm": 0.14088387787342072, + "learning_rate": 0.00018529113983450246, + "loss": 1.9997, + "step": 283770 + }, + { + "epoch": 1.0801367203854966, + "grad_norm": 0.13390129804611206, + "learning_rate": 0.00018522494804490408, + "loss": 2.0186, + "step": 283780 + }, + { + "epoch": 1.0801747828536195, + "grad_norm": 0.13913902640342712, + "learning_rate": 0.00018515877017137667, + "loss": 2.0088, + "step": 283790 + }, + { + "epoch": 1.080212845321742, + "grad_norm": 0.16706794500350952, + "learning_rate": 0.00018509260620514696, + "loss": 1.996, + "step": 283800 + }, + { + "epoch": 1.0802509077898648, + "grad_norm": 0.13749095797538757, + "learning_rate": 0.00018502645613745062, + "loss": 2.0122, + "step": 283810 + }, + { + "epoch": 1.0802889702579874, + "grad_norm": 0.1544412076473236, + "learning_rate": 0.0001849603199595327, + "loss": 2.0125, + "step": 283820 + }, + { + "epoch": 1.08032703272611, + "grad_norm": 0.18350383639335632, + "learning_rate": 0.0001848941976626475, + "loss": 2.0126, + "step": 283830 + }, + { + "epoch": 1.0803650951942327, + "grad_norm": 0.1641099452972412, + "learning_rate": 0.00018482808923805832, + "loss": 2.0051, + "step": 283840 + }, + { + "epoch": 1.0804031576623554, + "grad_norm": 0.1506098061800003, + "learning_rate": 0.0001847619946770377, + "loss": 2.0235, + "step": 283850 + }, + { + "epoch": 1.080441220130478, + "grad_norm": 0.13227491080760956, + "learning_rate": 0.00018469591397086738, + "loss": 1.9968, + "step": 283860 + }, + { + "epoch": 1.080479282598601, + "grad_norm": 0.14088758826255798, + "learning_rate": 0.00018462984711083814, + "loss": 2.0032, + "step": 283870 + }, + { + "epoch": 1.0805173450667236, + "grad_norm": 0.1535644233226776, + "learning_rate": 0.0001845637940882499, + "loss": 2.0074, + "step": 283880 + }, + { + "epoch": 1.0805554075348462, + "grad_norm": 0.13675335049629211, + "learning_rate": 0.00018449775489441178, + "loss": 2.0214, + "step": 283890 + }, + { + "epoch": 1.080593470002969, + "grad_norm": 0.14348675310611725, + "learning_rate": 0.00018443172952064185, + "loss": 2.0205, + "step": 283900 + }, + { + "epoch": 1.0806315324710916, + "grad_norm": 0.1493082195520401, + "learning_rate": 0.0001843657179582674, + "loss": 2.0074, + "step": 283910 + }, + { + "epoch": 1.0806695949392142, + "grad_norm": 0.13401271402835846, + "learning_rate": 0.0001842997201986247, + "loss": 2.0021, + "step": 283920 + }, + { + "epoch": 1.0807076574073369, + "grad_norm": 0.15331800282001495, + "learning_rate": 0.00018423373623305907, + "loss": 2.0022, + "step": 283930 + }, + { + "epoch": 1.0807457198754595, + "grad_norm": 0.13529661297798157, + "learning_rate": 0.00018416776605292484, + "loss": 2.0236, + "step": 283940 + }, + { + "epoch": 1.0807837823435822, + "grad_norm": 0.14704905450344086, + "learning_rate": 0.00018410180964958557, + "loss": 1.9967, + "step": 283950 + }, + { + "epoch": 1.080821844811705, + "grad_norm": 0.15539222955703735, + "learning_rate": 0.00018403586701441361, + "loss": 2.0291, + "step": 283960 + }, + { + "epoch": 1.0808599072798277, + "grad_norm": 0.14663167297840118, + "learning_rate": 0.0001839699381387903, + "loss": 2.0175, + "step": 283970 + }, + { + "epoch": 1.0808979697479504, + "grad_norm": 0.18007595837116241, + "learning_rate": 0.00018390402301410625, + "loss": 1.9998, + "step": 283980 + }, + { + "epoch": 1.080936032216073, + "grad_norm": 0.17619404196739197, + "learning_rate": 0.00018383812163176073, + "loss": 2.0131, + "step": 283990 + }, + { + "epoch": 1.0809740946841957, + "grad_norm": 0.14554846286773682, + "learning_rate": 0.00018377223398316207, + "loss": 2.0071, + "step": 284000 + }, + { + "epoch": 1.0810121571523184, + "grad_norm": 0.13671885430812836, + "learning_rate": 0.00018370636005972762, + "loss": 2.024, + "step": 284010 + }, + { + "epoch": 1.081050219620441, + "grad_norm": 0.15811777114868164, + "learning_rate": 0.0001836404998528836, + "loss": 2.0026, + "step": 284020 + }, + { + "epoch": 1.0810882820885637, + "grad_norm": 0.16493038833141327, + "learning_rate": 0.0001835746533540652, + "loss": 2.0035, + "step": 284030 + }, + { + "epoch": 1.0811263445566865, + "grad_norm": 0.1355385184288025, + "learning_rate": 0.00018350882055471646, + "loss": 2.0125, + "step": 284040 + }, + { + "epoch": 1.0811644070248092, + "grad_norm": 0.1493157148361206, + "learning_rate": 0.00018344300144629034, + "loss": 2.0083, + "step": 284050 + }, + { + "epoch": 1.0812024694929319, + "grad_norm": 0.1507268100976944, + "learning_rate": 0.00018337719602024873, + "loss": 2.0161, + "step": 284060 + }, + { + "epoch": 1.0812405319610545, + "grad_norm": 0.1359802782535553, + "learning_rate": 0.0001833114042680623, + "loss": 2.0057, + "step": 284070 + }, + { + "epoch": 1.0812785944291772, + "grad_norm": 0.17290131747722626, + "learning_rate": 0.00018324562618121065, + "loss": 1.9976, + "step": 284080 + }, + { + "epoch": 1.0813166568972998, + "grad_norm": 0.17614369094371796, + "learning_rate": 0.00018317986175118224, + "loss": 2.0057, + "step": 284090 + }, + { + "epoch": 1.0813547193654225, + "grad_norm": 0.14882458746433258, + "learning_rate": 0.0001831141109694743, + "loss": 2.005, + "step": 284100 + }, + { + "epoch": 1.0813927818335451, + "grad_norm": 0.1476394385099411, + "learning_rate": 0.00018304837382759288, + "loss": 2.0242, + "step": 284110 + }, + { + "epoch": 1.0814308443016678, + "grad_norm": 0.16538527607917786, + "learning_rate": 0.00018298265031705286, + "loss": 1.9985, + "step": 284120 + }, + { + "epoch": 1.0814689067697907, + "grad_norm": 0.13338564336299896, + "learning_rate": 0.00018291694042937796, + "loss": 2.0136, + "step": 284130 + }, + { + "epoch": 1.0815069692379133, + "grad_norm": 0.13948379456996918, + "learning_rate": 0.0001828512441561006, + "loss": 2.0101, + "step": 284140 + }, + { + "epoch": 1.081545031706036, + "grad_norm": 0.14546875655651093, + "learning_rate": 0.00018278556148876196, + "loss": 2.0074, + "step": 284150 + }, + { + "epoch": 1.0815830941741587, + "grad_norm": 0.17852666974067688, + "learning_rate": 0.00018271989241891218, + "loss": 1.9957, + "step": 284160 + }, + { + "epoch": 1.0816211566422813, + "grad_norm": 0.16738300025463104, + "learning_rate": 0.00018265423693810984, + "loss": 2.0249, + "step": 284170 + }, + { + "epoch": 1.081659219110404, + "grad_norm": 0.1647614687681198, + "learning_rate": 0.0001825885950379224, + "loss": 2.0112, + "step": 284180 + }, + { + "epoch": 1.0816972815785266, + "grad_norm": 0.1584128588438034, + "learning_rate": 0.00018252296670992612, + "loss": 2.0144, + "step": 284190 + }, + { + "epoch": 1.0817353440466493, + "grad_norm": 0.173493891954422, + "learning_rate": 0.00018245735194570583, + "loss": 2.0134, + "step": 284200 + }, + { + "epoch": 1.081773406514772, + "grad_norm": 0.15178415179252625, + "learning_rate": 0.00018239175073685504, + "loss": 2.0165, + "step": 284210 + }, + { + "epoch": 1.0818114689828948, + "grad_norm": 0.14960837364196777, + "learning_rate": 0.00018232616307497612, + "loss": 2.0251, + "step": 284220 + }, + { + "epoch": 1.0818495314510175, + "grad_norm": 0.13643042743206024, + "learning_rate": 0.0001822605889516799, + "loss": 1.9996, + "step": 284230 + }, + { + "epoch": 1.0818875939191401, + "grad_norm": 0.15679650008678436, + "learning_rate": 0.00018219502835858592, + "loss": 2.0078, + "step": 284240 + }, + { + "epoch": 1.0819256563872628, + "grad_norm": 0.15052969753742218, + "learning_rate": 0.00018212948128732247, + "loss": 2.0, + "step": 284250 + }, + { + "epoch": 1.0819637188553854, + "grad_norm": 0.13190500438213348, + "learning_rate": 0.0001820639477295264, + "loss": 2.0053, + "step": 284260 + }, + { + "epoch": 1.082001781323508, + "grad_norm": 0.15521852672100067, + "learning_rate": 0.00018199842767684305, + "loss": 2.0003, + "step": 284270 + }, + { + "epoch": 1.0820398437916308, + "grad_norm": 0.15261310338974, + "learning_rate": 0.00018193292112092662, + "loss": 2.0115, + "step": 284280 + }, + { + "epoch": 1.0820779062597534, + "grad_norm": 0.14686596393585205, + "learning_rate": 0.00018186742805343975, + "loss": 2.0122, + "step": 284290 + }, + { + "epoch": 1.082115968727876, + "grad_norm": 0.1680731326341629, + "learning_rate": 0.00018180194846605362, + "loss": 2.0055, + "step": 284300 + }, + { + "epoch": 1.082154031195999, + "grad_norm": 0.2047916054725647, + "learning_rate": 0.00018173648235044805, + "loss": 2.0156, + "step": 284310 + }, + { + "epoch": 1.0821920936641216, + "grad_norm": 0.14100709557533264, + "learning_rate": 0.00018167102969831139, + "loss": 1.997, + "step": 284320 + }, + { + "epoch": 1.0822301561322443, + "grad_norm": 0.17565107345581055, + "learning_rate": 0.00018160559050134064, + "loss": 2.0085, + "step": 284330 + }, + { + "epoch": 1.082268218600367, + "grad_norm": 0.1566193699836731, + "learning_rate": 0.00018154016475124108, + "loss": 2.032, + "step": 284340 + }, + { + "epoch": 1.0823062810684896, + "grad_norm": 0.1979827731847763, + "learning_rate": 0.00018147475243972678, + "loss": 2.0144, + "step": 284350 + }, + { + "epoch": 1.0823443435366122, + "grad_norm": 0.14324718713760376, + "learning_rate": 0.00018140935355852019, + "loss": 2.003, + "step": 284360 + }, + { + "epoch": 1.082382406004735, + "grad_norm": 0.1722949594259262, + "learning_rate": 0.00018134396809935222, + "loss": 2.0078, + "step": 284370 + }, + { + "epoch": 1.0824204684728576, + "grad_norm": 0.13863638043403625, + "learning_rate": 0.00018127859605396229, + "loss": 2.0092, + "step": 284380 + }, + { + "epoch": 1.0824585309409804, + "grad_norm": 0.12698815762996674, + "learning_rate": 0.0001812132374140984, + "loss": 2.0169, + "step": 284390 + }, + { + "epoch": 1.082496593409103, + "grad_norm": 0.15930671989917755, + "learning_rate": 0.00018114789217151683, + "loss": 2.0116, + "step": 284400 + }, + { + "epoch": 1.0825346558772257, + "grad_norm": 0.15124854445457458, + "learning_rate": 0.0001810825603179824, + "loss": 2.0018, + "step": 284410 + }, + { + "epoch": 1.0825727183453484, + "grad_norm": 0.1586410254240036, + "learning_rate": 0.0001810172418452684, + "loss": 2.0096, + "step": 284420 + }, + { + "epoch": 1.082610780813471, + "grad_norm": 0.1424012929201126, + "learning_rate": 0.00018095193674515643, + "loss": 1.9924, + "step": 284430 + }, + { + "epoch": 1.0826488432815937, + "grad_norm": 0.15181034803390503, + "learning_rate": 0.00018088664500943662, + "loss": 2.0041, + "step": 284440 + }, + { + "epoch": 1.0826869057497164, + "grad_norm": 0.15156134963035583, + "learning_rate": 0.00018082136662990738, + "loss": 2.0034, + "step": 284450 + }, + { + "epoch": 1.082724968217839, + "grad_norm": 0.14133992791175842, + "learning_rate": 0.00018075610159837564, + "loss": 1.9996, + "step": 284460 + }, + { + "epoch": 1.0827630306859617, + "grad_norm": 0.14046622812747955, + "learning_rate": 0.00018069084990665656, + "loss": 1.998, + "step": 284470 + }, + { + "epoch": 1.0828010931540846, + "grad_norm": 0.17330563068389893, + "learning_rate": 0.00018062561154657376, + "loss": 2.0016, + "step": 284480 + }, + { + "epoch": 1.0828391556222072, + "grad_norm": 0.16119085252285004, + "learning_rate": 0.0001805603865099592, + "loss": 2.0187, + "step": 284490 + }, + { + "epoch": 1.0828772180903299, + "grad_norm": 0.16322733461856842, + "learning_rate": 0.00018049517478865311, + "loss": 1.998, + "step": 284500 + }, + { + "epoch": 1.0829152805584525, + "grad_norm": 0.14511479437351227, + "learning_rate": 0.00018042997637450416, + "loss": 2.0181, + "step": 284510 + }, + { + "epoch": 1.0829533430265752, + "grad_norm": 0.14829565584659576, + "learning_rate": 0.0001803647912593691, + "loss": 2.0053, + "step": 284520 + }, + { + "epoch": 1.0829914054946979, + "grad_norm": 0.1813560426235199, + "learning_rate": 0.00018029961943511342, + "loss": 2.0052, + "step": 284530 + }, + { + "epoch": 1.0830294679628205, + "grad_norm": 0.13126425445079803, + "learning_rate": 0.0001802344608936104, + "loss": 2.0076, + "step": 284540 + }, + { + "epoch": 1.0830675304309432, + "grad_norm": 0.16517986357212067, + "learning_rate": 0.00018016931562674188, + "loss": 2.0101, + "step": 284550 + }, + { + "epoch": 1.083105592899066, + "grad_norm": 0.1345342993736267, + "learning_rate": 0.0001801041836263979, + "loss": 2.005, + "step": 284560 + }, + { + "epoch": 1.0831436553671887, + "grad_norm": 0.13746510446071625, + "learning_rate": 0.00018003906488447692, + "loss": 2.0143, + "step": 284570 + }, + { + "epoch": 1.0831817178353114, + "grad_norm": 0.14292031526565552, + "learning_rate": 0.00017997395939288525, + "loss": 2.0117, + "step": 284580 + }, + { + "epoch": 1.083219780303434, + "grad_norm": 0.15923810005187988, + "learning_rate": 0.00017990886714353783, + "loss": 2.0153, + "step": 284590 + }, + { + "epoch": 1.0832578427715567, + "grad_norm": 0.14287570118904114, + "learning_rate": 0.00017984378812835756, + "loss": 2.033, + "step": 284600 + }, + { + "epoch": 1.0832959052396793, + "grad_norm": 0.1656784862279892, + "learning_rate": 0.00017977872233927573, + "loss": 2.0002, + "step": 284610 + }, + { + "epoch": 1.083333967707802, + "grad_norm": 0.13215410709381104, + "learning_rate": 0.00017971366976823173, + "loss": 2.003, + "step": 284620 + }, + { + "epoch": 1.0833720301759246, + "grad_norm": 0.16975288093090057, + "learning_rate": 0.0001796486304071731, + "loss": 2.0143, + "step": 284630 + }, + { + "epoch": 1.0834100926440473, + "grad_norm": 0.1532881259918213, + "learning_rate": 0.0001795836042480556, + "loss": 2.0028, + "step": 284640 + }, + { + "epoch": 1.0834481551121702, + "grad_norm": 0.15794317424297333, + "learning_rate": 0.00017951859128284315, + "loss": 2.0231, + "step": 284650 + }, + { + "epoch": 1.0834862175802928, + "grad_norm": 0.14210180938243866, + "learning_rate": 0.00017945359150350787, + "loss": 2.01, + "step": 284660 + }, + { + "epoch": 1.0835242800484155, + "grad_norm": 0.20217181742191315, + "learning_rate": 0.0001793886049020299, + "loss": 2.0126, + "step": 284670 + }, + { + "epoch": 1.0835623425165382, + "grad_norm": 0.1375533938407898, + "learning_rate": 0.00017932363147039765, + "loss": 2.0103, + "step": 284680 + }, + { + "epoch": 1.0836004049846608, + "grad_norm": 0.14529874920845032, + "learning_rate": 0.0001792586712006075, + "loss": 1.9933, + "step": 284690 + }, + { + "epoch": 1.0836384674527835, + "grad_norm": 0.14373710751533508, + "learning_rate": 0.00017919372408466404, + "loss": 2.0093, + "step": 284700 + }, + { + "epoch": 1.0836765299209061, + "grad_norm": 0.1357426643371582, + "learning_rate": 0.00017912879011457988, + "loss": 2.0084, + "step": 284710 + }, + { + "epoch": 1.0837145923890288, + "grad_norm": 0.15259642899036407, + "learning_rate": 0.00017906386928237577, + "loss": 2.0076, + "step": 284720 + }, + { + "epoch": 1.0837526548571517, + "grad_norm": 0.15701621770858765, + "learning_rate": 0.00017899896158008044, + "loss": 2.0148, + "step": 284730 + }, + { + "epoch": 1.0837907173252743, + "grad_norm": 0.13638730347156525, + "learning_rate": 0.00017893406699973086, + "loss": 2.0231, + "step": 284740 + }, + { + "epoch": 1.083828779793397, + "grad_norm": 0.1526368409395218, + "learning_rate": 0.00017886918553337177, + "loss": 1.9945, + "step": 284750 + }, + { + "epoch": 1.0838668422615196, + "grad_norm": 0.17583663761615753, + "learning_rate": 0.00017880431717305623, + "loss": 2.0016, + "step": 284760 + }, + { + "epoch": 1.0839049047296423, + "grad_norm": 0.13901753723621368, + "learning_rate": 0.0001787394619108451, + "loss": 1.9962, + "step": 284770 + }, + { + "epoch": 1.083942967197765, + "grad_norm": 0.14218559861183167, + "learning_rate": 0.00017867461973880745, + "loss": 2.0059, + "step": 284780 + }, + { + "epoch": 1.0839810296658876, + "grad_norm": 0.15195435285568237, + "learning_rate": 0.00017860979064902012, + "loss": 1.9785, + "step": 284790 + }, + { + "epoch": 1.0840190921340103, + "grad_norm": 0.1548282653093338, + "learning_rate": 0.00017854497463356818, + "loss": 1.9966, + "step": 284800 + }, + { + "epoch": 1.084057154602133, + "grad_norm": 0.16204087436199188, + "learning_rate": 0.0001784801716845445, + "loss": 1.9974, + "step": 284810 + }, + { + "epoch": 1.0840952170702558, + "grad_norm": 0.1456010788679123, + "learning_rate": 0.00017841538179404993, + "loss": 2.0028, + "step": 284820 + }, + { + "epoch": 1.0841332795383785, + "grad_norm": 0.17768023908138275, + "learning_rate": 0.00017835060495419343, + "loss": 2.0119, + "step": 284830 + }, + { + "epoch": 1.0841713420065011, + "grad_norm": 0.15545113384723663, + "learning_rate": 0.0001782858411570918, + "loss": 2.0115, + "step": 284840 + }, + { + "epoch": 1.0842094044746238, + "grad_norm": 0.16330191493034363, + "learning_rate": 0.00017822109039486962, + "loss": 2.0031, + "step": 284850 + }, + { + "epoch": 1.0842474669427464, + "grad_norm": 0.14554713666439056, + "learning_rate": 0.00017815635265965967, + "loss": 2.0208, + "step": 284860 + }, + { + "epoch": 1.084285529410869, + "grad_norm": 0.13749483227729797, + "learning_rate": 0.00017809162794360257, + "loss": 1.9935, + "step": 284870 + }, + { + "epoch": 1.0843235918789917, + "grad_norm": 0.15069235861301422, + "learning_rate": 0.00017802691623884664, + "loss": 2.002, + "step": 284880 + }, + { + "epoch": 1.0843616543471144, + "grad_norm": 0.13793307542800903, + "learning_rate": 0.00017796221753754837, + "loss": 2.0199, + "step": 284890 + }, + { + "epoch": 1.0843997168152373, + "grad_norm": 0.14687716960906982, + "learning_rate": 0.00017789753183187186, + "loss": 2.0139, + "step": 284900 + }, + { + "epoch": 1.08443777928336, + "grad_norm": 0.15696461498737335, + "learning_rate": 0.0001778328591139893, + "loss": 2.0064, + "step": 284910 + }, + { + "epoch": 1.0844758417514826, + "grad_norm": 0.21080505847930908, + "learning_rate": 0.00017776819937608064, + "loss": 2.0004, + "step": 284920 + }, + { + "epoch": 1.0845139042196053, + "grad_norm": 0.13712690770626068, + "learning_rate": 0.0001777035526103336, + "loss": 2.0056, + "step": 284930 + }, + { + "epoch": 1.084551966687728, + "grad_norm": 0.13887648284435272, + "learning_rate": 0.0001776389188089439, + "loss": 2.004, + "step": 284940 + }, + { + "epoch": 1.0845900291558506, + "grad_norm": 0.13740429282188416, + "learning_rate": 0.00017757429796411495, + "loss": 2.0064, + "step": 284950 + }, + { + "epoch": 1.0846280916239732, + "grad_norm": 0.1832609623670578, + "learning_rate": 0.00017750969006805802, + "loss": 2.0065, + "step": 284960 + }, + { + "epoch": 1.0846661540920959, + "grad_norm": 0.160702183842659, + "learning_rate": 0.00017744509511299217, + "loss": 2.015, + "step": 284970 + }, + { + "epoch": 1.0847042165602185, + "grad_norm": 0.15266917645931244, + "learning_rate": 0.0001773805130911443, + "loss": 2.0148, + "step": 284980 + }, + { + "epoch": 1.0847422790283414, + "grad_norm": 0.16672642529010773, + "learning_rate": 0.00017731594399474893, + "loss": 2.0083, + "step": 284990 + }, + { + "epoch": 1.084780341496464, + "grad_norm": 0.16822563111782074, + "learning_rate": 0.0001772513878160486, + "loss": 2.0062, + "step": 285000 + }, + { + "epoch": 1.0848184039645867, + "grad_norm": 0.140736386179924, + "learning_rate": 0.0001771868445472934, + "loss": 2.0132, + "step": 285010 + }, + { + "epoch": 1.0848564664327094, + "grad_norm": 0.15960873663425446, + "learning_rate": 0.0001771223141807412, + "loss": 2.007, + "step": 285020 + }, + { + "epoch": 1.084894528900832, + "grad_norm": 0.1767164021730423, + "learning_rate": 0.00017705779670865767, + "loss": 2.0038, + "step": 285030 + }, + { + "epoch": 1.0849325913689547, + "grad_norm": 0.15160715579986572, + "learning_rate": 0.00017699329212331622, + "loss": 2.0039, + "step": 285040 + }, + { + "epoch": 1.0849706538370774, + "grad_norm": 0.151358962059021, + "learning_rate": 0.00017692880041699787, + "loss": 1.9956, + "step": 285050 + }, + { + "epoch": 1.0850087163052, + "grad_norm": 0.13658073544502258, + "learning_rate": 0.0001768643215819914, + "loss": 2.0185, + "step": 285060 + }, + { + "epoch": 1.085046778773323, + "grad_norm": 0.15917928516864777, + "learning_rate": 0.0001767998556105933, + "loss": 2.0128, + "step": 285070 + }, + { + "epoch": 1.0850848412414456, + "grad_norm": 0.1380550116300583, + "learning_rate": 0.0001767354024951077, + "loss": 2.016, + "step": 285080 + }, + { + "epoch": 1.0851229037095682, + "grad_norm": 0.17589768767356873, + "learning_rate": 0.0001766709622278465, + "loss": 2.0192, + "step": 285090 + }, + { + "epoch": 1.0851609661776909, + "grad_norm": 0.1453203558921814, + "learning_rate": 0.0001766065348011291, + "loss": 2.0167, + "step": 285100 + }, + { + "epoch": 1.0851990286458135, + "grad_norm": 0.13754414021968842, + "learning_rate": 0.00017654212020728268, + "loss": 2.0006, + "step": 285110 + }, + { + "epoch": 1.0852370911139362, + "grad_norm": 0.1450946182012558, + "learning_rate": 0.0001764777184386419, + "loss": 2.0104, + "step": 285120 + }, + { + "epoch": 1.0852751535820588, + "grad_norm": 0.14516545832157135, + "learning_rate": 0.0001764133294875493, + "loss": 2.0098, + "step": 285130 + }, + { + "epoch": 1.0853132160501815, + "grad_norm": 0.15919335186481476, + "learning_rate": 0.00017634895334635488, + "loss": 2.018, + "step": 285140 + }, + { + "epoch": 1.0853512785183042, + "grad_norm": 0.1589651107788086, + "learning_rate": 0.00017628459000741616, + "loss": 1.993, + "step": 285150 + }, + { + "epoch": 1.0853893409864268, + "grad_norm": 0.1578570306301117, + "learning_rate": 0.0001762202394630984, + "loss": 2.0113, + "step": 285160 + }, + { + "epoch": 1.0854274034545497, + "grad_norm": 0.15003244578838348, + "learning_rate": 0.0001761559017057745, + "loss": 2.0134, + "step": 285170 + }, + { + "epoch": 1.0854654659226723, + "grad_norm": 0.15741883218288422, + "learning_rate": 0.00017609157672782472, + "loss": 2.0009, + "step": 285180 + }, + { + "epoch": 1.085503528390795, + "grad_norm": 0.13433581590652466, + "learning_rate": 0.00017602726452163703, + "loss": 2.0201, + "step": 285190 + }, + { + "epoch": 1.0855415908589177, + "grad_norm": 0.16501778364181519, + "learning_rate": 0.00017596296507960702, + "loss": 2.0033, + "step": 285200 + }, + { + "epoch": 1.0855796533270403, + "grad_norm": 0.17917238175868988, + "learning_rate": 0.0001758986783941376, + "loss": 1.9986, + "step": 285210 + }, + { + "epoch": 1.085617715795163, + "grad_norm": 0.1420595645904541, + "learning_rate": 0.00017583440445763937, + "loss": 2.0049, + "step": 285220 + }, + { + "epoch": 1.0856557782632856, + "grad_norm": 0.13732458651065826, + "learning_rate": 0.00017577014326253048, + "loss": 1.9963, + "step": 285230 + }, + { + "epoch": 1.0856938407314083, + "grad_norm": 0.15316133201122284, + "learning_rate": 0.00017570589480123656, + "loss": 2.0055, + "step": 285240 + }, + { + "epoch": 1.0857319031995312, + "grad_norm": 0.16198821365833282, + "learning_rate": 0.00017564165906619063, + "loss": 2.0089, + "step": 285250 + }, + { + "epoch": 1.0857699656676538, + "grad_norm": 0.14189749956130981, + "learning_rate": 0.00017557743604983334, + "loss": 2.0037, + "step": 285260 + }, + { + "epoch": 1.0858080281357765, + "grad_norm": 0.1340772807598114, + "learning_rate": 0.0001755132257446127, + "loss": 2.0069, + "step": 285270 + }, + { + "epoch": 1.0858460906038991, + "grad_norm": 0.156874418258667, + "learning_rate": 0.0001754490281429844, + "loss": 2.0035, + "step": 285280 + }, + { + "epoch": 1.0858841530720218, + "grad_norm": 0.1479601413011551, + "learning_rate": 0.00017538484323741137, + "loss": 2.0102, + "step": 285290 + }, + { + "epoch": 1.0859222155401445, + "grad_norm": 0.13866026699543, + "learning_rate": 0.00017532067102036402, + "loss": 2.004, + "step": 285300 + }, + { + "epoch": 1.085960278008267, + "grad_norm": 0.14532513916492462, + "learning_rate": 0.00017525651148432036, + "loss": 1.9924, + "step": 285310 + }, + { + "epoch": 1.0859983404763898, + "grad_norm": 0.1476632058620453, + "learning_rate": 0.00017519236462176568, + "loss": 2.0077, + "step": 285320 + }, + { + "epoch": 1.0860364029445124, + "grad_norm": 0.155150905251503, + "learning_rate": 0.00017512823042519271, + "loss": 2.0086, + "step": 285330 + }, + { + "epoch": 1.0860744654126353, + "grad_norm": 0.17510360479354858, + "learning_rate": 0.00017506410888710168, + "loss": 2.029, + "step": 285340 + }, + { + "epoch": 1.086112527880758, + "grad_norm": 0.147820383310318, + "learning_rate": 0.000175, + "loss": 2.0126, + "step": 285350 + }, + { + "epoch": 1.0861505903488806, + "grad_norm": 0.159368097782135, + "learning_rate": 0.00017493590375640273, + "loss": 2.0101, + "step": 285360 + }, + { + "epoch": 1.0861886528170033, + "grad_norm": 0.18122664093971252, + "learning_rate": 0.00017487182014883218, + "loss": 2.0041, + "step": 285370 + }, + { + "epoch": 1.086226715285126, + "grad_norm": 0.1660909205675125, + "learning_rate": 0.000174807749169818, + "loss": 2.0045, + "step": 285380 + }, + { + "epoch": 1.0862647777532486, + "grad_norm": 0.16297060251235962, + "learning_rate": 0.0001747436908118973, + "loss": 2.0196, + "step": 285390 + }, + { + "epoch": 1.0863028402213712, + "grad_norm": 0.1591036319732666, + "learning_rate": 0.0001746796450676144, + "loss": 2.0259, + "step": 285400 + }, + { + "epoch": 1.086340902689494, + "grad_norm": 0.1548295021057129, + "learning_rate": 0.00017461561192952114, + "loss": 2.0177, + "step": 285410 + }, + { + "epoch": 1.0863789651576168, + "grad_norm": 0.19422496855258942, + "learning_rate": 0.00017455159139017644, + "loss": 1.9943, + "step": 285420 + }, + { + "epoch": 1.0864170276257394, + "grad_norm": 0.14384585618972778, + "learning_rate": 0.00017448758344214683, + "loss": 2.0085, + "step": 285430 + }, + { + "epoch": 1.086455090093862, + "grad_norm": 0.15218782424926758, + "learning_rate": 0.0001744235880780059, + "loss": 2.0069, + "step": 285440 + }, + { + "epoch": 1.0864931525619848, + "grad_norm": 0.14296023547649384, + "learning_rate": 0.00017435960529033463, + "loss": 2.0121, + "step": 285450 + }, + { + "epoch": 1.0865312150301074, + "grad_norm": 0.13962121307849884, + "learning_rate": 0.0001742956350717213, + "loss": 1.999, + "step": 285460 + }, + { + "epoch": 1.08656927749823, + "grad_norm": 0.15799452364444733, + "learning_rate": 0.00017423167741476153, + "loss": 2.0169, + "step": 285470 + }, + { + "epoch": 1.0866073399663527, + "grad_norm": 0.1890258938074112, + "learning_rate": 0.000174167732312058, + "loss": 2.0067, + "step": 285480 + }, + { + "epoch": 1.0866454024344754, + "grad_norm": 0.187018021941185, + "learning_rate": 0.00017410379975622093, + "loss": 2.0143, + "step": 285490 + }, + { + "epoch": 1.086683464902598, + "grad_norm": 0.1458263248205185, + "learning_rate": 0.00017403987973986756, + "loss": 1.9961, + "step": 285500 + }, + { + "epoch": 1.086721527370721, + "grad_norm": 0.15105989575386047, + "learning_rate": 0.00017397597225562246, + "loss": 2.0032, + "step": 285510 + }, + { + "epoch": 1.0867595898388436, + "grad_norm": 0.14834915101528168, + "learning_rate": 0.00017391207729611737, + "loss": 2.0062, + "step": 285520 + }, + { + "epoch": 1.0867976523069662, + "grad_norm": 0.14579245448112488, + "learning_rate": 0.00017384819485399133, + "loss": 2.0119, + "step": 285530 + }, + { + "epoch": 1.086835714775089, + "grad_norm": 0.1407989114522934, + "learning_rate": 0.00017378432492189057, + "loss": 2.0034, + "step": 285540 + }, + { + "epoch": 1.0868737772432115, + "grad_norm": 0.1429048627614975, + "learning_rate": 0.00017372046749246844, + "loss": 2.0049, + "step": 285550 + }, + { + "epoch": 1.0869118397113342, + "grad_norm": 0.14179645478725433, + "learning_rate": 0.0001736566225583856, + "loss": 1.9977, + "step": 285560 + }, + { + "epoch": 1.0869499021794569, + "grad_norm": 0.16286250948905945, + "learning_rate": 0.00017359279011230978, + "loss": 2.0151, + "step": 285570 + }, + { + "epoch": 1.0869879646475795, + "grad_norm": 0.1652802675962448, + "learning_rate": 0.00017352897014691587, + "loss": 2.0158, + "step": 285580 + }, + { + "epoch": 1.0870260271157024, + "grad_norm": 0.17722772061824799, + "learning_rate": 0.0001734651626548861, + "loss": 2.0127, + "step": 285590 + }, + { + "epoch": 1.087064089583825, + "grad_norm": 0.16950948536396027, + "learning_rate": 0.00017340136762890958, + "loss": 2.0054, + "step": 285600 + }, + { + "epoch": 1.0871021520519477, + "grad_norm": 0.16562749445438385, + "learning_rate": 0.0001733375850616828, + "loss": 2.0079, + "step": 285610 + }, + { + "epoch": 1.0871402145200704, + "grad_norm": 0.16646356880664825, + "learning_rate": 0.00017327381494590922, + "loss": 2.0163, + "step": 285620 + }, + { + "epoch": 1.087178276988193, + "grad_norm": 0.17872683703899384, + "learning_rate": 0.00017321005727429946, + "loss": 2.0024, + "step": 285630 + }, + { + "epoch": 1.0872163394563157, + "grad_norm": 0.14331817626953125, + "learning_rate": 0.00017314631203957126, + "loss": 2.0333, + "step": 285640 + }, + { + "epoch": 1.0872544019244383, + "grad_norm": 0.15764577686786652, + "learning_rate": 0.0001730825792344495, + "loss": 2.0171, + "step": 285650 + }, + { + "epoch": 1.087292464392561, + "grad_norm": 0.21209511160850525, + "learning_rate": 0.00017301885885166607, + "loss": 1.9974, + "step": 285660 + }, + { + "epoch": 1.0873305268606837, + "grad_norm": 0.14507344365119934, + "learning_rate": 0.00017295515088396008, + "loss": 2.0294, + "step": 285670 + }, + { + "epoch": 1.0873685893288065, + "grad_norm": 0.15377765893936157, + "learning_rate": 0.00017289145532407746, + "loss": 1.9843, + "step": 285680 + }, + { + "epoch": 1.0874066517969292, + "grad_norm": 0.19551454484462738, + "learning_rate": 0.0001728277721647715, + "loss": 2.019, + "step": 285690 + }, + { + "epoch": 1.0874447142650518, + "grad_norm": 0.17279034852981567, + "learning_rate": 0.00017276410139880233, + "loss": 2.0254, + "step": 285700 + }, + { + "epoch": 1.0874827767331745, + "grad_norm": 0.14661487936973572, + "learning_rate": 0.00017270044301893718, + "loss": 2.0132, + "step": 285710 + }, + { + "epoch": 1.0875208392012972, + "grad_norm": 0.14725978672504425, + "learning_rate": 0.00017263679701795033, + "loss": 2.0066, + "step": 285720 + }, + { + "epoch": 1.0875589016694198, + "grad_norm": 0.20444689691066742, + "learning_rate": 0.00017257316338862305, + "loss": 1.989, + "step": 285730 + }, + { + "epoch": 1.0875969641375425, + "grad_norm": 0.14883337914943695, + "learning_rate": 0.0001725095421237437, + "loss": 2.0105, + "step": 285740 + }, + { + "epoch": 1.0876350266056651, + "grad_norm": 0.1610412299633026, + "learning_rate": 0.00017244593321610757, + "loss": 2.0308, + "step": 285750 + }, + { + "epoch": 1.087673089073788, + "grad_norm": 0.17380206286907196, + "learning_rate": 0.00017238233665851693, + "loss": 2.0085, + "step": 285760 + }, + { + "epoch": 1.0877111515419107, + "grad_norm": 0.1897568255662918, + "learning_rate": 0.00017231875244378114, + "loss": 2.0239, + "step": 285770 + }, + { + "epoch": 1.0877492140100333, + "grad_norm": 0.18813052773475647, + "learning_rate": 0.0001722551805647164, + "loss": 2.0117, + "step": 285780 + }, + { + "epoch": 1.087787276478156, + "grad_norm": 0.15918661653995514, + "learning_rate": 0.00017219162101414593, + "loss": 1.993, + "step": 285790 + }, + { + "epoch": 1.0878253389462786, + "grad_norm": 0.13849321007728577, + "learning_rate": 0.00017212807378489997, + "loss": 2.0134, + "step": 285800 + }, + { + "epoch": 1.0878634014144013, + "grad_norm": 0.1621067076921463, + "learning_rate": 0.00017206453886981567, + "loss": 2.0181, + "step": 285810 + }, + { + "epoch": 1.087901463882524, + "grad_norm": 0.17289504408836365, + "learning_rate": 0.000172001016261737, + "loss": 1.9917, + "step": 285820 + }, + { + "epoch": 1.0879395263506466, + "grad_norm": 0.1490570455789566, + "learning_rate": 0.00017193750595351505, + "loss": 2.0093, + "step": 285830 + }, + { + "epoch": 1.0879775888187693, + "grad_norm": 0.15735618770122528, + "learning_rate": 0.00017187400793800767, + "loss": 2.0095, + "step": 285840 + }, + { + "epoch": 1.0880156512868921, + "grad_norm": 0.19365963339805603, + "learning_rate": 0.00017181052220807975, + "loss": 2.002, + "step": 285850 + }, + { + "epoch": 1.0880537137550148, + "grad_norm": 0.15690432488918304, + "learning_rate": 0.00017174704875660296, + "loss": 1.9952, + "step": 285860 + }, + { + "epoch": 1.0880917762231375, + "grad_norm": 0.17785774171352386, + "learning_rate": 0.00017168358757645597, + "loss": 1.9854, + "step": 285870 + }, + { + "epoch": 1.0881298386912601, + "grad_norm": 0.15966349840164185, + "learning_rate": 0.00017162013866052424, + "loss": 2.0002, + "step": 285880 + }, + { + "epoch": 1.0881679011593828, + "grad_norm": 0.14687706530094147, + "learning_rate": 0.00017155670200170016, + "loss": 2.0077, + "step": 285890 + }, + { + "epoch": 1.0882059636275054, + "grad_norm": 0.15228454768657684, + "learning_rate": 0.00017149327759288297, + "loss": 2.0044, + "step": 285900 + }, + { + "epoch": 1.088244026095628, + "grad_norm": 0.16104793548583984, + "learning_rate": 0.00017142986542697868, + "loss": 2.0053, + "step": 285910 + }, + { + "epoch": 1.0882820885637507, + "grad_norm": 0.1378631442785263, + "learning_rate": 0.00017136646549690033, + "loss": 2.0007, + "step": 285920 + }, + { + "epoch": 1.0883201510318736, + "grad_norm": 0.149201899766922, + "learning_rate": 0.00017130307779556763, + "loss": 2.0061, + "step": 285930 + }, + { + "epoch": 1.0883582134999963, + "grad_norm": 0.15267148613929749, + "learning_rate": 0.00017123970231590718, + "loss": 2.0115, + "step": 285940 + }, + { + "epoch": 1.088396275968119, + "grad_norm": 0.16789740324020386, + "learning_rate": 0.0001711763390508524, + "loss": 1.9813, + "step": 285950 + }, + { + "epoch": 1.0884343384362416, + "grad_norm": 0.1826680302619934, + "learning_rate": 0.00017111298799334345, + "loss": 2.0106, + "step": 285960 + }, + { + "epoch": 1.0884724009043643, + "grad_norm": 0.16165274381637573, + "learning_rate": 0.00017104964913632737, + "loss": 2.006, + "step": 285970 + }, + { + "epoch": 1.088510463372487, + "grad_norm": 0.15090411901474, + "learning_rate": 0.00017098632247275797, + "loss": 1.9881, + "step": 285980 + }, + { + "epoch": 1.0885485258406096, + "grad_norm": 0.15984360873699188, + "learning_rate": 0.00017092300799559585, + "loss": 2.0049, + "step": 285990 + }, + { + "epoch": 1.0885865883087322, + "grad_norm": 0.16495366394519806, + "learning_rate": 0.00017085970569780833, + "loss": 2.01, + "step": 286000 + }, + { + "epoch": 1.0886246507768549, + "grad_norm": 0.1553267389535904, + "learning_rate": 0.0001707964155723696, + "loss": 2.0111, + "step": 286010 + }, + { + "epoch": 1.0886627132449775, + "grad_norm": 0.14737053215503693, + "learning_rate": 0.00017073313761226038, + "loss": 2.0051, + "step": 286020 + }, + { + "epoch": 1.0887007757131004, + "grad_norm": 0.19088530540466309, + "learning_rate": 0.00017066987181046838, + "loss": 2.0076, + "step": 286030 + }, + { + "epoch": 1.088738838181223, + "grad_norm": 0.14853475987911224, + "learning_rate": 0.0001706066181599879, + "loss": 2.0169, + "step": 286040 + }, + { + "epoch": 1.0887769006493457, + "grad_norm": 0.22443360090255737, + "learning_rate": 0.00017054337665382008, + "loss": 2.0177, + "step": 286050 + }, + { + "epoch": 1.0888149631174684, + "grad_norm": 0.16358278691768646, + "learning_rate": 0.00017048014728497264, + "loss": 2.0092, + "step": 286060 + }, + { + "epoch": 1.088853025585591, + "grad_norm": 0.14100675284862518, + "learning_rate": 0.0001704169300464601, + "loss": 2.0094, + "step": 286070 + }, + { + "epoch": 1.0888910880537137, + "grad_norm": 0.14958365261554718, + "learning_rate": 0.00017035372493130359, + "loss": 1.9926, + "step": 286080 + }, + { + "epoch": 1.0889291505218364, + "grad_norm": 0.14957799017429352, + "learning_rate": 0.00017029053193253103, + "loss": 2.001, + "step": 286090 + }, + { + "epoch": 1.088967212989959, + "grad_norm": 0.16943015158176422, + "learning_rate": 0.00017022735104317704, + "loss": 2.01, + "step": 286100 + }, + { + "epoch": 1.089005275458082, + "grad_norm": 0.15172553062438965, + "learning_rate": 0.00017016418225628278, + "loss": 2.001, + "step": 286110 + }, + { + "epoch": 1.0890433379262046, + "grad_norm": 0.15565767884254456, + "learning_rate": 0.0001701010255648961, + "loss": 2.0, + "step": 286120 + }, + { + "epoch": 1.0890814003943272, + "grad_norm": 0.14762194454669952, + "learning_rate": 0.00017003788096207168, + "loss": 1.9912, + "step": 286130 + }, + { + "epoch": 1.0891194628624499, + "grad_norm": 0.1653757095336914, + "learning_rate": 0.00016997474844087057, + "loss": 2.0073, + "step": 286140 + }, + { + "epoch": 1.0891575253305725, + "grad_norm": 0.1787230223417282, + "learning_rate": 0.00016991162799436066, + "loss": 2.0119, + "step": 286150 + }, + { + "epoch": 1.0891955877986952, + "grad_norm": 0.21251770853996277, + "learning_rate": 0.00016984851961561643, + "loss": 2.0091, + "step": 286160 + }, + { + "epoch": 1.0892336502668178, + "grad_norm": 0.16268038749694824, + "learning_rate": 0.00016978542329771897, + "loss": 2.0079, + "step": 286170 + }, + { + "epoch": 1.0892717127349405, + "grad_norm": 0.139210045337677, + "learning_rate": 0.00016972233903375585, + "loss": 2.0042, + "step": 286180 + }, + { + "epoch": 1.0893097752030632, + "grad_norm": 0.17149564623832703, + "learning_rate": 0.00016965926681682141, + "loss": 2.0037, + "step": 286190 + }, + { + "epoch": 1.089347837671186, + "grad_norm": 0.1662767082452774, + "learning_rate": 0.0001695962066400165, + "loss": 2.0081, + "step": 286200 + }, + { + "epoch": 1.0893859001393087, + "grad_norm": 0.1791909635066986, + "learning_rate": 0.00016953315849644864, + "loss": 2.0038, + "step": 286210 + }, + { + "epoch": 1.0894239626074314, + "grad_norm": 0.14849887788295746, + "learning_rate": 0.00016947012237923182, + "loss": 2.006, + "step": 286220 + }, + { + "epoch": 1.089462025075554, + "grad_norm": 0.15307657420635223, + "learning_rate": 0.0001694070982814866, + "loss": 2.0029, + "step": 286230 + }, + { + "epoch": 1.0895000875436767, + "grad_norm": 0.1445232778787613, + "learning_rate": 0.00016934408619634011, + "loss": 1.9932, + "step": 286240 + }, + { + "epoch": 1.0895381500117993, + "grad_norm": 0.16825585067272186, + "learning_rate": 0.00016928108611692617, + "loss": 2.0032, + "step": 286250 + }, + { + "epoch": 1.089576212479922, + "grad_norm": 0.1817305088043213, + "learning_rate": 0.00016921809803638495, + "loss": 2.0079, + "step": 286260 + }, + { + "epoch": 1.0896142749480446, + "grad_norm": 0.1555592119693756, + "learning_rate": 0.0001691551219478631, + "loss": 1.9982, + "step": 286270 + }, + { + "epoch": 1.0896523374161675, + "grad_norm": 0.14670062065124512, + "learning_rate": 0.00016909215784451404, + "loss": 2.0045, + "step": 286280 + }, + { + "epoch": 1.0896903998842902, + "grad_norm": 0.15879133343696594, + "learning_rate": 0.0001690292057194976, + "loss": 2.0125, + "step": 286290 + }, + { + "epoch": 1.0897284623524128, + "grad_norm": 0.1739165037870407, + "learning_rate": 0.00016896626556597993, + "loss": 1.9925, + "step": 286300 + }, + { + "epoch": 1.0897665248205355, + "grad_norm": 0.13863325119018555, + "learning_rate": 0.0001689033373771339, + "loss": 2.0041, + "step": 286310 + }, + { + "epoch": 1.0898045872886581, + "grad_norm": 0.1826622039079666, + "learning_rate": 0.0001688404211461389, + "loss": 2.0126, + "step": 286320 + }, + { + "epoch": 1.0898426497567808, + "grad_norm": 0.18271639943122864, + "learning_rate": 0.00016877751686618053, + "loss": 1.9994, + "step": 286330 + }, + { + "epoch": 1.0898807122249035, + "grad_norm": 0.14765529334545135, + "learning_rate": 0.0001687146245304511, + "loss": 2.0127, + "step": 286340 + }, + { + "epoch": 1.0899187746930261, + "grad_norm": 0.1734556257724762, + "learning_rate": 0.0001686517441321493, + "loss": 2.0015, + "step": 286350 + }, + { + "epoch": 1.0899568371611488, + "grad_norm": 0.14427216351032257, + "learning_rate": 0.00016858887566448022, + "loss": 2.0012, + "step": 286360 + }, + { + "epoch": 1.0899948996292717, + "grad_norm": 0.17495569586753845, + "learning_rate": 0.00016852601912065556, + "loss": 2.0077, + "step": 286370 + }, + { + "epoch": 1.0900329620973943, + "grad_norm": 0.14779295027256012, + "learning_rate": 0.00016846317449389326, + "loss": 2.0107, + "step": 286380 + }, + { + "epoch": 1.090071024565517, + "grad_norm": 0.16964447498321533, + "learning_rate": 0.0001684003417774178, + "loss": 1.9968, + "step": 286390 + }, + { + "epoch": 1.0901090870336396, + "grad_norm": 0.14409834146499634, + "learning_rate": 0.00016833752096446003, + "loss": 2.0048, + "step": 286400 + }, + { + "epoch": 1.0901471495017623, + "grad_norm": 0.13902217149734497, + "learning_rate": 0.00016827471204825724, + "loss": 1.9971, + "step": 286410 + }, + { + "epoch": 1.090185211969885, + "grad_norm": 0.17094755172729492, + "learning_rate": 0.0001682119150220531, + "loss": 2.0094, + "step": 286420 + }, + { + "epoch": 1.0902232744380076, + "grad_norm": 0.164179265499115, + "learning_rate": 0.00016814912987909764, + "loss": 2.0153, + "step": 286430 + }, + { + "epoch": 1.0902613369061303, + "grad_norm": 0.14546635746955872, + "learning_rate": 0.00016808635661264744, + "loss": 2.0041, + "step": 286440 + }, + { + "epoch": 1.0902993993742531, + "grad_norm": 0.14570185542106628, + "learning_rate": 0.00016802359521596517, + "loss": 2.0091, + "step": 286450 + }, + { + "epoch": 1.0903374618423758, + "grad_norm": 0.14966364204883575, + "learning_rate": 0.00016796084568232016, + "loss": 1.9971, + "step": 286460 + }, + { + "epoch": 1.0903755243104984, + "grad_norm": 0.17986074090003967, + "learning_rate": 0.00016789810800498794, + "loss": 2.0117, + "step": 286470 + }, + { + "epoch": 1.090413586778621, + "grad_norm": 0.18305820226669312, + "learning_rate": 0.00016783538217725037, + "loss": 1.9942, + "step": 286480 + }, + { + "epoch": 1.0904516492467438, + "grad_norm": 0.20067545771598816, + "learning_rate": 0.00016777266819239573, + "loss": 2.0062, + "step": 286490 + }, + { + "epoch": 1.0904897117148664, + "grad_norm": 0.1565934121608734, + "learning_rate": 0.0001677099660437186, + "loss": 2.0087, + "step": 286500 + }, + { + "epoch": 1.090527774182989, + "grad_norm": 0.16560204327106476, + "learning_rate": 0.0001676472757245198, + "loss": 1.9894, + "step": 286510 + }, + { + "epoch": 1.0905658366511117, + "grad_norm": 0.15744443237781525, + "learning_rate": 0.00016758459722810676, + "loss": 1.9977, + "step": 286520 + }, + { + "epoch": 1.0906038991192344, + "grad_norm": 0.1524791717529297, + "learning_rate": 0.00016752193054779286, + "loss": 2.0021, + "step": 286530 + }, + { + "epoch": 1.0906419615873573, + "grad_norm": 0.16802918910980225, + "learning_rate": 0.00016745927567689801, + "loss": 1.9982, + "step": 286540 + }, + { + "epoch": 1.09068002405548, + "grad_norm": 0.16290956735610962, + "learning_rate": 0.0001673966326087482, + "loss": 2.0062, + "step": 286550 + }, + { + "epoch": 1.0907180865236026, + "grad_norm": 0.19118371605873108, + "learning_rate": 0.00016733400133667604, + "loss": 1.9943, + "step": 286560 + }, + { + "epoch": 1.0907561489917252, + "grad_norm": 0.17798908054828644, + "learning_rate": 0.0001672713818540201, + "loss": 2.007, + "step": 286570 + }, + { + "epoch": 1.090794211459848, + "grad_norm": 0.16545897722244263, + "learning_rate": 0.0001672087741541253, + "loss": 2.0211, + "step": 286580 + }, + { + "epoch": 1.0908322739279706, + "grad_norm": 0.16653016209602356, + "learning_rate": 0.00016714617823034288, + "loss": 2.0091, + "step": 286590 + }, + { + "epoch": 1.0908703363960932, + "grad_norm": 0.1368710845708847, + "learning_rate": 0.00016708359407603037, + "loss": 2.0035, + "step": 286600 + }, + { + "epoch": 1.0909083988642159, + "grad_norm": 0.16141341626644135, + "learning_rate": 0.0001670210216845514, + "loss": 2.0106, + "step": 286610 + }, + { + "epoch": 1.0909464613323387, + "grad_norm": 0.13862714171409607, + "learning_rate": 0.00016695846104927592, + "loss": 2.0011, + "step": 286620 + }, + { + "epoch": 1.0909845238004614, + "grad_norm": 0.13830986618995667, + "learning_rate": 0.00016689591216358012, + "loss": 2.0061, + "step": 286630 + }, + { + "epoch": 1.091022586268584, + "grad_norm": 0.1664019078016281, + "learning_rate": 0.0001668333750208464, + "loss": 1.9973, + "step": 286640 + }, + { + "epoch": 1.0910606487367067, + "grad_norm": 0.14373458921909332, + "learning_rate": 0.00016677084961446326, + "loss": 2.0144, + "step": 286650 + }, + { + "epoch": 1.0910987112048294, + "grad_norm": 0.16323675215244293, + "learning_rate": 0.00016670833593782558, + "loss": 2.0114, + "step": 286660 + }, + { + "epoch": 1.091136773672952, + "grad_norm": 0.167044535279274, + "learning_rate": 0.0001666458339843343, + "loss": 1.9995, + "step": 286670 + }, + { + "epoch": 1.0911748361410747, + "grad_norm": 0.14962568879127502, + "learning_rate": 0.00016658334374739664, + "loss": 1.9798, + "step": 286680 + }, + { + "epoch": 1.0912128986091973, + "grad_norm": 0.13388465344905853, + "learning_rate": 0.00016652086522042593, + "loss": 2.013, + "step": 286690 + }, + { + "epoch": 1.09125096107732, + "grad_norm": 0.14110036194324493, + "learning_rate": 0.0001664583983968417, + "loss": 1.9943, + "step": 286700 + }, + { + "epoch": 1.0912890235454429, + "grad_norm": 0.17264728248119354, + "learning_rate": 0.0001663959432700695, + "loss": 2.0083, + "step": 286710 + }, + { + "epoch": 1.0913270860135655, + "grad_norm": 0.16424086689949036, + "learning_rate": 0.00016633349983354135, + "loss": 2.0085, + "step": 286720 + }, + { + "epoch": 1.0913651484816882, + "grad_norm": 0.15271663665771484, + "learning_rate": 0.00016627106808069514, + "loss": 1.9945, + "step": 286730 + }, + { + "epoch": 1.0914032109498109, + "grad_norm": 0.1648697853088379, + "learning_rate": 0.00016620864800497503, + "loss": 1.9921, + "step": 286740 + }, + { + "epoch": 1.0914412734179335, + "grad_norm": 0.16213729977607727, + "learning_rate": 0.00016614623959983115, + "loss": 1.9955, + "step": 286750 + }, + { + "epoch": 1.0914793358860562, + "grad_norm": 0.1640002578496933, + "learning_rate": 0.00016608384285872002, + "loss": 2.0016, + "step": 286760 + }, + { + "epoch": 1.0915173983541788, + "grad_norm": 0.19037093222141266, + "learning_rate": 0.00016602145777510396, + "loss": 2.0206, + "step": 286770 + }, + { + "epoch": 1.0915554608223015, + "grad_norm": 0.1593031883239746, + "learning_rate": 0.00016595908434245167, + "loss": 1.9916, + "step": 286780 + }, + { + "epoch": 1.0915935232904244, + "grad_norm": 0.1630665510892868, + "learning_rate": 0.0001658967225542377, + "loss": 1.9885, + "step": 286790 + }, + { + "epoch": 1.091631585758547, + "grad_norm": 0.13461852073669434, + "learning_rate": 0.000165834372403943, + "loss": 2.0162, + "step": 286800 + }, + { + "epoch": 1.0916696482266697, + "grad_norm": 0.19513630867004395, + "learning_rate": 0.00016577203388505425, + "loss": 2.0226, + "step": 286810 + }, + { + "epoch": 1.0917077106947923, + "grad_norm": 0.1570121794939041, + "learning_rate": 0.0001657097069910644, + "loss": 2.0136, + "step": 286820 + }, + { + "epoch": 1.091745773162915, + "grad_norm": 0.144845649600029, + "learning_rate": 0.00016564739171547255, + "loss": 1.9948, + "step": 286830 + }, + { + "epoch": 1.0917838356310376, + "grad_norm": 0.15379267930984497, + "learning_rate": 0.00016558508805178358, + "loss": 2.0069, + "step": 286840 + }, + { + "epoch": 1.0918218980991603, + "grad_norm": 0.14519637823104858, + "learning_rate": 0.00016552279599350871, + "loss": 2.0013, + "step": 286850 + }, + { + "epoch": 1.091859960567283, + "grad_norm": 0.14761601388454437, + "learning_rate": 0.00016546051553416498, + "loss": 1.9983, + "step": 286860 + }, + { + "epoch": 1.0918980230354056, + "grad_norm": 0.15481062233448029, + "learning_rate": 0.00016539824666727559, + "loss": 1.9977, + "step": 286870 + }, + { + "epoch": 1.0919360855035283, + "grad_norm": 0.14929449558258057, + "learning_rate": 0.00016533598938636978, + "loss": 2.0055, + "step": 286880 + }, + { + "epoch": 1.0919741479716512, + "grad_norm": 0.16063956916332245, + "learning_rate": 0.00016527374368498272, + "loss": 2.006, + "step": 286890 + }, + { + "epoch": 1.0920122104397738, + "grad_norm": 0.139910489320755, + "learning_rate": 0.0001652115095566556, + "loss": 1.9999, + "step": 286900 + }, + { + "epoch": 1.0920502729078965, + "grad_norm": 0.15074720978736877, + "learning_rate": 0.0001651492869949356, + "loss": 2.0081, + "step": 286910 + }, + { + "epoch": 1.0920883353760191, + "grad_norm": 0.14458464086055756, + "learning_rate": 0.00016508707599337607, + "loss": 2.0055, + "step": 286920 + }, + { + "epoch": 1.0921263978441418, + "grad_norm": 0.15293492376804352, + "learning_rate": 0.0001650248765455361, + "loss": 2.0052, + "step": 286930 + }, + { + "epoch": 1.0921644603122644, + "grad_norm": 0.1694762408733368, + "learning_rate": 0.00016496268864498093, + "loss": 2.0053, + "step": 286940 + }, + { + "epoch": 1.092202522780387, + "grad_norm": 0.1571517437696457, + "learning_rate": 0.00016490051228528168, + "loss": 1.9962, + "step": 286950 + }, + { + "epoch": 1.0922405852485098, + "grad_norm": 0.1510549634695053, + "learning_rate": 0.00016483834746001548, + "loss": 1.9841, + "step": 286960 + }, + { + "epoch": 1.0922786477166326, + "grad_norm": 0.1525040715932846, + "learning_rate": 0.00016477619416276534, + "loss": 2.0132, + "step": 286970 + }, + { + "epoch": 1.0923167101847553, + "grad_norm": 0.14453206956386566, + "learning_rate": 0.0001647140523871204, + "loss": 2.0037, + "step": 286980 + }, + { + "epoch": 1.092354772652878, + "grad_norm": 0.19410859048366547, + "learning_rate": 0.00016465192212667545, + "loss": 1.9918, + "step": 286990 + }, + { + "epoch": 1.0923928351210006, + "grad_norm": 0.17885205149650574, + "learning_rate": 0.00016458980337503154, + "loss": 2.0214, + "step": 287000 + }, + { + "epoch": 1.0924308975891233, + "grad_norm": 0.14350204169750214, + "learning_rate": 0.00016452769612579544, + "loss": 2.0075, + "step": 287010 + }, + { + "epoch": 1.092468960057246, + "grad_norm": 0.15950074791908264, + "learning_rate": 0.00016446560037257978, + "loss": 2.004, + "step": 287020 + }, + { + "epoch": 1.0925070225253686, + "grad_norm": 0.17068053781986237, + "learning_rate": 0.00016440351610900333, + "loss": 1.9916, + "step": 287030 + }, + { + "epoch": 1.0925450849934912, + "grad_norm": 0.15856210887432098, + "learning_rate": 0.00016434144332869055, + "loss": 2.0099, + "step": 287040 + }, + { + "epoch": 1.092583147461614, + "grad_norm": 0.15143512189388275, + "learning_rate": 0.00016427938202527193, + "loss": 2.0088, + "step": 287050 + }, + { + "epoch": 1.0926212099297368, + "grad_norm": 0.14432553946971893, + "learning_rate": 0.0001642173321923837, + "loss": 1.9955, + "step": 287060 + }, + { + "epoch": 1.0926592723978594, + "grad_norm": 0.19511444866657257, + "learning_rate": 0.00016415529382366817, + "loss": 2.0103, + "step": 287070 + }, + { + "epoch": 1.092697334865982, + "grad_norm": 0.16971547901630402, + "learning_rate": 0.00016409326691277338, + "loss": 2.0093, + "step": 287080 + }, + { + "epoch": 1.0927353973341047, + "grad_norm": 0.16324810683727264, + "learning_rate": 0.00016403125145335318, + "loss": 2.0059, + "step": 287090 + }, + { + "epoch": 1.0927734598022274, + "grad_norm": 0.1708240658044815, + "learning_rate": 0.0001639692474390675, + "loss": 1.9884, + "step": 287100 + }, + { + "epoch": 1.09281152227035, + "grad_norm": 0.16044610738754272, + "learning_rate": 0.00016390725486358187, + "loss": 1.9985, + "step": 287110 + }, + { + "epoch": 1.0928495847384727, + "grad_norm": 0.18719172477722168, + "learning_rate": 0.00016384527372056778, + "loss": 1.995, + "step": 287120 + }, + { + "epoch": 1.0928876472065954, + "grad_norm": 0.15649785101413727, + "learning_rate": 0.0001637833040037026, + "loss": 1.9986, + "step": 287130 + }, + { + "epoch": 1.0929257096747182, + "grad_norm": 0.15070591866970062, + "learning_rate": 0.00016372134570666945, + "loss": 2.0029, + "step": 287140 + }, + { + "epoch": 1.092963772142841, + "grad_norm": 0.15930724143981934, + "learning_rate": 0.00016365939882315722, + "loss": 2.0, + "step": 287150 + }, + { + "epoch": 1.0930018346109636, + "grad_norm": 0.1533130407333374, + "learning_rate": 0.00016359746334686082, + "loss": 1.9915, + "step": 287160 + }, + { + "epoch": 1.0930398970790862, + "grad_norm": 0.15068431198596954, + "learning_rate": 0.0001635355392714807, + "loss": 2.0032, + "step": 287170 + }, + { + "epoch": 1.0930779595472089, + "grad_norm": 0.1491413116455078, + "learning_rate": 0.00016347362659072318, + "loss": 2.0149, + "step": 287180 + }, + { + "epoch": 1.0931160220153315, + "grad_norm": 0.1355583220720291, + "learning_rate": 0.00016341172529830056, + "loss": 2.0016, + "step": 287190 + }, + { + "epoch": 1.0931540844834542, + "grad_norm": 0.1516856998205185, + "learning_rate": 0.00016334983538793076, + "loss": 2.0003, + "step": 287200 + }, + { + "epoch": 1.0931921469515768, + "grad_norm": 0.1841883808374405, + "learning_rate": 0.00016328795685333736, + "loss": 2.0038, + "step": 287210 + }, + { + "epoch": 1.0932302094196995, + "grad_norm": 0.15536165237426758, + "learning_rate": 0.00016322608968824998, + "loss": 2.0222, + "step": 287220 + }, + { + "epoch": 1.0932682718878224, + "grad_norm": 0.17003241181373596, + "learning_rate": 0.00016316423388640372, + "loss": 2.0192, + "step": 287230 + }, + { + "epoch": 1.093306334355945, + "grad_norm": 0.15207625925540924, + "learning_rate": 0.00016310238944153966, + "loss": 1.9915, + "step": 287240 + }, + { + "epoch": 1.0933443968240677, + "grad_norm": 0.18474330008029938, + "learning_rate": 0.0001630405563474045, + "loss": 1.9994, + "step": 287250 + }, + { + "epoch": 1.0933824592921904, + "grad_norm": 0.17792512476444244, + "learning_rate": 0.00016297873459775075, + "loss": 1.9965, + "step": 287260 + }, + { + "epoch": 1.093420521760313, + "grad_norm": 0.14206421375274658, + "learning_rate": 0.00016291692418633652, + "loss": 1.9997, + "step": 287270 + }, + { + "epoch": 1.0934585842284357, + "grad_norm": 0.15516752004623413, + "learning_rate": 0.0001628551251069258, + "loss": 1.9974, + "step": 287280 + }, + { + "epoch": 1.0934966466965583, + "grad_norm": 0.1504223644733429, + "learning_rate": 0.00016279333735328815, + "loss": 2.0051, + "step": 287290 + }, + { + "epoch": 1.093534709164681, + "grad_norm": 0.18267503380775452, + "learning_rate": 0.00016273156091919894, + "loss": 2.0162, + "step": 287300 + }, + { + "epoch": 1.0935727716328039, + "grad_norm": 0.14748525619506836, + "learning_rate": 0.00016266979579843927, + "loss": 2.0015, + "step": 287310 + }, + { + "epoch": 1.0936108341009265, + "grad_norm": 0.19388824701309204, + "learning_rate": 0.00016260804198479578, + "loss": 1.9957, + "step": 287320 + }, + { + "epoch": 1.0936488965690492, + "grad_norm": 0.15450628101825714, + "learning_rate": 0.0001625462994720609, + "loss": 2.0021, + "step": 287330 + }, + { + "epoch": 1.0936869590371718, + "grad_norm": 0.14921757578849792, + "learning_rate": 0.0001624845682540328, + "loss": 1.9967, + "step": 287340 + }, + { + "epoch": 1.0937250215052945, + "grad_norm": 0.13192152976989746, + "learning_rate": 0.00016242284832451508, + "loss": 2.023, + "step": 287350 + }, + { + "epoch": 1.0937630839734171, + "grad_norm": 0.16562673449516296, + "learning_rate": 0.00016236113967731737, + "loss": 2.0048, + "step": 287360 + }, + { + "epoch": 1.0938011464415398, + "grad_norm": 0.14732389152050018, + "learning_rate": 0.0001622994423062546, + "loss": 1.9913, + "step": 287370 + }, + { + "epoch": 1.0938392089096625, + "grad_norm": 0.16607753932476044, + "learning_rate": 0.00016223775620514757, + "loss": 2.0023, + "step": 287380 + }, + { + "epoch": 1.0938772713777851, + "grad_norm": 0.15482759475708008, + "learning_rate": 0.00016217608136782265, + "loss": 1.9949, + "step": 287390 + }, + { + "epoch": 1.093915333845908, + "grad_norm": 0.18163645267486572, + "learning_rate": 0.00016211441778811182, + "loss": 1.9959, + "step": 287400 + }, + { + "epoch": 1.0939533963140307, + "grad_norm": 0.16344821453094482, + "learning_rate": 0.00016205276545985272, + "loss": 2.0087, + "step": 287410 + }, + { + "epoch": 1.0939914587821533, + "grad_norm": 0.16161710023880005, + "learning_rate": 0.00016199112437688856, + "loss": 1.9912, + "step": 287420 + }, + { + "epoch": 1.094029521250276, + "grad_norm": 0.16121038794517517, + "learning_rate": 0.00016192949453306833, + "loss": 1.9999, + "step": 287430 + }, + { + "epoch": 1.0940675837183986, + "grad_norm": 0.16078747808933258, + "learning_rate": 0.00016186787592224644, + "loss": 2.0047, + "step": 287440 + }, + { + "epoch": 1.0941056461865213, + "grad_norm": 0.1562453657388687, + "learning_rate": 0.0001618062685382829, + "loss": 2.0022, + "step": 287450 + }, + { + "epoch": 1.094143708654644, + "grad_norm": 0.1912255585193634, + "learning_rate": 0.0001617446723750435, + "loss": 2.0044, + "step": 287460 + }, + { + "epoch": 1.0941817711227666, + "grad_norm": 0.1432357132434845, + "learning_rate": 0.00016168308742639938, + "loss": 1.9939, + "step": 287470 + }, + { + "epoch": 1.0942198335908895, + "grad_norm": 0.16316208243370056, + "learning_rate": 0.0001616215136862274, + "loss": 2.0033, + "step": 287480 + }, + { + "epoch": 1.0942578960590121, + "grad_norm": 0.21677039563655853, + "learning_rate": 0.00016155995114840993, + "loss": 2.0013, + "step": 287490 + }, + { + "epoch": 1.0942959585271348, + "grad_norm": 0.15556517243385315, + "learning_rate": 0.000161498399806835, + "loss": 2.0027, + "step": 287500 + }, + { + "epoch": 1.0943340209952575, + "grad_norm": 0.17532147467136383, + "learning_rate": 0.000161436859655396, + "loss": 1.9996, + "step": 287510 + }, + { + "epoch": 1.09437208346338, + "grad_norm": 0.20894719660282135, + "learning_rate": 0.00016137533068799216, + "loss": 2.0244, + "step": 287520 + }, + { + "epoch": 1.0944101459315028, + "grad_norm": 0.15300019085407257, + "learning_rate": 0.00016131381289852798, + "loss": 2.0154, + "step": 287530 + }, + { + "epoch": 1.0944482083996254, + "grad_norm": 0.15588730573654175, + "learning_rate": 0.00016125230628091352, + "loss": 2.0095, + "step": 287540 + }, + { + "epoch": 1.094486270867748, + "grad_norm": 0.14091601967811584, + "learning_rate": 0.0001611908108290646, + "loss": 1.9952, + "step": 287550 + }, + { + "epoch": 1.0945243333358707, + "grad_norm": 0.1498078852891922, + "learning_rate": 0.00016112932653690237, + "loss": 1.9998, + "step": 287560 + }, + { + "epoch": 1.0945623958039936, + "grad_norm": 0.15718935430049896, + "learning_rate": 0.0001610678533983535, + "loss": 2.0035, + "step": 287570 + }, + { + "epoch": 1.0946004582721163, + "grad_norm": 0.14311803877353668, + "learning_rate": 0.00016100639140735019, + "loss": 2.0122, + "step": 287580 + }, + { + "epoch": 1.094638520740239, + "grad_norm": 0.17664818465709686, + "learning_rate": 0.00016094494055783016, + "loss": 2.0004, + "step": 287590 + }, + { + "epoch": 1.0946765832083616, + "grad_norm": 0.17590229213237762, + "learning_rate": 0.00016088350084373659, + "loss": 2.0063, + "step": 287600 + }, + { + "epoch": 1.0947146456764842, + "grad_norm": 0.20563502609729767, + "learning_rate": 0.00016082207225901824, + "loss": 1.9919, + "step": 287610 + }, + { + "epoch": 1.094752708144607, + "grad_norm": 0.15983684360980988, + "learning_rate": 0.00016076065479762915, + "loss": 2.0112, + "step": 287620 + }, + { + "epoch": 1.0947907706127296, + "grad_norm": 0.15316280722618103, + "learning_rate": 0.00016069924845352906, + "loss": 2.0041, + "step": 287630 + }, + { + "epoch": 1.0948288330808522, + "grad_norm": 0.1465786248445511, + "learning_rate": 0.00016063785322068303, + "loss": 2.0049, + "step": 287640 + }, + { + "epoch": 1.094866895548975, + "grad_norm": 0.20526903867721558, + "learning_rate": 0.00016057646909306168, + "loss": 1.9967, + "step": 287650 + }, + { + "epoch": 1.0949049580170978, + "grad_norm": 0.15140342712402344, + "learning_rate": 0.00016051509606464088, + "loss": 2.0155, + "step": 287660 + }, + { + "epoch": 1.0949430204852204, + "grad_norm": 0.13655777275562286, + "learning_rate": 0.00016045373412940222, + "loss": 2.0007, + "step": 287670 + }, + { + "epoch": 1.094981082953343, + "grad_norm": 0.14832167327404022, + "learning_rate": 0.00016039238328133254, + "loss": 2.0155, + "step": 287680 + }, + { + "epoch": 1.0950191454214657, + "grad_norm": 0.1434812843799591, + "learning_rate": 0.00016033104351442418, + "loss": 2.0123, + "step": 287690 + }, + { + "epoch": 1.0950572078895884, + "grad_norm": 0.14046348631381989, + "learning_rate": 0.0001602697148226749, + "loss": 2.0025, + "step": 287700 + }, + { + "epoch": 1.095095270357711, + "grad_norm": 0.14863237738609314, + "learning_rate": 0.0001602083972000878, + "loss": 2.0069, + "step": 287710 + }, + { + "epoch": 1.0951333328258337, + "grad_norm": 0.15894490480422974, + "learning_rate": 0.0001601470906406714, + "loss": 2.0039, + "step": 287720 + }, + { + "epoch": 1.0951713952939564, + "grad_norm": 0.16327515244483948, + "learning_rate": 0.00016008579513843984, + "loss": 1.9813, + "step": 287730 + }, + { + "epoch": 1.095209457762079, + "grad_norm": 0.1889437884092331, + "learning_rate": 0.00016002451068741248, + "loss": 2.0002, + "step": 287740 + }, + { + "epoch": 1.0952475202302019, + "grad_norm": 0.17731782793998718, + "learning_rate": 0.00015996323728161393, + "loss": 2.0006, + "step": 287750 + }, + { + "epoch": 1.0952855826983245, + "grad_norm": 0.19618789851665497, + "learning_rate": 0.00015990197491507442, + "loss": 1.9982, + "step": 287760 + }, + { + "epoch": 1.0953236451664472, + "grad_norm": 0.1554926037788391, + "learning_rate": 0.0001598407235818295, + "loss": 1.9862, + "step": 287770 + }, + { + "epoch": 1.0953617076345699, + "grad_norm": 0.16439561545848846, + "learning_rate": 0.00015977948327592002, + "loss": 2.0052, + "step": 287780 + }, + { + "epoch": 1.0953997701026925, + "grad_norm": 0.15064625442028046, + "learning_rate": 0.00015971825399139223, + "loss": 2.0139, + "step": 287790 + }, + { + "epoch": 1.0954378325708152, + "grad_norm": 0.16435232758522034, + "learning_rate": 0.00015965703572229774, + "loss": 2.019, + "step": 287800 + }, + { + "epoch": 1.0954758950389378, + "grad_norm": 0.13841593265533447, + "learning_rate": 0.0001595958284626935, + "loss": 1.9997, + "step": 287810 + }, + { + "epoch": 1.0955139575070605, + "grad_norm": 0.16646708548069, + "learning_rate": 0.00015953463220664182, + "loss": 2.005, + "step": 287820 + }, + { + "epoch": 1.0955520199751834, + "grad_norm": 0.2216751128435135, + "learning_rate": 0.0001594734469482103, + "loss": 2.0131, + "step": 287830 + }, + { + "epoch": 1.095590082443306, + "grad_norm": 0.19386476278305054, + "learning_rate": 0.00015941227268147196, + "loss": 2.0042, + "step": 287840 + }, + { + "epoch": 1.0956281449114287, + "grad_norm": 0.13800813257694244, + "learning_rate": 0.00015935110940050508, + "loss": 2.0069, + "step": 287850 + }, + { + "epoch": 1.0956662073795513, + "grad_norm": 0.14591440558433533, + "learning_rate": 0.00015928995709939321, + "loss": 2.0032, + "step": 287860 + }, + { + "epoch": 1.095704269847674, + "grad_norm": 0.24522840976715088, + "learning_rate": 0.00015922881577222524, + "loss": 2.0018, + "step": 287870 + }, + { + "epoch": 1.0957423323157967, + "grad_norm": 0.14610832929611206, + "learning_rate": 0.00015916768541309546, + "loss": 2.0102, + "step": 287880 + }, + { + "epoch": 1.0957803947839193, + "grad_norm": 0.15744467079639435, + "learning_rate": 0.0001591065660161033, + "loss": 2.0041, + "step": 287890 + }, + { + "epoch": 1.095818457252042, + "grad_norm": 0.14038099348545074, + "learning_rate": 0.0001590454575753536, + "loss": 2.0019, + "step": 287900 + }, + { + "epoch": 1.0958565197201646, + "grad_norm": 0.16151206195354462, + "learning_rate": 0.00015898436008495643, + "loss": 2.0176, + "step": 287910 + }, + { + "epoch": 1.0958945821882875, + "grad_norm": 0.17450959980487823, + "learning_rate": 0.0001589232735390271, + "loss": 2.0055, + "step": 287920 + }, + { + "epoch": 1.0959326446564102, + "grad_norm": 0.14196312427520752, + "learning_rate": 0.00015886219793168627, + "loss": 2.0046, + "step": 287930 + }, + { + "epoch": 1.0959707071245328, + "grad_norm": 0.19493959844112396, + "learning_rate": 0.0001588011332570598, + "loss": 1.9931, + "step": 287940 + }, + { + "epoch": 1.0960087695926555, + "grad_norm": 0.1691603660583496, + "learning_rate": 0.00015874007950927881, + "loss": 2.0087, + "step": 287950 + }, + { + "epoch": 1.0960468320607781, + "grad_norm": 0.1751086562871933, + "learning_rate": 0.00015867903668247973, + "loss": 1.9943, + "step": 287960 + }, + { + "epoch": 1.0960848945289008, + "grad_norm": 0.18380074203014374, + "learning_rate": 0.00015861800477080419, + "loss": 2.0019, + "step": 287970 + }, + { + "epoch": 1.0961229569970234, + "grad_norm": 0.1523730605840683, + "learning_rate": 0.000158556983768399, + "loss": 1.9901, + "step": 287980 + }, + { + "epoch": 1.096161019465146, + "grad_norm": 0.18034271895885468, + "learning_rate": 0.0001584959736694163, + "loss": 1.9968, + "step": 287990 + }, + { + "epoch": 1.096199081933269, + "grad_norm": 0.15186168253421783, + "learning_rate": 0.0001584349744680134, + "loss": 2.0008, + "step": 288000 + }, + { + "epoch": 1.0962371444013916, + "grad_norm": 0.16246947646141052, + "learning_rate": 0.0001583739861583528, + "loss": 1.9983, + "step": 288010 + }, + { + "epoch": 1.0962752068695143, + "grad_norm": 0.14793811738491058, + "learning_rate": 0.00015831300873460225, + "loss": 2.0115, + "step": 288020 + }, + { + "epoch": 1.096313269337637, + "grad_norm": 0.15794621407985687, + "learning_rate": 0.0001582520421909347, + "loss": 1.9923, + "step": 288030 + }, + { + "epoch": 1.0963513318057596, + "grad_norm": 0.15211062133312225, + "learning_rate": 0.00015819108652152835, + "loss": 2.015, + "step": 288040 + }, + { + "epoch": 1.0963893942738823, + "grad_norm": 0.15101411938667297, + "learning_rate": 0.00015813014172056639, + "loss": 1.9842, + "step": 288050 + }, + { + "epoch": 1.096427456742005, + "grad_norm": 0.1630418449640274, + "learning_rate": 0.00015806920778223748, + "loss": 1.9971, + "step": 288060 + }, + { + "epoch": 1.0964655192101276, + "grad_norm": 0.1481507569551468, + "learning_rate": 0.0001580082847007353, + "loss": 1.9978, + "step": 288070 + }, + { + "epoch": 1.0965035816782502, + "grad_norm": 0.1584104299545288, + "learning_rate": 0.0001579473724702586, + "loss": 1.9828, + "step": 288080 + }, + { + "epoch": 1.0965416441463731, + "grad_norm": 0.16793425381183624, + "learning_rate": 0.00015788647108501153, + "loss": 2.0079, + "step": 288090 + }, + { + "epoch": 1.0965797066144958, + "grad_norm": 0.15810927748680115, + "learning_rate": 0.00015782558053920322, + "loss": 2.0104, + "step": 288100 + }, + { + "epoch": 1.0966177690826184, + "grad_norm": 0.15413767099380493, + "learning_rate": 0.00015776470082704791, + "loss": 2.0074, + "step": 288110 + }, + { + "epoch": 1.096655831550741, + "grad_norm": 0.14198710024356842, + "learning_rate": 0.00015770383194276528, + "loss": 1.9906, + "step": 288120 + }, + { + "epoch": 1.0966938940188637, + "grad_norm": 0.17314308881759644, + "learning_rate": 0.00015764297388057985, + "loss": 1.9965, + "step": 288130 + }, + { + "epoch": 1.0967319564869864, + "grad_norm": 0.21093778312206268, + "learning_rate": 0.00015758212663472137, + "loss": 2.0053, + "step": 288140 + }, + { + "epoch": 1.096770018955109, + "grad_norm": 0.16736042499542236, + "learning_rate": 0.00015752129019942467, + "loss": 2.0135, + "step": 288150 + }, + { + "epoch": 1.0968080814232317, + "grad_norm": 0.15092270076274872, + "learning_rate": 0.00015746046456892988, + "loss": 1.9901, + "step": 288160 + }, + { + "epoch": 1.0968461438913546, + "grad_norm": 0.14054399728775024, + "learning_rate": 0.00015739964973748207, + "loss": 1.9955, + "step": 288170 + }, + { + "epoch": 1.0968842063594773, + "grad_norm": 0.1438574343919754, + "learning_rate": 0.00015733884569933131, + "loss": 2.0008, + "step": 288180 + }, + { + "epoch": 1.0969222688276, + "grad_norm": 0.15705853700637817, + "learning_rate": 0.0001572780524487331, + "loss": 2.0009, + "step": 288190 + }, + { + "epoch": 1.0969603312957226, + "grad_norm": 0.15003721415996552, + "learning_rate": 0.0001572172699799478, + "loss": 1.9926, + "step": 288200 + }, + { + "epoch": 1.0969983937638452, + "grad_norm": 0.2214089035987854, + "learning_rate": 0.00015715649828724088, + "loss": 1.9999, + "step": 288210 + }, + { + "epoch": 1.0970364562319679, + "grad_norm": 0.20071560144424438, + "learning_rate": 0.00015709573736488296, + "loss": 1.9994, + "step": 288220 + }, + { + "epoch": 1.0970745187000905, + "grad_norm": 0.17797204852104187, + "learning_rate": 0.00015703498720714965, + "loss": 1.9929, + "step": 288230 + }, + { + "epoch": 1.0971125811682132, + "grad_norm": 0.1843406856060028, + "learning_rate": 0.00015697424780832176, + "loss": 1.9963, + "step": 288240 + }, + { + "epoch": 1.0971506436363359, + "grad_norm": 0.1717744767665863, + "learning_rate": 0.000156913519162685, + "loss": 2.007, + "step": 288250 + }, + { + "epoch": 1.0971887061044587, + "grad_norm": 0.15767750144004822, + "learning_rate": 0.00015685280126453023, + "loss": 2.0069, + "step": 288260 + }, + { + "epoch": 1.0972267685725814, + "grad_norm": 0.23259855806827545, + "learning_rate": 0.00015679209410815337, + "loss": 2.0009, + "step": 288270 + }, + { + "epoch": 1.097264831040704, + "grad_norm": 0.1731208860874176, + "learning_rate": 0.00015673139768785532, + "loss": 1.9992, + "step": 288280 + }, + { + "epoch": 1.0973028935088267, + "grad_norm": 0.16261719167232513, + "learning_rate": 0.00015667071199794213, + "loss": 1.9964, + "step": 288290 + }, + { + "epoch": 1.0973409559769494, + "grad_norm": 0.147927924990654, + "learning_rate": 0.00015661003703272474, + "loss": 2.0035, + "step": 288300 + }, + { + "epoch": 1.097379018445072, + "grad_norm": 0.21638403832912445, + "learning_rate": 0.00015654937278651926, + "loss": 2.006, + "step": 288310 + }, + { + "epoch": 1.0974170809131947, + "grad_norm": 0.1866307407617569, + "learning_rate": 0.00015648871925364667, + "loss": 1.9907, + "step": 288320 + }, + { + "epoch": 1.0974551433813173, + "grad_norm": 0.16943003237247467, + "learning_rate": 0.000156428076428433, + "loss": 1.996, + "step": 288330 + }, + { + "epoch": 1.0974932058494402, + "grad_norm": 0.1697445660829544, + "learning_rate": 0.00015636744430520943, + "loss": 1.9813, + "step": 288340 + }, + { + "epoch": 1.0975312683175629, + "grad_norm": 0.13972648978233337, + "learning_rate": 0.000156306822878312, + "loss": 1.9917, + "step": 288350 + }, + { + "epoch": 1.0975693307856855, + "grad_norm": 0.15015487372875214, + "learning_rate": 0.00015624621214208173, + "loss": 1.9955, + "step": 288360 + }, + { + "epoch": 1.0976073932538082, + "grad_norm": 0.14674119651317596, + "learning_rate": 0.00015618561209086474, + "loss": 2.0034, + "step": 288370 + }, + { + "epoch": 1.0976454557219308, + "grad_norm": 0.18153975903987885, + "learning_rate": 0.00015612502271901202, + "loss": 1.9931, + "step": 288380 + }, + { + "epoch": 1.0976835181900535, + "grad_norm": 0.16578781604766846, + "learning_rate": 0.0001560644440208796, + "loss": 1.998, + "step": 288390 + }, + { + "epoch": 1.0977215806581762, + "grad_norm": 0.1398111879825592, + "learning_rate": 0.00015600387599082845, + "loss": 2.0043, + "step": 288400 + }, + { + "epoch": 1.0977596431262988, + "grad_norm": 0.15062817931175232, + "learning_rate": 0.00015594331862322453, + "loss": 1.9935, + "step": 288410 + }, + { + "epoch": 1.0977977055944215, + "grad_norm": 0.1535642296075821, + "learning_rate": 0.0001558827719124387, + "loss": 1.9983, + "step": 288420 + }, + { + "epoch": 1.0978357680625443, + "grad_norm": 0.13218477368354797, + "learning_rate": 0.00015582223585284695, + "loss": 1.9896, + "step": 288430 + }, + { + "epoch": 1.097873830530667, + "grad_norm": 0.1496010273694992, + "learning_rate": 0.00015576171043882992, + "loss": 1.9917, + "step": 288440 + }, + { + "epoch": 1.0979118929987897, + "grad_norm": 0.160117968916893, + "learning_rate": 0.00015570119566477337, + "loss": 1.9945, + "step": 288450 + }, + { + "epoch": 1.0979499554669123, + "grad_norm": 0.15955455601215363, + "learning_rate": 0.00015564069152506805, + "loss": 1.9808, + "step": 288460 + }, + { + "epoch": 1.097988017935035, + "grad_norm": 0.152132049202919, + "learning_rate": 0.00015558019801410956, + "loss": 2.0021, + "step": 288470 + }, + { + "epoch": 1.0980260804031576, + "grad_norm": 0.15424053370952606, + "learning_rate": 0.0001555197151262983, + "loss": 2.0095, + "step": 288480 + }, + { + "epoch": 1.0980641428712803, + "grad_norm": 0.15129613876342773, + "learning_rate": 0.00015545924285603985, + "loss": 2.0025, + "step": 288490 + }, + { + "epoch": 1.098102205339403, + "grad_norm": 0.1594102382659912, + "learning_rate": 0.00015539878119774446, + "loss": 2.0106, + "step": 288500 + }, + { + "epoch": 1.0981402678075258, + "grad_norm": 0.21141329407691956, + "learning_rate": 0.00015533833014582738, + "loss": 1.9938, + "step": 288510 + }, + { + "epoch": 1.0981783302756485, + "grad_norm": 0.14478717744350433, + "learning_rate": 0.00015527788969470885, + "loss": 2.0059, + "step": 288520 + }, + { + "epoch": 1.0982163927437711, + "grad_norm": 0.19828860461711884, + "learning_rate": 0.0001552174598388138, + "loss": 1.9837, + "step": 288530 + }, + { + "epoch": 1.0982544552118938, + "grad_norm": 0.16630606353282928, + "learning_rate": 0.00015515704057257214, + "loss": 2.0055, + "step": 288540 + }, + { + "epoch": 1.0982925176800165, + "grad_norm": 0.1673320233821869, + "learning_rate": 0.00015509663189041873, + "loss": 1.999, + "step": 288550 + }, + { + "epoch": 1.0983305801481391, + "grad_norm": 0.13971549272537231, + "learning_rate": 0.00015503623378679323, + "loss": 2.0113, + "step": 288560 + }, + { + "epoch": 1.0983686426162618, + "grad_norm": 0.14060862362384796, + "learning_rate": 0.00015497584625614007, + "loss": 1.9897, + "step": 288570 + }, + { + "epoch": 1.0984067050843844, + "grad_norm": 0.22311097383499146, + "learning_rate": 0.00015491546929290883, + "loss": 1.9926, + "step": 288580 + }, + { + "epoch": 1.098444767552507, + "grad_norm": 0.1624974012374878, + "learning_rate": 0.0001548551028915537, + "loss": 2.0086, + "step": 288590 + }, + { + "epoch": 1.0984828300206297, + "grad_norm": 0.1577761322259903, + "learning_rate": 0.0001547947470465337, + "loss": 2.0068, + "step": 288600 + }, + { + "epoch": 1.0985208924887526, + "grad_norm": 0.14535222947597504, + "learning_rate": 0.0001547344017523128, + "loss": 2.0043, + "step": 288610 + }, + { + "epoch": 1.0985589549568753, + "grad_norm": 0.15469293296337128, + "learning_rate": 0.00015467406700335985, + "loss": 1.9993, + "step": 288620 + }, + { + "epoch": 1.098597017424998, + "grad_norm": 0.18461138010025024, + "learning_rate": 0.0001546137427941484, + "loss": 1.9892, + "step": 288630 + }, + { + "epoch": 1.0986350798931206, + "grad_norm": 0.15238602459430695, + "learning_rate": 0.00015455342911915694, + "loss": 2.0026, + "step": 288640 + }, + { + "epoch": 1.0986731423612432, + "grad_norm": 0.15034642815589905, + "learning_rate": 0.00015449312597286868, + "loss": 2.0044, + "step": 288650 + }, + { + "epoch": 1.098711204829366, + "grad_norm": 0.16094571352005005, + "learning_rate": 0.0001544328333497717, + "loss": 2.0095, + "step": 288660 + }, + { + "epoch": 1.0987492672974886, + "grad_norm": 0.15386877954006195, + "learning_rate": 0.0001543725512443589, + "loss": 1.9989, + "step": 288670 + }, + { + "epoch": 1.0987873297656112, + "grad_norm": 0.14561620354652405, + "learning_rate": 0.00015431227965112792, + "loss": 1.9938, + "step": 288680 + }, + { + "epoch": 1.098825392233734, + "grad_norm": 0.15083807706832886, + "learning_rate": 0.00015425201856458125, + "loss": 1.9973, + "step": 288690 + }, + { + "epoch": 1.0988634547018568, + "grad_norm": 0.148808091878891, + "learning_rate": 0.0001541917679792262, + "loss": 1.9817, + "step": 288700 + }, + { + "epoch": 1.0989015171699794, + "grad_norm": 0.18542420864105225, + "learning_rate": 0.0001541315278895748, + "loss": 1.9831, + "step": 288710 + }, + { + "epoch": 1.098939579638102, + "grad_norm": 0.20795604586601257, + "learning_rate": 0.00015407129829014382, + "loss": 1.9894, + "step": 288720 + }, + { + "epoch": 1.0989776421062247, + "grad_norm": 0.1624477505683899, + "learning_rate": 0.00015401107917545492, + "loss": 2.024, + "step": 288730 + }, + { + "epoch": 1.0990157045743474, + "grad_norm": 0.1427524834871292, + "learning_rate": 0.00015395087054003448, + "loss": 2.0004, + "step": 288740 + }, + { + "epoch": 1.09905376704247, + "grad_norm": 0.16401228308677673, + "learning_rate": 0.00015389067237841353, + "loss": 2.0047, + "step": 288750 + }, + { + "epoch": 1.0990918295105927, + "grad_norm": 0.18014630675315857, + "learning_rate": 0.00015383048468512812, + "loss": 2.0046, + "step": 288760 + }, + { + "epoch": 1.0991298919787154, + "grad_norm": 0.22702322900295258, + "learning_rate": 0.00015377030745471876, + "loss": 1.998, + "step": 288770 + }, + { + "epoch": 1.0991679544468382, + "grad_norm": 0.17550452053546906, + "learning_rate": 0.00015371014068173077, + "loss": 1.9982, + "step": 288780 + }, + { + "epoch": 1.099206016914961, + "grad_norm": 0.19119413197040558, + "learning_rate": 0.00015364998436071447, + "loss": 1.997, + "step": 288790 + }, + { + "epoch": 1.0992440793830835, + "grad_norm": 0.21022382378578186, + "learning_rate": 0.00015358983848622455, + "loss": 1.9926, + "step": 288800 + }, + { + "epoch": 1.0992821418512062, + "grad_norm": 0.14497125148773193, + "learning_rate": 0.0001535297030528206, + "loss": 1.9891, + "step": 288810 + }, + { + "epoch": 1.0993202043193289, + "grad_norm": 0.14831840991973877, + "learning_rate": 0.00015346957805506694, + "loss": 2.0051, + "step": 288820 + }, + { + "epoch": 1.0993582667874515, + "grad_norm": 0.14361897110939026, + "learning_rate": 0.0001534094634875326, + "loss": 1.993, + "step": 288830 + }, + { + "epoch": 1.0993963292555742, + "grad_norm": 0.1500166654586792, + "learning_rate": 0.00015334935934479123, + "loss": 2.0049, + "step": 288840 + }, + { + "epoch": 1.0994343917236968, + "grad_norm": 0.17549310624599457, + "learning_rate": 0.00015328926562142132, + "loss": 1.9883, + "step": 288850 + }, + { + "epoch": 1.0994724541918197, + "grad_norm": 0.15605853497982025, + "learning_rate": 0.00015322918231200594, + "loss": 2.0059, + "step": 288860 + }, + { + "epoch": 1.0995105166599424, + "grad_norm": 0.13666051626205444, + "learning_rate": 0.00015316910941113294, + "loss": 1.9986, + "step": 288870 + }, + { + "epoch": 1.099548579128065, + "grad_norm": 0.14910241961479187, + "learning_rate": 0.0001531090469133948, + "loss": 2.0049, + "step": 288880 + }, + { + "epoch": 1.0995866415961877, + "grad_norm": 0.15947653353214264, + "learning_rate": 0.00015304899481338868, + "loss": 1.9857, + "step": 288890 + }, + { + "epoch": 1.0996247040643103, + "grad_norm": 0.1871788054704666, + "learning_rate": 0.00015298895310571643, + "loss": 1.9886, + "step": 288900 + }, + { + "epoch": 1.099662766532433, + "grad_norm": 0.18131105601787567, + "learning_rate": 0.00015292892178498464, + "loss": 1.9964, + "step": 288910 + }, + { + "epoch": 1.0997008290005557, + "grad_norm": 0.2423751950263977, + "learning_rate": 0.00015286890084580435, + "loss": 2.0039, + "step": 288920 + }, + { + "epoch": 1.0997388914686783, + "grad_norm": 0.15106576681137085, + "learning_rate": 0.00015280889028279154, + "loss": 2.0011, + "step": 288930 + }, + { + "epoch": 1.099776953936801, + "grad_norm": 0.17737062275409698, + "learning_rate": 0.0001527488900905667, + "loss": 1.9977, + "step": 288940 + }, + { + "epoch": 1.0998150164049239, + "grad_norm": 0.1560988426208496, + "learning_rate": 0.00015268890026375487, + "loss": 1.9988, + "step": 288950 + }, + { + "epoch": 1.0998530788730465, + "grad_norm": 0.15913379192352295, + "learning_rate": 0.00015262892079698592, + "loss": 2.0013, + "step": 288960 + }, + { + "epoch": 1.0998911413411692, + "grad_norm": 0.1441878080368042, + "learning_rate": 0.00015256895168489425, + "loss": 2.0045, + "step": 288970 + }, + { + "epoch": 1.0999292038092918, + "grad_norm": 0.1639913022518158, + "learning_rate": 0.00015250899292211894, + "loss": 2.0125, + "step": 288980 + }, + { + "epoch": 1.0999672662774145, + "grad_norm": 0.1896839141845703, + "learning_rate": 0.00015244904450330355, + "loss": 1.9886, + "step": 288990 + }, + { + "epoch": 1.1000053287455371, + "grad_norm": 0.20792101323604584, + "learning_rate": 0.0001523891064230965, + "loss": 2.0052, + "step": 289000 + }, + { + "epoch": 1.1000433912136598, + "grad_norm": 0.22266975045204163, + "learning_rate": 0.00015232917867615066, + "loss": 1.9991, + "step": 289010 + }, + { + "epoch": 1.1000814536817825, + "grad_norm": 0.19801975786685944, + "learning_rate": 0.00015226926125712343, + "loss": 2.0012, + "step": 289020 + }, + { + "epoch": 1.1001195161499053, + "grad_norm": 0.20056535303592682, + "learning_rate": 0.0001522093541606771, + "loss": 2.0076, + "step": 289030 + }, + { + "epoch": 1.100157578618028, + "grad_norm": 0.17718303203582764, + "learning_rate": 0.00015214945738147828, + "loss": 2.0151, + "step": 289040 + }, + { + "epoch": 1.1001956410861506, + "grad_norm": 0.15518245100975037, + "learning_rate": 0.0001520895709141983, + "loss": 2.0094, + "step": 289050 + }, + { + "epoch": 1.1002337035542733, + "grad_norm": 0.16333739459514618, + "learning_rate": 0.00015202969475351302, + "loss": 1.9887, + "step": 289060 + }, + { + "epoch": 1.100271766022396, + "grad_norm": 0.1541435420513153, + "learning_rate": 0.00015196982889410294, + "loss": 1.9986, + "step": 289070 + }, + { + "epoch": 1.1003098284905186, + "grad_norm": 0.15141764283180237, + "learning_rate": 0.00015190997333065304, + "loss": 1.9973, + "step": 289080 + }, + { + "epoch": 1.1003478909586413, + "grad_norm": 0.1551426500082016, + "learning_rate": 0.00015185012805785304, + "loss": 2.0003, + "step": 289090 + }, + { + "epoch": 1.100385953426764, + "grad_norm": 0.15018625557422638, + "learning_rate": 0.00015179029307039703, + "loss": 1.9999, + "step": 289100 + }, + { + "epoch": 1.1004240158948866, + "grad_norm": 0.16040506958961487, + "learning_rate": 0.00015173046836298377, + "loss": 1.9833, + "step": 289110 + }, + { + "epoch": 1.1004620783630095, + "grad_norm": 0.14544560015201569, + "learning_rate": 0.0001516706539303165, + "loss": 1.9885, + "step": 289120 + }, + { + "epoch": 1.1005001408311321, + "grad_norm": 0.15642940998077393, + "learning_rate": 0.00015161084976710309, + "loss": 2.0103, + "step": 289130 + }, + { + "epoch": 1.1005382032992548, + "grad_norm": 0.14502054452896118, + "learning_rate": 0.00015155105586805596, + "loss": 1.9831, + "step": 289140 + }, + { + "epoch": 1.1005762657673774, + "grad_norm": 0.14879922568798065, + "learning_rate": 0.00015149127222789194, + "loss": 1.9798, + "step": 289150 + }, + { + "epoch": 1.1006143282355, + "grad_norm": 0.15120501816272736, + "learning_rate": 0.0001514314988413325, + "loss": 2.0017, + "step": 289160 + }, + { + "epoch": 1.1006523907036228, + "grad_norm": 0.15092013776302338, + "learning_rate": 0.00015137173570310358, + "loss": 1.9922, + "step": 289170 + }, + { + "epoch": 1.1006904531717454, + "grad_norm": 0.18918323516845703, + "learning_rate": 0.00015131198280793568, + "loss": 1.989, + "step": 289180 + }, + { + "epoch": 1.100728515639868, + "grad_norm": 0.15814675390720367, + "learning_rate": 0.00015125224015056382, + "loss": 1.9879, + "step": 289190 + }, + { + "epoch": 1.100766578107991, + "grad_norm": 0.19635015726089478, + "learning_rate": 0.0001511925077257275, + "loss": 1.9881, + "step": 289200 + }, + { + "epoch": 1.1008046405761136, + "grad_norm": 0.22311274707317352, + "learning_rate": 0.00015113278552817072, + "loss": 2.003, + "step": 289210 + }, + { + "epoch": 1.1008427030442363, + "grad_norm": 0.1627051681280136, + "learning_rate": 0.000151073073552642, + "loss": 1.9937, + "step": 289220 + }, + { + "epoch": 1.100880765512359, + "grad_norm": 0.1458701491355896, + "learning_rate": 0.00015101337179389428, + "loss": 2.0009, + "step": 289230 + }, + { + "epoch": 1.1009188279804816, + "grad_norm": 0.24090033769607544, + "learning_rate": 0.0001509536802466851, + "loss": 1.995, + "step": 289240 + }, + { + "epoch": 1.1009568904486042, + "grad_norm": 0.1879594475030899, + "learning_rate": 0.0001508939989057765, + "loss": 1.9979, + "step": 289250 + }, + { + "epoch": 1.1009949529167269, + "grad_norm": 0.1629105806350708, + "learning_rate": 0.00015083432776593475, + "loss": 1.9971, + "step": 289260 + }, + { + "epoch": 1.1010330153848495, + "grad_norm": 0.18328267335891724, + "learning_rate": 0.00015077466682193098, + "loss": 1.9839, + "step": 289270 + }, + { + "epoch": 1.1010710778529722, + "grad_norm": 0.16340726613998413, + "learning_rate": 0.00015071501606854037, + "loss": 1.9913, + "step": 289280 + }, + { + "epoch": 1.101109140321095, + "grad_norm": 0.15893971920013428, + "learning_rate": 0.00015065537550054293, + "loss": 2.0083, + "step": 289290 + }, + { + "epoch": 1.1011472027892177, + "grad_norm": 0.16307507455348969, + "learning_rate": 0.00015059574511272283, + "loss": 1.9925, + "step": 289300 + }, + { + "epoch": 1.1011852652573404, + "grad_norm": 0.20666718482971191, + "learning_rate": 0.00015053612489986895, + "loss": 2.0027, + "step": 289310 + }, + { + "epoch": 1.101223327725463, + "grad_norm": 0.1942300647497177, + "learning_rate": 0.00015047651485677437, + "loss": 2.0019, + "step": 289320 + }, + { + "epoch": 1.1012613901935857, + "grad_norm": 0.153672456741333, + "learning_rate": 0.0001504169149782368, + "loss": 2.0058, + "step": 289330 + }, + { + "epoch": 1.1012994526617084, + "grad_norm": 0.15216460824012756, + "learning_rate": 0.00015035732525905826, + "loss": 1.9905, + "step": 289340 + }, + { + "epoch": 1.101337515129831, + "grad_norm": 0.18357402086257935, + "learning_rate": 0.0001502977456940452, + "loss": 1.997, + "step": 289350 + }, + { + "epoch": 1.1013755775979537, + "grad_norm": 0.16224220395088196, + "learning_rate": 0.00015023817627800868, + "loss": 2.0048, + "step": 289360 + }, + { + "epoch": 1.1014136400660766, + "grad_norm": 0.15068677067756653, + "learning_rate": 0.00015017861700576392, + "loss": 1.9926, + "step": 289370 + }, + { + "epoch": 1.1014517025341992, + "grad_norm": 0.15177755057811737, + "learning_rate": 0.00015011906787213076, + "loss": 1.9836, + "step": 289380 + }, + { + "epoch": 1.1014897650023219, + "grad_norm": 0.15786100924015045, + "learning_rate": 0.00015005952887193325, + "loss": 2.0062, + "step": 289390 + }, + { + "epoch": 1.1015278274704445, + "grad_norm": 0.15467971563339233, + "learning_rate": 0.00015000000000000001, + "loss": 2.0015, + "step": 289400 + }, + { + "epoch": 1.1015658899385672, + "grad_norm": 0.14474506676197052, + "learning_rate": 0.000149940481251164, + "loss": 1.9909, + "step": 289410 + }, + { + "epoch": 1.1016039524066898, + "grad_norm": 0.151639923453331, + "learning_rate": 0.00014988097262026256, + "loss": 1.9939, + "step": 289420 + }, + { + "epoch": 1.1016420148748125, + "grad_norm": 0.17876717448234558, + "learning_rate": 0.00014982147410213742, + "loss": 2.0102, + "step": 289430 + }, + { + "epoch": 1.1016800773429352, + "grad_norm": 0.1566760092973709, + "learning_rate": 0.00014976198569163475, + "loss": 2.0064, + "step": 289440 + }, + { + "epoch": 1.1017181398110578, + "grad_norm": 0.15046700835227966, + "learning_rate": 0.00014970250738360498, + "loss": 1.9942, + "step": 289450 + }, + { + "epoch": 1.1017562022791805, + "grad_norm": 0.15465828776359558, + "learning_rate": 0.00014964303917290302, + "loss": 1.9938, + "step": 289460 + }, + { + "epoch": 1.1017942647473034, + "grad_norm": 0.1620795875787735, + "learning_rate": 0.00014958358105438802, + "loss": 1.9864, + "step": 289470 + }, + { + "epoch": 1.101832327215426, + "grad_norm": 0.2129961997270584, + "learning_rate": 0.00014952413302292362, + "loss": 2.0068, + "step": 289480 + }, + { + "epoch": 1.1018703896835487, + "grad_norm": 0.171329528093338, + "learning_rate": 0.0001494646950733778, + "loss": 2.0017, + "step": 289490 + }, + { + "epoch": 1.1019084521516713, + "grad_norm": 0.17650112509727478, + "learning_rate": 0.00014940526720062287, + "loss": 1.9973, + "step": 289500 + }, + { + "epoch": 1.101946514619794, + "grad_norm": 0.22053077816963196, + "learning_rate": 0.00014934584939953538, + "loss": 1.985, + "step": 289510 + }, + { + "epoch": 1.1019845770879166, + "grad_norm": 0.1486891359090805, + "learning_rate": 0.0001492864416649964, + "loss": 1.9996, + "step": 289520 + }, + { + "epoch": 1.1020226395560393, + "grad_norm": 0.14435265958309174, + "learning_rate": 0.0001492270439918912, + "loss": 1.9937, + "step": 289530 + }, + { + "epoch": 1.102060702024162, + "grad_norm": 0.1580554097890854, + "learning_rate": 0.00014916765637510937, + "loss": 2.0134, + "step": 289540 + }, + { + "epoch": 1.1020987644922848, + "grad_norm": 0.16247224807739258, + "learning_rate": 0.00014910827880954504, + "loss": 1.9933, + "step": 289550 + }, + { + "epoch": 1.1021368269604075, + "grad_norm": 0.19161753356456757, + "learning_rate": 0.00014904891129009635, + "loss": 2.0013, + "step": 289560 + }, + { + "epoch": 1.1021748894285301, + "grad_norm": 0.17701232433319092, + "learning_rate": 0.00014898955381166597, + "loss": 1.9985, + "step": 289570 + }, + { + "epoch": 1.1022129518966528, + "grad_norm": 0.14210069179534912, + "learning_rate": 0.00014893020636916083, + "loss": 1.9887, + "step": 289580 + }, + { + "epoch": 1.1022510143647755, + "grad_norm": 0.14342361688613892, + "learning_rate": 0.00014887086895749215, + "loss": 1.9924, + "step": 289590 + }, + { + "epoch": 1.1022890768328981, + "grad_norm": 0.22423794865608215, + "learning_rate": 0.0001488115415715754, + "loss": 2.0036, + "step": 289600 + }, + { + "epoch": 1.1023271393010208, + "grad_norm": 0.1794877052307129, + "learning_rate": 0.00014875222420633038, + "loss": 2.0018, + "step": 289610 + }, + { + "epoch": 1.1023652017691434, + "grad_norm": 0.15107597410678864, + "learning_rate": 0.00014869291685668128, + "loss": 2.0049, + "step": 289620 + }, + { + "epoch": 1.102403264237266, + "grad_norm": 0.15473225712776184, + "learning_rate": 0.00014863361951755638, + "loss": 1.9945, + "step": 289630 + }, + { + "epoch": 1.102441326705389, + "grad_norm": 0.16924192011356354, + "learning_rate": 0.00014857433218388845, + "loss": 1.991, + "step": 289640 + }, + { + "epoch": 1.1024793891735116, + "grad_norm": 0.16266871988773346, + "learning_rate": 0.00014851505485061434, + "loss": 1.9842, + "step": 289650 + }, + { + "epoch": 1.1025174516416343, + "grad_norm": 0.1542142778635025, + "learning_rate": 0.00014845578751267524, + "loss": 1.9933, + "step": 289660 + }, + { + "epoch": 1.102555514109757, + "grad_norm": 0.16522587835788727, + "learning_rate": 0.0001483965301650167, + "loss": 1.9997, + "step": 289670 + }, + { + "epoch": 1.1025935765778796, + "grad_norm": 0.14744262397289276, + "learning_rate": 0.00014833728280258846, + "loss": 1.9997, + "step": 289680 + }, + { + "epoch": 1.1026316390460023, + "grad_norm": 0.1941128671169281, + "learning_rate": 0.0001482780454203444, + "loss": 1.9926, + "step": 289690 + }, + { + "epoch": 1.102669701514125, + "grad_norm": 0.19153951108455658, + "learning_rate": 0.00014821881801324276, + "loss": 2.0, + "step": 289700 + }, + { + "epoch": 1.1027077639822476, + "grad_norm": 0.18494223058223724, + "learning_rate": 0.0001481596005762461, + "loss": 2.0099, + "step": 289710 + }, + { + "epoch": 1.1027458264503704, + "grad_norm": 0.17378918826580048, + "learning_rate": 0.00014810039310432112, + "loss": 2.0009, + "step": 289720 + }, + { + "epoch": 1.102783888918493, + "grad_norm": 0.1473301351070404, + "learning_rate": 0.00014804119559243866, + "loss": 2.0015, + "step": 289730 + }, + { + "epoch": 1.1028219513866158, + "grad_norm": 0.1474497765302658, + "learning_rate": 0.00014798200803557406, + "loss": 1.9901, + "step": 289740 + }, + { + "epoch": 1.1028600138547384, + "grad_norm": 0.15276385843753815, + "learning_rate": 0.00014792283042870652, + "loss": 2.0014, + "step": 289750 + }, + { + "epoch": 1.102898076322861, + "grad_norm": 0.14474351704120636, + "learning_rate": 0.00014786366276681984, + "loss": 1.9905, + "step": 289760 + }, + { + "epoch": 1.1029361387909837, + "grad_norm": 0.16547620296478271, + "learning_rate": 0.00014780450504490173, + "loss": 2.0099, + "step": 289770 + }, + { + "epoch": 1.1029742012591064, + "grad_norm": 0.16784998774528503, + "learning_rate": 0.00014774535725794425, + "loss": 1.9933, + "step": 289780 + }, + { + "epoch": 1.103012263727229, + "grad_norm": 0.14733049273490906, + "learning_rate": 0.00014768621940094368, + "loss": 1.9994, + "step": 289790 + }, + { + "epoch": 1.1030503261953517, + "grad_norm": 0.1614081859588623, + "learning_rate": 0.0001476270914689004, + "loss": 2.0061, + "step": 289800 + }, + { + "epoch": 1.1030883886634746, + "grad_norm": 0.19870488345623016, + "learning_rate": 0.00014756797345681917, + "loss": 1.9968, + "step": 289810 + }, + { + "epoch": 1.1031264511315972, + "grad_norm": 0.16760575771331787, + "learning_rate": 0.00014750886535970864, + "loss": 1.9875, + "step": 289820 + }, + { + "epoch": 1.10316451359972, + "grad_norm": 0.15084044635295868, + "learning_rate": 0.00014744976717258196, + "loss": 2.0, + "step": 289830 + }, + { + "epoch": 1.1032025760678426, + "grad_norm": 0.14579305052757263, + "learning_rate": 0.00014739067889045626, + "loss": 1.9929, + "step": 289840 + }, + { + "epoch": 1.1032406385359652, + "grad_norm": 0.15969346463680267, + "learning_rate": 0.00014733160050835287, + "loss": 1.992, + "step": 289850 + }, + { + "epoch": 1.1032787010040879, + "grad_norm": 0.17312924563884735, + "learning_rate": 0.00014727253202129743, + "loss": 1.9932, + "step": 289860 + }, + { + "epoch": 1.1033167634722105, + "grad_norm": 0.1722601354122162, + "learning_rate": 0.0001472134734243195, + "loss": 1.9965, + "step": 289870 + }, + { + "epoch": 1.1033548259403332, + "grad_norm": 0.1556127369403839, + "learning_rate": 0.00014715442471245299, + "loss": 1.9975, + "step": 289880 + }, + { + "epoch": 1.103392888408456, + "grad_norm": 0.1654588282108307, + "learning_rate": 0.00014709538588073595, + "loss": 1.9947, + "step": 289890 + }, + { + "epoch": 1.1034309508765787, + "grad_norm": 0.1517258733510971, + "learning_rate": 0.00014703635692421048, + "loss": 1.9943, + "step": 289900 + }, + { + "epoch": 1.1034690133447014, + "grad_norm": 0.14629270136356354, + "learning_rate": 0.00014697733783792293, + "loss": 2.0045, + "step": 289910 + }, + { + "epoch": 1.103507075812824, + "grad_norm": 0.16628184914588928, + "learning_rate": 0.00014691832861692372, + "loss": 1.9975, + "step": 289920 + }, + { + "epoch": 1.1035451382809467, + "grad_norm": 0.163875013589859, + "learning_rate": 0.00014685932925626743, + "loss": 1.9841, + "step": 289930 + }, + { + "epoch": 1.1035832007490693, + "grad_norm": 0.164508655667305, + "learning_rate": 0.00014680033975101282, + "loss": 1.9806, + "step": 289940 + }, + { + "epoch": 1.103621263217192, + "grad_norm": 0.17021775245666504, + "learning_rate": 0.00014674136009622263, + "loss": 1.9932, + "step": 289950 + }, + { + "epoch": 1.1036593256853147, + "grad_norm": 0.15839537978172302, + "learning_rate": 0.00014668239028696383, + "loss": 2.0106, + "step": 289960 + }, + { + "epoch": 1.1036973881534373, + "grad_norm": 0.1492561548948288, + "learning_rate": 0.0001466234303183076, + "loss": 1.9936, + "step": 289970 + }, + { + "epoch": 1.1037354506215602, + "grad_norm": 0.17014679312705994, + "learning_rate": 0.00014656448018532904, + "loss": 1.9998, + "step": 289980 + }, + { + "epoch": 1.1037735130896829, + "grad_norm": 0.15628159046173096, + "learning_rate": 0.00014650553988310744, + "loss": 2.0081, + "step": 289990 + }, + { + "epoch": 1.1038115755578055, + "grad_norm": 0.16518567502498627, + "learning_rate": 0.00014644660940672623, + "loss": 2.0011, + "step": 290000 + }, + { + "epoch": 1.1038496380259282, + "grad_norm": 0.15008623898029327, + "learning_rate": 0.00014638768875127289, + "loss": 1.986, + "step": 290010 + }, + { + "epoch": 1.1038877004940508, + "grad_norm": 0.2048100084066391, + "learning_rate": 0.00014632877791183896, + "loss": 2.005, + "step": 290020 + }, + { + "epoch": 1.1039257629621735, + "grad_norm": 0.19177468121051788, + "learning_rate": 0.00014626987688352017, + "loss": 1.989, + "step": 290030 + }, + { + "epoch": 1.1039638254302961, + "grad_norm": 0.16694004833698273, + "learning_rate": 0.00014621098566141617, + "loss": 1.9882, + "step": 290040 + }, + { + "epoch": 1.1040018878984188, + "grad_norm": 0.22580422461032867, + "learning_rate": 0.00014615210424063092, + "loss": 2.0191, + "step": 290050 + }, + { + "epoch": 1.1040399503665417, + "grad_norm": 0.16023632884025574, + "learning_rate": 0.00014609323261627222, + "loss": 1.9965, + "step": 290060 + }, + { + "epoch": 1.1040780128346643, + "grad_norm": 0.20892436802387238, + "learning_rate": 0.0001460343707834521, + "loss": 1.9882, + "step": 290070 + }, + { + "epoch": 1.104116075302787, + "grad_norm": 0.18539145588874817, + "learning_rate": 0.0001459755187372866, + "loss": 1.9884, + "step": 290080 + }, + { + "epoch": 1.1041541377709096, + "grad_norm": 0.1476847231388092, + "learning_rate": 0.00014591667647289574, + "loss": 1.9922, + "step": 290090 + }, + { + "epoch": 1.1041922002390323, + "grad_norm": 0.17845043540000916, + "learning_rate": 0.0001458578439854037, + "loss": 1.9937, + "step": 290100 + }, + { + "epoch": 1.104230262707155, + "grad_norm": 0.15550413727760315, + "learning_rate": 0.00014579902126993872, + "loss": 2.018, + "step": 290110 + }, + { + "epoch": 1.1042683251752776, + "grad_norm": 0.16928768157958984, + "learning_rate": 0.00014574020832163297, + "loss": 2.0076, + "step": 290120 + }, + { + "epoch": 1.1043063876434003, + "grad_norm": 0.17358897626399994, + "learning_rate": 0.0001456814051356228, + "loss": 2.0049, + "step": 290130 + }, + { + "epoch": 1.104344450111523, + "grad_norm": 0.18268205225467682, + "learning_rate": 0.0001456226117070485, + "loss": 1.9806, + "step": 290140 + }, + { + "epoch": 1.1043825125796458, + "grad_norm": 0.18139834702014923, + "learning_rate": 0.0001455638280310544, + "loss": 1.9939, + "step": 290150 + }, + { + "epoch": 1.1044205750477685, + "grad_norm": 0.18911303579807281, + "learning_rate": 0.0001455050541027888, + "loss": 1.9914, + "step": 290160 + }, + { + "epoch": 1.1044586375158911, + "grad_norm": 0.20248942077159882, + "learning_rate": 0.00014544628991740427, + "loss": 2.0105, + "step": 290170 + }, + { + "epoch": 1.1044966999840138, + "grad_norm": 0.26870208978652954, + "learning_rate": 0.00014538753547005712, + "loss": 1.9981, + "step": 290180 + }, + { + "epoch": 1.1045347624521364, + "grad_norm": 0.1862451285123825, + "learning_rate": 0.00014532879075590776, + "loss": 2.0049, + "step": 290190 + }, + { + "epoch": 1.104572824920259, + "grad_norm": 0.17011886835098267, + "learning_rate": 0.00014527005577012064, + "loss": 1.9946, + "step": 290200 + }, + { + "epoch": 1.1046108873883818, + "grad_norm": 0.19458185136318207, + "learning_rate": 0.00014521133050786418, + "loss": 1.9968, + "step": 290210 + }, + { + "epoch": 1.1046489498565044, + "grad_norm": 0.19219279289245605, + "learning_rate": 0.00014515261496431077, + "loss": 1.9908, + "step": 290220 + }, + { + "epoch": 1.1046870123246273, + "grad_norm": 0.1879696100950241, + "learning_rate": 0.00014509390913463693, + "loss": 1.9734, + "step": 290230 + }, + { + "epoch": 1.10472507479275, + "grad_norm": 0.1549655795097351, + "learning_rate": 0.00014503521301402307, + "loss": 2.0045, + "step": 290240 + }, + { + "epoch": 1.1047631372608726, + "grad_norm": 0.19210083782672882, + "learning_rate": 0.00014497652659765348, + "loss": 2.0109, + "step": 290250 + }, + { + "epoch": 1.1048011997289953, + "grad_norm": 0.1508699655532837, + "learning_rate": 0.0001449178498807166, + "loss": 2.0096, + "step": 290260 + }, + { + "epoch": 1.104839262197118, + "grad_norm": 0.14681914448738098, + "learning_rate": 0.00014485918285840477, + "loss": 1.9985, + "step": 290270 + }, + { + "epoch": 1.1048773246652406, + "grad_norm": 0.16370920836925507, + "learning_rate": 0.00014480052552591429, + "loss": 1.9955, + "step": 290280 + }, + { + "epoch": 1.1049153871333632, + "grad_norm": 0.18862545490264893, + "learning_rate": 0.00014474187787844552, + "loss": 1.9966, + "step": 290290 + }, + { + "epoch": 1.104953449601486, + "grad_norm": 0.1513679325580597, + "learning_rate": 0.00014468323991120264, + "loss": 1.9865, + "step": 290300 + }, + { + "epoch": 1.1049915120696086, + "grad_norm": 0.15902507305145264, + "learning_rate": 0.00014462461161939382, + "loss": 1.9905, + "step": 290310 + }, + { + "epoch": 1.1050295745377312, + "grad_norm": 0.14775541424751282, + "learning_rate": 0.00014456599299823137, + "loss": 2.0034, + "step": 290320 + }, + { + "epoch": 1.105067637005854, + "grad_norm": 0.18092699348926544, + "learning_rate": 0.00014450738404293124, + "loss": 2.0011, + "step": 290330 + }, + { + "epoch": 1.1051056994739767, + "grad_norm": 0.14086657762527466, + "learning_rate": 0.00014444878474871348, + "loss": 1.9985, + "step": 290340 + }, + { + "epoch": 1.1051437619420994, + "grad_norm": 0.19705578684806824, + "learning_rate": 0.0001443901951108022, + "loss": 1.9896, + "step": 290350 + }, + { + "epoch": 1.105181824410222, + "grad_norm": 0.1683788001537323, + "learning_rate": 0.00014433161512442523, + "loss": 1.9888, + "step": 290360 + }, + { + "epoch": 1.1052198868783447, + "grad_norm": 0.17867405712604523, + "learning_rate": 0.0001442730447848144, + "loss": 1.997, + "step": 290370 + }, + { + "epoch": 1.1052579493464674, + "grad_norm": 0.1499549299478531, + "learning_rate": 0.00014421448408720555, + "loss": 1.9974, + "step": 290380 + }, + { + "epoch": 1.10529601181459, + "grad_norm": 0.1693742722272873, + "learning_rate": 0.00014415593302683833, + "loss": 1.9997, + "step": 290390 + }, + { + "epoch": 1.1053340742827127, + "grad_norm": 0.14913637936115265, + "learning_rate": 0.0001440973915989563, + "loss": 1.9935, + "step": 290400 + }, + { + "epoch": 1.1053721367508356, + "grad_norm": 0.15323811769485474, + "learning_rate": 0.00014403885979880705, + "loss": 1.9955, + "step": 290410 + }, + { + "epoch": 1.1054101992189582, + "grad_norm": 0.1560705304145813, + "learning_rate": 0.00014398033762164203, + "loss": 1.987, + "step": 290420 + }, + { + "epoch": 1.1054482616870809, + "grad_norm": 0.22040893137454987, + "learning_rate": 0.0001439218250627165, + "loss": 1.9997, + "step": 290430 + }, + { + "epoch": 1.1054863241552035, + "grad_norm": 0.2085459977388382, + "learning_rate": 0.00014386332211728975, + "loss": 1.9904, + "step": 290440 + }, + { + "epoch": 1.1055243866233262, + "grad_norm": 0.1620468944311142, + "learning_rate": 0.00014380482878062484, + "loss": 2.003, + "step": 290450 + }, + { + "epoch": 1.1055624490914489, + "grad_norm": 0.1452278196811676, + "learning_rate": 0.0001437463450479888, + "loss": 1.9898, + "step": 290460 + }, + { + "epoch": 1.1056005115595715, + "grad_norm": 0.15971092879772186, + "learning_rate": 0.00014368787091465252, + "loss": 1.9945, + "step": 290470 + }, + { + "epoch": 1.1056385740276942, + "grad_norm": 0.15089818835258484, + "learning_rate": 0.00014362940637589077, + "loss": 2.0007, + "step": 290480 + }, + { + "epoch": 1.1056766364958168, + "grad_norm": 0.15188033878803253, + "learning_rate": 0.00014357095142698224, + "loss": 1.9957, + "step": 290490 + }, + { + "epoch": 1.1057146989639397, + "grad_norm": 0.18442633748054504, + "learning_rate": 0.00014351250606320937, + "loss": 1.985, + "step": 290500 + }, + { + "epoch": 1.1057527614320624, + "grad_norm": 0.15648066997528076, + "learning_rate": 0.00014345407027985862, + "loss": 1.9846, + "step": 290510 + }, + { + "epoch": 1.105790823900185, + "grad_norm": 0.15993808209896088, + "learning_rate": 0.00014339564407222016, + "loss": 1.9937, + "step": 290520 + }, + { + "epoch": 1.1058288863683077, + "grad_norm": 0.1606651097536087, + "learning_rate": 0.00014333722743558818, + "loss": 2.0042, + "step": 290530 + }, + { + "epoch": 1.1058669488364303, + "grad_norm": 0.1600220501422882, + "learning_rate": 0.00014327882036526063, + "loss": 1.9934, + "step": 290540 + }, + { + "epoch": 1.105905011304553, + "grad_norm": 0.20765568315982819, + "learning_rate": 0.00014322042285653925, + "loss": 1.9812, + "step": 290550 + }, + { + "epoch": 1.1059430737726756, + "grad_norm": 0.15059348940849304, + "learning_rate": 0.00014316203490472972, + "loss": 1.9896, + "step": 290560 + }, + { + "epoch": 1.1059811362407983, + "grad_norm": 0.20932096242904663, + "learning_rate": 0.00014310365650514157, + "loss": 2.0081, + "step": 290570 + }, + { + "epoch": 1.1060191987089212, + "grad_norm": 0.19905489683151245, + "learning_rate": 0.000143045287653088, + "loss": 1.9983, + "step": 290580 + }, + { + "epoch": 1.1060572611770438, + "grad_norm": 0.1467469334602356, + "learning_rate": 0.00014298692834388639, + "loss": 1.9974, + "step": 290590 + }, + { + "epoch": 1.1060953236451665, + "grad_norm": 0.14193718135356903, + "learning_rate": 0.00014292857857285752, + "loss": 2.0026, + "step": 290600 + }, + { + "epoch": 1.1061333861132892, + "grad_norm": 0.1589806228876114, + "learning_rate": 0.00014287023833532624, + "loss": 1.9906, + "step": 290610 + }, + { + "epoch": 1.1061714485814118, + "grad_norm": 0.15120485424995422, + "learning_rate": 0.00014281190762662128, + "loss": 1.9992, + "step": 290620 + }, + { + "epoch": 1.1062095110495345, + "grad_norm": 0.15628735721111298, + "learning_rate": 0.000142753586442075, + "loss": 1.9936, + "step": 290630 + }, + { + "epoch": 1.1062475735176571, + "grad_norm": 0.1633949875831604, + "learning_rate": 0.0001426952747770236, + "loss": 1.9897, + "step": 290640 + }, + { + "epoch": 1.1062856359857798, + "grad_norm": 0.1764533966779709, + "learning_rate": 0.00014263697262680719, + "loss": 1.9891, + "step": 290650 + }, + { + "epoch": 1.1063236984539024, + "grad_norm": 0.17311392724514008, + "learning_rate": 0.00014257867998676967, + "loss": 2.0095, + "step": 290660 + }, + { + "epoch": 1.1063617609220253, + "grad_norm": 0.20260465145111084, + "learning_rate": 0.00014252039685225864, + "loss": 2.0225, + "step": 290670 + }, + { + "epoch": 1.106399823390148, + "grad_norm": 0.20680083334445953, + "learning_rate": 0.0001424621232186255, + "loss": 2.0022, + "step": 290680 + }, + { + "epoch": 1.1064378858582706, + "grad_norm": 0.16034738719463348, + "learning_rate": 0.00014240385908122555, + "loss": 1.9941, + "step": 290690 + }, + { + "epoch": 1.1064759483263933, + "grad_norm": 0.1378408670425415, + "learning_rate": 0.00014234560443541773, + "loss": 1.995, + "step": 290700 + }, + { + "epoch": 1.106514010794516, + "grad_norm": 0.18012098968029022, + "learning_rate": 0.00014228735927656493, + "loss": 2.003, + "step": 290710 + }, + { + "epoch": 1.1065520732626386, + "grad_norm": 0.18704988062381744, + "learning_rate": 0.00014222912360003365, + "loss": 2.0084, + "step": 290720 + }, + { + "epoch": 1.1065901357307613, + "grad_norm": 0.16487978398799896, + "learning_rate": 0.0001421708974011942, + "loss": 1.9931, + "step": 290730 + }, + { + "epoch": 1.106628198198884, + "grad_norm": 0.15397684276103973, + "learning_rate": 0.00014211268067542077, + "loss": 1.9913, + "step": 290740 + }, + { + "epoch": 1.1066662606670068, + "grad_norm": 0.1769288331270218, + "learning_rate": 0.00014205447341809118, + "loss": 2.0106, + "step": 290750 + }, + { + "epoch": 1.1067043231351295, + "grad_norm": 0.1709713190793991, + "learning_rate": 0.000141996275624587, + "loss": 1.9984, + "step": 290760 + }, + { + "epoch": 1.106742385603252, + "grad_norm": 0.15498772263526917, + "learning_rate": 0.00014193808729029368, + "loss": 1.9707, + "step": 290770 + }, + { + "epoch": 1.1067804480713748, + "grad_norm": 0.15889990329742432, + "learning_rate": 0.00014187990841060032, + "loss": 1.9981, + "step": 290780 + }, + { + "epoch": 1.1068185105394974, + "grad_norm": 0.16191764175891876, + "learning_rate": 0.00014182173898089984, + "loss": 2.0066, + "step": 290790 + }, + { + "epoch": 1.10685657300762, + "grad_norm": 0.16238361597061157, + "learning_rate": 0.0001417635789965887, + "loss": 2.01, + "step": 290800 + }, + { + "epoch": 1.1068946354757427, + "grad_norm": 0.19509507715702057, + "learning_rate": 0.00014170542845306743, + "loss": 1.9998, + "step": 290810 + }, + { + "epoch": 1.1069326979438654, + "grad_norm": 0.1603064388036728, + "learning_rate": 0.00014164728734573996, + "loss": 1.9946, + "step": 290820 + }, + { + "epoch": 1.106970760411988, + "grad_norm": 0.18165600299835205, + "learning_rate": 0.00014158915567001417, + "loss": 1.9896, + "step": 290830 + }, + { + "epoch": 1.107008822880111, + "grad_norm": 0.1503620743751526, + "learning_rate": 0.00014153103342130159, + "loss": 2.0022, + "step": 290840 + }, + { + "epoch": 1.1070468853482336, + "grad_norm": 0.14587150514125824, + "learning_rate": 0.00014147292059501742, + "loss": 1.9836, + "step": 290850 + }, + { + "epoch": 1.1070849478163562, + "grad_norm": 0.16784952580928802, + "learning_rate": 0.00014141481718658072, + "loss": 2.0025, + "step": 290860 + }, + { + "epoch": 1.107123010284479, + "grad_norm": 0.14302490651607513, + "learning_rate": 0.00014135672319141407, + "loss": 1.9768, + "step": 290870 + }, + { + "epoch": 1.1071610727526016, + "grad_norm": 0.1688007414340973, + "learning_rate": 0.00014129863860494384, + "loss": 1.9909, + "step": 290880 + }, + { + "epoch": 1.1071991352207242, + "grad_norm": 0.1606607288122177, + "learning_rate": 0.0001412405634226002, + "loss": 1.9986, + "step": 290890 + }, + { + "epoch": 1.1072371976888469, + "grad_norm": 0.18139497935771942, + "learning_rate": 0.0001411824976398169, + "loss": 1.9983, + "step": 290900 + }, + { + "epoch": 1.1072752601569695, + "grad_norm": 0.1669086068868637, + "learning_rate": 0.0001411244412520314, + "loss": 2.016, + "step": 290910 + }, + { + "epoch": 1.1073133226250924, + "grad_norm": 0.17165876924991608, + "learning_rate": 0.00014106639425468488, + "loss": 1.9919, + "step": 290920 + }, + { + "epoch": 1.107351385093215, + "grad_norm": 0.15280088782310486, + "learning_rate": 0.00014100835664322215, + "loss": 1.9889, + "step": 290930 + }, + { + "epoch": 1.1073894475613377, + "grad_norm": 0.14866815507411957, + "learning_rate": 0.00014095032841309174, + "loss": 1.9949, + "step": 290940 + }, + { + "epoch": 1.1074275100294604, + "grad_norm": 0.18499290943145752, + "learning_rate": 0.0001408923095597459, + "loss": 1.9875, + "step": 290950 + }, + { + "epoch": 1.107465572497583, + "grad_norm": 0.18930111825466156, + "learning_rate": 0.00014083430007864057, + "loss": 1.9826, + "step": 290960 + }, + { + "epoch": 1.1075036349657057, + "grad_norm": 0.17787308990955353, + "learning_rate": 0.00014077629996523522, + "loss": 2.002, + "step": 290970 + }, + { + "epoch": 1.1075416974338284, + "grad_norm": 0.13603579998016357, + "learning_rate": 0.00014071830921499306, + "loss": 1.998, + "step": 290980 + }, + { + "epoch": 1.107579759901951, + "grad_norm": 0.1438482403755188, + "learning_rate": 0.00014066032782338102, + "loss": 1.9825, + "step": 290990 + }, + { + "epoch": 1.1076178223700737, + "grad_norm": 0.1974167674779892, + "learning_rate": 0.00014060235578586955, + "loss": 1.9849, + "step": 291000 + }, + { + "epoch": 1.1076558848381965, + "grad_norm": 0.15366169810295105, + "learning_rate": 0.000140544393097933, + "loss": 2.0074, + "step": 291010 + }, + { + "epoch": 1.1076939473063192, + "grad_norm": 0.1550496369600296, + "learning_rate": 0.000140486439755049, + "loss": 1.9768, + "step": 291020 + }, + { + "epoch": 1.1077320097744419, + "grad_norm": 0.15785466134548187, + "learning_rate": 0.00014042849575269918, + "loss": 1.9828, + "step": 291030 + }, + { + "epoch": 1.1077700722425645, + "grad_norm": 0.16849170625209808, + "learning_rate": 0.00014037056108636865, + "loss": 2.0063, + "step": 291040 + }, + { + "epoch": 1.1078081347106872, + "grad_norm": 0.1854594498872757, + "learning_rate": 0.00014031263575154607, + "loss": 1.9903, + "step": 291050 + }, + { + "epoch": 1.1078461971788098, + "grad_norm": 0.14291688799858093, + "learning_rate": 0.00014025471974372382, + "loss": 2.0002, + "step": 291060 + }, + { + "epoch": 1.1078842596469325, + "grad_norm": 0.15868712961673737, + "learning_rate": 0.0001401968130583981, + "loss": 1.989, + "step": 291070 + }, + { + "epoch": 1.1079223221150551, + "grad_norm": 0.1694360077381134, + "learning_rate": 0.00014013891569106835, + "loss": 1.9879, + "step": 291080 + }, + { + "epoch": 1.107960384583178, + "grad_norm": 0.19629625976085663, + "learning_rate": 0.00014008102763723795, + "loss": 2.007, + "step": 291090 + }, + { + "epoch": 1.1079984470513007, + "grad_norm": 0.17719154059886932, + "learning_rate": 0.00014002314889241375, + "loss": 1.9958, + "step": 291100 + }, + { + "epoch": 1.1080365095194233, + "grad_norm": 0.1574486643075943, + "learning_rate": 0.00013996527945210618, + "loss": 1.9925, + "step": 291110 + }, + { + "epoch": 1.108074571987546, + "grad_norm": 0.15465888381004333, + "learning_rate": 0.00013990741931182939, + "loss": 1.9972, + "step": 291120 + }, + { + "epoch": 1.1081126344556687, + "grad_norm": 0.18018567562103271, + "learning_rate": 0.00013984956846710105, + "loss": 2.0014, + "step": 291130 + }, + { + "epoch": 1.1081506969237913, + "grad_norm": 0.17025281488895416, + "learning_rate": 0.0001397917269134425, + "loss": 2.0062, + "step": 291140 + }, + { + "epoch": 1.108188759391914, + "grad_norm": 0.15943843126296997, + "learning_rate": 0.00013973389464637853, + "loss": 1.9928, + "step": 291150 + }, + { + "epoch": 1.1082268218600366, + "grad_norm": 0.1920195072889328, + "learning_rate": 0.00013967607166143775, + "loss": 1.9894, + "step": 291160 + }, + { + "epoch": 1.1082648843281593, + "grad_norm": 0.19052480161190033, + "learning_rate": 0.00013961825795415217, + "loss": 1.9869, + "step": 291170 + }, + { + "epoch": 1.108302946796282, + "grad_norm": 0.2030021846294403, + "learning_rate": 0.00013956045352005742, + "loss": 1.9906, + "step": 291180 + }, + { + "epoch": 1.1083410092644048, + "grad_norm": 0.21491935849189758, + "learning_rate": 0.0001395026583546928, + "loss": 2.0128, + "step": 291190 + }, + { + "epoch": 1.1083790717325275, + "grad_norm": 0.1512957364320755, + "learning_rate": 0.00013944487245360105, + "loss": 1.9947, + "step": 291200 + }, + { + "epoch": 1.1084171342006501, + "grad_norm": 0.18461589515209198, + "learning_rate": 0.00013938709581232862, + "loss": 1.9996, + "step": 291210 + }, + { + "epoch": 1.1084551966687728, + "grad_norm": 0.18098539113998413, + "learning_rate": 0.00013932932842642537, + "loss": 1.9857, + "step": 291220 + }, + { + "epoch": 1.1084932591368954, + "grad_norm": 0.16633349657058716, + "learning_rate": 0.00013927157029144488, + "loss": 1.9976, + "step": 291230 + }, + { + "epoch": 1.108531321605018, + "grad_norm": 0.23348954319953918, + "learning_rate": 0.00013921382140294415, + "loss": 2.0039, + "step": 291240 + }, + { + "epoch": 1.1085693840731408, + "grad_norm": 0.1641732156276703, + "learning_rate": 0.00013915608175648386, + "loss": 1.9816, + "step": 291250 + }, + { + "epoch": 1.1086074465412634, + "grad_norm": 0.20765918493270874, + "learning_rate": 0.00013909835134762823, + "loss": 1.993, + "step": 291260 + }, + { + "epoch": 1.1086455090093863, + "grad_norm": 0.17798057198524475, + "learning_rate": 0.0001390406301719449, + "loss": 2.0044, + "step": 291270 + }, + { + "epoch": 1.108683571477509, + "grad_norm": 0.15671156346797943, + "learning_rate": 0.00013898291822500515, + "loss": 1.9776, + "step": 291280 + }, + { + "epoch": 1.1087216339456316, + "grad_norm": 0.1654648631811142, + "learning_rate": 0.0001389252155023838, + "loss": 1.9782, + "step": 291290 + }, + { + "epoch": 1.1087596964137543, + "grad_norm": 0.15194883942604065, + "learning_rate": 0.0001388675219996592, + "loss": 1.994, + "step": 291300 + }, + { + "epoch": 1.108797758881877, + "grad_norm": 0.17421121895313263, + "learning_rate": 0.00013880983771241313, + "loss": 1.9997, + "step": 291310 + }, + { + "epoch": 1.1088358213499996, + "grad_norm": 0.1441257894039154, + "learning_rate": 0.00013875216263623113, + "loss": 1.9798, + "step": 291320 + }, + { + "epoch": 1.1088738838181222, + "grad_norm": 0.19240529835224152, + "learning_rate": 0.00013869449676670204, + "loss": 1.9993, + "step": 291330 + }, + { + "epoch": 1.108911946286245, + "grad_norm": 0.16603481769561768, + "learning_rate": 0.00013863684009941834, + "loss": 1.9938, + "step": 291340 + }, + { + "epoch": 1.1089500087543676, + "grad_norm": 0.14293710887432098, + "learning_rate": 0.000138579192629976, + "loss": 1.9767, + "step": 291350 + }, + { + "epoch": 1.1089880712224904, + "grad_norm": 0.17526213824748993, + "learning_rate": 0.00013852155435397447, + "loss": 1.9891, + "step": 291360 + }, + { + "epoch": 1.109026133690613, + "grad_norm": 0.19786065816879272, + "learning_rate": 0.00013846392526701662, + "loss": 2.0015, + "step": 291370 + }, + { + "epoch": 1.1090641961587357, + "grad_norm": 0.16378028690814972, + "learning_rate": 0.0001384063053647091, + "loss": 2.0019, + "step": 291380 + }, + { + "epoch": 1.1091022586268584, + "grad_norm": 0.1845664083957672, + "learning_rate": 0.0001383486946426618, + "loss": 1.995, + "step": 291390 + }, + { + "epoch": 1.109140321094981, + "grad_norm": 0.18735288083553314, + "learning_rate": 0.00013829109309648825, + "loss": 1.9832, + "step": 291400 + }, + { + "epoch": 1.1091783835631037, + "grad_norm": 0.15484029054641724, + "learning_rate": 0.0001382335007218054, + "loss": 2.0015, + "step": 291410 + }, + { + "epoch": 1.1092164460312264, + "grad_norm": 0.17933939397335052, + "learning_rate": 0.00013817591751423363, + "loss": 1.9925, + "step": 291420 + }, + { + "epoch": 1.109254508499349, + "grad_norm": 0.17813944816589355, + "learning_rate": 0.00013811834346939695, + "loss": 2.0081, + "step": 291430 + }, + { + "epoch": 1.109292570967472, + "grad_norm": 0.14827589690685272, + "learning_rate": 0.00013806077858292281, + "loss": 1.9839, + "step": 291440 + }, + { + "epoch": 1.1093306334355946, + "grad_norm": 0.1530294567346573, + "learning_rate": 0.00013800322285044213, + "loss": 1.9957, + "step": 291450 + }, + { + "epoch": 1.1093686959037172, + "grad_norm": 0.26225045323371887, + "learning_rate": 0.0001379456762675892, + "loss": 1.9754, + "step": 291460 + }, + { + "epoch": 1.1094067583718399, + "grad_norm": 0.16842412948608398, + "learning_rate": 0.0001378881388300019, + "loss": 1.9809, + "step": 291470 + }, + { + "epoch": 1.1094448208399625, + "grad_norm": 0.16914471983909607, + "learning_rate": 0.00013783061053332152, + "loss": 1.9976, + "step": 291480 + }, + { + "epoch": 1.1094828833080852, + "grad_norm": 0.23515167832374573, + "learning_rate": 0.00013777309137319278, + "loss": 1.9958, + "step": 291490 + }, + { + "epoch": 1.1095209457762079, + "grad_norm": 0.15831919014453888, + "learning_rate": 0.000137715581345264, + "loss": 1.9806, + "step": 291500 + }, + { + "epoch": 1.1095590082443305, + "grad_norm": 0.15088757872581482, + "learning_rate": 0.00013765808044518686, + "loss": 2.0058, + "step": 291510 + }, + { + "epoch": 1.1095970707124532, + "grad_norm": 0.14674466848373413, + "learning_rate": 0.00013760058866861647, + "loss": 1.9826, + "step": 291520 + }, + { + "epoch": 1.109635133180576, + "grad_norm": 0.152365580201149, + "learning_rate": 0.00013754310601121135, + "loss": 2.0032, + "step": 291530 + }, + { + "epoch": 1.1096731956486987, + "grad_norm": 0.18826673924922943, + "learning_rate": 0.00013748563246863355, + "loss": 2.0001, + "step": 291540 + }, + { + "epoch": 1.1097112581168214, + "grad_norm": 0.18371078372001648, + "learning_rate": 0.00013742816803654845, + "loss": 1.9927, + "step": 291550 + }, + { + "epoch": 1.109749320584944, + "grad_norm": 0.1651403307914734, + "learning_rate": 0.00013737071271062508, + "loss": 2.0015, + "step": 291560 + }, + { + "epoch": 1.1097873830530667, + "grad_norm": 0.14440204203128815, + "learning_rate": 0.00013731326648653568, + "loss": 2.0011, + "step": 291570 + }, + { + "epoch": 1.1098254455211893, + "grad_norm": 0.24342374503612518, + "learning_rate": 0.00013725582935995606, + "loss": 1.9962, + "step": 291580 + }, + { + "epoch": 1.109863507989312, + "grad_norm": 0.1862649917602539, + "learning_rate": 0.0001371984013265653, + "loss": 2.0055, + "step": 291590 + }, + { + "epoch": 1.1099015704574346, + "grad_norm": 0.16959112882614136, + "learning_rate": 0.00013714098238204597, + "loss": 1.991, + "step": 291600 + }, + { + "epoch": 1.1099396329255575, + "grad_norm": 0.15094879269599915, + "learning_rate": 0.00013708357252208413, + "loss": 1.9883, + "step": 291610 + }, + { + "epoch": 1.1099776953936802, + "grad_norm": 0.1410762369632721, + "learning_rate": 0.00013702617174236927, + "loss": 1.9937, + "step": 291620 + }, + { + "epoch": 1.1100157578618028, + "grad_norm": 0.16502730548381805, + "learning_rate": 0.00013696878003859408, + "loss": 1.993, + "step": 291630 + }, + { + "epoch": 1.1100538203299255, + "grad_norm": 0.17659837007522583, + "learning_rate": 0.00013691139740645492, + "loss": 2.0023, + "step": 291640 + }, + { + "epoch": 1.1100918827980482, + "grad_norm": 0.18810971081256866, + "learning_rate": 0.00013685402384165123, + "loss": 1.9911, + "step": 291650 + }, + { + "epoch": 1.1101299452661708, + "grad_norm": 0.21112871170043945, + "learning_rate": 0.00013679665933988622, + "loss": 1.9886, + "step": 291660 + }, + { + "epoch": 1.1101680077342935, + "grad_norm": 0.16219669580459595, + "learning_rate": 0.00013673930389686617, + "loss": 1.9913, + "step": 291670 + }, + { + "epoch": 1.1102060702024161, + "grad_norm": 0.15710720419883728, + "learning_rate": 0.000136681957508301, + "loss": 1.9897, + "step": 291680 + }, + { + "epoch": 1.1102441326705388, + "grad_norm": 0.19102413952350616, + "learning_rate": 0.00013662462016990383, + "loss": 1.9864, + "step": 291690 + }, + { + "epoch": 1.1102821951386617, + "grad_norm": 0.15526530146598816, + "learning_rate": 0.00013656729187739124, + "loss": 1.9877, + "step": 291700 + }, + { + "epoch": 1.1103202576067843, + "grad_norm": 0.19195884466171265, + "learning_rate": 0.00013650997262648317, + "loss": 1.9923, + "step": 291710 + }, + { + "epoch": 1.110358320074907, + "grad_norm": 0.1689821034669876, + "learning_rate": 0.00013645266241290305, + "loss": 1.9934, + "step": 291720 + }, + { + "epoch": 1.1103963825430296, + "grad_norm": 0.17311668395996094, + "learning_rate": 0.00013639536123237738, + "loss": 2.0037, + "step": 291730 + }, + { + "epoch": 1.1104344450111523, + "grad_norm": 0.1563536375761032, + "learning_rate": 0.0001363380690806364, + "loss": 1.9901, + "step": 291740 + }, + { + "epoch": 1.110472507479275, + "grad_norm": 0.14188843965530396, + "learning_rate": 0.00013628078595341342, + "loss": 1.9851, + "step": 291750 + }, + { + "epoch": 1.1105105699473976, + "grad_norm": 0.16506977379322052, + "learning_rate": 0.00013622351184644526, + "loss": 1.9953, + "step": 291760 + }, + { + "epoch": 1.1105486324155203, + "grad_norm": 0.15015274286270142, + "learning_rate": 0.00013616624675547213, + "loss": 2.0002, + "step": 291770 + }, + { + "epoch": 1.1105866948836431, + "grad_norm": 0.14491043984889984, + "learning_rate": 0.00013610899067623743, + "loss": 1.9963, + "step": 291780 + }, + { + "epoch": 1.1106247573517658, + "grad_norm": 0.14073286950588226, + "learning_rate": 0.00013605174360448803, + "loss": 1.9929, + "step": 291790 + }, + { + "epoch": 1.1106628198198885, + "grad_norm": 0.14233963191509247, + "learning_rate": 0.0001359945055359741, + "loss": 1.9945, + "step": 291800 + }, + { + "epoch": 1.1107008822880111, + "grad_norm": 0.1646268367767334, + "learning_rate": 0.00013593727646644916, + "loss": 1.9905, + "step": 291810 + }, + { + "epoch": 1.1107389447561338, + "grad_norm": 0.18890641629695892, + "learning_rate": 0.00013588005639167012, + "loss": 1.9846, + "step": 291820 + }, + { + "epoch": 1.1107770072242564, + "grad_norm": 0.16375824809074402, + "learning_rate": 0.00013582284530739715, + "loss": 1.9948, + "step": 291830 + }, + { + "epoch": 1.110815069692379, + "grad_norm": 0.15013252198696136, + "learning_rate": 0.00013576564320939377, + "loss": 1.9855, + "step": 291840 + }, + { + "epoch": 1.1108531321605017, + "grad_norm": 0.18155845999717712, + "learning_rate": 0.00013570845009342676, + "loss": 1.9734, + "step": 291850 + }, + { + "epoch": 1.1108911946286244, + "grad_norm": 0.1454450935125351, + "learning_rate": 0.0001356512659552664, + "loss": 1.9903, + "step": 291860 + }, + { + "epoch": 1.1109292570967473, + "grad_norm": 0.14277590811252594, + "learning_rate": 0.0001355940907906862, + "loss": 1.9927, + "step": 291870 + }, + { + "epoch": 1.11096731956487, + "grad_norm": 0.16399109363555908, + "learning_rate": 0.00013553692459546284, + "loss": 1.9982, + "step": 291880 + }, + { + "epoch": 1.1110053820329926, + "grad_norm": 0.16129136085510254, + "learning_rate": 0.00013547976736537655, + "loss": 1.9994, + "step": 291890 + }, + { + "epoch": 1.1110434445011153, + "grad_norm": 0.1648087501525879, + "learning_rate": 0.00013542261909621074, + "loss": 1.9908, + "step": 291900 + }, + { + "epoch": 1.111081506969238, + "grad_norm": 0.1906840205192566, + "learning_rate": 0.000135365479783752, + "loss": 1.9899, + "step": 291910 + }, + { + "epoch": 1.1111195694373606, + "grad_norm": 0.17575401067733765, + "learning_rate": 0.0001353083494237906, + "loss": 2.015, + "step": 291920 + }, + { + "epoch": 1.1111576319054832, + "grad_norm": 0.17227664589881897, + "learning_rate": 0.00013525122801211976, + "loss": 1.9862, + "step": 291930 + }, + { + "epoch": 1.1111956943736059, + "grad_norm": 0.19471804797649384, + "learning_rate": 0.00013519411554453604, + "loss": 1.9961, + "step": 291940 + }, + { + "epoch": 1.1112337568417288, + "grad_norm": 0.1781400740146637, + "learning_rate": 0.00013513701201683942, + "loss": 1.984, + "step": 291950 + }, + { + "epoch": 1.1112718193098514, + "grad_norm": 0.14845864474773407, + "learning_rate": 0.00013507991742483304, + "loss": 1.9821, + "step": 291960 + }, + { + "epoch": 1.111309881777974, + "grad_norm": 0.15103501081466675, + "learning_rate": 0.00013502283176432346, + "loss": 1.9982, + "step": 291970 + }, + { + "epoch": 1.1113479442460967, + "grad_norm": 0.1785365641117096, + "learning_rate": 0.0001349657550311204, + "loss": 1.9979, + "step": 291980 + }, + { + "epoch": 1.1113860067142194, + "grad_norm": 0.2228976935148239, + "learning_rate": 0.00013490868722103688, + "loss": 1.9898, + "step": 291990 + }, + { + "epoch": 1.111424069182342, + "grad_norm": 0.15446533262729645, + "learning_rate": 0.0001348516283298893, + "loss": 1.9875, + "step": 292000 + }, + { + "epoch": 1.1114621316504647, + "grad_norm": 0.16202174127101898, + "learning_rate": 0.00013479457835349708, + "loss": 2.002, + "step": 292010 + }, + { + "epoch": 1.1115001941185874, + "grad_norm": 0.1793016642332077, + "learning_rate": 0.00013473753728768318, + "loss": 1.9883, + "step": 292020 + }, + { + "epoch": 1.11153825658671, + "grad_norm": 0.1564231514930725, + "learning_rate": 0.00013468050512827358, + "loss": 2.0075, + "step": 292030 + }, + { + "epoch": 1.111576319054833, + "grad_norm": 0.15224598348140717, + "learning_rate": 0.0001346234818710978, + "loss": 1.9923, + "step": 292040 + }, + { + "epoch": 1.1116143815229556, + "grad_norm": 0.17988833785057068, + "learning_rate": 0.00013456646751198838, + "loss": 2.0062, + "step": 292050 + }, + { + "epoch": 1.1116524439910782, + "grad_norm": 0.15105636417865753, + "learning_rate": 0.0001345094620467811, + "loss": 2.008, + "step": 292060 + }, + { + "epoch": 1.1116905064592009, + "grad_norm": 0.18763117492198944, + "learning_rate": 0.00013445246547131524, + "loss": 1.9823, + "step": 292070 + }, + { + "epoch": 1.1117285689273235, + "grad_norm": 0.16480746865272522, + "learning_rate": 0.000134395477781433, + "loss": 1.9947, + "step": 292080 + }, + { + "epoch": 1.1117666313954462, + "grad_norm": 0.21038369834423065, + "learning_rate": 0.00013433849897298, + "loss": 1.9862, + "step": 292090 + }, + { + "epoch": 1.1118046938635688, + "grad_norm": 0.16916730999946594, + "learning_rate": 0.00013428152904180514, + "loss": 1.9857, + "step": 292100 + }, + { + "epoch": 1.1118427563316915, + "grad_norm": 0.16177931427955627, + "learning_rate": 0.00013422456798376048, + "loss": 1.9756, + "step": 292110 + }, + { + "epoch": 1.1118808187998142, + "grad_norm": 0.20139986276626587, + "learning_rate": 0.0001341676157947012, + "loss": 1.9941, + "step": 292120 + }, + { + "epoch": 1.111918881267937, + "grad_norm": 0.16537266969680786, + "learning_rate": 0.00013411067247048598, + "loss": 1.9977, + "step": 292130 + }, + { + "epoch": 1.1119569437360597, + "grad_norm": 0.18192315101623535, + "learning_rate": 0.0001340537380069764, + "loss": 1.9959, + "step": 292140 + }, + { + "epoch": 1.1119950062041823, + "grad_norm": 0.16280584037303925, + "learning_rate": 0.00013399681240003754, + "loss": 1.9868, + "step": 292150 + }, + { + "epoch": 1.112033068672305, + "grad_norm": 0.1430111676454544, + "learning_rate": 0.00013393989564553744, + "loss": 1.9901, + "step": 292160 + }, + { + "epoch": 1.1120711311404277, + "grad_norm": 0.14697347581386566, + "learning_rate": 0.00013388298773934766, + "loss": 1.9884, + "step": 292170 + }, + { + "epoch": 1.1121091936085503, + "grad_norm": 0.16174790263175964, + "learning_rate": 0.00013382608867734265, + "loss": 2.0107, + "step": 292180 + }, + { + "epoch": 1.112147256076673, + "grad_norm": 0.17751865088939667, + "learning_rate": 0.00013376919845540025, + "loss": 1.9975, + "step": 292190 + }, + { + "epoch": 1.1121853185447956, + "grad_norm": 0.2055535614490509, + "learning_rate": 0.00013371231706940152, + "loss": 1.9957, + "step": 292200 + }, + { + "epoch": 1.1122233810129183, + "grad_norm": 0.16524241864681244, + "learning_rate": 0.00013365544451523055, + "loss": 2.0028, + "step": 292210 + }, + { + "epoch": 1.1122614434810412, + "grad_norm": 0.17955149710178375, + "learning_rate": 0.00013359858078877478, + "loss": 1.9782, + "step": 292220 + }, + { + "epoch": 1.1122995059491638, + "grad_norm": 0.1631380319595337, + "learning_rate": 0.00013354172588592484, + "loss": 1.9813, + "step": 292230 + }, + { + "epoch": 1.1123375684172865, + "grad_norm": 0.1502358317375183, + "learning_rate": 0.0001334848798025744, + "loss": 1.9934, + "step": 292240 + }, + { + "epoch": 1.1123756308854091, + "grad_norm": 0.20850756764411926, + "learning_rate": 0.0001334280425346205, + "loss": 1.9876, + "step": 292250 + }, + { + "epoch": 1.1124136933535318, + "grad_norm": 0.1623973846435547, + "learning_rate": 0.00013337121407796328, + "loss": 1.9928, + "step": 292260 + }, + { + "epoch": 1.1124517558216545, + "grad_norm": 0.23397907614707947, + "learning_rate": 0.0001333143944285059, + "loss": 1.9842, + "step": 292270 + }, + { + "epoch": 1.112489818289777, + "grad_norm": 0.21935118734836578, + "learning_rate": 0.000133257583582155, + "loss": 1.9768, + "step": 292280 + }, + { + "epoch": 1.1125278807578998, + "grad_norm": 0.1885177344083786, + "learning_rate": 0.00013320078153482024, + "loss": 1.9962, + "step": 292290 + }, + { + "epoch": 1.1125659432260226, + "grad_norm": 0.16666430234909058, + "learning_rate": 0.00013314398828241436, + "loss": 1.9879, + "step": 292300 + }, + { + "epoch": 1.1126040056941453, + "grad_norm": 0.14934664964675903, + "learning_rate": 0.00013308720382085336, + "loss": 1.9862, + "step": 292310 + }, + { + "epoch": 1.112642068162268, + "grad_norm": 0.18101535737514496, + "learning_rate": 0.00013303042814605641, + "loss": 1.9954, + "step": 292320 + }, + { + "epoch": 1.1126801306303906, + "grad_norm": 0.17227186262607574, + "learning_rate": 0.00013297366125394577, + "loss": 1.9845, + "step": 292330 + }, + { + "epoch": 1.1127181930985133, + "grad_norm": 0.2443477064371109, + "learning_rate": 0.00013291690314044696, + "loss": 1.9926, + "step": 292340 + }, + { + "epoch": 1.112756255566636, + "grad_norm": 0.15813399851322174, + "learning_rate": 0.00013286015380148852, + "loss": 1.9914, + "step": 292350 + }, + { + "epoch": 1.1127943180347586, + "grad_norm": 0.1505991667509079, + "learning_rate": 0.0001328034132330022, + "loss": 1.9996, + "step": 292360 + }, + { + "epoch": 1.1128323805028812, + "grad_norm": 0.18855006992816925, + "learning_rate": 0.00013274668143092294, + "loss": 1.9945, + "step": 292370 + }, + { + "epoch": 1.112870442971004, + "grad_norm": 0.1625203788280487, + "learning_rate": 0.00013268995839118875, + "loss": 1.9891, + "step": 292380 + }, + { + "epoch": 1.1129085054391268, + "grad_norm": 0.1641480177640915, + "learning_rate": 0.0001326332441097407, + "loss": 1.9807, + "step": 292390 + }, + { + "epoch": 1.1129465679072494, + "grad_norm": 0.16145209968090057, + "learning_rate": 0.00013257653858252328, + "loss": 1.9977, + "step": 292400 + }, + { + "epoch": 1.112984630375372, + "grad_norm": 0.1449916511774063, + "learning_rate": 0.00013251984180548377, + "loss": 1.9924, + "step": 292410 + }, + { + "epoch": 1.1130226928434948, + "grad_norm": 0.18777312338352203, + "learning_rate": 0.0001324631537745728, + "loss": 1.9868, + "step": 292420 + }, + { + "epoch": 1.1130607553116174, + "grad_norm": 0.14995765686035156, + "learning_rate": 0.000132406474485744, + "loss": 1.9997, + "step": 292430 + }, + { + "epoch": 1.11309881777974, + "grad_norm": 0.1728399097919464, + "learning_rate": 0.00013234980393495417, + "loss": 2.0076, + "step": 292440 + }, + { + "epoch": 1.1131368802478627, + "grad_norm": 0.157765731215477, + "learning_rate": 0.00013229314211816317, + "loss": 1.9942, + "step": 292450 + }, + { + "epoch": 1.1131749427159854, + "grad_norm": 0.20687264204025269, + "learning_rate": 0.00013223648903133418, + "loss": 1.9799, + "step": 292460 + }, + { + "epoch": 1.1132130051841083, + "grad_norm": 0.1790127158164978, + "learning_rate": 0.00013217984467043314, + "loss": 2.001, + "step": 292470 + }, + { + "epoch": 1.113251067652231, + "grad_norm": 0.14681363105773926, + "learning_rate": 0.0001321232090314295, + "loss": 1.9994, + "step": 292480 + }, + { + "epoch": 1.1132891301203536, + "grad_norm": 0.17148324847221375, + "learning_rate": 0.00013206658211029542, + "loss": 2.0021, + "step": 292490 + }, + { + "epoch": 1.1133271925884762, + "grad_norm": 0.1445014625787735, + "learning_rate": 0.0001320099639030064, + "loss": 1.9919, + "step": 292500 + }, + { + "epoch": 1.113365255056599, + "grad_norm": 0.16965574026107788, + "learning_rate": 0.00013195335440554097, + "loss": 2.0039, + "step": 292510 + }, + { + "epoch": 1.1134033175247215, + "grad_norm": 0.17414280772209167, + "learning_rate": 0.00013189675361388077, + "loss": 1.9966, + "step": 292520 + }, + { + "epoch": 1.1134413799928442, + "grad_norm": 0.18921469151973724, + "learning_rate": 0.00013184016152401051, + "loss": 1.994, + "step": 292530 + }, + { + "epoch": 1.1134794424609669, + "grad_norm": 0.19784805178642273, + "learning_rate": 0.000131783578131918, + "loss": 2.0034, + "step": 292540 + }, + { + "epoch": 1.1135175049290895, + "grad_norm": 0.19768598675727844, + "learning_rate": 0.00013172700343359412, + "loss": 1.9839, + "step": 292550 + }, + { + "epoch": 1.1135555673972124, + "grad_norm": 0.16530479490756989, + "learning_rate": 0.00013167043742503282, + "loss": 1.9888, + "step": 292560 + }, + { + "epoch": 1.113593629865335, + "grad_norm": 0.15673546493053436, + "learning_rate": 0.00013161388010223118, + "loss": 1.9941, + "step": 292570 + }, + { + "epoch": 1.1136316923334577, + "grad_norm": 0.15796087682247162, + "learning_rate": 0.0001315573314611892, + "loss": 2.0009, + "step": 292580 + }, + { + "epoch": 1.1136697548015804, + "grad_norm": 0.1805214136838913, + "learning_rate": 0.00013150079149791015, + "loss": 1.9993, + "step": 292590 + }, + { + "epoch": 1.113707817269703, + "grad_norm": 0.15272203087806702, + "learning_rate": 0.00013144426020840033, + "loss": 1.9801, + "step": 292600 + }, + { + "epoch": 1.1137458797378257, + "grad_norm": 0.15740281343460083, + "learning_rate": 0.00013138773758866894, + "loss": 1.9852, + "step": 292610 + }, + { + "epoch": 1.1137839422059483, + "grad_norm": 0.17293420433998108, + "learning_rate": 0.00013133122363472838, + "loss": 1.9922, + "step": 292620 + }, + { + "epoch": 1.113822004674071, + "grad_norm": 0.1495877504348755, + "learning_rate": 0.00013127471834259408, + "loss": 1.9977, + "step": 292630 + }, + { + "epoch": 1.1138600671421939, + "grad_norm": 0.14915607869625092, + "learning_rate": 0.00013121822170828452, + "loss": 1.9756, + "step": 292640 + }, + { + "epoch": 1.1138981296103165, + "grad_norm": 0.15841448307037354, + "learning_rate": 0.0001311617337278212, + "loss": 1.9795, + "step": 292650 + }, + { + "epoch": 1.1139361920784392, + "grad_norm": 0.15663465857505798, + "learning_rate": 0.00013110525439722875, + "loss": 1.9907, + "step": 292660 + }, + { + "epoch": 1.1139742545465618, + "grad_norm": 0.14949707686901093, + "learning_rate": 0.0001310487837125347, + "loss": 1.9947, + "step": 292670 + }, + { + "epoch": 1.1140123170146845, + "grad_norm": 0.16750098764896393, + "learning_rate": 0.00013099232166976975, + "loss": 2.0036, + "step": 292680 + }, + { + "epoch": 1.1140503794828072, + "grad_norm": 0.23145738244056702, + "learning_rate": 0.00013093586826496762, + "loss": 1.9838, + "step": 292690 + }, + { + "epoch": 1.1140884419509298, + "grad_norm": 0.1893061250448227, + "learning_rate": 0.00013087942349416498, + "loss": 1.9821, + "step": 292700 + }, + { + "epoch": 1.1141265044190525, + "grad_norm": 0.16750651597976685, + "learning_rate": 0.0001308229873534016, + "loss": 1.9836, + "step": 292710 + }, + { + "epoch": 1.1141645668871751, + "grad_norm": 0.14061546325683594, + "learning_rate": 0.00013076655983872026, + "loss": 1.9814, + "step": 292720 + }, + { + "epoch": 1.114202629355298, + "grad_norm": 0.15702416002750397, + "learning_rate": 0.0001307101409461668, + "loss": 1.9929, + "step": 292730 + }, + { + "epoch": 1.1142406918234207, + "grad_norm": 0.1511785238981247, + "learning_rate": 0.00013065373067178997, + "loss": 1.975, + "step": 292740 + }, + { + "epoch": 1.1142787542915433, + "grad_norm": 0.17471915483474731, + "learning_rate": 0.00013059732901164167, + "loss": 1.9757, + "step": 292750 + }, + { + "epoch": 1.114316816759666, + "grad_norm": 0.17572665214538574, + "learning_rate": 0.00013054093596177668, + "loss": 1.9844, + "step": 292760 + }, + { + "epoch": 1.1143548792277886, + "grad_norm": 0.17547960579395294, + "learning_rate": 0.00013048455151825293, + "loss": 1.9892, + "step": 292770 + }, + { + "epoch": 1.1143929416959113, + "grad_norm": 0.20557276904582977, + "learning_rate": 0.00013042817567713127, + "loss": 2.0015, + "step": 292780 + }, + { + "epoch": 1.114431004164034, + "grad_norm": 0.16719205677509308, + "learning_rate": 0.00013037180843447562, + "loss": 1.9891, + "step": 292790 + }, + { + "epoch": 1.1144690666321566, + "grad_norm": 0.17433960735797882, + "learning_rate": 0.0001303154497863528, + "loss": 1.9981, + "step": 292800 + }, + { + "epoch": 1.1145071291002795, + "grad_norm": 0.15616962313652039, + "learning_rate": 0.00013025909972883264, + "loss": 2.0043, + "step": 292810 + }, + { + "epoch": 1.1145451915684021, + "grad_norm": 0.23012442886829376, + "learning_rate": 0.00013020275825798805, + "loss": 1.9873, + "step": 292820 + }, + { + "epoch": 1.1145832540365248, + "grad_norm": 0.17314907908439636, + "learning_rate": 0.00013014642536989497, + "loss": 1.9949, + "step": 292830 + }, + { + "epoch": 1.1146213165046475, + "grad_norm": 0.14603310823440552, + "learning_rate": 0.0001300901010606322, + "loss": 1.9841, + "step": 292840 + }, + { + "epoch": 1.1146593789727701, + "grad_norm": 0.16606785356998444, + "learning_rate": 0.00013003378532628146, + "loss": 1.9914, + "step": 292850 + }, + { + "epoch": 1.1146974414408928, + "grad_norm": 0.14196555316448212, + "learning_rate": 0.00012997747816292772, + "loss": 1.9911, + "step": 292860 + }, + { + "epoch": 1.1147355039090154, + "grad_norm": 0.16460765898227692, + "learning_rate": 0.00012992117956665865, + "loss": 1.9963, + "step": 292870 + }, + { + "epoch": 1.114773566377138, + "grad_norm": 0.1715785413980484, + "learning_rate": 0.00012986488953356507, + "loss": 1.9846, + "step": 292880 + }, + { + "epoch": 1.1148116288452607, + "grad_norm": 0.1690293848514557, + "learning_rate": 0.00012980860805974072, + "loss": 1.9839, + "step": 292890 + }, + { + "epoch": 1.1148496913133836, + "grad_norm": 0.1981569081544876, + "learning_rate": 0.0001297523351412823, + "loss": 1.9965, + "step": 292900 + }, + { + "epoch": 1.1148877537815063, + "grad_norm": 0.14672847092151642, + "learning_rate": 0.00012969607077428953, + "loss": 1.9905, + "step": 292910 + }, + { + "epoch": 1.114925816249629, + "grad_norm": 0.20228272676467896, + "learning_rate": 0.00012963981495486498, + "loss": 2.002, + "step": 292920 + }, + { + "epoch": 1.1149638787177516, + "grad_norm": 0.2107199728488922, + "learning_rate": 0.00012958356767911428, + "loss": 1.9812, + "step": 292930 + }, + { + "epoch": 1.1150019411858743, + "grad_norm": 0.15910865366458893, + "learning_rate": 0.00012952732894314588, + "loss": 1.9941, + "step": 292940 + }, + { + "epoch": 1.115040003653997, + "grad_norm": 0.18070292472839355, + "learning_rate": 0.0001294710987430715, + "loss": 1.9769, + "step": 292950 + }, + { + "epoch": 1.1150780661221196, + "grad_norm": 0.17187024652957916, + "learning_rate": 0.00012941487707500543, + "loss": 1.9875, + "step": 292960 + }, + { + "epoch": 1.1151161285902422, + "grad_norm": 0.1531585156917572, + "learning_rate": 0.00012935866393506512, + "loss": 1.9652, + "step": 292970 + }, + { + "epoch": 1.1151541910583649, + "grad_norm": 0.17606285214424133, + "learning_rate": 0.0001293024593193709, + "loss": 1.9983, + "step": 292980 + }, + { + "epoch": 1.1151922535264878, + "grad_norm": 0.15632694959640503, + "learning_rate": 0.0001292462632240461, + "loss": 1.9985, + "step": 292990 + }, + { + "epoch": 1.1152303159946104, + "grad_norm": 0.1512540876865387, + "learning_rate": 0.00012919007564521684, + "loss": 2.0103, + "step": 293000 + }, + { + "epoch": 1.115268378462733, + "grad_norm": 0.15779423713684082, + "learning_rate": 0.00012913389657901242, + "loss": 1.9961, + "step": 293010 + }, + { + "epoch": 1.1153064409308557, + "grad_norm": 0.14625298976898193, + "learning_rate": 0.00012907772602156482, + "loss": 1.9751, + "step": 293020 + }, + { + "epoch": 1.1153445033989784, + "grad_norm": 0.19319714605808258, + "learning_rate": 0.0001290215639690091, + "loss": 2.0026, + "step": 293030 + }, + { + "epoch": 1.115382565867101, + "grad_norm": 0.16166873276233673, + "learning_rate": 0.0001289654104174832, + "loss": 1.9846, + "step": 293040 + }, + { + "epoch": 1.1154206283352237, + "grad_norm": 0.1930776983499527, + "learning_rate": 0.00012890926536312803, + "loss": 1.9897, + "step": 293050 + }, + { + "epoch": 1.1154586908033464, + "grad_norm": 0.1487596333026886, + "learning_rate": 0.00012885312880208728, + "loss": 1.9907, + "step": 293060 + }, + { + "epoch": 1.115496753271469, + "grad_norm": 0.22580161690711975, + "learning_rate": 0.00012879700073050777, + "loss": 1.9888, + "step": 293070 + }, + { + "epoch": 1.115534815739592, + "grad_norm": 0.18477199971675873, + "learning_rate": 0.00012874088114453897, + "loss": 1.9819, + "step": 293080 + }, + { + "epoch": 1.1155728782077146, + "grad_norm": 0.2038331925868988, + "learning_rate": 0.00012868477004033353, + "loss": 1.9914, + "step": 293090 + }, + { + "epoch": 1.1156109406758372, + "grad_norm": 0.21437828242778778, + "learning_rate": 0.00012862866741404684, + "loss": 2.0052, + "step": 293100 + }, + { + "epoch": 1.1156490031439599, + "grad_norm": 0.16125749051570892, + "learning_rate": 0.0001285725732618372, + "loss": 1.995, + "step": 293110 + }, + { + "epoch": 1.1156870656120825, + "grad_norm": 0.18511207401752472, + "learning_rate": 0.0001285164875798658, + "loss": 1.9737, + "step": 293120 + }, + { + "epoch": 1.1157251280802052, + "grad_norm": 0.20369894802570343, + "learning_rate": 0.0001284604103642969, + "loss": 1.9757, + "step": 293130 + }, + { + "epoch": 1.1157631905483278, + "grad_norm": 0.1853613555431366, + "learning_rate": 0.00012840434161129745, + "loss": 1.9815, + "step": 293140 + }, + { + "epoch": 1.1158012530164505, + "grad_norm": 0.16906699538230896, + "learning_rate": 0.00012834828131703734, + "loss": 1.9748, + "step": 293150 + }, + { + "epoch": 1.1158393154845734, + "grad_norm": 0.1837167739868164, + "learning_rate": 0.0001282922294776895, + "loss": 1.9982, + "step": 293160 + }, + { + "epoch": 1.115877377952696, + "grad_norm": 0.16122731566429138, + "learning_rate": 0.00012823618608942945, + "loss": 1.9906, + "step": 293170 + }, + { + "epoch": 1.1159154404208187, + "grad_norm": 0.16960924863815308, + "learning_rate": 0.00012818015114843584, + "loss": 1.9878, + "step": 293180 + }, + { + "epoch": 1.1159535028889414, + "grad_norm": 0.18560199439525604, + "learning_rate": 0.0001281241246508902, + "loss": 2.0013, + "step": 293190 + }, + { + "epoch": 1.115991565357064, + "grad_norm": 0.20879821479320526, + "learning_rate": 0.0001280681065929767, + "loss": 1.9916, + "step": 293200 + }, + { + "epoch": 1.1160296278251867, + "grad_norm": 0.1578042060136795, + "learning_rate": 0.00012801209697088268, + "loss": 1.9951, + "step": 293210 + }, + { + "epoch": 1.1160676902933093, + "grad_norm": 0.1498941034078598, + "learning_rate": 0.00012795609578079815, + "loss": 1.9811, + "step": 293220 + }, + { + "epoch": 1.116105752761432, + "grad_norm": 0.156406968832016, + "learning_rate": 0.00012790010301891603, + "loss": 1.9818, + "step": 293230 + }, + { + "epoch": 1.1161438152295546, + "grad_norm": 0.1441483199596405, + "learning_rate": 0.0001278441186814321, + "loss": 1.9711, + "step": 293240 + }, + { + "epoch": 1.1161818776976775, + "grad_norm": 0.15797194838523865, + "learning_rate": 0.0001277881427645451, + "loss": 1.992, + "step": 293250 + }, + { + "epoch": 1.1162199401658002, + "grad_norm": 0.17018261551856995, + "learning_rate": 0.00012773217526445652, + "loss": 1.9726, + "step": 293260 + }, + { + "epoch": 1.1162580026339228, + "grad_norm": 0.1910189986228943, + "learning_rate": 0.0001276762161773707, + "loss": 1.9828, + "step": 293270 + }, + { + "epoch": 1.1162960651020455, + "grad_norm": 0.15563057363033295, + "learning_rate": 0.0001276202654994949, + "loss": 1.9681, + "step": 293280 + }, + { + "epoch": 1.1163341275701681, + "grad_norm": 0.19308650493621826, + "learning_rate": 0.0001275643232270392, + "loss": 1.9683, + "step": 293290 + }, + { + "epoch": 1.1163721900382908, + "grad_norm": 0.15804284811019897, + "learning_rate": 0.0001275083893562165, + "loss": 2.002, + "step": 293300 + }, + { + "epoch": 1.1164102525064135, + "grad_norm": 0.1859627366065979, + "learning_rate": 0.00012745246388324262, + "loss": 1.9815, + "step": 293310 + }, + { + "epoch": 1.1164483149745361, + "grad_norm": 0.20299947261810303, + "learning_rate": 0.00012739654680433605, + "loss": 1.9811, + "step": 293320 + }, + { + "epoch": 1.116486377442659, + "grad_norm": 0.17057934403419495, + "learning_rate": 0.00012734063811571834, + "loss": 1.9893, + "step": 293330 + }, + { + "epoch": 1.1165244399107817, + "grad_norm": 0.1744338870048523, + "learning_rate": 0.00012728473781361372, + "loss": 1.9985, + "step": 293340 + }, + { + "epoch": 1.1165625023789043, + "grad_norm": 0.2139360010623932, + "learning_rate": 0.00012722884589424932, + "loss": 1.9871, + "step": 293350 + }, + { + "epoch": 1.116600564847027, + "grad_norm": 0.20919451117515564, + "learning_rate": 0.000127172962353855, + "loss": 1.9833, + "step": 293360 + }, + { + "epoch": 1.1166386273151496, + "grad_norm": 0.16842421889305115, + "learning_rate": 0.00012711708718866365, + "loss": 1.9971, + "step": 293370 + }, + { + "epoch": 1.1166766897832723, + "grad_norm": 0.19921022653579712, + "learning_rate": 0.00012706122039491075, + "loss": 1.9929, + "step": 293380 + }, + { + "epoch": 1.116714752251395, + "grad_norm": 0.1602468341588974, + "learning_rate": 0.00012700536196883473, + "loss": 1.9998, + "step": 293390 + }, + { + "epoch": 1.1167528147195176, + "grad_norm": 0.16518428921699524, + "learning_rate": 0.0001269495119066768, + "loss": 1.9892, + "step": 293400 + }, + { + "epoch": 1.1167908771876403, + "grad_norm": 0.15834908187389374, + "learning_rate": 0.000126893670204681, + "loss": 1.9885, + "step": 293410 + }, + { + "epoch": 1.1168289396557631, + "grad_norm": 0.1923481523990631, + "learning_rate": 0.00012683783685909418, + "loss": 1.9807, + "step": 293420 + }, + { + "epoch": 1.1168670021238858, + "grad_norm": 0.18503503501415253, + "learning_rate": 0.00012678201186616602, + "loss": 1.9882, + "step": 293430 + }, + { + "epoch": 1.1169050645920084, + "grad_norm": 0.1797768622636795, + "learning_rate": 0.00012672619522214885, + "loss": 1.9876, + "step": 293440 + }, + { + "epoch": 1.116943127060131, + "grad_norm": 0.19215306639671326, + "learning_rate": 0.000126670386923298, + "loss": 1.9717, + "step": 293450 + }, + { + "epoch": 1.1169811895282538, + "grad_norm": 0.16550879180431366, + "learning_rate": 0.00012661458696587157, + "loss": 1.9926, + "step": 293460 + }, + { + "epoch": 1.1170192519963764, + "grad_norm": 0.18938034772872925, + "learning_rate": 0.0001265587953461304, + "loss": 2.0031, + "step": 293470 + }, + { + "epoch": 1.117057314464499, + "grad_norm": 0.1606336086988449, + "learning_rate": 0.00012650301206033803, + "loss": 1.9919, + "step": 293480 + }, + { + "epoch": 1.1170953769326217, + "grad_norm": 0.18568719923496246, + "learning_rate": 0.00012644723710476097, + "loss": 1.9744, + "step": 293490 + }, + { + "epoch": 1.1171334394007446, + "grad_norm": 0.1861104965209961, + "learning_rate": 0.0001263914704756684, + "loss": 1.9889, + "step": 293500 + }, + { + "epoch": 1.1171715018688673, + "grad_norm": 0.18612819910049438, + "learning_rate": 0.00012633571216933242, + "loss": 2.0001, + "step": 293510 + }, + { + "epoch": 1.11720956433699, + "grad_norm": 0.1559780389070511, + "learning_rate": 0.00012627996218202769, + "loss": 1.9946, + "step": 293520 + }, + { + "epoch": 1.1172476268051126, + "grad_norm": 0.1587904393672943, + "learning_rate": 0.0001262242205100318, + "loss": 1.9833, + "step": 293530 + }, + { + "epoch": 1.1172856892732352, + "grad_norm": 0.20348505675792694, + "learning_rate": 0.0001261684871496251, + "loss": 1.9875, + "step": 293540 + }, + { + "epoch": 1.117323751741358, + "grad_norm": 0.15221315622329712, + "learning_rate": 0.0001261127620970907, + "loss": 1.9776, + "step": 293550 + }, + { + "epoch": 1.1173618142094806, + "grad_norm": 0.17232543230056763, + "learning_rate": 0.00012605704534871453, + "loss": 1.9885, + "step": 293560 + }, + { + "epoch": 1.1173998766776032, + "grad_norm": 0.18388041853904724, + "learning_rate": 0.0001260013369007852, + "loss": 1.995, + "step": 293570 + }, + { + "epoch": 1.1174379391457259, + "grad_norm": 0.1528346687555313, + "learning_rate": 0.00012594563674959403, + "loss": 1.9756, + "step": 293580 + }, + { + "epoch": 1.1174760016138487, + "grad_norm": 0.16484560072422028, + "learning_rate": 0.0001258899448914353, + "loss": 1.9857, + "step": 293590 + }, + { + "epoch": 1.1175140640819714, + "grad_norm": 0.14479511976242065, + "learning_rate": 0.00012583426132260585, + "loss": 1.9906, + "step": 293600 + }, + { + "epoch": 1.117552126550094, + "grad_norm": 0.16426406800746918, + "learning_rate": 0.00012577858603940545, + "loss": 1.9831, + "step": 293610 + }, + { + "epoch": 1.1175901890182167, + "grad_norm": 0.1525418609380722, + "learning_rate": 0.0001257229190381365, + "loss": 1.9849, + "step": 293620 + }, + { + "epoch": 1.1176282514863394, + "grad_norm": 0.14647795259952545, + "learning_rate": 0.00012566726031510418, + "loss": 1.9824, + "step": 293630 + }, + { + "epoch": 1.117666313954462, + "grad_norm": 0.16383711993694305, + "learning_rate": 0.00012561160986661636, + "loss": 1.9901, + "step": 293640 + }, + { + "epoch": 1.1177043764225847, + "grad_norm": 0.1702844202518463, + "learning_rate": 0.0001255559676889838, + "loss": 1.9863, + "step": 293650 + }, + { + "epoch": 1.1177424388907073, + "grad_norm": 0.22296492755413055, + "learning_rate": 0.0001255003337785199, + "loss": 1.9959, + "step": 293660 + }, + { + "epoch": 1.1177805013588302, + "grad_norm": 0.23798371851444244, + "learning_rate": 0.0001254447081315408, + "loss": 1.9828, + "step": 293670 + }, + { + "epoch": 1.1178185638269529, + "grad_norm": 0.18219825625419617, + "learning_rate": 0.00012538909074436534, + "loss": 1.9906, + "step": 293680 + }, + { + "epoch": 1.1178566262950755, + "grad_norm": 0.2241986244916916, + "learning_rate": 0.00012533348161331522, + "loss": 1.9942, + "step": 293690 + }, + { + "epoch": 1.1178946887631982, + "grad_norm": 0.15364870429039001, + "learning_rate": 0.00012527788073471473, + "loss": 1.9898, + "step": 293700 + }, + { + "epoch": 1.1179327512313209, + "grad_norm": 0.20049087703227997, + "learning_rate": 0.000125222288104891, + "loss": 1.9918, + "step": 293710 + }, + { + "epoch": 1.1179708136994435, + "grad_norm": 0.1605052649974823, + "learning_rate": 0.00012516670372017375, + "loss": 1.9906, + "step": 293720 + }, + { + "epoch": 1.1180088761675662, + "grad_norm": 0.16484107077121735, + "learning_rate": 0.00012511112757689557, + "loss": 1.982, + "step": 293730 + }, + { + "epoch": 1.1180469386356888, + "grad_norm": 0.16114190220832825, + "learning_rate": 0.00012505555967139164, + "loss": 1.9787, + "step": 293740 + }, + { + "epoch": 1.1180850011038115, + "grad_norm": 0.17148414254188538, + "learning_rate": 0.000125, + "loss": 1.987, + "step": 293750 + }, + { + "epoch": 1.1181230635719344, + "grad_norm": 0.17402517795562744, + "learning_rate": 0.00012494444855906122, + "loss": 1.9793, + "step": 293760 + }, + { + "epoch": 1.118161126040057, + "grad_norm": 0.1686442792415619, + "learning_rate": 0.00012488890534491874, + "loss": 1.9868, + "step": 293770 + }, + { + "epoch": 1.1181991885081797, + "grad_norm": 0.17639507353305817, + "learning_rate": 0.00012483337035391862, + "loss": 1.973, + "step": 293780 + }, + { + "epoch": 1.1182372509763023, + "grad_norm": 0.25353384017944336, + "learning_rate": 0.00012477784358240962, + "loss": 1.9817, + "step": 293790 + }, + { + "epoch": 1.118275313444425, + "grad_norm": 0.14957685768604279, + "learning_rate": 0.00012472232502674325, + "loss": 1.9882, + "step": 293800 + }, + { + "epoch": 1.1183133759125476, + "grad_norm": 0.1678726077079773, + "learning_rate": 0.00012466681468327373, + "loss": 1.9836, + "step": 293810 + }, + { + "epoch": 1.1183514383806703, + "grad_norm": 0.17077717185020447, + "learning_rate": 0.00012461131254835789, + "loss": 1.9959, + "step": 293820 + }, + { + "epoch": 1.118389500848793, + "grad_norm": 0.2349521964788437, + "learning_rate": 0.0001245558186183553, + "loss": 1.9822, + "step": 293830 + }, + { + "epoch": 1.1184275633169156, + "grad_norm": 0.18050359189510345, + "learning_rate": 0.00012450033288962826, + "loss": 1.9847, + "step": 293840 + }, + { + "epoch": 1.1184656257850385, + "grad_norm": 0.15957248210906982, + "learning_rate": 0.00012444485535854168, + "loss": 1.9902, + "step": 293850 + }, + { + "epoch": 1.1185036882531612, + "grad_norm": 0.16633108258247375, + "learning_rate": 0.00012438938602146328, + "loss": 1.9826, + "step": 293860 + }, + { + "epoch": 1.1185417507212838, + "grad_norm": 0.1867218166589737, + "learning_rate": 0.0001243339248747633, + "loss": 1.9942, + "step": 293870 + }, + { + "epoch": 1.1185798131894065, + "grad_norm": 0.1948414444923401, + "learning_rate": 0.00012427847191481473, + "loss": 1.9838, + "step": 293880 + }, + { + "epoch": 1.1186178756575291, + "grad_norm": 0.16936980187892914, + "learning_rate": 0.00012422302713799326, + "loss": 2.009, + "step": 293890 + }, + { + "epoch": 1.1186559381256518, + "grad_norm": 0.1969858705997467, + "learning_rate": 0.00012416759054067735, + "loss": 1.9868, + "step": 293900 + }, + { + "epoch": 1.1186940005937744, + "grad_norm": 0.17792664468288422, + "learning_rate": 0.00012411216211924775, + "loss": 1.9882, + "step": 293910 + }, + { + "epoch": 1.118732063061897, + "grad_norm": 0.17175708711147308, + "learning_rate": 0.0001240567418700884, + "loss": 1.9842, + "step": 293920 + }, + { + "epoch": 1.1187701255300198, + "grad_norm": 0.1614762544631958, + "learning_rate": 0.00012400132978958555, + "loss": 1.9829, + "step": 293930 + }, + { + "epoch": 1.1188081879981426, + "grad_norm": 0.17596665024757385, + "learning_rate": 0.00012394592587412824, + "loss": 1.9943, + "step": 293940 + }, + { + "epoch": 1.1188462504662653, + "grad_norm": 0.18127337098121643, + "learning_rate": 0.00012389053012010808, + "loss": 2.001, + "step": 293950 + }, + { + "epoch": 1.118884312934388, + "grad_norm": 0.1930181235074997, + "learning_rate": 0.0001238351425239195, + "loss": 1.9824, + "step": 293960 + }, + { + "epoch": 1.1189223754025106, + "grad_norm": 0.15906044840812683, + "learning_rate": 0.00012377976308195932, + "loss": 1.9944, + "step": 293970 + }, + { + "epoch": 1.1189604378706333, + "grad_norm": 0.1689421683549881, + "learning_rate": 0.00012372439179062734, + "loss": 1.9806, + "step": 293980 + }, + { + "epoch": 1.118998500338756, + "grad_norm": 0.17643670737743378, + "learning_rate": 0.00012366902864632573, + "loss": 1.9793, + "step": 293990 + }, + { + "epoch": 1.1190365628068786, + "grad_norm": 0.19098548591136932, + "learning_rate": 0.0001236136736454595, + "loss": 1.9854, + "step": 294000 + }, + { + "epoch": 1.1190746252750012, + "grad_norm": 0.16189756989479065, + "learning_rate": 0.0001235583267844362, + "loss": 1.9983, + "step": 294010 + }, + { + "epoch": 1.1191126877431241, + "grad_norm": 0.17617790400981903, + "learning_rate": 0.00012350298805966604, + "loss": 1.9805, + "step": 294020 + }, + { + "epoch": 1.1191507502112468, + "grad_norm": 0.16275504231452942, + "learning_rate": 0.0001234476574675618, + "loss": 1.978, + "step": 294030 + }, + { + "epoch": 1.1191888126793694, + "grad_norm": 0.17957115173339844, + "learning_rate": 0.00012339233500453906, + "loss": 1.996, + "step": 294040 + }, + { + "epoch": 1.119226875147492, + "grad_norm": 0.1702452450990677, + "learning_rate": 0.0001233370206670159, + "loss": 1.9915, + "step": 294050 + }, + { + "epoch": 1.1192649376156147, + "grad_norm": 0.17218199372291565, + "learning_rate": 0.0001232817144514131, + "loss": 1.9914, + "step": 294060 + }, + { + "epoch": 1.1193030000837374, + "grad_norm": 0.15313132107257843, + "learning_rate": 0.000123226416354154, + "loss": 1.9918, + "step": 294070 + }, + { + "epoch": 1.11934106255186, + "grad_norm": 0.15285362303256989, + "learning_rate": 0.00012317112637166456, + "loss": 1.9765, + "step": 294080 + }, + { + "epoch": 1.1193791250199827, + "grad_norm": 0.22781339287757874, + "learning_rate": 0.00012311584450037344, + "loss": 1.9786, + "step": 294090 + }, + { + "epoch": 1.1194171874881054, + "grad_norm": 0.18625149130821228, + "learning_rate": 0.00012306057073671196, + "loss": 1.9889, + "step": 294100 + }, + { + "epoch": 1.1194552499562282, + "grad_norm": 0.1693805605173111, + "learning_rate": 0.00012300530507711383, + "loss": 1.9917, + "step": 294110 + }, + { + "epoch": 1.119493312424351, + "grad_norm": 0.1493048220872879, + "learning_rate": 0.00012295004751801565, + "loss": 1.9787, + "step": 294120 + }, + { + "epoch": 1.1195313748924736, + "grad_norm": 0.23088662326335907, + "learning_rate": 0.00012289479805585642, + "loss": 1.9755, + "step": 294130 + }, + { + "epoch": 1.1195694373605962, + "grad_norm": 0.17690658569335938, + "learning_rate": 0.0001228395566870778, + "loss": 1.9939, + "step": 294140 + }, + { + "epoch": 1.1196074998287189, + "grad_norm": 0.16235333681106567, + "learning_rate": 0.00012278432340812412, + "loss": 1.9822, + "step": 294150 + }, + { + "epoch": 1.1196455622968415, + "grad_norm": 0.17188286781311035, + "learning_rate": 0.00012272909821544237, + "loss": 1.9945, + "step": 294160 + }, + { + "epoch": 1.1196836247649642, + "grad_norm": 0.21481271088123322, + "learning_rate": 0.00012267388110548188, + "loss": 1.9845, + "step": 294170 + }, + { + "epoch": 1.1197216872330868, + "grad_norm": 0.15324686467647552, + "learning_rate": 0.0001226186720746949, + "loss": 1.979, + "step": 294180 + }, + { + "epoch": 1.1197597497012097, + "grad_norm": 0.17416979372501373, + "learning_rate": 0.00012256347111953603, + "loss": 1.983, + "step": 294190 + }, + { + "epoch": 1.1197978121693324, + "grad_norm": 0.16954316198825836, + "learning_rate": 0.00012250827823646256, + "loss": 1.9834, + "step": 294200 + }, + { + "epoch": 1.119835874637455, + "grad_norm": 0.1516610085964203, + "learning_rate": 0.0001224530934219343, + "loss": 1.9998, + "step": 294210 + }, + { + "epoch": 1.1198739371055777, + "grad_norm": 0.1607905924320221, + "learning_rate": 0.00012239791667241384, + "loss": 1.9864, + "step": 294220 + }, + { + "epoch": 1.1199119995737004, + "grad_norm": 0.1633176952600479, + "learning_rate": 0.00012234274798436612, + "loss": 1.9747, + "step": 294230 + }, + { + "epoch": 1.119950062041823, + "grad_norm": 0.16344870626926422, + "learning_rate": 0.00012228758735425882, + "loss": 1.9829, + "step": 294240 + }, + { + "epoch": 1.1199881245099457, + "grad_norm": 0.14256510138511658, + "learning_rate": 0.00012223243477856212, + "loss": 1.9633, + "step": 294250 + }, + { + "epoch": 1.1200261869780683, + "grad_norm": 0.2190176248550415, + "learning_rate": 0.0001221772902537488, + "loss": 1.9703, + "step": 294260 + }, + { + "epoch": 1.120064249446191, + "grad_norm": 0.1784692406654358, + "learning_rate": 0.00012212215377629414, + "loss": 1.9881, + "step": 294270 + }, + { + "epoch": 1.1201023119143139, + "grad_norm": 0.19331470131874084, + "learning_rate": 0.00012206702534267622, + "loss": 1.9846, + "step": 294280 + }, + { + "epoch": 1.1201403743824365, + "grad_norm": 0.19215671718120575, + "learning_rate": 0.00012201190494937541, + "loss": 1.9854, + "step": 294290 + }, + { + "epoch": 1.1201784368505592, + "grad_norm": 0.17129290103912354, + "learning_rate": 0.00012195679259287485, + "loss": 1.9964, + "step": 294300 + }, + { + "epoch": 1.1202164993186818, + "grad_norm": 0.247706338763237, + "learning_rate": 0.00012190168826966003, + "loss": 1.9791, + "step": 294310 + }, + { + "epoch": 1.1202545617868045, + "grad_norm": 0.23695670068264008, + "learning_rate": 0.00012184659197621928, + "loss": 1.9807, + "step": 294320 + }, + { + "epoch": 1.1202926242549271, + "grad_norm": 0.202765554189682, + "learning_rate": 0.00012179150370904319, + "loss": 1.9836, + "step": 294330 + }, + { + "epoch": 1.1203306867230498, + "grad_norm": 0.15483033657073975, + "learning_rate": 0.00012173642346462521, + "loss": 1.9949, + "step": 294340 + }, + { + "epoch": 1.1203687491911725, + "grad_norm": 0.1478613317012787, + "learning_rate": 0.00012168135123946112, + "loss": 1.9832, + "step": 294350 + }, + { + "epoch": 1.1204068116592953, + "grad_norm": 0.19954794645309448, + "learning_rate": 0.00012162628703004924, + "loss": 1.9974, + "step": 294360 + }, + { + "epoch": 1.120444874127418, + "grad_norm": 0.20727083086967468, + "learning_rate": 0.00012157123083289068, + "loss": 1.9894, + "step": 294370 + }, + { + "epoch": 1.1204829365955407, + "grad_norm": 0.22470323741436005, + "learning_rate": 0.00012151618264448877, + "loss": 1.9849, + "step": 294380 + }, + { + "epoch": 1.1205209990636633, + "grad_norm": 0.17775776982307434, + "learning_rate": 0.0001214611424613496, + "loss": 1.9869, + "step": 294390 + }, + { + "epoch": 1.120559061531786, + "grad_norm": 0.19694171845912933, + "learning_rate": 0.00012140611027998177, + "loss": 1.9774, + "step": 294400 + }, + { + "epoch": 1.1205971239999086, + "grad_norm": 0.159882590174675, + "learning_rate": 0.00012135108609689632, + "loss": 1.9881, + "step": 294410 + }, + { + "epoch": 1.1206351864680313, + "grad_norm": 0.17960825562477112, + "learning_rate": 0.00012129606990860703, + "loss": 1.9956, + "step": 294420 + }, + { + "epoch": 1.120673248936154, + "grad_norm": 0.1673920601606369, + "learning_rate": 0.0001212410617116299, + "loss": 1.9874, + "step": 294430 + }, + { + "epoch": 1.1207113114042766, + "grad_norm": 0.1664450317621231, + "learning_rate": 0.00012118606150248378, + "loss": 1.9838, + "step": 294440 + }, + { + "epoch": 1.1207493738723995, + "grad_norm": 0.18804626166820526, + "learning_rate": 0.0001211310692776898, + "loss": 1.9796, + "step": 294450 + }, + { + "epoch": 1.1207874363405221, + "grad_norm": 0.16094692051410675, + "learning_rate": 0.00012107608503377182, + "loss": 1.9809, + "step": 294460 + }, + { + "epoch": 1.1208254988086448, + "grad_norm": 0.2077358514070511, + "learning_rate": 0.00012102110876725603, + "loss": 1.9958, + "step": 294470 + }, + { + "epoch": 1.1208635612767675, + "grad_norm": 0.1666060984134674, + "learning_rate": 0.00012096614047467125, + "loss": 1.9916, + "step": 294480 + }, + { + "epoch": 1.12090162374489, + "grad_norm": 0.16506783664226532, + "learning_rate": 0.00012091118015254876, + "loss": 1.9917, + "step": 294490 + }, + { + "epoch": 1.1209396862130128, + "grad_norm": 0.156228706240654, + "learning_rate": 0.0001208562277974225, + "loss": 1.9862, + "step": 294500 + }, + { + "epoch": 1.1209777486811354, + "grad_norm": 0.2048158347606659, + "learning_rate": 0.00012080128340582863, + "loss": 1.9775, + "step": 294510 + }, + { + "epoch": 1.121015811149258, + "grad_norm": 0.15326504409313202, + "learning_rate": 0.00012074634697430619, + "loss": 1.9846, + "step": 294520 + }, + { + "epoch": 1.121053873617381, + "grad_norm": 0.15472504496574402, + "learning_rate": 0.00012069141849939646, + "loss": 1.9914, + "step": 294530 + }, + { + "epoch": 1.1210919360855036, + "grad_norm": 0.15494997799396515, + "learning_rate": 0.00012063649797764325, + "loss": 1.9635, + "step": 294540 + }, + { + "epoch": 1.1211299985536263, + "grad_norm": 0.1556081771850586, + "learning_rate": 0.000120581585405593, + "loss": 2.0011, + "step": 294550 + }, + { + "epoch": 1.121168061021749, + "grad_norm": 0.15699365735054016, + "learning_rate": 0.00012052668077979451, + "loss": 1.9982, + "step": 294560 + }, + { + "epoch": 1.1212061234898716, + "grad_norm": 0.21263249218463898, + "learning_rate": 0.00012047178409679909, + "loss": 1.9909, + "step": 294570 + }, + { + "epoch": 1.1212441859579942, + "grad_norm": 0.22214330732822418, + "learning_rate": 0.00012041689535316069, + "loss": 1.984, + "step": 294580 + }, + { + "epoch": 1.121282248426117, + "grad_norm": 0.19332750141620636, + "learning_rate": 0.00012036201454543567, + "loss": 1.9856, + "step": 294590 + }, + { + "epoch": 1.1213203108942396, + "grad_norm": 0.16281723976135254, + "learning_rate": 0.00012030714167018275, + "loss": 1.983, + "step": 294600 + }, + { + "epoch": 1.1213583733623622, + "grad_norm": 0.1823878437280655, + "learning_rate": 0.00012025227672396332, + "loss": 1.9874, + "step": 294610 + }, + { + "epoch": 1.121396435830485, + "grad_norm": 0.150102898478508, + "learning_rate": 0.00012019741970334113, + "loss": 1.9854, + "step": 294620 + }, + { + "epoch": 1.1214344982986078, + "grad_norm": 0.15728197991847992, + "learning_rate": 0.00012014257060488248, + "loss": 1.9612, + "step": 294630 + }, + { + "epoch": 1.1214725607667304, + "grad_norm": 0.19842971861362457, + "learning_rate": 0.00012008772942515617, + "loss": 1.9752, + "step": 294640 + }, + { + "epoch": 1.121510623234853, + "grad_norm": 0.16742175817489624, + "learning_rate": 0.00012003289616073342, + "loss": 1.9868, + "step": 294650 + }, + { + "epoch": 1.1215486857029757, + "grad_norm": 0.17433230578899384, + "learning_rate": 0.0001199780708081879, + "loss": 1.9827, + "step": 294660 + }, + { + "epoch": 1.1215867481710984, + "grad_norm": 0.16636930406093597, + "learning_rate": 0.00011992325336409576, + "loss": 1.9903, + "step": 294670 + }, + { + "epoch": 1.121624810639221, + "grad_norm": 0.17104153335094452, + "learning_rate": 0.00011986844382503575, + "loss": 1.9881, + "step": 294680 + }, + { + "epoch": 1.1216628731073437, + "grad_norm": 0.18843293190002441, + "learning_rate": 0.00011981364218758889, + "loss": 1.9935, + "step": 294690 + }, + { + "epoch": 1.1217009355754666, + "grad_norm": 0.18074916303157806, + "learning_rate": 0.00011975884844833884, + "loss": 1.9796, + "step": 294700 + }, + { + "epoch": 1.1217389980435892, + "grad_norm": 0.16955699026584625, + "learning_rate": 0.00011970406260387162, + "loss": 1.9886, + "step": 294710 + }, + { + "epoch": 1.1217770605117119, + "grad_norm": 0.15711715817451477, + "learning_rate": 0.00011964928465077568, + "loss": 1.9702, + "step": 294720 + }, + { + "epoch": 1.1218151229798345, + "grad_norm": 0.20470353960990906, + "learning_rate": 0.00011959451458564202, + "loss": 1.984, + "step": 294730 + }, + { + "epoch": 1.1218531854479572, + "grad_norm": 0.19052280485630035, + "learning_rate": 0.00011953975240506404, + "loss": 1.9768, + "step": 294740 + }, + { + "epoch": 1.1218912479160799, + "grad_norm": 0.2238382250070572, + "learning_rate": 0.00011948499810563751, + "loss": 1.9749, + "step": 294750 + }, + { + "epoch": 1.1219293103842025, + "grad_norm": 0.20275108516216278, + "learning_rate": 0.00011943025168396093, + "loss": 1.9908, + "step": 294760 + }, + { + "epoch": 1.1219673728523252, + "grad_norm": 0.1481522172689438, + "learning_rate": 0.00011937551313663486, + "loss": 1.9866, + "step": 294770 + }, + { + "epoch": 1.1220054353204478, + "grad_norm": 0.1640080064535141, + "learning_rate": 0.00011932078246026268, + "loss": 1.978, + "step": 294780 + }, + { + "epoch": 1.1220434977885705, + "grad_norm": 0.15450188517570496, + "learning_rate": 0.00011926605965144988, + "loss": 1.9854, + "step": 294790 + }, + { + "epoch": 1.1220815602566934, + "grad_norm": 0.1643836796283722, + "learning_rate": 0.00011921134470680461, + "loss": 1.9925, + "step": 294800 + }, + { + "epoch": 1.122119622724816, + "grad_norm": 0.1798993945121765, + "learning_rate": 0.00011915663762293733, + "loss": 1.9843, + "step": 294810 + }, + { + "epoch": 1.1221576851929387, + "grad_norm": 0.19793778657913208, + "learning_rate": 0.00011910193839646105, + "loss": 1.9928, + "step": 294820 + }, + { + "epoch": 1.1221957476610613, + "grad_norm": 0.16370028257369995, + "learning_rate": 0.00011904724702399117, + "loss": 1.9861, + "step": 294830 + }, + { + "epoch": 1.122233810129184, + "grad_norm": 0.14759381115436554, + "learning_rate": 0.00011899256350214543, + "loss": 1.9916, + "step": 294840 + }, + { + "epoch": 1.1222718725973067, + "grad_norm": 0.17767883837223053, + "learning_rate": 0.00011893788782754417, + "loss": 1.9853, + "step": 294850 + }, + { + "epoch": 1.1223099350654293, + "grad_norm": 0.14556005597114563, + "learning_rate": 0.00011888321999680989, + "loss": 1.9898, + "step": 294860 + }, + { + "epoch": 1.122347997533552, + "grad_norm": 0.15929433703422546, + "learning_rate": 0.00011882856000656783, + "loss": 1.9894, + "step": 294870 + }, + { + "epoch": 1.1223860600016748, + "grad_norm": 0.1680683046579361, + "learning_rate": 0.00011877390785344538, + "loss": 1.9915, + "step": 294880 + }, + { + "epoch": 1.1224241224697975, + "grad_norm": 0.1863175928592682, + "learning_rate": 0.00011871926353407259, + "loss": 1.9944, + "step": 294890 + }, + { + "epoch": 1.1224621849379202, + "grad_norm": 0.18595843017101288, + "learning_rate": 0.00011866462704508168, + "loss": 1.9977, + "step": 294900 + }, + { + "epoch": 1.1225002474060428, + "grad_norm": 0.1734783798456192, + "learning_rate": 0.00011860999838310743, + "loss": 1.9636, + "step": 294910 + }, + { + "epoch": 1.1225383098741655, + "grad_norm": 0.15593071281909943, + "learning_rate": 0.00011855537754478701, + "loss": 1.9946, + "step": 294920 + }, + { + "epoch": 1.1225763723422881, + "grad_norm": 0.1951831728219986, + "learning_rate": 0.00011850076452675995, + "loss": 1.9852, + "step": 294930 + }, + { + "epoch": 1.1226144348104108, + "grad_norm": 0.1786825805902481, + "learning_rate": 0.00011844615932566827, + "loss": 1.9759, + "step": 294940 + }, + { + "epoch": 1.1226524972785334, + "grad_norm": 0.18678231537342072, + "learning_rate": 0.00011839156193815631, + "loss": 1.9857, + "step": 294950 + }, + { + "epoch": 1.122690559746656, + "grad_norm": 0.18317410349845886, + "learning_rate": 0.00011833697236087082, + "loss": 2.0051, + "step": 294960 + }, + { + "epoch": 1.122728622214779, + "grad_norm": 0.16210854053497314, + "learning_rate": 0.00011828239059046109, + "loss": 1.9848, + "step": 294970 + }, + { + "epoch": 1.1227666846829016, + "grad_norm": 0.1885615587234497, + "learning_rate": 0.00011822781662357851, + "loss": 1.9832, + "step": 294980 + }, + { + "epoch": 1.1228047471510243, + "grad_norm": 0.18907485902309418, + "learning_rate": 0.00011817325045687716, + "loss": 1.988, + "step": 294990 + }, + { + "epoch": 1.122842809619147, + "grad_norm": 0.14906156063079834, + "learning_rate": 0.00011811869208701332, + "loss": 1.9745, + "step": 295000 + }, + { + "epoch": 1.1228808720872696, + "grad_norm": 0.1860986202955246, + "learning_rate": 0.00011806414151064581, + "loss": 1.9751, + "step": 295010 + }, + { + "epoch": 1.1229189345553923, + "grad_norm": 0.20608893036842346, + "learning_rate": 0.00011800959872443567, + "loss": 1.9909, + "step": 295020 + }, + { + "epoch": 1.122956997023515, + "grad_norm": 0.17302121222019196, + "learning_rate": 0.00011795506372504644, + "loss": 1.9879, + "step": 295030 + }, + { + "epoch": 1.1229950594916376, + "grad_norm": 0.1496732234954834, + "learning_rate": 0.000117900536509144, + "loss": 1.9784, + "step": 295040 + }, + { + "epoch": 1.1230331219597605, + "grad_norm": 0.2081655114889145, + "learning_rate": 0.00011784601707339665, + "loss": 1.9958, + "step": 295050 + }, + { + "epoch": 1.1230711844278831, + "grad_norm": 0.2021922916173935, + "learning_rate": 0.00011779150541447492, + "loss": 1.9986, + "step": 295060 + }, + { + "epoch": 1.1231092468960058, + "grad_norm": 0.23316898941993713, + "learning_rate": 0.00011773700152905198, + "loss": 1.9816, + "step": 295070 + }, + { + "epoch": 1.1231473093641284, + "grad_norm": 0.2011638879776001, + "learning_rate": 0.00011768250541380315, + "loss": 1.9981, + "step": 295080 + }, + { + "epoch": 1.123185371832251, + "grad_norm": 0.1774028092622757, + "learning_rate": 0.00011762801706540616, + "loss": 1.9899, + "step": 295090 + }, + { + "epoch": 1.1232234343003737, + "grad_norm": 0.181764155626297, + "learning_rate": 0.00011757353648054114, + "loss": 1.9833, + "step": 295100 + }, + { + "epoch": 1.1232614967684964, + "grad_norm": 0.1900632381439209, + "learning_rate": 0.0001175190636558906, + "loss": 1.9869, + "step": 295110 + }, + { + "epoch": 1.123299559236619, + "grad_norm": 0.15032033622264862, + "learning_rate": 0.00011746459858813935, + "loss": 1.9868, + "step": 295120 + }, + { + "epoch": 1.1233376217047417, + "grad_norm": 0.25727778673171997, + "learning_rate": 0.0001174101412739747, + "loss": 1.9942, + "step": 295130 + }, + { + "epoch": 1.1233756841728646, + "grad_norm": 0.17406918108463287, + "learning_rate": 0.00011735569171008609, + "loss": 1.9872, + "step": 295140 + }, + { + "epoch": 1.1234137466409873, + "grad_norm": 0.19947175681591034, + "learning_rate": 0.00011730124989316553, + "loss": 1.9611, + "step": 295150 + }, + { + "epoch": 1.12345180910911, + "grad_norm": 0.23555511236190796, + "learning_rate": 0.00011724681581990726, + "loss": 1.9822, + "step": 295160 + }, + { + "epoch": 1.1234898715772326, + "grad_norm": 0.15338340401649475, + "learning_rate": 0.00011719238948700788, + "loss": 1.9855, + "step": 295170 + }, + { + "epoch": 1.1235279340453552, + "grad_norm": 0.14458732306957245, + "learning_rate": 0.0001171379708911664, + "loss": 1.9884, + "step": 295180 + }, + { + "epoch": 1.1235659965134779, + "grad_norm": 0.16276195645332336, + "learning_rate": 0.00011708356002908416, + "loss": 1.9876, + "step": 295190 + }, + { + "epoch": 1.1236040589816005, + "grad_norm": 0.1536705493927002, + "learning_rate": 0.00011702915689746474, + "loss": 1.9978, + "step": 295200 + }, + { + "epoch": 1.1236421214497232, + "grad_norm": 0.2029026299715042, + "learning_rate": 0.00011697476149301423, + "loss": 1.9782, + "step": 295210 + }, + { + "epoch": 1.123680183917846, + "grad_norm": 0.15842191874980927, + "learning_rate": 0.00011692037381244092, + "loss": 1.987, + "step": 295220 + }, + { + "epoch": 1.1237182463859687, + "grad_norm": 0.17644508183002472, + "learning_rate": 0.00011686599385245549, + "loss": 1.9821, + "step": 295230 + }, + { + "epoch": 1.1237563088540914, + "grad_norm": 0.17277035117149353, + "learning_rate": 0.00011681162160977098, + "loss": 1.9751, + "step": 295240 + }, + { + "epoch": 1.123794371322214, + "grad_norm": 0.1563599556684494, + "learning_rate": 0.00011675725708110268, + "loss": 1.9903, + "step": 295250 + }, + { + "epoch": 1.1238324337903367, + "grad_norm": 0.20365029573440552, + "learning_rate": 0.00011670290026316837, + "loss": 1.9731, + "step": 295260 + }, + { + "epoch": 1.1238704962584594, + "grad_norm": 0.15751436352729797, + "learning_rate": 0.00011664855115268791, + "loss": 1.9847, + "step": 295270 + }, + { + "epoch": 1.123908558726582, + "grad_norm": 0.2296774536371231, + "learning_rate": 0.00011659420974638374, + "loss": 1.9644, + "step": 295280 + }, + { + "epoch": 1.1239466211947047, + "grad_norm": 0.1706577092409134, + "learning_rate": 0.00011653987604098043, + "loss": 1.9926, + "step": 295290 + }, + { + "epoch": 1.1239846836628273, + "grad_norm": 0.16438205540180206, + "learning_rate": 0.00011648555003320493, + "loss": 1.9909, + "step": 295300 + }, + { + "epoch": 1.1240227461309502, + "grad_norm": 0.15377695858478546, + "learning_rate": 0.00011643123171978664, + "loss": 1.9713, + "step": 295310 + }, + { + "epoch": 1.1240608085990729, + "grad_norm": 0.17905206978321075, + "learning_rate": 0.00011637692109745707, + "loss": 1.9679, + "step": 295320 + }, + { + "epoch": 1.1240988710671955, + "grad_norm": 0.17283768951892853, + "learning_rate": 0.0001163226181629502, + "loss": 1.988, + "step": 295330 + }, + { + "epoch": 1.1241369335353182, + "grad_norm": 0.17996124923229218, + "learning_rate": 0.00011626832291300216, + "loss": 1.9979, + "step": 295340 + }, + { + "epoch": 1.1241749960034408, + "grad_norm": 0.21404337882995605, + "learning_rate": 0.00011621403534435155, + "loss": 2.0035, + "step": 295350 + }, + { + "epoch": 1.1242130584715635, + "grad_norm": 0.1598508507013321, + "learning_rate": 0.00011615975545373914, + "loss": 1.9755, + "step": 295360 + }, + { + "epoch": 1.1242511209396862, + "grad_norm": 0.17142625153064728, + "learning_rate": 0.00011610548323790814, + "loss": 1.9922, + "step": 295370 + }, + { + "epoch": 1.1242891834078088, + "grad_norm": 0.15926562249660492, + "learning_rate": 0.00011605121869360402, + "loss": 1.9932, + "step": 295380 + }, + { + "epoch": 1.1243272458759317, + "grad_norm": 0.18492233753204346, + "learning_rate": 0.00011599696181757452, + "loss": 1.9789, + "step": 295390 + }, + { + "epoch": 1.1243653083440543, + "grad_norm": 0.1605922430753708, + "learning_rate": 0.00011594271260656957, + "loss": 1.9902, + "step": 295400 + }, + { + "epoch": 1.124403370812177, + "grad_norm": 0.16567596793174744, + "learning_rate": 0.00011588847105734168, + "loss": 1.9772, + "step": 295410 + }, + { + "epoch": 1.1244414332802997, + "grad_norm": 0.18696080148220062, + "learning_rate": 0.00011583423716664527, + "loss": 1.9932, + "step": 295420 + }, + { + "epoch": 1.1244794957484223, + "grad_norm": 0.16154401004314423, + "learning_rate": 0.00011578001093123746, + "loss": 1.9821, + "step": 295430 + }, + { + "epoch": 1.124517558216545, + "grad_norm": 0.20887313783168793, + "learning_rate": 0.0001157257923478774, + "loss": 1.9735, + "step": 295440 + }, + { + "epoch": 1.1245556206846676, + "grad_norm": 0.23249229788780212, + "learning_rate": 0.00011567158141332651, + "loss": 1.9901, + "step": 295450 + }, + { + "epoch": 1.1245936831527903, + "grad_norm": 0.16391891241073608, + "learning_rate": 0.00011561737812434863, + "loss": 1.9923, + "step": 295460 + }, + { + "epoch": 1.124631745620913, + "grad_norm": 0.1754762977361679, + "learning_rate": 0.0001155631824777098, + "loss": 1.9943, + "step": 295470 + }, + { + "epoch": 1.1246698080890358, + "grad_norm": 0.1715954691171646, + "learning_rate": 0.00011550899447017837, + "loss": 1.968, + "step": 295480 + }, + { + "epoch": 1.1247078705571585, + "grad_norm": 0.15342910587787628, + "learning_rate": 0.00011545481409852493, + "loss": 1.9897, + "step": 295490 + }, + { + "epoch": 1.1247459330252811, + "grad_norm": 0.15989144146442413, + "learning_rate": 0.00011540064135952244, + "loss": 1.9736, + "step": 295500 + }, + { + "epoch": 1.1247839954934038, + "grad_norm": 0.15441451966762543, + "learning_rate": 0.00011534647624994604, + "loss": 1.9855, + "step": 295510 + }, + { + "epoch": 1.1248220579615265, + "grad_norm": 0.1686556190252304, + "learning_rate": 0.0001152923187665731, + "loss": 1.9804, + "step": 295520 + }, + { + "epoch": 1.1248601204296491, + "grad_norm": 0.1646524965763092, + "learning_rate": 0.00011523816890618339, + "loss": 1.9759, + "step": 295530 + }, + { + "epoch": 1.1248981828977718, + "grad_norm": 0.19196270406246185, + "learning_rate": 0.00011518402666555883, + "loss": 1.9846, + "step": 295540 + }, + { + "epoch": 1.1249362453658944, + "grad_norm": 0.15963903069496155, + "learning_rate": 0.00011512989204148366, + "loss": 1.97, + "step": 295550 + }, + { + "epoch": 1.1249743078340173, + "grad_norm": 0.2054601013660431, + "learning_rate": 0.00011507576503074446, + "loss": 1.9729, + "step": 295560 + }, + { + "epoch": 1.12501237030214, + "grad_norm": 0.24047240614891052, + "learning_rate": 0.00011502164563012984, + "loss": 1.9889, + "step": 295570 + }, + { + "epoch": 1.1250504327702626, + "grad_norm": 0.15630276501178741, + "learning_rate": 0.00011496753383643088, + "loss": 1.9876, + "step": 295580 + }, + { + "epoch": 1.1250884952383853, + "grad_norm": 0.17492572963237762, + "learning_rate": 0.00011491342964644086, + "loss": 1.9782, + "step": 295590 + }, + { + "epoch": 1.125126557706508, + "grad_norm": 0.16902922093868256, + "learning_rate": 0.00011485933305695523, + "loss": 1.9842, + "step": 295600 + }, + { + "epoch": 1.1251646201746306, + "grad_norm": 0.22221390902996063, + "learning_rate": 0.00011480524406477183, + "loss": 1.9568, + "step": 295610 + }, + { + "epoch": 1.1252026826427532, + "grad_norm": 0.17057080566883087, + "learning_rate": 0.00011475116266669061, + "loss": 2.005, + "step": 295620 + }, + { + "epoch": 1.125240745110876, + "grad_norm": 0.1738719493150711, + "learning_rate": 0.00011469708885951391, + "loss": 1.9981, + "step": 295630 + }, + { + "epoch": 1.1252788075789986, + "grad_norm": 0.19181248545646667, + "learning_rate": 0.00011464302264004616, + "loss": 1.9821, + "step": 295640 + }, + { + "epoch": 1.1253168700471212, + "grad_norm": 0.1625184714794159, + "learning_rate": 0.00011458896400509412, + "loss": 1.9852, + "step": 295650 + }, + { + "epoch": 1.125354932515244, + "grad_norm": 0.16737158596515656, + "learning_rate": 0.00011453491295146673, + "loss": 1.9703, + "step": 295660 + }, + { + "epoch": 1.1253929949833668, + "grad_norm": 0.1710674911737442, + "learning_rate": 0.0001144808694759753, + "loss": 1.9956, + "step": 295670 + }, + { + "epoch": 1.1254310574514894, + "grad_norm": 0.19023360311985016, + "learning_rate": 0.00011442683357543321, + "loss": 1.981, + "step": 295680 + }, + { + "epoch": 1.125469119919612, + "grad_norm": 0.15628500282764435, + "learning_rate": 0.00011437280524665622, + "loss": 1.9769, + "step": 295690 + }, + { + "epoch": 1.1255071823877347, + "grad_norm": 0.20175419747829437, + "learning_rate": 0.00011431878448646221, + "loss": 1.9766, + "step": 295700 + }, + { + "epoch": 1.1255452448558574, + "grad_norm": 0.24420325458049774, + "learning_rate": 0.0001142647712916713, + "loss": 1.9911, + "step": 295710 + }, + { + "epoch": 1.12558330732398, + "grad_norm": 0.16868871450424194, + "learning_rate": 0.00011421076565910588, + "loss": 1.9839, + "step": 295720 + }, + { + "epoch": 1.125621369792103, + "grad_norm": 0.20664623379707336, + "learning_rate": 0.00011415676758559052, + "loss": 1.9715, + "step": 295730 + }, + { + "epoch": 1.1256594322602256, + "grad_norm": 0.16074231266975403, + "learning_rate": 0.00011410277706795214, + "loss": 1.9817, + "step": 295740 + }, + { + "epoch": 1.1256974947283482, + "grad_norm": 0.16438347101211548, + "learning_rate": 0.00011404879410301966, + "loss": 1.9852, + "step": 295750 + }, + { + "epoch": 1.125735557196471, + "grad_norm": 0.16818535327911377, + "learning_rate": 0.00011399481868762435, + "loss": 1.9846, + "step": 295760 + }, + { + "epoch": 1.1257736196645936, + "grad_norm": 0.1612391471862793, + "learning_rate": 0.00011394085081859973, + "loss": 1.9794, + "step": 295770 + }, + { + "epoch": 1.1258116821327162, + "grad_norm": 0.16969744861125946, + "learning_rate": 0.00011388689049278144, + "loss": 1.9829, + "step": 295780 + }, + { + "epoch": 1.1258497446008389, + "grad_norm": 0.1831132024526596, + "learning_rate": 0.00011383293770700742, + "loss": 2.0053, + "step": 295790 + }, + { + "epoch": 1.1258878070689615, + "grad_norm": 0.17153841257095337, + "learning_rate": 0.00011377899245811773, + "loss": 1.9794, + "step": 295800 + }, + { + "epoch": 1.1259258695370842, + "grad_norm": 0.18936266005039215, + "learning_rate": 0.00011372505474295469, + "loss": 1.974, + "step": 295810 + }, + { + "epoch": 1.1259639320052068, + "grad_norm": 0.173858642578125, + "learning_rate": 0.00011367112455836287, + "loss": 1.9904, + "step": 295820 + }, + { + "epoch": 1.1260019944733297, + "grad_norm": 0.19076798856258392, + "learning_rate": 0.00011361720190118885, + "loss": 1.981, + "step": 295830 + }, + { + "epoch": 1.1260400569414524, + "grad_norm": 0.1642879694700241, + "learning_rate": 0.00011356328676828164, + "loss": 1.9907, + "step": 295840 + }, + { + "epoch": 1.126078119409575, + "grad_norm": 0.21258746087551117, + "learning_rate": 0.00011350937915649235, + "loss": 1.9874, + "step": 295850 + }, + { + "epoch": 1.1261161818776977, + "grad_norm": 0.18977473676204681, + "learning_rate": 0.00011345547906267423, + "loss": 1.9859, + "step": 295860 + }, + { + "epoch": 1.1261542443458203, + "grad_norm": 0.15967296063899994, + "learning_rate": 0.00011340158648368287, + "loss": 1.9887, + "step": 295870 + }, + { + "epoch": 1.126192306813943, + "grad_norm": 0.19567842781543732, + "learning_rate": 0.0001133477014163759, + "loss": 1.9831, + "step": 295880 + }, + { + "epoch": 1.1262303692820657, + "grad_norm": 0.1792905181646347, + "learning_rate": 0.00011329382385761322, + "loss": 1.979, + "step": 295890 + }, + { + "epoch": 1.1262684317501883, + "grad_norm": 0.2360984981060028, + "learning_rate": 0.0001132399538042569, + "loss": 1.9905, + "step": 295900 + }, + { + "epoch": 1.1263064942183112, + "grad_norm": 0.2607705891132355, + "learning_rate": 0.0001131860912531712, + "loss": 1.9877, + "step": 295910 + }, + { + "epoch": 1.1263445566864339, + "grad_norm": 0.19371066987514496, + "learning_rate": 0.00011313223620122254, + "loss": 1.9767, + "step": 295920 + }, + { + "epoch": 1.1263826191545565, + "grad_norm": 0.20442219078540802, + "learning_rate": 0.00011307838864527958, + "loss": 1.993, + "step": 295930 + }, + { + "epoch": 1.1264206816226792, + "grad_norm": 0.1811714768409729, + "learning_rate": 0.00011302454858221306, + "loss": 1.9741, + "step": 295940 + }, + { + "epoch": 1.1264587440908018, + "grad_norm": 0.18447086215019226, + "learning_rate": 0.00011297071600889597, + "loss": 1.9793, + "step": 295950 + }, + { + "epoch": 1.1264968065589245, + "grad_norm": 0.2133498638868332, + "learning_rate": 0.00011291689092220347, + "loss": 1.9918, + "step": 295960 + }, + { + "epoch": 1.1265348690270471, + "grad_norm": 0.18323472142219543, + "learning_rate": 0.00011286307331901286, + "loss": 1.9766, + "step": 295970 + }, + { + "epoch": 1.1265729314951698, + "grad_norm": 0.19975514709949493, + "learning_rate": 0.00011280926319620367, + "loss": 1.9853, + "step": 295980 + }, + { + "epoch": 1.1266109939632925, + "grad_norm": 0.1980314552783966, + "learning_rate": 0.00011275546055065755, + "loss": 1.9823, + "step": 295990 + }, + { + "epoch": 1.1266490564314153, + "grad_norm": 0.1543724536895752, + "learning_rate": 0.0001127016653792583, + "loss": 1.9694, + "step": 296000 + }, + { + "epoch": 1.126687118899538, + "grad_norm": 0.2997741997241974, + "learning_rate": 0.00011264787767889195, + "loss": 1.9968, + "step": 296010 + }, + { + "epoch": 1.1267251813676606, + "grad_norm": 0.1481897085905075, + "learning_rate": 0.00011259409744644656, + "loss": 1.9892, + "step": 296020 + }, + { + "epoch": 1.1267632438357833, + "grad_norm": 0.1712397038936615, + "learning_rate": 0.00011254032467881254, + "loss": 1.9968, + "step": 296030 + }, + { + "epoch": 1.126801306303906, + "grad_norm": 0.18674853444099426, + "learning_rate": 0.00011248655937288233, + "loss": 1.9717, + "step": 296040 + }, + { + "epoch": 1.1268393687720286, + "grad_norm": 0.15560908615589142, + "learning_rate": 0.00011243280152555052, + "loss": 1.9718, + "step": 296050 + }, + { + "epoch": 1.1268774312401513, + "grad_norm": 0.1548231691122055, + "learning_rate": 0.00011237905113371388, + "loss": 1.9717, + "step": 296060 + }, + { + "epoch": 1.126915493708274, + "grad_norm": 0.17277362942695618, + "learning_rate": 0.00011232530819427139, + "loss": 1.9809, + "step": 296070 + }, + { + "epoch": 1.1269535561763968, + "grad_norm": 0.21410466730594635, + "learning_rate": 0.00011227157270412408, + "loss": 1.9721, + "step": 296080 + }, + { + "epoch": 1.1269916186445195, + "grad_norm": 0.1602579653263092, + "learning_rate": 0.0001122178446601752, + "loss": 1.996, + "step": 296090 + }, + { + "epoch": 1.1270296811126421, + "grad_norm": 0.1896025538444519, + "learning_rate": 0.00011216412405933013, + "loss": 1.9844, + "step": 296100 + }, + { + "epoch": 1.1270677435807648, + "grad_norm": 0.17863158881664276, + "learning_rate": 0.00011211041089849639, + "loss": 1.976, + "step": 296110 + }, + { + "epoch": 1.1271058060488874, + "grad_norm": 0.16004763543605804, + "learning_rate": 0.00011205670517458355, + "loss": 1.9752, + "step": 296120 + }, + { + "epoch": 1.12714386851701, + "grad_norm": 0.18769557774066925, + "learning_rate": 0.00011200300688450348, + "loss": 1.9684, + "step": 296130 + }, + { + "epoch": 1.1271819309851328, + "grad_norm": 0.2082507163286209, + "learning_rate": 0.00011194931602517006, + "loss": 1.9913, + "step": 296140 + }, + { + "epoch": 1.1272199934532554, + "grad_norm": 0.24489381909370422, + "learning_rate": 0.00011189563259349939, + "loss": 1.9738, + "step": 296150 + }, + { + "epoch": 1.127258055921378, + "grad_norm": 0.1563321352005005, + "learning_rate": 0.00011184195658640967, + "loss": 1.985, + "step": 296160 + }, + { + "epoch": 1.127296118389501, + "grad_norm": 0.17740443348884583, + "learning_rate": 0.0001117882880008212, + "loss": 1.9926, + "step": 296170 + }, + { + "epoch": 1.1273341808576236, + "grad_norm": 0.19535517692565918, + "learning_rate": 0.00011173462683365648, + "loss": 1.976, + "step": 296180 + }, + { + "epoch": 1.1273722433257463, + "grad_norm": 0.21839255094528198, + "learning_rate": 0.00011168097308184, + "loss": 1.9817, + "step": 296190 + }, + { + "epoch": 1.127410305793869, + "grad_norm": 0.17370064556598663, + "learning_rate": 0.00011162732674229854, + "loss": 1.9826, + "step": 296200 + }, + { + "epoch": 1.1274483682619916, + "grad_norm": 0.20356355607509613, + "learning_rate": 0.00011157368781196092, + "loss": 1.9824, + "step": 296210 + }, + { + "epoch": 1.1274864307301142, + "grad_norm": 0.19404418766498566, + "learning_rate": 0.00011152005628775808, + "loss": 1.9751, + "step": 296220 + }, + { + "epoch": 1.1275244931982369, + "grad_norm": 0.18887370824813843, + "learning_rate": 0.00011146643216662305, + "loss": 1.9801, + "step": 296230 + }, + { + "epoch": 1.1275625556663595, + "grad_norm": 0.1544542908668518, + "learning_rate": 0.00011141281544549104, + "loss": 1.9832, + "step": 296240 + }, + { + "epoch": 1.1276006181344824, + "grad_norm": 0.16381599009037018, + "learning_rate": 0.0001113592061212994, + "loss": 1.9786, + "step": 296250 + }, + { + "epoch": 1.127638680602605, + "grad_norm": 0.15992294251918793, + "learning_rate": 0.00011130560419098745, + "loss": 1.9862, + "step": 296260 + }, + { + "epoch": 1.1276767430707277, + "grad_norm": 0.15825308859348297, + "learning_rate": 0.00011125200965149674, + "loss": 1.9713, + "step": 296270 + }, + { + "epoch": 1.1277148055388504, + "grad_norm": 0.26244789361953735, + "learning_rate": 0.00011119842249977086, + "loss": 1.9804, + "step": 296280 + }, + { + "epoch": 1.127752868006973, + "grad_norm": 0.18329375982284546, + "learning_rate": 0.00011114484273275565, + "loss": 1.9765, + "step": 296290 + }, + { + "epoch": 1.1277909304750957, + "grad_norm": 0.17616413533687592, + "learning_rate": 0.00011109127034739886, + "loss": 1.9673, + "step": 296300 + }, + { + "epoch": 1.1278289929432184, + "grad_norm": 0.173064187169075, + "learning_rate": 0.00011103770534065044, + "loss": 1.9677, + "step": 296310 + }, + { + "epoch": 1.127867055411341, + "grad_norm": 0.15833042562007904, + "learning_rate": 0.00011098414770946247, + "loss": 2.0004, + "step": 296320 + }, + { + "epoch": 1.1279051178794637, + "grad_norm": 0.17509348690509796, + "learning_rate": 0.00011093059745078898, + "loss": 1.9947, + "step": 296330 + }, + { + "epoch": 1.1279431803475863, + "grad_norm": 0.165169358253479, + "learning_rate": 0.0001108770545615863, + "loss": 1.987, + "step": 296340 + }, + { + "epoch": 1.1279812428157092, + "grad_norm": 0.15263719856739044, + "learning_rate": 0.00011082351903881278, + "loss": 1.9635, + "step": 296350 + }, + { + "epoch": 1.1280193052838319, + "grad_norm": 0.15287859737873077, + "learning_rate": 0.00011076999087942874, + "loss": 1.9762, + "step": 296360 + }, + { + "epoch": 1.1280573677519545, + "grad_norm": 0.2017001211643219, + "learning_rate": 0.0001107164700803967, + "loss": 1.9723, + "step": 296370 + }, + { + "epoch": 1.1280954302200772, + "grad_norm": 0.20477700233459473, + "learning_rate": 0.00011066295663868136, + "loss": 1.9905, + "step": 296380 + }, + { + "epoch": 1.1281334926881998, + "grad_norm": 0.14756295084953308, + "learning_rate": 0.00011060945055124927, + "loss": 1.9988, + "step": 296390 + }, + { + "epoch": 1.1281715551563225, + "grad_norm": 0.15950356423854828, + "learning_rate": 0.00011055595181506923, + "loss": 1.9633, + "step": 296400 + }, + { + "epoch": 1.1282096176244452, + "grad_norm": 0.18172836303710938, + "learning_rate": 0.00011050246042711215, + "loss": 1.9882, + "step": 296410 + }, + { + "epoch": 1.128247680092568, + "grad_norm": 0.21200765669345856, + "learning_rate": 0.00011044897638435092, + "loss": 1.9799, + "step": 296420 + }, + { + "epoch": 1.1282857425606907, + "grad_norm": 0.23758922517299652, + "learning_rate": 0.0001103954996837605, + "loss": 1.9862, + "step": 296430 + }, + { + "epoch": 1.1283238050288134, + "grad_norm": 0.19562426209449768, + "learning_rate": 0.00011034203032231804, + "loss": 1.9813, + "step": 296440 + }, + { + "epoch": 1.128361867496936, + "grad_norm": 0.14884260296821594, + "learning_rate": 0.00011028856829700262, + "loss": 1.9836, + "step": 296450 + }, + { + "epoch": 1.1283999299650587, + "grad_norm": 0.1560460478067398, + "learning_rate": 0.00011023511360479548, + "loss": 1.9928, + "step": 296460 + }, + { + "epoch": 1.1284379924331813, + "grad_norm": 0.21366089582443237, + "learning_rate": 0.00011018166624268, + "loss": 2.0003, + "step": 296470 + }, + { + "epoch": 1.128476054901304, + "grad_norm": 0.16234548389911652, + "learning_rate": 0.00011012822620764146, + "loss": 1.9894, + "step": 296480 + }, + { + "epoch": 1.1285141173694266, + "grad_norm": 0.16439791023731232, + "learning_rate": 0.00011007479349666728, + "loss": 1.9939, + "step": 296490 + }, + { + "epoch": 1.1285521798375493, + "grad_norm": 0.19343294203281403, + "learning_rate": 0.00011002136810674701, + "loss": 1.9721, + "step": 296500 + }, + { + "epoch": 1.128590242305672, + "grad_norm": 0.19053199887275696, + "learning_rate": 0.00010996795003487214, + "loss": 1.9727, + "step": 296510 + }, + { + "epoch": 1.1286283047737948, + "grad_norm": 0.17076334357261658, + "learning_rate": 0.0001099145392780363, + "loss": 1.9911, + "step": 296520 + }, + { + "epoch": 1.1286663672419175, + "grad_norm": 0.1734541654586792, + "learning_rate": 0.0001098611358332352, + "loss": 1.9801, + "step": 296530 + }, + { + "epoch": 1.1287044297100401, + "grad_norm": 0.15060538053512573, + "learning_rate": 0.0001098077396974666, + "loss": 1.9822, + "step": 296540 + }, + { + "epoch": 1.1287424921781628, + "grad_norm": 0.1633031666278839, + "learning_rate": 0.00010975435086773017, + "loss": 1.9791, + "step": 296550 + }, + { + "epoch": 1.1287805546462855, + "grad_norm": 0.1746307760477066, + "learning_rate": 0.0001097009693410278, + "loss": 1.9897, + "step": 296560 + }, + { + "epoch": 1.1288186171144081, + "grad_norm": 0.22154556214809418, + "learning_rate": 0.00010964759511436334, + "loss": 1.9744, + "step": 296570 + }, + { + "epoch": 1.1288566795825308, + "grad_norm": 0.18822148442268372, + "learning_rate": 0.0001095942281847428, + "loss": 1.9763, + "step": 296580 + }, + { + "epoch": 1.1288947420506537, + "grad_norm": 0.19326618313789368, + "learning_rate": 0.00010954086854917411, + "loss": 1.982, + "step": 296590 + }, + { + "epoch": 1.1289328045187763, + "grad_norm": 0.17983761429786682, + "learning_rate": 0.00010948751620466729, + "loss": 1.9656, + "step": 296600 + }, + { + "epoch": 1.128970866986899, + "grad_norm": 0.15488919615745544, + "learning_rate": 0.0001094341711482344, + "loss": 1.9861, + "step": 296610 + }, + { + "epoch": 1.1290089294550216, + "grad_norm": 0.15306726098060608, + "learning_rate": 0.00010938083337688954, + "loss": 1.9758, + "step": 296620 + }, + { + "epoch": 1.1290469919231443, + "grad_norm": 0.19177936017513275, + "learning_rate": 0.00010932750288764886, + "loss": 1.9768, + "step": 296630 + }, + { + "epoch": 1.129085054391267, + "grad_norm": 0.1583833545446396, + "learning_rate": 0.00010927417967753056, + "loss": 1.9795, + "step": 296640 + }, + { + "epoch": 1.1291231168593896, + "grad_norm": 0.2055852711200714, + "learning_rate": 0.00010922086374355483, + "loss": 1.9832, + "step": 296650 + }, + { + "epoch": 1.1291611793275123, + "grad_norm": 0.17747201025485992, + "learning_rate": 0.00010916755508274395, + "loss": 1.9769, + "step": 296660 + }, + { + "epoch": 1.129199241795635, + "grad_norm": 0.17510271072387695, + "learning_rate": 0.00010911425369212213, + "loss": 1.9808, + "step": 296670 + }, + { + "epoch": 1.1292373042637576, + "grad_norm": 0.18866494297981262, + "learning_rate": 0.0001090609595687157, + "loss": 1.9882, + "step": 296680 + }, + { + "epoch": 1.1292753667318804, + "grad_norm": 0.16015563905239105, + "learning_rate": 0.00010900767270955303, + "loss": 2.0012, + "step": 296690 + }, + { + "epoch": 1.129313429200003, + "grad_norm": 0.18745210766792297, + "learning_rate": 0.00010895439311166444, + "loss": 1.976, + "step": 296700 + }, + { + "epoch": 1.1293514916681258, + "grad_norm": 0.16483469307422638, + "learning_rate": 0.00010890112077208231, + "loss": 1.9783, + "step": 296710 + }, + { + "epoch": 1.1293895541362484, + "grad_norm": 0.19420553743839264, + "learning_rate": 0.00010884785568784106, + "loss": 1.9935, + "step": 296720 + }, + { + "epoch": 1.129427616604371, + "grad_norm": 0.22112570703029633, + "learning_rate": 0.00010879459785597712, + "loss": 1.9688, + "step": 296730 + }, + { + "epoch": 1.1294656790724937, + "grad_norm": 0.167385995388031, + "learning_rate": 0.00010874134727352891, + "loss": 1.9629, + "step": 296740 + }, + { + "epoch": 1.1295037415406164, + "grad_norm": 0.15777790546417236, + "learning_rate": 0.00010868810393753681, + "loss": 1.9813, + "step": 296750 + }, + { + "epoch": 1.129541804008739, + "grad_norm": 0.15105342864990234, + "learning_rate": 0.00010863486784504334, + "loss": 1.9845, + "step": 296760 + }, + { + "epoch": 1.129579866476862, + "grad_norm": 0.18358340859413147, + "learning_rate": 0.00010858163899309304, + "loss": 1.9745, + "step": 296770 + }, + { + "epoch": 1.1296179289449846, + "grad_norm": 0.1916862279176712, + "learning_rate": 0.00010852841737873231, + "loss": 1.9855, + "step": 296780 + }, + { + "epoch": 1.1296559914131072, + "grad_norm": 0.20552465319633484, + "learning_rate": 0.0001084752029990097, + "loss": 1.9693, + "step": 296790 + }, + { + "epoch": 1.12969405388123, + "grad_norm": 0.20443643629550934, + "learning_rate": 0.00010842199585097567, + "loss": 1.9762, + "step": 296800 + }, + { + "epoch": 1.1297321163493526, + "grad_norm": 0.17776182293891907, + "learning_rate": 0.0001083687959316827, + "loss": 1.9901, + "step": 296810 + }, + { + "epoch": 1.1297701788174752, + "grad_norm": 0.19605980813503265, + "learning_rate": 0.00010831560323818528, + "loss": 1.997, + "step": 296820 + }, + { + "epoch": 1.1298082412855979, + "grad_norm": 0.18584735691547394, + "learning_rate": 0.00010826241776754003, + "loss": 2.0007, + "step": 296830 + }, + { + "epoch": 1.1298463037537205, + "grad_norm": 0.1926882266998291, + "learning_rate": 0.00010820923951680534, + "loss": 1.978, + "step": 296840 + }, + { + "epoch": 1.1298843662218432, + "grad_norm": 0.162827268242836, + "learning_rate": 0.00010815606848304177, + "loss": 1.9698, + "step": 296850 + }, + { + "epoch": 1.129922428689966, + "grad_norm": 0.16366046667099, + "learning_rate": 0.00010810290466331174, + "loss": 1.9902, + "step": 296860 + }, + { + "epoch": 1.1299604911580887, + "grad_norm": 0.17877492308616638, + "learning_rate": 0.00010804974805467976, + "loss": 1.9791, + "step": 296870 + }, + { + "epoch": 1.1299985536262114, + "grad_norm": 0.2411981225013733, + "learning_rate": 0.00010799659865421236, + "loss": 1.9728, + "step": 296880 + }, + { + "epoch": 1.130036616094334, + "grad_norm": 0.16815371811389923, + "learning_rate": 0.00010794345645897791, + "loss": 1.9928, + "step": 296890 + }, + { + "epoch": 1.1300746785624567, + "grad_norm": 0.1610078513622284, + "learning_rate": 0.00010789032146604694, + "loss": 1.9701, + "step": 296900 + }, + { + "epoch": 1.1301127410305793, + "grad_norm": 0.16686779260635376, + "learning_rate": 0.00010783719367249184, + "loss": 1.9837, + "step": 296910 + }, + { + "epoch": 1.130150803498702, + "grad_norm": 0.20752695202827454, + "learning_rate": 0.00010778407307538707, + "loss": 1.9828, + "step": 296920 + }, + { + "epoch": 1.1301888659668247, + "grad_norm": 0.17727501690387726, + "learning_rate": 0.0001077309596718089, + "loss": 1.9746, + "step": 296930 + }, + { + "epoch": 1.1302269284349475, + "grad_norm": 0.22601988911628723, + "learning_rate": 0.00010767785345883585, + "loss": 1.9872, + "step": 296940 + }, + { + "epoch": 1.1302649909030702, + "grad_norm": 0.19532141089439392, + "learning_rate": 0.00010762475443354824, + "loss": 1.975, + "step": 296950 + }, + { + "epoch": 1.1303030533711929, + "grad_norm": 0.2647620439529419, + "learning_rate": 0.00010757166259302831, + "loss": 1.9796, + "step": 296960 + }, + { + "epoch": 1.1303411158393155, + "grad_norm": 0.16196277737617493, + "learning_rate": 0.00010751857793436048, + "loss": 1.9828, + "step": 296970 + }, + { + "epoch": 1.1303791783074382, + "grad_norm": 0.17007310688495636, + "learning_rate": 0.000107465500454631, + "loss": 1.9866, + "step": 296980 + }, + { + "epoch": 1.1304172407755608, + "grad_norm": 0.18671287596225739, + "learning_rate": 0.00010741243015092795, + "loss": 1.9739, + "step": 296990 + }, + { + "epoch": 1.1304553032436835, + "grad_norm": 0.15183429419994354, + "learning_rate": 0.00010735936702034177, + "loss": 1.9822, + "step": 297000 + }, + { + "epoch": 1.1304933657118061, + "grad_norm": 0.1812126636505127, + "learning_rate": 0.00010730631105996452, + "loss": 1.9597, + "step": 297010 + }, + { + "epoch": 1.1305314281799288, + "grad_norm": 0.1727277636528015, + "learning_rate": 0.00010725326226689036, + "loss": 1.9757, + "step": 297020 + }, + { + "epoch": 1.1305694906480517, + "grad_norm": 0.26448723673820496, + "learning_rate": 0.00010720022063821538, + "loss": 1.9662, + "step": 297030 + }, + { + "epoch": 1.1306075531161743, + "grad_norm": 0.17803862690925598, + "learning_rate": 0.00010714718617103769, + "loss": 1.9927, + "step": 297040 + }, + { + "epoch": 1.130645615584297, + "grad_norm": 0.19999749958515167, + "learning_rate": 0.00010709415886245721, + "loss": 1.9591, + "step": 297050 + }, + { + "epoch": 1.1306836780524196, + "grad_norm": 0.15710560977458954, + "learning_rate": 0.00010704113870957605, + "loss": 1.9859, + "step": 297060 + }, + { + "epoch": 1.1307217405205423, + "grad_norm": 0.1491328626871109, + "learning_rate": 0.00010698812570949801, + "loss": 1.9753, + "step": 297070 + }, + { + "epoch": 1.130759802988665, + "grad_norm": 0.15009577572345734, + "learning_rate": 0.00010693511985932908, + "loss": 1.986, + "step": 297080 + }, + { + "epoch": 1.1307978654567876, + "grad_norm": 0.25345826148986816, + "learning_rate": 0.00010688212115617708, + "loss": 1.9726, + "step": 297090 + }, + { + "epoch": 1.1308359279249103, + "grad_norm": 0.1647961139678955, + "learning_rate": 0.00010682912959715174, + "loss": 1.9772, + "step": 297100 + }, + { + "epoch": 1.1308739903930332, + "grad_norm": 0.20859834551811218, + "learning_rate": 0.0001067761451793648, + "loss": 1.9753, + "step": 297110 + }, + { + "epoch": 1.1309120528611558, + "grad_norm": 0.19893114268779755, + "learning_rate": 0.00010672316789992998, + "loss": 1.9791, + "step": 297120 + }, + { + "epoch": 1.1309501153292785, + "grad_norm": 0.15905693173408508, + "learning_rate": 0.00010667019775596287, + "loss": 1.9719, + "step": 297130 + }, + { + "epoch": 1.1309881777974011, + "grad_norm": 0.17043165862560272, + "learning_rate": 0.00010661723474458113, + "loss": 1.9805, + "step": 297140 + }, + { + "epoch": 1.1310262402655238, + "grad_norm": 0.1681310534477234, + "learning_rate": 0.00010656427886290415, + "loss": 1.9964, + "step": 297150 + }, + { + "epoch": 1.1310643027336464, + "grad_norm": 0.16626735031604767, + "learning_rate": 0.00010651133010805341, + "loss": 1.9854, + "step": 297160 + }, + { + "epoch": 1.131102365201769, + "grad_norm": 0.15672798454761505, + "learning_rate": 0.00010645838847715228, + "loss": 1.9995, + "step": 297170 + }, + { + "epoch": 1.1311404276698918, + "grad_norm": 0.16240811347961426, + "learning_rate": 0.00010640545396732609, + "loss": 1.9765, + "step": 297180 + }, + { + "epoch": 1.1311784901380144, + "grad_norm": 0.1623651087284088, + "learning_rate": 0.0001063525265757021, + "loss": 1.984, + "step": 297190 + }, + { + "epoch": 1.131216552606137, + "grad_norm": 0.17290998995304108, + "learning_rate": 0.00010629960629940943, + "loss": 1.9954, + "step": 297200 + }, + { + "epoch": 1.13125461507426, + "grad_norm": 0.1701308637857437, + "learning_rate": 0.00010624669313557927, + "loss": 1.9883, + "step": 297210 + }, + { + "epoch": 1.1312926775423826, + "grad_norm": 0.15361827611923218, + "learning_rate": 0.00010619378708134464, + "loss": 1.9643, + "step": 297220 + }, + { + "epoch": 1.1313307400105053, + "grad_norm": 0.22033323347568512, + "learning_rate": 0.00010614088813384043, + "loss": 1.985, + "step": 297230 + }, + { + "epoch": 1.131368802478628, + "grad_norm": 0.20703737437725067, + "learning_rate": 0.00010608799629020355, + "loss": 1.9874, + "step": 297240 + }, + { + "epoch": 1.1314068649467506, + "grad_norm": 0.224261075258255, + "learning_rate": 0.00010603511154757289, + "loss": 1.9673, + "step": 297250 + }, + { + "epoch": 1.1314449274148732, + "grad_norm": 0.1555107980966568, + "learning_rate": 0.00010598223390308909, + "loss": 1.9845, + "step": 297260 + }, + { + "epoch": 1.131482989882996, + "grad_norm": 0.16893711686134338, + "learning_rate": 0.0001059293633538948, + "loss": 1.9947, + "step": 297270 + }, + { + "epoch": 1.1315210523511188, + "grad_norm": 0.15874548256397247, + "learning_rate": 0.0001058764998971346, + "loss": 1.9789, + "step": 297280 + }, + { + "epoch": 1.1315591148192414, + "grad_norm": 0.157329261302948, + "learning_rate": 0.000105823643529955, + "loss": 1.9629, + "step": 297290 + }, + { + "epoch": 1.131597177287364, + "grad_norm": 0.19571171700954437, + "learning_rate": 0.00010577079424950431, + "loss": 1.9684, + "step": 297300 + }, + { + "epoch": 1.1316352397554867, + "grad_norm": 0.22006621956825256, + "learning_rate": 0.00010571795205293289, + "loss": 1.9703, + "step": 297310 + }, + { + "epoch": 1.1316733022236094, + "grad_norm": 0.16492506861686707, + "learning_rate": 0.00010566511693739294, + "loss": 1.9637, + "step": 297320 + }, + { + "epoch": 1.131711364691732, + "grad_norm": 0.14963102340698242, + "learning_rate": 0.00010561228890003854, + "loss": 1.981, + "step": 297330 + }, + { + "epoch": 1.1317494271598547, + "grad_norm": 0.1697867512702942, + "learning_rate": 0.0001055594679380258, + "loss": 1.9744, + "step": 297340 + }, + { + "epoch": 1.1317874896279774, + "grad_norm": 0.2040063887834549, + "learning_rate": 0.00010550665404851251, + "loss": 1.9844, + "step": 297350 + }, + { + "epoch": 1.1318255520961, + "grad_norm": 0.1494297832250595, + "learning_rate": 0.00010545384722865859, + "loss": 1.9817, + "step": 297360 + }, + { + "epoch": 1.1318636145642227, + "grad_norm": 0.18378987908363342, + "learning_rate": 0.00010540104747562578, + "loss": 1.9842, + "step": 297370 + }, + { + "epoch": 1.1319016770323456, + "grad_norm": 0.14976900815963745, + "learning_rate": 0.00010534825478657767, + "loss": 1.9872, + "step": 297380 + }, + { + "epoch": 1.1319397395004682, + "grad_norm": 0.1700267791748047, + "learning_rate": 0.00010529546915867983, + "loss": 1.9756, + "step": 297390 + }, + { + "epoch": 1.1319778019685909, + "grad_norm": 0.163448765873909, + "learning_rate": 0.00010524269058909964, + "loss": 1.9755, + "step": 297400 + }, + { + "epoch": 1.1320158644367135, + "grad_norm": 0.1536484956741333, + "learning_rate": 0.00010518991907500635, + "loss": 1.9725, + "step": 297410 + }, + { + "epoch": 1.1320539269048362, + "grad_norm": 0.18388310074806213, + "learning_rate": 0.00010513715461357132, + "loss": 1.987, + "step": 297420 + }, + { + "epoch": 1.1320919893729589, + "grad_norm": 0.25140684843063354, + "learning_rate": 0.00010508439720196755, + "loss": 1.9676, + "step": 297430 + }, + { + "epoch": 1.1321300518410815, + "grad_norm": 0.17318034172058105, + "learning_rate": 0.00010503164683737005, + "loss": 1.9887, + "step": 297440 + }, + { + "epoch": 1.1321681143092044, + "grad_norm": 0.22555945813655853, + "learning_rate": 0.0001049789035169556, + "loss": 1.9752, + "step": 297450 + }, + { + "epoch": 1.132206176777327, + "grad_norm": 0.1887245774269104, + "learning_rate": 0.00010492616723790311, + "loss": 1.9603, + "step": 297460 + }, + { + "epoch": 1.1322442392454497, + "grad_norm": 0.17808274924755096, + "learning_rate": 0.00010487343799739302, + "loss": 1.982, + "step": 297470 + }, + { + "epoch": 1.1322823017135724, + "grad_norm": 0.1736079901456833, + "learning_rate": 0.00010482071579260805, + "loss": 1.9716, + "step": 297480 + }, + { + "epoch": 1.132320364181695, + "grad_norm": 0.1533351093530655, + "learning_rate": 0.00010476800062073244, + "loss": 1.9908, + "step": 297490 + }, + { + "epoch": 1.1323584266498177, + "grad_norm": 0.1535065919160843, + "learning_rate": 0.00010471529247895256, + "loss": 1.967, + "step": 297500 + }, + { + "epoch": 1.1323964891179403, + "grad_norm": 0.1636301577091217, + "learning_rate": 0.00010466259136445654, + "loss": 1.9878, + "step": 297510 + }, + { + "epoch": 1.132434551586063, + "grad_norm": 0.15821027755737305, + "learning_rate": 0.00010460989727443437, + "loss": 1.9884, + "step": 297520 + }, + { + "epoch": 1.1324726140541856, + "grad_norm": 0.14496588706970215, + "learning_rate": 0.000104557210206078, + "loss": 1.9745, + "step": 297530 + }, + { + "epoch": 1.1325106765223083, + "grad_norm": 0.22977522015571594, + "learning_rate": 0.00010450453015658107, + "loss": 1.9829, + "step": 297540 + }, + { + "epoch": 1.1325487389904312, + "grad_norm": 0.21885627508163452, + "learning_rate": 0.00010445185712313937, + "loss": 1.9794, + "step": 297550 + }, + { + "epoch": 1.1325868014585538, + "grad_norm": 0.1769750565290451, + "learning_rate": 0.00010439919110295037, + "loss": 1.9772, + "step": 297560 + }, + { + "epoch": 1.1326248639266765, + "grad_norm": 0.16426600515842438, + "learning_rate": 0.00010434653209321331, + "loss": 1.9725, + "step": 297570 + }, + { + "epoch": 1.1326629263947992, + "grad_norm": 0.26224613189697266, + "learning_rate": 0.00010429388009112961, + "loss": 1.9706, + "step": 297580 + }, + { + "epoch": 1.1327009888629218, + "grad_norm": 0.1549231857061386, + "learning_rate": 0.00010424123509390221, + "loss": 1.9857, + "step": 297590 + }, + { + "epoch": 1.1327390513310445, + "grad_norm": 0.1712195724248886, + "learning_rate": 0.00010418859709873607, + "loss": 1.9808, + "step": 297600 + }, + { + "epoch": 1.1327771137991671, + "grad_norm": 0.16183438897132874, + "learning_rate": 0.0001041359661028381, + "loss": 1.9599, + "step": 297610 + }, + { + "epoch": 1.1328151762672898, + "grad_norm": 0.17860190570354462, + "learning_rate": 0.00010408334210341691, + "loss": 1.9769, + "step": 297620 + }, + { + "epoch": 1.1328532387354127, + "grad_norm": 0.17467913031578064, + "learning_rate": 0.00010403072509768302, + "loss": 1.9814, + "step": 297630 + }, + { + "epoch": 1.1328913012035353, + "grad_norm": 0.1577252298593521, + "learning_rate": 0.00010397811508284882, + "loss": 1.979, + "step": 297640 + }, + { + "epoch": 1.132929363671658, + "grad_norm": 0.15403807163238525, + "learning_rate": 0.00010392551205612854, + "loss": 1.9672, + "step": 297650 + }, + { + "epoch": 1.1329674261397806, + "grad_norm": 0.16157092154026031, + "learning_rate": 0.0001038729160147382, + "loss": 1.9873, + "step": 297660 + }, + { + "epoch": 1.1330054886079033, + "grad_norm": 0.1798163652420044, + "learning_rate": 0.00010382032695589577, + "loss": 1.9807, + "step": 297670 + }, + { + "epoch": 1.133043551076026, + "grad_norm": 0.1702636331319809, + "learning_rate": 0.000103767744876821, + "loss": 1.9789, + "step": 297680 + }, + { + "epoch": 1.1330816135441486, + "grad_norm": 0.1928313672542572, + "learning_rate": 0.00010371516977473555, + "loss": 1.9817, + "step": 297690 + }, + { + "epoch": 1.1331196760122713, + "grad_norm": 0.22674444317817688, + "learning_rate": 0.00010366260164686286, + "loss": 1.9741, + "step": 297700 + }, + { + "epoch": 1.133157738480394, + "grad_norm": 0.19050756096839905, + "learning_rate": 0.00010361004049042815, + "loss": 1.975, + "step": 297710 + }, + { + "epoch": 1.1331958009485168, + "grad_norm": 0.19661086797714233, + "learning_rate": 0.00010355748630265865, + "loss": 1.96, + "step": 297720 + }, + { + "epoch": 1.1332338634166395, + "grad_norm": 0.16321800649166107, + "learning_rate": 0.00010350493908078334, + "loss": 1.9791, + "step": 297730 + }, + { + "epoch": 1.133271925884762, + "grad_norm": 0.18268156051635742, + "learning_rate": 0.00010345239882203295, + "loss": 1.9836, + "step": 297740 + }, + { + "epoch": 1.1333099883528848, + "grad_norm": 0.2021750658750534, + "learning_rate": 0.00010339986552364023, + "loss": 1.9676, + "step": 297750 + }, + { + "epoch": 1.1333480508210074, + "grad_norm": 0.18211646378040314, + "learning_rate": 0.00010334733918283956, + "loss": 1.9793, + "step": 297760 + }, + { + "epoch": 1.13338611328913, + "grad_norm": 0.19580800831317902, + "learning_rate": 0.00010329481979686728, + "loss": 1.9851, + "step": 297770 + }, + { + "epoch": 1.1334241757572527, + "grad_norm": 0.23755834996700287, + "learning_rate": 0.00010324230736296152, + "loss": 1.9817, + "step": 297780 + }, + { + "epoch": 1.1334622382253754, + "grad_norm": 0.2371157705783844, + "learning_rate": 0.00010318980187836235, + "loss": 1.9978, + "step": 297790 + }, + { + "epoch": 1.1335003006934983, + "grad_norm": 0.2062951624393463, + "learning_rate": 0.0001031373033403114, + "loss": 1.9864, + "step": 297800 + }, + { + "epoch": 1.133538363161621, + "grad_norm": 0.2074883133172989, + "learning_rate": 0.00010308481174605239, + "loss": 1.9832, + "step": 297810 + }, + { + "epoch": 1.1335764256297436, + "grad_norm": 0.21004609763622284, + "learning_rate": 0.00010303232709283072, + "loss": 1.9749, + "step": 297820 + }, + { + "epoch": 1.1336144880978662, + "grad_norm": 0.1587824821472168, + "learning_rate": 0.00010297984937789368, + "loss": 1.9848, + "step": 297830 + }, + { + "epoch": 1.133652550565989, + "grad_norm": 0.1737077683210373, + "learning_rate": 0.00010292737859849027, + "loss": 1.967, + "step": 297840 + }, + { + "epoch": 1.1336906130341116, + "grad_norm": 0.1679849773645401, + "learning_rate": 0.00010287491475187149, + "loss": 1.9777, + "step": 297850 + }, + { + "epoch": 1.1337286755022342, + "grad_norm": 0.16807515919208527, + "learning_rate": 0.00010282245783528998, + "loss": 1.9746, + "step": 297860 + }, + { + "epoch": 1.1337667379703569, + "grad_norm": 0.20174390077590942, + "learning_rate": 0.00010277000784600032, + "loss": 1.9811, + "step": 297870 + }, + { + "epoch": 1.1338048004384795, + "grad_norm": 0.17630982398986816, + "learning_rate": 0.00010271756478125876, + "loss": 1.9681, + "step": 297880 + }, + { + "epoch": 1.1338428629066024, + "grad_norm": 0.1595836579799652, + "learning_rate": 0.00010266512863832355, + "loss": 1.978, + "step": 297890 + }, + { + "epoch": 1.133880925374725, + "grad_norm": 0.17745345830917358, + "learning_rate": 0.0001026126994144545, + "loss": 1.973, + "step": 297900 + }, + { + "epoch": 1.1339189878428477, + "grad_norm": 0.17257341742515564, + "learning_rate": 0.00010256027710691357, + "loss": 1.9759, + "step": 297910 + }, + { + "epoch": 1.1339570503109704, + "grad_norm": 0.17954018712043762, + "learning_rate": 0.00010250786171296417, + "loss": 1.9809, + "step": 297920 + }, + { + "epoch": 1.133995112779093, + "grad_norm": 0.28424471616744995, + "learning_rate": 0.00010245545322987176, + "loss": 1.9967, + "step": 297930 + }, + { + "epoch": 1.1340331752472157, + "grad_norm": 0.17595075070858002, + "learning_rate": 0.00010240305165490349, + "loss": 1.9744, + "step": 297940 + }, + { + "epoch": 1.1340712377153384, + "grad_norm": 0.23913931846618652, + "learning_rate": 0.00010235065698532836, + "loss": 1.9924, + "step": 297950 + }, + { + "epoch": 1.134109300183461, + "grad_norm": 0.16063128411769867, + "learning_rate": 0.00010229826921841707, + "loss": 1.9784, + "step": 297960 + }, + { + "epoch": 1.134147362651584, + "grad_norm": 0.15882925689220428, + "learning_rate": 0.00010224588835144227, + "loss": 1.9737, + "step": 297970 + }, + { + "epoch": 1.1341854251197065, + "grad_norm": 0.1765865981578827, + "learning_rate": 0.00010219351438167829, + "loss": 1.9787, + "step": 297980 + }, + { + "epoch": 1.1342234875878292, + "grad_norm": 0.17888866364955902, + "learning_rate": 0.00010214114730640129, + "loss": 1.9502, + "step": 297990 + }, + { + "epoch": 1.1342615500559519, + "grad_norm": 0.1510838121175766, + "learning_rate": 0.00010208878712288928, + "loss": 1.9698, + "step": 298000 + }, + { + "epoch": 1.1342996125240745, + "grad_norm": 0.1642741858959198, + "learning_rate": 0.00010203643382842193, + "loss": 1.976, + "step": 298010 + }, + { + "epoch": 1.1343376749921972, + "grad_norm": 0.1682412475347519, + "learning_rate": 0.00010198408742028082, + "loss": 1.9792, + "step": 298020 + }, + { + "epoch": 1.1343757374603198, + "grad_norm": 0.2024160772562027, + "learning_rate": 0.00010193174789574922, + "loss": 1.9628, + "step": 298030 + }, + { + "epoch": 1.1344137999284425, + "grad_norm": 0.15808521211147308, + "learning_rate": 0.00010187941525211236, + "loss": 1.9679, + "step": 298040 + }, + { + "epoch": 1.1344518623965651, + "grad_norm": 0.1619788408279419, + "learning_rate": 0.00010182708948665698, + "loss": 1.9777, + "step": 298050 + }, + { + "epoch": 1.1344899248646878, + "grad_norm": 0.16730374097824097, + "learning_rate": 0.00010177477059667185, + "loss": 1.9801, + "step": 298060 + }, + { + "epoch": 1.1345279873328107, + "grad_norm": 0.16122709214687347, + "learning_rate": 0.00010172245857944739, + "loss": 1.951, + "step": 298070 + }, + { + "epoch": 1.1345660498009333, + "grad_norm": 0.21013543009757996, + "learning_rate": 0.00010167015343227582, + "loss": 1.9829, + "step": 298080 + }, + { + "epoch": 1.134604112269056, + "grad_norm": 0.20861048996448517, + "learning_rate": 0.0001016178551524512, + "loss": 1.9743, + "step": 298090 + }, + { + "epoch": 1.1346421747371787, + "grad_norm": 0.19113865494728088, + "learning_rate": 0.00010156556373726933, + "loss": 1.9696, + "step": 298100 + }, + { + "epoch": 1.1346802372053013, + "grad_norm": 0.15994352102279663, + "learning_rate": 0.00010151327918402769, + "loss": 1.9772, + "step": 298110 + }, + { + "epoch": 1.134718299673424, + "grad_norm": 0.17390236258506775, + "learning_rate": 0.00010146100149002568, + "loss": 1.9778, + "step": 298120 + }, + { + "epoch": 1.1347563621415466, + "grad_norm": 0.16655398905277252, + "learning_rate": 0.00010140873065256435, + "loss": 1.9879, + "step": 298130 + }, + { + "epoch": 1.1347944246096695, + "grad_norm": 0.21683935821056366, + "learning_rate": 0.0001013564666689466, + "loss": 1.9837, + "step": 298140 + }, + { + "epoch": 1.1348324870777922, + "grad_norm": 0.2061341404914856, + "learning_rate": 0.00010130420953647712, + "loss": 1.9745, + "step": 298150 + }, + { + "epoch": 1.1348705495459148, + "grad_norm": 0.16226203739643097, + "learning_rate": 0.00010125195925246228, + "loss": 1.9701, + "step": 298160 + }, + { + "epoch": 1.1349086120140375, + "grad_norm": 0.17854997515678406, + "learning_rate": 0.00010119971581421028, + "loss": 1.9734, + "step": 298170 + }, + { + "epoch": 1.1349466744821601, + "grad_norm": 0.15226025879383087, + "learning_rate": 0.000101147479219031, + "loss": 1.9774, + "step": 298180 + }, + { + "epoch": 1.1349847369502828, + "grad_norm": 0.16963133215904236, + "learning_rate": 0.00010109524946423615, + "loss": 1.9851, + "step": 298190 + }, + { + "epoch": 1.1350227994184054, + "grad_norm": 0.1541719138622284, + "learning_rate": 0.0001010430265471392, + "loss": 1.9658, + "step": 298200 + }, + { + "epoch": 1.135060861886528, + "grad_norm": 0.14477787911891937, + "learning_rate": 0.00010099081046505543, + "loss": 1.9773, + "step": 298210 + }, + { + "epoch": 1.1350989243546508, + "grad_norm": 0.17722904682159424, + "learning_rate": 0.00010093860121530168, + "loss": 1.9694, + "step": 298220 + }, + { + "epoch": 1.1351369868227734, + "grad_norm": 0.21424292027950287, + "learning_rate": 0.00010088639879519684, + "loss": 1.9837, + "step": 298230 + }, + { + "epoch": 1.1351750492908963, + "grad_norm": 0.1900191605091095, + "learning_rate": 0.00010083420320206127, + "loss": 1.9801, + "step": 298240 + }, + { + "epoch": 1.135213111759019, + "grad_norm": 0.21574735641479492, + "learning_rate": 0.00010078201443321722, + "loss": 1.9715, + "step": 298250 + }, + { + "epoch": 1.1352511742271416, + "grad_norm": 0.21656334400177002, + "learning_rate": 0.00010072983248598866, + "loss": 1.9911, + "step": 298260 + }, + { + "epoch": 1.1352892366952643, + "grad_norm": 0.2067733258008957, + "learning_rate": 0.0001006776573577014, + "loss": 1.9777, + "step": 298270 + }, + { + "epoch": 1.135327299163387, + "grad_norm": 0.166462704539299, + "learning_rate": 0.00010062548904568286, + "loss": 1.9704, + "step": 298280 + }, + { + "epoch": 1.1353653616315096, + "grad_norm": 0.19629982113838196, + "learning_rate": 0.00010057332754726222, + "loss": 1.9824, + "step": 298290 + }, + { + "epoch": 1.1354034240996322, + "grad_norm": 0.18000733852386475, + "learning_rate": 0.00010052117285977053, + "loss": 1.9789, + "step": 298300 + }, + { + "epoch": 1.1354414865677551, + "grad_norm": 0.1905238926410675, + "learning_rate": 0.00010046902498054051, + "loss": 1.9669, + "step": 298310 + }, + { + "epoch": 1.1354795490358778, + "grad_norm": 0.16512887179851532, + "learning_rate": 0.00010041688390690646, + "loss": 1.994, + "step": 298320 + }, + { + "epoch": 1.1355176115040004, + "grad_norm": 0.21684348583221436, + "learning_rate": 0.00010036474963620473, + "loss": 1.9725, + "step": 298330 + }, + { + "epoch": 1.135555673972123, + "grad_norm": 0.16749443113803864, + "learning_rate": 0.00010031262216577314, + "loss": 1.9656, + "step": 298340 + }, + { + "epoch": 1.1355937364402457, + "grad_norm": 0.20533698797225952, + "learning_rate": 0.00010026050149295146, + "loss": 1.9785, + "step": 298350 + }, + { + "epoch": 1.1356317989083684, + "grad_norm": 0.20634326338768005, + "learning_rate": 0.000100208387615081, + "loss": 1.9695, + "step": 298360 + }, + { + "epoch": 1.135669861376491, + "grad_norm": 0.19714361429214478, + "learning_rate": 0.00010015628052950487, + "loss": 1.9733, + "step": 298370 + }, + { + "epoch": 1.1357079238446137, + "grad_norm": 0.1932590901851654, + "learning_rate": 0.00010010418023356798, + "loss": 1.9822, + "step": 298380 + }, + { + "epoch": 1.1357459863127364, + "grad_norm": 0.1969641149044037, + "learning_rate": 0.0001000520867246169, + "loss": 1.9844, + "step": 298390 + }, + { + "epoch": 1.135784048780859, + "grad_norm": 0.2220790535211563, + "learning_rate": 9.999999999999998e-05, + "loss": 1.9783, + "step": 298400 + }, + { + "epoch": 1.135822111248982, + "grad_norm": 0.18969888985157013, + "learning_rate": 9.994792005706726e-05, + "loss": 1.97, + "step": 298410 + }, + { + "epoch": 1.1358601737171046, + "grad_norm": 0.1967613846063614, + "learning_rate": 9.989584689317044e-05, + "loss": 1.9727, + "step": 298420 + }, + { + "epoch": 1.1358982361852272, + "grad_norm": 0.1870899349451065, + "learning_rate": 9.9843780505663e-05, + "loss": 1.9715, + "step": 298430 + }, + { + "epoch": 1.1359362986533499, + "grad_norm": 0.15287436544895172, + "learning_rate": 9.979172089190025e-05, + "loss": 1.9639, + "step": 298440 + }, + { + "epoch": 1.1359743611214725, + "grad_norm": 0.15644054114818573, + "learning_rate": 9.973966804923906e-05, + "loss": 1.9825, + "step": 298450 + }, + { + "epoch": 1.1360124235895952, + "grad_norm": 0.15644334256649017, + "learning_rate": 9.96876219750381e-05, + "loss": 1.967, + "step": 298460 + }, + { + "epoch": 1.1360504860577179, + "grad_norm": 0.16445119678974152, + "learning_rate": 9.963558266665774e-05, + "loss": 1.9825, + "step": 298470 + }, + { + "epoch": 1.1360885485258405, + "grad_norm": 0.301796019077301, + "learning_rate": 9.958355012146004e-05, + "loss": 1.9884, + "step": 298480 + }, + { + "epoch": 1.1361266109939634, + "grad_norm": 0.1797219067811966, + "learning_rate": 9.953152433680879e-05, + "loss": 1.9829, + "step": 298490 + }, + { + "epoch": 1.136164673462086, + "grad_norm": 0.20839370787143707, + "learning_rate": 9.947950531006944e-05, + "loss": 1.978, + "step": 298500 + }, + { + "epoch": 1.1362027359302087, + "grad_norm": 0.19797562062740326, + "learning_rate": 9.942749303860938e-05, + "loss": 1.9762, + "step": 298510 + }, + { + "epoch": 1.1362407983983314, + "grad_norm": 0.19425931572914124, + "learning_rate": 9.937548751979742e-05, + "loss": 1.9837, + "step": 298520 + }, + { + "epoch": 1.136278860866454, + "grad_norm": 0.17285150289535522, + "learning_rate": 9.932348875100417e-05, + "loss": 1.9877, + "step": 298530 + }, + { + "epoch": 1.1363169233345767, + "grad_norm": 0.29804208874702454, + "learning_rate": 9.927149672960206e-05, + "loss": 1.9629, + "step": 298540 + }, + { + "epoch": 1.1363549858026993, + "grad_norm": 0.23229555785655975, + "learning_rate": 9.921951145296509e-05, + "loss": 1.9564, + "step": 298550 + }, + { + "epoch": 1.136393048270822, + "grad_norm": 0.16856501996517181, + "learning_rate": 9.916753291846891e-05, + "loss": 1.9887, + "step": 298560 + }, + { + "epoch": 1.1364311107389446, + "grad_norm": 0.1554311364889145, + "learning_rate": 9.911556112349118e-05, + "loss": 1.9722, + "step": 298570 + }, + { + "epoch": 1.1364691732070675, + "grad_norm": 0.18315407633781433, + "learning_rate": 9.906359606541088e-05, + "loss": 1.9774, + "step": 298580 + }, + { + "epoch": 1.1365072356751902, + "grad_norm": 0.19487372040748596, + "learning_rate": 9.901163774160893e-05, + "loss": 1.9875, + "step": 298590 + }, + { + "epoch": 1.1365452981433128, + "grad_norm": 0.2191956639289856, + "learning_rate": 9.895968614946788e-05, + "loss": 1.9834, + "step": 298600 + }, + { + "epoch": 1.1365833606114355, + "grad_norm": 0.21160957217216492, + "learning_rate": 9.890774128637187e-05, + "loss": 1.9923, + "step": 298610 + }, + { + "epoch": 1.1366214230795582, + "grad_norm": 0.17996318638324738, + "learning_rate": 9.885580314970694e-05, + "loss": 1.9884, + "step": 298620 + }, + { + "epoch": 1.1366594855476808, + "grad_norm": 0.15935932099819183, + "learning_rate": 9.880387173686067e-05, + "loss": 1.9787, + "step": 298630 + }, + { + "epoch": 1.1366975480158035, + "grad_norm": 0.16337305307388306, + "learning_rate": 9.875194704522239e-05, + "loss": 1.9762, + "step": 298640 + }, + { + "epoch": 1.1367356104839261, + "grad_norm": 0.22064292430877686, + "learning_rate": 9.870002907218311e-05, + "loss": 1.9696, + "step": 298650 + }, + { + "epoch": 1.136773672952049, + "grad_norm": 0.16844485700130463, + "learning_rate": 9.864811781513556e-05, + "loss": 1.9818, + "step": 298660 + }, + { + "epoch": 1.1368117354201717, + "grad_norm": 0.1495898962020874, + "learning_rate": 9.8596213271474e-05, + "loss": 1.9782, + "step": 298670 + }, + { + "epoch": 1.1368497978882943, + "grad_norm": 0.17440442740917206, + "learning_rate": 9.854431543859454e-05, + "loss": 1.9685, + "step": 298680 + }, + { + "epoch": 1.136887860356417, + "grad_norm": 0.18147921562194824, + "learning_rate": 9.849242431389499e-05, + "loss": 1.9949, + "step": 298690 + }, + { + "epoch": 1.1369259228245396, + "grad_norm": 0.17838215827941895, + "learning_rate": 9.844053989477475e-05, + "loss": 1.9868, + "step": 298700 + }, + { + "epoch": 1.1369639852926623, + "grad_norm": 0.20777563750743866, + "learning_rate": 9.838866217863485e-05, + "loss": 1.9595, + "step": 298710 + }, + { + "epoch": 1.137002047760785, + "grad_norm": 0.1972523182630539, + "learning_rate": 9.833679116287819e-05, + "loss": 1.9932, + "step": 298720 + }, + { + "epoch": 1.1370401102289076, + "grad_norm": 0.23394937813282013, + "learning_rate": 9.828492684490914e-05, + "loss": 1.9728, + "step": 298730 + }, + { + "epoch": 1.1370781726970303, + "grad_norm": 0.2772226631641388, + "learning_rate": 9.823306922213388e-05, + "loss": 1.9758, + "step": 298740 + }, + { + "epoch": 1.1371162351651531, + "grad_norm": 0.17437441647052765, + "learning_rate": 9.818121829196019e-05, + "loss": 1.9603, + "step": 298750 + }, + { + "epoch": 1.1371542976332758, + "grad_norm": 0.17861786484718323, + "learning_rate": 9.812937405179761e-05, + "loss": 1.9753, + "step": 298760 + }, + { + "epoch": 1.1371923601013985, + "grad_norm": 0.15384817123413086, + "learning_rate": 9.807753649905726e-05, + "loss": 1.9929, + "step": 298770 + }, + { + "epoch": 1.1372304225695211, + "grad_norm": 0.25518345832824707, + "learning_rate": 9.802570563115192e-05, + "loss": 1.9663, + "step": 298780 + }, + { + "epoch": 1.1372684850376438, + "grad_norm": 0.1736324280500412, + "learning_rate": 9.797388144549613e-05, + "loss": 1.9664, + "step": 298790 + }, + { + "epoch": 1.1373065475057664, + "grad_norm": 0.1534629762172699, + "learning_rate": 9.792206393950609e-05, + "loss": 1.963, + "step": 298800 + }, + { + "epoch": 1.137344609973889, + "grad_norm": 0.22825098037719727, + "learning_rate": 9.787025311059944e-05, + "loss": 1.9605, + "step": 298810 + }, + { + "epoch": 1.1373826724420117, + "grad_norm": 0.26282426714897156, + "learning_rate": 9.78184489561959e-05, + "loss": 1.9622, + "step": 298820 + }, + { + "epoch": 1.1374207349101346, + "grad_norm": 0.16258125007152557, + "learning_rate": 9.77666514737165e-05, + "loss": 1.9895, + "step": 298830 + }, + { + "epoch": 1.1374587973782573, + "grad_norm": 0.17438267171382904, + "learning_rate": 9.771486066058405e-05, + "loss": 1.9749, + "step": 298840 + }, + { + "epoch": 1.13749685984638, + "grad_norm": 0.16162815690040588, + "learning_rate": 9.766307651422295e-05, + "loss": 1.9826, + "step": 298850 + }, + { + "epoch": 1.1375349223145026, + "grad_norm": 0.1707250326871872, + "learning_rate": 9.761129903205951e-05, + "loss": 1.9758, + "step": 298860 + }, + { + "epoch": 1.1375729847826253, + "grad_norm": 0.15322420001029968, + "learning_rate": 9.755952821152125e-05, + "loss": 1.9797, + "step": 298870 + }, + { + "epoch": 1.137611047250748, + "grad_norm": 0.24775369465351105, + "learning_rate": 9.750776405003786e-05, + "loss": 1.9842, + "step": 298880 + }, + { + "epoch": 1.1376491097188706, + "grad_norm": 0.22171650826931, + "learning_rate": 9.745600654504028e-05, + "loss": 1.9615, + "step": 298890 + }, + { + "epoch": 1.1376871721869932, + "grad_norm": 0.19848200678825378, + "learning_rate": 9.740425569396128e-05, + "loss": 1.9709, + "step": 298900 + }, + { + "epoch": 1.1377252346551159, + "grad_norm": 0.18477272987365723, + "learning_rate": 9.735251149423519e-05, + "loss": 1.9779, + "step": 298910 + }, + { + "epoch": 1.1377632971232385, + "grad_norm": 0.1732361912727356, + "learning_rate": 9.730077394329817e-05, + "loss": 1.9699, + "step": 298920 + }, + { + "epoch": 1.1378013595913614, + "grad_norm": 0.22373192012310028, + "learning_rate": 9.724904303858773e-05, + "loss": 1.9723, + "step": 298930 + }, + { + "epoch": 1.137839422059484, + "grad_norm": 0.25084006786346436, + "learning_rate": 9.71973187775434e-05, + "loss": 1.9754, + "step": 298940 + }, + { + "epoch": 1.1378774845276067, + "grad_norm": 0.17961499094963074, + "learning_rate": 9.714560115760601e-05, + "loss": 1.9705, + "step": 298950 + }, + { + "epoch": 1.1379155469957294, + "grad_norm": 0.21094465255737305, + "learning_rate": 9.709389017621817e-05, + "loss": 1.9725, + "step": 298960 + }, + { + "epoch": 1.137953609463852, + "grad_norm": 0.16603122651576996, + "learning_rate": 9.704218583082425e-05, + "loss": 1.9843, + "step": 298970 + }, + { + "epoch": 1.1379916719319747, + "grad_norm": 0.20088063180446625, + "learning_rate": 9.699048811887001e-05, + "loss": 1.9762, + "step": 298980 + }, + { + "epoch": 1.1380297344000974, + "grad_norm": 0.23191094398498535, + "learning_rate": 9.693879703780301e-05, + "loss": 1.9712, + "step": 298990 + }, + { + "epoch": 1.1380677968682202, + "grad_norm": 0.16466926038265228, + "learning_rate": 9.688711258507249e-05, + "loss": 1.9798, + "step": 299000 + }, + { + "epoch": 1.138105859336343, + "grad_norm": 0.2375544011592865, + "learning_rate": 9.683543475812923e-05, + "loss": 1.9771, + "step": 299010 + }, + { + "epoch": 1.1381439218044656, + "grad_norm": 0.2090522199869156, + "learning_rate": 9.678376355442564e-05, + "loss": 1.975, + "step": 299020 + }, + { + "epoch": 1.1381819842725882, + "grad_norm": 0.21409104764461517, + "learning_rate": 9.673209897141577e-05, + "loss": 1.9739, + "step": 299030 + }, + { + "epoch": 1.1382200467407109, + "grad_norm": 0.28223034739494324, + "learning_rate": 9.668044100655537e-05, + "loss": 1.972, + "step": 299040 + }, + { + "epoch": 1.1382581092088335, + "grad_norm": 0.23368722200393677, + "learning_rate": 9.66287896573017e-05, + "loss": 1.9783, + "step": 299050 + }, + { + "epoch": 1.1382961716769562, + "grad_norm": 0.159254789352417, + "learning_rate": 9.65771449211138e-05, + "loss": 1.9837, + "step": 299060 + }, + { + "epoch": 1.1383342341450788, + "grad_norm": 0.19009754061698914, + "learning_rate": 9.652550679545224e-05, + "loss": 1.9702, + "step": 299070 + }, + { + "epoch": 1.1383722966132015, + "grad_norm": 0.19186371564865112, + "learning_rate": 9.647387527777918e-05, + "loss": 1.9598, + "step": 299080 + }, + { + "epoch": 1.1384103590813242, + "grad_norm": 0.19739702343940735, + "learning_rate": 9.642225036555846e-05, + "loss": 1.9891, + "step": 299090 + }, + { + "epoch": 1.138448421549447, + "grad_norm": 0.16868695616722107, + "learning_rate": 9.637063205625552e-05, + "loss": 1.9734, + "step": 299100 + }, + { + "epoch": 1.1384864840175697, + "grad_norm": 0.19652950763702393, + "learning_rate": 9.631902034733747e-05, + "loss": 1.9728, + "step": 299110 + }, + { + "epoch": 1.1385245464856923, + "grad_norm": 0.20640340447425842, + "learning_rate": 9.626741523627302e-05, + "loss": 1.9793, + "step": 299120 + }, + { + "epoch": 1.138562608953815, + "grad_norm": 0.1784249097108841, + "learning_rate": 9.621581672053242e-05, + "loss": 1.9887, + "step": 299130 + }, + { + "epoch": 1.1386006714219377, + "grad_norm": 0.20432287454605103, + "learning_rate": 9.616422479758763e-05, + "loss": 1.9682, + "step": 299140 + }, + { + "epoch": 1.1386387338900603, + "grad_norm": 0.17736558616161346, + "learning_rate": 9.611263946491222e-05, + "loss": 1.9784, + "step": 299150 + }, + { + "epoch": 1.138676796358183, + "grad_norm": 0.16323047876358032, + "learning_rate": 9.606106071998128e-05, + "loss": 1.9779, + "step": 299160 + }, + { + "epoch": 1.1387148588263059, + "grad_norm": 0.1606374979019165, + "learning_rate": 9.600948856027158e-05, + "loss": 1.9792, + "step": 299170 + }, + { + "epoch": 1.1387529212944285, + "grad_norm": 0.16580626368522644, + "learning_rate": 9.595792298326156e-05, + "loss": 1.9712, + "step": 299180 + }, + { + "epoch": 1.1387909837625512, + "grad_norm": 0.1735002100467682, + "learning_rate": 9.590636398643115e-05, + "loss": 1.9833, + "step": 299190 + }, + { + "epoch": 1.1388290462306738, + "grad_norm": 0.1848945915699005, + "learning_rate": 9.585481156726194e-05, + "loss": 1.9876, + "step": 299200 + }, + { + "epoch": 1.1388671086987965, + "grad_norm": 0.1890447437763214, + "learning_rate": 9.580326572323721e-05, + "loss": 1.9624, + "step": 299210 + }, + { + "epoch": 1.1389051711669191, + "grad_norm": 0.1751815676689148, + "learning_rate": 9.575172645184166e-05, + "loss": 1.9613, + "step": 299220 + }, + { + "epoch": 1.1389432336350418, + "grad_norm": 0.1621846705675125, + "learning_rate": 9.570019375056172e-05, + "loss": 1.9691, + "step": 299230 + }, + { + "epoch": 1.1389812961031645, + "grad_norm": 0.19736886024475098, + "learning_rate": 9.56486676168854e-05, + "loss": 1.9573, + "step": 299240 + }, + { + "epoch": 1.139019358571287, + "grad_norm": 0.17240135371685028, + "learning_rate": 9.559714804830239e-05, + "loss": 1.9839, + "step": 299250 + }, + { + "epoch": 1.1390574210394098, + "grad_norm": 0.16781572997570038, + "learning_rate": 9.554563504230378e-05, + "loss": 1.987, + "step": 299260 + }, + { + "epoch": 1.1390954835075326, + "grad_norm": 0.1575121283531189, + "learning_rate": 9.549412859638246e-05, + "loss": 1.9769, + "step": 299270 + }, + { + "epoch": 1.1391335459756553, + "grad_norm": 0.16098329424858093, + "learning_rate": 9.544262870803272e-05, + "loss": 1.9785, + "step": 299280 + }, + { + "epoch": 1.139171608443778, + "grad_norm": 0.14912086725234985, + "learning_rate": 9.539113537475064e-05, + "loss": 1.9649, + "step": 299290 + }, + { + "epoch": 1.1392096709119006, + "grad_norm": 0.24539200961589813, + "learning_rate": 9.533964859403388e-05, + "loss": 1.9819, + "step": 299300 + }, + { + "epoch": 1.1392477333800233, + "grad_norm": 0.15809039771556854, + "learning_rate": 9.528816836338145e-05, + "loss": 1.9839, + "step": 299310 + }, + { + "epoch": 1.139285795848146, + "grad_norm": 0.201808899641037, + "learning_rate": 9.523669468029428e-05, + "loss": 1.973, + "step": 299320 + }, + { + "epoch": 1.1393238583162686, + "grad_norm": 0.1430976837873459, + "learning_rate": 9.51852275422746e-05, + "loss": 1.9703, + "step": 299330 + }, + { + "epoch": 1.1393619207843912, + "grad_norm": 0.20181354880332947, + "learning_rate": 9.51337669468264e-05, + "loss": 1.9741, + "step": 299340 + }, + { + "epoch": 1.1393999832525141, + "grad_norm": 0.20010481774806976, + "learning_rate": 9.508231289145525e-05, + "loss": 1.965, + "step": 299350 + }, + { + "epoch": 1.1394380457206368, + "grad_norm": 0.27025026082992554, + "learning_rate": 9.503086537366824e-05, + "loss": 1.973, + "step": 299360 + }, + { + "epoch": 1.1394761081887594, + "grad_norm": 0.24568350613117218, + "learning_rate": 9.497942439097412e-05, + "loss": 1.9673, + "step": 299370 + }, + { + "epoch": 1.139514170656882, + "grad_norm": 0.18625633418560028, + "learning_rate": 9.492798994088309e-05, + "loss": 1.9752, + "step": 299380 + }, + { + "epoch": 1.1395522331250048, + "grad_norm": 0.21505199372768402, + "learning_rate": 9.487656202090706e-05, + "loss": 1.9768, + "step": 299390 + }, + { + "epoch": 1.1395902955931274, + "grad_norm": 0.17063318192958832, + "learning_rate": 9.48251406285594e-05, + "loss": 1.9715, + "step": 299400 + }, + { + "epoch": 1.13962835806125, + "grad_norm": 0.15697281062602997, + "learning_rate": 9.477372576135524e-05, + "loss": 1.9826, + "step": 299410 + }, + { + "epoch": 1.1396664205293727, + "grad_norm": 0.1719808429479599, + "learning_rate": 9.472231741681109e-05, + "loss": 1.9865, + "step": 299420 + }, + { + "epoch": 1.1397044829974954, + "grad_norm": 0.15887048840522766, + "learning_rate": 9.467091559244523e-05, + "loss": 1.9684, + "step": 299430 + }, + { + "epoch": 1.1397425454656183, + "grad_norm": 0.1654588282108307, + "learning_rate": 9.461952028577731e-05, + "loss": 1.9778, + "step": 299440 + }, + { + "epoch": 1.139780607933741, + "grad_norm": 0.19598272442817688, + "learning_rate": 9.456813149432863e-05, + "loss": 1.9752, + "step": 299450 + }, + { + "epoch": 1.1398186704018636, + "grad_norm": 0.17805804312229156, + "learning_rate": 9.451674921562215e-05, + "loss": 1.9555, + "step": 299460 + }, + { + "epoch": 1.1398567328699862, + "grad_norm": 0.1505025327205658, + "learning_rate": 9.446537344718226e-05, + "loss": 1.964, + "step": 299470 + }, + { + "epoch": 1.139894795338109, + "grad_norm": 0.18607187271118164, + "learning_rate": 9.441400418653506e-05, + "loss": 1.9841, + "step": 299480 + }, + { + "epoch": 1.1399328578062315, + "grad_norm": 0.15925636887550354, + "learning_rate": 9.436264143120809e-05, + "loss": 1.9809, + "step": 299490 + }, + { + "epoch": 1.1399709202743542, + "grad_norm": 0.1464940458536148, + "learning_rate": 9.43112851787305e-05, + "loss": 1.9839, + "step": 299500 + }, + { + "epoch": 1.1400089827424769, + "grad_norm": 0.16832898557186127, + "learning_rate": 9.425993542663302e-05, + "loss": 1.984, + "step": 299510 + }, + { + "epoch": 1.1400470452105997, + "grad_norm": 0.16101309657096863, + "learning_rate": 9.420859217244798e-05, + "loss": 1.9805, + "step": 299520 + }, + { + "epoch": 1.1400851076787224, + "grad_norm": 0.21225060522556305, + "learning_rate": 9.415725541370912e-05, + "loss": 1.9737, + "step": 299530 + }, + { + "epoch": 1.140123170146845, + "grad_norm": 0.19584670662879944, + "learning_rate": 9.410592514795191e-05, + "loss": 1.9933, + "step": 299540 + }, + { + "epoch": 1.1401612326149677, + "grad_norm": 0.1607595980167389, + "learning_rate": 9.405460137271332e-05, + "loss": 1.9671, + "step": 299550 + }, + { + "epoch": 1.1401992950830904, + "grad_norm": 0.17394018173217773, + "learning_rate": 9.400328408553189e-05, + "loss": 1.9726, + "step": 299560 + }, + { + "epoch": 1.140237357551213, + "grad_norm": 0.2028769999742508, + "learning_rate": 9.395197328394761e-05, + "loss": 1.9808, + "step": 299570 + }, + { + "epoch": 1.1402754200193357, + "grad_norm": 0.14995448291301727, + "learning_rate": 9.390066896550214e-05, + "loss": 1.9827, + "step": 299580 + }, + { + "epoch": 1.1403134824874583, + "grad_norm": 0.14870086312294006, + "learning_rate": 9.384937112773866e-05, + "loss": 1.9908, + "step": 299590 + }, + { + "epoch": 1.140351544955581, + "grad_norm": 0.20218494534492493, + "learning_rate": 9.379807976820198e-05, + "loss": 1.9749, + "step": 299600 + }, + { + "epoch": 1.1403896074237039, + "grad_norm": 0.18345682322978973, + "learning_rate": 9.374679488443827e-05, + "loss": 1.988, + "step": 299610 + }, + { + "epoch": 1.1404276698918265, + "grad_norm": 0.181381493806839, + "learning_rate": 9.369551647399544e-05, + "loss": 1.9728, + "step": 299620 + }, + { + "epoch": 1.1404657323599492, + "grad_norm": 0.22997988760471344, + "learning_rate": 9.364424453442277e-05, + "loss": 1.9923, + "step": 299630 + }, + { + "epoch": 1.1405037948280718, + "grad_norm": 0.2116875797510147, + "learning_rate": 9.35929790632713e-05, + "loss": 1.9713, + "step": 299640 + }, + { + "epoch": 1.1405418572961945, + "grad_norm": 0.15564244985580444, + "learning_rate": 9.354172005809336e-05, + "loss": 1.9671, + "step": 299650 + }, + { + "epoch": 1.1405799197643172, + "grad_norm": 0.17033228278160095, + "learning_rate": 9.349046751644308e-05, + "loss": 1.9729, + "step": 299660 + }, + { + "epoch": 1.1406179822324398, + "grad_norm": 0.15838542580604553, + "learning_rate": 9.343922143587602e-05, + "loss": 1.9854, + "step": 299670 + }, + { + "epoch": 1.1406560447005625, + "grad_norm": 0.16979332268238068, + "learning_rate": 9.33879818139492e-05, + "loss": 1.9885, + "step": 299680 + }, + { + "epoch": 1.1406941071686854, + "grad_norm": 1.0191371440887451, + "learning_rate": 9.333674864822123e-05, + "loss": 1.9587, + "step": 299690 + }, + { + "epoch": 1.140732169636808, + "grad_norm": 0.15956035256385803, + "learning_rate": 9.32855219362524e-05, + "loss": 1.9788, + "step": 299700 + }, + { + "epoch": 1.1407702321049307, + "grad_norm": 0.20539459586143494, + "learning_rate": 9.323430167560427e-05, + "loss": 1.969, + "step": 299710 + }, + { + "epoch": 1.1408082945730533, + "grad_norm": 0.1533263623714447, + "learning_rate": 9.318308786384016e-05, + "loss": 1.9509, + "step": 299720 + }, + { + "epoch": 1.140846357041176, + "grad_norm": 0.1617736518383026, + "learning_rate": 9.313188049852484e-05, + "loss": 1.9832, + "step": 299730 + }, + { + "epoch": 1.1408844195092986, + "grad_norm": 0.18019530177116394, + "learning_rate": 9.308067957722466e-05, + "loss": 1.9786, + "step": 299740 + }, + { + "epoch": 1.1409224819774213, + "grad_norm": 0.19964605569839478, + "learning_rate": 9.302948509750736e-05, + "loss": 1.9749, + "step": 299750 + }, + { + "epoch": 1.140960544445544, + "grad_norm": 0.1651037633419037, + "learning_rate": 9.297829705694239e-05, + "loss": 1.9657, + "step": 299760 + }, + { + "epoch": 1.1409986069136666, + "grad_norm": 0.16229411959648132, + "learning_rate": 9.29271154531005e-05, + "loss": 1.9559, + "step": 299770 + }, + { + "epoch": 1.1410366693817893, + "grad_norm": 0.17111137509346008, + "learning_rate": 9.287594028355434e-05, + "loss": 1.9762, + "step": 299780 + }, + { + "epoch": 1.1410747318499121, + "grad_norm": 0.15575209259986877, + "learning_rate": 9.28247715458777e-05, + "loss": 1.9798, + "step": 299790 + }, + { + "epoch": 1.1411127943180348, + "grad_norm": 0.15766337513923645, + "learning_rate": 9.27736092376461e-05, + "loss": 1.9719, + "step": 299800 + }, + { + "epoch": 1.1411508567861575, + "grad_norm": 0.19725461304187775, + "learning_rate": 9.272245335643659e-05, + "loss": 1.981, + "step": 299810 + }, + { + "epoch": 1.1411889192542801, + "grad_norm": 0.17753826081752777, + "learning_rate": 9.267130389982752e-05, + "loss": 1.9767, + "step": 299820 + }, + { + "epoch": 1.1412269817224028, + "grad_norm": 0.234220951795578, + "learning_rate": 9.262016086539904e-05, + "loss": 1.9754, + "step": 299830 + }, + { + "epoch": 1.1412650441905254, + "grad_norm": 0.15564116835594177, + "learning_rate": 9.256902425073272e-05, + "loss": 1.9668, + "step": 299840 + }, + { + "epoch": 1.141303106658648, + "grad_norm": 0.1745157688856125, + "learning_rate": 9.251789405341165e-05, + "loss": 1.9849, + "step": 299850 + }, + { + "epoch": 1.141341169126771, + "grad_norm": 0.237146258354187, + "learning_rate": 9.24667702710203e-05, + "loss": 1.9658, + "step": 299860 + }, + { + "epoch": 1.1413792315948936, + "grad_norm": 0.18115739524364471, + "learning_rate": 9.24156529011449e-05, + "loss": 1.9765, + "step": 299870 + }, + { + "epoch": 1.1414172940630163, + "grad_norm": 0.18322153389453888, + "learning_rate": 9.2364541941373e-05, + "loss": 1.9627, + "step": 299880 + }, + { + "epoch": 1.141455356531139, + "grad_norm": 0.19589002430438995, + "learning_rate": 9.231343738929371e-05, + "loss": 1.9795, + "step": 299890 + }, + { + "epoch": 1.1414934189992616, + "grad_norm": 0.16380958259105682, + "learning_rate": 9.226233924249772e-05, + "loss": 1.9838, + "step": 299900 + }, + { + "epoch": 1.1415314814673843, + "grad_norm": 0.15757855772972107, + "learning_rate": 9.221124749857718e-05, + "loss": 1.9773, + "step": 299910 + }, + { + "epoch": 1.141569543935507, + "grad_norm": 0.16874462366104126, + "learning_rate": 9.216016215512574e-05, + "loss": 1.9866, + "step": 299920 + }, + { + "epoch": 1.1416076064036296, + "grad_norm": 0.166023388504982, + "learning_rate": 9.210908320973855e-05, + "loss": 1.9787, + "step": 299930 + }, + { + "epoch": 1.1416456688717522, + "grad_norm": 0.16237130761146545, + "learning_rate": 9.20580106600123e-05, + "loss": 1.9737, + "step": 299940 + }, + { + "epoch": 1.1416837313398749, + "grad_norm": 0.19373784959316254, + "learning_rate": 9.200694450354508e-05, + "loss": 1.9597, + "step": 299950 + }, + { + "epoch": 1.1417217938079978, + "grad_norm": 0.19997325539588928, + "learning_rate": 9.19558847379367e-05, + "loss": 1.9802, + "step": 299960 + }, + { + "epoch": 1.1417598562761204, + "grad_norm": 0.24005170166492462, + "learning_rate": 9.190483136078826e-05, + "loss": 1.9716, + "step": 299970 + }, + { + "epoch": 1.141797918744243, + "grad_norm": 0.17374595999717712, + "learning_rate": 9.185378436970249e-05, + "loss": 1.9686, + "step": 299980 + }, + { + "epoch": 1.1418359812123657, + "grad_norm": 0.1472892165184021, + "learning_rate": 9.180274376228348e-05, + "loss": 1.9745, + "step": 299990 + }, + { + "epoch": 1.1418740436804884, + "grad_norm": 0.16212306916713715, + "learning_rate": 9.175170953613698e-05, + "loss": 1.9749, + "step": 300000 + }, + { + "epoch": 1.141912106148611, + "grad_norm": 0.18655334413051605, + "learning_rate": 9.17006816888702e-05, + "loss": 1.9681, + "step": 300010 + }, + { + "epoch": 1.1419501686167337, + "grad_norm": 0.14725945889949799, + "learning_rate": 9.164966021809163e-05, + "loss": 1.9648, + "step": 300020 + }, + { + "epoch": 1.1419882310848566, + "grad_norm": 0.16695605218410492, + "learning_rate": 9.159864512141164e-05, + "loss": 1.9784, + "step": 300030 + }, + { + "epoch": 1.1420262935529792, + "grad_norm": 0.26002660393714905, + "learning_rate": 9.154763639644175e-05, + "loss": 1.9867, + "step": 300040 + }, + { + "epoch": 1.142064356021102, + "grad_norm": 0.20152299106121063, + "learning_rate": 9.149663404079523e-05, + "loss": 1.9748, + "step": 300050 + }, + { + "epoch": 1.1421024184892246, + "grad_norm": 0.17907015979290009, + "learning_rate": 9.144563805208655e-05, + "loss": 1.9745, + "step": 300060 + }, + { + "epoch": 1.1421404809573472, + "grad_norm": 0.18931683897972107, + "learning_rate": 9.1394648427932e-05, + "loss": 1.9837, + "step": 300070 + }, + { + "epoch": 1.1421785434254699, + "grad_norm": 0.26568910479545593, + "learning_rate": 9.1343665165949e-05, + "loss": 1.9771, + "step": 300080 + }, + { + "epoch": 1.1422166058935925, + "grad_norm": 0.2429303526878357, + "learning_rate": 9.129268826375681e-05, + "loss": 1.9723, + "step": 300090 + }, + { + "epoch": 1.1422546683617152, + "grad_norm": 0.16020800173282623, + "learning_rate": 9.124171771897594e-05, + "loss": 1.9659, + "step": 300100 + }, + { + "epoch": 1.1422927308298378, + "grad_norm": 0.2040097862482071, + "learning_rate": 9.119075352922852e-05, + "loss": 1.9734, + "step": 300110 + }, + { + "epoch": 1.1423307932979605, + "grad_norm": 0.23395033180713654, + "learning_rate": 9.113979569213799e-05, + "loss": 1.9724, + "step": 300120 + }, + { + "epoch": 1.1423688557660834, + "grad_norm": 0.18835385143756866, + "learning_rate": 9.108884420532948e-05, + "loss": 1.9819, + "step": 300130 + }, + { + "epoch": 1.142406918234206, + "grad_norm": 0.1681976020336151, + "learning_rate": 9.10378990664294e-05, + "loss": 1.9761, + "step": 300140 + }, + { + "epoch": 1.1424449807023287, + "grad_norm": 0.17211827635765076, + "learning_rate": 9.09869602730658e-05, + "loss": 1.9749, + "step": 300150 + }, + { + "epoch": 1.1424830431704514, + "grad_norm": 0.20787794888019562, + "learning_rate": 9.093602782286809e-05, + "loss": 1.9556, + "step": 300160 + }, + { + "epoch": 1.142521105638574, + "grad_norm": 0.18131273984909058, + "learning_rate": 9.088510171346731e-05, + "loss": 1.9549, + "step": 300170 + }, + { + "epoch": 1.1425591681066967, + "grad_norm": 0.20370520651340485, + "learning_rate": 9.083418194249577e-05, + "loss": 1.9759, + "step": 300180 + }, + { + "epoch": 1.1425972305748193, + "grad_norm": 0.23574639856815338, + "learning_rate": 9.07832685075874e-05, + "loss": 1.9816, + "step": 300190 + }, + { + "epoch": 1.142635293042942, + "grad_norm": 0.2160944789648056, + "learning_rate": 9.073236140637748e-05, + "loss": 1.9807, + "step": 300200 + }, + { + "epoch": 1.1426733555110649, + "grad_norm": 0.18580316007137299, + "learning_rate": 9.068146063650295e-05, + "loss": 1.9964, + "step": 300210 + }, + { + "epoch": 1.1427114179791875, + "grad_norm": 0.17532704770565033, + "learning_rate": 9.063056619560206e-05, + "loss": 1.9674, + "step": 300220 + }, + { + "epoch": 1.1427494804473102, + "grad_norm": 0.16181550920009613, + "learning_rate": 9.057967808131457e-05, + "loss": 1.9736, + "step": 300230 + }, + { + "epoch": 1.1427875429154328, + "grad_norm": 0.2239847034215927, + "learning_rate": 9.052879629128174e-05, + "loss": 1.9651, + "step": 300240 + }, + { + "epoch": 1.1428256053835555, + "grad_norm": 0.15953215956687927, + "learning_rate": 9.04779208231462e-05, + "loss": 1.9786, + "step": 300250 + }, + { + "epoch": 1.1428636678516781, + "grad_norm": 0.1677812933921814, + "learning_rate": 9.042705167455212e-05, + "loss": 1.9803, + "step": 300260 + }, + { + "epoch": 1.1429017303198008, + "grad_norm": 0.16961197555065155, + "learning_rate": 9.037618884314519e-05, + "loss": 1.963, + "step": 300270 + }, + { + "epoch": 1.1429397927879235, + "grad_norm": 0.18011970818042755, + "learning_rate": 9.032533232657242e-05, + "loss": 1.9596, + "step": 300280 + }, + { + "epoch": 1.1429778552560461, + "grad_norm": 0.17349569499492645, + "learning_rate": 9.027448212248241e-05, + "loss": 1.9693, + "step": 300290 + }, + { + "epoch": 1.143015917724169, + "grad_norm": 0.1612197458744049, + "learning_rate": 9.022363822852514e-05, + "loss": 1.981, + "step": 300300 + }, + { + "epoch": 1.1430539801922917, + "grad_norm": 0.19596810638904572, + "learning_rate": 9.017280064235206e-05, + "loss": 1.9687, + "step": 300310 + }, + { + "epoch": 1.1430920426604143, + "grad_norm": 0.22040875256061554, + "learning_rate": 9.012196936161604e-05, + "loss": 1.9715, + "step": 300320 + }, + { + "epoch": 1.143130105128537, + "grad_norm": 0.16568516194820404, + "learning_rate": 9.007114438397157e-05, + "loss": 1.9823, + "step": 300330 + }, + { + "epoch": 1.1431681675966596, + "grad_norm": 0.24007253348827362, + "learning_rate": 9.002032570707441e-05, + "loss": 1.9561, + "step": 300340 + }, + { + "epoch": 1.1432062300647823, + "grad_norm": 0.15698036551475525, + "learning_rate": 8.996951332858189e-05, + "loss": 1.9642, + "step": 300350 + }, + { + "epoch": 1.143244292532905, + "grad_norm": 0.2927793264389038, + "learning_rate": 8.991870724615264e-05, + "loss": 1.9689, + "step": 300360 + }, + { + "epoch": 1.1432823550010276, + "grad_norm": 0.2039732038974762, + "learning_rate": 8.986790745744694e-05, + "loss": 1.9858, + "step": 300370 + }, + { + "epoch": 1.1433204174691505, + "grad_norm": 0.18189412355422974, + "learning_rate": 8.981711396012631e-05, + "loss": 1.9597, + "step": 300380 + }, + { + "epoch": 1.1433584799372731, + "grad_norm": 0.16770325601100922, + "learning_rate": 8.976632675185392e-05, + "loss": 1.963, + "step": 300390 + }, + { + "epoch": 1.1433965424053958, + "grad_norm": 0.18383848667144775, + "learning_rate": 8.971554583029428e-05, + "loss": 1.9853, + "step": 300400 + }, + { + "epoch": 1.1434346048735184, + "grad_norm": 0.16434144973754883, + "learning_rate": 8.96647711931134e-05, + "loss": 1.9733, + "step": 300410 + }, + { + "epoch": 1.143472667341641, + "grad_norm": 0.159880593419075, + "learning_rate": 8.961400283797855e-05, + "loss": 1.9717, + "step": 300420 + }, + { + "epoch": 1.1435107298097638, + "grad_norm": 0.21327455341815948, + "learning_rate": 8.956324076255873e-05, + "loss": 1.9518, + "step": 300430 + }, + { + "epoch": 1.1435487922778864, + "grad_norm": 0.20327025651931763, + "learning_rate": 8.951248496452414e-05, + "loss": 1.9664, + "step": 300440 + }, + { + "epoch": 1.143586854746009, + "grad_norm": 0.1809774935245514, + "learning_rate": 8.946173544154663e-05, + "loss": 1.97, + "step": 300450 + }, + { + "epoch": 1.1436249172141317, + "grad_norm": 0.2367480844259262, + "learning_rate": 8.941099219129923e-05, + "loss": 1.9739, + "step": 300460 + }, + { + "epoch": 1.1436629796822546, + "grad_norm": 0.2103966772556305, + "learning_rate": 8.936025521145669e-05, + "loss": 1.9825, + "step": 300470 + }, + { + "epoch": 1.1437010421503773, + "grad_norm": 0.2010219842195511, + "learning_rate": 8.930952449969499e-05, + "loss": 1.9819, + "step": 300480 + }, + { + "epoch": 1.1437391046185, + "grad_norm": 0.17966939508914948, + "learning_rate": 8.925880005369158e-05, + "loss": 1.9826, + "step": 300490 + }, + { + "epoch": 1.1437771670866226, + "grad_norm": 0.19157104194164276, + "learning_rate": 8.920808187112539e-05, + "loss": 1.9739, + "step": 300500 + }, + { + "epoch": 1.1438152295547452, + "grad_norm": 0.16343864798545837, + "learning_rate": 8.915736994967688e-05, + "loss": 1.957, + "step": 300510 + }, + { + "epoch": 1.143853292022868, + "grad_norm": 0.23392702639102936, + "learning_rate": 8.910666428702774e-05, + "loss": 1.9744, + "step": 300520 + }, + { + "epoch": 1.1438913544909906, + "grad_norm": 0.19292297959327698, + "learning_rate": 8.905596488086121e-05, + "loss": 1.9719, + "step": 300530 + }, + { + "epoch": 1.1439294169591132, + "grad_norm": 0.1789822280406952, + "learning_rate": 8.900527172886186e-05, + "loss": 1.9835, + "step": 300540 + }, + { + "epoch": 1.143967479427236, + "grad_norm": 0.18455876410007477, + "learning_rate": 8.895458482871588e-05, + "loss": 1.9684, + "step": 300550 + }, + { + "epoch": 1.1440055418953587, + "grad_norm": 0.19163396954536438, + "learning_rate": 8.890390417811068e-05, + "loss": 1.9713, + "step": 300560 + }, + { + "epoch": 1.1440436043634814, + "grad_norm": 0.16672499477863312, + "learning_rate": 8.88532297747352e-05, + "loss": 1.9732, + "step": 300570 + }, + { + "epoch": 1.144081666831604, + "grad_norm": 0.16392666101455688, + "learning_rate": 8.880256161627987e-05, + "loss": 1.9744, + "step": 300580 + }, + { + "epoch": 1.1441197292997267, + "grad_norm": 0.16160540282726288, + "learning_rate": 8.87518997004363e-05, + "loss": 1.9866, + "step": 300590 + }, + { + "epoch": 1.1441577917678494, + "grad_norm": 0.1681511402130127, + "learning_rate": 8.870124402489783e-05, + "loss": 1.9804, + "step": 300600 + }, + { + "epoch": 1.144195854235972, + "grad_norm": 0.16743887960910797, + "learning_rate": 8.865059458735896e-05, + "loss": 1.961, + "step": 300610 + }, + { + "epoch": 1.1442339167040947, + "grad_norm": 0.1727055162191391, + "learning_rate": 8.859995138551575e-05, + "loss": 1.9758, + "step": 300620 + }, + { + "epoch": 1.1442719791722173, + "grad_norm": 0.24352505803108215, + "learning_rate": 8.85493144170657e-05, + "loss": 1.974, + "step": 300630 + }, + { + "epoch": 1.14431004164034, + "grad_norm": 0.3063167929649353, + "learning_rate": 8.849868367970759e-05, + "loss": 1.9736, + "step": 300640 + }, + { + "epoch": 1.1443481041084629, + "grad_norm": 0.2421969473361969, + "learning_rate": 8.844805917114179e-05, + "loss": 1.9631, + "step": 300650 + }, + { + "epoch": 1.1443861665765855, + "grad_norm": 0.2752586603164673, + "learning_rate": 8.839744088906998e-05, + "loss": 1.9775, + "step": 300660 + }, + { + "epoch": 1.1444242290447082, + "grad_norm": 0.27569398283958435, + "learning_rate": 8.834682883119516e-05, + "loss": 1.9517, + "step": 300670 + }, + { + "epoch": 1.1444622915128309, + "grad_norm": 0.20763832330703735, + "learning_rate": 8.829622299522195e-05, + "loss": 1.9607, + "step": 300680 + }, + { + "epoch": 1.1445003539809535, + "grad_norm": 0.19430187344551086, + "learning_rate": 8.824562337885627e-05, + "loss": 1.9661, + "step": 300690 + }, + { + "epoch": 1.1445384164490762, + "grad_norm": 0.1806371957063675, + "learning_rate": 8.819502997980544e-05, + "loss": 1.966, + "step": 300700 + }, + { + "epoch": 1.1445764789171988, + "grad_norm": 0.17690540850162506, + "learning_rate": 8.814444279577821e-05, + "loss": 1.9626, + "step": 300710 + }, + { + "epoch": 1.1446145413853217, + "grad_norm": 0.16142280399799347, + "learning_rate": 8.809386182448475e-05, + "loss": 1.9715, + "step": 300720 + }, + { + "epoch": 1.1446526038534444, + "grad_norm": 0.2089589387178421, + "learning_rate": 8.804328706363663e-05, + "loss": 1.9686, + "step": 300730 + }, + { + "epoch": 1.144690666321567, + "grad_norm": 0.19803060591220856, + "learning_rate": 8.79927185109467e-05, + "loss": 1.9662, + "step": 300740 + }, + { + "epoch": 1.1447287287896897, + "grad_norm": 0.288856565952301, + "learning_rate": 8.794215616412949e-05, + "loss": 1.9652, + "step": 300750 + }, + { + "epoch": 1.1447667912578123, + "grad_norm": 0.26154083013534546, + "learning_rate": 8.789160002090063e-05, + "loss": 1.9576, + "step": 300760 + }, + { + "epoch": 1.144804853725935, + "grad_norm": 0.17871473729610443, + "learning_rate": 8.784105007897741e-05, + "loss": 1.9585, + "step": 300770 + }, + { + "epoch": 1.1448429161940576, + "grad_norm": 0.2214268147945404, + "learning_rate": 8.779050633607838e-05, + "loss": 1.9551, + "step": 300780 + }, + { + "epoch": 1.1448809786621803, + "grad_norm": 0.2183777391910553, + "learning_rate": 8.773996878992341e-05, + "loss": 1.9834, + "step": 300790 + }, + { + "epoch": 1.144919041130303, + "grad_norm": 0.17460443079471588, + "learning_rate": 8.768943743823393e-05, + "loss": 1.9695, + "step": 300800 + }, + { + "epoch": 1.1449571035984256, + "grad_norm": 0.17166626453399658, + "learning_rate": 8.763891227873272e-05, + "loss": 1.9736, + "step": 300810 + }, + { + "epoch": 1.1449951660665485, + "grad_norm": 0.19382481276988983, + "learning_rate": 8.758839330914398e-05, + "loss": 1.9818, + "step": 300820 + }, + { + "epoch": 1.1450332285346712, + "grad_norm": 0.2120654433965683, + "learning_rate": 8.753788052719314e-05, + "loss": 1.9526, + "step": 300830 + }, + { + "epoch": 1.1450712910027938, + "grad_norm": 0.18074551224708557, + "learning_rate": 8.748737393060723e-05, + "loss": 1.9668, + "step": 300840 + }, + { + "epoch": 1.1451093534709165, + "grad_norm": 0.16520459949970245, + "learning_rate": 8.743687351711455e-05, + "loss": 1.9662, + "step": 300850 + }, + { + "epoch": 1.1451474159390391, + "grad_norm": 0.172722727060318, + "learning_rate": 8.738637928444481e-05, + "loss": 1.9783, + "step": 300860 + }, + { + "epoch": 1.1451854784071618, + "grad_norm": 0.1552434265613556, + "learning_rate": 8.73358912303292e-05, + "loss": 1.9719, + "step": 300870 + }, + { + "epoch": 1.1452235408752844, + "grad_norm": 0.21896371245384216, + "learning_rate": 8.728540935250018e-05, + "loss": 1.9716, + "step": 300880 + }, + { + "epoch": 1.1452616033434073, + "grad_norm": 0.16902369260787964, + "learning_rate": 8.723493364869161e-05, + "loss": 1.9721, + "step": 300890 + }, + { + "epoch": 1.14529966581153, + "grad_norm": 0.16694094240665436, + "learning_rate": 8.718446411663882e-05, + "loss": 1.9724, + "step": 300900 + }, + { + "epoch": 1.1453377282796526, + "grad_norm": 0.24911977350711823, + "learning_rate": 8.71340007540784e-05, + "loss": 1.9756, + "step": 300910 + }, + { + "epoch": 1.1453757907477753, + "grad_norm": 0.15349158644676208, + "learning_rate": 8.708354355874842e-05, + "loss": 1.9725, + "step": 300920 + }, + { + "epoch": 1.145413853215898, + "grad_norm": 0.17509305477142334, + "learning_rate": 8.703309252838832e-05, + "loss": 1.9719, + "step": 300930 + }, + { + "epoch": 1.1454519156840206, + "grad_norm": 0.16706439852714539, + "learning_rate": 8.698264766073894e-05, + "loss": 1.99, + "step": 300940 + }, + { + "epoch": 1.1454899781521433, + "grad_norm": 0.1890660524368286, + "learning_rate": 8.693220895354236e-05, + "loss": 1.9681, + "step": 300950 + }, + { + "epoch": 1.145528040620266, + "grad_norm": 0.1490461230278015, + "learning_rate": 8.68817764045422e-05, + "loss": 1.9789, + "step": 300960 + }, + { + "epoch": 1.1455661030883886, + "grad_norm": 0.16561798751354218, + "learning_rate": 8.68313500114834e-05, + "loss": 1.9659, + "step": 300970 + }, + { + "epoch": 1.1456041655565112, + "grad_norm": 0.17476516962051392, + "learning_rate": 8.678092977211227e-05, + "loss": 1.9652, + "step": 300980 + }, + { + "epoch": 1.1456422280246341, + "grad_norm": 0.27982303500175476, + "learning_rate": 8.67305156841765e-05, + "loss": 1.978, + "step": 300990 + }, + { + "epoch": 1.1456802904927568, + "grad_norm": 0.1696423441171646, + "learning_rate": 8.668010774542517e-05, + "loss": 1.9711, + "step": 301000 + }, + { + "epoch": 1.1457183529608794, + "grad_norm": 0.17661775648593903, + "learning_rate": 8.66297059536087e-05, + "loss": 1.9582, + "step": 301010 + }, + { + "epoch": 1.145756415429002, + "grad_norm": 0.1834118366241455, + "learning_rate": 8.657931030647892e-05, + "loss": 1.9677, + "step": 301020 + }, + { + "epoch": 1.1457944778971247, + "grad_norm": 0.2517413794994354, + "learning_rate": 8.652892080178893e-05, + "loss": 1.9707, + "step": 301030 + }, + { + "epoch": 1.1458325403652474, + "grad_norm": 0.20651330053806305, + "learning_rate": 8.647853743729334e-05, + "loss": 1.9786, + "step": 301040 + }, + { + "epoch": 1.14587060283337, + "grad_norm": 0.18979932367801666, + "learning_rate": 8.642816021074806e-05, + "loss": 1.9777, + "step": 301050 + }, + { + "epoch": 1.145908665301493, + "grad_norm": 0.1647159606218338, + "learning_rate": 8.637778911991034e-05, + "loss": 1.9661, + "step": 301060 + }, + { + "epoch": 1.1459467277696156, + "grad_norm": 0.16355659067630768, + "learning_rate": 8.632742416253892e-05, + "loss": 1.978, + "step": 301070 + }, + { + "epoch": 1.1459847902377382, + "grad_norm": 0.18683548271656036, + "learning_rate": 8.627706533639367e-05, + "loss": 1.9661, + "step": 301080 + }, + { + "epoch": 1.146022852705861, + "grad_norm": 0.16159670054912567, + "learning_rate": 8.62267126392361e-05, + "loss": 1.9747, + "step": 301090 + }, + { + "epoch": 1.1460609151739836, + "grad_norm": 0.1670175939798355, + "learning_rate": 8.617636606882878e-05, + "loss": 1.9645, + "step": 301100 + }, + { + "epoch": 1.1460989776421062, + "grad_norm": 0.2345433384180069, + "learning_rate": 8.612602562293592e-05, + "loss": 1.9624, + "step": 301110 + }, + { + "epoch": 1.1461370401102289, + "grad_norm": 0.1708359718322754, + "learning_rate": 8.607569129932302e-05, + "loss": 1.959, + "step": 301120 + }, + { + "epoch": 1.1461751025783515, + "grad_norm": 0.17826248705387115, + "learning_rate": 8.602536309575682e-05, + "loss": 1.9683, + "step": 301130 + }, + { + "epoch": 1.1462131650464742, + "grad_norm": 0.16386419534683228, + "learning_rate": 8.597504101000547e-05, + "loss": 1.9544, + "step": 301140 + }, + { + "epoch": 1.1462512275145968, + "grad_norm": 0.2092752605676651, + "learning_rate": 8.592472503983856e-05, + "loss": 1.9565, + "step": 301150 + }, + { + "epoch": 1.1462892899827197, + "grad_norm": 0.18252654373645782, + "learning_rate": 8.587441518302686e-05, + "loss": 1.9588, + "step": 301160 + }, + { + "epoch": 1.1463273524508424, + "grad_norm": 0.19344913959503174, + "learning_rate": 8.582411143734275e-05, + "loss": 1.9642, + "step": 301170 + }, + { + "epoch": 1.146365414918965, + "grad_norm": 0.14749857783317566, + "learning_rate": 8.577381380055976e-05, + "loss": 1.9716, + "step": 301180 + }, + { + "epoch": 1.1464034773870877, + "grad_norm": 0.320700466632843, + "learning_rate": 8.572352227045283e-05, + "loss": 1.9729, + "step": 301190 + }, + { + "epoch": 1.1464415398552104, + "grad_norm": 0.20049598813056946, + "learning_rate": 8.567323684479821e-05, + "loss": 1.9823, + "step": 301200 + }, + { + "epoch": 1.146479602323333, + "grad_norm": 0.21233174204826355, + "learning_rate": 8.56229575213736e-05, + "loss": 1.9532, + "step": 301210 + }, + { + "epoch": 1.1465176647914557, + "grad_norm": 0.1554403305053711, + "learning_rate": 8.557268429795795e-05, + "loss": 1.9611, + "step": 301220 + }, + { + "epoch": 1.1465557272595783, + "grad_norm": 0.21042974293231964, + "learning_rate": 8.552241717233167e-05, + "loss": 1.9917, + "step": 301230 + }, + { + "epoch": 1.1465937897277012, + "grad_norm": 0.23165065050125122, + "learning_rate": 8.547215614227633e-05, + "loss": 1.9802, + "step": 301240 + }, + { + "epoch": 1.1466318521958239, + "grad_norm": 0.22600439190864563, + "learning_rate": 8.542190120557503e-05, + "loss": 1.9662, + "step": 301250 + }, + { + "epoch": 1.1466699146639465, + "grad_norm": 0.1871139258146286, + "learning_rate": 8.53716523600121e-05, + "loss": 1.9679, + "step": 301260 + }, + { + "epoch": 1.1467079771320692, + "grad_norm": 0.15982182323932648, + "learning_rate": 8.532140960337325e-05, + "loss": 1.9672, + "step": 301270 + }, + { + "epoch": 1.1467460396001918, + "grad_norm": 0.17912767827510834, + "learning_rate": 8.52711729334456e-05, + "loss": 1.9752, + "step": 301280 + }, + { + "epoch": 1.1467841020683145, + "grad_norm": 0.20834913849830627, + "learning_rate": 8.522094234801742e-05, + "loss": 1.9641, + "step": 301290 + }, + { + "epoch": 1.1468221645364371, + "grad_norm": 0.2113134264945984, + "learning_rate": 8.517071784487856e-05, + "loss": 1.9671, + "step": 301300 + }, + { + "epoch": 1.1468602270045598, + "grad_norm": 0.1577615588903427, + "learning_rate": 8.51204994218201e-05, + "loss": 1.9691, + "step": 301310 + }, + { + "epoch": 1.1468982894726825, + "grad_norm": 0.2631106376647949, + "learning_rate": 8.507028707663434e-05, + "loss": 1.9665, + "step": 301320 + }, + { + "epoch": 1.1469363519408053, + "grad_norm": 0.2079096883535385, + "learning_rate": 8.502008080711504e-05, + "loss": 1.97, + "step": 301330 + }, + { + "epoch": 1.146974414408928, + "grad_norm": 0.2349461168050766, + "learning_rate": 8.496988061105737e-05, + "loss": 1.9595, + "step": 301340 + }, + { + "epoch": 1.1470124768770507, + "grad_norm": 0.2771170139312744, + "learning_rate": 8.491968648625759e-05, + "loss": 1.9716, + "step": 301350 + }, + { + "epoch": 1.1470505393451733, + "grad_norm": 0.1749347448348999, + "learning_rate": 8.486949843051361e-05, + "loss": 1.9601, + "step": 301360 + }, + { + "epoch": 1.147088601813296, + "grad_norm": 0.17974738776683807, + "learning_rate": 8.481931644162444e-05, + "loss": 1.972, + "step": 301370 + }, + { + "epoch": 1.1471266642814186, + "grad_norm": 0.1842113435268402, + "learning_rate": 8.476914051739044e-05, + "loss": 1.9564, + "step": 301380 + }, + { + "epoch": 1.1471647267495413, + "grad_norm": 0.1851554960012436, + "learning_rate": 8.471897065561335e-05, + "loss": 1.9538, + "step": 301390 + }, + { + "epoch": 1.147202789217664, + "grad_norm": 0.1589783877134323, + "learning_rate": 8.466880685409628e-05, + "loss": 1.9642, + "step": 301400 + }, + { + "epoch": 1.1472408516857868, + "grad_norm": 0.1574392318725586, + "learning_rate": 8.461864911064354e-05, + "loss": 1.9566, + "step": 301410 + }, + { + "epoch": 1.1472789141539095, + "grad_norm": 0.19300295412540436, + "learning_rate": 8.456849742306095e-05, + "loss": 1.9905, + "step": 301420 + }, + { + "epoch": 1.1473169766220321, + "grad_norm": 0.19914722442626953, + "learning_rate": 8.451835178915546e-05, + "loss": 1.9676, + "step": 301430 + }, + { + "epoch": 1.1473550390901548, + "grad_norm": 0.17109766602516174, + "learning_rate": 8.446821220673545e-05, + "loss": 1.9665, + "step": 301440 + }, + { + "epoch": 1.1473931015582775, + "grad_norm": 0.17407475411891937, + "learning_rate": 8.441807867361057e-05, + "loss": 1.9642, + "step": 301450 + }, + { + "epoch": 1.1474311640264, + "grad_norm": 0.1888018250465393, + "learning_rate": 8.436795118759188e-05, + "loss": 1.9629, + "step": 301460 + }, + { + "epoch": 1.1474692264945228, + "grad_norm": 0.16494682431221008, + "learning_rate": 8.431782974649166e-05, + "loss": 1.9727, + "step": 301470 + }, + { + "epoch": 1.1475072889626454, + "grad_norm": 0.1666668802499771, + "learning_rate": 8.426771434812363e-05, + "loss": 1.9768, + "step": 301480 + }, + { + "epoch": 1.147545351430768, + "grad_norm": 0.1791200041770935, + "learning_rate": 8.421760499030267e-05, + "loss": 1.9882, + "step": 301490 + }, + { + "epoch": 1.1475834138988907, + "grad_norm": 0.16579587757587433, + "learning_rate": 8.416750167084503e-05, + "loss": 1.9702, + "step": 301500 + }, + { + "epoch": 1.1476214763670136, + "grad_norm": 0.16474118828773499, + "learning_rate": 8.411740438756837e-05, + "loss": 1.9715, + "step": 301510 + }, + { + "epoch": 1.1476595388351363, + "grad_norm": 0.2248903065919876, + "learning_rate": 8.406731313829158e-05, + "loss": 1.9559, + "step": 301520 + }, + { + "epoch": 1.147697601303259, + "grad_norm": 0.15927183628082275, + "learning_rate": 8.401722792083483e-05, + "loss": 1.975, + "step": 301530 + }, + { + "epoch": 1.1477356637713816, + "grad_norm": 0.14855605363845825, + "learning_rate": 8.396714873301969e-05, + "loss": 1.9761, + "step": 301540 + }, + { + "epoch": 1.1477737262395042, + "grad_norm": 0.1880766898393631, + "learning_rate": 8.391707557266903e-05, + "loss": 1.9772, + "step": 301550 + }, + { + "epoch": 1.147811788707627, + "grad_norm": 0.17366820573806763, + "learning_rate": 8.386700843760698e-05, + "loss": 1.9774, + "step": 301560 + }, + { + "epoch": 1.1478498511757496, + "grad_norm": 0.1661127507686615, + "learning_rate": 8.381694732565898e-05, + "loss": 1.9658, + "step": 301570 + }, + { + "epoch": 1.1478879136438724, + "grad_norm": 0.18777918815612793, + "learning_rate": 8.376689223465179e-05, + "loss": 1.9882, + "step": 301580 + }, + { + "epoch": 1.147925976111995, + "grad_norm": 0.2077363282442093, + "learning_rate": 8.371684316241346e-05, + "loss": 1.968, + "step": 301590 + }, + { + "epoch": 1.1479640385801178, + "grad_norm": 0.23291166126728058, + "learning_rate": 8.366680010677346e-05, + "loss": 1.9749, + "step": 301600 + }, + { + "epoch": 1.1480021010482404, + "grad_norm": 0.19629411399364471, + "learning_rate": 8.361676306556238e-05, + "loss": 1.9864, + "step": 301610 + }, + { + "epoch": 1.148040163516363, + "grad_norm": 0.18168382346630096, + "learning_rate": 8.356673203661231e-05, + "loss": 1.9607, + "step": 301620 + }, + { + "epoch": 1.1480782259844857, + "grad_norm": 0.18806378543376923, + "learning_rate": 8.351670701775644e-05, + "loss": 1.974, + "step": 301630 + }, + { + "epoch": 1.1481162884526084, + "grad_norm": 0.157528817653656, + "learning_rate": 8.346668800682944e-05, + "loss": 1.961, + "step": 301640 + }, + { + "epoch": 1.148154350920731, + "grad_norm": 0.17656798660755157, + "learning_rate": 8.341667500166706e-05, + "loss": 1.952, + "step": 301650 + }, + { + "epoch": 1.1481924133888537, + "grad_norm": 0.27359622716903687, + "learning_rate": 8.336666800010667e-05, + "loss": 1.9646, + "step": 301660 + }, + { + "epoch": 1.1482304758569764, + "grad_norm": 0.17596830427646637, + "learning_rate": 8.33166669999867e-05, + "loss": 1.9712, + "step": 301670 + }, + { + "epoch": 1.1482685383250992, + "grad_norm": 0.16874903440475464, + "learning_rate": 8.326667199914684e-05, + "loss": 1.972, + "step": 301680 + }, + { + "epoch": 1.1483066007932219, + "grad_norm": 0.20028790831565857, + "learning_rate": 8.321668299542828e-05, + "loss": 1.9815, + "step": 301690 + }, + { + "epoch": 1.1483446632613445, + "grad_norm": 0.15614601969718933, + "learning_rate": 8.316669998667337e-05, + "loss": 1.9696, + "step": 301700 + }, + { + "epoch": 1.1483827257294672, + "grad_norm": 0.2561275064945221, + "learning_rate": 8.31167229707257e-05, + "loss": 1.9636, + "step": 301710 + }, + { + "epoch": 1.1484207881975899, + "grad_norm": 0.1641322672367096, + "learning_rate": 8.306675194543029e-05, + "loss": 1.9527, + "step": 301720 + }, + { + "epoch": 1.1484588506657125, + "grad_norm": 0.177987203001976, + "learning_rate": 8.301678690863346e-05, + "loss": 1.9626, + "step": 301730 + }, + { + "epoch": 1.1484969131338352, + "grad_norm": 0.18081584572792053, + "learning_rate": 8.296682785818266e-05, + "loss": 1.9683, + "step": 301740 + }, + { + "epoch": 1.148534975601958, + "grad_norm": 0.16380837559700012, + "learning_rate": 8.291687479192673e-05, + "loss": 1.9854, + "step": 301750 + }, + { + "epoch": 1.1485730380700807, + "grad_norm": 0.1782199740409851, + "learning_rate": 8.286692770771581e-05, + "loss": 1.9782, + "step": 301760 + }, + { + "epoch": 1.1486111005382034, + "grad_norm": 0.19196809828281403, + "learning_rate": 8.281698660340126e-05, + "loss": 1.978, + "step": 301770 + }, + { + "epoch": 1.148649163006326, + "grad_norm": 0.28066837787628174, + "learning_rate": 8.276705147683583e-05, + "loss": 1.9707, + "step": 301780 + }, + { + "epoch": 1.1486872254744487, + "grad_norm": 0.22474101185798645, + "learning_rate": 8.271712232587353e-05, + "loss": 1.968, + "step": 301790 + }, + { + "epoch": 1.1487252879425713, + "grad_norm": 0.18222326040267944, + "learning_rate": 8.266719914836951e-05, + "loss": 1.9798, + "step": 301800 + }, + { + "epoch": 1.148763350410694, + "grad_norm": 0.173833966255188, + "learning_rate": 8.261728194218038e-05, + "loss": 1.9731, + "step": 301810 + }, + { + "epoch": 1.1488014128788167, + "grad_norm": 0.16559606790542603, + "learning_rate": 8.256737070516396e-05, + "loss": 1.9481, + "step": 301820 + }, + { + "epoch": 1.1488394753469393, + "grad_norm": 0.15625733137130737, + "learning_rate": 8.251746543517935e-05, + "loss": 1.9775, + "step": 301830 + }, + { + "epoch": 1.148877537815062, + "grad_norm": 0.1697421818971634, + "learning_rate": 8.246756613008694e-05, + "loss": 1.9711, + "step": 301840 + }, + { + "epoch": 1.1489156002831848, + "grad_norm": 0.24292589724063873, + "learning_rate": 8.241767278774837e-05, + "loss": 1.9604, + "step": 301850 + }, + { + "epoch": 1.1489536627513075, + "grad_norm": 0.1861419975757599, + "learning_rate": 8.236778540602657e-05, + "loss": 1.9623, + "step": 301860 + }, + { + "epoch": 1.1489917252194302, + "grad_norm": 0.17582234740257263, + "learning_rate": 8.231790398278582e-05, + "loss": 1.9622, + "step": 301870 + }, + { + "epoch": 1.1490297876875528, + "grad_norm": 0.16839978098869324, + "learning_rate": 8.226802851589154e-05, + "loss": 1.9493, + "step": 301880 + }, + { + "epoch": 1.1490678501556755, + "grad_norm": 0.16905924677848816, + "learning_rate": 8.221815900321056e-05, + "loss": 1.966, + "step": 301890 + }, + { + "epoch": 1.1491059126237981, + "grad_norm": 0.17358462512493134, + "learning_rate": 8.216829544261084e-05, + "loss": 1.9735, + "step": 301900 + }, + { + "epoch": 1.1491439750919208, + "grad_norm": 0.22775432467460632, + "learning_rate": 8.211843783196177e-05, + "loss": 1.9678, + "step": 301910 + }, + { + "epoch": 1.1491820375600437, + "grad_norm": 0.17102882266044617, + "learning_rate": 8.206858616913388e-05, + "loss": 1.9719, + "step": 301920 + }, + { + "epoch": 1.1492201000281663, + "grad_norm": 0.17452996969223022, + "learning_rate": 8.201874045199909e-05, + "loss": 1.9729, + "step": 301930 + }, + { + "epoch": 1.149258162496289, + "grad_norm": 0.15462011098861694, + "learning_rate": 8.196890067843038e-05, + "loss": 1.959, + "step": 301940 + }, + { + "epoch": 1.1492962249644116, + "grad_norm": 0.20610308647155762, + "learning_rate": 8.191906684630224e-05, + "loss": 1.9652, + "step": 301950 + }, + { + "epoch": 1.1493342874325343, + "grad_norm": 0.16314344108104706, + "learning_rate": 8.186923895349036e-05, + "loss": 1.9803, + "step": 301960 + }, + { + "epoch": 1.149372349900657, + "grad_norm": 0.15605981647968292, + "learning_rate": 8.18194169978716e-05, + "loss": 1.9577, + "step": 301970 + }, + { + "epoch": 1.1494104123687796, + "grad_norm": 0.1960136443376541, + "learning_rate": 8.176960097732416e-05, + "loss": 1.9453, + "step": 301980 + }, + { + "epoch": 1.1494484748369023, + "grad_norm": 0.16937169432640076, + "learning_rate": 8.171979088972747e-05, + "loss": 1.9588, + "step": 301990 + }, + { + "epoch": 1.149486537305025, + "grad_norm": 0.1623869091272354, + "learning_rate": 8.166998673296222e-05, + "loss": 1.9786, + "step": 302000 + }, + { + "epoch": 1.1495245997731476, + "grad_norm": 0.1552773416042328, + "learning_rate": 8.162018850491043e-05, + "loss": 1.9606, + "step": 302010 + }, + { + "epoch": 1.1495626622412705, + "grad_norm": 0.1646445244550705, + "learning_rate": 8.157039620345536e-05, + "loss": 1.9741, + "step": 302020 + }, + { + "epoch": 1.1496007247093931, + "grad_norm": 0.18836228549480438, + "learning_rate": 8.15206098264814e-05, + "loss": 1.9618, + "step": 302030 + }, + { + "epoch": 1.1496387871775158, + "grad_norm": 0.1809222400188446, + "learning_rate": 8.147082937187439e-05, + "loss": 1.9648, + "step": 302040 + }, + { + "epoch": 1.1496768496456384, + "grad_norm": 0.16798196732997894, + "learning_rate": 8.142105483752133e-05, + "loss": 1.9748, + "step": 302050 + }, + { + "epoch": 1.149714912113761, + "grad_norm": 0.17388495802879333, + "learning_rate": 8.137128622131046e-05, + "loss": 1.9723, + "step": 302060 + }, + { + "epoch": 1.1497529745818837, + "grad_norm": 0.15599116683006287, + "learning_rate": 8.132152352113121e-05, + "loss": 1.963, + "step": 302070 + }, + { + "epoch": 1.1497910370500064, + "grad_norm": 0.1911918967962265, + "learning_rate": 8.127176673487451e-05, + "loss": 1.9852, + "step": 302080 + }, + { + "epoch": 1.149829099518129, + "grad_norm": 0.16465184092521667, + "learning_rate": 8.122201586043231e-05, + "loss": 1.9665, + "step": 302090 + }, + { + "epoch": 1.149867161986252, + "grad_norm": 0.1780809909105301, + "learning_rate": 8.11722708956979e-05, + "loss": 1.9684, + "step": 302100 + }, + { + "epoch": 1.1499052244543746, + "grad_norm": 0.20676323771476746, + "learning_rate": 8.112253183856577e-05, + "loss": 1.9681, + "step": 302110 + }, + { + "epoch": 1.1499432869224973, + "grad_norm": 0.19733954966068268, + "learning_rate": 8.107279868693179e-05, + "loss": 1.968, + "step": 302120 + }, + { + "epoch": 1.14998134939062, + "grad_norm": 0.17312826216220856, + "learning_rate": 8.102307143869287e-05, + "loss": 1.9565, + "step": 302130 + }, + { + "epoch": 1.1500194118587426, + "grad_norm": 0.2496890276670456, + "learning_rate": 8.097335009174734e-05, + "loss": 1.9626, + "step": 302140 + }, + { + "epoch": 1.1500574743268652, + "grad_norm": 0.18430748581886292, + "learning_rate": 8.092363464399477e-05, + "loss": 1.9772, + "step": 302150 + }, + { + "epoch": 1.1500955367949879, + "grad_norm": 0.16753272712230682, + "learning_rate": 8.087392509333585e-05, + "loss": 1.9665, + "step": 302160 + }, + { + "epoch": 1.1501335992631105, + "grad_norm": 0.18290819227695465, + "learning_rate": 8.08242214376726e-05, + "loss": 1.9639, + "step": 302170 + }, + { + "epoch": 1.1501716617312332, + "grad_norm": 0.24969084560871124, + "learning_rate": 8.077452367490834e-05, + "loss": 1.9856, + "step": 302180 + }, + { + "epoch": 1.150209724199356, + "grad_norm": 0.16676537692546844, + "learning_rate": 8.072483180294748e-05, + "loss": 1.955, + "step": 302190 + }, + { + "epoch": 1.1502477866674787, + "grad_norm": 0.19444943964481354, + "learning_rate": 8.067514581969587e-05, + "loss": 1.9693, + "step": 302200 + }, + { + "epoch": 1.1502858491356014, + "grad_norm": 0.1640620082616806, + "learning_rate": 8.06254657230604e-05, + "loss": 1.9651, + "step": 302210 + }, + { + "epoch": 1.150323911603724, + "grad_norm": 0.21142394840717316, + "learning_rate": 8.057579151094929e-05, + "loss": 1.9714, + "step": 302220 + }, + { + "epoch": 1.1503619740718467, + "grad_norm": 0.2291944921016693, + "learning_rate": 8.052612318127206e-05, + "loss": 1.966, + "step": 302230 + }, + { + "epoch": 1.1504000365399694, + "grad_norm": 0.16705965995788574, + "learning_rate": 8.047646073193937e-05, + "loss": 1.9574, + "step": 302240 + }, + { + "epoch": 1.150438099008092, + "grad_norm": 0.19383689761161804, + "learning_rate": 8.042680416086318e-05, + "loss": 1.9416, + "step": 302250 + }, + { + "epoch": 1.1504761614762147, + "grad_norm": 0.18747715651988983, + "learning_rate": 8.037715346595658e-05, + "loss": 1.9612, + "step": 302260 + }, + { + "epoch": 1.1505142239443376, + "grad_norm": 0.15447022020816803, + "learning_rate": 8.03275086451341e-05, + "loss": 1.9681, + "step": 302270 + }, + { + "epoch": 1.1505522864124602, + "grad_norm": 0.17916887998580933, + "learning_rate": 8.02778696963113e-05, + "loss": 1.973, + "step": 302280 + }, + { + "epoch": 1.1505903488805829, + "grad_norm": 0.19271798431873322, + "learning_rate": 8.022823661740497e-05, + "loss": 1.9645, + "step": 302290 + }, + { + "epoch": 1.1506284113487055, + "grad_norm": 0.4011487066745758, + "learning_rate": 8.017860940633337e-05, + "loss": 1.9622, + "step": 302300 + }, + { + "epoch": 1.1506664738168282, + "grad_norm": 0.20441265404224396, + "learning_rate": 8.012898806101571e-05, + "loss": 1.9643, + "step": 302310 + }, + { + "epoch": 1.1507045362849508, + "grad_norm": 0.20115308463573456, + "learning_rate": 8.007937257937264e-05, + "loss": 1.9782, + "step": 302320 + }, + { + "epoch": 1.1507425987530735, + "grad_norm": 0.1862449198961258, + "learning_rate": 8.00297629593259e-05, + "loss": 1.977, + "step": 302330 + }, + { + "epoch": 1.1507806612211962, + "grad_norm": 0.19222941994667053, + "learning_rate": 7.998015919879848e-05, + "loss": 1.9711, + "step": 302340 + }, + { + "epoch": 1.1508187236893188, + "grad_norm": 0.19031883776187897, + "learning_rate": 7.993056129571469e-05, + "loss": 1.9633, + "step": 302350 + }, + { + "epoch": 1.1508567861574415, + "grad_norm": 0.1577989161014557, + "learning_rate": 7.988096924799992e-05, + "loss": 1.9626, + "step": 302360 + }, + { + "epoch": 1.1508948486255643, + "grad_norm": 0.1963740736246109, + "learning_rate": 7.98313830535809e-05, + "loss": 1.9553, + "step": 302370 + }, + { + "epoch": 1.150932911093687, + "grad_norm": 0.19436369836330414, + "learning_rate": 7.978180271038555e-05, + "loss": 1.9551, + "step": 302380 + }, + { + "epoch": 1.1509709735618097, + "grad_norm": 0.16641823947429657, + "learning_rate": 7.973222821634301e-05, + "loss": 1.9818, + "step": 302390 + }, + { + "epoch": 1.1510090360299323, + "grad_norm": 0.15929053723812103, + "learning_rate": 7.968265956938365e-05, + "loss": 1.9607, + "step": 302400 + }, + { + "epoch": 1.151047098498055, + "grad_norm": 0.18515542149543762, + "learning_rate": 7.963309676743897e-05, + "loss": 1.9773, + "step": 302410 + }, + { + "epoch": 1.1510851609661776, + "grad_norm": 0.1987186074256897, + "learning_rate": 7.958353980844186e-05, + "loss": 1.962, + "step": 302420 + }, + { + "epoch": 1.1511232234343003, + "grad_norm": 0.16314023733139038, + "learning_rate": 7.953398869032624e-05, + "loss": 1.9514, + "step": 302430 + }, + { + "epoch": 1.1511612859024232, + "grad_norm": 0.23564612865447998, + "learning_rate": 7.948444341102751e-05, + "loss": 1.9583, + "step": 302440 + }, + { + "epoch": 1.1511993483705458, + "grad_norm": 0.1620829701423645, + "learning_rate": 7.943490396848196e-05, + "loss": 1.9682, + "step": 302450 + }, + { + "epoch": 1.1512374108386685, + "grad_norm": 0.16987600922584534, + "learning_rate": 7.938537036062731e-05, + "loss": 1.9683, + "step": 302460 + }, + { + "epoch": 1.1512754733067911, + "grad_norm": 0.1522493213415146, + "learning_rate": 7.933584258540244e-05, + "loss": 1.9538, + "step": 302470 + }, + { + "epoch": 1.1513135357749138, + "grad_norm": 0.2353200763463974, + "learning_rate": 7.928632064074742e-05, + "loss": 1.9714, + "step": 302480 + }, + { + "epoch": 1.1513515982430365, + "grad_norm": 0.16485443711280823, + "learning_rate": 7.923680452460358e-05, + "loss": 1.9737, + "step": 302490 + }, + { + "epoch": 1.1513896607111591, + "grad_norm": 0.16668550670146942, + "learning_rate": 7.918729423491338e-05, + "loss": 1.9749, + "step": 302500 + }, + { + "epoch": 1.1514277231792818, + "grad_norm": 0.20765995979309082, + "learning_rate": 7.913778976962066e-05, + "loss": 1.9772, + "step": 302510 + }, + { + "epoch": 1.1514657856474044, + "grad_norm": 0.20101527869701385, + "learning_rate": 7.908829112667027e-05, + "loss": 1.9733, + "step": 302520 + }, + { + "epoch": 1.151503848115527, + "grad_norm": 0.2320103794336319, + "learning_rate": 7.90387983040084e-05, + "loss": 1.9716, + "step": 302530 + }, + { + "epoch": 1.15154191058365, + "grad_norm": 0.17331230640411377, + "learning_rate": 7.898931129958232e-05, + "loss": 1.9588, + "step": 302540 + }, + { + "epoch": 1.1515799730517726, + "grad_norm": 0.16984319686889648, + "learning_rate": 7.893983011134065e-05, + "loss": 1.9793, + "step": 302550 + }, + { + "epoch": 1.1516180355198953, + "grad_norm": 0.23322752118110657, + "learning_rate": 7.889035473723316e-05, + "loss": 1.9725, + "step": 302560 + }, + { + "epoch": 1.151656097988018, + "grad_norm": 0.16987261176109314, + "learning_rate": 7.884088517521077e-05, + "loss": 1.9758, + "step": 302570 + }, + { + "epoch": 1.1516941604561406, + "grad_norm": 0.17440181970596313, + "learning_rate": 7.87914214232257e-05, + "loss": 1.9754, + "step": 302580 + }, + { + "epoch": 1.1517322229242632, + "grad_norm": 0.175144761800766, + "learning_rate": 7.874196347923124e-05, + "loss": 1.9655, + "step": 302590 + }, + { + "epoch": 1.151770285392386, + "grad_norm": 0.15603378415107727, + "learning_rate": 7.869251134118211e-05, + "loss": 1.9788, + "step": 302600 + }, + { + "epoch": 1.1518083478605088, + "grad_norm": 0.16844221949577332, + "learning_rate": 7.864306500703388e-05, + "loss": 1.9644, + "step": 302610 + }, + { + "epoch": 1.1518464103286314, + "grad_norm": 0.21338878571987152, + "learning_rate": 7.859362447474367e-05, + "loss": 1.9785, + "step": 302620 + }, + { + "epoch": 1.151884472796754, + "grad_norm": 0.2597297728061676, + "learning_rate": 7.854418974226968e-05, + "loss": 1.9643, + "step": 302630 + }, + { + "epoch": 1.1519225352648768, + "grad_norm": 0.1646358072757721, + "learning_rate": 7.849476080757112e-05, + "loss": 1.9587, + "step": 302640 + }, + { + "epoch": 1.1519605977329994, + "grad_norm": 0.17903150618076324, + "learning_rate": 7.844533766860873e-05, + "loss": 1.9807, + "step": 302650 + }, + { + "epoch": 1.151998660201122, + "grad_norm": 0.20694409310817719, + "learning_rate": 7.839592032334414e-05, + "loss": 1.9693, + "step": 302660 + }, + { + "epoch": 1.1520367226692447, + "grad_norm": 0.16047678887844086, + "learning_rate": 7.834650876974037e-05, + "loss": 1.9586, + "step": 302670 + }, + { + "epoch": 1.1520747851373674, + "grad_norm": 0.16605503857135773, + "learning_rate": 7.829710300576149e-05, + "loss": 1.9728, + "step": 302680 + }, + { + "epoch": 1.15211284760549, + "grad_norm": 0.20652402937412262, + "learning_rate": 7.824770302937296e-05, + "loss": 1.9672, + "step": 302690 + }, + { + "epoch": 1.1521509100736127, + "grad_norm": 0.27568838000297546, + "learning_rate": 7.819830883854123e-05, + "loss": 1.9635, + "step": 302700 + }, + { + "epoch": 1.1521889725417356, + "grad_norm": 0.21026568114757538, + "learning_rate": 7.814892043123401e-05, + "loss": 1.9642, + "step": 302710 + }, + { + "epoch": 1.1522270350098582, + "grad_norm": 0.1902541071176529, + "learning_rate": 7.809953780542029e-05, + "loss": 1.9524, + "step": 302720 + }, + { + "epoch": 1.152265097477981, + "grad_norm": 0.2355373501777649, + "learning_rate": 7.805016095907009e-05, + "loss": 1.9549, + "step": 302730 + }, + { + "epoch": 1.1523031599461036, + "grad_norm": 0.2074352353811264, + "learning_rate": 7.800078989015475e-05, + "loss": 1.965, + "step": 302740 + }, + { + "epoch": 1.1523412224142262, + "grad_norm": 0.1499231606721878, + "learning_rate": 7.795142459664667e-05, + "loss": 1.9625, + "step": 302750 + }, + { + "epoch": 1.1523792848823489, + "grad_norm": 0.17020770907402039, + "learning_rate": 7.790206507651964e-05, + "loss": 1.9787, + "step": 302760 + }, + { + "epoch": 1.1524173473504715, + "grad_norm": 0.2170293778181076, + "learning_rate": 7.78527113277484e-05, + "loss": 1.9478, + "step": 302770 + }, + { + "epoch": 1.1524554098185944, + "grad_norm": 0.15761138498783112, + "learning_rate": 7.780336334830901e-05, + "loss": 1.9772, + "step": 302780 + }, + { + "epoch": 1.152493472286717, + "grad_norm": 0.15958812832832336, + "learning_rate": 7.775402113617869e-05, + "loss": 1.9716, + "step": 302790 + }, + { + "epoch": 1.1525315347548397, + "grad_norm": 0.16891001164913177, + "learning_rate": 7.770468468933572e-05, + "loss": 1.9664, + "step": 302800 + }, + { + "epoch": 1.1525695972229624, + "grad_norm": 0.20234356820583344, + "learning_rate": 7.76553540057599e-05, + "loss": 1.97, + "step": 302810 + }, + { + "epoch": 1.152607659691085, + "grad_norm": 0.19146844744682312, + "learning_rate": 7.760602908343178e-05, + "loss": 1.9494, + "step": 302820 + }, + { + "epoch": 1.1526457221592077, + "grad_norm": 0.15346533060073853, + "learning_rate": 7.755670992033342e-05, + "loss": 1.9651, + "step": 302830 + }, + { + "epoch": 1.1526837846273303, + "grad_norm": 0.1874997764825821, + "learning_rate": 7.750739651444782e-05, + "loss": 1.9729, + "step": 302840 + }, + { + "epoch": 1.152721847095453, + "grad_norm": 0.1829904466867447, + "learning_rate": 7.745808886375938e-05, + "loss": 1.9689, + "step": 302850 + }, + { + "epoch": 1.1527599095635757, + "grad_norm": 0.18967437744140625, + "learning_rate": 7.740878696625348e-05, + "loss": 1.9536, + "step": 302860 + }, + { + "epoch": 1.1527979720316983, + "grad_norm": 0.2254030406475067, + "learning_rate": 7.735949081991678e-05, + "loss": 1.9757, + "step": 302870 + }, + { + "epoch": 1.1528360344998212, + "grad_norm": 0.18517284095287323, + "learning_rate": 7.731020042273713e-05, + "loss": 1.9689, + "step": 302880 + }, + { + "epoch": 1.1528740969679439, + "grad_norm": 0.16112570464611053, + "learning_rate": 7.726091577270344e-05, + "loss": 1.9783, + "step": 302890 + }, + { + "epoch": 1.1529121594360665, + "grad_norm": 0.18462646007537842, + "learning_rate": 7.721163686780592e-05, + "loss": 1.9542, + "step": 302900 + }, + { + "epoch": 1.1529502219041892, + "grad_norm": 0.16934382915496826, + "learning_rate": 7.716236370603591e-05, + "loss": 1.9459, + "step": 302910 + }, + { + "epoch": 1.1529882843723118, + "grad_norm": 0.25239670276641846, + "learning_rate": 7.711309628538582e-05, + "loss": 1.9621, + "step": 302920 + }, + { + "epoch": 1.1530263468404345, + "grad_norm": 0.24700672924518585, + "learning_rate": 7.706383460384947e-05, + "loss": 1.975, + "step": 302930 + }, + { + "epoch": 1.1530644093085571, + "grad_norm": 0.19068533182144165, + "learning_rate": 7.701457865942157e-05, + "loss": 1.9715, + "step": 302940 + }, + { + "epoch": 1.1531024717766798, + "grad_norm": 0.1721537709236145, + "learning_rate": 7.696532845009818e-05, + "loss": 1.9569, + "step": 302950 + }, + { + "epoch": 1.1531405342448027, + "grad_norm": 0.1698552519083023, + "learning_rate": 7.691608397387639e-05, + "loss": 1.9763, + "step": 302960 + }, + { + "epoch": 1.1531785967129253, + "grad_norm": 0.22980883717536926, + "learning_rate": 7.686684522875464e-05, + "loss": 1.9647, + "step": 302970 + }, + { + "epoch": 1.153216659181048, + "grad_norm": 0.17749278247356415, + "learning_rate": 7.681761221273237e-05, + "loss": 1.9646, + "step": 302980 + }, + { + "epoch": 1.1532547216491706, + "grad_norm": 0.1919885128736496, + "learning_rate": 7.676838492381028e-05, + "loss": 1.9735, + "step": 302990 + }, + { + "epoch": 1.1532927841172933, + "grad_norm": 0.17048048973083496, + "learning_rate": 7.671916335999019e-05, + "loss": 1.9717, + "step": 303000 + }, + { + "epoch": 1.153330846585416, + "grad_norm": 0.17630760371685028, + "learning_rate": 7.666994751927508e-05, + "loss": 1.9608, + "step": 303010 + }, + { + "epoch": 1.1533689090535386, + "grad_norm": 0.2684946358203888, + "learning_rate": 7.662073739966907e-05, + "loss": 1.9576, + "step": 303020 + }, + { + "epoch": 1.1534069715216613, + "grad_norm": 0.2561207115650177, + "learning_rate": 7.657153299917746e-05, + "loss": 1.9509, + "step": 303030 + }, + { + "epoch": 1.153445033989784, + "grad_norm": 0.16998520493507385, + "learning_rate": 7.652233431580674e-05, + "loss": 1.9554, + "step": 303040 + }, + { + "epoch": 1.1534830964579068, + "grad_norm": 0.16677066683769226, + "learning_rate": 7.647314134756462e-05, + "loss": 1.9666, + "step": 303050 + }, + { + "epoch": 1.1535211589260295, + "grad_norm": 0.19740892946720123, + "learning_rate": 7.642395409245972e-05, + "loss": 1.9617, + "step": 303060 + }, + { + "epoch": 1.1535592213941521, + "grad_norm": 0.16956956684589386, + "learning_rate": 7.637477254850211e-05, + "loss": 1.9576, + "step": 303070 + }, + { + "epoch": 1.1535972838622748, + "grad_norm": 0.18850646913051605, + "learning_rate": 7.632559671370282e-05, + "loss": 1.9697, + "step": 303080 + }, + { + "epoch": 1.1536353463303974, + "grad_norm": 0.23317211866378784, + "learning_rate": 7.627642658607408e-05, + "loss": 1.9564, + "step": 303090 + }, + { + "epoch": 1.15367340879852, + "grad_norm": 0.20958870649337769, + "learning_rate": 7.622726216362935e-05, + "loss": 1.9667, + "step": 303100 + }, + { + "epoch": 1.1537114712666428, + "grad_norm": 0.17249689996242523, + "learning_rate": 7.617810344438313e-05, + "loss": 1.9644, + "step": 303110 + }, + { + "epoch": 1.1537495337347654, + "grad_norm": 0.17346225678920746, + "learning_rate": 7.612895042635115e-05, + "loss": 1.9599, + "step": 303120 + }, + { + "epoch": 1.1537875962028883, + "grad_norm": 0.20161131024360657, + "learning_rate": 7.60798031075503e-05, + "loss": 1.9571, + "step": 303130 + }, + { + "epoch": 1.153825658671011, + "grad_norm": 0.21738913655281067, + "learning_rate": 7.603066148599852e-05, + "loss": 1.9649, + "step": 303140 + }, + { + "epoch": 1.1538637211391336, + "grad_norm": 0.18385431170463562, + "learning_rate": 7.598152555971499e-05, + "loss": 1.9635, + "step": 303150 + }, + { + "epoch": 1.1539017836072563, + "grad_norm": 0.16730673611164093, + "learning_rate": 7.593239532671992e-05, + "loss": 1.9565, + "step": 303160 + }, + { + "epoch": 1.153939846075379, + "grad_norm": 0.2135828733444214, + "learning_rate": 7.588327078503493e-05, + "loss": 1.9588, + "step": 303170 + }, + { + "epoch": 1.1539779085435016, + "grad_norm": 0.16073031723499298, + "learning_rate": 7.58341519326825e-05, + "loss": 1.9711, + "step": 303180 + }, + { + "epoch": 1.1540159710116242, + "grad_norm": 0.19444327056407928, + "learning_rate": 7.578503876768639e-05, + "loss": 1.9652, + "step": 303190 + }, + { + "epoch": 1.1540540334797469, + "grad_norm": 0.23536929488182068, + "learning_rate": 7.573593128807149e-05, + "loss": 1.9553, + "step": 303200 + }, + { + "epoch": 1.1540920959478695, + "grad_norm": 0.18366196751594543, + "learning_rate": 7.568682949186378e-05, + "loss": 1.9519, + "step": 303210 + }, + { + "epoch": 1.1541301584159922, + "grad_norm": 0.15574891865253448, + "learning_rate": 7.563773337709045e-05, + "loss": 1.966, + "step": 303220 + }, + { + "epoch": 1.154168220884115, + "grad_norm": 0.15715356171131134, + "learning_rate": 7.558864294177986e-05, + "loss": 1.962, + "step": 303230 + }, + { + "epoch": 1.1542062833522377, + "grad_norm": 0.180779829621315, + "learning_rate": 7.553955818396141e-05, + "loss": 1.9712, + "step": 303240 + }, + { + "epoch": 1.1542443458203604, + "grad_norm": 0.21225744485855103, + "learning_rate": 7.549047910166567e-05, + "loss": 1.9687, + "step": 303250 + }, + { + "epoch": 1.154282408288483, + "grad_norm": 0.19442957639694214, + "learning_rate": 7.54414056929244e-05, + "loss": 1.9653, + "step": 303260 + }, + { + "epoch": 1.1543204707566057, + "grad_norm": 0.2081403285264969, + "learning_rate": 7.539233795577044e-05, + "loss": 1.9541, + "step": 303270 + }, + { + "epoch": 1.1543585332247284, + "grad_norm": 0.20303623378276825, + "learning_rate": 7.534327588823775e-05, + "loss": 1.9758, + "step": 303280 + }, + { + "epoch": 1.154396595692851, + "grad_norm": 0.18361127376556396, + "learning_rate": 7.529421948836157e-05, + "loss": 1.9541, + "step": 303290 + }, + { + "epoch": 1.154434658160974, + "grad_norm": 0.16972817480564117, + "learning_rate": 7.524516875417809e-05, + "loss": 1.9627, + "step": 303300 + }, + { + "epoch": 1.1544727206290966, + "grad_norm": 0.17593394219875336, + "learning_rate": 7.51961236837247e-05, + "loss": 1.9843, + "step": 303310 + }, + { + "epoch": 1.1545107830972192, + "grad_norm": 0.16857187449932098, + "learning_rate": 7.514708427503997e-05, + "loss": 1.9515, + "step": 303320 + }, + { + "epoch": 1.1545488455653419, + "grad_norm": 0.1726546436548233, + "learning_rate": 7.509805052616358e-05, + "loss": 1.9772, + "step": 303330 + }, + { + "epoch": 1.1545869080334645, + "grad_norm": 0.17818816006183624, + "learning_rate": 7.504902243513623e-05, + "loss": 1.9621, + "step": 303340 + }, + { + "epoch": 1.1546249705015872, + "grad_norm": 0.15913088619709015, + "learning_rate": 7.500000000000001e-05, + "loss": 1.9798, + "step": 303350 + }, + { + "epoch": 1.1546630329697098, + "grad_norm": 0.17664912343025208, + "learning_rate": 7.495098321879785e-05, + "loss": 1.9676, + "step": 303360 + }, + { + "epoch": 1.1547010954378325, + "grad_norm": 0.16069193184375763, + "learning_rate": 7.4901972089574e-05, + "loss": 1.9605, + "step": 303370 + }, + { + "epoch": 1.1547391579059552, + "grad_norm": 0.17986124753952026, + "learning_rate": 7.485296661037371e-05, + "loss": 1.9573, + "step": 303380 + }, + { + "epoch": 1.1547772203740778, + "grad_norm": 0.19635865092277527, + "learning_rate": 7.48039667792435e-05, + "loss": 1.9727, + "step": 303390 + }, + { + "epoch": 1.1548152828422007, + "grad_norm": 0.22032029926776886, + "learning_rate": 7.475497259423086e-05, + "loss": 1.954, + "step": 303400 + }, + { + "epoch": 1.1548533453103234, + "grad_norm": 0.22101953625679016, + "learning_rate": 7.470598405338452e-05, + "loss": 1.9699, + "step": 303410 + }, + { + "epoch": 1.154891407778446, + "grad_norm": 0.21217355132102966, + "learning_rate": 7.465700115475433e-05, + "loss": 1.9597, + "step": 303420 + }, + { + "epoch": 1.1549294702465687, + "grad_norm": 0.24058283865451813, + "learning_rate": 7.460802389639115e-05, + "loss": 1.9757, + "step": 303430 + }, + { + "epoch": 1.1549675327146913, + "grad_norm": 0.1930130273103714, + "learning_rate": 7.455905227634707e-05, + "loss": 1.9633, + "step": 303440 + }, + { + "epoch": 1.155005595182814, + "grad_norm": 0.24008707702159882, + "learning_rate": 7.451008629267525e-05, + "loss": 1.9592, + "step": 303450 + }, + { + "epoch": 1.1550436576509366, + "grad_norm": 0.18197044730186462, + "learning_rate": 7.446112594343002e-05, + "loss": 1.9622, + "step": 303460 + }, + { + "epoch": 1.1550817201190595, + "grad_norm": 0.1568765491247177, + "learning_rate": 7.441217122666678e-05, + "loss": 1.9822, + "step": 303470 + }, + { + "epoch": 1.1551197825871822, + "grad_norm": 0.19632264971733093, + "learning_rate": 7.436322214044206e-05, + "loss": 1.9763, + "step": 303480 + }, + { + "epoch": 1.1551578450553048, + "grad_norm": 0.17583273351192474, + "learning_rate": 7.431427868281354e-05, + "loss": 1.9582, + "step": 303490 + }, + { + "epoch": 1.1551959075234275, + "grad_norm": 0.24570678174495697, + "learning_rate": 7.426534085184e-05, + "loss": 1.9578, + "step": 303500 + }, + { + "epoch": 1.1552339699915501, + "grad_norm": 0.17563122510910034, + "learning_rate": 7.421640864558126e-05, + "loss": 1.96, + "step": 303510 + }, + { + "epoch": 1.1552720324596728, + "grad_norm": 0.1920919120311737, + "learning_rate": 7.416748206209834e-05, + "loss": 1.98, + "step": 303520 + }, + { + "epoch": 1.1553100949277955, + "grad_norm": 0.22609946131706238, + "learning_rate": 7.411856109945342e-05, + "loss": 1.983, + "step": 303530 + }, + { + "epoch": 1.1553481573959181, + "grad_norm": 0.24520838260650635, + "learning_rate": 7.406964575570962e-05, + "loss": 1.9539, + "step": 303540 + }, + { + "epoch": 1.1553862198640408, + "grad_norm": 0.18873053789138794, + "learning_rate": 7.402073602893139e-05, + "loss": 1.9559, + "step": 303550 + }, + { + "epoch": 1.1554242823321634, + "grad_norm": 0.16989383101463318, + "learning_rate": 7.397183191718415e-05, + "loss": 1.9777, + "step": 303560 + }, + { + "epoch": 1.1554623448002863, + "grad_norm": 0.16686667501926422, + "learning_rate": 7.39229334185344e-05, + "loss": 1.9763, + "step": 303570 + }, + { + "epoch": 1.155500407268409, + "grad_norm": 0.19937288761138916, + "learning_rate": 7.387404053104985e-05, + "loss": 1.9618, + "step": 303580 + }, + { + "epoch": 1.1555384697365316, + "grad_norm": 0.29525628685951233, + "learning_rate": 7.382515325279932e-05, + "loss": 1.9504, + "step": 303590 + }, + { + "epoch": 1.1555765322046543, + "grad_norm": 0.19494149088859558, + "learning_rate": 7.377627158185262e-05, + "loss": 1.9646, + "step": 303600 + }, + { + "epoch": 1.155614594672777, + "grad_norm": 0.17477074265480042, + "learning_rate": 7.372739551628077e-05, + "loss": 1.9841, + "step": 303610 + }, + { + "epoch": 1.1556526571408996, + "grad_norm": 0.18820302188396454, + "learning_rate": 7.36785250541559e-05, + "loss": 1.9751, + "step": 303620 + }, + { + "epoch": 1.1556907196090223, + "grad_norm": 0.17995865643024445, + "learning_rate": 7.362966019355116e-05, + "loss": 1.9578, + "step": 303630 + }, + { + "epoch": 1.1557287820771451, + "grad_norm": 0.20242054760456085, + "learning_rate": 7.358080093254088e-05, + "loss": 1.9558, + "step": 303640 + }, + { + "epoch": 1.1557668445452678, + "grad_norm": 0.16632670164108276, + "learning_rate": 7.353194726920048e-05, + "loss": 1.9689, + "step": 303650 + }, + { + "epoch": 1.1558049070133904, + "grad_norm": 0.19060280919075012, + "learning_rate": 7.348309920160646e-05, + "loss": 1.98, + "step": 303660 + }, + { + "epoch": 1.155842969481513, + "grad_norm": 0.19867919385433197, + "learning_rate": 7.343425672783649e-05, + "loss": 1.9562, + "step": 303670 + }, + { + "epoch": 1.1558810319496358, + "grad_norm": 0.16830924153327942, + "learning_rate": 7.338541984596919e-05, + "loss": 1.9738, + "step": 303680 + }, + { + "epoch": 1.1559190944177584, + "grad_norm": 0.22567281126976013, + "learning_rate": 7.333658855408443e-05, + "loss": 1.954, + "step": 303690 + }, + { + "epoch": 1.155957156885881, + "grad_norm": 0.1821957528591156, + "learning_rate": 7.328776285026307e-05, + "loss": 1.955, + "step": 303700 + }, + { + "epoch": 1.1559952193540037, + "grad_norm": 0.18949563801288605, + "learning_rate": 7.323894273258719e-05, + "loss": 1.9618, + "step": 303710 + }, + { + "epoch": 1.1560332818221264, + "grad_norm": 0.19841155409812927, + "learning_rate": 7.319012819913984e-05, + "loss": 1.9636, + "step": 303720 + }, + { + "epoch": 1.156071344290249, + "grad_norm": 0.2149336189031601, + "learning_rate": 7.314131924800532e-05, + "loss": 1.9625, + "step": 303730 + }, + { + "epoch": 1.156109406758372, + "grad_norm": 0.21468611061573029, + "learning_rate": 7.309251587726878e-05, + "loss": 1.9596, + "step": 303740 + }, + { + "epoch": 1.1561474692264946, + "grad_norm": 0.18424129486083984, + "learning_rate": 7.304371808501675e-05, + "loss": 1.9603, + "step": 303750 + }, + { + "epoch": 1.1561855316946172, + "grad_norm": 0.22661066055297852, + "learning_rate": 7.299492586933664e-05, + "loss": 1.9705, + "step": 303760 + }, + { + "epoch": 1.15622359416274, + "grad_norm": 0.1968468725681305, + "learning_rate": 7.2946139228317e-05, + "loss": 1.9627, + "step": 303770 + }, + { + "epoch": 1.1562616566308626, + "grad_norm": 0.21646060049533844, + "learning_rate": 7.289735816004762e-05, + "loss": 1.9668, + "step": 303780 + }, + { + "epoch": 1.1562997190989852, + "grad_norm": 0.19419118762016296, + "learning_rate": 7.284858266261913e-05, + "loss": 1.9674, + "step": 303790 + }, + { + "epoch": 1.1563377815671079, + "grad_norm": 0.1530521959066391, + "learning_rate": 7.279981273412346e-05, + "loss": 1.9683, + "step": 303800 + }, + { + "epoch": 1.1563758440352305, + "grad_norm": 0.16724202036857605, + "learning_rate": 7.275104837265351e-05, + "loss": 1.9522, + "step": 303810 + }, + { + "epoch": 1.1564139065033534, + "grad_norm": 0.19780854880809784, + "learning_rate": 7.270228957630338e-05, + "loss": 1.9632, + "step": 303820 + }, + { + "epoch": 1.156451968971476, + "grad_norm": 0.20006397366523743, + "learning_rate": 7.265353634316801e-05, + "loss": 1.9701, + "step": 303830 + }, + { + "epoch": 1.1564900314395987, + "grad_norm": 0.1747836023569107, + "learning_rate": 7.260478867134385e-05, + "loss": 1.9603, + "step": 303840 + }, + { + "epoch": 1.1565280939077214, + "grad_norm": 0.16377757489681244, + "learning_rate": 7.2556046558928e-05, + "loss": 1.9596, + "step": 303850 + }, + { + "epoch": 1.156566156375844, + "grad_norm": 0.16101588308811188, + "learning_rate": 7.250731000401889e-05, + "loss": 1.9651, + "step": 303860 + }, + { + "epoch": 1.1566042188439667, + "grad_norm": 0.1737445890903473, + "learning_rate": 7.245857900471603e-05, + "loss": 1.9536, + "step": 303870 + }, + { + "epoch": 1.1566422813120893, + "grad_norm": 0.1697576940059662, + "learning_rate": 7.240985355911989e-05, + "loss": 1.9601, + "step": 303880 + }, + { + "epoch": 1.156680343780212, + "grad_norm": 0.20912794768810272, + "learning_rate": 7.236113366533204e-05, + "loss": 1.957, + "step": 303890 + }, + { + "epoch": 1.1567184062483347, + "grad_norm": 0.2573632299900055, + "learning_rate": 7.231241932145532e-05, + "loss": 1.9675, + "step": 303900 + }, + { + "epoch": 1.1567564687164575, + "grad_norm": 0.17370592057704926, + "learning_rate": 7.226371052559344e-05, + "loss": 1.9684, + "step": 303910 + }, + { + "epoch": 1.1567945311845802, + "grad_norm": 0.16651690006256104, + "learning_rate": 7.221500727585123e-05, + "loss": 1.9663, + "step": 303920 + }, + { + "epoch": 1.1568325936527029, + "grad_norm": 0.19553054869174957, + "learning_rate": 7.216630957033471e-05, + "loss": 1.9673, + "step": 303930 + }, + { + "epoch": 1.1568706561208255, + "grad_norm": 0.22437965869903564, + "learning_rate": 7.211761740715089e-05, + "loss": 1.97, + "step": 303940 + }, + { + "epoch": 1.1569087185889482, + "grad_norm": 0.17220978438854218, + "learning_rate": 7.206893078440773e-05, + "loss": 1.9674, + "step": 303950 + }, + { + "epoch": 1.1569467810570708, + "grad_norm": 0.17623186111450195, + "learning_rate": 7.202024970021458e-05, + "loss": 1.9689, + "step": 303960 + }, + { + "epoch": 1.1569848435251935, + "grad_norm": 0.17047753930091858, + "learning_rate": 7.197157415268164e-05, + "loss": 1.9631, + "step": 303970 + }, + { + "epoch": 1.1570229059933161, + "grad_norm": 0.16086052358150482, + "learning_rate": 7.192290413992014e-05, + "loss": 1.9626, + "step": 303980 + }, + { + "epoch": 1.157060968461439, + "grad_norm": 0.1761477291584015, + "learning_rate": 7.187423966004258e-05, + "loss": 1.9639, + "step": 303990 + }, + { + "epoch": 1.1570990309295617, + "grad_norm": 0.2322581708431244, + "learning_rate": 7.182558071116241e-05, + "loss": 1.9552, + "step": 304000 + }, + { + "epoch": 1.1571370933976843, + "grad_norm": 0.20044724643230438, + "learning_rate": 7.177692729139407e-05, + "loss": 1.9746, + "step": 304010 + }, + { + "epoch": 1.157175155865807, + "grad_norm": 0.19975224137306213, + "learning_rate": 7.172827939885328e-05, + "loss": 1.9616, + "step": 304020 + }, + { + "epoch": 1.1572132183339296, + "grad_norm": 0.19980034232139587, + "learning_rate": 7.16796370316567e-05, + "loss": 1.9614, + "step": 304030 + }, + { + "epoch": 1.1572512808020523, + "grad_norm": 0.22720147669315338, + "learning_rate": 7.163100018792213e-05, + "loss": 1.9604, + "step": 304040 + }, + { + "epoch": 1.157289343270175, + "grad_norm": 0.19307057559490204, + "learning_rate": 7.158236886576825e-05, + "loss": 1.9668, + "step": 304050 + }, + { + "epoch": 1.1573274057382976, + "grad_norm": 0.20916832983493805, + "learning_rate": 7.153374306331506e-05, + "loss": 1.963, + "step": 304060 + }, + { + "epoch": 1.1573654682064203, + "grad_norm": 0.23292502760887146, + "learning_rate": 7.14851227786834e-05, + "loss": 1.9586, + "step": 304070 + }, + { + "epoch": 1.157403530674543, + "grad_norm": 0.19604690372943878, + "learning_rate": 7.143650800999546e-05, + "loss": 1.9666, + "step": 304080 + }, + { + "epoch": 1.1574415931426658, + "grad_norm": 0.24700568616390228, + "learning_rate": 7.138789875537421e-05, + "loss": 1.9708, + "step": 304090 + }, + { + "epoch": 1.1574796556107885, + "grad_norm": 0.18617315590381622, + "learning_rate": 7.133929501294385e-05, + "loss": 1.9697, + "step": 304100 + }, + { + "epoch": 1.1575177180789111, + "grad_norm": 0.1749623566865921, + "learning_rate": 7.12906967808295e-05, + "loss": 1.9645, + "step": 304110 + }, + { + "epoch": 1.1575557805470338, + "grad_norm": 0.2002592533826828, + "learning_rate": 7.124210405715754e-05, + "loss": 1.9637, + "step": 304120 + }, + { + "epoch": 1.1575938430151564, + "grad_norm": 0.22793354094028473, + "learning_rate": 7.11935168400552e-05, + "loss": 1.9692, + "step": 304130 + }, + { + "epoch": 1.157631905483279, + "grad_norm": 0.24129308760166168, + "learning_rate": 7.114493512765102e-05, + "loss": 1.9641, + "step": 304140 + }, + { + "epoch": 1.1576699679514018, + "grad_norm": 0.2590607702732086, + "learning_rate": 7.109635891807431e-05, + "loss": 1.9537, + "step": 304150 + }, + { + "epoch": 1.1577080304195246, + "grad_norm": 0.18408513069152832, + "learning_rate": 7.104778820945567e-05, + "loss": 1.9552, + "step": 304160 + }, + { + "epoch": 1.1577460928876473, + "grad_norm": 0.1907476782798767, + "learning_rate": 7.099922299992667e-05, + "loss": 1.9644, + "step": 304170 + }, + { + "epoch": 1.15778415535577, + "grad_norm": 0.1599113494157791, + "learning_rate": 7.09506632876199e-05, + "loss": 1.9782, + "step": 304180 + }, + { + "epoch": 1.1578222178238926, + "grad_norm": 0.16144965589046478, + "learning_rate": 7.090210907066902e-05, + "loss": 1.9544, + "step": 304190 + }, + { + "epoch": 1.1578602802920153, + "grad_norm": 0.27204036712646484, + "learning_rate": 7.085356034720891e-05, + "loss": 1.963, + "step": 304200 + }, + { + "epoch": 1.157898342760138, + "grad_norm": 0.23324240744113922, + "learning_rate": 7.080501711537524e-05, + "loss": 1.9775, + "step": 304210 + }, + { + "epoch": 1.1579364052282606, + "grad_norm": 0.18385274708271027, + "learning_rate": 7.075647937330492e-05, + "loss": 1.9564, + "step": 304220 + }, + { + "epoch": 1.1579744676963832, + "grad_norm": 0.19012723863124847, + "learning_rate": 7.070794711913586e-05, + "loss": 1.9635, + "step": 304230 + }, + { + "epoch": 1.158012530164506, + "grad_norm": 0.16869176924228668, + "learning_rate": 7.065942035100697e-05, + "loss": 1.9797, + "step": 304240 + }, + { + "epoch": 1.1580505926326286, + "grad_norm": 0.16135039925575256, + "learning_rate": 7.061089906705831e-05, + "loss": 1.9657, + "step": 304250 + }, + { + "epoch": 1.1580886551007514, + "grad_norm": 0.21593551337718964, + "learning_rate": 7.056238326543091e-05, + "loss": 1.9672, + "step": 304260 + }, + { + "epoch": 1.158126717568874, + "grad_norm": 0.17168347537517548, + "learning_rate": 7.051387294426686e-05, + "loss": 1.9796, + "step": 304270 + }, + { + "epoch": 1.1581647800369967, + "grad_norm": 0.16949261724948883, + "learning_rate": 7.046536810170939e-05, + "loss": 1.9613, + "step": 304280 + }, + { + "epoch": 1.1582028425051194, + "grad_norm": 0.17988714575767517, + "learning_rate": 7.04168687359027e-05, + "loss": 1.9675, + "step": 304290 + }, + { + "epoch": 1.158240904973242, + "grad_norm": 0.17011313140392303, + "learning_rate": 7.036837484499197e-05, + "loss": 1.9513, + "step": 304300 + }, + { + "epoch": 1.1582789674413647, + "grad_norm": 0.15754835307598114, + "learning_rate": 7.031988642712356e-05, + "loss": 1.9462, + "step": 304310 + }, + { + "epoch": 1.1583170299094874, + "grad_norm": 0.17138944566249847, + "learning_rate": 7.027140348044481e-05, + "loss": 1.9552, + "step": 304320 + }, + { + "epoch": 1.1583550923776103, + "grad_norm": 0.17376810312271118, + "learning_rate": 7.022292600310409e-05, + "loss": 1.9686, + "step": 304330 + }, + { + "epoch": 1.158393154845733, + "grad_norm": 0.17283578217029572, + "learning_rate": 7.01744539932509e-05, + "loss": 1.9541, + "step": 304340 + }, + { + "epoch": 1.1584312173138556, + "grad_norm": 0.17546117305755615, + "learning_rate": 7.012598744903565e-05, + "loss": 1.9617, + "step": 304350 + }, + { + "epoch": 1.1584692797819782, + "grad_norm": 0.21400034427642822, + "learning_rate": 7.007752636860987e-05, + "loss": 1.9661, + "step": 304360 + }, + { + "epoch": 1.1585073422501009, + "grad_norm": 0.1719067394733429, + "learning_rate": 7.002907075012615e-05, + "loss": 1.9594, + "step": 304370 + }, + { + "epoch": 1.1585454047182235, + "grad_norm": 0.17050892114639282, + "learning_rate": 6.998062059173816e-05, + "loss": 1.958, + "step": 304380 + }, + { + "epoch": 1.1585834671863462, + "grad_norm": 0.16611924767494202, + "learning_rate": 6.993217589160039e-05, + "loss": 1.965, + "step": 304390 + }, + { + "epoch": 1.1586215296544689, + "grad_norm": 0.1858852505683899, + "learning_rate": 6.988373664786868e-05, + "loss": 1.9479, + "step": 304400 + }, + { + "epoch": 1.1586595921225915, + "grad_norm": 0.22573794424533844, + "learning_rate": 6.983530285869965e-05, + "loss": 1.9637, + "step": 304410 + }, + { + "epoch": 1.1586976545907142, + "grad_norm": 0.23732437193393707, + "learning_rate": 6.978687452225108e-05, + "loss": 1.9365, + "step": 304420 + }, + { + "epoch": 1.158735717058837, + "grad_norm": 0.18224066495895386, + "learning_rate": 6.97384516366818e-05, + "loss": 1.9747, + "step": 304430 + }, + { + "epoch": 1.1587737795269597, + "grad_norm": 0.17750975489616394, + "learning_rate": 6.969003420015163e-05, + "loss": 1.9698, + "step": 304440 + }, + { + "epoch": 1.1588118419950824, + "grad_norm": 0.1634315848350525, + "learning_rate": 6.964162221082143e-05, + "loss": 1.9638, + "step": 304450 + }, + { + "epoch": 1.158849904463205, + "grad_norm": 0.19310379028320312, + "learning_rate": 6.959321566685312e-05, + "loss": 1.9784, + "step": 304460 + }, + { + "epoch": 1.1588879669313277, + "grad_norm": 0.1664789468050003, + "learning_rate": 6.954481456640965e-05, + "loss": 1.9676, + "step": 304470 + }, + { + "epoch": 1.1589260293994503, + "grad_norm": 0.16946376860141754, + "learning_rate": 6.949641890765496e-05, + "loss": 1.9643, + "step": 304480 + }, + { + "epoch": 1.158964091867573, + "grad_norm": 0.16697698831558228, + "learning_rate": 6.9448028688754e-05, + "loss": 1.975, + "step": 304490 + }, + { + "epoch": 1.1590021543356959, + "grad_norm": 0.15951833128929138, + "learning_rate": 6.93996439078729e-05, + "loss": 1.9594, + "step": 304500 + }, + { + "epoch": 1.1590402168038185, + "grad_norm": 0.15703727304935455, + "learning_rate": 6.935126456317864e-05, + "loss": 1.9526, + "step": 304510 + }, + { + "epoch": 1.1590782792719412, + "grad_norm": 0.20362144708633423, + "learning_rate": 6.93028906528394e-05, + "loss": 1.9527, + "step": 304520 + }, + { + "epoch": 1.1591163417400638, + "grad_norm": 0.1691470742225647, + "learning_rate": 6.925452217502426e-05, + "loss": 1.9588, + "step": 304530 + }, + { + "epoch": 1.1591544042081865, + "grad_norm": 0.2185783088207245, + "learning_rate": 6.920615912790335e-05, + "loss": 1.9514, + "step": 304540 + }, + { + "epoch": 1.1591924666763092, + "grad_norm": 0.2571600377559662, + "learning_rate": 6.915780150964784e-05, + "loss": 1.9577, + "step": 304550 + }, + { + "epoch": 1.1592305291444318, + "grad_norm": 0.22867870330810547, + "learning_rate": 6.910944931842993e-05, + "loss": 1.9614, + "step": 304560 + }, + { + "epoch": 1.1592685916125545, + "grad_norm": 0.18404366075992584, + "learning_rate": 6.906110255242293e-05, + "loss": 1.97, + "step": 304570 + }, + { + "epoch": 1.1593066540806771, + "grad_norm": 0.1827774941921234, + "learning_rate": 6.901276120980104e-05, + "loss": 1.978, + "step": 304580 + }, + { + "epoch": 1.1593447165487998, + "grad_norm": 0.16489237546920776, + "learning_rate": 6.896442528873947e-05, + "loss": 1.9562, + "step": 304590 + }, + { + "epoch": 1.1593827790169227, + "grad_norm": 0.17081385850906372, + "learning_rate": 6.891609478741461e-05, + "loss": 1.962, + "step": 304600 + }, + { + "epoch": 1.1594208414850453, + "grad_norm": 0.17646686732769012, + "learning_rate": 6.886776970400366e-05, + "loss": 1.9628, + "step": 304610 + }, + { + "epoch": 1.159458903953168, + "grad_norm": 0.16338463127613068, + "learning_rate": 6.88194500366851e-05, + "loss": 1.9712, + "step": 304620 + }, + { + "epoch": 1.1594969664212906, + "grad_norm": 0.16763192415237427, + "learning_rate": 6.87711357836383e-05, + "loss": 1.9664, + "step": 304630 + }, + { + "epoch": 1.1595350288894133, + "grad_norm": 0.229514941573143, + "learning_rate": 6.87228269430435e-05, + "loss": 1.9557, + "step": 304640 + }, + { + "epoch": 1.159573091357536, + "grad_norm": 0.17145362496376038, + "learning_rate": 6.867452351308224e-05, + "loss": 1.9482, + "step": 304650 + }, + { + "epoch": 1.1596111538256586, + "grad_norm": 0.17883513867855072, + "learning_rate": 6.862622549193687e-05, + "loss": 1.9748, + "step": 304660 + }, + { + "epoch": 1.1596492162937813, + "grad_norm": 0.21892984211444855, + "learning_rate": 6.857793287779074e-05, + "loss": 1.9617, + "step": 304670 + }, + { + "epoch": 1.1596872787619041, + "grad_norm": 0.2010929137468338, + "learning_rate": 6.852964566882852e-05, + "loss": 1.957, + "step": 304680 + }, + { + "epoch": 1.1597253412300268, + "grad_norm": 0.24334891140460968, + "learning_rate": 6.84813638632355e-05, + "loss": 1.9722, + "step": 304690 + }, + { + "epoch": 1.1597634036981495, + "grad_norm": 0.199227973818779, + "learning_rate": 6.843308745919829e-05, + "loss": 1.9589, + "step": 304700 + }, + { + "epoch": 1.159801466166272, + "grad_norm": 0.18418475985527039, + "learning_rate": 6.83848164549043e-05, + "loss": 1.9581, + "step": 304710 + }, + { + "epoch": 1.1598395286343948, + "grad_norm": 0.17104840278625488, + "learning_rate": 6.833655084854206e-05, + "loss": 1.9706, + "step": 304720 + }, + { + "epoch": 1.1598775911025174, + "grad_norm": 0.18676508963108063, + "learning_rate": 6.828829063830106e-05, + "loss": 1.9681, + "step": 304730 + }, + { + "epoch": 1.15991565357064, + "grad_norm": 0.1584375649690628, + "learning_rate": 6.824003582237192e-05, + "loss": 1.9613, + "step": 304740 + }, + { + "epoch": 1.1599537160387627, + "grad_norm": 0.17978356778621674, + "learning_rate": 6.819178639894619e-05, + "loss": 1.9759, + "step": 304750 + }, + { + "epoch": 1.1599917785068854, + "grad_norm": 0.3203710913658142, + "learning_rate": 6.814354236621634e-05, + "loss": 1.9691, + "step": 304760 + }, + { + "epoch": 1.1600298409750083, + "grad_norm": 0.18568219244480133, + "learning_rate": 6.809530372237599e-05, + "loss": 1.979, + "step": 304770 + }, + { + "epoch": 1.160067903443131, + "grad_norm": 0.16148421168327332, + "learning_rate": 6.80470704656197e-05, + "loss": 1.9631, + "step": 304780 + }, + { + "epoch": 1.1601059659112536, + "grad_norm": 0.16459301114082336, + "learning_rate": 6.799884259414301e-05, + "loss": 1.9657, + "step": 304790 + }, + { + "epoch": 1.1601440283793762, + "grad_norm": 0.1900845319032669, + "learning_rate": 6.795062010614268e-05, + "loss": 1.9622, + "step": 304800 + }, + { + "epoch": 1.160182090847499, + "grad_norm": 0.17193518579006195, + "learning_rate": 6.79024029998161e-05, + "loss": 1.966, + "step": 304810 + }, + { + "epoch": 1.1602201533156216, + "grad_norm": 0.15967318415641785, + "learning_rate": 6.785419127336201e-05, + "loss": 1.9692, + "step": 304820 + }, + { + "epoch": 1.1602582157837442, + "grad_norm": 0.18435196578502655, + "learning_rate": 6.780598492497997e-05, + "loss": 1.9687, + "step": 304830 + }, + { + "epoch": 1.1602962782518669, + "grad_norm": 0.18528448045253754, + "learning_rate": 6.775778395287058e-05, + "loss": 1.9525, + "step": 304840 + }, + { + "epoch": 1.1603343407199898, + "grad_norm": 0.17487603425979614, + "learning_rate": 6.770958835523545e-05, + "loss": 1.9672, + "step": 304850 + }, + { + "epoch": 1.1603724031881124, + "grad_norm": 0.19005072116851807, + "learning_rate": 6.766139813027721e-05, + "loss": 1.9568, + "step": 304860 + }, + { + "epoch": 1.160410465656235, + "grad_norm": 0.18123486638069153, + "learning_rate": 6.761321327619951e-05, + "loss": 1.952, + "step": 304870 + }, + { + "epoch": 1.1604485281243577, + "grad_norm": 0.17274688184261322, + "learning_rate": 6.756503379120693e-05, + "loss": 1.9593, + "step": 304880 + }, + { + "epoch": 1.1604865905924804, + "grad_norm": 0.21813954412937164, + "learning_rate": 6.751685967350512e-05, + "loss": 1.9536, + "step": 304890 + }, + { + "epoch": 1.160524653060603, + "grad_norm": 0.1760721057653427, + "learning_rate": 6.746869092130071e-05, + "loss": 1.9694, + "step": 304900 + }, + { + "epoch": 1.1605627155287257, + "grad_norm": 0.1831454485654831, + "learning_rate": 6.74205275328012e-05, + "loss": 1.9586, + "step": 304910 + }, + { + "epoch": 1.1606007779968484, + "grad_norm": 0.16150572896003723, + "learning_rate": 6.737236950621533e-05, + "loss": 1.9704, + "step": 304920 + }, + { + "epoch": 1.160638840464971, + "grad_norm": 0.19636644423007965, + "learning_rate": 6.73242168397527e-05, + "loss": 1.9555, + "step": 304930 + }, + { + "epoch": 1.160676902933094, + "grad_norm": 0.19180063903331757, + "learning_rate": 6.72760695316239e-05, + "loss": 1.9542, + "step": 304940 + }, + { + "epoch": 1.1607149654012165, + "grad_norm": 0.16178670525550842, + "learning_rate": 6.722792758004054e-05, + "loss": 1.9506, + "step": 304950 + }, + { + "epoch": 1.1607530278693392, + "grad_norm": 0.2005394548177719, + "learning_rate": 6.717979098321514e-05, + "loss": 1.9667, + "step": 304960 + }, + { + "epoch": 1.1607910903374619, + "grad_norm": 0.18609443306922913, + "learning_rate": 6.713165973936136e-05, + "loss": 1.9624, + "step": 304970 + }, + { + "epoch": 1.1608291528055845, + "grad_norm": 0.16814211010932922, + "learning_rate": 6.708353384669386e-05, + "loss": 1.9541, + "step": 304980 + }, + { + "epoch": 1.1608672152737072, + "grad_norm": 0.20419470965862274, + "learning_rate": 6.70354133034281e-05, + "loss": 1.971, + "step": 304990 + }, + { + "epoch": 1.1609052777418298, + "grad_norm": 0.17679406702518463, + "learning_rate": 6.69872981077807e-05, + "loss": 1.9572, + "step": 305000 + }, + { + "epoch": 1.1609433402099525, + "grad_norm": 0.16615521907806396, + "learning_rate": 6.693918825796918e-05, + "loss": 1.9707, + "step": 305010 + }, + { + "epoch": 1.1609814026780754, + "grad_norm": 0.15864615142345428, + "learning_rate": 6.689108375221215e-05, + "loss": 1.9472, + "step": 305020 + }, + { + "epoch": 1.161019465146198, + "grad_norm": 0.17157498002052307, + "learning_rate": 6.684298458872912e-05, + "loss": 1.9614, + "step": 305030 + }, + { + "epoch": 1.1610575276143207, + "grad_norm": 0.20575599372386932, + "learning_rate": 6.679489076574052e-05, + "loss": 1.9593, + "step": 305040 + }, + { + "epoch": 1.1610955900824433, + "grad_norm": 0.17096468806266785, + "learning_rate": 6.674680228146807e-05, + "loss": 1.9675, + "step": 305050 + }, + { + "epoch": 1.161133652550566, + "grad_norm": 0.24533316493034363, + "learning_rate": 6.669871913413411e-05, + "loss": 1.9692, + "step": 305060 + }, + { + "epoch": 1.1611717150186887, + "grad_norm": 0.17510323226451874, + "learning_rate": 6.665064132196219e-05, + "loss": 1.9595, + "step": 305070 + }, + { + "epoch": 1.1612097774868113, + "grad_norm": 0.19663465023040771, + "learning_rate": 6.660256884317678e-05, + "loss": 1.9644, + "step": 305080 + }, + { + "epoch": 1.161247839954934, + "grad_norm": 0.191786527633667, + "learning_rate": 6.65545016960033e-05, + "loss": 1.9567, + "step": 305090 + }, + { + "epoch": 1.1612859024230566, + "grad_norm": 0.21058157086372375, + "learning_rate": 6.650643987866823e-05, + "loss": 1.9684, + "step": 305100 + }, + { + "epoch": 1.1613239648911793, + "grad_norm": 0.26317745447158813, + "learning_rate": 6.645838338939902e-05, + "loss": 1.9668, + "step": 305110 + }, + { + "epoch": 1.1613620273593022, + "grad_norm": 0.1573423594236374, + "learning_rate": 6.641033222642401e-05, + "loss": 1.9637, + "step": 305120 + }, + { + "epoch": 1.1614000898274248, + "grad_norm": 0.1872624158859253, + "learning_rate": 6.636228638797265e-05, + "loss": 1.968, + "step": 305130 + }, + { + "epoch": 1.1614381522955475, + "grad_norm": 0.17351549863815308, + "learning_rate": 6.631424587227525e-05, + "loss": 1.9619, + "step": 305140 + }, + { + "epoch": 1.1614762147636701, + "grad_norm": 0.25747624039649963, + "learning_rate": 6.626621067756322e-05, + "loss": 1.9572, + "step": 305150 + }, + { + "epoch": 1.1615142772317928, + "grad_norm": 0.1605093777179718, + "learning_rate": 6.621818080206881e-05, + "loss": 1.9513, + "step": 305160 + }, + { + "epoch": 1.1615523396999154, + "grad_norm": 0.27054768800735474, + "learning_rate": 6.617015624402539e-05, + "loss": 1.9632, + "step": 305170 + }, + { + "epoch": 1.161590402168038, + "grad_norm": 0.31650516390800476, + "learning_rate": 6.612213700166726e-05, + "loss": 1.9643, + "step": 305180 + }, + { + "epoch": 1.161628464636161, + "grad_norm": 0.2560306191444397, + "learning_rate": 6.607412307322963e-05, + "loss": 1.9683, + "step": 305190 + }, + { + "epoch": 1.1616665271042836, + "grad_norm": 0.17595048248767853, + "learning_rate": 6.602611445694878e-05, + "loss": 1.9727, + "step": 305200 + }, + { + "epoch": 1.1617045895724063, + "grad_norm": 0.1622474044561386, + "learning_rate": 6.597811115106189e-05, + "loss": 1.9543, + "step": 305210 + }, + { + "epoch": 1.161742652040529, + "grad_norm": 0.15321014821529388, + "learning_rate": 6.593011315380715e-05, + "loss": 1.9747, + "step": 305220 + }, + { + "epoch": 1.1617807145086516, + "grad_norm": 0.19443893432617188, + "learning_rate": 6.588212046342373e-05, + "loss": 1.9778, + "step": 305230 + }, + { + "epoch": 1.1618187769767743, + "grad_norm": 0.19640414416790009, + "learning_rate": 6.583413307815178e-05, + "loss": 1.9603, + "step": 305240 + }, + { + "epoch": 1.161856839444897, + "grad_norm": 0.2108943611383438, + "learning_rate": 6.578615099623241e-05, + "loss": 1.9596, + "step": 305250 + }, + { + "epoch": 1.1618949019130196, + "grad_norm": 0.19390849769115448, + "learning_rate": 6.573817421590772e-05, + "loss": 1.9599, + "step": 305260 + }, + { + "epoch": 1.1619329643811422, + "grad_norm": 0.19029496610164642, + "learning_rate": 6.56902027354207e-05, + "loss": 1.9529, + "step": 305270 + }, + { + "epoch": 1.161971026849265, + "grad_norm": 0.19222138822078705, + "learning_rate": 6.564223655301533e-05, + "loss": 1.9422, + "step": 305280 + }, + { + "epoch": 1.1620090893173878, + "grad_norm": 0.19947002828121185, + "learning_rate": 6.559427566693677e-05, + "loss": 1.9631, + "step": 305290 + }, + { + "epoch": 1.1620471517855104, + "grad_norm": 0.1722879707813263, + "learning_rate": 6.554632007543083e-05, + "loss": 1.9616, + "step": 305300 + }, + { + "epoch": 1.162085214253633, + "grad_norm": 0.23039458692073822, + "learning_rate": 6.54983697767445e-05, + "loss": 1.9565, + "step": 305310 + }, + { + "epoch": 1.1621232767217557, + "grad_norm": 0.22281059622764587, + "learning_rate": 6.545042476912567e-05, + "loss": 1.9498, + "step": 305320 + }, + { + "epoch": 1.1621613391898784, + "grad_norm": 0.19715505838394165, + "learning_rate": 6.540248505082314e-05, + "loss": 1.9547, + "step": 305330 + }, + { + "epoch": 1.162199401658001, + "grad_norm": 0.17719301581382751, + "learning_rate": 6.535455062008682e-05, + "loss": 1.9519, + "step": 305340 + }, + { + "epoch": 1.1622374641261237, + "grad_norm": 0.17887887358665466, + "learning_rate": 6.530662147516747e-05, + "loss": 1.9764, + "step": 305350 + }, + { + "epoch": 1.1622755265942466, + "grad_norm": 0.20813018083572388, + "learning_rate": 6.525869761431685e-05, + "loss": 1.9619, + "step": 305360 + }, + { + "epoch": 1.1623135890623693, + "grad_norm": 0.18251609802246094, + "learning_rate": 6.521077903578764e-05, + "loss": 1.9576, + "step": 305370 + }, + { + "epoch": 1.162351651530492, + "grad_norm": 0.1724395602941513, + "learning_rate": 6.516286573783359e-05, + "loss": 1.9643, + "step": 305380 + }, + { + "epoch": 1.1623897139986146, + "grad_norm": 0.19655396044254303, + "learning_rate": 6.511495771870928e-05, + "loss": 1.975, + "step": 305390 + }, + { + "epoch": 1.1624277764667372, + "grad_norm": 0.19053930044174194, + "learning_rate": 6.506705497667037e-05, + "loss": 1.955, + "step": 305400 + }, + { + "epoch": 1.1624658389348599, + "grad_norm": 0.17787426710128784, + "learning_rate": 6.501915750997339e-05, + "loss": 1.9564, + "step": 305410 + }, + { + "epoch": 1.1625039014029825, + "grad_norm": 0.1610625982284546, + "learning_rate": 6.497126531687586e-05, + "loss": 1.9546, + "step": 305420 + }, + { + "epoch": 1.1625419638711052, + "grad_norm": 0.16635026037693024, + "learning_rate": 6.492337839563633e-05, + "loss": 1.9731, + "step": 305430 + }, + { + "epoch": 1.1625800263392279, + "grad_norm": 0.2111584097146988, + "learning_rate": 6.487549674451415e-05, + "loss": 1.9607, + "step": 305440 + }, + { + "epoch": 1.1626180888073505, + "grad_norm": 0.24752797186374664, + "learning_rate": 6.482762036176976e-05, + "loss": 1.9543, + "step": 305450 + }, + { + "epoch": 1.1626561512754734, + "grad_norm": 0.1773507297039032, + "learning_rate": 6.477974924566449e-05, + "loss": 1.9613, + "step": 305460 + }, + { + "epoch": 1.162694213743596, + "grad_norm": 0.1826154589653015, + "learning_rate": 6.473188339446073e-05, + "loss": 1.9643, + "step": 305470 + }, + { + "epoch": 1.1627322762117187, + "grad_norm": 0.17847725749015808, + "learning_rate": 6.468402280642166e-05, + "loss": 1.9631, + "step": 305480 + }, + { + "epoch": 1.1627703386798414, + "grad_norm": 0.19692540168762207, + "learning_rate": 6.46361674798116e-05, + "loss": 1.9562, + "step": 305490 + }, + { + "epoch": 1.162808401147964, + "grad_norm": 0.22006727755069733, + "learning_rate": 6.458831741289561e-05, + "loss": 1.9556, + "step": 305500 + }, + { + "epoch": 1.1628464636160867, + "grad_norm": 0.22079311311244965, + "learning_rate": 6.454047260393991e-05, + "loss": 1.9466, + "step": 305510 + }, + { + "epoch": 1.1628845260842093, + "grad_norm": 0.17099638283252716, + "learning_rate": 6.449263305121155e-05, + "loss": 1.968, + "step": 305520 + }, + { + "epoch": 1.162922588552332, + "grad_norm": 0.19776278734207153, + "learning_rate": 6.444479875297859e-05, + "loss": 1.9651, + "step": 305530 + }, + { + "epoch": 1.1629606510204549, + "grad_norm": 0.16865932941436768, + "learning_rate": 6.439696970750997e-05, + "loss": 1.9672, + "step": 305540 + }, + { + "epoch": 1.1629987134885775, + "grad_norm": 0.21851569414138794, + "learning_rate": 6.434914591307561e-05, + "loss": 1.9699, + "step": 305550 + }, + { + "epoch": 1.1630367759567002, + "grad_norm": 0.1963483840227127, + "learning_rate": 6.430132736794653e-05, + "loss": 1.9727, + "step": 305560 + }, + { + "epoch": 1.1630748384248228, + "grad_norm": 0.18260200321674347, + "learning_rate": 6.425351407039438e-05, + "loss": 1.9569, + "step": 305570 + }, + { + "epoch": 1.1631129008929455, + "grad_norm": 0.22926168143749237, + "learning_rate": 6.420570601869202e-05, + "loss": 1.9607, + "step": 305580 + }, + { + "epoch": 1.1631509633610682, + "grad_norm": 0.16698819398880005, + "learning_rate": 6.415790321111326e-05, + "loss": 1.9588, + "step": 305590 + }, + { + "epoch": 1.1631890258291908, + "grad_norm": 0.168158158659935, + "learning_rate": 6.411010564593267e-05, + "loss": 1.9564, + "step": 305600 + }, + { + "epoch": 1.1632270882973135, + "grad_norm": 0.15741604566574097, + "learning_rate": 6.406231332142586e-05, + "loss": 1.9443, + "step": 305610 + }, + { + "epoch": 1.1632651507654361, + "grad_norm": 0.2450927495956421, + "learning_rate": 6.401452623586951e-05, + "loss": 1.9674, + "step": 305620 + }, + { + "epoch": 1.163303213233559, + "grad_norm": 0.16945163905620575, + "learning_rate": 6.396674438754102e-05, + "loss": 1.9597, + "step": 305630 + }, + { + "epoch": 1.1633412757016817, + "grad_norm": 0.18803608417510986, + "learning_rate": 6.391896777471889e-05, + "loss": 1.9671, + "step": 305640 + }, + { + "epoch": 1.1633793381698043, + "grad_norm": 0.1747310608625412, + "learning_rate": 6.387119639568251e-05, + "loss": 1.9599, + "step": 305650 + }, + { + "epoch": 1.163417400637927, + "grad_norm": 0.1990187019109726, + "learning_rate": 6.382343024871223e-05, + "loss": 1.9542, + "step": 305660 + }, + { + "epoch": 1.1634554631060496, + "grad_norm": 0.21996289491653442, + "learning_rate": 6.377566933208934e-05, + "loss": 1.9618, + "step": 305670 + }, + { + "epoch": 1.1634935255741723, + "grad_norm": 0.19845467805862427, + "learning_rate": 6.372791364409603e-05, + "loss": 1.9532, + "step": 305680 + }, + { + "epoch": 1.163531588042295, + "grad_norm": 0.18662182986736298, + "learning_rate": 6.368016318301551e-05, + "loss": 1.9682, + "step": 305690 + }, + { + "epoch": 1.1635696505104176, + "grad_norm": 0.17204593122005463, + "learning_rate": 6.363241794713175e-05, + "loss": 1.9621, + "step": 305700 + }, + { + "epoch": 1.1636077129785405, + "grad_norm": 0.157108873128891, + "learning_rate": 6.358467793473e-05, + "loss": 1.95, + "step": 305710 + }, + { + "epoch": 1.1636457754466631, + "grad_norm": 0.2213294804096222, + "learning_rate": 6.353694314409609e-05, + "loss": 1.9537, + "step": 305720 + }, + { + "epoch": 1.1636838379147858, + "grad_norm": 0.19779251515865326, + "learning_rate": 6.3489213573517e-05, + "loss": 1.9694, + "step": 305730 + }, + { + "epoch": 1.1637219003829085, + "grad_norm": 0.21293361485004425, + "learning_rate": 6.344148922128052e-05, + "loss": 1.9813, + "step": 305740 + }, + { + "epoch": 1.1637599628510311, + "grad_norm": 0.171671062707901, + "learning_rate": 6.339377008567548e-05, + "loss": 1.9632, + "step": 305750 + }, + { + "epoch": 1.1637980253191538, + "grad_norm": 0.19532741606235504, + "learning_rate": 6.334605616499162e-05, + "loss": 1.9601, + "step": 305760 + }, + { + "epoch": 1.1638360877872764, + "grad_norm": 0.17116063833236694, + "learning_rate": 6.329834745751956e-05, + "loss": 1.9591, + "step": 305770 + }, + { + "epoch": 1.163874150255399, + "grad_norm": 0.19638307392597198, + "learning_rate": 6.325064396155089e-05, + "loss": 1.9599, + "step": 305780 + }, + { + "epoch": 1.1639122127235217, + "grad_norm": 0.18028929829597473, + "learning_rate": 6.320294567537816e-05, + "loss": 1.9614, + "step": 305790 + }, + { + "epoch": 1.1639502751916446, + "grad_norm": 0.1766701340675354, + "learning_rate": 6.315525259729477e-05, + "loss": 1.9643, + "step": 305800 + }, + { + "epoch": 1.1639883376597673, + "grad_norm": 0.16383618116378784, + "learning_rate": 6.31075647255952e-05, + "loss": 1.9609, + "step": 305810 + }, + { + "epoch": 1.16402640012789, + "grad_norm": 0.2566216289997101, + "learning_rate": 6.305988205857465e-05, + "loss": 1.9509, + "step": 305820 + }, + { + "epoch": 1.1640644625960126, + "grad_norm": 0.19184941053390503, + "learning_rate": 6.30122045945295e-05, + "loss": 1.9533, + "step": 305830 + }, + { + "epoch": 1.1641025250641353, + "grad_norm": 0.1650419682264328, + "learning_rate": 6.296453233175686e-05, + "loss": 1.9633, + "step": 305840 + }, + { + "epoch": 1.164140587532258, + "grad_norm": 0.19763387739658356, + "learning_rate": 6.291686526855484e-05, + "loss": 1.9599, + "step": 305850 + }, + { + "epoch": 1.1641786500003806, + "grad_norm": 0.19652384519577026, + "learning_rate": 6.286920340322244e-05, + "loss": 1.9632, + "step": 305860 + }, + { + "epoch": 1.1642167124685032, + "grad_norm": 0.180561825633049, + "learning_rate": 6.282154673405965e-05, + "loss": 1.9648, + "step": 305870 + }, + { + "epoch": 1.164254774936626, + "grad_norm": 0.18484221398830414, + "learning_rate": 6.277389525936733e-05, + "loss": 1.9591, + "step": 305880 + }, + { + "epoch": 1.1642928374047488, + "grad_norm": 0.21535907685756683, + "learning_rate": 6.272624897744739e-05, + "loss": 1.9443, + "step": 305890 + }, + { + "epoch": 1.1643308998728714, + "grad_norm": 0.16880010068416595, + "learning_rate": 6.267860788660246e-05, + "loss": 1.9568, + "step": 305900 + }, + { + "epoch": 1.164368962340994, + "grad_norm": 0.19901631772518158, + "learning_rate": 6.263097198513623e-05, + "loss": 1.9607, + "step": 305910 + }, + { + "epoch": 1.1644070248091167, + "grad_norm": 0.16724003851413727, + "learning_rate": 6.258334127135335e-05, + "loss": 1.9688, + "step": 305920 + }, + { + "epoch": 1.1644450872772394, + "grad_norm": 0.16612428426742554, + "learning_rate": 6.253571574355927e-05, + "loss": 1.9612, + "step": 305930 + }, + { + "epoch": 1.164483149745362, + "grad_norm": 0.1706397384405136, + "learning_rate": 6.248809540006034e-05, + "loss": 1.9509, + "step": 305940 + }, + { + "epoch": 1.1645212122134847, + "grad_norm": 0.1672465056180954, + "learning_rate": 6.24404802391641e-05, + "loss": 1.9615, + "step": 305950 + }, + { + "epoch": 1.1645592746816074, + "grad_norm": 0.17334552109241486, + "learning_rate": 6.239287025917872e-05, + "loss": 1.9665, + "step": 305960 + }, + { + "epoch": 1.16459733714973, + "grad_norm": 0.16249677538871765, + "learning_rate": 6.234526545841335e-05, + "loss": 1.9494, + "step": 305970 + }, + { + "epoch": 1.164635399617853, + "grad_norm": 0.18276648223400116, + "learning_rate": 6.22976658351782e-05, + "loss": 1.9656, + "step": 305980 + }, + { + "epoch": 1.1646734620859756, + "grad_norm": 0.2105034440755844, + "learning_rate": 6.225007138778427e-05, + "loss": 1.9517, + "step": 305990 + }, + { + "epoch": 1.1647115245540982, + "grad_norm": 0.16858388483524323, + "learning_rate": 6.220248211454344e-05, + "loss": 1.9651, + "step": 306000 + }, + { + "epoch": 1.1647495870222209, + "grad_norm": 0.19777953624725342, + "learning_rate": 6.215489801376861e-05, + "loss": 1.9811, + "step": 306010 + }, + { + "epoch": 1.1647876494903435, + "grad_norm": 0.17725993692874908, + "learning_rate": 6.210731908377365e-05, + "loss": 1.9575, + "step": 306020 + }, + { + "epoch": 1.1648257119584662, + "grad_norm": 0.1922709047794342, + "learning_rate": 6.205974532287323e-05, + "loss": 1.9556, + "step": 306030 + }, + { + "epoch": 1.1648637744265888, + "grad_norm": 0.1779535710811615, + "learning_rate": 6.201217672938287e-05, + "loss": 1.9557, + "step": 306040 + }, + { + "epoch": 1.1649018368947117, + "grad_norm": 0.18894916772842407, + "learning_rate": 6.196461330161924e-05, + "loss": 1.9579, + "step": 306050 + }, + { + "epoch": 1.1649398993628344, + "grad_norm": 0.237869530916214, + "learning_rate": 6.191705503789962e-05, + "loss": 1.9727, + "step": 306060 + }, + { + "epoch": 1.164977961830957, + "grad_norm": 0.1724383383989334, + "learning_rate": 6.18695019365425e-05, + "loss": 1.9686, + "step": 306070 + }, + { + "epoch": 1.1650160242990797, + "grad_norm": 0.15821924805641174, + "learning_rate": 6.18219539958671e-05, + "loss": 1.959, + "step": 306080 + }, + { + "epoch": 1.1650540867672023, + "grad_norm": 0.14829407632350922, + "learning_rate": 6.177441121419363e-05, + "loss": 1.9554, + "step": 306090 + }, + { + "epoch": 1.165092149235325, + "grad_norm": 0.19108377397060394, + "learning_rate": 6.172687358984314e-05, + "loss": 1.9553, + "step": 306100 + }, + { + "epoch": 1.1651302117034477, + "grad_norm": 0.1724005788564682, + "learning_rate": 6.167934112113766e-05, + "loss": 1.95, + "step": 306110 + }, + { + "epoch": 1.1651682741715703, + "grad_norm": 0.22939327359199524, + "learning_rate": 6.16318138064001e-05, + "loss": 1.9561, + "step": 306120 + }, + { + "epoch": 1.165206336639693, + "grad_norm": 0.18707819283008575, + "learning_rate": 6.158429164395429e-05, + "loss": 1.9672, + "step": 306130 + }, + { + "epoch": 1.1652443991078156, + "grad_norm": 0.17966648936271667, + "learning_rate": 6.153677463212493e-05, + "loss": 1.9533, + "step": 306140 + }, + { + "epoch": 1.1652824615759385, + "grad_norm": 0.18556983768939972, + "learning_rate": 6.148926276923772e-05, + "loss": 1.9532, + "step": 306150 + }, + { + "epoch": 1.1653205240440612, + "grad_norm": 0.18378737568855286, + "learning_rate": 6.144175605361912e-05, + "loss": 1.9693, + "step": 306160 + }, + { + "epoch": 1.1653585865121838, + "grad_norm": 0.18640461564064026, + "learning_rate": 6.139425448359665e-05, + "loss": 1.9565, + "step": 306170 + }, + { + "epoch": 1.1653966489803065, + "grad_norm": 0.1846592277288437, + "learning_rate": 6.134675805749856e-05, + "loss": 1.9603, + "step": 306180 + }, + { + "epoch": 1.1654347114484291, + "grad_norm": 0.1734076589345932, + "learning_rate": 6.129926677365427e-05, + "loss": 1.9705, + "step": 306190 + }, + { + "epoch": 1.1654727739165518, + "grad_norm": 0.19703339040279388, + "learning_rate": 6.125178063039388e-05, + "loss": 1.9509, + "step": 306200 + }, + { + "epoch": 1.1655108363846745, + "grad_norm": 0.1685038059949875, + "learning_rate": 6.120429962604845e-05, + "loss": 1.9646, + "step": 306210 + }, + { + "epoch": 1.1655488988527973, + "grad_norm": 0.16940629482269287, + "learning_rate": 6.115682375894994e-05, + "loss": 1.9422, + "step": 306220 + }, + { + "epoch": 1.16558696132092, + "grad_norm": 0.24949117004871368, + "learning_rate": 6.110935302743121e-05, + "loss": 1.9393, + "step": 306230 + }, + { + "epoch": 1.1656250237890426, + "grad_norm": 0.17381447553634644, + "learning_rate": 6.106188742982605e-05, + "loss": 1.9629, + "step": 306240 + }, + { + "epoch": 1.1656630862571653, + "grad_norm": 0.18676424026489258, + "learning_rate": 6.101442696446918e-05, + "loss": 1.9591, + "step": 306250 + }, + { + "epoch": 1.165701148725288, + "grad_norm": 0.1781821846961975, + "learning_rate": 6.0966971629696167e-05, + "loss": 1.9586, + "step": 306260 + }, + { + "epoch": 1.1657392111934106, + "grad_norm": 0.19924336671829224, + "learning_rate": 6.091952142384344e-05, + "loss": 1.9532, + "step": 306270 + }, + { + "epoch": 1.1657772736615333, + "grad_norm": 0.213816300034523, + "learning_rate": 6.087207634524844e-05, + "loss": 1.9674, + "step": 306280 + }, + { + "epoch": 1.165815336129656, + "grad_norm": 0.19950461387634277, + "learning_rate": 6.0824636392249335e-05, + "loss": 1.9465, + "step": 306290 + }, + { + "epoch": 1.1658533985977786, + "grad_norm": 0.1716541051864624, + "learning_rate": 6.077720156318539e-05, + "loss": 1.9426, + "step": 306300 + }, + { + "epoch": 1.1658914610659012, + "grad_norm": 0.22820167243480682, + "learning_rate": 6.072977185639666e-05, + "loss": 1.9545, + "step": 306310 + }, + { + "epoch": 1.1659295235340241, + "grad_norm": 0.27457278966903687, + "learning_rate": 6.068234727022409e-05, + "loss": 1.9686, + "step": 306320 + }, + { + "epoch": 1.1659675860021468, + "grad_norm": 0.18272149562835693, + "learning_rate": 6.06349278030095e-05, + "loss": 1.9673, + "step": 306330 + }, + { + "epoch": 1.1660056484702694, + "grad_norm": 0.5270007252693176, + "learning_rate": 6.058751345309571e-05, + "loss": 1.9778, + "step": 306340 + }, + { + "epoch": 1.166043710938392, + "grad_norm": 0.17469589412212372, + "learning_rate": 6.0540104218826385e-05, + "loss": 1.9526, + "step": 306350 + }, + { + "epoch": 1.1660817734065148, + "grad_norm": 0.21892161667346954, + "learning_rate": 6.049270009854596e-05, + "loss": 1.9649, + "step": 306360 + }, + { + "epoch": 1.1661198358746374, + "grad_norm": 0.24794617295265198, + "learning_rate": 6.044530109059992e-05, + "loss": 1.9562, + "step": 306370 + }, + { + "epoch": 1.16615789834276, + "grad_norm": 0.17378780245780945, + "learning_rate": 6.039790719333465e-05, + "loss": 1.9506, + "step": 306380 + }, + { + "epoch": 1.1661959608108827, + "grad_norm": 0.20277273654937744, + "learning_rate": 6.035051840509731e-05, + "loss": 1.9633, + "step": 306390 + }, + { + "epoch": 1.1662340232790056, + "grad_norm": 0.23824036121368408, + "learning_rate": 6.030313472423604e-05, + "loss": 1.957, + "step": 306400 + }, + { + "epoch": 1.1662720857471283, + "grad_norm": 0.20040448009967804, + "learning_rate": 6.025575614909978e-05, + "loss": 1.9453, + "step": 306410 + }, + { + "epoch": 1.166310148215251, + "grad_norm": 0.16171133518218994, + "learning_rate": 6.0208382678038465e-05, + "loss": 1.9657, + "step": 306420 + }, + { + "epoch": 1.1663482106833736, + "grad_norm": 0.2363443672657013, + "learning_rate": 6.0161014309402854e-05, + "loss": 1.9467, + "step": 306430 + }, + { + "epoch": 1.1663862731514962, + "grad_norm": 0.22468698024749756, + "learning_rate": 6.0113651041544594e-05, + "loss": 1.9753, + "step": 306440 + }, + { + "epoch": 1.166424335619619, + "grad_norm": 0.17263805866241455, + "learning_rate": 6.006629287281629e-05, + "loss": 1.9626, + "step": 306450 + }, + { + "epoch": 1.1664623980877415, + "grad_norm": 0.1878524273633957, + "learning_rate": 6.00189398015713e-05, + "loss": 1.9435, + "step": 306460 + }, + { + "epoch": 1.1665004605558642, + "grad_norm": 0.24534186720848083, + "learning_rate": 5.9971591826164016e-05, + "loss": 1.9672, + "step": 306470 + }, + { + "epoch": 1.1665385230239869, + "grad_norm": 0.16337037086486816, + "learning_rate": 5.992424894494963e-05, + "loss": 1.9647, + "step": 306480 + }, + { + "epoch": 1.1665765854921097, + "grad_norm": 0.1678982526063919, + "learning_rate": 5.9876911156284185e-05, + "loss": 1.9514, + "step": 306490 + }, + { + "epoch": 1.1666146479602324, + "grad_norm": 0.1978108137845993, + "learning_rate": 5.982957845852477e-05, + "loss": 1.9553, + "step": 306500 + }, + { + "epoch": 1.166652710428355, + "grad_norm": 0.2141515016555786, + "learning_rate": 5.978225085002914e-05, + "loss": 1.9429, + "step": 306510 + }, + { + "epoch": 1.1666907728964777, + "grad_norm": 0.19807757437229156, + "learning_rate": 5.9734928329156115e-05, + "loss": 1.9559, + "step": 306520 + }, + { + "epoch": 1.1667288353646004, + "grad_norm": 0.23704470694065094, + "learning_rate": 5.968761089426522e-05, + "loss": 1.9582, + "step": 306530 + }, + { + "epoch": 1.166766897832723, + "grad_norm": 0.2034177929162979, + "learning_rate": 5.9640298543717056e-05, + "loss": 1.9695, + "step": 306540 + }, + { + "epoch": 1.1668049603008457, + "grad_norm": 0.24106188118457794, + "learning_rate": 5.9592991275872924e-05, + "loss": 1.9601, + "step": 306550 + }, + { + "epoch": 1.1668430227689683, + "grad_norm": 0.8812771439552307, + "learning_rate": 5.954568908909519e-05, + "loss": 1.9633, + "step": 306560 + }, + { + "epoch": 1.1668810852370912, + "grad_norm": 0.18400143086910248, + "learning_rate": 5.9498391981746946e-05, + "loss": 1.9466, + "step": 306570 + }, + { + "epoch": 1.1669191477052139, + "grad_norm": 0.2550337016582489, + "learning_rate": 5.9451099952192225e-05, + "loss": 1.9706, + "step": 306580 + }, + { + "epoch": 1.1669572101733365, + "grad_norm": 0.2880820333957672, + "learning_rate": 5.940381299879594e-05, + "loss": 1.9525, + "step": 306590 + }, + { + "epoch": 1.1669952726414592, + "grad_norm": 0.2965538203716278, + "learning_rate": 5.935653111992384e-05, + "loss": 1.9653, + "step": 306600 + }, + { + "epoch": 1.1670333351095818, + "grad_norm": 0.19963648915290833, + "learning_rate": 5.9309254313942576e-05, + "loss": 1.9417, + "step": 306610 + }, + { + "epoch": 1.1670713975777045, + "grad_norm": 0.17213891446590424, + "learning_rate": 5.926198257921972e-05, + "loss": 1.9497, + "step": 306620 + }, + { + "epoch": 1.1671094600458272, + "grad_norm": 0.18678686022758484, + "learning_rate": 5.92147159141237e-05, + "loss": 1.9553, + "step": 306630 + }, + { + "epoch": 1.1671475225139498, + "grad_norm": 0.1734098196029663, + "learning_rate": 5.916745431702375e-05, + "loss": 1.966, + "step": 306640 + }, + { + "epoch": 1.1671855849820725, + "grad_norm": 1.143925428390503, + "learning_rate": 5.9120197786290075e-05, + "loss": 1.9547, + "step": 306650 + }, + { + "epoch": 1.1672236474501954, + "grad_norm": 0.19502100348472595, + "learning_rate": 5.9072946320293644e-05, + "loss": 1.9582, + "step": 306660 + }, + { + "epoch": 1.167261709918318, + "grad_norm": 0.2442169040441513, + "learning_rate": 5.902569991740636e-05, + "loss": 1.956, + "step": 306670 + }, + { + "epoch": 1.1672997723864407, + "grad_norm": 0.16955025494098663, + "learning_rate": 5.8978458576001096e-05, + "loss": 1.9544, + "step": 306680 + }, + { + "epoch": 1.1673378348545633, + "grad_norm": 0.19239042699337006, + "learning_rate": 5.8931222294451356e-05, + "loss": 1.9573, + "step": 306690 + }, + { + "epoch": 1.167375897322686, + "grad_norm": 0.17494860291481018, + "learning_rate": 5.888399107113179e-05, + "loss": 1.9526, + "step": 306700 + }, + { + "epoch": 1.1674139597908086, + "grad_norm": 0.16352291405200958, + "learning_rate": 5.883676490441775e-05, + "loss": 1.9569, + "step": 306710 + }, + { + "epoch": 1.1674520222589313, + "grad_norm": 0.17652276158332825, + "learning_rate": 5.878954379268542e-05, + "loss": 1.9541, + "step": 306720 + }, + { + "epoch": 1.167490084727054, + "grad_norm": 0.17422421276569366, + "learning_rate": 5.874232773431199e-05, + "loss": 1.9595, + "step": 306730 + }, + { + "epoch": 1.1675281471951768, + "grad_norm": 0.16758720576763153, + "learning_rate": 5.869511672767541e-05, + "loss": 1.9633, + "step": 306740 + }, + { + "epoch": 1.1675662096632995, + "grad_norm": 0.2105366438627243, + "learning_rate": 5.8647910771154664e-05, + "loss": 1.9614, + "step": 306750 + }, + { + "epoch": 1.1676042721314221, + "grad_norm": 0.1798621416091919, + "learning_rate": 5.860070986312932e-05, + "loss": 1.9383, + "step": 306760 + }, + { + "epoch": 1.1676423345995448, + "grad_norm": 0.1724506914615631, + "learning_rate": 5.855351400198011e-05, + "loss": 1.9671, + "step": 306770 + }, + { + "epoch": 1.1676803970676675, + "grad_norm": 0.1844310313463211, + "learning_rate": 5.85063231860884e-05, + "loss": 1.9546, + "step": 306780 + }, + { + "epoch": 1.1677184595357901, + "grad_norm": 0.22345036268234253, + "learning_rate": 5.845913741383646e-05, + "loss": 1.9597, + "step": 306790 + }, + { + "epoch": 1.1677565220039128, + "grad_norm": 0.20319147408008575, + "learning_rate": 5.841195668360766e-05, + "loss": 1.9604, + "step": 306800 + }, + { + "epoch": 1.1677945844720354, + "grad_norm": 0.19100074470043182, + "learning_rate": 5.836478099378595e-05, + "loss": 1.9528, + "step": 306810 + }, + { + "epoch": 1.167832646940158, + "grad_norm": 0.1862342208623886, + "learning_rate": 5.831761034275623e-05, + "loss": 1.9621, + "step": 306820 + }, + { + "epoch": 1.1678707094082807, + "grad_norm": 0.16333100199699402, + "learning_rate": 5.827044472890431e-05, + "loss": 1.9546, + "step": 306830 + }, + { + "epoch": 1.1679087718764036, + "grad_norm": 0.25692513585090637, + "learning_rate": 5.822328415061684e-05, + "loss": 1.9577, + "step": 306840 + }, + { + "epoch": 1.1679468343445263, + "grad_norm": 0.19265873730182648, + "learning_rate": 5.8176128606281244e-05, + "loss": 1.9613, + "step": 306850 + }, + { + "epoch": 1.167984896812649, + "grad_norm": 0.19025906920433044, + "learning_rate": 5.8128978094285967e-05, + "loss": 1.9544, + "step": 306860 + }, + { + "epoch": 1.1680229592807716, + "grad_norm": 0.1604529321193695, + "learning_rate": 5.8081832613020246e-05, + "loss": 1.9609, + "step": 306870 + }, + { + "epoch": 1.1680610217488943, + "grad_norm": 0.1947263926267624, + "learning_rate": 5.8034692160874095e-05, + "loss": 1.9538, + "step": 306880 + }, + { + "epoch": 1.168099084217017, + "grad_norm": 0.18386346101760864, + "learning_rate": 5.798755673623851e-05, + "loss": 1.9615, + "step": 306890 + }, + { + "epoch": 1.1681371466851396, + "grad_norm": 0.1659373790025711, + "learning_rate": 5.7940426337505234e-05, + "loss": 1.9615, + "step": 306900 + }, + { + "epoch": 1.1681752091532625, + "grad_norm": 0.18153296411037445, + "learning_rate": 5.789330096306688e-05, + "loss": 1.962, + "step": 306910 + }, + { + "epoch": 1.168213271621385, + "grad_norm": 0.15782102942466736, + "learning_rate": 5.7846180611317136e-05, + "loss": 1.9554, + "step": 306920 + }, + { + "epoch": 1.1682513340895078, + "grad_norm": 0.235306054353714, + "learning_rate": 5.779906528065021e-05, + "loss": 1.9783, + "step": 306930 + }, + { + "epoch": 1.1682893965576304, + "grad_norm": 0.24597300589084625, + "learning_rate": 5.775195496946134e-05, + "loss": 1.9586, + "step": 306940 + }, + { + "epoch": 1.168327459025753, + "grad_norm": 0.18952709436416626, + "learning_rate": 5.77048496761467e-05, + "loss": 1.9446, + "step": 306950 + }, + { + "epoch": 1.1683655214938757, + "grad_norm": 0.1808987855911255, + "learning_rate": 5.765774939910312e-05, + "loss": 1.9617, + "step": 306960 + }, + { + "epoch": 1.1684035839619984, + "grad_norm": 0.21182473003864288, + "learning_rate": 5.761065413672839e-05, + "loss": 1.964, + "step": 306970 + }, + { + "epoch": 1.168441646430121, + "grad_norm": 0.19097618758678436, + "learning_rate": 5.756356388742123e-05, + "loss": 1.9718, + "step": 306980 + }, + { + "epoch": 1.1684797088982437, + "grad_norm": 0.19493243098258972, + "learning_rate": 5.751647864958104e-05, + "loss": 1.9593, + "step": 306990 + }, + { + "epoch": 1.1685177713663664, + "grad_norm": 0.2880631685256958, + "learning_rate": 5.74693984216082e-05, + "loss": 1.9523, + "step": 307000 + }, + { + "epoch": 1.1685558338344892, + "grad_norm": 0.21112750470638275, + "learning_rate": 5.7422323201903934e-05, + "loss": 1.974, + "step": 307010 + }, + { + "epoch": 1.168593896302612, + "grad_norm": 0.29016757011413574, + "learning_rate": 5.7375252988870195e-05, + "loss": 1.9517, + "step": 307020 + }, + { + "epoch": 1.1686319587707346, + "grad_norm": 0.1702297329902649, + "learning_rate": 5.7328187780909926e-05, + "loss": 1.9525, + "step": 307030 + }, + { + "epoch": 1.1686700212388572, + "grad_norm": 0.26800990104675293, + "learning_rate": 5.7281127576426897e-05, + "loss": 1.9656, + "step": 307040 + }, + { + "epoch": 1.1687080837069799, + "grad_norm": 0.1691807359457016, + "learning_rate": 5.7234072373825663e-05, + "loss": 1.9621, + "step": 307050 + }, + { + "epoch": 1.1687461461751025, + "grad_norm": 0.17145681381225586, + "learning_rate": 5.7187022171511615e-05, + "loss": 1.9594, + "step": 307060 + }, + { + "epoch": 1.1687842086432252, + "grad_norm": 0.18351706862449646, + "learning_rate": 5.713997696789114e-05, + "loss": 1.9485, + "step": 307070 + }, + { + "epoch": 1.168822271111348, + "grad_norm": 0.20628760755062103, + "learning_rate": 5.709293676137128e-05, + "loss": 1.9634, + "step": 307080 + }, + { + "epoch": 1.1688603335794707, + "grad_norm": 0.26568111777305603, + "learning_rate": 5.704590155036005e-05, + "loss": 1.9536, + "step": 307090 + }, + { + "epoch": 1.1688983960475934, + "grad_norm": 0.1946471780538559, + "learning_rate": 5.699887133326625e-05, + "loss": 1.9555, + "step": 307100 + }, + { + "epoch": 1.168936458515716, + "grad_norm": 0.1880960464477539, + "learning_rate": 5.695184610849957e-05, + "loss": 1.9573, + "step": 307110 + }, + { + "epoch": 1.1689745209838387, + "grad_norm": 0.20076966285705566, + "learning_rate": 5.690482587447049e-05, + "loss": 1.9584, + "step": 307120 + }, + { + "epoch": 1.1690125834519614, + "grad_norm": 0.21330180764198303, + "learning_rate": 5.685781062959039e-05, + "loss": 1.9517, + "step": 307130 + }, + { + "epoch": 1.169050645920084, + "grad_norm": 0.18850235641002655, + "learning_rate": 5.681080037227143e-05, + "loss": 1.9545, + "step": 307140 + }, + { + "epoch": 1.1690887083882067, + "grad_norm": 0.17607633769512177, + "learning_rate": 5.6763795100926666e-05, + "loss": 1.9583, + "step": 307150 + }, + { + "epoch": 1.1691267708563293, + "grad_norm": 0.16168035566806793, + "learning_rate": 5.671679481396996e-05, + "loss": 1.964, + "step": 307160 + }, + { + "epoch": 1.169164833324452, + "grad_norm": 0.16363447904586792, + "learning_rate": 5.66697995098161e-05, + "loss": 1.9628, + "step": 307170 + }, + { + "epoch": 1.1692028957925749, + "grad_norm": 0.20630410313606262, + "learning_rate": 5.662280918688056e-05, + "loss": 1.9481, + "step": 307180 + }, + { + "epoch": 1.1692409582606975, + "grad_norm": 0.18285779654979706, + "learning_rate": 5.657582384357979e-05, + "loss": 1.9669, + "step": 307190 + }, + { + "epoch": 1.1692790207288202, + "grad_norm": 0.20902836322784424, + "learning_rate": 5.652884347833098e-05, + "loss": 1.9534, + "step": 307200 + }, + { + "epoch": 1.1693170831969428, + "grad_norm": 0.214891254901886, + "learning_rate": 5.6481868089552244e-05, + "loss": 1.9623, + "step": 307210 + }, + { + "epoch": 1.1693551456650655, + "grad_norm": 0.20295549929141998, + "learning_rate": 5.643489767566251e-05, + "loss": 1.9576, + "step": 307220 + }, + { + "epoch": 1.1693932081331881, + "grad_norm": 0.22516366839408875, + "learning_rate": 5.638793223508148e-05, + "loss": 1.956, + "step": 307230 + }, + { + "epoch": 1.1694312706013108, + "grad_norm": 0.22936879098415375, + "learning_rate": 5.634097176622977e-05, + "loss": 1.9409, + "step": 307240 + }, + { + "epoch": 1.1694693330694335, + "grad_norm": 0.2069147825241089, + "learning_rate": 5.629401626752878e-05, + "loss": 1.9515, + "step": 307250 + }, + { + "epoch": 1.1695073955375563, + "grad_norm": 0.19395551085472107, + "learning_rate": 5.6247065737400846e-05, + "loss": 1.9583, + "step": 307260 + }, + { + "epoch": 1.169545458005679, + "grad_norm": 0.18069936335086823, + "learning_rate": 5.6200120174268896e-05, + "loss": 1.9577, + "step": 307270 + }, + { + "epoch": 1.1695835204738017, + "grad_norm": 0.17336305975914001, + "learning_rate": 5.615317957655708e-05, + "loss": 1.9519, + "step": 307280 + }, + { + "epoch": 1.1696215829419243, + "grad_norm": 0.179020956158638, + "learning_rate": 5.610624394268998e-05, + "loss": 1.9509, + "step": 307290 + }, + { + "epoch": 1.169659645410047, + "grad_norm": 0.1873224973678589, + "learning_rate": 5.605931327109326e-05, + "loss": 1.9532, + "step": 307300 + }, + { + "epoch": 1.1696977078781696, + "grad_norm": 0.18882738053798676, + "learning_rate": 5.601238756019328e-05, + "loss": 1.9581, + "step": 307310 + }, + { + "epoch": 1.1697357703462923, + "grad_norm": 0.233427032828331, + "learning_rate": 5.596546680841741e-05, + "loss": 1.9634, + "step": 307320 + }, + { + "epoch": 1.169773832814415, + "grad_norm": 0.17924568057060242, + "learning_rate": 5.591855101419363e-05, + "loss": 1.9435, + "step": 307330 + }, + { + "epoch": 1.1698118952825376, + "grad_norm": 0.21077688038349152, + "learning_rate": 5.587164017595092e-05, + "loss": 1.9613, + "step": 307340 + }, + { + "epoch": 1.1698499577506605, + "grad_norm": 0.16858820617198944, + "learning_rate": 5.582473429211904e-05, + "loss": 1.9592, + "step": 307350 + }, + { + "epoch": 1.1698880202187831, + "grad_norm": 0.23580597341060638, + "learning_rate": 5.577783336112846e-05, + "loss": 1.9504, + "step": 307360 + }, + { + "epoch": 1.1699260826869058, + "grad_norm": 0.2029874324798584, + "learning_rate": 5.5730937381410716e-05, + "loss": 1.948, + "step": 307370 + }, + { + "epoch": 1.1699641451550284, + "grad_norm": 0.16511982679367065, + "learning_rate": 5.568404635139801e-05, + "loss": 1.9501, + "step": 307380 + }, + { + "epoch": 1.170002207623151, + "grad_norm": 0.19717973470687866, + "learning_rate": 5.5637160269523255e-05, + "loss": 1.9478, + "step": 307390 + }, + { + "epoch": 1.1700402700912738, + "grad_norm": 0.1704729199409485, + "learning_rate": 5.559027913422055e-05, + "loss": 1.9724, + "step": 307400 + }, + { + "epoch": 1.1700783325593964, + "grad_norm": 0.19611027836799622, + "learning_rate": 5.554340294392452e-05, + "loss": 1.965, + "step": 307410 + }, + { + "epoch": 1.170116395027519, + "grad_norm": 0.21575294435024261, + "learning_rate": 5.549653169707064e-05, + "loss": 1.9538, + "step": 307420 + }, + { + "epoch": 1.170154457495642, + "grad_norm": 0.178140789270401, + "learning_rate": 5.5449665392095335e-05, + "loss": 1.9614, + "step": 307430 + }, + { + "epoch": 1.1701925199637646, + "grad_norm": 0.17497879266738892, + "learning_rate": 5.5402804027435806e-05, + "loss": 1.9578, + "step": 307440 + }, + { + "epoch": 1.1702305824318873, + "grad_norm": 0.25354599952697754, + "learning_rate": 5.535594760153001e-05, + "loss": 1.9485, + "step": 307450 + }, + { + "epoch": 1.17026864490001, + "grad_norm": 0.18424321711063385, + "learning_rate": 5.530909611281682e-05, + "loss": 1.9586, + "step": 307460 + }, + { + "epoch": 1.1703067073681326, + "grad_norm": 0.1908847540616989, + "learning_rate": 5.526224955973585e-05, + "loss": 1.9467, + "step": 307470 + }, + { + "epoch": 1.1703447698362552, + "grad_norm": 0.18996630609035492, + "learning_rate": 5.521540794072766e-05, + "loss": 1.9628, + "step": 307480 + }, + { + "epoch": 1.170382832304378, + "grad_norm": 0.2502085566520691, + "learning_rate": 5.5168571254233415e-05, + "loss": 1.9529, + "step": 307490 + }, + { + "epoch": 1.1704208947725006, + "grad_norm": 0.17994414269924164, + "learning_rate": 5.512173949869537e-05, + "loss": 1.9512, + "step": 307500 + }, + { + "epoch": 1.1704589572406232, + "grad_norm": 0.1656319797039032, + "learning_rate": 5.5074912672556374e-05, + "loss": 1.9551, + "step": 307510 + }, + { + "epoch": 1.170497019708746, + "grad_norm": 0.18443243205547333, + "learning_rate": 5.502809077426019e-05, + "loss": 1.9529, + "step": 307520 + }, + { + "epoch": 1.1705350821768687, + "grad_norm": 0.18963739275932312, + "learning_rate": 5.498127380225149e-05, + "loss": 1.9496, + "step": 307530 + }, + { + "epoch": 1.1705731446449914, + "grad_norm": 0.22023898363113403, + "learning_rate": 5.4934461754975517e-05, + "loss": 1.9699, + "step": 307540 + }, + { + "epoch": 1.170611207113114, + "grad_norm": 0.17199300229549408, + "learning_rate": 5.488765463087864e-05, + "loss": 1.9604, + "step": 307550 + }, + { + "epoch": 1.1706492695812367, + "grad_norm": 0.1808360069990158, + "learning_rate": 5.484085242840775e-05, + "loss": 1.9503, + "step": 307560 + }, + { + "epoch": 1.1706873320493594, + "grad_norm": 0.15934909880161285, + "learning_rate": 5.479405514601077e-05, + "loss": 1.9497, + "step": 307570 + }, + { + "epoch": 1.170725394517482, + "grad_norm": 0.17372657358646393, + "learning_rate": 5.4747262782136264e-05, + "loss": 1.9602, + "step": 307580 + }, + { + "epoch": 1.1707634569856047, + "grad_norm": 0.17633219063282013, + "learning_rate": 5.470047533523387e-05, + "loss": 1.9561, + "step": 307590 + }, + { + "epoch": 1.1708015194537276, + "grad_norm": 0.2467532753944397, + "learning_rate": 5.4653692803753765e-05, + "loss": 1.9538, + "step": 307600 + }, + { + "epoch": 1.1708395819218502, + "grad_norm": 0.19640734791755676, + "learning_rate": 5.460691518614708e-05, + "loss": 1.9673, + "step": 307610 + }, + { + "epoch": 1.1708776443899729, + "grad_norm": 0.20919203758239746, + "learning_rate": 5.4560142480865715e-05, + "loss": 1.9555, + "step": 307620 + }, + { + "epoch": 1.1709157068580955, + "grad_norm": 0.20885075628757477, + "learning_rate": 5.451337468636241e-05, + "loss": 1.9728, + "step": 307630 + }, + { + "epoch": 1.1709537693262182, + "grad_norm": 0.183942511677742, + "learning_rate": 5.4466611801090695e-05, + "loss": 1.9515, + "step": 307640 + }, + { + "epoch": 1.1709918317943409, + "grad_norm": 0.1857999712228775, + "learning_rate": 5.4419853823505e-05, + "loss": 1.9585, + "step": 307650 + }, + { + "epoch": 1.1710298942624635, + "grad_norm": 0.17464017868041992, + "learning_rate": 5.437310075206037e-05, + "loss": 1.9645, + "step": 307660 + }, + { + "epoch": 1.1710679567305862, + "grad_norm": 0.2077675610780716, + "learning_rate": 5.4326352585212845e-05, + "loss": 1.9431, + "step": 307670 + }, + { + "epoch": 1.1711060191987088, + "grad_norm": 0.16267137229442596, + "learning_rate": 5.4279609321419234e-05, + "loss": 1.9511, + "step": 307680 + }, + { + "epoch": 1.1711440816668315, + "grad_norm": 0.1626701056957245, + "learning_rate": 5.423287095913709e-05, + "loss": 1.9489, + "step": 307690 + }, + { + "epoch": 1.1711821441349544, + "grad_norm": 0.16942808032035828, + "learning_rate": 5.418613749682477e-05, + "loss": 1.9686, + "step": 307700 + }, + { + "epoch": 1.171220206603077, + "grad_norm": 0.2459169328212738, + "learning_rate": 5.413940893294161e-05, + "loss": 1.9626, + "step": 307710 + }, + { + "epoch": 1.1712582690711997, + "grad_norm": 0.175147145986557, + "learning_rate": 5.409268526594752e-05, + "loss": 1.9528, + "step": 307720 + }, + { + "epoch": 1.1712963315393223, + "grad_norm": 0.18634077906608582, + "learning_rate": 5.404596649430338e-05, + "loss": 1.9408, + "step": 307730 + }, + { + "epoch": 1.171334394007445, + "grad_norm": 0.2112835794687271, + "learning_rate": 5.399925261647076e-05, + "loss": 1.9431, + "step": 307740 + }, + { + "epoch": 1.1713724564755676, + "grad_norm": 0.21302802860736847, + "learning_rate": 5.395254363091218e-05, + "loss": 1.9582, + "step": 307750 + }, + { + "epoch": 1.1714105189436903, + "grad_norm": 0.18202129006385803, + "learning_rate": 5.390583953609074e-05, + "loss": 1.9528, + "step": 307760 + }, + { + "epoch": 1.1714485814118132, + "grad_norm": 0.17398393154144287, + "learning_rate": 5.385914033047068e-05, + "loss": 1.9627, + "step": 307770 + }, + { + "epoch": 1.1714866438799358, + "grad_norm": 0.16768161952495575, + "learning_rate": 5.3812446012516666e-05, + "loss": 1.9476, + "step": 307780 + }, + { + "epoch": 1.1715247063480585, + "grad_norm": 0.3693370819091797, + "learning_rate": 5.376575658069449e-05, + "loss": 1.9576, + "step": 307790 + }, + { + "epoch": 1.1715627688161812, + "grad_norm": 0.18076057732105255, + "learning_rate": 5.371907203347054e-05, + "loss": 1.9466, + "step": 307800 + }, + { + "epoch": 1.1716008312843038, + "grad_norm": 0.28043287992477417, + "learning_rate": 5.3672392369312094e-05, + "loss": 1.961, + "step": 307810 + }, + { + "epoch": 1.1716388937524265, + "grad_norm": 0.17701946198940277, + "learning_rate": 5.362571758668711e-05, + "loss": 1.9509, + "step": 307820 + }, + { + "epoch": 1.1716769562205491, + "grad_norm": 0.17819836735725403, + "learning_rate": 5.357904768406457e-05, + "loss": 1.9577, + "step": 307830 + }, + { + "epoch": 1.1717150186886718, + "grad_norm": 0.19525788724422455, + "learning_rate": 5.353238265991417e-05, + "loss": 1.9572, + "step": 307840 + }, + { + "epoch": 1.1717530811567944, + "grad_norm": 0.2147541493177414, + "learning_rate": 5.3485722512706226e-05, + "loss": 1.9744, + "step": 307850 + }, + { + "epoch": 1.171791143624917, + "grad_norm": 0.18522879481315613, + "learning_rate": 5.343906724091208e-05, + "loss": 1.938, + "step": 307860 + }, + { + "epoch": 1.17182920609304, + "grad_norm": 0.1954905092716217, + "learning_rate": 5.339241684300378e-05, + "loss": 1.9528, + "step": 307870 + }, + { + "epoch": 1.1718672685611626, + "grad_norm": 0.29039400815963745, + "learning_rate": 5.3345771317454105e-05, + "loss": 1.9531, + "step": 307880 + }, + { + "epoch": 1.1719053310292853, + "grad_norm": 0.23559370636940002, + "learning_rate": 5.329913066273684e-05, + "loss": 1.957, + "step": 307890 + }, + { + "epoch": 1.171943393497408, + "grad_norm": 0.1852443665266037, + "learning_rate": 5.325249487732631e-05, + "loss": 1.955, + "step": 307900 + }, + { + "epoch": 1.1719814559655306, + "grad_norm": 0.2008378654718399, + "learning_rate": 5.3205863959697894e-05, + "loss": 1.9596, + "step": 307910 + }, + { + "epoch": 1.1720195184336533, + "grad_norm": 0.17911696434020996, + "learning_rate": 5.3159237908327496e-05, + "loss": 1.9486, + "step": 307920 + }, + { + "epoch": 1.172057580901776, + "grad_norm": 0.18760210275650024, + "learning_rate": 5.311261672169204e-05, + "loss": 1.9505, + "step": 307930 + }, + { + "epoch": 1.1720956433698988, + "grad_norm": 0.1668512225151062, + "learning_rate": 5.306600039826909e-05, + "loss": 1.9575, + "step": 307940 + }, + { + "epoch": 1.1721337058380215, + "grad_norm": 0.17411920428276062, + "learning_rate": 5.3019388936537126e-05, + "loss": 1.9437, + "step": 307950 + }, + { + "epoch": 1.1721717683061441, + "grad_norm": 0.18603424727916718, + "learning_rate": 5.297278233497532e-05, + "loss": 1.97, + "step": 307960 + }, + { + "epoch": 1.1722098307742668, + "grad_norm": 0.1917838305234909, + "learning_rate": 5.2926180592063725e-05, + "loss": 1.9572, + "step": 307970 + }, + { + "epoch": 1.1722478932423894, + "grad_norm": 0.16847239434719086, + "learning_rate": 5.287958370628315e-05, + "loss": 1.9621, + "step": 307980 + }, + { + "epoch": 1.172285955710512, + "grad_norm": 0.1744316965341568, + "learning_rate": 5.2832991676115105e-05, + "loss": 1.9345, + "step": 307990 + }, + { + "epoch": 1.1723240181786347, + "grad_norm": 0.18387804925441742, + "learning_rate": 5.2786404500042076e-05, + "loss": 1.9495, + "step": 308000 + }, + { + "epoch": 1.1723620806467574, + "grad_norm": 0.18154680728912354, + "learning_rate": 5.273982217654716e-05, + "loss": 1.9527, + "step": 308010 + }, + { + "epoch": 1.17240014311488, + "grad_norm": 0.18909908831119537, + "learning_rate": 5.269324470411435e-05, + "loss": 1.9495, + "step": 308020 + }, + { + "epoch": 1.1724382055830027, + "grad_norm": 0.16833332180976868, + "learning_rate": 5.264667208122842e-05, + "loss": 1.9584, + "step": 308030 + }, + { + "epoch": 1.1724762680511256, + "grad_norm": 0.21265791356563568, + "learning_rate": 5.2600104306374855e-05, + "loss": 1.9436, + "step": 308040 + }, + { + "epoch": 1.1725143305192482, + "grad_norm": 0.21704767644405365, + "learning_rate": 5.255354137804002e-05, + "loss": 1.9588, + "step": 308050 + }, + { + "epoch": 1.172552392987371, + "grad_norm": 0.17958103120326996, + "learning_rate": 5.2506983294711044e-05, + "loss": 1.9394, + "step": 308060 + }, + { + "epoch": 1.1725904554554936, + "grad_norm": 0.15790031850337982, + "learning_rate": 5.246043005487583e-05, + "loss": 1.9654, + "step": 308070 + }, + { + "epoch": 1.1726285179236162, + "grad_norm": 0.1757218837738037, + "learning_rate": 5.24138816570231e-05, + "loss": 1.9596, + "step": 308080 + }, + { + "epoch": 1.1726665803917389, + "grad_norm": 0.1712648719549179, + "learning_rate": 5.2367338099642225e-05, + "loss": 1.9405, + "step": 308090 + }, + { + "epoch": 1.1727046428598615, + "grad_norm": 0.18772435188293457, + "learning_rate": 5.232079938122358e-05, + "loss": 1.9544, + "step": 308100 + }, + { + "epoch": 1.1727427053279842, + "grad_norm": 0.1672012358903885, + "learning_rate": 5.227426550025815e-05, + "loss": 1.9627, + "step": 308110 + }, + { + "epoch": 1.172780767796107, + "grad_norm": 0.17659921944141388, + "learning_rate": 5.222773645523776e-05, + "loss": 1.9638, + "step": 308120 + }, + { + "epoch": 1.1728188302642297, + "grad_norm": 0.18108125030994415, + "learning_rate": 5.218121224465505e-05, + "loss": 1.9514, + "step": 308130 + }, + { + "epoch": 1.1728568927323524, + "grad_norm": 0.18479257822036743, + "learning_rate": 5.2134692867003455e-05, + "loss": 1.9452, + "step": 308140 + }, + { + "epoch": 1.172894955200475, + "grad_norm": 0.19881558418273926, + "learning_rate": 5.2088178320777056e-05, + "loss": 1.9725, + "step": 308150 + }, + { + "epoch": 1.1729330176685977, + "grad_norm": 0.2701951265335083, + "learning_rate": 5.2041668604470906e-05, + "loss": 1.9486, + "step": 308160 + }, + { + "epoch": 1.1729710801367204, + "grad_norm": 0.16124291718006134, + "learning_rate": 5.19951637165807e-05, + "loss": 1.9452, + "step": 308170 + }, + { + "epoch": 1.173009142604843, + "grad_norm": 0.17446215450763702, + "learning_rate": 5.194866365560297e-05, + "loss": 1.9543, + "step": 308180 + }, + { + "epoch": 1.1730472050729657, + "grad_norm": 0.18593032658100128, + "learning_rate": 5.190216842003498e-05, + "loss": 1.9547, + "step": 308190 + }, + { + "epoch": 1.1730852675410883, + "grad_norm": 0.215800940990448, + "learning_rate": 5.185567800837493e-05, + "loss": 1.9664, + "step": 308200 + }, + { + "epoch": 1.1731233300092112, + "grad_norm": 0.16304409503936768, + "learning_rate": 5.1809192419121575e-05, + "loss": 1.963, + "step": 308210 + }, + { + "epoch": 1.1731613924773339, + "grad_norm": 0.18333649635314941, + "learning_rate": 5.176271165077456e-05, + "loss": 1.9536, + "step": 308220 + }, + { + "epoch": 1.1731994549454565, + "grad_norm": 0.1928211748600006, + "learning_rate": 5.171623570183437e-05, + "loss": 1.9775, + "step": 308230 + }, + { + "epoch": 1.1732375174135792, + "grad_norm": 0.22613413631916046, + "learning_rate": 5.166976457080208e-05, + "loss": 1.9617, + "step": 308240 + }, + { + "epoch": 1.1732755798817018, + "grad_norm": 0.1687794327735901, + "learning_rate": 5.162329825617984e-05, + "loss": 1.9512, + "step": 308250 + }, + { + "epoch": 1.1733136423498245, + "grad_norm": 0.16178713738918304, + "learning_rate": 5.157683675647023e-05, + "loss": 1.9437, + "step": 308260 + }, + { + "epoch": 1.1733517048179471, + "grad_norm": 0.2514328062534332, + "learning_rate": 5.1530380070176906e-05, + "loss": 1.9582, + "step": 308270 + }, + { + "epoch": 1.1733897672860698, + "grad_norm": 0.20378462970256805, + "learning_rate": 5.148392819580405e-05, + "loss": 1.962, + "step": 308280 + }, + { + "epoch": 1.1734278297541927, + "grad_norm": 0.19982340931892395, + "learning_rate": 5.143748113185681e-05, + "loss": 1.9499, + "step": 308290 + }, + { + "epoch": 1.1734658922223153, + "grad_norm": 0.19612081348896027, + "learning_rate": 5.139103887684099e-05, + "loss": 1.9618, + "step": 308300 + }, + { + "epoch": 1.173503954690438, + "grad_norm": 0.26761767268180847, + "learning_rate": 5.1344601429263225e-05, + "loss": 1.9622, + "step": 308310 + }, + { + "epoch": 1.1735420171585607, + "grad_norm": 0.34418386220932007, + "learning_rate": 5.129816878763094e-05, + "loss": 1.9521, + "step": 308320 + }, + { + "epoch": 1.1735800796266833, + "grad_norm": 0.19532065093517303, + "learning_rate": 5.125174095045226e-05, + "loss": 1.9598, + "step": 308330 + }, + { + "epoch": 1.173618142094806, + "grad_norm": 0.16967004537582397, + "learning_rate": 5.120531791623617e-05, + "loss": 1.9729, + "step": 308340 + }, + { + "epoch": 1.1736562045629286, + "grad_norm": 0.20884713530540466, + "learning_rate": 5.11588996834923e-05, + "loss": 1.9408, + "step": 308350 + }, + { + "epoch": 1.1736942670310513, + "grad_norm": 0.19751541316509247, + "learning_rate": 5.111248625073117e-05, + "loss": 1.9677, + "step": 308360 + }, + { + "epoch": 1.173732329499174, + "grad_norm": 0.2124282270669937, + "learning_rate": 5.106607761646403e-05, + "loss": 1.9533, + "step": 308370 + }, + { + "epoch": 1.1737703919672968, + "grad_norm": 0.16762572526931763, + "learning_rate": 5.101967377920297e-05, + "loss": 1.9353, + "step": 308380 + }, + { + "epoch": 1.1738084544354195, + "grad_norm": 0.1747930943965912, + "learning_rate": 5.097327473746066e-05, + "loss": 1.9539, + "step": 308390 + }, + { + "epoch": 1.1738465169035421, + "grad_norm": 0.274648517370224, + "learning_rate": 5.0926880489750697e-05, + "loss": 1.9545, + "step": 308400 + }, + { + "epoch": 1.1738845793716648, + "grad_norm": 0.16737380623817444, + "learning_rate": 5.0880491034587415e-05, + "loss": 1.9434, + "step": 308410 + }, + { + "epoch": 1.1739226418397875, + "grad_norm": 0.24273249506950378, + "learning_rate": 5.083410637048585e-05, + "loss": 1.9465, + "step": 308420 + }, + { + "epoch": 1.17396070430791, + "grad_norm": 0.22172129154205322, + "learning_rate": 5.0787726495961975e-05, + "loss": 1.9555, + "step": 308430 + }, + { + "epoch": 1.1739987667760328, + "grad_norm": 0.20090153813362122, + "learning_rate": 5.0741351409532286e-05, + "loss": 1.9495, + "step": 308440 + }, + { + "epoch": 1.1740368292441554, + "grad_norm": 0.2024862915277481, + "learning_rate": 5.069498110971427e-05, + "loss": 1.9506, + "step": 308450 + }, + { + "epoch": 1.1740748917122783, + "grad_norm": 0.19691678881645203, + "learning_rate": 5.064861559502604e-05, + "loss": 1.9438, + "step": 308460 + }, + { + "epoch": 1.174112954180401, + "grad_norm": 0.16210220754146576, + "learning_rate": 5.060225486398651e-05, + "loss": 1.9568, + "step": 308470 + }, + { + "epoch": 1.1741510166485236, + "grad_norm": 0.1802566796541214, + "learning_rate": 5.0555898915115365e-05, + "loss": 1.9371, + "step": 308480 + }, + { + "epoch": 1.1741890791166463, + "grad_norm": 0.1668899655342102, + "learning_rate": 5.050954774693306e-05, + "loss": 1.9509, + "step": 308490 + }, + { + "epoch": 1.174227141584769, + "grad_norm": 0.23647716641426086, + "learning_rate": 5.046320135796073e-05, + "loss": 1.9583, + "step": 308500 + }, + { + "epoch": 1.1742652040528916, + "grad_norm": 0.17502707242965698, + "learning_rate": 5.04168597467205e-05, + "loss": 1.9474, + "step": 308510 + }, + { + "epoch": 1.1743032665210142, + "grad_norm": 0.1797052025794983, + "learning_rate": 5.0370522911734936e-05, + "loss": 1.9501, + "step": 308520 + }, + { + "epoch": 1.174341328989137, + "grad_norm": 0.1689724177122116, + "learning_rate": 5.032419085152767e-05, + "loss": 1.9597, + "step": 308530 + }, + { + "epoch": 1.1743793914572596, + "grad_norm": 0.20890824496746063, + "learning_rate": 5.0277863564622826e-05, + "loss": 1.9407, + "step": 308540 + }, + { + "epoch": 1.1744174539253822, + "grad_norm": 0.17047648131847382, + "learning_rate": 5.0231541049545526e-05, + "loss": 1.9345, + "step": 308550 + }, + { + "epoch": 1.174455516393505, + "grad_norm": 0.19897052645683289, + "learning_rate": 5.018522330482145e-05, + "loss": 1.9541, + "step": 308560 + }, + { + "epoch": 1.1744935788616278, + "grad_norm": 0.18051151931285858, + "learning_rate": 5.013891032897722e-05, + "loss": 1.9606, + "step": 308570 + }, + { + "epoch": 1.1745316413297504, + "grad_norm": 0.1798298954963684, + "learning_rate": 5.009260212054007e-05, + "loss": 1.9582, + "step": 308580 + }, + { + "epoch": 1.174569703797873, + "grad_norm": 0.22572393715381622, + "learning_rate": 5.004629867803806e-05, + "loss": 1.9626, + "step": 308590 + }, + { + "epoch": 1.1746077662659957, + "grad_norm": 0.2541802227497101, + "learning_rate": 4.999999999999999e-05, + "loss": 1.9604, + "step": 308600 + }, + { + "epoch": 1.1746458287341184, + "grad_norm": 0.16402609646320343, + "learning_rate": 4.995370608495542e-05, + "loss": 1.9521, + "step": 308610 + }, + { + "epoch": 1.174683891202241, + "grad_norm": 0.19087038934230804, + "learning_rate": 4.9907416931434745e-05, + "loss": 1.9573, + "step": 308620 + }, + { + "epoch": 1.174721953670364, + "grad_norm": 0.18596763908863068, + "learning_rate": 4.9861132537968924e-05, + "loss": 1.9596, + "step": 308630 + }, + { + "epoch": 1.1747600161384866, + "grad_norm": 0.1944790929555893, + "learning_rate": 4.981485290308979e-05, + "loss": 1.9552, + "step": 308640 + }, + { + "epoch": 1.1747980786066092, + "grad_norm": 0.18553079664707184, + "learning_rate": 4.976857802533002e-05, + "loss": 1.9522, + "step": 308650 + }, + { + "epoch": 1.1748361410747319, + "grad_norm": 0.18692132830619812, + "learning_rate": 4.9722307903222906e-05, + "loss": 1.9618, + "step": 308660 + }, + { + "epoch": 1.1748742035428545, + "grad_norm": 0.1631677895784378, + "learning_rate": 4.96760425353025e-05, + "loss": 1.941, + "step": 308670 + }, + { + "epoch": 1.1749122660109772, + "grad_norm": 0.1892775148153305, + "learning_rate": 4.96297819201037e-05, + "loss": 1.9443, + "step": 308680 + }, + { + "epoch": 1.1749503284790999, + "grad_norm": 0.2765691876411438, + "learning_rate": 4.958352605616212e-05, + "loss": 1.9605, + "step": 308690 + }, + { + "epoch": 1.1749883909472225, + "grad_norm": 0.2970679998397827, + "learning_rate": 4.9537274942014045e-05, + "loss": 1.9544, + "step": 308700 + }, + { + "epoch": 1.1750264534153452, + "grad_norm": 0.16758288443088531, + "learning_rate": 4.949102857619658e-05, + "loss": 1.9628, + "step": 308710 + }, + { + "epoch": 1.1750645158834678, + "grad_norm": 0.1633490025997162, + "learning_rate": 4.944478695724758e-05, + "loss": 1.949, + "step": 308720 + }, + { + "epoch": 1.1751025783515907, + "grad_norm": 0.19200217723846436, + "learning_rate": 4.9398550083705695e-05, + "loss": 1.9588, + "step": 308730 + }, + { + "epoch": 1.1751406408197134, + "grad_norm": 0.18912263214588165, + "learning_rate": 4.935231795411027e-05, + "loss": 1.9559, + "step": 308740 + }, + { + "epoch": 1.175178703287836, + "grad_norm": 0.23634260892868042, + "learning_rate": 4.930609056700136e-05, + "loss": 1.9593, + "step": 308750 + }, + { + "epoch": 1.1752167657559587, + "grad_norm": 0.16199201345443726, + "learning_rate": 4.9259867920919796e-05, + "loss": 1.9608, + "step": 308760 + }, + { + "epoch": 1.1752548282240813, + "grad_norm": 0.19153495132923126, + "learning_rate": 4.9213650014407254e-05, + "loss": 1.9599, + "step": 308770 + }, + { + "epoch": 1.175292890692204, + "grad_norm": 0.20420241355895996, + "learning_rate": 4.9167436846005996e-05, + "loss": 1.9383, + "step": 308780 + }, + { + "epoch": 1.1753309531603267, + "grad_norm": 0.16810394823551178, + "learning_rate": 4.9121228414259145e-05, + "loss": 1.949, + "step": 308790 + }, + { + "epoch": 1.1753690156284495, + "grad_norm": 0.16946613788604736, + "learning_rate": 4.907502471771058e-05, + "loss": 1.9318, + "step": 308800 + }, + { + "epoch": 1.1754070780965722, + "grad_norm": 0.20083031058311462, + "learning_rate": 4.90288257549048e-05, + "loss": 1.9548, + "step": 308810 + }, + { + "epoch": 1.1754451405646948, + "grad_norm": 0.16079193353652954, + "learning_rate": 4.8982631524387255e-05, + "loss": 1.9533, + "step": 308820 + }, + { + "epoch": 1.1754832030328175, + "grad_norm": 0.21811576187610626, + "learning_rate": 4.8936442024703875e-05, + "loss": 1.9406, + "step": 308830 + }, + { + "epoch": 1.1755212655009402, + "grad_norm": 0.16867418587207794, + "learning_rate": 4.8890257254401607e-05, + "loss": 1.9484, + "step": 308840 + }, + { + "epoch": 1.1755593279690628, + "grad_norm": 0.1909358650445938, + "learning_rate": 4.8844077212027884e-05, + "loss": 1.9523, + "step": 308850 + }, + { + "epoch": 1.1755973904371855, + "grad_norm": 0.1731940060853958, + "learning_rate": 4.879790189613109e-05, + "loss": 1.9482, + "step": 308860 + }, + { + "epoch": 1.1756354529053081, + "grad_norm": 0.16265356540679932, + "learning_rate": 4.875173130526028e-05, + "loss": 1.9485, + "step": 308870 + }, + { + "epoch": 1.1756735153734308, + "grad_norm": 0.23693326115608215, + "learning_rate": 4.8705565437965224e-05, + "loss": 1.9544, + "step": 308880 + }, + { + "epoch": 1.1757115778415534, + "grad_norm": 0.18974530696868896, + "learning_rate": 4.865940429279647e-05, + "loss": 1.9717, + "step": 308890 + }, + { + "epoch": 1.1757496403096763, + "grad_norm": 0.22193016111850739, + "learning_rate": 4.861324786830529e-05, + "loss": 1.945, + "step": 308900 + }, + { + "epoch": 1.175787702777799, + "grad_norm": 0.19164617359638214, + "learning_rate": 4.8567096163043665e-05, + "loss": 1.9572, + "step": 308910 + }, + { + "epoch": 1.1758257652459216, + "grad_norm": 0.18589046597480774, + "learning_rate": 4.8520949175564376e-05, + "loss": 1.9483, + "step": 308920 + }, + { + "epoch": 1.1758638277140443, + "grad_norm": 0.18081121146678925, + "learning_rate": 4.847480690442091e-05, + "loss": 1.9562, + "step": 308930 + }, + { + "epoch": 1.175901890182167, + "grad_norm": 0.21181713044643402, + "learning_rate": 4.842866934816747e-05, + "loss": 1.9296, + "step": 308940 + }, + { + "epoch": 1.1759399526502896, + "grad_norm": 0.18270161747932434, + "learning_rate": 4.8382536505359065e-05, + "loss": 1.947, + "step": 308950 + }, + { + "epoch": 1.1759780151184123, + "grad_norm": 0.24431847035884857, + "learning_rate": 4.8336408374551446e-05, + "loss": 1.9363, + "step": 308960 + }, + { + "epoch": 1.176016077586535, + "grad_norm": 0.16476482152938843, + "learning_rate": 4.829028495430088e-05, + "loss": 1.9545, + "step": 308970 + }, + { + "epoch": 1.1760541400546578, + "grad_norm": 0.16913557052612305, + "learning_rate": 4.82441662431648e-05, + "loss": 1.9538, + "step": 308980 + }, + { + "epoch": 1.1760922025227805, + "grad_norm": 0.23285558819770813, + "learning_rate": 4.8198052239700916e-05, + "loss": 1.962, + "step": 308990 + }, + { + "epoch": 1.1761302649909031, + "grad_norm": 0.20442961156368256, + "learning_rate": 4.815194294246805e-05, + "loss": 1.9402, + "step": 309000 + }, + { + "epoch": 1.1761683274590258, + "grad_norm": 0.2592919170856476, + "learning_rate": 4.810583835002547e-05, + "loss": 1.9595, + "step": 309010 + }, + { + "epoch": 1.1762063899271484, + "grad_norm": 0.2001858800649643, + "learning_rate": 4.805973846093331e-05, + "loss": 1.9597, + "step": 309020 + }, + { + "epoch": 1.176244452395271, + "grad_norm": 0.2916547656059265, + "learning_rate": 4.801364327375246e-05, + "loss": 1.9454, + "step": 309030 + }, + { + "epoch": 1.1762825148633937, + "grad_norm": 0.20770905911922455, + "learning_rate": 4.796755278704456e-05, + "loss": 1.9377, + "step": 309040 + }, + { + "epoch": 1.1763205773315164, + "grad_norm": 0.17296180129051208, + "learning_rate": 4.792146699937189e-05, + "loss": 1.9505, + "step": 309050 + }, + { + "epoch": 1.176358639799639, + "grad_norm": 0.19396503269672394, + "learning_rate": 4.787538590929752e-05, + "loss": 1.9531, + "step": 309060 + }, + { + "epoch": 1.176396702267762, + "grad_norm": 0.17859074473381042, + "learning_rate": 4.782930951538516e-05, + "loss": 1.9766, + "step": 309070 + }, + { + "epoch": 1.1764347647358846, + "grad_norm": 0.20435374975204468, + "learning_rate": 4.778323781619948e-05, + "loss": 1.9643, + "step": 309080 + }, + { + "epoch": 1.1764728272040073, + "grad_norm": 0.20311976969242096, + "learning_rate": 4.77371708103056e-05, + "loss": 1.9517, + "step": 309090 + }, + { + "epoch": 1.17651088967213, + "grad_norm": 0.1713547557592392, + "learning_rate": 4.769110849626962e-05, + "loss": 1.9468, + "step": 309100 + }, + { + "epoch": 1.1765489521402526, + "grad_norm": 0.23550809919834137, + "learning_rate": 4.764505087265819e-05, + "loss": 1.9523, + "step": 309110 + }, + { + "epoch": 1.1765870146083752, + "grad_norm": 0.26078513264656067, + "learning_rate": 4.759899793803874e-05, + "loss": 1.9644, + "step": 309120 + }, + { + "epoch": 1.1766250770764979, + "grad_norm": 0.23422762751579285, + "learning_rate": 4.755294969097951e-05, + "loss": 1.9594, + "step": 309130 + }, + { + "epoch": 1.1766631395446205, + "grad_norm": 0.1606295108795166, + "learning_rate": 4.750690613004932e-05, + "loss": 1.9612, + "step": 309140 + }, + { + "epoch": 1.1767012020127434, + "grad_norm": 0.18316294252872467, + "learning_rate": 4.746086725381782e-05, + "loss": 1.9563, + "step": 309150 + }, + { + "epoch": 1.176739264480866, + "grad_norm": 0.17885041236877441, + "learning_rate": 4.741483306085548e-05, + "loss": 1.9468, + "step": 309160 + }, + { + "epoch": 1.1767773269489887, + "grad_norm": 0.18115143477916718, + "learning_rate": 4.736880354973322e-05, + "loss": 1.9709, + "step": 309170 + }, + { + "epoch": 1.1768153894171114, + "grad_norm": 0.21183575689792633, + "learning_rate": 4.7322778719022965e-05, + "loss": 1.9343, + "step": 309180 + }, + { + "epoch": 1.176853451885234, + "grad_norm": 0.16498693823814392, + "learning_rate": 4.7276758567297185e-05, + "loss": 1.9554, + "step": 309190 + }, + { + "epoch": 1.1768915143533567, + "grad_norm": 0.16676482558250427, + "learning_rate": 4.723074309312919e-05, + "loss": 1.9722, + "step": 309200 + }, + { + "epoch": 1.1769295768214794, + "grad_norm": 0.18812629580497742, + "learning_rate": 4.7184732295092906e-05, + "loss": 1.9486, + "step": 309210 + }, + { + "epoch": 1.176967639289602, + "grad_norm": 0.18773570656776428, + "learning_rate": 4.7138726171763124e-05, + "loss": 1.9511, + "step": 309220 + }, + { + "epoch": 1.1770057017577247, + "grad_norm": 0.1896013617515564, + "learning_rate": 4.709272472171527e-05, + "loss": 1.9579, + "step": 309230 + }, + { + "epoch": 1.1770437642258476, + "grad_norm": 0.23378810286521912, + "learning_rate": 4.7046727943525435e-05, + "loss": 1.9618, + "step": 309240 + }, + { + "epoch": 1.1770818266939702, + "grad_norm": 0.19067463278770447, + "learning_rate": 4.700073583577058e-05, + "loss": 1.9452, + "step": 309250 + }, + { + "epoch": 1.1771198891620929, + "grad_norm": 0.35932278633117676, + "learning_rate": 4.695474839702823e-05, + "loss": 1.9587, + "step": 309260 + }, + { + "epoch": 1.1771579516302155, + "grad_norm": 0.2065085619688034, + "learning_rate": 4.6908765625876756e-05, + "loss": 1.9413, + "step": 309270 + }, + { + "epoch": 1.1771960140983382, + "grad_norm": 0.1663128137588501, + "learning_rate": 4.6862787520895236e-05, + "loss": 1.9552, + "step": 309280 + }, + { + "epoch": 1.1772340765664608, + "grad_norm": 0.1832168996334076, + "learning_rate": 4.681681408066341e-05, + "loss": 1.9498, + "step": 309290 + }, + { + "epoch": 1.1772721390345835, + "grad_norm": 0.1732766032218933, + "learning_rate": 4.677084530376174e-05, + "loss": 1.9633, + "step": 309300 + }, + { + "epoch": 1.1773102015027062, + "grad_norm": 0.17328311502933502, + "learning_rate": 4.6724881188771496e-05, + "loss": 1.9516, + "step": 309310 + }, + { + "epoch": 1.177348263970829, + "grad_norm": 0.21596384048461914, + "learning_rate": 4.6678921734274514e-05, + "loss": 1.9495, + "step": 309320 + }, + { + "epoch": 1.1773863264389517, + "grad_norm": 0.16711944341659546, + "learning_rate": 4.663296693885355e-05, + "loss": 1.9413, + "step": 309330 + }, + { + "epoch": 1.1774243889070743, + "grad_norm": 0.18340061604976654, + "learning_rate": 4.6587016801091896e-05, + "loss": 1.9523, + "step": 309340 + }, + { + "epoch": 1.177462451375197, + "grad_norm": 0.25791123509407043, + "learning_rate": 4.654107131957363e-05, + "loss": 1.9586, + "step": 309350 + }, + { + "epoch": 1.1775005138433197, + "grad_norm": 0.2621697187423706, + "learning_rate": 4.649513049288362e-05, + "loss": 1.9548, + "step": 309360 + }, + { + "epoch": 1.1775385763114423, + "grad_norm": 0.2004096806049347, + "learning_rate": 4.6449194319607316e-05, + "loss": 1.9617, + "step": 309370 + }, + { + "epoch": 1.177576638779565, + "grad_norm": 0.20095586776733398, + "learning_rate": 4.640326279833096e-05, + "loss": 1.9636, + "step": 309380 + }, + { + "epoch": 1.1776147012476876, + "grad_norm": 0.16082298755645752, + "learning_rate": 4.635733592764152e-05, + "loss": 1.9451, + "step": 309390 + }, + { + "epoch": 1.1776527637158103, + "grad_norm": 0.17891064286231995, + "learning_rate": 4.631141370612668e-05, + "loss": 1.9619, + "step": 309400 + }, + { + "epoch": 1.177690826183933, + "grad_norm": 0.19131295382976532, + "learning_rate": 4.62654961323748e-05, + "loss": 1.9525, + "step": 309410 + }, + { + "epoch": 1.1777288886520558, + "grad_norm": 0.21167497336864471, + "learning_rate": 4.621958320497493e-05, + "loss": 1.931, + "step": 309420 + }, + { + "epoch": 1.1777669511201785, + "grad_norm": 0.15909339487552643, + "learning_rate": 4.617367492251695e-05, + "loss": 1.9569, + "step": 309430 + }, + { + "epoch": 1.1778050135883011, + "grad_norm": 0.1648852676153183, + "learning_rate": 4.61277712835913e-05, + "loss": 1.9531, + "step": 309440 + }, + { + "epoch": 1.1778430760564238, + "grad_norm": 0.17828480899333954, + "learning_rate": 4.608187228678928e-05, + "loss": 1.961, + "step": 309450 + }, + { + "epoch": 1.1778811385245465, + "grad_norm": 0.21379193663597107, + "learning_rate": 4.60359779307028e-05, + "loss": 1.9505, + "step": 309460 + }, + { + "epoch": 1.1779192009926691, + "grad_norm": 0.16374193131923676, + "learning_rate": 4.599008821392459e-05, + "loss": 1.9543, + "step": 309470 + }, + { + "epoch": 1.1779572634607918, + "grad_norm": 0.18641433119773865, + "learning_rate": 4.594420313504788e-05, + "loss": 1.9334, + "step": 309480 + }, + { + "epoch": 1.1779953259289146, + "grad_norm": 0.16827502846717834, + "learning_rate": 4.589832269266686e-05, + "loss": 1.9557, + "step": 309490 + }, + { + "epoch": 1.1780333883970373, + "grad_norm": 0.17855706810951233, + "learning_rate": 4.5852446885376266e-05, + "loss": 1.9442, + "step": 309500 + }, + { + "epoch": 1.17807145086516, + "grad_norm": 0.16724631190299988, + "learning_rate": 4.5806575711771606e-05, + "loss": 1.941, + "step": 309510 + }, + { + "epoch": 1.1781095133332826, + "grad_norm": 0.17224138975143433, + "learning_rate": 4.576070917044906e-05, + "loss": 1.9512, + "step": 309520 + }, + { + "epoch": 1.1781475758014053, + "grad_norm": 0.16810820996761322, + "learning_rate": 4.571484726000563e-05, + "loss": 1.946, + "step": 309530 + }, + { + "epoch": 1.178185638269528, + "grad_norm": 0.24897336959838867, + "learning_rate": 4.56689899790389e-05, + "loss": 1.9474, + "step": 309540 + }, + { + "epoch": 1.1782237007376506, + "grad_norm": 0.20070362091064453, + "learning_rate": 4.5623137326147144e-05, + "loss": 1.9635, + "step": 309550 + }, + { + "epoch": 1.1782617632057732, + "grad_norm": 0.1960253268480301, + "learning_rate": 4.557728929992949e-05, + "loss": 1.9475, + "step": 309560 + }, + { + "epoch": 1.178299825673896, + "grad_norm": 0.20311059057712555, + "learning_rate": 4.5531445898985556e-05, + "loss": 1.9517, + "step": 309570 + }, + { + "epoch": 1.1783378881420186, + "grad_norm": 0.19430354237556458, + "learning_rate": 4.548560712191596e-05, + "loss": 1.9538, + "step": 309580 + }, + { + "epoch": 1.1783759506101414, + "grad_norm": 0.24194493889808655, + "learning_rate": 4.543977296732177e-05, + "loss": 1.9608, + "step": 309590 + }, + { + "epoch": 1.178414013078264, + "grad_norm": 0.22881710529327393, + "learning_rate": 4.539394343380482e-05, + "loss": 1.9712, + "step": 309600 + }, + { + "epoch": 1.1784520755463868, + "grad_norm": 0.16793493926525116, + "learning_rate": 4.534811851996773e-05, + "loss": 1.9553, + "step": 309610 + }, + { + "epoch": 1.1784901380145094, + "grad_norm": 0.1682312786579132, + "learning_rate": 4.530229822441373e-05, + "loss": 1.9439, + "step": 309620 + }, + { + "epoch": 1.178528200482632, + "grad_norm": 0.2150745689868927, + "learning_rate": 4.5256482545746825e-05, + "loss": 1.9562, + "step": 309630 + }, + { + "epoch": 1.1785662629507547, + "grad_norm": 0.16091136634349823, + "learning_rate": 4.521067148257168e-05, + "loss": 1.9522, + "step": 309640 + }, + { + "epoch": 1.1786043254188774, + "grad_norm": 0.20136001706123352, + "learning_rate": 4.5164865033493705e-05, + "loss": 1.9651, + "step": 309650 + }, + { + "epoch": 1.1786423878870003, + "grad_norm": 0.21942678093910217, + "learning_rate": 4.511906319711895e-05, + "loss": 1.9523, + "step": 309660 + }, + { + "epoch": 1.178680450355123, + "grad_norm": 0.1780799776315689, + "learning_rate": 4.507326597205419e-05, + "loss": 1.9447, + "step": 309670 + }, + { + "epoch": 1.1787185128232456, + "grad_norm": 0.17522697150707245, + "learning_rate": 4.502747335690699e-05, + "loss": 1.9513, + "step": 309680 + }, + { + "epoch": 1.1787565752913682, + "grad_norm": 0.19580373167991638, + "learning_rate": 4.498168535028541e-05, + "loss": 1.9591, + "step": 309690 + }, + { + "epoch": 1.178794637759491, + "grad_norm": 0.16261278092861176, + "learning_rate": 4.493590195079844e-05, + "loss": 1.9678, + "step": 309700 + }, + { + "epoch": 1.1788327002276136, + "grad_norm": 0.17700830101966858, + "learning_rate": 4.48901231570556e-05, + "loss": 1.9639, + "step": 309710 + }, + { + "epoch": 1.1788707626957362, + "grad_norm": 0.16604109108448029, + "learning_rate": 4.4844348967667256e-05, + "loss": 1.9421, + "step": 309720 + }, + { + "epoch": 1.1789088251638589, + "grad_norm": 0.18127460777759552, + "learning_rate": 4.479857938124432e-05, + "loss": 1.955, + "step": 309730 + }, + { + "epoch": 1.1789468876319815, + "grad_norm": 0.17123474180698395, + "learning_rate": 4.47528143963985e-05, + "loss": 1.9516, + "step": 309740 + }, + { + "epoch": 1.1789849501001042, + "grad_norm": 0.21618498861789703, + "learning_rate": 4.470705401174213e-05, + "loss": 1.948, + "step": 309750 + }, + { + "epoch": 1.179023012568227, + "grad_norm": 0.24029800295829773, + "learning_rate": 4.466129822588838e-05, + "loss": 1.9471, + "step": 309760 + }, + { + "epoch": 1.1790610750363497, + "grad_norm": 0.1724097579717636, + "learning_rate": 4.461554703745102e-05, + "loss": 1.968, + "step": 309770 + }, + { + "epoch": 1.1790991375044724, + "grad_norm": 0.18432177603244781, + "learning_rate": 4.456980044504444e-05, + "loss": 1.9507, + "step": 309780 + }, + { + "epoch": 1.179137199972595, + "grad_norm": 0.22263087332248688, + "learning_rate": 4.45240584472838e-05, + "loss": 1.9441, + "step": 309790 + }, + { + "epoch": 1.1791752624407177, + "grad_norm": 0.19197730720043182, + "learning_rate": 4.447832104278504e-05, + "loss": 1.9578, + "step": 309800 + }, + { + "epoch": 1.1792133249088403, + "grad_norm": 0.17583264410495758, + "learning_rate": 4.4432588230164716e-05, + "loss": 1.9589, + "step": 309810 + }, + { + "epoch": 1.179251387376963, + "grad_norm": 0.20924118161201477, + "learning_rate": 4.438686000803999e-05, + "loss": 1.9488, + "step": 309820 + }, + { + "epoch": 1.1792894498450857, + "grad_norm": 0.17755398154258728, + "learning_rate": 4.43411363750289e-05, + "loss": 1.9513, + "step": 309830 + }, + { + "epoch": 1.1793275123132085, + "grad_norm": 0.16246619820594788, + "learning_rate": 4.4295417329750065e-05, + "loss": 1.9466, + "step": 309840 + }, + { + "epoch": 1.1793655747813312, + "grad_norm": 0.16991542279720306, + "learning_rate": 4.424970287082275e-05, + "loss": 1.9484, + "step": 309850 + }, + { + "epoch": 1.1794036372494539, + "grad_norm": 0.19759690761566162, + "learning_rate": 4.420399299686706e-05, + "loss": 1.9442, + "step": 309860 + }, + { + "epoch": 1.1794416997175765, + "grad_norm": 0.17393594980239868, + "learning_rate": 4.4158287706503654e-05, + "loss": 1.9552, + "step": 309870 + }, + { + "epoch": 1.1794797621856992, + "grad_norm": 0.19019247591495514, + "learning_rate": 4.411258699835402e-05, + "loss": 1.9498, + "step": 309880 + }, + { + "epoch": 1.1795178246538218, + "grad_norm": 0.16410709917545319, + "learning_rate": 4.4066890871040154e-05, + "loss": 1.9563, + "step": 309890 + }, + { + "epoch": 1.1795558871219445, + "grad_norm": 0.25346609950065613, + "learning_rate": 4.4021199323184935e-05, + "loss": 1.9568, + "step": 309900 + }, + { + "epoch": 1.1795939495900671, + "grad_norm": 0.17545926570892334, + "learning_rate": 4.3975512353411736e-05, + "loss": 1.9559, + "step": 309910 + }, + { + "epoch": 1.1796320120581898, + "grad_norm": 0.16444379091262817, + "learning_rate": 4.3929829960344834e-05, + "loss": 1.9442, + "step": 309920 + }, + { + "epoch": 1.1796700745263127, + "grad_norm": 0.21553288400173187, + "learning_rate": 4.388415214260899e-05, + "loss": 1.9604, + "step": 309930 + }, + { + "epoch": 1.1797081369944353, + "grad_norm": 0.19073207676410675, + "learning_rate": 4.38384788988298e-05, + "loss": 1.9669, + "step": 309940 + }, + { + "epoch": 1.179746199462558, + "grad_norm": 0.19403015077114105, + "learning_rate": 4.379281022763354e-05, + "loss": 1.9532, + "step": 309950 + }, + { + "epoch": 1.1797842619306806, + "grad_norm": 0.20546835660934448, + "learning_rate": 4.374714612764702e-05, + "loss": 1.9527, + "step": 309960 + }, + { + "epoch": 1.1798223243988033, + "grad_norm": 0.17869624495506287, + "learning_rate": 4.370148659749795e-05, + "loss": 1.9535, + "step": 309970 + }, + { + "epoch": 1.179860386866926, + "grad_norm": 0.2714109718799591, + "learning_rate": 4.3655831635814604e-05, + "loss": 1.9581, + "step": 309980 + }, + { + "epoch": 1.1798984493350486, + "grad_norm": 0.1904449611902237, + "learning_rate": 4.36101812412259e-05, + "loss": 1.9384, + "step": 309990 + }, + { + "epoch": 1.1799365118031713, + "grad_norm": 0.2278692126274109, + "learning_rate": 4.356453541236155e-05, + "loss": 1.9505, + "step": 310000 + }, + { + "epoch": 1.1799745742712942, + "grad_norm": 0.2904459834098816, + "learning_rate": 4.351889414785193e-05, + "loss": 1.9501, + "step": 310010 + }, + { + "epoch": 1.1800126367394168, + "grad_norm": 0.16368386149406433, + "learning_rate": 4.347325744632807e-05, + "loss": 1.9456, + "step": 310020 + }, + { + "epoch": 1.1800506992075395, + "grad_norm": 0.19859318435192108, + "learning_rate": 4.3427625306421635e-05, + "loss": 1.9482, + "step": 310030 + }, + { + "epoch": 1.1800887616756621, + "grad_norm": 0.254438579082489, + "learning_rate": 4.338199772676504e-05, + "loss": 1.9418, + "step": 310040 + }, + { + "epoch": 1.1801268241437848, + "grad_norm": 0.16224907338619232, + "learning_rate": 4.333637470599144e-05, + "loss": 1.9517, + "step": 310050 + }, + { + "epoch": 1.1801648866119074, + "grad_norm": 0.18530027568340302, + "learning_rate": 4.329075624273454e-05, + "loss": 1.947, + "step": 310060 + }, + { + "epoch": 1.18020294908003, + "grad_norm": 0.18948470056056976, + "learning_rate": 4.324514233562876e-05, + "loss": 1.939, + "step": 310070 + }, + { + "epoch": 1.1802410115481528, + "grad_norm": 0.17915013432502747, + "learning_rate": 4.319953298330937e-05, + "loss": 1.9549, + "step": 310080 + }, + { + "epoch": 1.1802790740162754, + "grad_norm": 0.17633748054504395, + "learning_rate": 4.315392818441205e-05, + "loss": 1.9545, + "step": 310090 + }, + { + "epoch": 1.1803171364843983, + "grad_norm": 0.1659269481897354, + "learning_rate": 4.310832793757335e-05, + "loss": 1.9408, + "step": 310100 + }, + { + "epoch": 1.180355198952521, + "grad_norm": 0.20943781733512878, + "learning_rate": 4.306273224143048e-05, + "loss": 1.9491, + "step": 310110 + }, + { + "epoch": 1.1803932614206436, + "grad_norm": 0.200185164809227, + "learning_rate": 4.301714109462118e-05, + "loss": 1.95, + "step": 310120 + }, + { + "epoch": 1.1804313238887663, + "grad_norm": 0.19608174264431, + "learning_rate": 4.297155449578416e-05, + "loss": 1.9572, + "step": 310130 + }, + { + "epoch": 1.180469386356889, + "grad_norm": 0.1803928017616272, + "learning_rate": 4.2925972443558444e-05, + "loss": 1.955, + "step": 310140 + }, + { + "epoch": 1.1805074488250116, + "grad_norm": 0.19274504482746124, + "learning_rate": 4.2880394936584076e-05, + "loss": 1.9401, + "step": 310150 + }, + { + "epoch": 1.1805455112931342, + "grad_norm": 0.18078354001045227, + "learning_rate": 4.283482197350158e-05, + "loss": 1.9405, + "step": 310160 + }, + { + "epoch": 1.1805835737612569, + "grad_norm": 0.1925884485244751, + "learning_rate": 4.278925355295216e-05, + "loss": 1.939, + "step": 310170 + }, + { + "epoch": 1.1806216362293798, + "grad_norm": 0.301413893699646, + "learning_rate": 4.2743689673577776e-05, + "loss": 1.9515, + "step": 310180 + }, + { + "epoch": 1.1806596986975024, + "grad_norm": 0.2065173089504242, + "learning_rate": 4.269813033402103e-05, + "loss": 1.949, + "step": 310190 + }, + { + "epoch": 1.180697761165625, + "grad_norm": 0.17483308911323547, + "learning_rate": 4.265257553292523e-05, + "loss": 1.9521, + "step": 310200 + }, + { + "epoch": 1.1807358236337477, + "grad_norm": 0.17191389203071594, + "learning_rate": 4.2607025268934284e-05, + "loss": 1.9715, + "step": 310210 + }, + { + "epoch": 1.1807738861018704, + "grad_norm": 0.21063725650310516, + "learning_rate": 4.2561479540692835e-05, + "loss": 1.9425, + "step": 310220 + }, + { + "epoch": 1.180811948569993, + "grad_norm": 0.19254496693611145, + "learning_rate": 4.25159383468462e-05, + "loss": 1.9571, + "step": 310230 + }, + { + "epoch": 1.1808500110381157, + "grad_norm": 0.17268456518650055, + "learning_rate": 4.247040168604027e-05, + "loss": 1.9453, + "step": 310240 + }, + { + "epoch": 1.1808880735062384, + "grad_norm": 0.17198698222637177, + "learning_rate": 4.242486955692182e-05, + "loss": 1.9469, + "step": 310250 + }, + { + "epoch": 1.180926135974361, + "grad_norm": 0.24686148762702942, + "learning_rate": 4.237934195813814e-05, + "loss": 1.9493, + "step": 310260 + }, + { + "epoch": 1.1809641984424837, + "grad_norm": 0.16179026663303375, + "learning_rate": 4.233381888833721e-05, + "loss": 1.9527, + "step": 310270 + }, + { + "epoch": 1.1810022609106066, + "grad_norm": 0.20546956360340118, + "learning_rate": 4.228830034616771e-05, + "loss": 1.9437, + "step": 310280 + }, + { + "epoch": 1.1810403233787292, + "grad_norm": 0.18097162246704102, + "learning_rate": 4.2242786330278946e-05, + "loss": 1.9525, + "step": 310290 + }, + { + "epoch": 1.1810783858468519, + "grad_norm": 0.19285722076892853, + "learning_rate": 4.2197276839320954e-05, + "loss": 1.9466, + "step": 310300 + }, + { + "epoch": 1.1811164483149745, + "grad_norm": 0.18017497658729553, + "learning_rate": 4.215177187194446e-05, + "loss": 1.9542, + "step": 310310 + }, + { + "epoch": 1.1811545107830972, + "grad_norm": 0.1642260104417801, + "learning_rate": 4.210627142680073e-05, + "loss": 1.943, + "step": 310320 + }, + { + "epoch": 1.1811925732512198, + "grad_norm": 0.19965514540672302, + "learning_rate": 4.206077550254189e-05, + "loss": 1.9523, + "step": 310330 + }, + { + "epoch": 1.1812306357193425, + "grad_norm": 0.16829894483089447, + "learning_rate": 4.201528409782057e-05, + "loss": 1.9378, + "step": 310340 + }, + { + "epoch": 1.1812686981874654, + "grad_norm": 0.21287326514720917, + "learning_rate": 4.1969797211290184e-05, + "loss": 1.9502, + "step": 310350 + }, + { + "epoch": 1.181306760655588, + "grad_norm": 0.18998737633228302, + "learning_rate": 4.192431484160464e-05, + "loss": 1.9561, + "step": 310360 + }, + { + "epoch": 1.1813448231237107, + "grad_norm": 0.32120609283447266, + "learning_rate": 4.187883698741879e-05, + "loss": 1.964, + "step": 310370 + }, + { + "epoch": 1.1813828855918334, + "grad_norm": 0.19673456251621246, + "learning_rate": 4.1833363647387936e-05, + "loss": 1.9589, + "step": 310380 + }, + { + "epoch": 1.181420948059956, + "grad_norm": 0.26058077812194824, + "learning_rate": 4.1787894820168095e-05, + "loss": 1.9454, + "step": 310390 + }, + { + "epoch": 1.1814590105280787, + "grad_norm": 0.20470654964447021, + "learning_rate": 4.1742430504416005e-05, + "loss": 1.9355, + "step": 310400 + }, + { + "epoch": 1.1814970729962013, + "grad_norm": 0.1853208690881729, + "learning_rate": 4.1696970698789024e-05, + "loss": 1.9549, + "step": 310410 + }, + { + "epoch": 1.181535135464324, + "grad_norm": 0.2150898277759552, + "learning_rate": 4.165151540194512e-05, + "loss": 1.9366, + "step": 310420 + }, + { + "epoch": 1.1815731979324466, + "grad_norm": 0.16874505579471588, + "learning_rate": 4.160606461254313e-05, + "loss": 1.9421, + "step": 310430 + }, + { + "epoch": 1.1816112604005693, + "grad_norm": 0.29402124881744385, + "learning_rate": 4.1560618329242295e-05, + "loss": 1.9467, + "step": 310440 + }, + { + "epoch": 1.1816493228686922, + "grad_norm": 0.17367145419120789, + "learning_rate": 4.151517655070269e-05, + "loss": 1.9415, + "step": 310450 + }, + { + "epoch": 1.1816873853368148, + "grad_norm": 0.23331019282341003, + "learning_rate": 4.1469739275585e-05, + "loss": 1.9572, + "step": 310460 + }, + { + "epoch": 1.1817254478049375, + "grad_norm": 0.19143813848495483, + "learning_rate": 4.142430650255058e-05, + "loss": 1.9506, + "step": 310470 + }, + { + "epoch": 1.1817635102730601, + "grad_norm": 0.18256507813930511, + "learning_rate": 4.1378878230261486e-05, + "loss": 1.9535, + "step": 310480 + }, + { + "epoch": 1.1818015727411828, + "grad_norm": 0.17464636266231537, + "learning_rate": 4.133345445738035e-05, + "loss": 1.9631, + "step": 310490 + }, + { + "epoch": 1.1818396352093055, + "grad_norm": 0.23930378258228302, + "learning_rate": 4.1288035182570516e-05, + "loss": 1.955, + "step": 310500 + }, + { + "epoch": 1.1818776976774281, + "grad_norm": 0.251952201128006, + "learning_rate": 4.1242620404496e-05, + "loss": 1.9445, + "step": 310510 + }, + { + "epoch": 1.181915760145551, + "grad_norm": 0.17035020887851715, + "learning_rate": 4.119721012182154e-05, + "loss": 1.954, + "step": 310520 + }, + { + "epoch": 1.1819538226136737, + "grad_norm": 0.17202967405319214, + "learning_rate": 4.115180433321236e-05, + "loss": 1.9375, + "step": 310530 + }, + { + "epoch": 1.1819918850817963, + "grad_norm": 0.17133724689483643, + "learning_rate": 4.1106403037334425e-05, + "loss": 1.9619, + "step": 310540 + }, + { + "epoch": 1.182029947549919, + "grad_norm": 0.21714834868907928, + "learning_rate": 4.106100623285452e-05, + "loss": 1.9432, + "step": 310550 + }, + { + "epoch": 1.1820680100180416, + "grad_norm": 0.2697466015815735, + "learning_rate": 4.1015613918439834e-05, + "loss": 1.9442, + "step": 310560 + }, + { + "epoch": 1.1821060724861643, + "grad_norm": 0.22082401812076569, + "learning_rate": 4.097022609275841e-05, + "loss": 1.9457, + "step": 310570 + }, + { + "epoch": 1.182144134954287, + "grad_norm": 0.1688980758190155, + "learning_rate": 4.092484275447883e-05, + "loss": 1.9488, + "step": 310580 + }, + { + "epoch": 1.1821821974224096, + "grad_norm": 0.17043808102607727, + "learning_rate": 4.087946390227038e-05, + "loss": 1.9441, + "step": 310590 + }, + { + "epoch": 1.1822202598905323, + "grad_norm": 0.20406900346279144, + "learning_rate": 4.083408953480294e-05, + "loss": 1.9499, + "step": 310600 + }, + { + "epoch": 1.182258322358655, + "grad_norm": 0.16481229662895203, + "learning_rate": 4.078871965074726e-05, + "loss": 1.9488, + "step": 310610 + }, + { + "epoch": 1.1822963848267778, + "grad_norm": 0.15880829095840454, + "learning_rate": 4.074335424877446e-05, + "loss": 1.9451, + "step": 310620 + }, + { + "epoch": 1.1823344472949004, + "grad_norm": 0.22673694789409637, + "learning_rate": 4.069799332755653e-05, + "loss": 1.9517, + "step": 310630 + }, + { + "epoch": 1.182372509763023, + "grad_norm": 0.18625514209270477, + "learning_rate": 4.0652636885765924e-05, + "loss": 1.9469, + "step": 310640 + }, + { + "epoch": 1.1824105722311458, + "grad_norm": 0.16471987962722778, + "learning_rate": 4.060728492207599e-05, + "loss": 1.946, + "step": 310650 + }, + { + "epoch": 1.1824486346992684, + "grad_norm": 0.20095032453536987, + "learning_rate": 4.056193743516051e-05, + "loss": 1.9425, + "step": 310660 + }, + { + "epoch": 1.182486697167391, + "grad_norm": 0.17171847820281982, + "learning_rate": 4.0516594423694094e-05, + "loss": 1.9447, + "step": 310670 + }, + { + "epoch": 1.1825247596355137, + "grad_norm": 0.17001047730445862, + "learning_rate": 4.047125588635192e-05, + "loss": 1.9569, + "step": 310680 + }, + { + "epoch": 1.1825628221036366, + "grad_norm": 0.21695543825626373, + "learning_rate": 4.042592182180976e-05, + "loss": 1.9419, + "step": 310690 + }, + { + "epoch": 1.1826008845717593, + "grad_norm": 0.2746710777282715, + "learning_rate": 4.038059222874413e-05, + "loss": 1.9388, + "step": 310700 + }, + { + "epoch": 1.182638947039882, + "grad_norm": 0.16558623313903809, + "learning_rate": 4.033526710583218e-05, + "loss": 1.9475, + "step": 310710 + }, + { + "epoch": 1.1826770095080046, + "grad_norm": 0.18636375665664673, + "learning_rate": 4.028994645175166e-05, + "loss": 1.9405, + "step": 310720 + }, + { + "epoch": 1.1827150719761272, + "grad_norm": 0.1795215904712677, + "learning_rate": 4.0244630265181106e-05, + "loss": 1.9454, + "step": 310730 + }, + { + "epoch": 1.18275313444425, + "grad_norm": 0.17834648489952087, + "learning_rate": 4.019931854479958e-05, + "loss": 1.9597, + "step": 310740 + }, + { + "epoch": 1.1827911969123726, + "grad_norm": 0.20142875611782074, + "learning_rate": 4.015401128928686e-05, + "loss": 1.9508, + "step": 310750 + }, + { + "epoch": 1.1828292593804952, + "grad_norm": 0.320974737405777, + "learning_rate": 4.010870849732323e-05, + "loss": 1.947, + "step": 310760 + }, + { + "epoch": 1.1828673218486179, + "grad_norm": 0.19955416023731232, + "learning_rate": 4.006341016758991e-05, + "loss": 1.9422, + "step": 310770 + }, + { + "epoch": 1.1829053843167405, + "grad_norm": 0.2023458629846573, + "learning_rate": 4.0018116298768395e-05, + "loss": 1.9383, + "step": 310780 + }, + { + "epoch": 1.1829434467848634, + "grad_norm": 0.1666051596403122, + "learning_rate": 3.997282688954124e-05, + "loss": 1.9527, + "step": 310790 + }, + { + "epoch": 1.182981509252986, + "grad_norm": 0.19157986342906952, + "learning_rate": 3.992754193859127e-05, + "loss": 1.9395, + "step": 310800 + }, + { + "epoch": 1.1830195717211087, + "grad_norm": 0.2026226967573166, + "learning_rate": 3.988226144460228e-05, + "loss": 1.9464, + "step": 310810 + }, + { + "epoch": 1.1830576341892314, + "grad_norm": 0.2299056202173233, + "learning_rate": 3.9836985406258466e-05, + "loss": 1.9761, + "step": 310820 + }, + { + "epoch": 1.183095696657354, + "grad_norm": 0.19825097918510437, + "learning_rate": 3.979171382224478e-05, + "loss": 1.9366, + "step": 310830 + }, + { + "epoch": 1.1831337591254767, + "grad_norm": 0.20664182305335999, + "learning_rate": 3.974644669124677e-05, + "loss": 1.9402, + "step": 310840 + }, + { + "epoch": 1.1831718215935993, + "grad_norm": 0.1697346717119217, + "learning_rate": 3.9701184011950776e-05, + "loss": 1.9308, + "step": 310850 + }, + { + "epoch": 1.183209884061722, + "grad_norm": 0.20206105709075928, + "learning_rate": 3.965592578304361e-05, + "loss": 1.9618, + "step": 310860 + }, + { + "epoch": 1.1832479465298449, + "grad_norm": 0.17904360592365265, + "learning_rate": 3.9610672003212776e-05, + "loss": 1.9319, + "step": 310870 + }, + { + "epoch": 1.1832860089979675, + "grad_norm": 0.1792462319135666, + "learning_rate": 3.956542267114649e-05, + "loss": 1.958, + "step": 310880 + }, + { + "epoch": 1.1833240714660902, + "grad_norm": 0.1927379071712494, + "learning_rate": 3.9520177785533516e-05, + "loss": 1.9403, + "step": 310890 + }, + { + "epoch": 1.1833621339342129, + "grad_norm": 0.3455449342727661, + "learning_rate": 3.947493734506335e-05, + "loss": 1.9403, + "step": 310900 + }, + { + "epoch": 1.1834001964023355, + "grad_norm": 0.2741151750087738, + "learning_rate": 3.942970134842605e-05, + "loss": 1.956, + "step": 310910 + }, + { + "epoch": 1.1834382588704582, + "grad_norm": 0.1895100325345993, + "learning_rate": 3.938446979431243e-05, + "loss": 1.9479, + "step": 310920 + }, + { + "epoch": 1.1834763213385808, + "grad_norm": 0.2061176300048828, + "learning_rate": 3.933924268141387e-05, + "loss": 1.9514, + "step": 310930 + }, + { + "epoch": 1.1835143838067035, + "grad_norm": 0.21165236830711365, + "learning_rate": 3.92940200084223e-05, + "loss": 1.9597, + "step": 310940 + }, + { + "epoch": 1.1835524462748261, + "grad_norm": 0.1790468394756317, + "learning_rate": 3.924880177403051e-05, + "loss": 1.9593, + "step": 310950 + }, + { + "epoch": 1.183590508742949, + "grad_norm": 0.17601050436496735, + "learning_rate": 3.920358797693169e-05, + "loss": 1.9462, + "step": 310960 + }, + { + "epoch": 1.1836285712110717, + "grad_norm": 0.17719541490077972, + "learning_rate": 3.915837861581989e-05, + "loss": 1.9449, + "step": 310970 + }, + { + "epoch": 1.1836666336791943, + "grad_norm": 0.17760923504829407, + "learning_rate": 3.911317368938972e-05, + "loss": 1.9537, + "step": 310980 + }, + { + "epoch": 1.183704696147317, + "grad_norm": 0.22732368111610413, + "learning_rate": 3.9067973196336314e-05, + "loss": 1.956, + "step": 310990 + }, + { + "epoch": 1.1837427586154396, + "grad_norm": 0.23127682507038116, + "learning_rate": 3.902277713535563e-05, + "loss": 1.9452, + "step": 311000 + }, + { + "epoch": 1.1837808210835623, + "grad_norm": 0.21646471321582794, + "learning_rate": 3.897758550514419e-05, + "loss": 1.9532, + "step": 311010 + }, + { + "epoch": 1.183818883551685, + "grad_norm": 0.1624293029308319, + "learning_rate": 3.8932398304399045e-05, + "loss": 1.9427, + "step": 311020 + }, + { + "epoch": 1.1838569460198076, + "grad_norm": 0.1677110195159912, + "learning_rate": 3.888721553181806e-05, + "loss": 1.9521, + "step": 311030 + }, + { + "epoch": 1.1838950084879305, + "grad_norm": 0.17499026656150818, + "learning_rate": 3.884203718609969e-05, + "loss": 1.9535, + "step": 311040 + }, + { + "epoch": 1.1839330709560532, + "grad_norm": 0.2630564272403717, + "learning_rate": 3.879686326594295e-05, + "loss": 1.9484, + "step": 311050 + }, + { + "epoch": 1.1839711334241758, + "grad_norm": 0.18801864981651306, + "learning_rate": 3.875169377004756e-05, + "loss": 1.9652, + "step": 311060 + }, + { + "epoch": 1.1840091958922985, + "grad_norm": 0.16058778762817383, + "learning_rate": 3.870652869711383e-05, + "loss": 1.956, + "step": 311070 + }, + { + "epoch": 1.1840472583604211, + "grad_norm": 0.2152181714773178, + "learning_rate": 3.8661368045842746e-05, + "loss": 1.955, + "step": 311080 + }, + { + "epoch": 1.1840853208285438, + "grad_norm": 0.20467229187488556, + "learning_rate": 3.861621181493602e-05, + "loss": 1.9561, + "step": 311090 + }, + { + "epoch": 1.1841233832966664, + "grad_norm": 0.24097484350204468, + "learning_rate": 3.8571060003095734e-05, + "loss": 1.9561, + "step": 311100 + }, + { + "epoch": 1.184161445764789, + "grad_norm": 0.19084009528160095, + "learning_rate": 3.8525912609024884e-05, + "loss": 1.9436, + "step": 311110 + }, + { + "epoch": 1.1841995082329118, + "grad_norm": 0.18321532011032104, + "learning_rate": 3.848076963142694e-05, + "loss": 1.9416, + "step": 311120 + }, + { + "epoch": 1.1842375707010344, + "grad_norm": 0.20860257744789124, + "learning_rate": 3.843563106900611e-05, + "loss": 1.9656, + "step": 311130 + }, + { + "epoch": 1.1842756331691573, + "grad_norm": 0.16122521460056305, + "learning_rate": 3.839049692046703e-05, + "loss": 1.9442, + "step": 311140 + }, + { + "epoch": 1.18431369563728, + "grad_norm": 0.1690884232521057, + "learning_rate": 3.834536718451531e-05, + "loss": 1.9653, + "step": 311150 + }, + { + "epoch": 1.1843517581054026, + "grad_norm": 0.17913898825645447, + "learning_rate": 3.8300241859856865e-05, + "loss": 1.9435, + "step": 311160 + }, + { + "epoch": 1.1843898205735253, + "grad_norm": 0.22345957159996033, + "learning_rate": 3.825512094519845e-05, + "loss": 1.9274, + "step": 311170 + }, + { + "epoch": 1.184427883041648, + "grad_norm": 0.1835993528366089, + "learning_rate": 3.8210004439247325e-05, + "loss": 1.9451, + "step": 311180 + }, + { + "epoch": 1.1844659455097706, + "grad_norm": 0.2462897002696991, + "learning_rate": 3.8164892340711464e-05, + "loss": 1.9447, + "step": 311190 + }, + { + "epoch": 1.1845040079778932, + "grad_norm": 0.2044731229543686, + "learning_rate": 3.811978464829935e-05, + "loss": 1.9594, + "step": 311200 + }, + { + "epoch": 1.1845420704460161, + "grad_norm": 0.19596067070960999, + "learning_rate": 3.8074681360720396e-05, + "loss": 1.9486, + "step": 311210 + }, + { + "epoch": 1.1845801329141388, + "grad_norm": 0.16007938981056213, + "learning_rate": 3.8029582476684254e-05, + "loss": 1.9511, + "step": 311220 + }, + { + "epoch": 1.1846181953822614, + "grad_norm": 0.17175820469856262, + "learning_rate": 3.798448799490151e-05, + "loss": 1.945, + "step": 311230 + }, + { + "epoch": 1.184656257850384, + "grad_norm": 0.1731996387243271, + "learning_rate": 3.793939791408313e-05, + "loss": 1.9533, + "step": 311240 + }, + { + "epoch": 1.1846943203185067, + "grad_norm": 0.1798241287469864, + "learning_rate": 3.7894312232940986e-05, + "loss": 1.9383, + "step": 311250 + }, + { + "epoch": 1.1847323827866294, + "grad_norm": 0.22271887958049774, + "learning_rate": 3.7849230950187275e-05, + "loss": 1.9644, + "step": 311260 + }, + { + "epoch": 1.184770445254752, + "grad_norm": 0.2261335551738739, + "learning_rate": 3.780415406453508e-05, + "loss": 1.9441, + "step": 311270 + }, + { + "epoch": 1.1848085077228747, + "grad_norm": 0.20029690861701965, + "learning_rate": 3.775908157469804e-05, + "loss": 1.9342, + "step": 311280 + }, + { + "epoch": 1.1848465701909974, + "grad_norm": 0.23057448863983154, + "learning_rate": 3.77140134793903e-05, + "loss": 1.9427, + "step": 311290 + }, + { + "epoch": 1.18488463265912, + "grad_norm": 0.1756334900856018, + "learning_rate": 3.7668949777326766e-05, + "loss": 1.932, + "step": 311300 + }, + { + "epoch": 1.184922695127243, + "grad_norm": 0.18955238163471222, + "learning_rate": 3.762389046722292e-05, + "loss": 1.9363, + "step": 311310 + }, + { + "epoch": 1.1849607575953656, + "grad_norm": 0.18324987590312958, + "learning_rate": 3.75788355477949e-05, + "loss": 1.9385, + "step": 311320 + }, + { + "epoch": 1.1849988200634882, + "grad_norm": 0.3313455283641815, + "learning_rate": 3.753378501775939e-05, + "loss": 1.9337, + "step": 311330 + }, + { + "epoch": 1.1850368825316109, + "grad_norm": 0.16478504240512848, + "learning_rate": 3.748873887583376e-05, + "loss": 1.939, + "step": 311340 + }, + { + "epoch": 1.1850749449997335, + "grad_norm": 0.19854886829853058, + "learning_rate": 3.7443697120736084e-05, + "loss": 1.9477, + "step": 311350 + }, + { + "epoch": 1.1851130074678562, + "grad_norm": 0.1801048219203949, + "learning_rate": 3.7398659751184894e-05, + "loss": 1.9489, + "step": 311360 + }, + { + "epoch": 1.1851510699359789, + "grad_norm": 0.1807958483695984, + "learning_rate": 3.735362676589948e-05, + "loss": 1.9418, + "step": 311370 + }, + { + "epoch": 1.1851891324041017, + "grad_norm": 0.1839410960674286, + "learning_rate": 3.730859816359961e-05, + "loss": 1.9475, + "step": 311380 + }, + { + "epoch": 1.1852271948722244, + "grad_norm": 0.1806168407201767, + "learning_rate": 3.726357394300583e-05, + "loss": 1.955, + "step": 311390 + }, + { + "epoch": 1.185265257340347, + "grad_norm": 0.25393858551979065, + "learning_rate": 3.721855410283931e-05, + "loss": 1.9487, + "step": 311400 + }, + { + "epoch": 1.1853033198084697, + "grad_norm": 0.17133808135986328, + "learning_rate": 3.7173538641821655e-05, + "loss": 1.9371, + "step": 311410 + }, + { + "epoch": 1.1853413822765924, + "grad_norm": 0.19476234912872314, + "learning_rate": 3.7128527558675294e-05, + "loss": 1.9417, + "step": 311420 + }, + { + "epoch": 1.185379444744715, + "grad_norm": 0.17833714187145233, + "learning_rate": 3.708352085212319e-05, + "loss": 1.9461, + "step": 311430 + }, + { + "epoch": 1.1854175072128377, + "grad_norm": 0.17181991040706635, + "learning_rate": 3.703851852088891e-05, + "loss": 1.955, + "step": 311440 + }, + { + "epoch": 1.1854555696809603, + "grad_norm": 0.19017015397548676, + "learning_rate": 3.699352056369665e-05, + "loss": 1.9537, + "step": 311450 + }, + { + "epoch": 1.185493632149083, + "grad_norm": 0.1896897703409195, + "learning_rate": 3.6948526979271255e-05, + "loss": 1.956, + "step": 311460 + }, + { + "epoch": 1.1855316946172056, + "grad_norm": 0.283040851354599, + "learning_rate": 3.69035377663382e-05, + "loss": 1.9381, + "step": 311470 + }, + { + "epoch": 1.1855697570853285, + "grad_norm": 0.17958037555217743, + "learning_rate": 3.685855292362356e-05, + "loss": 1.9513, + "step": 311480 + }, + { + "epoch": 1.1856078195534512, + "grad_norm": 0.23295582830905914, + "learning_rate": 3.681357244985401e-05, + "loss": 1.9404, + "step": 311490 + }, + { + "epoch": 1.1856458820215738, + "grad_norm": 0.17441517114639282, + "learning_rate": 3.676859634375679e-05, + "loss": 1.9539, + "step": 311500 + }, + { + "epoch": 1.1856839444896965, + "grad_norm": 0.26736241579055786, + "learning_rate": 3.672362460405992e-05, + "loss": 1.9524, + "step": 311510 + }, + { + "epoch": 1.1857220069578192, + "grad_norm": 0.1684333086013794, + "learning_rate": 3.6678657229491875e-05, + "loss": 1.9404, + "step": 311520 + }, + { + "epoch": 1.1857600694259418, + "grad_norm": 0.22045107185840607, + "learning_rate": 3.663369421878188e-05, + "loss": 1.9431, + "step": 311530 + }, + { + "epoch": 1.1857981318940645, + "grad_norm": 0.16330958902835846, + "learning_rate": 3.658873557065967e-05, + "loss": 1.9505, + "step": 311540 + }, + { + "epoch": 1.1858361943621873, + "grad_norm": 0.16610394418239594, + "learning_rate": 3.65437812838556e-05, + "loss": 1.9432, + "step": 311550 + }, + { + "epoch": 1.18587425683031, + "grad_norm": 0.17661607265472412, + "learning_rate": 3.649883135710075e-05, + "loss": 1.9375, + "step": 311560 + }, + { + "epoch": 1.1859123192984327, + "grad_norm": 0.18251265585422516, + "learning_rate": 3.645388578912667e-05, + "loss": 1.9434, + "step": 311570 + }, + { + "epoch": 1.1859503817665553, + "grad_norm": 0.21383360028266907, + "learning_rate": 3.640894457866567e-05, + "loss": 1.9609, + "step": 311580 + }, + { + "epoch": 1.185988444234678, + "grad_norm": 0.21023496985435486, + "learning_rate": 3.636400772445059e-05, + "loss": 1.9483, + "step": 311590 + }, + { + "epoch": 1.1860265067028006, + "grad_norm": 0.18044514954090118, + "learning_rate": 3.6319075225214827e-05, + "loss": 1.934, + "step": 311600 + }, + { + "epoch": 1.1860645691709233, + "grad_norm": 0.18245013058185577, + "learning_rate": 3.627414707969251e-05, + "loss": 1.938, + "step": 311610 + }, + { + "epoch": 1.186102631639046, + "grad_norm": 0.18348990380764008, + "learning_rate": 3.6229223286618316e-05, + "loss": 1.9436, + "step": 311620 + }, + { + "epoch": 1.1861406941071686, + "grad_norm": 0.18171392381191254, + "learning_rate": 3.618430384472754e-05, + "loss": 1.9557, + "step": 311630 + }, + { + "epoch": 1.1861787565752913, + "grad_norm": 0.16605229675769806, + "learning_rate": 3.613938875275618e-05, + "loss": 1.9441, + "step": 311640 + }, + { + "epoch": 1.1862168190434141, + "grad_norm": 0.19410014152526855, + "learning_rate": 3.6094478009440654e-05, + "loss": 1.96, + "step": 311650 + }, + { + "epoch": 1.1862548815115368, + "grad_norm": 0.18071909248828888, + "learning_rate": 3.604957161351818e-05, + "loss": 1.9384, + "step": 311660 + }, + { + "epoch": 1.1862929439796595, + "grad_norm": 0.28662827610969543, + "learning_rate": 3.60046695637265e-05, + "loss": 1.94, + "step": 311670 + }, + { + "epoch": 1.186331006447782, + "grad_norm": 0.19265218079090118, + "learning_rate": 3.5959771858803895e-05, + "loss": 1.9385, + "step": 311680 + }, + { + "epoch": 1.1863690689159048, + "grad_norm": 0.22032389044761658, + "learning_rate": 3.591487849748942e-05, + "loss": 1.9678, + "step": 311690 + }, + { + "epoch": 1.1864071313840274, + "grad_norm": 0.184286430478096, + "learning_rate": 3.5869989478522655e-05, + "loss": 1.941, + "step": 311700 + }, + { + "epoch": 1.18644519385215, + "grad_norm": 0.18217714130878448, + "learning_rate": 3.582510480064377e-05, + "loss": 1.9475, + "step": 311710 + }, + { + "epoch": 1.1864832563202727, + "grad_norm": 0.18673953413963318, + "learning_rate": 3.57802244625936e-05, + "loss": 1.9584, + "step": 311720 + }, + { + "epoch": 1.1865213187883956, + "grad_norm": 0.23944932222366333, + "learning_rate": 3.573534846311349e-05, + "loss": 1.956, + "step": 311730 + }, + { + "epoch": 1.1865593812565183, + "grad_norm": 0.18649595975875854, + "learning_rate": 3.5690476800945506e-05, + "loss": 1.9469, + "step": 311740 + }, + { + "epoch": 1.186597443724641, + "grad_norm": 0.20941564440727234, + "learning_rate": 3.564560947483225e-05, + "loss": 1.943, + "step": 311750 + }, + { + "epoch": 1.1866355061927636, + "grad_norm": 0.21465104818344116, + "learning_rate": 3.5600746483516966e-05, + "loss": 1.9346, + "step": 311760 + }, + { + "epoch": 1.1866735686608862, + "grad_norm": 0.2321460098028183, + "learning_rate": 3.555588782574354e-05, + "loss": 1.9361, + "step": 311770 + }, + { + "epoch": 1.186711631129009, + "grad_norm": 0.20696842670440674, + "learning_rate": 3.5511033500256306e-05, + "loss": 1.9359, + "step": 311780 + }, + { + "epoch": 1.1867496935971316, + "grad_norm": 0.18398690223693848, + "learning_rate": 3.546618350580044e-05, + "loss": 1.9403, + "step": 311790 + }, + { + "epoch": 1.1867877560652542, + "grad_norm": 0.2288619577884674, + "learning_rate": 3.542133784112156e-05, + "loss": 1.9522, + "step": 311800 + }, + { + "epoch": 1.1868258185333769, + "grad_norm": 0.2083914875984192, + "learning_rate": 3.5376496504965825e-05, + "loss": 1.9457, + "step": 311810 + }, + { + "epoch": 1.1868638810014998, + "grad_norm": 0.252368301153183, + "learning_rate": 3.53316594960803e-05, + "loss": 1.9451, + "step": 311820 + }, + { + "epoch": 1.1869019434696224, + "grad_norm": 0.18113096058368683, + "learning_rate": 3.5286826813212324e-05, + "loss": 1.9409, + "step": 311830 + }, + { + "epoch": 1.186940005937745, + "grad_norm": 0.16996903717517853, + "learning_rate": 3.524199845511e-05, + "loss": 1.9494, + "step": 311840 + }, + { + "epoch": 1.1869780684058677, + "grad_norm": 0.16921067237854004, + "learning_rate": 3.519717442052201e-05, + "loss": 1.9507, + "step": 311850 + }, + { + "epoch": 1.1870161308739904, + "grad_norm": 0.19410954415798187, + "learning_rate": 3.5152354708197606e-05, + "loss": 1.9522, + "step": 311860 + }, + { + "epoch": 1.187054193342113, + "grad_norm": 0.17984320223331451, + "learning_rate": 3.510753931688676e-05, + "loss": 1.9501, + "step": 311870 + }, + { + "epoch": 1.1870922558102357, + "grad_norm": 0.17801634967327118, + "learning_rate": 3.5062728245339904e-05, + "loss": 1.9544, + "step": 311880 + }, + { + "epoch": 1.1871303182783584, + "grad_norm": 0.16607418656349182, + "learning_rate": 3.5017921492308146e-05, + "loss": 1.9503, + "step": 311890 + }, + { + "epoch": 1.1871683807464812, + "grad_norm": 0.24475719034671783, + "learning_rate": 3.4973119056543153e-05, + "loss": 1.9596, + "step": 311900 + }, + { + "epoch": 1.187206443214604, + "grad_norm": 0.23632270097732544, + "learning_rate": 3.492832093679727e-05, + "loss": 1.9537, + "step": 311910 + }, + { + "epoch": 1.1872445056827265, + "grad_norm": 0.17317236959934235, + "learning_rate": 3.488352713182336e-05, + "loss": 1.9299, + "step": 311920 + }, + { + "epoch": 1.1872825681508492, + "grad_norm": 0.20168110728263855, + "learning_rate": 3.4838737640374876e-05, + "loss": 1.955, + "step": 311930 + }, + { + "epoch": 1.1873206306189719, + "grad_norm": 0.20480570197105408, + "learning_rate": 3.479395246120598e-05, + "loss": 1.9501, + "step": 311940 + }, + { + "epoch": 1.1873586930870945, + "grad_norm": 0.17610032856464386, + "learning_rate": 3.474917159307139e-05, + "loss": 1.9372, + "step": 311950 + }, + { + "epoch": 1.1873967555552172, + "grad_norm": 0.18562458455562592, + "learning_rate": 3.470439503472633e-05, + "loss": 1.9624, + "step": 311960 + }, + { + "epoch": 1.1874348180233398, + "grad_norm": 0.18058809638023376, + "learning_rate": 3.465962278492674e-05, + "loss": 1.9325, + "step": 311970 + }, + { + "epoch": 1.1874728804914625, + "grad_norm": 0.20649303495883942, + "learning_rate": 3.46148548424291e-05, + "loss": 1.9451, + "step": 311980 + }, + { + "epoch": 1.1875109429595851, + "grad_norm": 0.19297026097774506, + "learning_rate": 3.457009120599047e-05, + "loss": 1.9423, + "step": 311990 + }, + { + "epoch": 1.187549005427708, + "grad_norm": 0.21698220074176788, + "learning_rate": 3.452533187436863e-05, + "loss": 1.9408, + "step": 312000 + }, + { + "epoch": 1.1875870678958307, + "grad_norm": 0.16666904091835022, + "learning_rate": 3.4480576846321776e-05, + "loss": 1.947, + "step": 312010 + }, + { + "epoch": 1.1876251303639533, + "grad_norm": 0.17687274515628815, + "learning_rate": 3.44358261206088e-05, + "loss": 1.9673, + "step": 312020 + }, + { + "epoch": 1.187663192832076, + "grad_norm": 0.16840630769729614, + "learning_rate": 3.4391079695989205e-05, + "loss": 1.9574, + "step": 312030 + }, + { + "epoch": 1.1877012553001987, + "grad_norm": 0.18446031212806702, + "learning_rate": 3.4346337571223076e-05, + "loss": 1.9394, + "step": 312040 + }, + { + "epoch": 1.1877393177683213, + "grad_norm": 0.16199266910552979, + "learning_rate": 3.430159974507102e-05, + "loss": 1.9382, + "step": 312050 + }, + { + "epoch": 1.187777380236444, + "grad_norm": 0.1947520673274994, + "learning_rate": 3.4256866216294424e-05, + "loss": 1.9438, + "step": 312060 + }, + { + "epoch": 1.1878154427045668, + "grad_norm": 0.17160986363887787, + "learning_rate": 3.4212136983655054e-05, + "loss": 1.9554, + "step": 312070 + }, + { + "epoch": 1.1878535051726895, + "grad_norm": 0.165254607796669, + "learning_rate": 3.416741204591539e-05, + "loss": 1.947, + "step": 312080 + }, + { + "epoch": 1.1878915676408122, + "grad_norm": 0.1807015836238861, + "learning_rate": 3.4122691401838434e-05, + "loss": 1.946, + "step": 312090 + }, + { + "epoch": 1.1879296301089348, + "grad_norm": 0.18120074272155762, + "learning_rate": 3.4077975050187894e-05, + "loss": 1.9529, + "step": 312100 + }, + { + "epoch": 1.1879676925770575, + "grad_norm": 0.16498395800590515, + "learning_rate": 3.4033262989727974e-05, + "loss": 1.9464, + "step": 312110 + }, + { + "epoch": 1.1880057550451801, + "grad_norm": 0.16716605424880981, + "learning_rate": 3.398855521922356e-05, + "loss": 1.9524, + "step": 312120 + }, + { + "epoch": 1.1880438175133028, + "grad_norm": 0.19660872220993042, + "learning_rate": 3.394385173743997e-05, + "loss": 1.9343, + "step": 312130 + }, + { + "epoch": 1.1880818799814254, + "grad_norm": 0.17590086162090302, + "learning_rate": 3.3899152543143294e-05, + "loss": 1.9378, + "step": 312140 + }, + { + "epoch": 1.188119942449548, + "grad_norm": 0.23901164531707764, + "learning_rate": 3.385445763510014e-05, + "loss": 1.9423, + "step": 312150 + }, + { + "epoch": 1.1881580049176708, + "grad_norm": 0.18615446984767914, + "learning_rate": 3.3809767012077666e-05, + "loss": 1.9621, + "step": 312160 + }, + { + "epoch": 1.1881960673857936, + "grad_norm": 0.22386965155601501, + "learning_rate": 3.3765080672843615e-05, + "loss": 1.9405, + "step": 312170 + }, + { + "epoch": 1.1882341298539163, + "grad_norm": 0.16738446056842804, + "learning_rate": 3.372039861616649e-05, + "loss": 1.9445, + "step": 312180 + }, + { + "epoch": 1.188272192322039, + "grad_norm": 0.17396967113018036, + "learning_rate": 3.367572084081516e-05, + "loss": 1.9474, + "step": 312190 + }, + { + "epoch": 1.1883102547901616, + "grad_norm": 0.24288448691368103, + "learning_rate": 3.3631047345559273e-05, + "loss": 1.9487, + "step": 312200 + }, + { + "epoch": 1.1883483172582843, + "grad_norm": 0.18360836803913116, + "learning_rate": 3.358637812916887e-05, + "loss": 1.932, + "step": 312210 + }, + { + "epoch": 1.188386379726407, + "grad_norm": 0.24498017132282257, + "learning_rate": 3.3541713190414714e-05, + "loss": 1.9589, + "step": 312220 + }, + { + "epoch": 1.1884244421945296, + "grad_norm": 0.25882506370544434, + "learning_rate": 3.349705252806812e-05, + "loss": 1.9496, + "step": 312230 + }, + { + "epoch": 1.1884625046626525, + "grad_norm": 0.2343091368675232, + "learning_rate": 3.345239614090112e-05, + "loss": 1.929, + "step": 312240 + }, + { + "epoch": 1.1885005671307751, + "grad_norm": 0.16979889571666718, + "learning_rate": 3.340774402768604e-05, + "loss": 1.9386, + "step": 312250 + }, + { + "epoch": 1.1885386295988978, + "grad_norm": 0.2081635296344757, + "learning_rate": 3.336309618719607e-05, + "loss": 1.9534, + "step": 312260 + }, + { + "epoch": 1.1885766920670204, + "grad_norm": 0.1663932502269745, + "learning_rate": 3.331845261820493e-05, + "loss": 1.9548, + "step": 312270 + }, + { + "epoch": 1.188614754535143, + "grad_norm": 0.2160760760307312, + "learning_rate": 3.3273813319486755e-05, + "loss": 1.9401, + "step": 312280 + }, + { + "epoch": 1.1886528170032657, + "grad_norm": 0.1771797239780426, + "learning_rate": 3.322917828981642e-05, + "loss": 1.9519, + "step": 312290 + }, + { + "epoch": 1.1886908794713884, + "grad_norm": 0.2006005495786667, + "learning_rate": 3.31845475279694e-05, + "loss": 1.9413, + "step": 312300 + }, + { + "epoch": 1.188728941939511, + "grad_norm": 0.18026748299598694, + "learning_rate": 3.313992103272168e-05, + "loss": 1.9469, + "step": 312310 + }, + { + "epoch": 1.1887670044076337, + "grad_norm": 0.1619405448436737, + "learning_rate": 3.3095298802849895e-05, + "loss": 1.9395, + "step": 312320 + }, + { + "epoch": 1.1888050668757564, + "grad_norm": 0.1686805933713913, + "learning_rate": 3.305068083713125e-05, + "loss": 1.9428, + "step": 312330 + }, + { + "epoch": 1.1888431293438793, + "grad_norm": 0.24637804925441742, + "learning_rate": 3.3006067134343456e-05, + "loss": 1.9311, + "step": 312340 + }, + { + "epoch": 1.188881191812002, + "grad_norm": 0.1797938495874405, + "learning_rate": 3.2961457693264865e-05, + "loss": 1.9514, + "step": 312350 + }, + { + "epoch": 1.1889192542801246, + "grad_norm": 0.18646450340747833, + "learning_rate": 3.291685251267451e-05, + "loss": 1.9462, + "step": 312360 + }, + { + "epoch": 1.1889573167482472, + "grad_norm": 0.1743801236152649, + "learning_rate": 3.287225159135182e-05, + "loss": 1.9308, + "step": 312370 + }, + { + "epoch": 1.1889953792163699, + "grad_norm": 0.17725461721420288, + "learning_rate": 3.2827654928076875e-05, + "loss": 1.9493, + "step": 312380 + }, + { + "epoch": 1.1890334416844925, + "grad_norm": 0.17935842275619507, + "learning_rate": 3.278306252163049e-05, + "loss": 1.9426, + "step": 312390 + }, + { + "epoch": 1.1890715041526152, + "grad_norm": 0.16997292637825012, + "learning_rate": 3.27384743707938e-05, + "loss": 1.9395, + "step": 312400 + }, + { + "epoch": 1.189109566620738, + "grad_norm": 0.17147625982761383, + "learning_rate": 3.269389047434868e-05, + "loss": 1.9554, + "step": 312410 + }, + { + "epoch": 1.1891476290888607, + "grad_norm": 0.1671450138092041, + "learning_rate": 3.264931083107764e-05, + "loss": 1.9463, + "step": 312420 + }, + { + "epoch": 1.1891856915569834, + "grad_norm": 0.18187834322452545, + "learning_rate": 3.260473543976367e-05, + "loss": 1.9425, + "step": 312430 + }, + { + "epoch": 1.189223754025106, + "grad_norm": 0.17444400489330292, + "learning_rate": 3.256016429919029e-05, + "loss": 1.965, + "step": 312440 + }, + { + "epoch": 1.1892618164932287, + "grad_norm": 0.20525328814983368, + "learning_rate": 3.25155974081417e-05, + "loss": 1.9463, + "step": 312450 + }, + { + "epoch": 1.1892998789613514, + "grad_norm": 0.21237628161907196, + "learning_rate": 3.247103476540264e-05, + "loss": 1.9546, + "step": 312460 + }, + { + "epoch": 1.189337941429474, + "grad_norm": 0.2117680311203003, + "learning_rate": 3.2426476369758484e-05, + "loss": 1.9507, + "step": 312470 + }, + { + "epoch": 1.1893760038975967, + "grad_norm": 0.1771915704011917, + "learning_rate": 3.238192221999514e-05, + "loss": 1.926, + "step": 312480 + }, + { + "epoch": 1.1894140663657193, + "grad_norm": 0.21037092804908752, + "learning_rate": 3.2337372314899084e-05, + "loss": 1.9375, + "step": 312490 + }, + { + "epoch": 1.189452128833842, + "grad_norm": 0.23058432340621948, + "learning_rate": 3.2292826653257336e-05, + "loss": 1.9567, + "step": 312500 + }, + { + "epoch": 1.1894901913019649, + "grad_norm": 0.23512980341911316, + "learning_rate": 3.224828523385759e-05, + "loss": 1.9493, + "step": 312510 + }, + { + "epoch": 1.1895282537700875, + "grad_norm": 0.26605743169784546, + "learning_rate": 3.220374805548803e-05, + "loss": 1.9509, + "step": 312520 + }, + { + "epoch": 1.1895663162382102, + "grad_norm": 0.1657746434211731, + "learning_rate": 3.215921511693748e-05, + "loss": 1.9511, + "step": 312530 + }, + { + "epoch": 1.1896043787063328, + "grad_norm": 0.20481301844120026, + "learning_rate": 3.2114686416995276e-05, + "loss": 1.9573, + "step": 312540 + }, + { + "epoch": 1.1896424411744555, + "grad_norm": 0.16254596412181854, + "learning_rate": 3.207016195445139e-05, + "loss": 1.9208, + "step": 312550 + }, + { + "epoch": 1.1896805036425782, + "grad_norm": 0.1650143563747406, + "learning_rate": 3.202564172809641e-05, + "loss": 1.9371, + "step": 312560 + }, + { + "epoch": 1.1897185661107008, + "grad_norm": 0.18093803524971008, + "learning_rate": 3.19811257367213e-05, + "loss": 1.9401, + "step": 312570 + }, + { + "epoch": 1.1897566285788235, + "grad_norm": 0.16682739555835724, + "learning_rate": 3.1936613979117846e-05, + "loss": 1.9504, + "step": 312580 + }, + { + "epoch": 1.1897946910469464, + "grad_norm": 0.19406753778457642, + "learning_rate": 3.189210645407825e-05, + "loss": 1.9407, + "step": 312590 + }, + { + "epoch": 1.189832753515069, + "grad_norm": 0.17175889015197754, + "learning_rate": 3.184760316039536e-05, + "loss": 1.9384, + "step": 312600 + }, + { + "epoch": 1.1898708159831917, + "grad_norm": 0.18349462747573853, + "learning_rate": 3.1803104096862546e-05, + "loss": 1.931, + "step": 312610 + }, + { + "epoch": 1.1899088784513143, + "grad_norm": 0.18410073220729828, + "learning_rate": 3.175860926227381e-05, + "loss": 1.9407, + "step": 312620 + }, + { + "epoch": 1.189946940919437, + "grad_norm": 0.21190586686134338, + "learning_rate": 3.1714118655423616e-05, + "loss": 1.9425, + "step": 312630 + }, + { + "epoch": 1.1899850033875596, + "grad_norm": 0.1733289361000061, + "learning_rate": 3.166963227510716e-05, + "loss": 1.9366, + "step": 312640 + }, + { + "epoch": 1.1900230658556823, + "grad_norm": 0.18026097118854523, + "learning_rate": 3.162515012012013e-05, + "loss": 1.9445, + "step": 312650 + }, + { + "epoch": 1.190061128323805, + "grad_norm": 0.17617754638195038, + "learning_rate": 3.1580672189258756e-05, + "loss": 1.9361, + "step": 312660 + }, + { + "epoch": 1.1900991907919276, + "grad_norm": 0.20476016402244568, + "learning_rate": 3.153619848131989e-05, + "loss": 1.9575, + "step": 312670 + }, + { + "epoch": 1.1901372532600505, + "grad_norm": 0.19444191455841064, + "learning_rate": 3.1491728995100885e-05, + "loss": 1.9584, + "step": 312680 + }, + { + "epoch": 1.1901753157281731, + "grad_norm": 0.18735110759735107, + "learning_rate": 3.144726372939977e-05, + "loss": 1.9506, + "step": 312690 + }, + { + "epoch": 1.1902133781962958, + "grad_norm": 0.2517315447330475, + "learning_rate": 3.140280268301504e-05, + "loss": 1.9419, + "step": 312700 + }, + { + "epoch": 1.1902514406644185, + "grad_norm": 0.1806691586971283, + "learning_rate": 3.13583458547459e-05, + "loss": 1.9516, + "step": 312710 + }, + { + "epoch": 1.1902895031325411, + "grad_norm": 0.16991178691387177, + "learning_rate": 3.131389324339185e-05, + "loss": 1.9428, + "step": 312720 + }, + { + "epoch": 1.1903275656006638, + "grad_norm": 0.20859868824481964, + "learning_rate": 3.126944484775335e-05, + "loss": 1.9492, + "step": 312730 + }, + { + "epoch": 1.1903656280687864, + "grad_norm": 0.17317542433738708, + "learning_rate": 3.1225000666631084e-05, + "loss": 1.94, + "step": 312740 + }, + { + "epoch": 1.190403690536909, + "grad_norm": 0.16412481665611267, + "learning_rate": 3.118056069882652e-05, + "loss": 1.9554, + "step": 312750 + }, + { + "epoch": 1.190441753005032, + "grad_norm": 0.18478161096572876, + "learning_rate": 3.113612494314161e-05, + "loss": 1.9357, + "step": 312760 + }, + { + "epoch": 1.1904798154731546, + "grad_norm": 0.18147684633731842, + "learning_rate": 3.1091693398378806e-05, + "loss": 1.9553, + "step": 312770 + }, + { + "epoch": 1.1905178779412773, + "grad_norm": 0.15987755358219147, + "learning_rate": 3.104726606334124e-05, + "loss": 1.9422, + "step": 312780 + }, + { + "epoch": 1.1905559404094, + "grad_norm": 0.2043231874704361, + "learning_rate": 3.1002842936832584e-05, + "loss": 1.9432, + "step": 312790 + }, + { + "epoch": 1.1905940028775226, + "grad_norm": 0.22990889847278595, + "learning_rate": 3.095842401765703e-05, + "loss": 1.9499, + "step": 312800 + }, + { + "epoch": 1.1906320653456453, + "grad_norm": 0.16571252048015594, + "learning_rate": 3.091400930461946e-05, + "loss": 1.9481, + "step": 312810 + }, + { + "epoch": 1.190670127813768, + "grad_norm": 0.20370744168758392, + "learning_rate": 3.086959879652512e-05, + "loss": 1.9402, + "step": 312820 + }, + { + "epoch": 1.1907081902818906, + "grad_norm": 0.21053428947925568, + "learning_rate": 3.082519249217997e-05, + "loss": 1.9536, + "step": 312830 + }, + { + "epoch": 1.1907462527500132, + "grad_norm": 0.21404720842838287, + "learning_rate": 3.07807903903905e-05, + "loss": 1.9445, + "step": 312840 + }, + { + "epoch": 1.1907843152181359, + "grad_norm": 0.16291579604148865, + "learning_rate": 3.073639248996374e-05, + "loss": 1.9357, + "step": 312850 + }, + { + "epoch": 1.1908223776862588, + "grad_norm": 0.1799086183309555, + "learning_rate": 3.069199878970741e-05, + "loss": 1.957, + "step": 312860 + }, + { + "epoch": 1.1908604401543814, + "grad_norm": 0.16775865852832794, + "learning_rate": 3.064760928842958e-05, + "loss": 1.947, + "step": 312870 + }, + { + "epoch": 1.190898502622504, + "grad_norm": 0.18161626160144806, + "learning_rate": 3.0603223984939034e-05, + "loss": 1.9356, + "step": 312880 + }, + { + "epoch": 1.1909365650906267, + "grad_norm": 0.20884421467781067, + "learning_rate": 3.055884287804506e-05, + "loss": 1.9548, + "step": 312890 + }, + { + "epoch": 1.1909746275587494, + "grad_norm": 0.21153324842453003, + "learning_rate": 3.0514465966557548e-05, + "loss": 1.9429, + "step": 312900 + }, + { + "epoch": 1.191012690026872, + "grad_norm": 0.18954439461231232, + "learning_rate": 3.04700932492869e-05, + "loss": 1.9546, + "step": 312910 + }, + { + "epoch": 1.1910507524949947, + "grad_norm": 0.18886590003967285, + "learning_rate": 3.042572472504418e-05, + "loss": 1.935, + "step": 312920 + }, + { + "epoch": 1.1910888149631176, + "grad_norm": 0.18550170958042145, + "learning_rate": 3.038136039264089e-05, + "loss": 1.9363, + "step": 312930 + }, + { + "epoch": 1.1911268774312402, + "grad_norm": 0.18696042895317078, + "learning_rate": 3.0337000250889148e-05, + "loss": 1.9352, + "step": 312940 + }, + { + "epoch": 1.191164939899363, + "grad_norm": 0.2550421953201294, + "learning_rate": 3.0292644298601624e-05, + "loss": 1.9396, + "step": 312950 + }, + { + "epoch": 1.1912030023674856, + "grad_norm": 0.23013365268707275, + "learning_rate": 3.0248292534591547e-05, + "loss": 1.9346, + "step": 312960 + }, + { + "epoch": 1.1912410648356082, + "grad_norm": 0.18774259090423584, + "learning_rate": 3.0203944957672812e-05, + "loss": 1.9354, + "step": 312970 + }, + { + "epoch": 1.1912791273037309, + "grad_norm": 0.20159746706485748, + "learning_rate": 3.0159601566659646e-05, + "loss": 1.9391, + "step": 312980 + }, + { + "epoch": 1.1913171897718535, + "grad_norm": 0.21470539271831512, + "learning_rate": 3.01152623603671e-05, + "loss": 1.939, + "step": 312990 + }, + { + "epoch": 1.1913552522399762, + "grad_norm": 0.1757175177335739, + "learning_rate": 3.0070927337610522e-05, + "loss": 1.9355, + "step": 313000 + }, + { + "epoch": 1.1913933147080988, + "grad_norm": 0.23895221948623657, + "learning_rate": 3.0026596497206016e-05, + "loss": 1.9392, + "step": 313010 + }, + { + "epoch": 1.1914313771762215, + "grad_norm": 0.16913849115371704, + "learning_rate": 2.99822698379702e-05, + "loss": 1.9458, + "step": 313020 + }, + { + "epoch": 1.1914694396443444, + "grad_norm": 0.1750878542661667, + "learning_rate": 2.9937947358720187e-05, + "loss": 1.9409, + "step": 313030 + }, + { + "epoch": 1.191507502112467, + "grad_norm": 0.1695113629102707, + "learning_rate": 2.989362905827364e-05, + "loss": 1.9365, + "step": 313040 + }, + { + "epoch": 1.1915455645805897, + "grad_norm": 0.20208555459976196, + "learning_rate": 2.9849314935448957e-05, + "loss": 1.9481, + "step": 313050 + }, + { + "epoch": 1.1915836270487123, + "grad_norm": 0.1992395520210266, + "learning_rate": 2.9805004989064856e-05, + "loss": 1.9431, + "step": 313060 + }, + { + "epoch": 1.191621689516835, + "grad_norm": 0.18389198184013367, + "learning_rate": 2.976069921794078e-05, + "loss": 1.9201, + "step": 313070 + }, + { + "epoch": 1.1916597519849577, + "grad_norm": 0.21777984499931335, + "learning_rate": 2.9716397620896564e-05, + "loss": 1.9528, + "step": 313080 + }, + { + "epoch": 1.1916978144530803, + "grad_norm": 0.17072694003582, + "learning_rate": 2.9672100196752814e-05, + "loss": 1.929, + "step": 313090 + }, + { + "epoch": 1.1917358769212032, + "grad_norm": 0.20009645819664001, + "learning_rate": 2.962780694433054e-05, + "loss": 1.962, + "step": 313100 + }, + { + "epoch": 1.1917739393893259, + "grad_norm": 0.2427327036857605, + "learning_rate": 2.9583517862451338e-05, + "loss": 1.9386, + "step": 313110 + }, + { + "epoch": 1.1918120018574485, + "grad_norm": 0.2267937809228897, + "learning_rate": 2.9539232949937324e-05, + "loss": 1.9516, + "step": 313120 + }, + { + "epoch": 1.1918500643255712, + "grad_norm": 0.2037351280450821, + "learning_rate": 2.9494952205611326e-05, + "loss": 1.9547, + "step": 313130 + }, + { + "epoch": 1.1918881267936938, + "grad_norm": 0.18010416626930237, + "learning_rate": 2.9450675628296454e-05, + "loss": 1.9486, + "step": 313140 + }, + { + "epoch": 1.1919261892618165, + "grad_norm": 0.1703222244977951, + "learning_rate": 2.9406403216816645e-05, + "loss": 1.9436, + "step": 313150 + }, + { + "epoch": 1.1919642517299391, + "grad_norm": 0.17430193722248077, + "learning_rate": 2.936213496999629e-05, + "loss": 1.937, + "step": 313160 + }, + { + "epoch": 1.1920023141980618, + "grad_norm": 0.21456824243068695, + "learning_rate": 2.9317870886660215e-05, + "loss": 1.9408, + "step": 313170 + }, + { + "epoch": 1.1920403766661845, + "grad_norm": 0.18208642303943634, + "learning_rate": 2.9273610965633966e-05, + "loss": 1.9482, + "step": 313180 + }, + { + "epoch": 1.192078439134307, + "grad_norm": 0.23920206725597382, + "learning_rate": 2.9229355205743546e-05, + "loss": 1.9449, + "step": 313190 + }, + { + "epoch": 1.19211650160243, + "grad_norm": 0.16931700706481934, + "learning_rate": 2.91851036058155e-05, + "loss": 1.9462, + "step": 313200 + }, + { + "epoch": 1.1921545640705526, + "grad_norm": 0.18483975529670715, + "learning_rate": 2.9140856164677043e-05, + "loss": 1.9318, + "step": 313210 + }, + { + "epoch": 1.1921926265386753, + "grad_norm": 0.21092703938484192, + "learning_rate": 2.9096612881155837e-05, + "loss": 1.9254, + "step": 313220 + }, + { + "epoch": 1.192230689006798, + "grad_norm": 0.21394091844558716, + "learning_rate": 2.9052373754080096e-05, + "loss": 1.9402, + "step": 313230 + }, + { + "epoch": 1.1922687514749206, + "grad_norm": 0.20954100787639618, + "learning_rate": 2.900813878227859e-05, + "loss": 1.9382, + "step": 313240 + }, + { + "epoch": 1.1923068139430433, + "grad_norm": 0.17000707983970642, + "learning_rate": 2.8963907964580705e-05, + "loss": 1.9459, + "step": 313250 + }, + { + "epoch": 1.192344876411166, + "grad_norm": 0.24068133533000946, + "learning_rate": 2.8919681299816258e-05, + "loss": 1.9385, + "step": 313260 + }, + { + "epoch": 1.1923829388792888, + "grad_norm": 0.20357105135917664, + "learning_rate": 2.8875458786815745e-05, + "loss": 1.936, + "step": 313270 + }, + { + "epoch": 1.1924210013474115, + "grad_norm": 0.16180942952632904, + "learning_rate": 2.8831240424410153e-05, + "loss": 1.9408, + "step": 313280 + }, + { + "epoch": 1.1924590638155341, + "grad_norm": 0.19413597881793976, + "learning_rate": 2.878702621143092e-05, + "loss": 1.9403, + "step": 313290 + }, + { + "epoch": 1.1924971262836568, + "grad_norm": 0.17977626621723175, + "learning_rate": 2.874281614671026e-05, + "loss": 1.9362, + "step": 313300 + }, + { + "epoch": 1.1925351887517794, + "grad_norm": 0.18004660308361053, + "learning_rate": 2.8698610229080712e-05, + "loss": 1.9421, + "step": 313310 + }, + { + "epoch": 1.192573251219902, + "grad_norm": 0.1666060835123062, + "learning_rate": 2.8654408457375436e-05, + "loss": 1.951, + "step": 313320 + }, + { + "epoch": 1.1926113136880248, + "grad_norm": 0.1712903380393982, + "learning_rate": 2.8610210830428194e-05, + "loss": 1.9472, + "step": 313330 + }, + { + "epoch": 1.1926493761561474, + "grad_norm": 0.1879504919052124, + "learning_rate": 2.8566017347073316e-05, + "loss": 1.9448, + "step": 313340 + }, + { + "epoch": 1.19268743862427, + "grad_norm": 0.20638255774974823, + "learning_rate": 2.85218280061455e-05, + "loss": 1.9425, + "step": 313350 + }, + { + "epoch": 1.1927255010923927, + "grad_norm": 0.20174163579940796, + "learning_rate": 2.847764280648013e-05, + "loss": 1.9428, + "step": 313360 + }, + { + "epoch": 1.1927635635605156, + "grad_norm": 0.16518354415893555, + "learning_rate": 2.843346174691319e-05, + "loss": 1.9474, + "step": 313370 + }, + { + "epoch": 1.1928016260286383, + "grad_norm": 0.174319326877594, + "learning_rate": 2.838928482628106e-05, + "loss": 1.9429, + "step": 313380 + }, + { + "epoch": 1.192839688496761, + "grad_norm": 0.177846297621727, + "learning_rate": 2.834511204342072e-05, + "loss": 1.9572, + "step": 313390 + }, + { + "epoch": 1.1928777509648836, + "grad_norm": 0.1643485277891159, + "learning_rate": 2.8300943397169822e-05, + "loss": 1.9542, + "step": 313400 + }, + { + "epoch": 1.1929158134330062, + "grad_norm": 0.2616300880908966, + "learning_rate": 2.8256778886366352e-05, + "loss": 1.9307, + "step": 313410 + }, + { + "epoch": 1.192953875901129, + "grad_norm": 0.1790495663881302, + "learning_rate": 2.8212618509848962e-05, + "loss": 1.9297, + "step": 313420 + }, + { + "epoch": 1.1929919383692515, + "grad_norm": 0.2060040533542633, + "learning_rate": 2.8168462266456808e-05, + "loss": 1.9326, + "step": 313430 + }, + { + "epoch": 1.1930300008373742, + "grad_norm": 0.1828863024711609, + "learning_rate": 2.8124310155029644e-05, + "loss": 1.9327, + "step": 313440 + }, + { + "epoch": 1.193068063305497, + "grad_norm": 0.20013552904129028, + "learning_rate": 2.8080162174407674e-05, + "loss": 1.9428, + "step": 313450 + }, + { + "epoch": 1.1931061257736197, + "grad_norm": 0.18757100403308868, + "learning_rate": 2.803601832343178e-05, + "loss": 1.9338, + "step": 313460 + }, + { + "epoch": 1.1931441882417424, + "grad_norm": 0.2295101135969162, + "learning_rate": 2.7991878600943264e-05, + "loss": 1.9467, + "step": 313470 + }, + { + "epoch": 1.193182250709865, + "grad_norm": 0.20257122814655304, + "learning_rate": 2.794774300578401e-05, + "loss": 1.9423, + "step": 313480 + }, + { + "epoch": 1.1932203131779877, + "grad_norm": 0.16395969688892365, + "learning_rate": 2.7903611536796436e-05, + "loss": 1.9381, + "step": 313490 + }, + { + "epoch": 1.1932583756461104, + "grad_norm": 0.19475962221622467, + "learning_rate": 2.7859484192823526e-05, + "loss": 1.9327, + "step": 313500 + }, + { + "epoch": 1.193296438114233, + "grad_norm": 0.18422411382198334, + "learning_rate": 2.781536097270876e-05, + "loss": 1.9462, + "step": 313510 + }, + { + "epoch": 1.1933345005823557, + "grad_norm": 0.2034992128610611, + "learning_rate": 2.7771241875296226e-05, + "loss": 1.9518, + "step": 313520 + }, + { + "epoch": 1.1933725630504783, + "grad_norm": 0.20844464004039764, + "learning_rate": 2.7727126899430465e-05, + "loss": 1.927, + "step": 313530 + }, + { + "epoch": 1.1934106255186012, + "grad_norm": 0.26188117265701294, + "learning_rate": 2.7683016043956622e-05, + "loss": 1.9378, + "step": 313540 + }, + { + "epoch": 1.1934486879867239, + "grad_norm": 0.19190119206905365, + "learning_rate": 2.7638909307720394e-05, + "loss": 1.9423, + "step": 313550 + }, + { + "epoch": 1.1934867504548465, + "grad_norm": 0.2001878321170807, + "learning_rate": 2.7594806689567932e-05, + "loss": 1.9494, + "step": 313560 + }, + { + "epoch": 1.1935248129229692, + "grad_norm": 0.18613621592521667, + "learning_rate": 2.7550708188346042e-05, + "loss": 1.9432, + "step": 313570 + }, + { + "epoch": 1.1935628753910918, + "grad_norm": 0.17761319875717163, + "learning_rate": 2.750661380290198e-05, + "loss": 1.9453, + "step": 313580 + }, + { + "epoch": 1.1936009378592145, + "grad_norm": 0.1721399426460266, + "learning_rate": 2.746252353208356e-05, + "loss": 1.9284, + "step": 313590 + }, + { + "epoch": 1.1936390003273372, + "grad_norm": 0.21422426402568817, + "learning_rate": 2.7418437374739146e-05, + "loss": 1.9455, + "step": 313600 + }, + { + "epoch": 1.1936770627954598, + "grad_norm": 0.1898270547389984, + "learning_rate": 2.7374355329717658e-05, + "loss": 1.9361, + "step": 313610 + }, + { + "epoch": 1.1937151252635827, + "grad_norm": 0.17938223481178284, + "learning_rate": 2.7330277395868463e-05, + "loss": 1.9244, + "step": 313620 + }, + { + "epoch": 1.1937531877317054, + "grad_norm": 0.1766996681690216, + "learning_rate": 2.7286203572041534e-05, + "loss": 1.94, + "step": 313630 + }, + { + "epoch": 1.193791250199828, + "grad_norm": 0.20516465604305267, + "learning_rate": 2.724213385708746e-05, + "loss": 1.9421, + "step": 313640 + }, + { + "epoch": 1.1938293126679507, + "grad_norm": 0.19619545340538025, + "learning_rate": 2.7198068249857212e-05, + "loss": 1.9409, + "step": 313650 + }, + { + "epoch": 1.1938673751360733, + "grad_norm": 0.16490577161312103, + "learning_rate": 2.7154006749202374e-05, + "loss": 1.9446, + "step": 313660 + }, + { + "epoch": 1.193905437604196, + "grad_norm": 0.1935669481754303, + "learning_rate": 2.7109949353975093e-05, + "loss": 1.952, + "step": 313670 + }, + { + "epoch": 1.1939435000723186, + "grad_norm": 0.18772977590560913, + "learning_rate": 2.706589606302795e-05, + "loss": 1.9411, + "step": 313680 + }, + { + "epoch": 1.1939815625404413, + "grad_norm": 0.16066856682300568, + "learning_rate": 2.7021846875214196e-05, + "loss": 1.9517, + "step": 313690 + }, + { + "epoch": 1.194019625008564, + "grad_norm": 0.22064299881458282, + "learning_rate": 2.6977801789387467e-05, + "loss": 1.9477, + "step": 313700 + }, + { + "epoch": 1.1940576874766868, + "grad_norm": 0.16915158927440643, + "learning_rate": 2.6933760804402074e-05, + "loss": 1.9415, + "step": 313710 + }, + { + "epoch": 1.1940957499448095, + "grad_norm": 0.17556576430797577, + "learning_rate": 2.688972391911276e-05, + "loss": 1.9364, + "step": 313720 + }, + { + "epoch": 1.1941338124129321, + "grad_norm": 0.16539603471755981, + "learning_rate": 2.6845691132374893e-05, + "loss": 1.9474, + "step": 313730 + }, + { + "epoch": 1.1941718748810548, + "grad_norm": 0.22699984908103943, + "learning_rate": 2.6801662443044216e-05, + "loss": 1.9348, + "step": 313740 + }, + { + "epoch": 1.1942099373491775, + "grad_norm": 0.1771809309720993, + "learning_rate": 2.67576378499772e-05, + "loss": 1.9524, + "step": 313750 + }, + { + "epoch": 1.1942479998173001, + "grad_norm": 0.2590922713279724, + "learning_rate": 2.6713617352030706e-05, + "loss": 1.9448, + "step": 313760 + }, + { + "epoch": 1.1942860622854228, + "grad_norm": 0.19297786056995392, + "learning_rate": 2.6669600948062202e-05, + "loss": 1.9471, + "step": 313770 + }, + { + "epoch": 1.1943241247535454, + "grad_norm": 0.2514785826206207, + "learning_rate": 2.6625588636929655e-05, + "loss": 1.9442, + "step": 313780 + }, + { + "epoch": 1.1943621872216683, + "grad_norm": 0.1699245721101761, + "learning_rate": 2.658158041749159e-05, + "loss": 1.9483, + "step": 313790 + }, + { + "epoch": 1.194400249689791, + "grad_norm": 0.19253213703632355, + "learning_rate": 2.653757628860698e-05, + "loss": 1.9327, + "step": 313800 + }, + { + "epoch": 1.1944383121579136, + "grad_norm": 0.1772114783525467, + "learning_rate": 2.64935762491354e-05, + "loss": 1.9418, + "step": 313810 + }, + { + "epoch": 1.1944763746260363, + "grad_norm": 0.1950380653142929, + "learning_rate": 2.6449580297937036e-05, + "loss": 1.9528, + "step": 313820 + }, + { + "epoch": 1.194514437094159, + "grad_norm": 0.17496085166931152, + "learning_rate": 2.6405588433872363e-05, + "loss": 1.9434, + "step": 313830 + }, + { + "epoch": 1.1945524995622816, + "grad_norm": 0.18000604212284088, + "learning_rate": 2.6361600655802674e-05, + "loss": 1.9442, + "step": 313840 + }, + { + "epoch": 1.1945905620304043, + "grad_norm": 0.19483064115047455, + "learning_rate": 2.6317616962589607e-05, + "loss": 1.9437, + "step": 313850 + }, + { + "epoch": 1.194628624498527, + "grad_norm": 0.18538741767406464, + "learning_rate": 2.6273637353095293e-05, + "loss": 1.9333, + "step": 313860 + }, + { + "epoch": 1.1946666869666496, + "grad_norm": 0.1824004203081131, + "learning_rate": 2.622966182618258e-05, + "loss": 1.943, + "step": 313870 + }, + { + "epoch": 1.1947047494347722, + "grad_norm": 0.16097493469715118, + "learning_rate": 2.6185690380714667e-05, + "loss": 1.9396, + "step": 313880 + }, + { + "epoch": 1.194742811902895, + "grad_norm": 0.1737389713525772, + "learning_rate": 2.614172301555534e-05, + "loss": 1.9427, + "step": 313890 + }, + { + "epoch": 1.1947808743710178, + "grad_norm": 0.2533680498600006, + "learning_rate": 2.609775972956896e-05, + "loss": 1.9475, + "step": 313900 + }, + { + "epoch": 1.1948189368391404, + "grad_norm": 0.21982336044311523, + "learning_rate": 2.6053800521620373e-05, + "loss": 1.9465, + "step": 313910 + }, + { + "epoch": 1.194856999307263, + "grad_norm": 0.16938644647598267, + "learning_rate": 2.6009845390574938e-05, + "loss": 1.9566, + "step": 313920 + }, + { + "epoch": 1.1948950617753857, + "grad_norm": 0.17015881836414337, + "learning_rate": 2.5965894335298556e-05, + "loss": 1.9493, + "step": 313930 + }, + { + "epoch": 1.1949331242435084, + "grad_norm": 0.21307599544525146, + "learning_rate": 2.5921947354657637e-05, + "loss": 1.9352, + "step": 313940 + }, + { + "epoch": 1.194971186711631, + "grad_norm": 0.23158085346221924, + "learning_rate": 2.5878004447519144e-05, + "loss": 1.9418, + "step": 313950 + }, + { + "epoch": 1.195009249179754, + "grad_norm": 0.20640724897384644, + "learning_rate": 2.583406561275059e-05, + "loss": 1.9333, + "step": 313960 + }, + { + "epoch": 1.1950473116478766, + "grad_norm": 0.20745491981506348, + "learning_rate": 2.5790130849219882e-05, + "loss": 1.9252, + "step": 313970 + }, + { + "epoch": 1.1950853741159992, + "grad_norm": 0.21391427516937256, + "learning_rate": 2.5746200155795652e-05, + "loss": 1.933, + "step": 313980 + }, + { + "epoch": 1.195123436584122, + "grad_norm": 0.19119727611541748, + "learning_rate": 2.5702273531346853e-05, + "loss": 1.9459, + "step": 313990 + }, + { + "epoch": 1.1951614990522446, + "grad_norm": 0.17244857549667358, + "learning_rate": 2.5658350974743118e-05, + "loss": 1.9333, + "step": 314000 + }, + { + "epoch": 1.1951995615203672, + "grad_norm": 0.18785542249679565, + "learning_rate": 2.561443248485451e-05, + "loss": 1.9409, + "step": 314010 + }, + { + "epoch": 1.1952376239884899, + "grad_norm": 0.17021456360816956, + "learning_rate": 2.5570518060551662e-05, + "loss": 1.9427, + "step": 314020 + }, + { + "epoch": 1.1952756864566125, + "grad_norm": 0.17522631585597992, + "learning_rate": 2.5526607700705752e-05, + "loss": 1.9404, + "step": 314030 + }, + { + "epoch": 1.1953137489247352, + "grad_norm": 0.2075173407793045, + "learning_rate": 2.5482701404188346e-05, + "loss": 1.9444, + "step": 314040 + }, + { + "epoch": 1.1953518113928578, + "grad_norm": 0.17764155566692352, + "learning_rate": 2.543879916987174e-05, + "loss": 1.9352, + "step": 314050 + }, + { + "epoch": 1.1953898738609807, + "grad_norm": 0.17194154858589172, + "learning_rate": 2.5394900996628557e-05, + "loss": 1.9521, + "step": 314060 + }, + { + "epoch": 1.1954279363291034, + "grad_norm": 0.16174156963825226, + "learning_rate": 2.5351006883332085e-05, + "loss": 1.9441, + "step": 314070 + }, + { + "epoch": 1.195465998797226, + "grad_norm": 0.17151190340518951, + "learning_rate": 2.5307116828856058e-05, + "loss": 1.9378, + "step": 314080 + }, + { + "epoch": 1.1955040612653487, + "grad_norm": 0.20262061059474945, + "learning_rate": 2.5263230832074712e-05, + "loss": 1.9404, + "step": 314090 + }, + { + "epoch": 1.1955421237334714, + "grad_norm": 0.2190033197402954, + "learning_rate": 2.5219348891862836e-05, + "loss": 1.9277, + "step": 314100 + }, + { + "epoch": 1.195580186201594, + "grad_norm": 0.2151559442281723, + "learning_rate": 2.5175471007095775e-05, + "loss": 1.9598, + "step": 314110 + }, + { + "epoch": 1.1956182486697167, + "grad_norm": 0.3101893961429596, + "learning_rate": 2.513159717664937e-05, + "loss": 1.9379, + "step": 314120 + }, + { + "epoch": 1.1956563111378395, + "grad_norm": 0.19195280969142914, + "learning_rate": 2.5087727399399963e-05, + "loss": 1.9548, + "step": 314130 + }, + { + "epoch": 1.1956943736059622, + "grad_norm": 0.19188041985034943, + "learning_rate": 2.5043861674224344e-05, + "loss": 1.9448, + "step": 314140 + }, + { + "epoch": 1.1957324360740849, + "grad_norm": 0.16497381031513214, + "learning_rate": 2.500000000000002e-05, + "loss": 1.9446, + "step": 314150 + }, + { + "epoch": 1.1957704985422075, + "grad_norm": 0.1741824448108673, + "learning_rate": 2.4956142375604785e-05, + "loss": 1.9393, + "step": 314160 + }, + { + "epoch": 1.1958085610103302, + "grad_norm": 0.19525335729122162, + "learning_rate": 2.491228879991714e-05, + "loss": 1.9339, + "step": 314170 + }, + { + "epoch": 1.1958466234784528, + "grad_norm": 0.23857097327709198, + "learning_rate": 2.486843927181598e-05, + "loss": 1.9416, + "step": 314180 + }, + { + "epoch": 1.1958846859465755, + "grad_norm": 0.21334300935268402, + "learning_rate": 2.4824593790180814e-05, + "loss": 1.9469, + "step": 314190 + }, + { + "epoch": 1.1959227484146981, + "grad_norm": 0.1724768429994583, + "learning_rate": 2.47807523538916e-05, + "loss": 1.9473, + "step": 314200 + }, + { + "epoch": 1.1959608108828208, + "grad_norm": 0.17365297675132751, + "learning_rate": 2.4736914961828783e-05, + "loss": 1.9458, + "step": 314210 + }, + { + "epoch": 1.1959988733509435, + "grad_norm": 0.18585903942584991, + "learning_rate": 2.469308161287337e-05, + "loss": 1.9241, + "step": 314220 + }, + { + "epoch": 1.1960369358190663, + "grad_norm": 0.1692325323820114, + "learning_rate": 2.464925230590692e-05, + "loss": 1.9273, + "step": 314230 + }, + { + "epoch": 1.196074998287189, + "grad_norm": 0.1854136884212494, + "learning_rate": 2.46054270398115e-05, + "loss": 1.9353, + "step": 314240 + }, + { + "epoch": 1.1961130607553117, + "grad_norm": 0.17451083660125732, + "learning_rate": 2.456160581346961e-05, + "loss": 1.9396, + "step": 314250 + }, + { + "epoch": 1.1961511232234343, + "grad_norm": 0.23344092071056366, + "learning_rate": 2.4517788625764314e-05, + "loss": 1.9489, + "step": 314260 + }, + { + "epoch": 1.196189185691557, + "grad_norm": 0.17425473034381866, + "learning_rate": 2.447397547557928e-05, + "loss": 1.944, + "step": 314270 + }, + { + "epoch": 1.1962272481596796, + "grad_norm": 0.18360301852226257, + "learning_rate": 2.443016636179851e-05, + "loss": 1.9276, + "step": 314280 + }, + { + "epoch": 1.1962653106278023, + "grad_norm": 0.16981305181980133, + "learning_rate": 2.4386361283306623e-05, + "loss": 1.9358, + "step": 314290 + }, + { + "epoch": 1.196303373095925, + "grad_norm": 0.16044333577156067, + "learning_rate": 2.4342560238988788e-05, + "loss": 1.9372, + "step": 314300 + }, + { + "epoch": 1.1963414355640478, + "grad_norm": 0.21252797544002533, + "learning_rate": 2.429876322773067e-05, + "loss": 1.9511, + "step": 314310 + }, + { + "epoch": 1.1963794980321705, + "grad_norm": 0.17850229144096375, + "learning_rate": 2.425497024841833e-05, + "loss": 1.9317, + "step": 314320 + }, + { + "epoch": 1.1964175605002931, + "grad_norm": 0.16965432465076447, + "learning_rate": 2.421118129993849e-05, + "loss": 1.9541, + "step": 314330 + }, + { + "epoch": 1.1964556229684158, + "grad_norm": 0.17304007709026337, + "learning_rate": 2.416739638117832e-05, + "loss": 1.942, + "step": 314340 + }, + { + "epoch": 1.1964936854365384, + "grad_norm": 0.19845229387283325, + "learning_rate": 2.4123615491025486e-05, + "loss": 1.939, + "step": 314350 + }, + { + "epoch": 1.196531747904661, + "grad_norm": 0.17881131172180176, + "learning_rate": 2.4079838628368268e-05, + "loss": 1.9412, + "step": 314360 + }, + { + "epoch": 1.1965698103727838, + "grad_norm": 0.21299217641353607, + "learning_rate": 2.4036065792095274e-05, + "loss": 1.9476, + "step": 314370 + }, + { + "epoch": 1.1966078728409064, + "grad_norm": 0.24780753254890442, + "learning_rate": 2.3992296981095786e-05, + "loss": 1.9381, + "step": 314380 + }, + { + "epoch": 1.196645935309029, + "grad_norm": 0.1839221566915512, + "learning_rate": 2.3948532194259465e-05, + "loss": 1.9472, + "step": 314390 + }, + { + "epoch": 1.196683997777152, + "grad_norm": 0.2312176525592804, + "learning_rate": 2.3904771430476702e-05, + "loss": 1.9393, + "step": 314400 + }, + { + "epoch": 1.1967220602452746, + "grad_norm": 0.1622578650712967, + "learning_rate": 2.386101468863805e-05, + "loss": 1.9414, + "step": 314410 + }, + { + "epoch": 1.1967601227133973, + "grad_norm": 0.16811257600784302, + "learning_rate": 2.381726196763495e-05, + "loss": 1.9453, + "step": 314420 + }, + { + "epoch": 1.19679818518152, + "grad_norm": 0.17034026980400085, + "learning_rate": 2.377351326635907e-05, + "loss": 1.9404, + "step": 314430 + }, + { + "epoch": 1.1968362476496426, + "grad_norm": 0.1621735841035843, + "learning_rate": 2.3729768583702737e-05, + "loss": 1.959, + "step": 314440 + }, + { + "epoch": 1.1968743101177652, + "grad_norm": 0.20885998010635376, + "learning_rate": 2.3686027918558727e-05, + "loss": 1.9579, + "step": 314450 + }, + { + "epoch": 1.196912372585888, + "grad_norm": 0.18931780755519867, + "learning_rate": 2.364229126982037e-05, + "loss": 1.9541, + "step": 314460 + }, + { + "epoch": 1.1969504350540106, + "grad_norm": 0.1982506960630417, + "learning_rate": 2.3598558636381386e-05, + "loss": 1.9369, + "step": 314470 + }, + { + "epoch": 1.1969884975221334, + "grad_norm": 0.21738344430923462, + "learning_rate": 2.3554830017136153e-05, + "loss": 1.9427, + "step": 314480 + }, + { + "epoch": 1.197026559990256, + "grad_norm": 0.18937720358371735, + "learning_rate": 2.3511105410979505e-05, + "loss": 1.9383, + "step": 314490 + }, + { + "epoch": 1.1970646224583787, + "grad_norm": 0.2373971939086914, + "learning_rate": 2.346738481680677e-05, + "loss": 1.9378, + "step": 314500 + }, + { + "epoch": 1.1971026849265014, + "grad_norm": 0.21745845675468445, + "learning_rate": 2.342366823351372e-05, + "loss": 1.9449, + "step": 314510 + }, + { + "epoch": 1.197140747394624, + "grad_norm": 0.17506791651248932, + "learning_rate": 2.3379955659996732e-05, + "loss": 1.9456, + "step": 314520 + }, + { + "epoch": 1.1971788098627467, + "grad_norm": 0.17458729445934296, + "learning_rate": 2.3336247095152697e-05, + "loss": 1.9456, + "step": 314530 + }, + { + "epoch": 1.1972168723308694, + "grad_norm": 0.17563146352767944, + "learning_rate": 2.329254253787888e-05, + "loss": 1.9325, + "step": 314540 + }, + { + "epoch": 1.197254934798992, + "grad_norm": 0.18956497311592102, + "learning_rate": 2.3248841987073222e-05, + "loss": 1.9448, + "step": 314550 + }, + { + "epoch": 1.1972929972671147, + "grad_norm": 0.17544913291931152, + "learning_rate": 2.3205145441634046e-05, + "loss": 1.9445, + "step": 314560 + }, + { + "epoch": 1.1973310597352376, + "grad_norm": 0.2312907725572586, + "learning_rate": 2.3161452900460235e-05, + "loss": 1.9474, + "step": 314570 + }, + { + "epoch": 1.1973691222033602, + "grad_norm": 0.1695864498615265, + "learning_rate": 2.3117764362451167e-05, + "loss": 1.9376, + "step": 314580 + }, + { + "epoch": 1.1974071846714829, + "grad_norm": 0.2052493542432785, + "learning_rate": 2.3074079826506668e-05, + "loss": 1.9466, + "step": 314590 + }, + { + "epoch": 1.1974452471396055, + "grad_norm": 0.1754942685365677, + "learning_rate": 2.303039929152717e-05, + "loss": 1.9307, + "step": 314600 + }, + { + "epoch": 1.1974833096077282, + "grad_norm": 0.16554835438728333, + "learning_rate": 2.298672275641356e-05, + "loss": 1.9401, + "step": 314610 + }, + { + "epoch": 1.1975213720758509, + "grad_norm": 0.1665649712085724, + "learning_rate": 2.2943050220067207e-05, + "loss": 1.9384, + "step": 314620 + }, + { + "epoch": 1.1975594345439735, + "grad_norm": 0.17551663517951965, + "learning_rate": 2.2899381681389997e-05, + "loss": 1.9471, + "step": 314630 + }, + { + "epoch": 1.1975974970120962, + "grad_norm": 0.1639355570077896, + "learning_rate": 2.2855717139284304e-05, + "loss": 1.9447, + "step": 314640 + }, + { + "epoch": 1.197635559480219, + "grad_norm": 0.19529812037944794, + "learning_rate": 2.2812056592653064e-05, + "loss": 1.9564, + "step": 314650 + }, + { + "epoch": 1.1976736219483417, + "grad_norm": 0.21838751435279846, + "learning_rate": 2.276840004039965e-05, + "loss": 1.9413, + "step": 314660 + }, + { + "epoch": 1.1977116844164644, + "grad_norm": 0.16338661313056946, + "learning_rate": 2.2724747481428e-05, + "loss": 1.9572, + "step": 314670 + }, + { + "epoch": 1.197749746884587, + "grad_norm": 0.2040199339389801, + "learning_rate": 2.2681098914642483e-05, + "loss": 1.9375, + "step": 314680 + }, + { + "epoch": 1.1977878093527097, + "grad_norm": 0.2077571004629135, + "learning_rate": 2.263745433894798e-05, + "loss": 1.9341, + "step": 314690 + }, + { + "epoch": 1.1978258718208323, + "grad_norm": 0.16745197772979736, + "learning_rate": 2.259381375324987e-05, + "loss": 1.9454, + "step": 314700 + }, + { + "epoch": 1.197863934288955, + "grad_norm": 0.22090986371040344, + "learning_rate": 2.255017715645413e-05, + "loss": 1.9298, + "step": 314710 + }, + { + "epoch": 1.1979019967570776, + "grad_norm": 0.2772158086299896, + "learning_rate": 2.250654454746709e-05, + "loss": 1.9289, + "step": 314720 + }, + { + "epoch": 1.1979400592252003, + "grad_norm": 0.2125716656446457, + "learning_rate": 2.2462915925195727e-05, + "loss": 1.9379, + "step": 314730 + }, + { + "epoch": 1.197978121693323, + "grad_norm": 0.21410690248012543, + "learning_rate": 2.2419291288547417e-05, + "loss": 1.9402, + "step": 314740 + }, + { + "epoch": 1.1980161841614458, + "grad_norm": 0.25055932998657227, + "learning_rate": 2.237567063642998e-05, + "loss": 1.953, + "step": 314750 + }, + { + "epoch": 1.1980542466295685, + "grad_norm": 0.2538086473941803, + "learning_rate": 2.233205396775195e-05, + "loss": 1.9401, + "step": 314760 + }, + { + "epoch": 1.1980923090976912, + "grad_norm": 0.19334672391414642, + "learning_rate": 2.2288441281422045e-05, + "loss": 1.9182, + "step": 314770 + }, + { + "epoch": 1.1981303715658138, + "grad_norm": 0.18246856331825256, + "learning_rate": 2.224483257634985e-05, + "loss": 1.9641, + "step": 314780 + }, + { + "epoch": 1.1981684340339365, + "grad_norm": 0.16561700403690338, + "learning_rate": 2.2201227851445126e-05, + "loss": 1.956, + "step": 314790 + }, + { + "epoch": 1.1982064965020591, + "grad_norm": 0.182077556848526, + "learning_rate": 2.2157627105618307e-05, + "loss": 1.9251, + "step": 314800 + }, + { + "epoch": 1.1982445589701818, + "grad_norm": 0.21854285895824432, + "learning_rate": 2.2114030337780257e-05, + "loss": 1.9317, + "step": 314810 + }, + { + "epoch": 1.1982826214383047, + "grad_norm": 0.1683615893125534, + "learning_rate": 2.2070437546842403e-05, + "loss": 1.9418, + "step": 314820 + }, + { + "epoch": 1.1983206839064273, + "grad_norm": 0.20173421502113342, + "learning_rate": 2.2026848731716566e-05, + "loss": 1.9415, + "step": 314830 + }, + { + "epoch": 1.19835874637455, + "grad_norm": 0.17751389741897583, + "learning_rate": 2.198326389131522e-05, + "loss": 1.9401, + "step": 314840 + }, + { + "epoch": 1.1983968088426726, + "grad_norm": 0.1780787855386734, + "learning_rate": 2.1939683024551128e-05, + "loss": 1.9293, + "step": 314850 + }, + { + "epoch": 1.1984348713107953, + "grad_norm": 0.1698933094739914, + "learning_rate": 2.1896106130337657e-05, + "loss": 1.9442, + "step": 314860 + }, + { + "epoch": 1.198472933778918, + "grad_norm": 0.18079432845115662, + "learning_rate": 2.1852533207588786e-05, + "loss": 1.9448, + "step": 314870 + }, + { + "epoch": 1.1985109962470406, + "grad_norm": 0.19679990410804749, + "learning_rate": 2.1808964255218778e-05, + "loss": 1.9552, + "step": 314880 + }, + { + "epoch": 1.1985490587151633, + "grad_norm": 0.19070780277252197, + "learning_rate": 2.176539927214244e-05, + "loss": 1.9346, + "step": 314890 + }, + { + "epoch": 1.198587121183286, + "grad_norm": 0.1800595223903656, + "learning_rate": 2.1721838257275195e-05, + "loss": 1.9378, + "step": 314900 + }, + { + "epoch": 1.1986251836514086, + "grad_norm": 0.22656163573265076, + "learning_rate": 2.1678281209532912e-05, + "loss": 1.9489, + "step": 314910 + }, + { + "epoch": 1.1986632461195315, + "grad_norm": 0.16311343014240265, + "learning_rate": 2.163472812783185e-05, + "loss": 1.942, + "step": 314920 + }, + { + "epoch": 1.1987013085876541, + "grad_norm": 0.1718357354402542, + "learning_rate": 2.1591179011088868e-05, + "loss": 1.9385, + "step": 314930 + }, + { + "epoch": 1.1987393710557768, + "grad_norm": 0.18059194087982178, + "learning_rate": 2.1547633858221284e-05, + "loss": 1.9283, + "step": 314940 + }, + { + "epoch": 1.1987774335238994, + "grad_norm": 0.2025146633386612, + "learning_rate": 2.15040926681469e-05, + "loss": 1.9472, + "step": 314950 + }, + { + "epoch": 1.198815495992022, + "grad_norm": 0.15839137136936188, + "learning_rate": 2.1460555439784036e-05, + "loss": 1.9309, + "step": 314960 + }, + { + "epoch": 1.1988535584601447, + "grad_norm": 0.2715093195438385, + "learning_rate": 2.1417022172051492e-05, + "loss": 1.945, + "step": 314970 + }, + { + "epoch": 1.1988916209282674, + "grad_norm": 0.18683266639709473, + "learning_rate": 2.1373492863868526e-05, + "loss": 1.9467, + "step": 314980 + }, + { + "epoch": 1.1989296833963903, + "grad_norm": 0.17632196843624115, + "learning_rate": 2.132996751415489e-05, + "loss": 1.9365, + "step": 314990 + }, + { + "epoch": 1.198967745864513, + "grad_norm": 0.17075899243354797, + "learning_rate": 2.128644612183095e-05, + "loss": 1.9374, + "step": 315000 + }, + { + "epoch": 1.1990058083326356, + "grad_norm": 0.1725279837846756, + "learning_rate": 2.1242928685817396e-05, + "loss": 1.9407, + "step": 315010 + }, + { + "epoch": 1.1990438708007582, + "grad_norm": 0.19900698959827423, + "learning_rate": 2.119941520503549e-05, + "loss": 1.9349, + "step": 315020 + }, + { + "epoch": 1.199081933268881, + "grad_norm": 0.30478909611701965, + "learning_rate": 2.1155905678406972e-05, + "loss": 1.9411, + "step": 315030 + }, + { + "epoch": 1.1991199957370036, + "grad_norm": 0.21382011473178864, + "learning_rate": 2.11124001048541e-05, + "loss": 1.9448, + "step": 315040 + }, + { + "epoch": 1.1991580582051262, + "grad_norm": 0.19819001853466034, + "learning_rate": 2.106889848329957e-05, + "loss": 1.9446, + "step": 315050 + }, + { + "epoch": 1.1991961206732489, + "grad_norm": 0.16067779064178467, + "learning_rate": 2.102540081266663e-05, + "loss": 1.9528, + "step": 315060 + }, + { + "epoch": 1.1992341831413715, + "grad_norm": 0.24102060496807098, + "learning_rate": 2.0981907091878916e-05, + "loss": 1.9434, + "step": 315070 + }, + { + "epoch": 1.1992722456094942, + "grad_norm": 0.1644870638847351, + "learning_rate": 2.0938417319860626e-05, + "loss": 1.9393, + "step": 315080 + }, + { + "epoch": 1.199310308077617, + "grad_norm": 0.25682348012924194, + "learning_rate": 2.0894931495536507e-05, + "loss": 1.9235, + "step": 315090 + }, + { + "epoch": 1.1993483705457397, + "grad_norm": 0.16999438405036926, + "learning_rate": 2.085144961783164e-05, + "loss": 1.9349, + "step": 315100 + }, + { + "epoch": 1.1993864330138624, + "grad_norm": 0.1793372631072998, + "learning_rate": 2.0807971685671723e-05, + "loss": 1.9546, + "step": 315110 + }, + { + "epoch": 1.199424495481985, + "grad_norm": 0.20978040993213654, + "learning_rate": 2.0764497697982886e-05, + "loss": 1.9386, + "step": 315120 + }, + { + "epoch": 1.1994625579501077, + "grad_norm": 0.21455638110637665, + "learning_rate": 2.0721027653691715e-05, + "loss": 1.9365, + "step": 315130 + }, + { + "epoch": 1.1995006204182304, + "grad_norm": 0.20593143999576569, + "learning_rate": 2.06775615517254e-05, + "loss": 1.9368, + "step": 315140 + }, + { + "epoch": 1.199538682886353, + "grad_norm": 0.18800471723079681, + "learning_rate": 2.0634099391011462e-05, + "loss": 1.9266, + "step": 315150 + }, + { + "epoch": 1.1995767453544757, + "grad_norm": 0.165223628282547, + "learning_rate": 2.0590641170478043e-05, + "loss": 1.9438, + "step": 315160 + }, + { + "epoch": 1.1996148078225986, + "grad_norm": 0.16987620294094086, + "learning_rate": 2.0547186889053713e-05, + "loss": 1.9355, + "step": 315170 + }, + { + "epoch": 1.1996528702907212, + "grad_norm": 0.17159295082092285, + "learning_rate": 2.0503736545667505e-05, + "loss": 1.9462, + "step": 315180 + }, + { + "epoch": 1.1996909327588439, + "grad_norm": 0.16801120340824127, + "learning_rate": 2.0460290139248937e-05, + "loss": 1.9341, + "step": 315190 + }, + { + "epoch": 1.1997289952269665, + "grad_norm": 0.16506770253181458, + "learning_rate": 2.0416847668728032e-05, + "loss": 1.9476, + "step": 315200 + }, + { + "epoch": 1.1997670576950892, + "grad_norm": 0.194406196475029, + "learning_rate": 2.037340913303537e-05, + "loss": 1.932, + "step": 315210 + }, + { + "epoch": 1.1998051201632118, + "grad_norm": 0.272256463766098, + "learning_rate": 2.0329974531101915e-05, + "loss": 1.944, + "step": 315220 + }, + { + "epoch": 1.1998431826313345, + "grad_norm": 0.2028186172246933, + "learning_rate": 2.028654386185913e-05, + "loss": 1.9352, + "step": 315230 + }, + { + "epoch": 1.1998812450994571, + "grad_norm": 0.21224938333034515, + "learning_rate": 2.0243117124238987e-05, + "loss": 1.9453, + "step": 315240 + }, + { + "epoch": 1.1999193075675798, + "grad_norm": 0.1703551709651947, + "learning_rate": 2.0199694317173943e-05, + "loss": 1.947, + "step": 315250 + }, + { + "epoch": 1.1999573700357027, + "grad_norm": 0.16464319825172424, + "learning_rate": 2.015627543959686e-05, + "loss": 1.9359, + "step": 315260 + }, + { + "epoch": 1.1999954325038253, + "grad_norm": 0.16439057886600494, + "learning_rate": 2.0112860490441308e-05, + "loss": 1.9366, + "step": 315270 + }, + { + "epoch": 1.200033494971948, + "grad_norm": 0.1883467137813568, + "learning_rate": 2.0069449468640977e-05, + "loss": 1.939, + "step": 315280 + }, + { + "epoch": 1.2000715574400707, + "grad_norm": 0.1818651407957077, + "learning_rate": 2.0026042373130437e-05, + "loss": 1.9201, + "step": 315290 + }, + { + "epoch": 1.2001096199081933, + "grad_norm": 0.18975751101970673, + "learning_rate": 1.9982639202844378e-05, + "loss": 1.9409, + "step": 315300 + }, + { + "epoch": 1.200147682376316, + "grad_norm": 0.2573046088218689, + "learning_rate": 1.993923995671826e-05, + "loss": 1.9434, + "step": 315310 + }, + { + "epoch": 1.2001857448444386, + "grad_norm": 0.1690666824579239, + "learning_rate": 1.9895844633687822e-05, + "loss": 1.9301, + "step": 315320 + }, + { + "epoch": 1.2002238073125613, + "grad_norm": 0.18057064712047577, + "learning_rate": 1.9852453232689473e-05, + "loss": 1.9557, + "step": 315330 + }, + { + "epoch": 1.2002618697806842, + "grad_norm": 0.16391849517822266, + "learning_rate": 1.9809065752659895e-05, + "loss": 1.9417, + "step": 315340 + }, + { + "epoch": 1.2002999322488068, + "grad_norm": 0.18901868164539337, + "learning_rate": 1.976568219253633e-05, + "loss": 1.9536, + "step": 315350 + }, + { + "epoch": 1.2003379947169295, + "grad_norm": 0.19878660142421722, + "learning_rate": 1.9722302551256623e-05, + "loss": 1.9449, + "step": 315360 + }, + { + "epoch": 1.2003760571850521, + "grad_norm": 0.17215828597545624, + "learning_rate": 1.967892682775896e-05, + "loss": 1.9358, + "step": 315370 + }, + { + "epoch": 1.2004141196531748, + "grad_norm": 0.19715379178524017, + "learning_rate": 1.963555502098202e-05, + "loss": 1.9249, + "step": 315380 + }, + { + "epoch": 1.2004521821212975, + "grad_norm": 0.2076154500246048, + "learning_rate": 1.959218712986499e-05, + "loss": 1.9369, + "step": 315390 + }, + { + "epoch": 1.20049024458942, + "grad_norm": 0.17226295173168182, + "learning_rate": 1.9548823153347493e-05, + "loss": 1.9362, + "step": 315400 + }, + { + "epoch": 1.2005283070575428, + "grad_norm": 0.20354777574539185, + "learning_rate": 1.950546309036977e-05, + "loss": 1.9439, + "step": 315410 + }, + { + "epoch": 1.2005663695256654, + "grad_norm": 0.20497560501098633, + "learning_rate": 1.946210693987238e-05, + "loss": 1.9236, + "step": 315420 + }, + { + "epoch": 1.2006044319937883, + "grad_norm": 0.16953223943710327, + "learning_rate": 1.9418754700796404e-05, + "loss": 1.9513, + "step": 315430 + }, + { + "epoch": 1.200642494461911, + "grad_norm": 0.16467490792274475, + "learning_rate": 1.937540637208335e-05, + "loss": 1.9453, + "step": 315440 + }, + { + "epoch": 1.2006805569300336, + "grad_norm": 0.23944929242134094, + "learning_rate": 1.9332061952675395e-05, + "loss": 1.9557, + "step": 315450 + }, + { + "epoch": 1.2007186193981563, + "grad_norm": 0.26693135499954224, + "learning_rate": 1.9288721441515e-05, + "loss": 1.9552, + "step": 315460 + }, + { + "epoch": 1.200756681866279, + "grad_norm": 0.1673591136932373, + "learning_rate": 1.9245384837545233e-05, + "loss": 1.9439, + "step": 315470 + }, + { + "epoch": 1.2007947443344016, + "grad_norm": 0.17863279581069946, + "learning_rate": 1.9202052139709437e-05, + "loss": 1.9347, + "step": 315480 + }, + { + "epoch": 1.2008328068025242, + "grad_norm": 0.20780551433563232, + "learning_rate": 1.9158723346951735e-05, + "loss": 1.9419, + "step": 315490 + }, + { + "epoch": 1.200870869270647, + "grad_norm": 0.19351142644882202, + "learning_rate": 1.911539845821636e-05, + "loss": 1.9557, + "step": 315500 + }, + { + "epoch": 1.2009089317387698, + "grad_norm": 0.2197602540254593, + "learning_rate": 1.9072077472448378e-05, + "loss": 1.9619, + "step": 315510 + }, + { + "epoch": 1.2009469942068924, + "grad_norm": 0.17918086051940918, + "learning_rate": 1.9028760388593136e-05, + "loss": 1.9573, + "step": 315520 + }, + { + "epoch": 1.200985056675015, + "grad_norm": 0.2039628028869629, + "learning_rate": 1.8985447205596473e-05, + "loss": 1.9322, + "step": 315530 + }, + { + "epoch": 1.2010231191431378, + "grad_norm": 0.24598252773284912, + "learning_rate": 1.8942137922404735e-05, + "loss": 1.9402, + "step": 315540 + }, + { + "epoch": 1.2010611816112604, + "grad_norm": 0.3049358129501343, + "learning_rate": 1.8898832537964706e-05, + "loss": 1.918, + "step": 315550 + }, + { + "epoch": 1.201099244079383, + "grad_norm": 0.1963425576686859, + "learning_rate": 1.8855531051223675e-05, + "loss": 1.9256, + "step": 315560 + }, + { + "epoch": 1.2011373065475057, + "grad_norm": 0.164651021361351, + "learning_rate": 1.8812233461129368e-05, + "loss": 1.9371, + "step": 315570 + }, + { + "epoch": 1.2011753690156284, + "grad_norm": 0.1651751697063446, + "learning_rate": 1.876893976663008e-05, + "loss": 1.9283, + "step": 315580 + }, + { + "epoch": 1.201213431483751, + "grad_norm": 0.16817796230316162, + "learning_rate": 1.872564996667442e-05, + "loss": 1.944, + "step": 315590 + }, + { + "epoch": 1.2012514939518737, + "grad_norm": 0.16912603378295898, + "learning_rate": 1.868236406021162e-05, + "loss": 1.9374, + "step": 315600 + }, + { + "epoch": 1.2012895564199966, + "grad_norm": 0.16702499985694885, + "learning_rate": 1.863908204619136e-05, + "loss": 1.9432, + "step": 315610 + }, + { + "epoch": 1.2013276188881192, + "grad_norm": 0.167423278093338, + "learning_rate": 1.8595803923563635e-05, + "loss": 1.941, + "step": 315620 + }, + { + "epoch": 1.2013656813562419, + "grad_norm": 0.17401781678199768, + "learning_rate": 1.855252969127913e-05, + "loss": 1.9322, + "step": 315630 + }, + { + "epoch": 1.2014037438243645, + "grad_norm": 0.16266636550426483, + "learning_rate": 1.85092593482889e-05, + "loss": 1.9245, + "step": 315640 + }, + { + "epoch": 1.2014418062924872, + "grad_norm": 0.17383623123168945, + "learning_rate": 1.846599289354445e-05, + "loss": 1.9491, + "step": 315650 + }, + { + "epoch": 1.2014798687606099, + "grad_norm": 0.24540668725967407, + "learning_rate": 1.842273032599778e-05, + "loss": 1.9434, + "step": 315660 + }, + { + "epoch": 1.2015179312287325, + "grad_norm": 0.24183866381645203, + "learning_rate": 1.837947164460141e-05, + "loss": 1.9423, + "step": 315670 + }, + { + "epoch": 1.2015559936968554, + "grad_norm": 0.16616332530975342, + "learning_rate": 1.833621684830816e-05, + "loss": 1.9295, + "step": 315680 + }, + { + "epoch": 1.201594056164978, + "grad_norm": 0.15895366668701172, + "learning_rate": 1.8292965936071604e-05, + "loss": 1.9401, + "step": 315690 + }, + { + "epoch": 1.2016321186331007, + "grad_norm": 0.18763354420661926, + "learning_rate": 1.8249718906845513e-05, + "loss": 1.9384, + "step": 315700 + }, + { + "epoch": 1.2016701811012234, + "grad_norm": 0.18516993522644043, + "learning_rate": 1.820647575958434e-05, + "loss": 1.9294, + "step": 315710 + }, + { + "epoch": 1.201708243569346, + "grad_norm": 0.17786172032356262, + "learning_rate": 1.8163236493242807e-05, + "loss": 1.9444, + "step": 315720 + }, + { + "epoch": 1.2017463060374687, + "grad_norm": 0.17752547562122345, + "learning_rate": 1.812000110677625e-05, + "loss": 1.9276, + "step": 315730 + }, + { + "epoch": 1.2017843685055913, + "grad_norm": 0.17008447647094727, + "learning_rate": 1.8076769599140398e-05, + "loss": 1.9465, + "step": 315740 + }, + { + "epoch": 1.201822430973714, + "grad_norm": 0.1759907305240631, + "learning_rate": 1.8033541969291523e-05, + "loss": 1.9282, + "step": 315750 + }, + { + "epoch": 1.2018604934418367, + "grad_norm": 0.22066645324230194, + "learning_rate": 1.7990318216186297e-05, + "loss": 1.9298, + "step": 315760 + }, + { + "epoch": 1.2018985559099593, + "grad_norm": 0.1596367508172989, + "learning_rate": 1.7947098338781887e-05, + "loss": 1.9404, + "step": 315770 + }, + { + "epoch": 1.2019366183780822, + "grad_norm": 0.18029053509235382, + "learning_rate": 1.790388233603596e-05, + "loss": 1.9486, + "step": 315780 + }, + { + "epoch": 1.2019746808462048, + "grad_norm": 0.22112157940864563, + "learning_rate": 1.7860670206906572e-05, + "loss": 1.9393, + "step": 315790 + }, + { + "epoch": 1.2020127433143275, + "grad_norm": 0.17528939247131348, + "learning_rate": 1.781746195035222e-05, + "loss": 1.9224, + "step": 315800 + }, + { + "epoch": 1.2020508057824502, + "grad_norm": 0.1923305094242096, + "learning_rate": 1.7774257565332075e-05, + "loss": 1.929, + "step": 315810 + }, + { + "epoch": 1.2020888682505728, + "grad_norm": 0.17303921282291412, + "learning_rate": 1.7731057050805578e-05, + "loss": 1.9316, + "step": 315820 + }, + { + "epoch": 1.2021269307186955, + "grad_norm": 0.21711373329162598, + "learning_rate": 1.768786040573267e-05, + "loss": 1.9342, + "step": 315830 + }, + { + "epoch": 1.2021649931868181, + "grad_norm": 0.2056269496679306, + "learning_rate": 1.764466762907385e-05, + "loss": 1.9473, + "step": 315840 + }, + { + "epoch": 1.202203055654941, + "grad_norm": 0.23204374313354492, + "learning_rate": 1.76014787197899e-05, + "loss": 1.9336, + "step": 315850 + }, + { + "epoch": 1.2022411181230637, + "grad_norm": 0.17051061987876892, + "learning_rate": 1.755829367684225e-05, + "loss": 1.9504, + "step": 315860 + }, + { + "epoch": 1.2022791805911863, + "grad_norm": 0.19815026223659515, + "learning_rate": 1.7515112499192688e-05, + "loss": 1.9396, + "step": 315870 + }, + { + "epoch": 1.202317243059309, + "grad_norm": 0.16430552303791046, + "learning_rate": 1.747193518580359e-05, + "loss": 1.95, + "step": 315880 + }, + { + "epoch": 1.2023553055274316, + "grad_norm": 0.2549295723438263, + "learning_rate": 1.7428761735637623e-05, + "loss": 1.9507, + "step": 315890 + }, + { + "epoch": 1.2023933679955543, + "grad_norm": 0.17700010538101196, + "learning_rate": 1.738559214765806e-05, + "loss": 1.9427, + "step": 315900 + }, + { + "epoch": 1.202431430463677, + "grad_norm": 0.17616471648216248, + "learning_rate": 1.7342426420828506e-05, + "loss": 1.9553, + "step": 315910 + }, + { + "epoch": 1.2024694929317996, + "grad_norm": 0.18350853025913239, + "learning_rate": 1.729926455411318e-05, + "loss": 1.928, + "step": 315920 + }, + { + "epoch": 1.2025075553999223, + "grad_norm": 0.1687181144952774, + "learning_rate": 1.725610654647669e-05, + "loss": 1.9386, + "step": 315930 + }, + { + "epoch": 1.202545617868045, + "grad_norm": 0.1649000197649002, + "learning_rate": 1.7212952396884085e-05, + "loss": 1.935, + "step": 315940 + }, + { + "epoch": 1.2025836803361678, + "grad_norm": 0.1732434183359146, + "learning_rate": 1.7169802104300857e-05, + "loss": 1.9515, + "step": 315950 + }, + { + "epoch": 1.2026217428042905, + "grad_norm": 0.20426325500011444, + "learning_rate": 1.7126655667693057e-05, + "loss": 1.9453, + "step": 315960 + }, + { + "epoch": 1.2026598052724131, + "grad_norm": 0.23240119218826294, + "learning_rate": 1.7083513086027125e-05, + "loss": 1.945, + "step": 315970 + }, + { + "epoch": 1.2026978677405358, + "grad_norm": 0.16769737005233765, + "learning_rate": 1.7040374358269996e-05, + "loss": 1.9309, + "step": 315980 + }, + { + "epoch": 1.2027359302086584, + "grad_norm": 0.16312843561172485, + "learning_rate": 1.6997239483388995e-05, + "loss": 1.9313, + "step": 315990 + }, + { + "epoch": 1.202773992676781, + "grad_norm": 0.21969033777713776, + "learning_rate": 1.695410846035206e-05, + "loss": 1.9374, + "step": 316000 + }, + { + "epoch": 1.2028120551449037, + "grad_norm": 0.18756617605686188, + "learning_rate": 1.6910981288127404e-05, + "loss": 1.9189, + "step": 316010 + }, + { + "epoch": 1.2028501176130264, + "grad_norm": 0.16086409986019135, + "learning_rate": 1.68678579656838e-05, + "loss": 1.9408, + "step": 316020 + }, + { + "epoch": 1.2028881800811493, + "grad_norm": 0.16711248457431793, + "learning_rate": 1.6824738491990565e-05, + "loss": 1.9363, + "step": 316030 + }, + { + "epoch": 1.202926242549272, + "grad_norm": 0.16341838240623474, + "learning_rate": 1.6781622866017197e-05, + "loss": 1.9355, + "step": 316040 + }, + { + "epoch": 1.2029643050173946, + "grad_norm": 0.19947198033332825, + "learning_rate": 1.673851108673402e-05, + "loss": 1.9331, + "step": 316050 + }, + { + "epoch": 1.2030023674855173, + "grad_norm": 0.17706602811813354, + "learning_rate": 1.6695403153111577e-05, + "loss": 1.9516, + "step": 316060 + }, + { + "epoch": 1.20304042995364, + "grad_norm": 0.1658812314271927, + "learning_rate": 1.6652299064120913e-05, + "loss": 1.9349, + "step": 316070 + }, + { + "epoch": 1.2030784924217626, + "grad_norm": 0.1825098842382431, + "learning_rate": 1.6609198818733574e-05, + "loss": 1.95, + "step": 316080 + }, + { + "epoch": 1.2031165548898852, + "grad_norm": 0.17820632457733154, + "learning_rate": 1.656610241592149e-05, + "loss": 1.9301, + "step": 316090 + }, + { + "epoch": 1.2031546173580079, + "grad_norm": 0.16276079416275024, + "learning_rate": 1.65230098546571e-05, + "loss": 1.9461, + "step": 316100 + }, + { + "epoch": 1.2031926798261305, + "grad_norm": 0.19121406972408295, + "learning_rate": 1.6479921133913333e-05, + "loss": 1.9316, + "step": 316110 + }, + { + "epoch": 1.2032307422942534, + "grad_norm": 0.1845954954624176, + "learning_rate": 1.643683625266351e-05, + "loss": 1.943, + "step": 316120 + }, + { + "epoch": 1.203268804762376, + "grad_norm": 0.20090362429618835, + "learning_rate": 1.6393755209881512e-05, + "loss": 1.9367, + "step": 316130 + }, + { + "epoch": 1.2033068672304987, + "grad_norm": 0.1640109121799469, + "learning_rate": 1.6350678004541542e-05, + "loss": 1.9381, + "step": 316140 + }, + { + "epoch": 1.2033449296986214, + "grad_norm": 0.17727604508399963, + "learning_rate": 1.630760463561831e-05, + "loss": 1.9326, + "step": 316150 + }, + { + "epoch": 1.203382992166744, + "grad_norm": 0.16008159518241882, + "learning_rate": 1.626453510208703e-05, + "loss": 1.9412, + "step": 316160 + }, + { + "epoch": 1.2034210546348667, + "grad_norm": 0.16219596564769745, + "learning_rate": 1.6221469402923296e-05, + "loss": 1.9379, + "step": 316170 + }, + { + "epoch": 1.2034591171029894, + "grad_norm": 0.18563158810138702, + "learning_rate": 1.6178407537103256e-05, + "loss": 1.942, + "step": 316180 + }, + { + "epoch": 1.203497179571112, + "grad_norm": 0.2555886507034302, + "learning_rate": 1.6135349503603458e-05, + "loss": 1.934, + "step": 316190 + }, + { + "epoch": 1.203535242039235, + "grad_norm": 0.15692013502120972, + "learning_rate": 1.609229530140083e-05, + "loss": 1.9368, + "step": 316200 + }, + { + "epoch": 1.2035733045073576, + "grad_norm": 0.19800697267055511, + "learning_rate": 1.6049244929472905e-05, + "loss": 1.9496, + "step": 316210 + }, + { + "epoch": 1.2036113669754802, + "grad_norm": 0.18161258101463318, + "learning_rate": 1.600619838679751e-05, + "loss": 1.9358, + "step": 316220 + }, + { + "epoch": 1.2036494294436029, + "grad_norm": 0.20629428327083588, + "learning_rate": 1.5963155672353124e-05, + "loss": 1.9272, + "step": 316230 + }, + { + "epoch": 1.2036874919117255, + "grad_norm": 0.20691783726215363, + "learning_rate": 1.5920116785118455e-05, + "loss": 1.9311, + "step": 316240 + }, + { + "epoch": 1.2037255543798482, + "grad_norm": 0.1592143326997757, + "learning_rate": 1.5877081724072873e-05, + "loss": 1.9388, + "step": 316250 + }, + { + "epoch": 1.2037636168479708, + "grad_norm": 0.16132837533950806, + "learning_rate": 1.5834050488196084e-05, + "loss": 1.9406, + "step": 316260 + }, + { + "epoch": 1.2038016793160935, + "grad_norm": 0.1728985458612442, + "learning_rate": 1.5791023076468235e-05, + "loss": 1.9434, + "step": 316270 + }, + { + "epoch": 1.2038397417842162, + "grad_norm": 0.17591522634029388, + "learning_rate": 1.574799948786998e-05, + "loss": 1.9307, + "step": 316280 + }, + { + "epoch": 1.203877804252339, + "grad_norm": 0.17040970921516418, + "learning_rate": 1.570497972138235e-05, + "loss": 1.9486, + "step": 316290 + }, + { + "epoch": 1.2039158667204617, + "grad_norm": 0.1899997591972351, + "learning_rate": 1.5661963775986997e-05, + "loss": 1.9467, + "step": 316300 + }, + { + "epoch": 1.2039539291885843, + "grad_norm": 0.18535642325878143, + "learning_rate": 1.5618951650665848e-05, + "loss": 1.9334, + "step": 316310 + }, + { + "epoch": 1.203991991656707, + "grad_norm": 0.16790741682052612, + "learning_rate": 1.557594334440132e-05, + "loss": 1.923, + "step": 316320 + }, + { + "epoch": 1.2040300541248297, + "grad_norm": 0.1611746996641159, + "learning_rate": 1.5532938856176403e-05, + "loss": 1.9311, + "step": 316330 + }, + { + "epoch": 1.2040681165929523, + "grad_norm": 0.16700397431850433, + "learning_rate": 1.548993818497435e-05, + "loss": 1.9496, + "step": 316340 + }, + { + "epoch": 1.204106179061075, + "grad_norm": 0.1745607554912567, + "learning_rate": 1.5446941329778974e-05, + "loss": 1.9348, + "step": 316350 + }, + { + "epoch": 1.2041442415291976, + "grad_norm": 0.15827342867851257, + "learning_rate": 1.540394828957453e-05, + "loss": 1.9396, + "step": 316360 + }, + { + "epoch": 1.2041823039973205, + "grad_norm": 0.19324856996536255, + "learning_rate": 1.536095906334578e-05, + "loss": 1.918, + "step": 316370 + }, + { + "epoch": 1.2042203664654432, + "grad_norm": 0.16694240272045135, + "learning_rate": 1.531797365007781e-05, + "loss": 1.9436, + "step": 316380 + }, + { + "epoch": 1.2042584289335658, + "grad_norm": 0.17361164093017578, + "learning_rate": 1.5274992048756263e-05, + "loss": 1.9374, + "step": 316390 + }, + { + "epoch": 1.2042964914016885, + "grad_norm": 0.17740680277347565, + "learning_rate": 1.5232014258367122e-05, + "loss": 1.9351, + "step": 316400 + }, + { + "epoch": 1.2043345538698111, + "grad_norm": 0.16227102279663086, + "learning_rate": 1.5189040277896915e-05, + "loss": 1.9475, + "step": 316410 + }, + { + "epoch": 1.2043726163379338, + "grad_norm": 0.18371431529521942, + "learning_rate": 1.5146070106332622e-05, + "loss": 1.9432, + "step": 316420 + }, + { + "epoch": 1.2044106788060565, + "grad_norm": 0.21148449182510376, + "learning_rate": 1.5103103742661606e-05, + "loss": 1.9433, + "step": 316430 + }, + { + "epoch": 1.2044487412741791, + "grad_norm": 0.1894027441740036, + "learning_rate": 1.5060141185871735e-05, + "loss": 1.9346, + "step": 316440 + }, + { + "epoch": 1.2044868037423018, + "grad_norm": 0.17949220538139343, + "learning_rate": 1.5017182434951259e-05, + "loss": 1.9331, + "step": 316450 + }, + { + "epoch": 1.2045248662104244, + "grad_norm": 0.18078577518463135, + "learning_rate": 1.4974227488888991e-05, + "loss": 1.9371, + "step": 316460 + }, + { + "epoch": 1.2045629286785473, + "grad_norm": 0.18450722098350525, + "learning_rate": 1.4931276346674071e-05, + "loss": 1.9365, + "step": 316470 + }, + { + "epoch": 1.20460099114667, + "grad_norm": 0.17150239646434784, + "learning_rate": 1.4888329007296142e-05, + "loss": 1.9435, + "step": 316480 + }, + { + "epoch": 1.2046390536147926, + "grad_norm": 0.18355433642864227, + "learning_rate": 1.4845385469745342e-05, + "loss": 1.9367, + "step": 316490 + }, + { + "epoch": 1.2046771160829153, + "grad_norm": 0.16779930889606476, + "learning_rate": 1.4802445733012093e-05, + "loss": 1.9297, + "step": 316500 + }, + { + "epoch": 1.204715178551038, + "grad_norm": 0.1708436906337738, + "learning_rate": 1.4759509796087478e-05, + "loss": 1.937, + "step": 316510 + }, + { + "epoch": 1.2047532410191606, + "grad_norm": 0.18960228562355042, + "learning_rate": 1.471657765796286e-05, + "loss": 1.9386, + "step": 316520 + }, + { + "epoch": 1.2047913034872832, + "grad_norm": 0.17748971283435822, + "learning_rate": 1.46736493176301e-05, + "loss": 1.9326, + "step": 316530 + }, + { + "epoch": 1.2048293659554061, + "grad_norm": 0.16371700167655945, + "learning_rate": 1.4630724774081561e-05, + "loss": 1.9442, + "step": 316540 + }, + { + "epoch": 1.2048674284235288, + "grad_norm": 0.17727847397327423, + "learning_rate": 1.4587804026310048e-05, + "loss": 1.9279, + "step": 316550 + }, + { + "epoch": 1.2049054908916514, + "grad_norm": 0.17282675206661224, + "learning_rate": 1.4544887073308643e-05, + "loss": 1.9324, + "step": 316560 + }, + { + "epoch": 1.204943553359774, + "grad_norm": 0.18812990188598633, + "learning_rate": 1.4501973914071098e-05, + "loss": 1.9275, + "step": 316570 + }, + { + "epoch": 1.2049816158278968, + "grad_norm": 0.16910743713378906, + "learning_rate": 1.4459064547591439e-05, + "loss": 1.9572, + "step": 316580 + }, + { + "epoch": 1.2050196782960194, + "grad_norm": 0.18709997832775116, + "learning_rate": 1.441615897286419e-05, + "loss": 1.9381, + "step": 316590 + }, + { + "epoch": 1.205057740764142, + "grad_norm": 0.1978847086429596, + "learning_rate": 1.4373257188884492e-05, + "loss": 1.954, + "step": 316600 + }, + { + "epoch": 1.2050958032322647, + "grad_norm": 0.1758776754140854, + "learning_rate": 1.4330359194647647e-05, + "loss": 1.9405, + "step": 316610 + }, + { + "epoch": 1.2051338657003874, + "grad_norm": 0.1603417992591858, + "learning_rate": 1.4287464989149512e-05, + "loss": 1.952, + "step": 316620 + }, + { + "epoch": 1.20517192816851, + "grad_norm": 0.1598341017961502, + "learning_rate": 1.4244574571386449e-05, + "loss": 1.948, + "step": 316630 + }, + { + "epoch": 1.205209990636633, + "grad_norm": 0.16874103248119354, + "learning_rate": 1.4201687940355257e-05, + "loss": 1.934, + "step": 316640 + }, + { + "epoch": 1.2052480531047556, + "grad_norm": 0.21132296323776245, + "learning_rate": 1.4158805095053019e-05, + "loss": 1.937, + "step": 316650 + }, + { + "epoch": 1.2052861155728782, + "grad_norm": 0.2193910777568817, + "learning_rate": 1.4115926034477478e-05, + "loss": 1.9226, + "step": 316660 + }, + { + "epoch": 1.205324178041001, + "grad_norm": 0.16305066645145416, + "learning_rate": 1.4073050757626715e-05, + "loss": 1.9384, + "step": 316670 + }, + { + "epoch": 1.2053622405091236, + "grad_norm": 0.20663656294345856, + "learning_rate": 1.4030179263499254e-05, + "loss": 1.9442, + "step": 316680 + }, + { + "epoch": 1.2054003029772462, + "grad_norm": 0.18724772334098816, + "learning_rate": 1.3987311551094061e-05, + "loss": 1.9262, + "step": 316690 + }, + { + "epoch": 1.2054383654453689, + "grad_norm": 0.1713886857032776, + "learning_rate": 1.3944447619410495e-05, + "loss": 1.957, + "step": 316700 + }, + { + "epoch": 1.2054764279134917, + "grad_norm": 0.18026380240917206, + "learning_rate": 1.3901587467448463e-05, + "loss": 1.943, + "step": 316710 + }, + { + "epoch": 1.2055144903816144, + "grad_norm": 0.21686388552188873, + "learning_rate": 1.385873109420821e-05, + "loss": 1.9225, + "step": 316720 + }, + { + "epoch": 1.205552552849737, + "grad_norm": 0.17331081628799438, + "learning_rate": 1.3815878498690593e-05, + "loss": 1.9402, + "step": 316730 + }, + { + "epoch": 1.2055906153178597, + "grad_norm": 0.162716805934906, + "learning_rate": 1.377302967989663e-05, + "loss": 1.9288, + "step": 316740 + }, + { + "epoch": 1.2056286777859824, + "grad_norm": 0.1783205270767212, + "learning_rate": 1.373018463682807e-05, + "loss": 1.9269, + "step": 316750 + }, + { + "epoch": 1.205666740254105, + "grad_norm": 0.22573402523994446, + "learning_rate": 1.368734336848687e-05, + "loss": 1.9314, + "step": 316760 + }, + { + "epoch": 1.2057048027222277, + "grad_norm": 0.1547260284423828, + "learning_rate": 1.3644505873875557e-05, + "loss": 1.9366, + "step": 316770 + }, + { + "epoch": 1.2057428651903503, + "grad_norm": 0.2504291534423828, + "learning_rate": 1.3601672151997092e-05, + "loss": 1.9327, + "step": 316780 + }, + { + "epoch": 1.205780927658473, + "grad_norm": 0.17203214764595032, + "learning_rate": 1.3558842201854771e-05, + "loss": 1.924, + "step": 316790 + }, + { + "epoch": 1.2058189901265957, + "grad_norm": 0.1697988212108612, + "learning_rate": 1.3516016022452503e-05, + "loss": 1.9553, + "step": 316800 + }, + { + "epoch": 1.2058570525947185, + "grad_norm": 0.1626092493534088, + "learning_rate": 1.3473193612794531e-05, + "loss": 1.9418, + "step": 316810 + }, + { + "epoch": 1.2058951150628412, + "grad_norm": 0.15964530408382416, + "learning_rate": 1.3430374971885483e-05, + "loss": 1.9268, + "step": 316820 + }, + { + "epoch": 1.2059331775309639, + "grad_norm": 0.15963692963123322, + "learning_rate": 1.3387560098730434e-05, + "loss": 1.9271, + "step": 316830 + }, + { + "epoch": 1.2059712399990865, + "grad_norm": 0.17314162850379944, + "learning_rate": 1.3344748992335065e-05, + "loss": 1.9507, + "step": 316840 + }, + { + "epoch": 1.2060093024672092, + "grad_norm": 0.17278534173965454, + "learning_rate": 1.3301941651705396e-05, + "loss": 1.9598, + "step": 316850 + }, + { + "epoch": 1.2060473649353318, + "grad_norm": 0.15973657369613647, + "learning_rate": 1.3259138075847722e-05, + "loss": 1.9439, + "step": 316860 + }, + { + "epoch": 1.2060854274034545, + "grad_norm": 0.1658315509557724, + "learning_rate": 1.3216338263769056e-05, + "loss": 1.9355, + "step": 316870 + }, + { + "epoch": 1.2061234898715771, + "grad_norm": 0.1809733659029007, + "learning_rate": 1.3173542214476586e-05, + "loss": 1.9396, + "step": 316880 + }, + { + "epoch": 1.2061615523397, + "grad_norm": 0.16338767111301422, + "learning_rate": 1.3130749926978158e-05, + "loss": 1.946, + "step": 316890 + }, + { + "epoch": 1.2061996148078227, + "grad_norm": 0.17688487470149994, + "learning_rate": 1.3087961400281956e-05, + "loss": 1.9438, + "step": 316900 + }, + { + "epoch": 1.2062376772759453, + "grad_norm": 0.19036678969860077, + "learning_rate": 1.3045176633396494e-05, + "loss": 1.9236, + "step": 316910 + }, + { + "epoch": 1.206275739744068, + "grad_norm": 0.17270910739898682, + "learning_rate": 1.3002395625330954e-05, + "loss": 1.9296, + "step": 316920 + }, + { + "epoch": 1.2063138022121906, + "grad_norm": 0.16987460851669312, + "learning_rate": 1.2959618375094795e-05, + "loss": 1.943, + "step": 316930 + }, + { + "epoch": 1.2063518646803133, + "grad_norm": 0.2615673840045929, + "learning_rate": 1.2916844881697864e-05, + "loss": 1.9325, + "step": 316940 + }, + { + "epoch": 1.206389927148436, + "grad_norm": 0.2283545732498169, + "learning_rate": 1.2874075144150565e-05, + "loss": 1.9469, + "step": 316950 + }, + { + "epoch": 1.2064279896165586, + "grad_norm": 0.18416893482208252, + "learning_rate": 1.2831309161463745e-05, + "loss": 1.9421, + "step": 316960 + }, + { + "epoch": 1.2064660520846813, + "grad_norm": 0.20439530909061432, + "learning_rate": 1.2788546932648526e-05, + "loss": 1.9314, + "step": 316970 + }, + { + "epoch": 1.2065041145528042, + "grad_norm": 0.1733761429786682, + "learning_rate": 1.27457884567167e-05, + "loss": 1.9307, + "step": 316980 + }, + { + "epoch": 1.2065421770209268, + "grad_norm": 0.16197511553764343, + "learning_rate": 1.2703033732680281e-05, + "loss": 1.9357, + "step": 316990 + }, + { + "epoch": 1.2065802394890495, + "grad_norm": 0.1635819524526596, + "learning_rate": 1.2660282759551834e-05, + "loss": 1.9314, + "step": 317000 + }, + { + "epoch": 1.2066183019571721, + "grad_norm": 0.15866439044475555, + "learning_rate": 1.2617535536344205e-05, + "loss": 1.9365, + "step": 317010 + }, + { + "epoch": 1.2066563644252948, + "grad_norm": 0.16617876291275024, + "learning_rate": 1.257479206207096e-05, + "loss": 1.9362, + "step": 317020 + }, + { + "epoch": 1.2066944268934174, + "grad_norm": 0.168635293841362, + "learning_rate": 1.2532052335745892e-05, + "loss": 1.9368, + "step": 317030 + }, + { + "epoch": 1.20673248936154, + "grad_norm": 0.1748170703649521, + "learning_rate": 1.2489316356383173e-05, + "loss": 1.9439, + "step": 317040 + }, + { + "epoch": 1.2067705518296628, + "grad_norm": 0.1713894158601761, + "learning_rate": 1.2446584122997539e-05, + "loss": 1.9278, + "step": 317050 + }, + { + "epoch": 1.2068086142977856, + "grad_norm": 0.17230573296546936, + "learning_rate": 1.2403855634604111e-05, + "loss": 1.9289, + "step": 317060 + }, + { + "epoch": 1.2068466767659083, + "grad_norm": 0.16374553740024567, + "learning_rate": 1.2361130890218452e-05, + "loss": 1.9377, + "step": 317070 + }, + { + "epoch": 1.206884739234031, + "grad_norm": 0.1656305491924286, + "learning_rate": 1.2318409888856519e-05, + "loss": 1.941, + "step": 317080 + }, + { + "epoch": 1.2069228017021536, + "grad_norm": 0.16557244956493378, + "learning_rate": 1.2275692629534819e-05, + "loss": 1.933, + "step": 317090 + }, + { + "epoch": 1.2069608641702763, + "grad_norm": 0.2730533480644226, + "learning_rate": 1.2232979111270082e-05, + "loss": 1.947, + "step": 317100 + }, + { + "epoch": 1.206998926638399, + "grad_norm": 0.20291326940059662, + "learning_rate": 1.2190269333079707e-05, + "loss": 1.9453, + "step": 317110 + }, + { + "epoch": 1.2070369891065216, + "grad_norm": 0.16012758016586304, + "learning_rate": 1.2147563293981311e-05, + "loss": 1.9241, + "step": 317120 + }, + { + "epoch": 1.2070750515746442, + "grad_norm": 0.22131392359733582, + "learning_rate": 1.2104860992993016e-05, + "loss": 1.9247, + "step": 317130 + }, + { + "epoch": 1.2071131140427669, + "grad_norm": 0.1807316094636917, + "learning_rate": 1.206216242913344e-05, + "loss": 1.929, + "step": 317140 + }, + { + "epoch": 1.2071511765108898, + "grad_norm": 0.16896076500415802, + "learning_rate": 1.2019467601421642e-05, + "loss": 1.9314, + "step": 317150 + }, + { + "epoch": 1.2071892389790124, + "grad_norm": 0.18690729141235352, + "learning_rate": 1.1976776508876908e-05, + "loss": 1.9534, + "step": 317160 + }, + { + "epoch": 1.207227301447135, + "grad_norm": 0.16740208864212036, + "learning_rate": 1.193408915051919e-05, + "loss": 1.9324, + "step": 317170 + }, + { + "epoch": 1.2072653639152577, + "grad_norm": 0.17816956341266632, + "learning_rate": 1.189140552536877e-05, + "loss": 1.9526, + "step": 317180 + }, + { + "epoch": 1.2073034263833804, + "grad_norm": 0.16290433704853058, + "learning_rate": 1.1848725632446322e-05, + "loss": 1.9398, + "step": 317190 + }, + { + "epoch": 1.207341488851503, + "grad_norm": 0.17890475690364838, + "learning_rate": 1.1806049470773017e-05, + "loss": 1.9397, + "step": 317200 + }, + { + "epoch": 1.2073795513196257, + "grad_norm": 0.20076799392700195, + "learning_rate": 1.176337703937036e-05, + "loss": 1.9346, + "step": 317210 + }, + { + "epoch": 1.2074176137877484, + "grad_norm": 0.1811753809452057, + "learning_rate": 1.1720708337260466e-05, + "loss": 1.9452, + "step": 317220 + }, + { + "epoch": 1.2074556762558712, + "grad_norm": 0.1892351508140564, + "learning_rate": 1.1678043363465673e-05, + "loss": 1.9416, + "step": 317230 + }, + { + "epoch": 1.207493738723994, + "grad_norm": 0.17329028248786926, + "learning_rate": 1.1635382117008819e-05, + "loss": 1.9457, + "step": 317240 + }, + { + "epoch": 1.2075318011921166, + "grad_norm": 0.17507338523864746, + "learning_rate": 1.1592724596913185e-05, + "loss": 1.9367, + "step": 317250 + }, + { + "epoch": 1.2075698636602392, + "grad_norm": 0.15674608945846558, + "learning_rate": 1.155007080220255e-05, + "loss": 1.9333, + "step": 317260 + }, + { + "epoch": 1.2076079261283619, + "grad_norm": 0.18256492912769318, + "learning_rate": 1.1507420731900975e-05, + "loss": 1.9415, + "step": 317270 + }, + { + "epoch": 1.2076459885964845, + "grad_norm": 0.1951712816953659, + "learning_rate": 1.1464774385033017e-05, + "loss": 1.9341, + "step": 317280 + }, + { + "epoch": 1.2076840510646072, + "grad_norm": 0.18389761447906494, + "learning_rate": 1.1422131760623733e-05, + "loss": 1.9462, + "step": 317290 + }, + { + "epoch": 1.2077221135327298, + "grad_norm": 0.18417753279209137, + "learning_rate": 1.1379492857698403e-05, + "loss": 1.9332, + "step": 317300 + }, + { + "epoch": 1.2077601760008525, + "grad_norm": 0.1719813197851181, + "learning_rate": 1.133685767528292e-05, + "loss": 1.9373, + "step": 317310 + }, + { + "epoch": 1.2077982384689752, + "grad_norm": 0.16601203382015228, + "learning_rate": 1.1294226212403614e-05, + "loss": 1.9414, + "step": 317320 + }, + { + "epoch": 1.207836300937098, + "grad_norm": 0.2003100961446762, + "learning_rate": 1.1251598468087043e-05, + "loss": 1.933, + "step": 317330 + }, + { + "epoch": 1.2078743634052207, + "grad_norm": 0.21693551540374756, + "learning_rate": 1.1208974441360432e-05, + "loss": 1.9375, + "step": 317340 + }, + { + "epoch": 1.2079124258733434, + "grad_norm": 0.1749928891658783, + "learning_rate": 1.1166354131251167e-05, + "loss": 1.9329, + "step": 317350 + }, + { + "epoch": 1.207950488341466, + "grad_norm": 0.20150145888328552, + "learning_rate": 1.112373753678736e-05, + "loss": 1.9366, + "step": 317360 + }, + { + "epoch": 1.2079885508095887, + "grad_norm": 0.16071544587612152, + "learning_rate": 1.1081124656997232e-05, + "loss": 1.9243, + "step": 317370 + }, + { + "epoch": 1.2080266132777113, + "grad_norm": 0.18917936086654663, + "learning_rate": 1.1038515490909729e-05, + "loss": 1.9473, + "step": 317380 + }, + { + "epoch": 1.208064675745834, + "grad_norm": 0.16255995631217957, + "learning_rate": 1.0995910037553958e-05, + "loss": 1.9363, + "step": 317390 + }, + { + "epoch": 1.2081027382139569, + "grad_norm": 0.16722454130649567, + "learning_rate": 1.0953308295959696e-05, + "loss": 1.9399, + "step": 317400 + }, + { + "epoch": 1.2081408006820795, + "grad_norm": 0.16331323981285095, + "learning_rate": 1.0910710265156887e-05, + "loss": 1.9211, + "step": 317410 + }, + { + "epoch": 1.2081788631502022, + "grad_norm": 0.160036101937294, + "learning_rate": 1.0868115944176082e-05, + "loss": 1.9367, + "step": 317420 + }, + { + "epoch": 1.2082169256183248, + "grad_norm": 0.18118809163570404, + "learning_rate": 1.0825525332048115e-05, + "loss": 1.9514, + "step": 317430 + }, + { + "epoch": 1.2082549880864475, + "grad_norm": 0.21691325306892395, + "learning_rate": 1.0782938427804478e-05, + "loss": 1.9334, + "step": 317440 + }, + { + "epoch": 1.2082930505545701, + "grad_norm": 0.17789939045906067, + "learning_rate": 1.074035523047684e-05, + "loss": 1.9348, + "step": 317450 + }, + { + "epoch": 1.2083311130226928, + "grad_norm": 0.20680665969848633, + "learning_rate": 1.069777573909736e-05, + "loss": 1.9311, + "step": 317460 + }, + { + "epoch": 1.2083691754908155, + "grad_norm": 0.17139819264411926, + "learning_rate": 1.0655199952698647e-05, + "loss": 1.9313, + "step": 317470 + }, + { + "epoch": 1.2084072379589381, + "grad_norm": 0.17623241245746613, + "learning_rate": 1.061262787031375e-05, + "loss": 1.9437, + "step": 317480 + }, + { + "epoch": 1.2084453004270608, + "grad_norm": 0.18453365564346313, + "learning_rate": 1.0570059490976114e-05, + "loss": 1.9378, + "step": 317490 + }, + { + "epoch": 1.2084833628951837, + "grad_norm": 0.16772012412548065, + "learning_rate": 1.0527494813719562e-05, + "loss": 1.9325, + "step": 317500 + }, + { + "epoch": 1.2085214253633063, + "grad_norm": 0.19086311757564545, + "learning_rate": 1.048493383757837e-05, + "loss": 1.937, + "step": 317510 + }, + { + "epoch": 1.208559487831429, + "grad_norm": 0.16531312465667725, + "learning_rate": 1.0442376561587307e-05, + "loss": 1.9429, + "step": 317520 + }, + { + "epoch": 1.2085975502995516, + "grad_norm": 0.16466625034809113, + "learning_rate": 1.0399822984781482e-05, + "loss": 1.9408, + "step": 317530 + }, + { + "epoch": 1.2086356127676743, + "grad_norm": 0.16177676618099213, + "learning_rate": 1.035727310619633e-05, + "loss": 1.9397, + "step": 317540 + }, + { + "epoch": 1.208673675235797, + "grad_norm": 0.21583956480026245, + "learning_rate": 1.0314726924867956e-05, + "loss": 1.9376, + "step": 317550 + }, + { + "epoch": 1.2087117377039196, + "grad_norm": 0.16004690527915955, + "learning_rate": 1.0272184439832577e-05, + "loss": 1.9389, + "step": 317560 + }, + { + "epoch": 1.2087498001720425, + "grad_norm": 0.17332518100738525, + "learning_rate": 1.0229645650127183e-05, + "loss": 1.918, + "step": 317570 + }, + { + "epoch": 1.2087878626401651, + "grad_norm": 0.16747303307056427, + "learning_rate": 1.0187110554788825e-05, + "loss": 1.9181, + "step": 317580 + }, + { + "epoch": 1.2088259251082878, + "grad_norm": 0.16373571753501892, + "learning_rate": 1.0144579152855215e-05, + "loss": 1.934, + "step": 317590 + }, + { + "epoch": 1.2088639875764104, + "grad_norm": 0.16885121166706085, + "learning_rate": 1.0102051443364402e-05, + "loss": 1.9454, + "step": 317600 + }, + { + "epoch": 1.208902050044533, + "grad_norm": 0.1702057272195816, + "learning_rate": 1.005952742535482e-05, + "loss": 1.9385, + "step": 317610 + }, + { + "epoch": 1.2089401125126558, + "grad_norm": 0.17671136558055878, + "learning_rate": 1.001700709786535e-05, + "loss": 1.9336, + "step": 317620 + }, + { + "epoch": 1.2089781749807784, + "grad_norm": 0.1710839569568634, + "learning_rate": 9.974490459935314e-06, + "loss": 1.9421, + "step": 317630 + }, + { + "epoch": 1.209016237448901, + "grad_norm": 0.167668417096138, + "learning_rate": 9.931977510604484e-06, + "loss": 1.9408, + "step": 317640 + }, + { + "epoch": 1.2090542999170237, + "grad_norm": 0.17699985206127167, + "learning_rate": 9.889468248912903e-06, + "loss": 1.927, + "step": 317650 + }, + { + "epoch": 1.2090923623851464, + "grad_norm": 0.17739124596118927, + "learning_rate": 9.846962673901227e-06, + "loss": 1.9175, + "step": 317660 + }, + { + "epoch": 1.2091304248532693, + "grad_norm": 0.19273975491523743, + "learning_rate": 9.804460784610281e-06, + "loss": 1.9332, + "step": 317670 + }, + { + "epoch": 1.209168487321392, + "grad_norm": 0.16854619979858398, + "learning_rate": 9.76196258008155e-06, + "loss": 1.9269, + "step": 317680 + }, + { + "epoch": 1.2092065497895146, + "grad_norm": 0.1591169238090515, + "learning_rate": 9.719468059356862e-06, + "loss": 1.9375, + "step": 317690 + }, + { + "epoch": 1.2092446122576372, + "grad_norm": 0.170506551861763, + "learning_rate": 9.676977221478366e-06, + "loss": 1.9252, + "step": 317700 + }, + { + "epoch": 1.20928267472576, + "grad_norm": 0.19091635942459106, + "learning_rate": 9.634490065488666e-06, + "loss": 1.9325, + "step": 317710 + }, + { + "epoch": 1.2093207371938826, + "grad_norm": 0.16340164840221405, + "learning_rate": 9.592006590430857e-06, + "loss": 1.9413, + "step": 317720 + }, + { + "epoch": 1.2093587996620052, + "grad_norm": 0.16731935739517212, + "learning_rate": 9.549526795348428e-06, + "loss": 1.9398, + "step": 317730 + }, + { + "epoch": 1.2093968621301279, + "grad_norm": 0.18606775999069214, + "learning_rate": 9.507050679285146e-06, + "loss": 1.925, + "step": 317740 + }, + { + "epoch": 1.2094349245982507, + "grad_norm": 0.17653732001781464, + "learning_rate": 9.464578241285382e-06, + "loss": 1.9292, + "step": 317750 + }, + { + "epoch": 1.2094729870663734, + "grad_norm": 0.16764487326145172, + "learning_rate": 9.422109480393847e-06, + "loss": 1.9397, + "step": 317760 + }, + { + "epoch": 1.209511049534496, + "grad_norm": 0.1619822233915329, + "learning_rate": 9.37964439565564e-06, + "loss": 1.929, + "step": 317770 + }, + { + "epoch": 1.2095491120026187, + "grad_norm": 0.1595163494348526, + "learning_rate": 9.337182986116243e-06, + "loss": 1.9218, + "step": 317780 + }, + { + "epoch": 1.2095871744707414, + "grad_norm": 0.25330692529678345, + "learning_rate": 9.29472525082159e-06, + "loss": 1.9222, + "step": 317790 + }, + { + "epoch": 1.209625236938864, + "grad_norm": 0.1648254096508026, + "learning_rate": 9.252271188818106e-06, + "loss": 1.9195, + "step": 317800 + }, + { + "epoch": 1.2096632994069867, + "grad_norm": 0.19968004524707794, + "learning_rate": 9.209820799152502e-06, + "loss": 1.9254, + "step": 317810 + }, + { + "epoch": 1.2097013618751093, + "grad_norm": 0.17890578508377075, + "learning_rate": 9.167374080871982e-06, + "loss": 1.9243, + "step": 317820 + }, + { + "epoch": 1.209739424343232, + "grad_norm": 0.18902572989463806, + "learning_rate": 9.124931033024142e-06, + "loss": 1.9313, + "step": 317830 + }, + { + "epoch": 1.2097774868113549, + "grad_norm": 0.21188783645629883, + "learning_rate": 9.08249165465691e-06, + "loss": 1.9444, + "step": 317840 + }, + { + "epoch": 1.2098155492794775, + "grad_norm": 0.2646874189376831, + "learning_rate": 9.040055944818825e-06, + "loss": 1.9169, + "step": 317850 + }, + { + "epoch": 1.2098536117476002, + "grad_norm": 0.19424621760845184, + "learning_rate": 8.997623902558538e-06, + "loss": 1.9423, + "step": 317860 + }, + { + "epoch": 1.2098916742157229, + "grad_norm": 0.17482692003250122, + "learning_rate": 8.955195526925474e-06, + "loss": 1.9476, + "step": 317870 + }, + { + "epoch": 1.2099297366838455, + "grad_norm": 0.17090508341789246, + "learning_rate": 8.912770816969173e-06, + "loss": 1.9295, + "step": 317880 + }, + { + "epoch": 1.2099677991519682, + "grad_norm": 0.1726713925600052, + "learning_rate": 8.870349771739671e-06, + "loss": 1.9363, + "step": 317890 + }, + { + "epoch": 1.2100058616200908, + "grad_norm": 0.23577260971069336, + "learning_rate": 8.827932390287508e-06, + "loss": 1.938, + "step": 317900 + }, + { + "epoch": 1.2100439240882135, + "grad_norm": 0.1568213552236557, + "learning_rate": 8.785518671663495e-06, + "loss": 1.9339, + "step": 317910 + }, + { + "epoch": 1.2100819865563364, + "grad_norm": 0.17128297686576843, + "learning_rate": 8.743108614918948e-06, + "loss": 1.9275, + "step": 317920 + }, + { + "epoch": 1.210120049024459, + "grad_norm": 0.1653999388217926, + "learning_rate": 8.700702219105516e-06, + "loss": 1.9453, + "step": 317930 + }, + { + "epoch": 1.2101581114925817, + "grad_norm": 0.23350019752979279, + "learning_rate": 8.6582994832754e-06, + "loss": 1.9327, + "step": 317940 + }, + { + "epoch": 1.2101961739607043, + "grad_norm": 0.19468331336975098, + "learning_rate": 8.61590040648108e-06, + "loss": 1.9386, + "step": 317950 + }, + { + "epoch": 1.210234236428827, + "grad_norm": 0.16924616694450378, + "learning_rate": 8.573504987775426e-06, + "loss": 1.9449, + "step": 317960 + }, + { + "epoch": 1.2102722988969496, + "grad_norm": 0.18172939121723175, + "learning_rate": 8.531113226211806e-06, + "loss": 1.9302, + "step": 317970 + }, + { + "epoch": 1.2103103613650723, + "grad_norm": 0.18186110258102417, + "learning_rate": 8.488725120843921e-06, + "loss": 1.9338, + "step": 317980 + }, + { + "epoch": 1.210348423833195, + "grad_norm": 0.21057625114917755, + "learning_rate": 8.446340670725972e-06, + "loss": 1.9433, + "step": 317990 + }, + { + "epoch": 1.2103864863013176, + "grad_norm": 0.19673345983028412, + "learning_rate": 8.403959874912492e-06, + "loss": 1.9254, + "step": 318000 + }, + { + "epoch": 1.2104245487694405, + "grad_norm": 0.16198976337909698, + "learning_rate": 8.361582732458406e-06, + "loss": 1.9523, + "step": 318010 + }, + { + "epoch": 1.2104626112375632, + "grad_norm": 0.17466185986995697, + "learning_rate": 8.319209242419135e-06, + "loss": 1.9346, + "step": 318020 + }, + { + "epoch": 1.2105006737056858, + "grad_norm": 0.16120444238185883, + "learning_rate": 8.27683940385049e-06, + "loss": 1.9469, + "step": 318030 + }, + { + "epoch": 1.2105387361738085, + "grad_norm": 0.15718050301074982, + "learning_rate": 8.234473215808503e-06, + "loss": 1.9322, + "step": 318040 + }, + { + "epoch": 1.2105767986419311, + "grad_norm": 0.18450264632701874, + "learning_rate": 8.192110677349928e-06, + "loss": 1.9446, + "step": 318050 + }, + { + "epoch": 1.2106148611100538, + "grad_norm": 0.18494777381420135, + "learning_rate": 8.14975178753169e-06, + "loss": 1.9359, + "step": 318060 + }, + { + "epoch": 1.2106529235781764, + "grad_norm": 0.15343858301639557, + "learning_rate": 8.107396545411205e-06, + "loss": 1.932, + "step": 318070 + }, + { + "epoch": 1.210690986046299, + "grad_norm": 0.1697983294725418, + "learning_rate": 8.065044950046286e-06, + "loss": 1.9562, + "step": 318080 + }, + { + "epoch": 1.210729048514422, + "grad_norm": 0.18019534647464752, + "learning_rate": 8.02269700049507e-06, + "loss": 1.9273, + "step": 318090 + }, + { + "epoch": 1.2107671109825446, + "grad_norm": 0.16635341942310333, + "learning_rate": 7.980352695816263e-06, + "loss": 1.9335, + "step": 318100 + }, + { + "epoch": 1.2108051734506673, + "grad_norm": 0.17308969795703888, + "learning_rate": 7.938012035068887e-06, + "loss": 1.9377, + "step": 318110 + }, + { + "epoch": 1.21084323591879, + "grad_norm": 0.2071884721517563, + "learning_rate": 7.895675017312365e-06, + "loss": 1.9307, + "step": 318120 + }, + { + "epoch": 1.2108812983869126, + "grad_norm": 0.18859954178333282, + "learning_rate": 7.853341641606448e-06, + "loss": 1.9303, + "step": 318130 + }, + { + "epoch": 1.2109193608550353, + "grad_norm": 0.18228723108768463, + "learning_rate": 7.8110119070115e-06, + "loss": 1.9243, + "step": 318140 + }, + { + "epoch": 1.210957423323158, + "grad_norm": 0.17649097740650177, + "learning_rate": 7.76868581258805e-06, + "loss": 1.9338, + "step": 318150 + }, + { + "epoch": 1.2109954857912806, + "grad_norm": 0.1678033173084259, + "learning_rate": 7.726363357397182e-06, + "loss": 1.9327, + "step": 318160 + }, + { + "epoch": 1.2110335482594032, + "grad_norm": 0.17184267938137054, + "learning_rate": 7.684044540500368e-06, + "loss": 1.9444, + "step": 318170 + }, + { + "epoch": 1.211071610727526, + "grad_norm": 0.20127668976783752, + "learning_rate": 7.641729360959471e-06, + "loss": 1.9327, + "step": 318180 + }, + { + "epoch": 1.2111096731956488, + "grad_norm": 0.17560744285583496, + "learning_rate": 7.59941781783674e-06, + "loss": 1.937, + "step": 318190 + }, + { + "epoch": 1.2111477356637714, + "grad_norm": 0.1622043251991272, + "learning_rate": 7.55710991019476e-06, + "loss": 1.9276, + "step": 318200 + }, + { + "epoch": 1.211185798131894, + "grad_norm": 0.19448207318782806, + "learning_rate": 7.514805637096666e-06, + "loss": 1.9358, + "step": 318210 + }, + { + "epoch": 1.2112238606000167, + "grad_norm": 0.1590512990951538, + "learning_rate": 7.472504997605878e-06, + "loss": 1.9456, + "step": 318220 + }, + { + "epoch": 1.2112619230681394, + "grad_norm": 0.16298441588878632, + "learning_rate": 7.430207990786364e-06, + "loss": 1.9235, + "step": 318230 + }, + { + "epoch": 1.211299985536262, + "grad_norm": 0.16473017632961273, + "learning_rate": 7.3879146157022625e-06, + "loss": 1.9291, + "step": 318240 + }, + { + "epoch": 1.2113380480043847, + "grad_norm": 0.15773583948612213, + "learning_rate": 7.345624871418266e-06, + "loss": 1.9402, + "step": 318250 + }, + { + "epoch": 1.2113761104725076, + "grad_norm": 0.1632518619298935, + "learning_rate": 7.303338756999511e-06, + "loss": 1.9394, + "step": 318260 + }, + { + "epoch": 1.2114141729406303, + "grad_norm": 0.17342713475227356, + "learning_rate": 7.261056271511412e-06, + "loss": 1.9373, + "step": 318270 + }, + { + "epoch": 1.211452235408753, + "grad_norm": 0.18714912235736847, + "learning_rate": 7.218777414019828e-06, + "loss": 1.945, + "step": 318280 + }, + { + "epoch": 1.2114902978768756, + "grad_norm": 0.17201866209506989, + "learning_rate": 7.17650218359106e-06, + "loss": 1.9261, + "step": 318290 + }, + { + "epoch": 1.2115283603449982, + "grad_norm": 0.20901884138584137, + "learning_rate": 7.134230579291856e-06, + "loss": 1.9428, + "step": 318300 + }, + { + "epoch": 1.2115664228131209, + "grad_norm": 0.20689956843852997, + "learning_rate": 7.091962600189128e-06, + "loss": 1.946, + "step": 318310 + }, + { + "epoch": 1.2116044852812435, + "grad_norm": 0.22000889480113983, + "learning_rate": 7.049698245350511e-06, + "loss": 1.932, + "step": 318320 + }, + { + "epoch": 1.2116425477493662, + "grad_norm": 0.1608712524175644, + "learning_rate": 7.007437513843751e-06, + "loss": 1.9325, + "step": 318330 + }, + { + "epoch": 1.2116806102174889, + "grad_norm": 0.16387580335140228, + "learning_rate": 6.965180404737204e-06, + "loss": 1.9485, + "step": 318340 + }, + { + "epoch": 1.2117186726856115, + "grad_norm": 0.17372748255729675, + "learning_rate": 6.922926917099504e-06, + "loss": 1.9167, + "step": 318350 + }, + { + "epoch": 1.2117567351537344, + "grad_norm": 0.1678519994020462, + "learning_rate": 6.8806770499997285e-06, + "loss": 1.9423, + "step": 318360 + }, + { + "epoch": 1.211794797621857, + "grad_norm": 0.16828025877475739, + "learning_rate": 6.838430802507345e-06, + "loss": 1.9371, + "step": 318370 + }, + { + "epoch": 1.2118328600899797, + "grad_norm": 0.159216970205307, + "learning_rate": 6.7961881736922635e-06, + "loss": 1.9414, + "step": 318380 + }, + { + "epoch": 1.2118709225581024, + "grad_norm": 0.17254793643951416, + "learning_rate": 6.753949162624673e-06, + "loss": 1.9395, + "step": 318390 + }, + { + "epoch": 1.211908985026225, + "grad_norm": 0.18198195099830627, + "learning_rate": 6.711713768375261e-06, + "loss": 1.9348, + "step": 318400 + }, + { + "epoch": 1.2119470474943477, + "grad_norm": 0.17052502930164337, + "learning_rate": 6.669481990015103e-06, + "loss": 1.9217, + "step": 318410 + }, + { + "epoch": 1.2119851099624703, + "grad_norm": 0.18030428886413574, + "learning_rate": 6.62725382661572e-06, + "loss": 1.9321, + "step": 318420 + }, + { + "epoch": 1.2120231724305932, + "grad_norm": 0.16270743310451508, + "learning_rate": 6.585029277248911e-06, + "loss": 1.9319, + "step": 318430 + }, + { + "epoch": 1.2120612348987159, + "grad_norm": 0.15881286561489105, + "learning_rate": 6.542808340986917e-06, + "loss": 1.9458, + "step": 318440 + }, + { + "epoch": 1.2120992973668385, + "grad_norm": 0.22062478959560394, + "learning_rate": 6.5005910169023684e-06, + "loss": 1.9293, + "step": 318450 + }, + { + "epoch": 1.2121373598349612, + "grad_norm": 0.15809518098831177, + "learning_rate": 6.458377304068342e-06, + "loss": 1.9403, + "step": 318460 + }, + { + "epoch": 1.2121754223030838, + "grad_norm": 0.15915168821811676, + "learning_rate": 6.4161672015583535e-06, + "loss": 1.9463, + "step": 318470 + }, + { + "epoch": 1.2122134847712065, + "grad_norm": 0.1819572001695633, + "learning_rate": 6.373960708446147e-06, + "loss": 1.9396, + "step": 318480 + }, + { + "epoch": 1.2122515472393292, + "grad_norm": 0.16173192858695984, + "learning_rate": 6.331757823806017e-06, + "loss": 1.9395, + "step": 318490 + }, + { + "epoch": 1.2122896097074518, + "grad_norm": 0.18848130106925964, + "learning_rate": 6.289558546712537e-06, + "loss": 1.9447, + "step": 318500 + }, + { + "epoch": 1.2123276721755745, + "grad_norm": 0.16205497086048126, + "learning_rate": 6.247362876240781e-06, + "loss": 1.9263, + "step": 318510 + }, + { + "epoch": 1.2123657346436971, + "grad_norm": 0.1631089150905609, + "learning_rate": 6.205170811466099e-06, + "loss": 1.9276, + "step": 318520 + }, + { + "epoch": 1.21240379711182, + "grad_norm": 0.1711810976266861, + "learning_rate": 6.162982351464452e-06, + "loss": 1.9488, + "step": 318530 + }, + { + "epoch": 1.2124418595799427, + "grad_norm": 0.15897248685359955, + "learning_rate": 6.12079749531197e-06, + "loss": 1.9305, + "step": 318540 + }, + { + "epoch": 1.2124799220480653, + "grad_norm": 0.1594250500202179, + "learning_rate": 6.078616242085222e-06, + "loss": 1.9329, + "step": 318550 + }, + { + "epoch": 1.212517984516188, + "grad_norm": 0.1622227430343628, + "learning_rate": 6.036438590861227e-06, + "loss": 1.933, + "step": 318560 + }, + { + "epoch": 1.2125560469843106, + "grad_norm": 0.17288058996200562, + "learning_rate": 5.994264540717442e-06, + "loss": 1.9237, + "step": 318570 + }, + { + "epoch": 1.2125941094524333, + "grad_norm": 0.16310521960258484, + "learning_rate": 5.952094090731552e-06, + "loss": 1.9314, + "step": 318580 + }, + { + "epoch": 1.212632171920556, + "grad_norm": 0.1626407355070114, + "learning_rate": 5.9099272399818494e-06, + "loss": 1.9348, + "step": 318590 + }, + { + "epoch": 1.2126702343886786, + "grad_norm": 0.1794476956129074, + "learning_rate": 5.867763987546904e-06, + "loss": 1.9306, + "step": 318600 + }, + { + "epoch": 1.2127082968568015, + "grad_norm": 0.17908766865730286, + "learning_rate": 5.825604332505563e-06, + "loss": 1.9346, + "step": 318610 + }, + { + "epoch": 1.2127463593249241, + "grad_norm": 0.16136077046394348, + "learning_rate": 5.7834482739372865e-06, + "loss": 1.9268, + "step": 318620 + }, + { + "epoch": 1.2127844217930468, + "grad_norm": 0.1657281070947647, + "learning_rate": 5.741295810921865e-06, + "loss": 1.9527, + "step": 318630 + }, + { + "epoch": 1.2128224842611695, + "grad_norm": 0.17226015031337738, + "learning_rate": 5.699146942539313e-06, + "loss": 1.9229, + "step": 318640 + }, + { + "epoch": 1.212860546729292, + "grad_norm": 0.16876555979251862, + "learning_rate": 5.657001667870254e-06, + "loss": 1.9412, + "step": 318650 + }, + { + "epoch": 1.2128986091974148, + "grad_norm": 0.1647004932165146, + "learning_rate": 5.614859985995646e-06, + "loss": 1.9339, + "step": 318660 + }, + { + "epoch": 1.2129366716655374, + "grad_norm": 0.15683583915233612, + "learning_rate": 5.572721895996779e-06, + "loss": 1.9253, + "step": 318670 + }, + { + "epoch": 1.21297473413366, + "grad_norm": 0.16682861745357513, + "learning_rate": 5.5305873969553884e-06, + "loss": 1.9218, + "step": 318680 + }, + { + "epoch": 1.2130127966017827, + "grad_norm": 0.16168420016765594, + "learning_rate": 5.488456487953486e-06, + "loss": 1.9149, + "step": 318690 + }, + { + "epoch": 1.2130508590699056, + "grad_norm": 0.1688627451658249, + "learning_rate": 5.446329168073638e-06, + "loss": 1.931, + "step": 318700 + }, + { + "epoch": 1.2130889215380283, + "grad_norm": 0.19926753640174866, + "learning_rate": 5.404205436398801e-06, + "loss": 1.9402, + "step": 318710 + }, + { + "epoch": 1.213126984006151, + "grad_norm": 0.17776836454868317, + "learning_rate": 5.3620852920121535e-06, + "loss": 1.9384, + "step": 318720 + }, + { + "epoch": 1.2131650464742736, + "grad_norm": 0.1572011560201645, + "learning_rate": 5.319968733997371e-06, + "loss": 1.9441, + "step": 318730 + }, + { + "epoch": 1.2132031089423962, + "grad_norm": 0.15843211114406586, + "learning_rate": 5.2778557614385765e-06, + "loss": 1.9369, + "step": 318740 + }, + { + "epoch": 1.213241171410519, + "grad_norm": 0.15909917652606964, + "learning_rate": 5.235746373420114e-06, + "loss": 1.935, + "step": 318750 + }, + { + "epoch": 1.2132792338786416, + "grad_norm": 0.16105514764785767, + "learning_rate": 5.193640569026936e-06, + "loss": 1.9425, + "step": 318760 + }, + { + "epoch": 1.2133172963467642, + "grad_norm": 0.1726350486278534, + "learning_rate": 5.151538347344165e-06, + "loss": 1.9362, + "step": 318770 + }, + { + "epoch": 1.213355358814887, + "grad_norm": 0.25620037317276, + "learning_rate": 5.109439707457475e-06, + "loss": 1.9541, + "step": 318780 + }, + { + "epoch": 1.2133934212830098, + "grad_norm": 0.1771782636642456, + "learning_rate": 5.067344648452876e-06, + "loss": 1.9432, + "step": 318790 + }, + { + "epoch": 1.2134314837511324, + "grad_norm": 0.15600189566612244, + "learning_rate": 5.02525316941671e-06, + "loss": 1.9246, + "step": 318800 + }, + { + "epoch": 1.213469546219255, + "grad_norm": 0.1723054200410843, + "learning_rate": 4.983165269435874e-06, + "loss": 1.9399, + "step": 318810 + }, + { + "epoch": 1.2135076086873777, + "grad_norm": 0.19066469371318817, + "learning_rate": 4.941080947597376e-06, + "loss": 1.9316, + "step": 318820 + }, + { + "epoch": 1.2135456711555004, + "grad_norm": 0.15641188621520996, + "learning_rate": 4.89900020298889e-06, + "loss": 1.9428, + "step": 318830 + }, + { + "epoch": 1.213583733623623, + "grad_norm": 0.16088195145130157, + "learning_rate": 4.856923034698313e-06, + "loss": 1.9475, + "step": 318840 + }, + { + "epoch": 1.2136217960917457, + "grad_norm": 0.2013937532901764, + "learning_rate": 4.814849441813984e-06, + "loss": 1.9396, + "step": 318850 + }, + { + "epoch": 1.2136598585598684, + "grad_norm": 0.171700119972229, + "learning_rate": 4.7727794234246895e-06, + "loss": 1.9231, + "step": 318860 + }, + { + "epoch": 1.2136979210279912, + "grad_norm": 0.17729665338993073, + "learning_rate": 4.730712978619433e-06, + "loss": 1.9184, + "step": 318870 + }, + { + "epoch": 1.213735983496114, + "grad_norm": 0.1747244894504547, + "learning_rate": 4.688650106487724e-06, + "loss": 1.947, + "step": 318880 + }, + { + "epoch": 1.2137740459642365, + "grad_norm": 0.16286538541316986, + "learning_rate": 4.646590806119511e-06, + "loss": 1.9236, + "step": 318890 + }, + { + "epoch": 1.2138121084323592, + "grad_norm": 0.1795330047607422, + "learning_rate": 4.604535076605021e-06, + "loss": 1.9268, + "step": 318900 + }, + { + "epoch": 1.2138501709004819, + "grad_norm": 0.1655510812997818, + "learning_rate": 4.562482917034927e-06, + "loss": 1.9311, + "step": 318910 + }, + { + "epoch": 1.2138882333686045, + "grad_norm": 0.1607380360364914, + "learning_rate": 4.520434326500178e-06, + "loss": 1.9266, + "step": 318920 + }, + { + "epoch": 1.2139262958367272, + "grad_norm": 0.18507783114910126, + "learning_rate": 4.478389304092334e-06, + "loss": 1.9361, + "step": 318930 + }, + { + "epoch": 1.2139643583048498, + "grad_norm": 0.19285060465335846, + "learning_rate": 4.436347848903122e-06, + "loss": 1.9398, + "step": 318940 + }, + { + "epoch": 1.2140024207729727, + "grad_norm": 0.17271341383457184, + "learning_rate": 4.394309960024767e-06, + "loss": 1.9372, + "step": 318950 + }, + { + "epoch": 1.2140404832410954, + "grad_norm": 0.18780827522277832, + "learning_rate": 4.352275636549829e-06, + "loss": 1.9356, + "step": 318960 + }, + { + "epoch": 1.214078545709218, + "grad_norm": 0.16267111897468567, + "learning_rate": 4.310244877571312e-06, + "loss": 1.9213, + "step": 318970 + }, + { + "epoch": 1.2141166081773407, + "grad_norm": 0.16083675622940063, + "learning_rate": 4.268217682182496e-06, + "loss": 1.9338, + "step": 318980 + }, + { + "epoch": 1.2141546706454633, + "grad_norm": 0.1647607833147049, + "learning_rate": 4.226194049477161e-06, + "loss": 1.9419, + "step": 318990 + }, + { + "epoch": 1.214192733113586, + "grad_norm": 0.15294934809207916, + "learning_rate": 4.184173978549477e-06, + "loss": 1.9551, + "step": 319000 + }, + { + "epoch": 1.2142307955817087, + "grad_norm": 0.1650754064321518, + "learning_rate": 4.142157468493779e-06, + "loss": 1.9333, + "step": 319010 + }, + { + "epoch": 1.2142688580498313, + "grad_norm": 0.156051367521286, + "learning_rate": 4.100144518405125e-06, + "loss": 1.9246, + "step": 319020 + }, + { + "epoch": 1.214306920517954, + "grad_norm": 0.17990271747112274, + "learning_rate": 4.0581351273787374e-06, + "loss": 1.9356, + "step": 319030 + }, + { + "epoch": 1.2143449829860766, + "grad_norm": 0.1663575917482376, + "learning_rate": 4.016129294510229e-06, + "loss": 1.9261, + "step": 319040 + }, + { + "epoch": 1.2143830454541995, + "grad_norm": 0.1609850525856018, + "learning_rate": 3.974127018895657e-06, + "loss": 1.9233, + "step": 319050 + }, + { + "epoch": 1.2144211079223222, + "grad_norm": 0.18278783559799194, + "learning_rate": 3.932128299631466e-06, + "loss": 1.9385, + "step": 319060 + }, + { + "epoch": 1.2144591703904448, + "grad_norm": 0.170439213514328, + "learning_rate": 3.890133135814378e-06, + "loss": 1.9417, + "step": 319070 + }, + { + "epoch": 1.2144972328585675, + "grad_norm": 0.16244801878929138, + "learning_rate": 3.8481415265416154e-06, + "loss": 1.9365, + "step": 319080 + }, + { + "epoch": 1.2145352953266901, + "grad_norm": 0.18392355740070343, + "learning_rate": 3.806153470910789e-06, + "loss": 1.9466, + "step": 319090 + }, + { + "epoch": 1.2145733577948128, + "grad_norm": 0.16437573730945587, + "learning_rate": 3.7641689680197875e-06, + "loss": 1.9267, + "step": 319100 + }, + { + "epoch": 1.2146114202629354, + "grad_norm": 0.20235992968082428, + "learning_rate": 3.7221880169669985e-06, + "loss": 1.9407, + "step": 319110 + }, + { + "epoch": 1.2146494827310583, + "grad_norm": 0.16109450161457062, + "learning_rate": 3.680210616851087e-06, + "loss": 1.9403, + "step": 319120 + }, + { + "epoch": 1.214687545199181, + "grad_norm": 0.16093021631240845, + "learning_rate": 3.6382367667710527e-06, + "loss": 1.9376, + "step": 319130 + }, + { + "epoch": 1.2147256076673036, + "grad_norm": 0.1587112993001938, + "learning_rate": 3.596266465826559e-06, + "loss": 1.9571, + "step": 319140 + }, + { + "epoch": 1.2147636701354263, + "grad_norm": 0.1975339949131012, + "learning_rate": 3.554299713117326e-06, + "loss": 1.9325, + "step": 319150 + }, + { + "epoch": 1.214801732603549, + "grad_norm": 0.16535383462905884, + "learning_rate": 3.51233650774363e-06, + "loss": 1.926, + "step": 319160 + }, + { + "epoch": 1.2148397950716716, + "grad_norm": 0.17008942365646362, + "learning_rate": 3.470376848806078e-06, + "loss": 1.9341, + "step": 319170 + }, + { + "epoch": 1.2148778575397943, + "grad_norm": 0.1578245759010315, + "learning_rate": 3.4284207354056127e-06, + "loss": 1.9351, + "step": 319180 + }, + { + "epoch": 1.214915920007917, + "grad_norm": 0.15404680371284485, + "learning_rate": 3.3864681666436748e-06, + "loss": 1.9453, + "step": 319190 + }, + { + "epoch": 1.2149539824760396, + "grad_norm": 0.15549880266189575, + "learning_rate": 3.3445191416219824e-06, + "loss": 1.9181, + "step": 319200 + }, + { + "epoch": 1.2149920449441622, + "grad_norm": 0.1594415158033371, + "learning_rate": 3.302573659442698e-06, + "loss": 1.9511, + "step": 319210 + }, + { + "epoch": 1.2150301074122851, + "grad_norm": 0.16243025660514832, + "learning_rate": 3.2606317192083177e-06, + "loss": 1.9354, + "step": 319220 + }, + { + "epoch": 1.2150681698804078, + "grad_norm": 0.15204553306102753, + "learning_rate": 3.2186933200216706e-06, + "loss": 1.9283, + "step": 319230 + }, + { + "epoch": 1.2151062323485304, + "grad_norm": 0.15984119474887848, + "learning_rate": 3.17675846098614e-06, + "loss": 1.9428, + "step": 319240 + }, + { + "epoch": 1.215144294816653, + "grad_norm": 0.17340975999832153, + "learning_rate": 3.134827141205221e-06, + "loss": 1.9369, + "step": 319250 + }, + { + "epoch": 1.2151823572847757, + "grad_norm": 0.15931546688079834, + "learning_rate": 3.0928993597830747e-06, + "loss": 1.9346, + "step": 319260 + }, + { + "epoch": 1.2152204197528984, + "grad_norm": 0.15926462411880493, + "learning_rate": 3.0509751158240285e-06, + "loss": 1.9432, + "step": 319270 + }, + { + "epoch": 1.215258482221021, + "grad_norm": 0.1612613946199417, + "learning_rate": 3.0090544084329096e-06, + "loss": 1.9393, + "step": 319280 + }, + { + "epoch": 1.215296544689144, + "grad_norm": 0.16437748074531555, + "learning_rate": 2.967137236714823e-06, + "loss": 1.9395, + "step": 319290 + }, + { + "epoch": 1.2153346071572666, + "grad_norm": 0.1627143770456314, + "learning_rate": 2.925223599775373e-06, + "loss": 1.9374, + "step": 319300 + }, + { + "epoch": 1.2153726696253893, + "grad_norm": 0.15959599614143372, + "learning_rate": 2.8833134967203857e-06, + "loss": 1.9318, + "step": 319310 + }, + { + "epoch": 1.215410732093512, + "grad_norm": 0.15941393375396729, + "learning_rate": 2.841406926656187e-06, + "loss": 1.9385, + "step": 319320 + }, + { + "epoch": 1.2154487945616346, + "grad_norm": 0.15475495159626007, + "learning_rate": 2.7995038886894363e-06, + "loss": 1.9382, + "step": 319330 + }, + { + "epoch": 1.2154868570297572, + "grad_norm": 0.15965044498443604, + "learning_rate": 2.7576043819272366e-06, + "loss": 1.9247, + "step": 319340 + }, + { + "epoch": 1.2155249194978799, + "grad_norm": 0.1535380631685257, + "learning_rate": 2.7157084054769134e-06, + "loss": 1.9418, + "step": 319350 + }, + { + "epoch": 1.2155629819660025, + "grad_norm": 0.16000913083553314, + "learning_rate": 2.6738159584463463e-06, + "loss": 1.9311, + "step": 319360 + }, + { + "epoch": 1.2156010444341252, + "grad_norm": 0.15471646189689636, + "learning_rate": 2.6319270399435825e-06, + "loss": 1.9355, + "step": 319370 + }, + { + "epoch": 1.2156391069022479, + "grad_norm": 0.1763332486152649, + "learning_rate": 2.5900416490772794e-06, + "loss": 1.9287, + "step": 319380 + }, + { + "epoch": 1.2156771693703707, + "grad_norm": 0.17447736859321594, + "learning_rate": 2.5481597849563165e-06, + "loss": 1.9364, + "step": 319390 + }, + { + "epoch": 1.2157152318384934, + "grad_norm": 0.1559518277645111, + "learning_rate": 2.5062814466900175e-06, + "loss": 1.9156, + "step": 319400 + }, + { + "epoch": 1.215753294306616, + "grad_norm": 0.16822876036167145, + "learning_rate": 2.464406633388039e-06, + "loss": 1.9356, + "step": 319410 + }, + { + "epoch": 1.2157913567747387, + "grad_norm": 0.1587214171886444, + "learning_rate": 2.4225353441603704e-06, + "loss": 1.9373, + "step": 319420 + }, + { + "epoch": 1.2158294192428614, + "grad_norm": 0.1601061224937439, + "learning_rate": 2.3806675781175015e-06, + "loss": 1.9624, + "step": 319430 + }, + { + "epoch": 1.215867481710984, + "grad_norm": 0.15818408131599426, + "learning_rate": 2.3388033343701987e-06, + "loss": 1.9261, + "step": 319440 + }, + { + "epoch": 1.2159055441791067, + "grad_norm": 0.17259342968463898, + "learning_rate": 2.2969426120296177e-06, + "loss": 1.9251, + "step": 319450 + }, + { + "epoch": 1.2159436066472293, + "grad_norm": 0.20744659006595612, + "learning_rate": 2.255085410207358e-06, + "loss": 1.9259, + "step": 319460 + }, + { + "epoch": 1.2159816691153522, + "grad_norm": 0.15923643112182617, + "learning_rate": 2.2132317280152412e-06, + "loss": 1.9455, + "step": 319470 + }, + { + "epoch": 1.2160197315834749, + "grad_norm": 0.15893425047397614, + "learning_rate": 2.1713815645655886e-06, + "loss": 1.9264, + "step": 319480 + }, + { + "epoch": 1.2160577940515975, + "grad_norm": 0.1528540700674057, + "learning_rate": 2.12953491897111e-06, + "loss": 1.9386, + "step": 319490 + }, + { + "epoch": 1.2160958565197202, + "grad_norm": 0.16423656046390533, + "learning_rate": 2.0876917903447924e-06, + "loss": 1.9327, + "step": 319500 + }, + { + "epoch": 1.2161339189878428, + "grad_norm": 0.15889745950698853, + "learning_rate": 2.0458521778000673e-06, + "loss": 1.9425, + "step": 319510 + }, + { + "epoch": 1.2161719814559655, + "grad_norm": 0.18033714592456818, + "learning_rate": 2.004016080450699e-06, + "loss": 1.9349, + "step": 319520 + }, + { + "epoch": 1.2162100439240882, + "grad_norm": 0.1714215725660324, + "learning_rate": 1.962183497410841e-06, + "loss": 1.9437, + "step": 319530 + }, + { + "epoch": 1.2162481063922108, + "grad_norm": 0.1582074910402298, + "learning_rate": 1.9203544277950347e-06, + "loss": 1.936, + "step": 319540 + }, + { + "epoch": 1.2162861688603335, + "grad_norm": 0.16662302613258362, + "learning_rate": 1.8785288707180993e-06, + "loss": 1.9353, + "step": 319550 + }, + { + "epoch": 1.2163242313284564, + "grad_norm": 0.15821482241153717, + "learning_rate": 1.8367068252954089e-06, + "loss": 1.9545, + "step": 319560 + }, + { + "epoch": 1.216362293796579, + "grad_norm": 0.15704180300235748, + "learning_rate": 1.79488829064256e-06, + "loss": 1.9378, + "step": 319570 + }, + { + "epoch": 1.2164003562647017, + "grad_norm": 0.16972213983535767, + "learning_rate": 1.7530732658755378e-06, + "loss": 1.9328, + "step": 319580 + }, + { + "epoch": 1.2164384187328243, + "grad_norm": 0.1585708111524582, + "learning_rate": 1.7112617501107152e-06, + "loss": 1.956, + "step": 319590 + }, + { + "epoch": 1.216476481200947, + "grad_norm": 0.17058701813220978, + "learning_rate": 1.6694537424649103e-06, + "loss": 1.9334, + "step": 319600 + }, + { + "epoch": 1.2165145436690696, + "grad_norm": 0.154144287109375, + "learning_rate": 1.6276492420551626e-06, + "loss": 1.9254, + "step": 319610 + }, + { + "epoch": 1.2165526061371923, + "grad_norm": 0.1549491584300995, + "learning_rate": 1.5858482479990111e-06, + "loss": 1.9385, + "step": 319620 + }, + { + "epoch": 1.216590668605315, + "grad_norm": 0.1657015085220337, + "learning_rate": 1.5440507594143282e-06, + "loss": 1.9349, + "step": 319630 + }, + { + "epoch": 1.2166287310734378, + "grad_norm": 0.15674883127212524, + "learning_rate": 1.5022567754193194e-06, + "loss": 1.9381, + "step": 319640 + }, + { + "epoch": 1.2166667935415605, + "grad_norm": 0.15751342475414276, + "learning_rate": 1.4604662951325787e-06, + "loss": 1.9341, + "step": 319650 + }, + { + "epoch": 1.2167048560096831, + "grad_norm": 0.17440181970596313, + "learning_rate": 1.4186793176730883e-06, + "loss": 1.9286, + "step": 319660 + }, + { + "epoch": 1.2167429184778058, + "grad_norm": 0.15538595616817474, + "learning_rate": 1.376895842160164e-06, + "loss": 1.9357, + "step": 319670 + }, + { + "epoch": 1.2167809809459285, + "grad_norm": 0.15798527002334595, + "learning_rate": 1.3351158677135655e-06, + "loss": 1.933, + "step": 319680 + }, + { + "epoch": 1.2168190434140511, + "grad_norm": 0.15561030805110931, + "learning_rate": 1.29333939345333e-06, + "loss": 1.9273, + "step": 319690 + }, + { + "epoch": 1.2168571058821738, + "grad_norm": 0.15403597056865692, + "learning_rate": 1.2515664184999387e-06, + "loss": 1.9359, + "step": 319700 + }, + { + "epoch": 1.2168951683502964, + "grad_norm": 0.16248761117458344, + "learning_rate": 1.2097969419741506e-06, + "loss": 1.9214, + "step": 319710 + }, + { + "epoch": 1.216933230818419, + "grad_norm": 0.16012699902057648, + "learning_rate": 1.1680309629972241e-06, + "loss": 1.9351, + "step": 319720 + }, + { + "epoch": 1.216971293286542, + "grad_norm": 0.15856362879276276, + "learning_rate": 1.1262684806905842e-06, + "loss": 1.9329, + "step": 319730 + }, + { + "epoch": 1.2170093557546646, + "grad_norm": 0.15176869928836823, + "learning_rate": 1.0845094941762668e-06, + "loss": 1.9431, + "step": 319740 + }, + { + "epoch": 1.2170474182227873, + "grad_norm": 0.16340629756450653, + "learning_rate": 1.0427540025765292e-06, + "loss": 1.9335, + "step": 319750 + }, + { + "epoch": 1.21708548069091, + "grad_norm": 0.15526182949543, + "learning_rate": 1.0010020050140178e-06, + "loss": 1.9234, + "step": 319760 + }, + { + "epoch": 1.2171235431590326, + "grad_norm": 0.17227885127067566, + "learning_rate": 9.592535006117676e-07, + "loss": 1.9371, + "step": 319770 + }, + { + "epoch": 1.2171616056271553, + "grad_norm": 0.1604343056678772, + "learning_rate": 9.175084884931462e-07, + "loss": 1.9323, + "step": 319780 + }, + { + "epoch": 1.217199668095278, + "grad_norm": 0.16079290211200714, + "learning_rate": 8.757669677818547e-07, + "loss": 1.9255, + "step": 319790 + }, + { + "epoch": 1.2172377305634006, + "grad_norm": 0.15556688606739044, + "learning_rate": 8.340289376020938e-07, + "loss": 1.9335, + "step": 319800 + }, + { + "epoch": 1.2172757930315234, + "grad_norm": 0.1539129763841629, + "learning_rate": 7.922943970782859e-07, + "loss": 1.9473, + "step": 319810 + }, + { + "epoch": 1.217313855499646, + "grad_norm": 0.15851177275180817, + "learning_rate": 7.505633453353533e-07, + "loss": 1.9168, + "step": 319820 + }, + { + "epoch": 1.2173519179677688, + "grad_norm": 0.16344186663627625, + "learning_rate": 7.088357814984958e-07, + "loss": 1.9315, + "step": 319830 + }, + { + "epoch": 1.2173899804358914, + "grad_norm": 0.15521660447120667, + "learning_rate": 6.671117046931907e-07, + "loss": 1.9425, + "step": 319840 + }, + { + "epoch": 1.217428042904014, + "grad_norm": 0.15747219324111938, + "learning_rate": 6.253911140455259e-07, + "loss": 1.9277, + "step": 319850 + }, + { + "epoch": 1.2174661053721367, + "grad_norm": 0.15791189670562744, + "learning_rate": 5.83674008681756e-07, + "loss": 1.943, + "step": 319860 + }, + { + "epoch": 1.2175041678402594, + "grad_norm": 0.16524600982666016, + "learning_rate": 5.41960387728524e-07, + "loss": 1.9465, + "step": 319870 + }, + { + "epoch": 1.217542230308382, + "grad_norm": 0.1657755821943283, + "learning_rate": 5.002502503129169e-07, + "loss": 1.9395, + "step": 319880 + }, + { + "epoch": 1.2175802927765047, + "grad_norm": 0.1625651866197586, + "learning_rate": 4.585435955623551e-07, + "loss": 1.9362, + "step": 319890 + }, + { + "epoch": 1.2176183552446274, + "grad_norm": 0.16468630731105804, + "learning_rate": 4.1684042260459186e-07, + "loss": 1.9332, + "step": 319900 + }, + { + "epoch": 1.2176564177127502, + "grad_norm": 0.1581428050994873, + "learning_rate": 3.7514073056771347e-07, + "loss": 1.9283, + "step": 319910 + }, + { + "epoch": 1.217694480180873, + "grad_norm": 0.16948306560516357, + "learning_rate": 3.334445185803059e-07, + "loss": 1.9369, + "step": 319920 + }, + { + "epoch": 1.2177325426489956, + "grad_norm": 0.15806998312473297, + "learning_rate": 2.917517857711771e-07, + "loss": 1.9403, + "step": 319930 + }, + { + "epoch": 1.2177706051171182, + "grad_norm": 0.15995928645133972, + "learning_rate": 2.5006253126952374e-07, + "loss": 1.9287, + "step": 319940 + }, + { + "epoch": 1.2178086675852409, + "grad_norm": 0.15800589323043823, + "learning_rate": 2.0837675420504187e-07, + "loss": 1.9312, + "step": 319950 + }, + { + "epoch": 1.2178467300533635, + "grad_norm": 0.14926236867904663, + "learning_rate": 1.6669445370759428e-07, + "loss": 1.9412, + "step": 319960 + }, + { + "epoch": 1.2178847925214862, + "grad_norm": 0.1593756526708603, + "learning_rate": 1.2501562890743223e-07, + "loss": 1.9365, + "step": 319970 + }, + { + "epoch": 1.217922854989609, + "grad_norm": 0.16351835429668427, + "learning_rate": 8.334027893541763e-08, + "loss": 1.9345, + "step": 319980 + }, + { + "epoch": 1.2179609174577317, + "grad_norm": 0.1576191782951355, + "learning_rate": 4.166840292246788e-08, + "loss": 1.9253, + "step": 319990 + }, + { + "epoch": 1.2179989799258544, + "grad_norm": 0.15571853518486023, + "learning_rate": 0.0, + "loss": 1.9405, + "step": 320000 + } + ], + "logging_steps": 10, + "max_steps": 320000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0539318127253395e+21, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}