{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7, "eval_steps": 2000, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001, "grad_norm": 62.5, "learning_rate": 3.27e-05, "loss": 96.4218, "loss/aux_loss": 0.06346827149391174, "loss/crossentropy": 10.520584106445312, "loss/logits": 8.62325460910797, "step": 10 }, { "epoch": 0.0002, "grad_norm": 64.0, "grad_norm_var": 47.05416666666667, "learning_rate": 3.54e-05, "loss": 90.9983, "loss/aux_loss": 0.05445752870291472, "loss/crossentropy": 9.918135738372802, "loss/logits": 8.018545007705688, "step": 20 }, { "epoch": 0.0003, "grad_norm": 60.75, "grad_norm_var": 3.05390625, "learning_rate": 3.81e-05, "loss": 84.3016, "loss/aux_loss": 0.052422930113971235, "loss/crossentropy": 9.341589832305909, "loss/logits": 7.614971446990967, "step": 30 }, { "epoch": 0.0004, "grad_norm": 59.0, "grad_norm_var": 264.72682291666666, "learning_rate": 4.08e-05, "loss": 78.0706, "loss/aux_loss": 0.05138566605746746, "loss/crossentropy": 8.834511041641235, "loss/logits": 6.946270298957825, "step": 40 }, { "epoch": 0.0005, "grad_norm": 54.0, "grad_norm_var": 295.6518229166667, "learning_rate": 4.35e-05, "loss": 73.2196, "loss/aux_loss": 0.05028745252639055, "loss/crossentropy": 8.335509753227234, "loss/logits": 6.576072025299072, "step": 50 }, { "epoch": 0.0006, "grad_norm": 49.5, "grad_norm_var": 12.49765625, "learning_rate": 4.62e-05, "loss": 69.3149, "loss/aux_loss": 0.04974018670618534, "loss/crossentropy": 8.07826225757599, "loss/logits": 6.17107310295105, "step": 60 }, { "epoch": 0.0007, "grad_norm": 33.5, "grad_norm_var": 37.90182291666667, "learning_rate": 4.89e-05, "loss": 65.9331, "loss/aux_loss": 0.049574922397732736, "loss/crossentropy": 7.850642085075378, "loss/logits": 5.917052555084228, "step": 70 }, { "epoch": 0.0008, "grad_norm": 24.75, "grad_norm_var": 683.9268229166667, "learning_rate": 5.16e-05, "loss": 63.4084, "loss/aux_loss": 0.0500051811337471, "loss/crossentropy": 7.550819325447082, "loss/logits": 5.728990888595581, "step": 80 }, { "epoch": 0.0009, "grad_norm": 65.5, "grad_norm_var": 863.5828125, "learning_rate": 5.429999999999999e-05, "loss": 61.9785, "loss/aux_loss": 0.05104184336960316, "loss/crossentropy": 7.397841739654541, "loss/logits": 5.37682204246521, "step": 90 }, { "epoch": 0.001, "grad_norm": 53.25, "grad_norm_var": 342.37433268229165, "learning_rate": 5.6999999999999996e-05, "loss": 60.8601, "loss/aux_loss": 0.05242748130112886, "loss/crossentropy": 7.235720539093018, "loss/logits": 5.34943995475769, "step": 100 }, { "epoch": 0.0011, "grad_norm": 68.5, "grad_norm_var": 300.8374348958333, "learning_rate": 5.97e-05, "loss": 59.9541, "loss/aux_loss": 0.05149786453694105, "loss/crossentropy": 7.193077087402344, "loss/logits": 5.278037166595459, "step": 110 }, { "epoch": 0.0012, "grad_norm": 29.375, "grad_norm_var": 219.08326822916666, "learning_rate": 6.24e-05, "loss": 58.794, "loss/aux_loss": 0.05140427742153406, "loss/crossentropy": 7.130345010757447, "loss/logits": 5.037704062461853, "step": 120 }, { "epoch": 0.0013, "grad_norm": 15.0625, "grad_norm_var": 210.59972330729167, "learning_rate": 6.51e-05, "loss": 58.1276, "loss/aux_loss": 0.050786581449210645, "loss/crossentropy": 6.900083208084107, "loss/logits": 5.119465160369873, "step": 130 }, { "epoch": 0.0014, "grad_norm": 15.4375, "grad_norm_var": 242.2203125, "learning_rate": 6.780000000000001e-05, "loss": 57.251, "loss/aux_loss": 0.050472350977361205, "loss/crossentropy": 6.90623025894165, "loss/logits": 4.989370441436767, "step": 140 }, { "epoch": 0.0015, "grad_norm": 23.75, "grad_norm_var": 53.895686848958334, "learning_rate": 7.049999999999999e-05, "loss": 55.9011, "loss/aux_loss": 0.05010317321866751, "loss/crossentropy": 6.878646898269653, "loss/logits": 4.966757416725159, "step": 150 }, { "epoch": 0.0016, "grad_norm": 22.875, "grad_norm_var": 85.49140625, "learning_rate": 7.32e-05, "loss": 55.0174, "loss/aux_loss": 0.05009230561554432, "loss/crossentropy": 6.685335445404053, "loss/logits": 4.792002511024475, "step": 160 }, { "epoch": 0.0017, "grad_norm": 24.5, "grad_norm_var": 172.98014322916666, "learning_rate": 7.589999999999999e-05, "loss": 54.2538, "loss/aux_loss": 0.049807760119438174, "loss/crossentropy": 6.676994824409485, "loss/logits": 4.751176404953003, "step": 170 }, { "epoch": 0.0018, "grad_norm": 49.0, "grad_norm_var": 130.01295572916666, "learning_rate": 7.86e-05, "loss": 53.571, "loss/aux_loss": 0.049537939578294755, "loss/crossentropy": 6.596286082267762, "loss/logits": 4.717176723480224, "step": 180 }, { "epoch": 0.0019, "grad_norm": 22.0, "grad_norm_var": 68.81145833333333, "learning_rate": 8.13e-05, "loss": 52.313, "loss/aux_loss": 0.049018622189760205, "loss/crossentropy": 6.518313908576966, "loss/logits": 4.740558981895447, "step": 190 }, { "epoch": 0.002, "grad_norm": 23.125, "grad_norm_var": 39.942708333333336, "learning_rate": 8.4e-05, "loss": 51.4724, "loss/aux_loss": 0.04901752769947052, "loss/crossentropy": 6.31372344493866, "loss/logits": 4.539055609703064, "step": 200 }, { "epoch": 0.0021, "grad_norm": 25.5, "grad_norm_var": 51.50584309895833, "learning_rate": 8.67e-05, "loss": 50.4864, "loss/aux_loss": 0.049004881829023364, "loss/crossentropy": 6.431901216506958, "loss/logits": 4.541516590118408, "step": 210 }, { "epoch": 0.0022, "grad_norm": 23.75, "grad_norm_var": 45.890625, "learning_rate": 8.94e-05, "loss": 49.4348, "loss/aux_loss": 0.0489825276657939, "loss/crossentropy": 6.201497769355774, "loss/logits": 4.2463623762130736, "step": 220 }, { "epoch": 0.0023, "grad_norm": 32.75, "grad_norm_var": 33.282291666666666, "learning_rate": 9.21e-05, "loss": 48.7892, "loss/aux_loss": 0.04863291662186384, "loss/crossentropy": 6.257090902328491, "loss/logits": 4.189764153957367, "step": 230 }, { "epoch": 0.0024, "grad_norm": 33.0, "grad_norm_var": 26.2509765625, "learning_rate": 9.479999999999999e-05, "loss": 47.4078, "loss/aux_loss": 0.04870687611401081, "loss/crossentropy": 6.129881906509399, "loss/logits": 4.127840185165406, "step": 240 }, { "epoch": 0.0025, "grad_norm": 19.375, "grad_norm_var": 26.835416666666667, "learning_rate": 9.75e-05, "loss": 46.931, "loss/aux_loss": 0.04863628149032593, "loss/crossentropy": 5.8494936466217045, "loss/logits": 4.019280314445496, "step": 250 }, { "epoch": 0.0026, "grad_norm": 37.75, "grad_norm_var": 30.508072916666666, "learning_rate": 0.0001002, "loss": 45.6479, "loss/aux_loss": 0.04853515811264515, "loss/crossentropy": 5.952807331085205, "loss/logits": 4.096065545082093, "step": 260 }, { "epoch": 0.0027, "grad_norm": 29.25, "grad_norm_var": 42.109309895833334, "learning_rate": 0.0001029, "loss": 45.4552, "loss/aux_loss": 0.04864779394119978, "loss/crossentropy": 5.901494193077087, "loss/logits": 4.014258062839508, "step": 270 }, { "epoch": 0.0028, "grad_norm": 30.75, "grad_norm_var": 31.6494140625, "learning_rate": 0.00010560000000000002, "loss": 45.0348, "loss/aux_loss": 0.04867569580674171, "loss/crossentropy": 5.769275331497193, "loss/logits": 3.9283937215805054, "step": 280 }, { "epoch": 0.0029, "grad_norm": 24.0, "grad_norm_var": 49.4306640625, "learning_rate": 0.00010829999999999999, "loss": 44.4484, "loss/aux_loss": 0.048759896866977216, "loss/crossentropy": 5.552310681343078, "loss/logits": 3.828005838394165, "step": 290 }, { "epoch": 0.003, "grad_norm": 23.5, "grad_norm_var": 37.06666666666667, "learning_rate": 0.00011099999999999999, "loss": 44.2056, "loss/aux_loss": 0.04860832653939724, "loss/crossentropy": 5.736793255805969, "loss/logits": 3.8119420647621154, "step": 300 }, { "epoch": 0.0031, "grad_norm": 34.75, "grad_norm_var": 25.449739583333333, "learning_rate": 0.00011369999999999999, "loss": 43.1406, "loss/aux_loss": 0.04863522592931986, "loss/crossentropy": 5.70694375038147, "loss/logits": 3.7460230350494386, "step": 310 }, { "epoch": 0.0032, "grad_norm": 28.75, "grad_norm_var": 23.142643229166666, "learning_rate": 0.0001164, "loss": 43.2674, "loss/aux_loss": 0.048594312928617, "loss/crossentropy": 5.645468616485596, "loss/logits": 3.795237183570862, "step": 320 }, { "epoch": 0.0033, "grad_norm": 25.125, "grad_norm_var": 27.248893229166665, "learning_rate": 0.0001191, "loss": 42.4531, "loss/aux_loss": 0.048580970242619516, "loss/crossentropy": 5.573257780075073, "loss/logits": 3.625744652748108, "step": 330 }, { "epoch": 0.0034, "grad_norm": 27.0, "grad_norm_var": 18.6728515625, "learning_rate": 0.00012179999999999999, "loss": 42.3091, "loss/aux_loss": 0.04864873345941305, "loss/crossentropy": 5.666665482521057, "loss/logits": 3.6989678740501404, "step": 340 }, { "epoch": 0.0035, "grad_norm": 25.625, "grad_norm_var": 14.792122395833333, "learning_rate": 0.0001245, "loss": 41.7634, "loss/aux_loss": 0.048494835197925565, "loss/crossentropy": 5.532446098327637, "loss/logits": 3.6617549777030947, "step": 350 }, { "epoch": 0.0036, "grad_norm": 27.125, "grad_norm_var": 36.68274739583333, "learning_rate": 0.0001272, "loss": 41.3748, "loss/aux_loss": 0.04851998519152403, "loss/crossentropy": 5.461347937583923, "loss/logits": 3.681316375732422, "step": 360 }, { "epoch": 0.0037, "grad_norm": 32.25, "grad_norm_var": 62.7072265625, "learning_rate": 0.0001299, "loss": 41.0029, "loss/aux_loss": 0.0486336350440979, "loss/crossentropy": 5.420117592811584, "loss/logits": 3.614268946647644, "step": 370 }, { "epoch": 0.0038, "grad_norm": 25.375, "grad_norm_var": 29.937955729166667, "learning_rate": 0.0001326, "loss": 40.4483, "loss/aux_loss": 0.048567987978458405, "loss/crossentropy": 5.519010901451111, "loss/logits": 3.4499247074127197, "step": 380 }, { "epoch": 0.0039, "grad_norm": 28.375, "grad_norm_var": 14.257291666666667, "learning_rate": 0.0001353, "loss": 39.9714, "loss/aux_loss": 0.04847833849489689, "loss/crossentropy": 5.376910948753357, "loss/logits": 3.4180081248283387, "step": 390 }, { "epoch": 0.004, "grad_norm": 25.25, "grad_norm_var": 17.4041015625, "learning_rate": 0.00013800000000000002, "loss": 39.9151, "loss/aux_loss": 0.048561175167560575, "loss/crossentropy": 5.305628776550293, "loss/logits": 3.4211841225624084, "step": 400 }, { "epoch": 0.0041, "grad_norm": 26.625, "grad_norm_var": 23.691080729166668, "learning_rate": 0.00014069999999999998, "loss": 39.9258, "loss/aux_loss": 0.0485720319673419, "loss/crossentropy": 5.2558026790618895, "loss/logits": 3.477493929862976, "step": 410 }, { "epoch": 0.0042, "grad_norm": 24.0, "grad_norm_var": 22.3181640625, "learning_rate": 0.0001434, "loss": 39.5536, "loss/aux_loss": 0.04851338397711515, "loss/crossentropy": 5.359068250656128, "loss/logits": 3.373235845565796, "step": 420 }, { "epoch": 0.0043, "grad_norm": 23.25, "grad_norm_var": 25.664518229166667, "learning_rate": 0.00014609999999999997, "loss": 38.9821, "loss/aux_loss": 0.04848247561603784, "loss/crossentropy": 5.391582441329956, "loss/logits": 3.3668909788131716, "step": 430 }, { "epoch": 0.0044, "grad_norm": 29.125, "grad_norm_var": 27.742122395833334, "learning_rate": 0.00014879999999999998, "loss": 38.4838, "loss/aux_loss": 0.048559782840311524, "loss/crossentropy": 5.219124293327331, "loss/logits": 3.2023038268089294, "step": 440 }, { "epoch": 0.0045, "grad_norm": 24.125, "grad_norm_var": 13.058268229166666, "learning_rate": 0.0001515, "loss": 37.7637, "loss/aux_loss": 0.048486584424972536, "loss/crossentropy": 5.257930779457093, "loss/logits": 3.2549287557601927, "step": 450 }, { "epoch": 0.0046, "grad_norm": 26.0, "grad_norm_var": 13.198372395833333, "learning_rate": 0.00015419999999999998, "loss": 37.8807, "loss/aux_loss": 0.048509182222187516, "loss/crossentropy": 5.24229850769043, "loss/logits": 3.2370536804199217, "step": 460 }, { "epoch": 0.0047, "grad_norm": 26.375, "grad_norm_var": 10.068489583333333, "learning_rate": 0.0001569, "loss": 37.1278, "loss/aux_loss": 0.048433386348187925, "loss/crossentropy": 5.20349223613739, "loss/logits": 3.114039051532745, "step": 470 }, { "epoch": 0.0048, "grad_norm": 20.875, "grad_norm_var": 11.547916666666667, "learning_rate": 0.0001596, "loss": 36.9174, "loss/aux_loss": 0.04847547374665737, "loss/crossentropy": 4.9283219337463375, "loss/logits": 3.213498628139496, "step": 480 }, { "epoch": 0.0049, "grad_norm": 21.625, "grad_norm_var": 7.476041666666666, "learning_rate": 0.0001623, "loss": 36.5318, "loss/aux_loss": 0.048416960053145885, "loss/crossentropy": 5.040558886528015, "loss/logits": 3.13973091840744, "step": 490 }, { "epoch": 0.005, "grad_norm": 22.5, "grad_norm_var": 8.258072916666666, "learning_rate": 0.000165, "loss": 36.6402, "loss/aux_loss": 0.048433396965265274, "loss/crossentropy": 5.000589728355408, "loss/logits": 3.196159243583679, "step": 500 }, { "epoch": 0.0051, "grad_norm": 25.375, "grad_norm_var": 17.601497395833334, "learning_rate": 0.0001677, "loss": 36.0775, "loss/aux_loss": 0.048407428339123725, "loss/crossentropy": 5.022399640083313, "loss/logits": 3.1573411226272583, "step": 510 }, { "epoch": 0.0052, "grad_norm": 26.25, "grad_norm_var": 11.326497395833334, "learning_rate": 0.0001704, "loss": 35.8341, "loss/aux_loss": 0.04850205350667238, "loss/crossentropy": 5.029168057441711, "loss/logits": 3.0023858308792115, "step": 520 }, { "epoch": 0.0053, "grad_norm": 34.75, "grad_norm_var": 26.2072265625, "learning_rate": 0.0001731, "loss": 35.7083, "loss/aux_loss": 0.048469410836696626, "loss/crossentropy": 4.937405061721802, "loss/logits": 3.096103620529175, "step": 530 }, { "epoch": 0.0054, "grad_norm": 26.5, "grad_norm_var": 26.883333333333333, "learning_rate": 0.00017580000000000002, "loss": 35.1926, "loss/aux_loss": 0.04851417765021324, "loss/crossentropy": 4.968003535270691, "loss/logits": 3.037220096588135, "step": 540 }, { "epoch": 0.0055, "grad_norm": 21.75, "grad_norm_var": 3.1494140625, "learning_rate": 0.0001785, "loss": 34.8693, "loss/aux_loss": 0.048468691483139995, "loss/crossentropy": 4.928069758415222, "loss/logits": 3.0163326144218443, "step": 550 }, { "epoch": 0.0056, "grad_norm": 21.125, "grad_norm_var": 40.6625, "learning_rate": 0.0001812, "loss": 34.8376, "loss/aux_loss": 0.04853217788040638, "loss/crossentropy": 4.8081374049186705, "loss/logits": 2.9309885263442994, "step": 560 }, { "epoch": 0.0057, "grad_norm": 21.125, "grad_norm_var": 9.170572916666666, "learning_rate": 0.00018389999999999997, "loss": 34.4132, "loss/aux_loss": 0.04839835949242115, "loss/crossentropy": 4.890771484375, "loss/logits": 2.9162360787391663, "step": 570 }, { "epoch": 0.0058, "grad_norm": 24.75, "grad_norm_var": 5.412955729166667, "learning_rate": 0.00018659999999999998, "loss": 33.9858, "loss/aux_loss": 0.04839918464422226, "loss/crossentropy": 4.828824257850647, "loss/logits": 2.9052307963371278, "step": 580 }, { "epoch": 0.0059, "grad_norm": 102.5, "grad_norm_var": 448.30104166666666, "learning_rate": 0.0001893, "loss": 34.3014, "loss/aux_loss": 0.04844543803483248, "loss/crossentropy": 4.836876845359802, "loss/logits": 2.9816999673843383, "step": 590 }, { "epoch": 0.006, "grad_norm": 23.0, "grad_norm_var": 839.7497395833333, "learning_rate": 0.00019199999999999998, "loss": 34.1366, "loss/aux_loss": 0.0485780967399478, "loss/crossentropy": 4.918647742271423, "loss/logits": 3.028424918651581, "step": 600 }, { "epoch": 0.0061, "grad_norm": 19.5, "grad_norm_var": 14.370247395833333, "learning_rate": 0.0001947, "loss": 33.7583, "loss/aux_loss": 0.04842391442507506, "loss/crossentropy": 4.706963205337525, "loss/logits": 2.918571615219116, "step": 610 }, { "epoch": 0.0062, "grad_norm": 16.0, "grad_norm_var": 6.815625, "learning_rate": 0.0001974, "loss": 33.1779, "loss/aux_loss": 0.04836068209260702, "loss/crossentropy": 4.702796244621277, "loss/logits": 2.8033588767051696, "step": 620 }, { "epoch": 0.0063, "grad_norm": 24.75, "grad_norm_var": 9.648030598958334, "learning_rate": 0.00020009999999999998, "loss": 32.6916, "loss/aux_loss": 0.04836873207241297, "loss/crossentropy": 4.663065433502197, "loss/logits": 2.7192453861236574, "step": 630 }, { "epoch": 0.0064, "grad_norm": 28.5, "grad_norm_var": 13.41875, "learning_rate": 0.0002028, "loss": 32.5747, "loss/aux_loss": 0.048406153731048104, "loss/crossentropy": 4.850475025177002, "loss/logits": 2.844682276248932, "step": 640 }, { "epoch": 0.0065, "grad_norm": 15.1875, "grad_norm_var": 10.483707682291667, "learning_rate": 0.0002055, "loss": 32.627, "loss/aux_loss": 0.04839936923235655, "loss/crossentropy": 4.642724204063415, "loss/logits": 2.7970473051071165, "step": 650 }, { "epoch": 0.0066, "grad_norm": 17.25, "grad_norm_var": 9.181103515625, "learning_rate": 0.0002082, "loss": 31.9502, "loss/aux_loss": 0.04840312860906124, "loss/crossentropy": 4.64382244348526, "loss/logits": 2.7651517271995543, "step": 660 }, { "epoch": 0.0067, "grad_norm": 22.0, "grad_norm_var": 6.279166666666667, "learning_rate": 0.0002109, "loss": 31.5068, "loss/aux_loss": 0.048387892358005044, "loss/crossentropy": 4.641875433921814, "loss/logits": 2.7343064188957213, "step": 670 }, { "epoch": 0.0068, "grad_norm": 20.0, "grad_norm_var": 5.815348307291667, "learning_rate": 0.00021360000000000001, "loss": 30.7349, "loss/aux_loss": 0.04838373064994812, "loss/crossentropy": 4.57262305021286, "loss/logits": 2.6575307488441466, "step": 680 }, { "epoch": 0.0069, "grad_norm": 21.25, "grad_norm_var": 3.3708170572916667, "learning_rate": 0.00021629999999999997, "loss": 30.9303, "loss/aux_loss": 0.048367501422762874, "loss/crossentropy": 4.517275846004486, "loss/logits": 2.7227562189102175, "step": 690 }, { "epoch": 0.007, "grad_norm": 17.75, "grad_norm_var": 5.214322916666666, "learning_rate": 0.00021899999999999998, "loss": 30.7433, "loss/aux_loss": 0.048321043699979783, "loss/crossentropy": 4.465225088596344, "loss/logits": 2.628221809864044, "step": 700 }, { "epoch": 0.0071, "grad_norm": 18.125, "grad_norm_var": 5.217643229166667, "learning_rate": 0.00022169999999999997, "loss": 30.6391, "loss/aux_loss": 0.04836261495947838, "loss/crossentropy": 4.5598583102226256, "loss/logits": 2.5861354947090147, "step": 710 }, { "epoch": 0.0072, "grad_norm": 18.625, "grad_norm_var": 14.745247395833333, "learning_rate": 0.00022439999999999998, "loss": 30.0185, "loss/aux_loss": 0.048368556424975395, "loss/crossentropy": 4.439025247097016, "loss/logits": 2.484178614616394, "step": 720 }, { "epoch": 0.0073, "grad_norm": 20.625, "grad_norm_var": 8.686458333333333, "learning_rate": 0.0002271, "loss": 29.7983, "loss/aux_loss": 0.048322527296841146, "loss/crossentropy": 4.3540124773979185, "loss/logits": 2.446344316005707, "step": 730 }, { "epoch": 0.0074, "grad_norm": 17.375, "grad_norm_var": 5.894124348958333, "learning_rate": 0.00022979999999999997, "loss": 29.5599, "loss/aux_loss": 0.04832367654889822, "loss/crossentropy": 4.300390827655792, "loss/logits": 2.501788628101349, "step": 740 }, { "epoch": 0.0075, "grad_norm": 14.5, "grad_norm_var": 6.899739583333333, "learning_rate": 0.00023249999999999999, "loss": 29.1483, "loss/aux_loss": 0.04832951854914427, "loss/crossentropy": 4.52186803817749, "loss/logits": 2.4985528230667113, "step": 750 }, { "epoch": 0.0076, "grad_norm": 17.875, "grad_norm_var": 4.874332682291667, "learning_rate": 0.0002352, "loss": 29.0176, "loss/aux_loss": 0.04831754751503468, "loss/crossentropy": 4.319947266578675, "loss/logits": 2.37314190864563, "step": 760 }, { "epoch": 0.0077, "grad_norm": 18.75, "grad_norm_var": 4.414518229166666, "learning_rate": 0.00023789999999999998, "loss": 28.4552, "loss/aux_loss": 0.04835870675742626, "loss/crossentropy": 4.228903424739838, "loss/logits": 2.382171905040741, "step": 770 }, { "epoch": 0.0078, "grad_norm": 17.875, "grad_norm_var": 4.404622395833333, "learning_rate": 0.0002406, "loss": 27.9477, "loss/aux_loss": 0.048351569660007955, "loss/crossentropy": 4.279499888420105, "loss/logits": 2.3113824844360353, "step": 780 }, { "epoch": 0.0079, "grad_norm": 14.5, "grad_norm_var": 4.849934895833333, "learning_rate": 0.0002433, "loss": 28.1858, "loss/aux_loss": 0.04831267800182104, "loss/crossentropy": 4.268010532855987, "loss/logits": 2.357981026172638, "step": 790 }, { "epoch": 0.008, "grad_norm": 17.875, "grad_norm_var": 5.742822265625, "learning_rate": 0.000246, "loss": 27.944, "loss/aux_loss": 0.04835358560085297, "loss/crossentropy": 4.222308611869812, "loss/logits": 2.316913056373596, "step": 800 }, { "epoch": 0.0081, "grad_norm": 14.3125, "grad_norm_var": 5.843343098958333, "learning_rate": 0.0002487, "loss": 27.446, "loss/aux_loss": 0.048313943669199944, "loss/crossentropy": 4.209726583957672, "loss/logits": 2.374450123310089, "step": 810 }, { "epoch": 0.0082, "grad_norm": 20.0, "grad_norm_var": 14.9884765625, "learning_rate": 0.0002514, "loss": 27.673, "loss/aux_loss": 0.04833365194499493, "loss/crossentropy": 4.135816490650177, "loss/logits": 2.3683163046836855, "step": 820 }, { "epoch": 0.0083, "grad_norm": 20.0, "grad_norm_var": 14.966650390625, "learning_rate": 0.0002541, "loss": 27.0603, "loss/aux_loss": 0.04834430795162916, "loss/crossentropy": 4.212264752388, "loss/logits": 2.292864066362381, "step": 830 }, { "epoch": 0.0084, "grad_norm": 13.0, "grad_norm_var": 6.715869140625, "learning_rate": 0.00025679999999999995, "loss": 26.9205, "loss/aux_loss": 0.048344089090824126, "loss/crossentropy": 4.146613943576813, "loss/logits": 2.337345379590988, "step": 840 }, { "epoch": 0.0085, "grad_norm": 38.0, "grad_norm_var": 36.98123372395833, "learning_rate": 0.00025949999999999997, "loss": 26.9794, "loss/aux_loss": 0.04836427103728056, "loss/crossentropy": 4.061418402194977, "loss/logits": 2.285140597820282, "step": 850 }, { "epoch": 0.0086, "grad_norm": 16.125, "grad_norm_var": 29.924723307291668, "learning_rate": 0.0002622, "loss": 26.9251, "loss/aux_loss": 0.048360053822398184, "loss/crossentropy": 4.185706174373626, "loss/logits": 2.221945381164551, "step": 860 }, { "epoch": 0.0087, "grad_norm": 16.875, "grad_norm_var": 4.328889973958334, "learning_rate": 0.0002649, "loss": 26.3776, "loss/aux_loss": 0.04831914566457272, "loss/crossentropy": 4.120305705070495, "loss/logits": 2.215441507101059, "step": 870 }, { "epoch": 0.0088, "grad_norm": 14.75, "grad_norm_var": 1.937353515625, "learning_rate": 0.0002676, "loss": 26.4197, "loss/aux_loss": 0.04831472560763359, "loss/crossentropy": 4.091458034515381, "loss/logits": 2.2590562105178833, "step": 880 }, { "epoch": 0.0089, "grad_norm": 17.25, "grad_norm_var": 5.08828125, "learning_rate": 0.00027029999999999996, "loss": 26.5735, "loss/aux_loss": 0.048319687880575654, "loss/crossentropy": 4.131750977039337, "loss/logits": 2.2907270908355715, "step": 890 }, { "epoch": 0.009, "grad_norm": 16.875, "grad_norm_var": 2.8785807291666665, "learning_rate": 0.00027299999999999997, "loss": 25.9573, "loss/aux_loss": 0.04829480424523354, "loss/crossentropy": 4.1353423476219175, "loss/logits": 2.2223441004753113, "step": 900 }, { "epoch": 0.0091, "grad_norm": 15.8125, "grad_norm_var": 1.7628743489583334, "learning_rate": 0.0002757, "loss": 25.6367, "loss/aux_loss": 0.0482923174276948, "loss/crossentropy": 4.0344107985496525, "loss/logits": 2.1614388108253477, "step": 910 }, { "epoch": 0.0092, "grad_norm": 19.125, "grad_norm_var": 5.126822916666667, "learning_rate": 0.0002784, "loss": 25.4326, "loss/aux_loss": 0.0483014602214098, "loss/crossentropy": 3.857894313335419, "loss/logits": 2.1173263430595397, "step": 920 }, { "epoch": 0.0093, "grad_norm": 17.5, "grad_norm_var": 3.0747395833333333, "learning_rate": 0.0002811, "loss": 24.9668, "loss/aux_loss": 0.04832738190889359, "loss/crossentropy": 3.821620452404022, "loss/logits": 2.0369732558727263, "step": 930 }, { "epoch": 0.0094, "grad_norm": 15.375, "grad_norm_var": 3.349072265625, "learning_rate": 0.00028379999999999996, "loss": 25.2724, "loss/aux_loss": 0.04831767976284027, "loss/crossentropy": 4.015332496166229, "loss/logits": 2.054348534345627, "step": 940 }, { "epoch": 0.0095, "grad_norm": 13.875, "grad_norm_var": 2.811572265625, "learning_rate": 0.00028649999999999997, "loss": 24.9269, "loss/aux_loss": 0.04830477572977543, "loss/crossentropy": 3.9600290179252626, "loss/logits": 2.0728322982788088, "step": 950 }, { "epoch": 0.0096, "grad_norm": 12.125, "grad_norm_var": 2.8313639322916666, "learning_rate": 0.0002892, "loss": 24.9397, "loss/aux_loss": 0.04828764032572508, "loss/crossentropy": 3.9735502004623413, "loss/logits": 2.0897044599056245, "step": 960 }, { "epoch": 0.0097, "grad_norm": 12.9375, "grad_norm_var": 3.466910807291667, "learning_rate": 0.0002919, "loss": 25.0229, "loss/aux_loss": 0.04828515090048313, "loss/crossentropy": 3.849641752243042, "loss/logits": 2.0807200193405153, "step": 970 }, { "epoch": 0.0098, "grad_norm": 17.125, "grad_norm_var": 2.0829264322916665, "learning_rate": 0.00029459999999999995, "loss": 24.5474, "loss/aux_loss": 0.04829510189592838, "loss/crossentropy": 3.926040601730347, "loss/logits": 1.9675580561161041, "step": 980 }, { "epoch": 0.0099, "grad_norm": 14.1875, "grad_norm_var": 2.063134765625, "learning_rate": 0.00029729999999999996, "loss": 24.7495, "loss/aux_loss": 0.04827150721102953, "loss/crossentropy": 3.920617640018463, "loss/logits": 2.0571080267429354, "step": 990 }, { "epoch": 0.01, "grad_norm": 23.125, "grad_norm_var": 9.017041015625, "learning_rate": 0.0003, "loss": 24.6181, "loss/aux_loss": 0.04830240122973919, "loss/crossentropy": 3.960080420970917, "loss/logits": 2.0806682467460633, "step": 1000 }, { "epoch": 0.0101, "grad_norm": 13.625, "grad_norm_var": 7.297330729166666, "learning_rate": 0.0003, "loss": 24.5401, "loss/aux_loss": 0.048330770991742614, "loss/crossentropy": 3.909256339073181, "loss/logits": 2.0541693389415743, "step": 1010 }, { "epoch": 0.0102, "grad_norm": 13.75, "grad_norm_var": 2.5978515625, "learning_rate": 0.0003, "loss": 24.0457, "loss/aux_loss": 0.04828005637973547, "loss/crossentropy": 4.0945284247398375, "loss/logits": 2.0571001410484313, "step": 1020 }, { "epoch": 0.0103, "grad_norm": 12.0, "grad_norm_var": 2.8195149739583334, "learning_rate": 0.0003, "loss": 23.9983, "loss/aux_loss": 0.048290212824940684, "loss/crossentropy": 3.792713475227356, "loss/logits": 1.9736050605773925, "step": 1030 }, { "epoch": 0.0104, "grad_norm": 14.4375, "grad_norm_var": 45.916650390625, "learning_rate": 0.0003, "loss": 23.7592, "loss/aux_loss": 0.048343191482126714, "loss/crossentropy": 3.667546308040619, "loss/logits": 1.9718676209449768, "step": 1040 }, { "epoch": 0.0105, "grad_norm": 13.75, "grad_norm_var": 4.7306640625, "learning_rate": 0.0003, "loss": 23.9655, "loss/aux_loss": 0.04828641843050718, "loss/crossentropy": 3.918486988544464, "loss/logits": 2.0048129856586456, "step": 1050 }, { "epoch": 0.0106, "grad_norm": 15.0, "grad_norm_var": 1.9869140625, "learning_rate": 0.0003, "loss": 23.6091, "loss/aux_loss": 0.048306448943912984, "loss/crossentropy": 3.855974185466766, "loss/logits": 1.956015944480896, "step": 1060 }, { "epoch": 0.0107, "grad_norm": 13.125, "grad_norm_var": 1.4332682291666667, "learning_rate": 0.0003, "loss": 23.576, "loss/aux_loss": 0.048309461772441865, "loss/crossentropy": 3.5664370179176332, "loss/logits": 1.9399469137191772, "step": 1070 }, { "epoch": 0.0108, "grad_norm": 16.875, "grad_norm_var": 96.96066080729166, "learning_rate": 0.0003, "loss": 23.5042, "loss/aux_loss": 0.04829024374485016, "loss/crossentropy": 3.9391483783721926, "loss/logits": 2.0180298566818236, "step": 1080 }, { "epoch": 0.0109, "grad_norm": 14.25, "grad_norm_var": 99.06608072916667, "learning_rate": 0.0003, "loss": 23.2801, "loss/aux_loss": 0.04827498830854893, "loss/crossentropy": 3.925715708732605, "loss/logits": 1.9511402130126954, "step": 1090 }, { "epoch": 0.011, "grad_norm": 11.875, "grad_norm_var": 1.5015462239583333, "learning_rate": 0.0003, "loss": 23.2888, "loss/aux_loss": 0.04827521629631519, "loss/crossentropy": 4.049439036846161, "loss/logits": 1.9741652667522431, "step": 1100 }, { "epoch": 0.0111, "grad_norm": 13.5, "grad_norm_var": 3.6861979166666665, "learning_rate": 0.0003, "loss": 22.8228, "loss/aux_loss": 0.048285826854407785, "loss/crossentropy": 3.7126415371894836, "loss/logits": 1.8875436723232268, "step": 1110 }, { "epoch": 0.0112, "grad_norm": 15.125, "grad_norm_var": 3.2712076822916667, "learning_rate": 0.0003, "loss": 22.8436, "loss/aux_loss": 0.048280049860477445, "loss/crossentropy": 3.875200855731964, "loss/logits": 1.8699533224105835, "step": 1120 }, { "epoch": 0.0113, "grad_norm": 12.875, "grad_norm_var": 1.5421712239583334, "learning_rate": 0.0003, "loss": 22.9724, "loss/aux_loss": 0.04830393195152283, "loss/crossentropy": 3.7354134917259216, "loss/logits": 1.9599017381668091, "step": 1130 }, { "epoch": 0.0114, "grad_norm": 11.1875, "grad_norm_var": 1.6598307291666667, "learning_rate": 0.0003, "loss": 22.91, "loss/aux_loss": 0.04829528890550137, "loss/crossentropy": 3.832562971115112, "loss/logits": 1.9021077275276184, "step": 1140 }, { "epoch": 0.0115, "grad_norm": 12.0, "grad_norm_var": 2.373551432291667, "learning_rate": 0.0003, "loss": 22.5944, "loss/aux_loss": 0.04828084670007229, "loss/crossentropy": 3.8583874821662905, "loss/logits": 1.9061977505683898, "step": 1150 }, { "epoch": 0.0116, "grad_norm": 11.6875, "grad_norm_var": 4.650374348958334, "learning_rate": 0.0003, "loss": 22.6571, "loss/aux_loss": 0.04829124473035336, "loss/crossentropy": 3.729883003234863, "loss/logits": 1.8983563661575318, "step": 1160 }, { "epoch": 0.0117, "grad_norm": 12.75, "grad_norm_var": 4.216080729166666, "learning_rate": 0.0003, "loss": 22.5304, "loss/aux_loss": 0.0483067661523819, "loss/crossentropy": 3.8876662373542787, "loss/logits": 1.8905851602554322, "step": 1170 }, { "epoch": 0.0118, "grad_norm": 12.1875, "grad_norm_var": 1.5910807291666667, "learning_rate": 0.0003, "loss": 22.2809, "loss/aux_loss": 0.048292340524494645, "loss/crossentropy": 3.9721433520317078, "loss/logits": 1.8897149801254272, "step": 1180 }, { "epoch": 0.0119, "grad_norm": 14.4375, "grad_norm_var": 7.739322916666667, "learning_rate": 0.0003, "loss": 22.4589, "loss/aux_loss": 0.048297750391066076, "loss/crossentropy": 3.6948838114738463, "loss/logits": 1.8489306330680848, "step": 1190 }, { "epoch": 0.012, "grad_norm": 10.5, "grad_norm_var": 7.207666015625, "learning_rate": 0.0003, "loss": 22.2067, "loss/aux_loss": 0.048272774554789066, "loss/crossentropy": 3.913854885101318, "loss/logits": 1.861431396007538, "step": 1200 }, { "epoch": 0.0121, "grad_norm": 15.3125, "grad_norm_var": 5.213655598958334, "learning_rate": 0.0003, "loss": 22.2212, "loss/aux_loss": 0.04833245109766722, "loss/crossentropy": 3.696351206302643, "loss/logits": 1.8378067016601562, "step": 1210 }, { "epoch": 0.0122, "grad_norm": 9.9375, "grad_norm_var": 3.999853515625, "learning_rate": 0.0003, "loss": 22.0734, "loss/aux_loss": 0.04830023720860481, "loss/crossentropy": 3.807795548439026, "loss/logits": 1.8107618153095246, "step": 1220 }, { "epoch": 0.0123, "grad_norm": 12.5, "grad_norm_var": 12.50078125, "learning_rate": 0.0003, "loss": 21.7587, "loss/aux_loss": 0.04829862117767334, "loss/crossentropy": 3.750839185714722, "loss/logits": 1.794652533531189, "step": 1230 }, { "epoch": 0.0124, "grad_norm": 12.375, "grad_norm_var": 115.42858072916667, "learning_rate": 0.0003, "loss": 21.93, "loss/aux_loss": 0.048292195051908494, "loss/crossentropy": 3.7465561628341675, "loss/logits": 1.797796505689621, "step": 1240 }, { "epoch": 0.0125, "grad_norm": 12.25, "grad_norm_var": 186.75416666666666, "learning_rate": 0.0003, "loss": 21.953, "loss/aux_loss": 0.04834472518414259, "loss/crossentropy": 3.6869328737258913, "loss/logits": 1.797852247953415, "step": 1250 }, { "epoch": 0.0126, "grad_norm": 9.75, "grad_norm_var": 1.7327473958333333, "learning_rate": 0.0003, "loss": 21.9868, "loss/aux_loss": 0.048277279175817964, "loss/crossentropy": 3.6617552042007446, "loss/logits": 1.7641812562942505, "step": 1260 }, { "epoch": 0.0127, "grad_norm": 10.75, "grad_norm_var": 1.4202473958333333, "learning_rate": 0.0003, "loss": 21.6879, "loss/aux_loss": 0.04827971309423447, "loss/crossentropy": 3.4563692212104797, "loss/logits": 1.7538020849227904, "step": 1270 }, { "epoch": 0.0128, "grad_norm": 9.6875, "grad_norm_var": 0.5093098958333333, "learning_rate": 0.0003, "loss": 21.5679, "loss/aux_loss": 0.048257603868842126, "loss/crossentropy": 3.737559175491333, "loss/logits": 1.8031953394412994, "step": 1280 }, { "epoch": 0.0129, "grad_norm": 10.8125, "grad_norm_var": 3.0283854166666666, "learning_rate": 0.0003, "loss": 21.801, "loss/aux_loss": 0.0482830997556448, "loss/crossentropy": 3.788530111312866, "loss/logits": 1.8610890209674835, "step": 1290 }, { "epoch": 0.013, "grad_norm": 10.3125, "grad_norm_var": 0.363134765625, "learning_rate": 0.0003, "loss": 21.5052, "loss/aux_loss": 0.04827403090894222, "loss/crossentropy": 3.7146639943122866, "loss/logits": 1.8084448158740998, "step": 1300 }, { "epoch": 0.0131, "grad_norm": 10.3125, "grad_norm_var": 0.35885416666666664, "learning_rate": 0.0003, "loss": 21.1896, "loss/aux_loss": 0.048255456425249574, "loss/crossentropy": 3.6508117794990538, "loss/logits": 1.7647499084472655, "step": 1310 }, { "epoch": 0.0132, "grad_norm": 10.75, "grad_norm_var": 48.28553059895833, "learning_rate": 0.0003, "loss": 21.2771, "loss/aux_loss": 0.04835470654070377, "loss/crossentropy": 3.6754523515701294, "loss/logits": 1.7459341287612915, "step": 1320 }, { "epoch": 0.0133, "grad_norm": 9.75, "grad_norm_var": 18.715348307291666, "learning_rate": 0.0003, "loss": 21.4809, "loss/aux_loss": 0.04828258771449327, "loss/crossentropy": 3.671703588962555, "loss/logits": 1.753785401582718, "step": 1330 }, { "epoch": 0.0134, "grad_norm": 10.375, "grad_norm_var": 0.9012858072916666, "learning_rate": 0.0003, "loss": 21.3825, "loss/aux_loss": 0.048258156329393384, "loss/crossentropy": 3.691448616981506, "loss/logits": 1.7658027529716491, "step": 1340 }, { "epoch": 0.0135, "grad_norm": 8.4375, "grad_norm_var": 0.6473307291666667, "learning_rate": 0.0003, "loss": 21.3845, "loss/aux_loss": 0.04825436770915985, "loss/crossentropy": 3.645776665210724, "loss/logits": 1.7162048041820526, "step": 1350 }, { "epoch": 0.0136, "grad_norm": 10.0625, "grad_norm_var": 0.58984375, "learning_rate": 0.0003, "loss": 20.7527, "loss/aux_loss": 0.04827475063502788, "loss/crossentropy": 3.6466134548187257, "loss/logits": 1.694813996553421, "step": 1360 }, { "epoch": 0.0137, "grad_norm": 10.0625, "grad_norm_var": 0.4046223958333333, "learning_rate": 0.0003, "loss": 21.017, "loss/aux_loss": 0.04827818218618631, "loss/crossentropy": 3.7088001132011414, "loss/logits": 1.70878404378891, "step": 1370 }, { "epoch": 0.0138, "grad_norm": 11.875, "grad_norm_var": 0.70703125, "learning_rate": 0.0003, "loss": 20.9997, "loss/aux_loss": 0.04826367888599634, "loss/crossentropy": 3.7648239493370057, "loss/logits": 1.7505885064601898, "step": 1380 }, { "epoch": 0.0139, "grad_norm": 9.5625, "grad_norm_var": 0.7249348958333334, "learning_rate": 0.0003, "loss": 20.798, "loss/aux_loss": 0.04830121118575335, "loss/crossentropy": 3.696249544620514, "loss/logits": 1.7071794509887694, "step": 1390 }, { "epoch": 0.014, "grad_norm": 8.875, "grad_norm_var": 0.7769368489583334, "learning_rate": 0.0003, "loss": 21.0944, "loss/aux_loss": 0.04826546385884285, "loss/crossentropy": 3.6206825494766237, "loss/logits": 1.766976636648178, "step": 1400 }, { "epoch": 0.0141, "grad_norm": 9.0625, "grad_norm_var": 0.6129557291666666, "learning_rate": 0.0003, "loss": 20.5101, "loss/aux_loss": 0.048279773257672784, "loss/crossentropy": 3.5249340176582336, "loss/logits": 1.6939750254154204, "step": 1410 }, { "epoch": 0.0142, "grad_norm": 9.25, "grad_norm_var": 1.0231608072916667, "learning_rate": 0.0003, "loss": 20.5613, "loss/aux_loss": 0.04827423859387636, "loss/crossentropy": 3.45823814868927, "loss/logits": 1.6488157391548157, "step": 1420 }, { "epoch": 0.0143, "grad_norm": 9.125, "grad_norm_var": 1.0061848958333333, "learning_rate": 0.0003, "loss": 20.6672, "loss/aux_loss": 0.048273424990475176, "loss/crossentropy": 3.5580545544624327, "loss/logits": 1.6708523690700532, "step": 1430 }, { "epoch": 0.0144, "grad_norm": 9.5625, "grad_norm_var": 0.5536295572916666, "learning_rate": 0.0003, "loss": 20.7267, "loss/aux_loss": 0.048252567276358606, "loss/crossentropy": 3.50860835313797, "loss/logits": 1.690982359647751, "step": 1440 }, { "epoch": 0.0145, "grad_norm": 9.0, "grad_norm_var": 0.4879557291666667, "learning_rate": 0.0003, "loss": 20.3679, "loss/aux_loss": 0.048260470107197764, "loss/crossentropy": 3.5979798078536986, "loss/logits": 1.727076655626297, "step": 1450 }, { "epoch": 0.0146, "grad_norm": 9.5625, "grad_norm_var": 0.6821451822916667, "learning_rate": 0.0003, "loss": 20.5641, "loss/aux_loss": 0.04824462234973907, "loss/crossentropy": 3.473064345121384, "loss/logits": 1.6370813488960265, "step": 1460 }, { "epoch": 0.0147, "grad_norm": 9.25, "grad_norm_var": 0.5143229166666666, "learning_rate": 0.0003, "loss": 20.4817, "loss/aux_loss": 0.04825858902186155, "loss/crossentropy": 3.680207347869873, "loss/logits": 1.7349261403083802, "step": 1470 }, { "epoch": 0.0148, "grad_norm": 10.0625, "grad_norm_var": 168.77784830729166, "learning_rate": 0.0003, "loss": 20.4344, "loss/aux_loss": 0.04829352758824825, "loss/crossentropy": 3.643856203556061, "loss/logits": 1.6838299632072449, "step": 1480 }, { "epoch": 0.0149, "grad_norm": 10.0, "grad_norm_var": 164.6166015625, "learning_rate": 0.0003, "loss": 20.2984, "loss/aux_loss": 0.04828519467264414, "loss/crossentropy": 3.491868484020233, "loss/logits": 1.6502962768077851, "step": 1490 }, { "epoch": 0.015, "grad_norm": 9.3125, "grad_norm_var": 1.26796875, "learning_rate": 0.0003, "loss": 20.2734, "loss/aux_loss": 0.0482696495950222, "loss/crossentropy": 3.54322612285614, "loss/logits": 1.6839805364608764, "step": 1500 }, { "epoch": 0.0151, "grad_norm": 9.6875, "grad_norm_var": 1.3467732747395833, "learning_rate": 0.0003, "loss": 20.223, "loss/aux_loss": 0.048268103040754795, "loss/crossentropy": 3.591710591316223, "loss/logits": 1.6459056198596955, "step": 1510 }, { "epoch": 0.0152, "grad_norm": 9.5625, "grad_norm_var": 0.30625, "learning_rate": 0.0003, "loss": 19.9556, "loss/aux_loss": 0.04826340805739164, "loss/crossentropy": 3.663398194313049, "loss/logits": 1.6375055193901062, "step": 1520 }, { "epoch": 0.0153, "grad_norm": 9.6875, "grad_norm_var": 8.3697265625, "learning_rate": 0.0003, "loss": 19.9875, "loss/aux_loss": 0.048267958126962184, "loss/crossentropy": 3.429060697555542, "loss/logits": 1.5783089220523834, "step": 1530 }, { "epoch": 0.0154, "grad_norm": 7.96875, "grad_norm_var": 9.10787353515625, "learning_rate": 0.0003, "loss": 19.7039, "loss/aux_loss": 0.048261369951069354, "loss/crossentropy": 3.662845695018768, "loss/logits": 1.6366191446781158, "step": 1540 }, { "epoch": 0.0155, "grad_norm": 8.1875, "grad_norm_var": 37.75735270182292, "learning_rate": 0.0003, "loss": 19.9279, "loss/aux_loss": 0.048299112170934674, "loss/crossentropy": 3.5160138845443725, "loss/logits": 1.613296240568161, "step": 1550 }, { "epoch": 0.0156, "grad_norm": 8.875, "grad_norm_var": 0.6832967122395833, "learning_rate": 0.0003, "loss": 19.8753, "loss/aux_loss": 0.04826027043163776, "loss/crossentropy": 3.6247658729553223, "loss/logits": 1.606148999929428, "step": 1560 }, { "epoch": 0.0157, "grad_norm": 8.75, "grad_norm_var": 6.793082682291667, "learning_rate": 0.0003, "loss": 19.7959, "loss/aux_loss": 0.04825771022588014, "loss/crossentropy": 3.3447017312049865, "loss/logits": 1.5702200174331664, "step": 1570 }, { "epoch": 0.0158, "grad_norm": 8.6875, "grad_norm_var": 6.344645182291667, "learning_rate": 0.0003, "loss": 19.8071, "loss/aux_loss": 0.048260610550642014, "loss/crossentropy": 3.414109396934509, "loss/logits": 1.6106845080852508, "step": 1580 }, { "epoch": 0.0159, "grad_norm": 8.125, "grad_norm_var": 0.25833333333333336, "learning_rate": 0.0003, "loss": 19.716, "loss/aux_loss": 0.048247416689991954, "loss/crossentropy": 3.4498027682304384, "loss/logits": 1.621438193321228, "step": 1590 }, { "epoch": 0.016, "grad_norm": 8.5625, "grad_norm_var": 0.28177083333333336, "learning_rate": 0.0003, "loss": 19.6554, "loss/aux_loss": 0.048260800912976264, "loss/crossentropy": 3.564990556240082, "loss/logits": 1.6194686591625214, "step": 1600 }, { "epoch": 0.0161, "grad_norm": 9.75, "grad_norm_var": 0.21692708333333333, "learning_rate": 0.0003, "loss": 19.6566, "loss/aux_loss": 0.048259117268025876, "loss/crossentropy": 3.4118771314620973, "loss/logits": 1.6117245256900787, "step": 1610 }, { "epoch": 0.0162, "grad_norm": 8.6875, "grad_norm_var": 1.2012858072916666, "learning_rate": 0.0003, "loss": 19.2843, "loss/aux_loss": 0.04827672149986029, "loss/crossentropy": 3.3465479731559755, "loss/logits": 1.5816888511180878, "step": 1620 }, { "epoch": 0.0163, "grad_norm": 8.6875, "grad_norm_var": 0.2676920572916667, "learning_rate": 0.0003, "loss": 19.4599, "loss/aux_loss": 0.048250201344490054, "loss/crossentropy": 3.515201151371002, "loss/logits": 1.5413510143756866, "step": 1630 }, { "epoch": 0.0164, "grad_norm": 8.4375, "grad_norm_var": 0.42831624348958336, "learning_rate": 0.0003, "loss": 19.4, "loss/aux_loss": 0.04824144206941128, "loss/crossentropy": 3.4953723192214965, "loss/logits": 1.6243361711502076, "step": 1640 }, { "epoch": 0.0165, "grad_norm": 8.125, "grad_norm_var": 0.42760009765625, "learning_rate": 0.0003, "loss": 19.5616, "loss/aux_loss": 0.04824843630194664, "loss/crossentropy": 3.373341774940491, "loss/logits": 1.5757689416408538, "step": 1650 }, { "epoch": 0.0166, "grad_norm": 9.3125, "grad_norm_var": 0.18840738932291667, "learning_rate": 0.0003, "loss": 19.2866, "loss/aux_loss": 0.04825681522488594, "loss/crossentropy": 3.339651143550873, "loss/logits": 1.5976973354816437, "step": 1660 }, { "epoch": 0.0167, "grad_norm": 9.125, "grad_norm_var": 5.00054931640625, "learning_rate": 0.0003, "loss": 19.5115, "loss/aux_loss": 0.04826808106154203, "loss/crossentropy": 3.5275720238685606, "loss/logits": 1.5411208510398864, "step": 1670 }, { "epoch": 0.0168, "grad_norm": 10.1875, "grad_norm_var": 47.442952473958336, "learning_rate": 0.0003, "loss": 19.3194, "loss/aux_loss": 0.048269005678594115, "loss/crossentropy": 3.509286916255951, "loss/logits": 1.5299911737442016, "step": 1680 }, { "epoch": 0.0169, "grad_norm": 8.0625, "grad_norm_var": 45.85546875, "learning_rate": 0.0003, "loss": 19.2647, "loss/aux_loss": 0.048268322832882404, "loss/crossentropy": 3.4770318508148192, "loss/logits": 1.552085292339325, "step": 1690 }, { "epoch": 0.017, "grad_norm": 8.375, "grad_norm_var": 0.31067301432291666, "learning_rate": 0.0003, "loss": 19.1182, "loss/aux_loss": 0.048229466564953326, "loss/crossentropy": 3.3933613896369934, "loss/logits": 1.5347690343856812, "step": 1700 }, { "epoch": 0.0171, "grad_norm": 8.125, "grad_norm_var": 0.16428629557291666, "learning_rate": 0.0003, "loss": 19.4514, "loss/aux_loss": 0.04824534244835377, "loss/crossentropy": 3.460851287841797, "loss/logits": 1.5554963886737823, "step": 1710 }, { "epoch": 0.0172, "grad_norm": 8.5625, "grad_norm_var": 0.14095052083333334, "learning_rate": 0.0003, "loss": 19.0598, "loss/aux_loss": 0.0482347022742033, "loss/crossentropy": 3.4911609292030334, "loss/logits": 1.6111477196216584, "step": 1720 }, { "epoch": 0.0173, "grad_norm": 10.1875, "grad_norm_var": 8.088505045572917, "learning_rate": 0.0003, "loss": 19.2344, "loss/aux_loss": 0.04825140796601772, "loss/crossentropy": 3.6284345746040345, "loss/logits": 1.5853043377399445, "step": 1730 }, { "epoch": 0.0174, "grad_norm": 8.4375, "grad_norm_var": 8.165690104166666, "learning_rate": 0.0003, "loss": 18.7796, "loss/aux_loss": 0.04823643118143082, "loss/crossentropy": 3.4696247935295106, "loss/logits": 1.61273393034935, "step": 1740 }, { "epoch": 0.0175, "grad_norm": 7.84375, "grad_norm_var": 0.20670166015625, "learning_rate": 0.0003, "loss": 18.8781, "loss/aux_loss": 0.04823619779199362, "loss/crossentropy": 3.4937520623207092, "loss/logits": 1.5296509921550752, "step": 1750 }, { "epoch": 0.0176, "grad_norm": 8.375, "grad_norm_var": 0.24099934895833333, "learning_rate": 0.0003, "loss": 18.7339, "loss/aux_loss": 0.04824729897081852, "loss/crossentropy": 3.497152864933014, "loss/logits": 1.575348162651062, "step": 1760 }, { "epoch": 0.0177, "grad_norm": 8.5625, "grad_norm_var": 0.38118082682291665, "learning_rate": 0.0003, "loss": 18.8957, "loss/aux_loss": 0.04825843013823032, "loss/crossentropy": 3.3257681131362915, "loss/logits": 1.5223451495170592, "step": 1770 }, { "epoch": 0.0178, "grad_norm": 8.1875, "grad_norm_var": 0.24722900390625, "learning_rate": 0.0003, "loss": 19.0113, "loss/aux_loss": 0.04823619592934847, "loss/crossentropy": 3.5192960858345033, "loss/logits": 1.6035509347915649, "step": 1780 }, { "epoch": 0.0179, "grad_norm": 8.0625, "grad_norm_var": 0.14099934895833333, "learning_rate": 0.0003, "loss": 18.8621, "loss/aux_loss": 0.04824311789125204, "loss/crossentropy": 3.4132414937019346, "loss/logits": 1.5165437579154968, "step": 1790 }, { "epoch": 0.018, "grad_norm": 8.6875, "grad_norm_var": 3.5010050455729167, "learning_rate": 0.0003, "loss": 18.6503, "loss/aux_loss": 0.04824370257556439, "loss/crossentropy": 3.402624809741974, "loss/logits": 1.53942152261734, "step": 1800 }, { "epoch": 0.0181, "grad_norm": 8.5, "grad_norm_var": 58.00753580729167, "learning_rate": 0.0003, "loss": 19.3208, "loss/aux_loss": 0.04827347882091999, "loss/crossentropy": 3.6006906509399412, "loss/logits": 1.5737698435783387, "step": 1810 }, { "epoch": 0.0182, "grad_norm": 8.5625, "grad_norm_var": 60.507405598958336, "learning_rate": 0.0003, "loss": 18.8721, "loss/aux_loss": 0.04824970234185457, "loss/crossentropy": 3.5692302942276, "loss/logits": 1.5307071149349212, "step": 1820 }, { "epoch": 0.0183, "grad_norm": 11.25, "grad_norm_var": 6.178238932291666, "learning_rate": 0.0003, "loss": 18.5814, "loss/aux_loss": 0.04826509784907103, "loss/crossentropy": 3.40536652803421, "loss/logits": 1.5309065878391266, "step": 1830 }, { "epoch": 0.0184, "grad_norm": 8.125, "grad_norm_var": 6.129427083333334, "learning_rate": 0.0003, "loss": 18.7299, "loss/aux_loss": 0.0482620395720005, "loss/crossentropy": 3.196433222293854, "loss/logits": 1.4907720267772675, "step": 1840 }, { "epoch": 0.0185, "grad_norm": 8.3125, "grad_norm_var": 0.17825520833333333, "learning_rate": 0.0003, "loss": 18.7455, "loss/aux_loss": 0.04824581053107977, "loss/crossentropy": 3.423103988170624, "loss/logits": 1.5436949849128723, "step": 1850 }, { "epoch": 0.0186, "grad_norm": 8.375, "grad_norm_var": 2.1146484375, "learning_rate": 0.0003, "loss": 18.6511, "loss/aux_loss": 0.048259328678250314, "loss/crossentropy": 3.3652100563049316, "loss/logits": 1.4386920034885406, "step": 1860 }, { "epoch": 0.0187, "grad_norm": 8.5625, "grad_norm_var": 1.8754191080729166, "learning_rate": 0.0003, "loss": 18.607, "loss/aux_loss": 0.04825924132019281, "loss/crossentropy": 3.448017656803131, "loss/logits": 1.536920464038849, "step": 1870 }, { "epoch": 0.0188, "grad_norm": 8.4375, "grad_norm_var": 0.24664306640625, "learning_rate": 0.0003, "loss": 18.5559, "loss/aux_loss": 0.048245815001428126, "loss/crossentropy": 3.3453487277030947, "loss/logits": 1.5264363229274749, "step": 1880 }, { "epoch": 0.0189, "grad_norm": 7.875, "grad_norm_var": 0.310791015625, "learning_rate": 0.0003, "loss": 18.5365, "loss/aux_loss": 0.04823654443025589, "loss/crossentropy": 3.373169445991516, "loss/logits": 1.5205184280872346, "step": 1890 }, { "epoch": 0.019, "grad_norm": 7.75, "grad_norm_var": 0.32105712890625, "learning_rate": 0.0003, "loss": 18.496, "loss/aux_loss": 0.048227923549711706, "loss/crossentropy": 3.2882206797599793, "loss/logits": 1.566467821598053, "step": 1900 }, { "epoch": 0.0191, "grad_norm": 8.0, "grad_norm_var": 0.301416015625, "learning_rate": 0.0003, "loss": 18.2065, "loss/aux_loss": 0.04824189618229866, "loss/crossentropy": 3.397510600090027, "loss/logits": 1.4857994496822358, "step": 1910 }, { "epoch": 0.0192, "grad_norm": 7.78125, "grad_norm_var": 0.6359212239583333, "learning_rate": 0.0003, "loss": 18.5913, "loss/aux_loss": 0.04824940506368876, "loss/crossentropy": 3.396523857116699, "loss/logits": 1.516043496131897, "step": 1920 }, { "epoch": 0.0193, "grad_norm": 7.71875, "grad_norm_var": 0.7240885416666667, "learning_rate": 0.0003, "loss": 18.9066, "loss/aux_loss": 0.04823889695107937, "loss/crossentropy": 3.5669950366020204, "loss/logits": 1.5904681384563446, "step": 1930 }, { "epoch": 0.0194, "grad_norm": 8.5, "grad_norm_var": 0.16226806640625, "learning_rate": 0.0003, "loss": 18.5454, "loss/aux_loss": 0.04824310019612312, "loss/crossentropy": 3.4322083711624147, "loss/logits": 1.4962215423583984, "step": 1940 }, { "epoch": 0.0195, "grad_norm": 7.8125, "grad_norm_var": 0.13033854166666667, "learning_rate": 0.0003, "loss": 18.4341, "loss/aux_loss": 0.048227659240365026, "loss/crossentropy": 3.4103391289711, "loss/logits": 1.486283725500107, "step": 1950 }, { "epoch": 0.0196, "grad_norm": 7.71875, "grad_norm_var": 0.22428385416666666, "learning_rate": 0.0003, "loss": 18.2887, "loss/aux_loss": 0.04824389982968569, "loss/crossentropy": 3.2727973818778993, "loss/logits": 1.4560063302516937, "step": 1960 }, { "epoch": 0.0197, "grad_norm": 13.8125, "grad_norm_var": 2.357926432291667, "learning_rate": 0.0003, "loss": 18.2356, "loss/aux_loss": 0.04824434258043766, "loss/crossentropy": 3.459345853328705, "loss/logits": 1.460297852754593, "step": 1970 }, { "epoch": 0.0198, "grad_norm": 11.4375, "grad_norm_var": 18.870817057291667, "learning_rate": 0.0003, "loss": 18.5238, "loss/aux_loss": 0.04825907479971647, "loss/crossentropy": 3.424750554561615, "loss/logits": 1.5167337119579316, "step": 1980 }, { "epoch": 0.0199, "grad_norm": 7.90625, "grad_norm_var": 18.401546223958334, "learning_rate": 0.0003, "loss": 18.2073, "loss/aux_loss": 0.04825407154858112, "loss/crossentropy": 3.2709259510040285, "loss/logits": 1.4685286700725555, "step": 1990 }, { "epoch": 0.02, "grad_norm": 7.96875, "grad_norm_var": 0.3753865559895833, "learning_rate": 0.0003, "loss": 18.0374, "loss/aux_loss": 0.04822464138269424, "loss/crossentropy": 3.350880300998688, "loss/logits": 1.4842880725860597, "step": 2000 }, { "epoch": 0.0201, "grad_norm": 9.0, "grad_norm_var": 54.53902587890625, "learning_rate": 0.0003, "loss": 18.1583, "loss/aux_loss": 0.04824762139469385, "loss/crossentropy": 3.2653831958770754, "loss/logits": 1.4564920663833618, "step": 2010 }, { "epoch": 0.0202, "grad_norm": 13.9375, "grad_norm_var": 54.16106363932292, "learning_rate": 0.0003, "loss": 17.7872, "loss/aux_loss": 0.04823515806347132, "loss/crossentropy": 3.265163505077362, "loss/logits": 1.410541558265686, "step": 2020 }, { "epoch": 0.0203, "grad_norm": 8.3125, "grad_norm_var": 2.7471964518229166, "learning_rate": 0.0003, "loss": 18.125, "loss/aux_loss": 0.048241624422371385, "loss/crossentropy": 3.389461839199066, "loss/logits": 1.46819948554039, "step": 2030 }, { "epoch": 0.0204, "grad_norm": 7.71875, "grad_norm_var": 0.14091389973958332, "learning_rate": 0.0003, "loss": 18.0986, "loss/aux_loss": 0.04822319280356169, "loss/crossentropy": 3.543317806720734, "loss/logits": 1.5373745501041411, "step": 2040 }, { "epoch": 0.0205, "grad_norm": 7.625, "grad_norm_var": 0.67301025390625, "learning_rate": 0.0003, "loss": 18.102, "loss/aux_loss": 0.04826016817241907, "loss/crossentropy": 3.3688653111457825, "loss/logits": 1.4797544002532959, "step": 2050 }, { "epoch": 0.0206, "grad_norm": 17.5, "grad_norm_var": 5.9224609375, "learning_rate": 0.0003, "loss": 18.1704, "loss/aux_loss": 0.04823215901851654, "loss/crossentropy": 3.3886295437812803, "loss/logits": 1.4773655652999877, "step": 2060 }, { "epoch": 0.0207, "grad_norm": 8.5, "grad_norm_var": 5.833333333333333, "learning_rate": 0.0003, "loss": 17.9943, "loss/aux_loss": 0.04824898187071085, "loss/crossentropy": 3.1112423300743104, "loss/logits": 1.4438789427280425, "step": 2070 }, { "epoch": 0.0208, "grad_norm": 7.46875, "grad_norm_var": 0.5363932291666667, "learning_rate": 0.0003, "loss": 18.0344, "loss/aux_loss": 0.0482429688796401, "loss/crossentropy": 3.352174973487854, "loss/logits": 1.4526370763778687, "step": 2080 }, { "epoch": 0.0209, "grad_norm": 7.65625, "grad_norm_var": 0.14293212890625, "learning_rate": 0.0003, "loss": 18.11, "loss/aux_loss": 0.048234878666698934, "loss/crossentropy": 3.240985023975372, "loss/logits": 1.4991967618465423, "step": 2090 }, { "epoch": 0.021, "grad_norm": 8.5625, "grad_norm_var": 0.07864176432291667, "learning_rate": 0.0003, "loss": 17.8796, "loss/aux_loss": 0.04822954386472702, "loss/crossentropy": 3.5183646202087404, "loss/logits": 1.4771794497966766, "step": 2100 }, { "epoch": 0.0211, "grad_norm": 8.0, "grad_norm_var": 0.15666910807291667, "learning_rate": 0.0003, "loss": 17.8422, "loss/aux_loss": 0.048239548690617085, "loss/crossentropy": 3.3616485238075255, "loss/logits": 1.4091070950031281, "step": 2110 }, { "epoch": 0.0212, "grad_norm": 7.9375, "grad_norm_var": 11.716109212239584, "learning_rate": 0.0003, "loss": 17.7719, "loss/aux_loss": 0.04824555143713951, "loss/crossentropy": 3.439083182811737, "loss/logits": 1.4733355700969697, "step": 2120 }, { "epoch": 0.0213, "grad_norm": 8.5625, "grad_norm_var": 84.65826822916667, "learning_rate": 0.0003, "loss": 17.7924, "loss/aux_loss": 0.048250272311270236, "loss/crossentropy": 3.4830735325813293, "loss/logits": 1.4258549392223359, "step": 2130 }, { "epoch": 0.0214, "grad_norm": 8.125, "grad_norm_var": 53.116520182291666, "learning_rate": 0.0003, "loss": 17.8883, "loss/aux_loss": 0.048237613029778005, "loss/crossentropy": 3.2555914759635924, "loss/logits": 1.4283065259456635, "step": 2140 }, { "epoch": 0.0215, "grad_norm": 8.125, "grad_norm_var": 0.3610310872395833, "learning_rate": 0.0003, "loss": 17.6419, "loss/aux_loss": 0.04823284205049276, "loss/crossentropy": 3.3793551921844482, "loss/logits": 1.4274089336395264, "step": 2150 }, { "epoch": 0.0216, "grad_norm": 7.21875, "grad_norm_var": 1.8485677083333334, "learning_rate": 0.0003, "loss": 17.6442, "loss/aux_loss": 0.048232033289968966, "loss/crossentropy": 3.2164774179458617, "loss/logits": 1.4371109902858734, "step": 2160 }, { "epoch": 0.0217, "grad_norm": 7.71875, "grad_norm_var": 1.8079264322916666, "learning_rate": 0.0003, "loss": 17.7263, "loss/aux_loss": 0.04821321051567793, "loss/crossentropy": 3.3871463894844056, "loss/logits": 1.4453998267650605, "step": 2170 }, { "epoch": 0.0218, "grad_norm": 8.875, "grad_norm_var": 0.3880859375, "learning_rate": 0.0003, "loss": 17.6607, "loss/aux_loss": 0.048226891085505486, "loss/crossentropy": 3.213350570201874, "loss/logits": 1.4165163397789002, "step": 2180 }, { "epoch": 0.0219, "grad_norm": 8.125, "grad_norm_var": 0.4266764322916667, "learning_rate": 0.0003, "loss": 17.7184, "loss/aux_loss": 0.04822581373155117, "loss/crossentropy": 3.582288146018982, "loss/logits": 1.4369397819042207, "step": 2190 }, { "epoch": 0.022, "grad_norm": 8.3125, "grad_norm_var": 0.914306640625, "learning_rate": 0.0003, "loss": 17.7448, "loss/aux_loss": 0.04825443848967552, "loss/crossentropy": 3.306717586517334, "loss/logits": 1.4155293583869935, "step": 2200 }, { "epoch": 0.0221, "grad_norm": 9.0625, "grad_norm_var": 0.8086222330729167, "learning_rate": 0.0003, "loss": 17.6038, "loss/aux_loss": 0.04823794979602099, "loss/crossentropy": 3.26631623506546, "loss/logits": 1.3826520234346389, "step": 2210 }, { "epoch": 0.0222, "grad_norm": 7.75, "grad_norm_var": 0.35058186848958334, "learning_rate": 0.0003, "loss": 17.5963, "loss/aux_loss": 0.048243265226483346, "loss/crossentropy": 3.28060497045517, "loss/logits": 1.4131011009216308, "step": 2220 }, { "epoch": 0.0223, "grad_norm": 7.625, "grad_norm_var": 0.18853759765625, "learning_rate": 0.0003, "loss": 17.6245, "loss/aux_loss": 0.04822924640029669, "loss/crossentropy": 3.287998414039612, "loss/logits": 1.4267325103282928, "step": 2230 }, { "epoch": 0.0224, "grad_norm": 7.875, "grad_norm_var": 0.257421875, "learning_rate": 0.0003, "loss": 17.6129, "loss/aux_loss": 0.04821740183979273, "loss/crossentropy": 3.378717005252838, "loss/logits": 1.4063582181930543, "step": 2240 }, { "epoch": 0.0225, "grad_norm": 7.59375, "grad_norm_var": 0.20963541666666666, "learning_rate": 0.0003, "loss": 17.6276, "loss/aux_loss": 0.04823744297027588, "loss/crossentropy": 3.4024731159210204, "loss/logits": 1.4467666923999787, "step": 2250 }, { "epoch": 0.0226, "grad_norm": 8.875, "grad_norm_var": 0.246484375, "learning_rate": 0.0003, "loss": 17.6328, "loss/aux_loss": 0.04822640102356672, "loss/crossentropy": 3.333562135696411, "loss/logits": 1.4312103688716888, "step": 2260 }, { "epoch": 0.0227, "grad_norm": 8.0625, "grad_norm_var": 0.67398681640625, "learning_rate": 0.0003, "loss": 17.5135, "loss/aux_loss": 0.04822310116142035, "loss/crossentropy": 3.320937788486481, "loss/logits": 1.418309098482132, "step": 2270 }, { "epoch": 0.0228, "grad_norm": 6.90625, "grad_norm_var": 0.44390869140625, "learning_rate": 0.0003, "loss": 17.3436, "loss/aux_loss": 0.04823301304131746, "loss/crossentropy": 3.12678724527359, "loss/logits": 1.379988819360733, "step": 2280 }, { "epoch": 0.0229, "grad_norm": 11.625, "grad_norm_var": 1.3313761393229167, "learning_rate": 0.0003, "loss": 17.6211, "loss/aux_loss": 0.04822282623499632, "loss/crossentropy": 3.2918911814689635, "loss/logits": 1.4198866367340088, "step": 2290 }, { "epoch": 0.023, "grad_norm": 8.0, "grad_norm_var": 1.1663899739583334, "learning_rate": 0.0003, "loss": 17.4747, "loss/aux_loss": 0.048235368356108664, "loss/crossentropy": 3.306833505630493, "loss/logits": 1.392419272661209, "step": 2300 }, { "epoch": 0.0231, "grad_norm": 7.3125, "grad_norm_var": 0.33331705729166666, "learning_rate": 0.0003, "loss": 17.2234, "loss/aux_loss": 0.048222755640745164, "loss/crossentropy": 3.361720085144043, "loss/logits": 1.421975213289261, "step": 2310 }, { "epoch": 0.0232, "grad_norm": 9.75, "grad_norm_var": 112.17330322265624, "learning_rate": 0.0003, "loss": 17.2352, "loss/aux_loss": 0.048219884373247625, "loss/crossentropy": 3.3393725872039797, "loss/logits": 1.4201282680034637, "step": 2320 }, { "epoch": 0.0233, "grad_norm": 8.1875, "grad_norm_var": 110.715625, "learning_rate": 0.0003, "loss": 17.2496, "loss/aux_loss": 0.04821732547134161, "loss/crossentropy": 3.389024722576141, "loss/logits": 1.435178142786026, "step": 2330 }, { "epoch": 0.0234, "grad_norm": 6.96875, "grad_norm_var": 0.2806640625, "learning_rate": 0.0003, "loss": 17.331, "loss/aux_loss": 0.04822313766926527, "loss/crossentropy": 3.232159233093262, "loss/logits": 1.3994586706161498, "step": 2340 }, { "epoch": 0.0235, "grad_norm": 7.96875, "grad_norm_var": 0.2126953125, "learning_rate": 0.0003, "loss": 17.2108, "loss/aux_loss": 0.04821410346776247, "loss/crossentropy": 3.443037581443787, "loss/logits": 1.4070569813251494, "step": 2350 }, { "epoch": 0.0236, "grad_norm": 8.0625, "grad_norm_var": 0.13619384765625, "learning_rate": 0.0003, "loss": 17.301, "loss/aux_loss": 0.048248034156858924, "loss/crossentropy": 3.349116563796997, "loss/logits": 1.3975160002708436, "step": 2360 }, { "epoch": 0.0237, "grad_norm": 7.84375, "grad_norm_var": 2.121415201822917, "learning_rate": 0.0003, "loss": 17.1628, "loss/aux_loss": 0.04823202043771744, "loss/crossentropy": 3.340719926357269, "loss/logits": 1.3724242806434632, "step": 2370 }, { "epoch": 0.0238, "grad_norm": 7.9375, "grad_norm_var": 1.950390625, "learning_rate": 0.0003, "loss": 17.2873, "loss/aux_loss": 0.0482368228957057, "loss/crossentropy": 3.295661818981171, "loss/logits": 1.4170908331871033, "step": 2380 }, { "epoch": 0.0239, "grad_norm": 7.84375, "grad_norm_var": 0.28982747395833336, "learning_rate": 0.0003, "loss": 17.1479, "loss/aux_loss": 0.048220128566026685, "loss/crossentropy": 3.149917113780975, "loss/logits": 1.3780086159706115, "step": 2390 }, { "epoch": 0.024, "grad_norm": 7.53125, "grad_norm_var": 0.56021728515625, "learning_rate": 0.0003, "loss": 16.9571, "loss/aux_loss": 0.04820647966116667, "loss/crossentropy": 3.203866708278656, "loss/logits": 1.3228682637214662, "step": 2400 }, { "epoch": 0.0241, "grad_norm": 7.6875, "grad_norm_var": 0.18140869140625, "learning_rate": 0.0003, "loss": 17.225, "loss/aux_loss": 0.04822152461856603, "loss/crossentropy": 3.219542622566223, "loss/logits": 1.3637619763612747, "step": 2410 }, { "epoch": 0.0242, "grad_norm": 7.28125, "grad_norm_var": 0.19696858723958333, "learning_rate": 0.0003, "loss": 17.3445, "loss/aux_loss": 0.04821507520973682, "loss/crossentropy": 3.437433052062988, "loss/logits": 1.4493371307849885, "step": 2420 }, { "epoch": 0.0243, "grad_norm": 9.5, "grad_norm_var": 0.36900634765625, "learning_rate": 0.0003, "loss": 17.2769, "loss/aux_loss": 0.048233349435031415, "loss/crossentropy": 3.327606177330017, "loss/logits": 1.3730829060077667, "step": 2430 }, { "epoch": 0.0244, "grad_norm": 7.0625, "grad_norm_var": 0.58326416015625, "learning_rate": 0.0003, "loss": 17.1716, "loss/aux_loss": 0.04821010734885931, "loss/crossentropy": 3.442364740371704, "loss/logits": 1.3955539762973785, "step": 2440 }, { "epoch": 0.0245, "grad_norm": 7.4375, "grad_norm_var": 0.19250895182291666, "learning_rate": 0.0003, "loss": 17.0851, "loss/aux_loss": 0.04822715688496828, "loss/crossentropy": 3.262503242492676, "loss/logits": 1.3460212230682373, "step": 2450 }, { "epoch": 0.0246, "grad_norm": 7.125, "grad_norm_var": 0.09498697916666667, "learning_rate": 0.0003, "loss": 16.9652, "loss/aux_loss": 0.04821744803339243, "loss/crossentropy": 3.3741399884223937, "loss/logits": 1.4004681944847106, "step": 2460 }, { "epoch": 0.0247, "grad_norm": 7.875, "grad_norm_var": 0.21073811848958332, "learning_rate": 0.0003, "loss": 16.7236, "loss/aux_loss": 0.048226969130337236, "loss/crossentropy": 3.1367203831672668, "loss/logits": 1.3466423988342284, "step": 2470 }, { "epoch": 0.0248, "grad_norm": 9.3125, "grad_norm_var": 0.46910400390625, "learning_rate": 0.0003, "loss": 17.0984, "loss/aux_loss": 0.048214548453688624, "loss/crossentropy": 3.2789533734321594, "loss/logits": 1.3584135174751282, "step": 2480 }, { "epoch": 0.0249, "grad_norm": 8.4375, "grad_norm_var": 0.721337890625, "learning_rate": 0.0003, "loss": 17.1085, "loss/aux_loss": 0.048216362856328486, "loss/crossentropy": 3.315259212255478, "loss/logits": 1.396775197982788, "step": 2490 }, { "epoch": 0.025, "grad_norm": 7.46875, "grad_norm_var": 1.4252604166666667, "learning_rate": 0.0003, "loss": 16.9872, "loss/aux_loss": 0.048228930495679376, "loss/crossentropy": 3.373861300945282, "loss/logits": 1.3967225074768066, "step": 2500 }, { "epoch": 0.0251, "grad_norm": 9.625, "grad_norm_var": 1.22525634765625, "learning_rate": 0.0003, "loss": 17.1017, "loss/aux_loss": 0.04823280908167362, "loss/crossentropy": 3.2639551222324372, "loss/logits": 1.3296611040830613, "step": 2510 }, { "epoch": 0.0252, "grad_norm": 8.25, "grad_norm_var": 18.701493326822916, "learning_rate": 0.0003, "loss": 16.9072, "loss/aux_loss": 0.04824296310544014, "loss/crossentropy": 3.264930558204651, "loss/logits": 1.3745314061641694, "step": 2520 }, { "epoch": 0.0253, "grad_norm": 7.03125, "grad_norm_var": 18.965034993489585, "learning_rate": 0.0003, "loss": 16.7434, "loss/aux_loss": 0.048214029893279074, "loss/crossentropy": 3.231077790260315, "loss/logits": 1.3421391308307649, "step": 2530 }, { "epoch": 0.0254, "grad_norm": 8.375, "grad_norm_var": 0.3060546875, "learning_rate": 0.0003, "loss": 16.8817, "loss/aux_loss": 0.048227564059197904, "loss/crossentropy": 3.341559386253357, "loss/logits": 1.367657434940338, "step": 2540 }, { "epoch": 0.0255, "grad_norm": 8.0625, "grad_norm_var": 0.450634765625, "learning_rate": 0.0003, "loss": 16.6762, "loss/aux_loss": 0.04822465777397156, "loss/crossentropy": 3.1764264702796936, "loss/logits": 1.2923425018787384, "step": 2550 }, { "epoch": 0.0256, "grad_norm": 8.6875, "grad_norm_var": 0.9891560872395834, "learning_rate": 0.0003, "loss": 16.982, "loss/aux_loss": 0.04822836928069592, "loss/crossentropy": 3.1380072832107544, "loss/logits": 1.329880553483963, "step": 2560 }, { "epoch": 0.0257, "grad_norm": 7.0625, "grad_norm_var": 6.76246337890625, "learning_rate": 0.0003, "loss": 16.8472, "loss/aux_loss": 0.048224599473178385, "loss/crossentropy": 3.171788203716278, "loss/logits": 1.3234851002693175, "step": 2570 }, { "epoch": 0.0258, "grad_norm": 7.75, "grad_norm_var": 7.284305826822917, "learning_rate": 0.0003, "loss": 16.7384, "loss/aux_loss": 0.04820049479603768, "loss/crossentropy": 3.2456391513347627, "loss/logits": 1.3257298290729522, "step": 2580 }, { "epoch": 0.0259, "grad_norm": 8.6875, "grad_norm_var": 0.70699462890625, "learning_rate": 0.0003, "loss": 16.7462, "loss/aux_loss": 0.04820205494761467, "loss/crossentropy": 3.2381733298301696, "loss/logits": 1.3446310222148896, "step": 2590 }, { "epoch": 0.026, "grad_norm": 6.875, "grad_norm_var": 0.3619140625, "learning_rate": 0.0003, "loss": 16.7997, "loss/aux_loss": 0.0482096241787076, "loss/crossentropy": 3.317233157157898, "loss/logits": 1.3122055113315583, "step": 2600 }, { "epoch": 0.0261, "grad_norm": 7.25, "grad_norm_var": 0.3204264322916667, "learning_rate": 0.0003, "loss": 16.7259, "loss/aux_loss": 0.04821378495544195, "loss/crossentropy": 3.224324756860733, "loss/logits": 1.362189695239067, "step": 2610 }, { "epoch": 0.0262, "grad_norm": 7.84375, "grad_norm_var": 2.25601806640625, "learning_rate": 0.0003, "loss": 16.4682, "loss/aux_loss": 0.04823370911180973, "loss/crossentropy": 3.174444782733917, "loss/logits": 1.3091946482658385, "step": 2620 }, { "epoch": 0.0263, "grad_norm": 7.4375, "grad_norm_var": 0.55625, "learning_rate": 0.0003, "loss": 16.8187, "loss/aux_loss": 0.048213068023324014, "loss/crossentropy": 3.3021878719329836, "loss/logits": 1.3765088856220244, "step": 2630 }, { "epoch": 0.0264, "grad_norm": 7.6875, "grad_norm_var": 0.21092122395833332, "learning_rate": 0.0003, "loss": 16.6867, "loss/aux_loss": 0.0482132213190198, "loss/crossentropy": 3.2036616921424867, "loss/logits": 1.3353200852870941, "step": 2640 }, { "epoch": 0.0265, "grad_norm": 7.65625, "grad_norm_var": 0.17981770833333333, "learning_rate": 0.0003, "loss": 16.6249, "loss/aux_loss": 0.048217184469103815, "loss/crossentropy": 3.1017094254493713, "loss/logits": 1.293505471944809, "step": 2650 }, { "epoch": 0.0266, "grad_norm": 7.28125, "grad_norm_var": 0.24312744140625, "learning_rate": 0.0003, "loss": 16.8997, "loss/aux_loss": 0.04821909796446562, "loss/crossentropy": 3.3471083879470824, "loss/logits": 1.4033735275268555, "step": 2660 }, { "epoch": 0.0267, "grad_norm": 8.25, "grad_norm_var": 0.14654541015625, "learning_rate": 0.0003, "loss": 16.6097, "loss/aux_loss": 0.04820960406213999, "loss/crossentropy": 3.306285870075226, "loss/logits": 1.3414114236831665, "step": 2670 }, { "epoch": 0.0268, "grad_norm": 7.53125, "grad_norm_var": 0.12222900390625, "learning_rate": 0.0003, "loss": 16.7267, "loss/aux_loss": 0.04820468667894602, "loss/crossentropy": 3.1302775621414183, "loss/logits": 1.3628006160259247, "step": 2680 }, { "epoch": 0.0269, "grad_norm": 7.5, "grad_norm_var": 0.09217122395833334, "learning_rate": 0.0003, "loss": 16.6414, "loss/aux_loss": 0.04819583874195814, "loss/crossentropy": 3.252695155143738, "loss/logits": 1.3152020871639252, "step": 2690 }, { "epoch": 0.027, "grad_norm": 6.9375, "grad_norm_var": 0.1095703125, "learning_rate": 0.0003, "loss": 16.5657, "loss/aux_loss": 0.04820863176137209, "loss/crossentropy": 3.3271077156066893, "loss/logits": 1.3572327196598053, "step": 2700 }, { "epoch": 0.0271, "grad_norm": 7.125, "grad_norm_var": 0.19192301432291667, "learning_rate": 0.0003, "loss": 16.5538, "loss/aux_loss": 0.048196819797158244, "loss/crossentropy": 3.245091903209686, "loss/logits": 1.3349639832973481, "step": 2710 }, { "epoch": 0.0272, "grad_norm": 7.75, "grad_norm_var": 0.09881184895833334, "learning_rate": 0.0003, "loss": 16.6612, "loss/aux_loss": 0.048208712972700594, "loss/crossentropy": 3.2764087319374084, "loss/logits": 1.3150906205177306, "step": 2720 }, { "epoch": 0.0273, "grad_norm": 7.78125, "grad_norm_var": 0.103125, "learning_rate": 0.0003, "loss": 16.6158, "loss/aux_loss": 0.04819247759878635, "loss/crossentropy": 3.273999774456024, "loss/logits": 1.3284586131572724, "step": 2730 }, { "epoch": 0.0274, "grad_norm": 7.875, "grad_norm_var": 0.20220947265625, "learning_rate": 0.0003, "loss": 16.4436, "loss/aux_loss": 0.04821184277534485, "loss/crossentropy": 3.188088583946228, "loss/logits": 1.3358671367168427, "step": 2740 }, { "epoch": 0.0275, "grad_norm": 7.5625, "grad_norm_var": 0.21302083333333333, "learning_rate": 0.0003, "loss": 16.3073, "loss/aux_loss": 0.04821435939520598, "loss/crossentropy": 3.136403810977936, "loss/logits": 1.287468433380127, "step": 2750 }, { "epoch": 0.0276, "grad_norm": 7.71875, "grad_norm_var": 0.320947265625, "learning_rate": 0.0003, "loss": 16.5717, "loss/aux_loss": 0.04819820411503315, "loss/crossentropy": 3.159669041633606, "loss/logits": 1.3258511304855347, "step": 2760 }, { "epoch": 0.0277, "grad_norm": 7.5625, "grad_norm_var": 0.45631510416666665, "learning_rate": 0.0003, "loss": 16.3568, "loss/aux_loss": 0.04820672180503607, "loss/crossentropy": 3.243663287162781, "loss/logits": 1.2874810814857482, "step": 2770 }, { "epoch": 0.0278, "grad_norm": 8.5625, "grad_norm_var": 0.17890218098958333, "learning_rate": 0.0003, "loss": 16.5081, "loss/aux_loss": 0.04820285327732563, "loss/crossentropy": 3.083403432369232, "loss/logits": 1.3088326066732408, "step": 2780 }, { "epoch": 0.0279, "grad_norm": 7.09375, "grad_norm_var": 0.27081705729166666, "learning_rate": 0.0003, "loss": 16.4457, "loss/aux_loss": 0.04822454117238521, "loss/crossentropy": 3.147365128993988, "loss/logits": 1.2816300868988038, "step": 2790 }, { "epoch": 0.028, "grad_norm": 7.53125, "grad_norm_var": 0.17732747395833334, "learning_rate": 0.0003, "loss": 16.361, "loss/aux_loss": 0.0482009943574667, "loss/crossentropy": 3.2210754632949827, "loss/logits": 1.3175399780273438, "step": 2800 }, { "epoch": 0.0281, "grad_norm": 8.625, "grad_norm_var": 0.15832926432291666, "learning_rate": 0.0003, "loss": 16.4841, "loss/aux_loss": 0.04820490088313818, "loss/crossentropy": 3.225731301307678, "loss/logits": 1.3222549259662628, "step": 2810 }, { "epoch": 0.0282, "grad_norm": 7.125, "grad_norm_var": 0.30517171223958334, "learning_rate": 0.0003, "loss": 16.3537, "loss/aux_loss": 0.04823195319622755, "loss/crossentropy": 3.0282162189483643, "loss/logits": 1.2549142867326737, "step": 2820 }, { "epoch": 0.0283, "grad_norm": 7.03125, "grad_norm_var": 0.2861328125, "learning_rate": 0.0003, "loss": 16.1414, "loss/aux_loss": 0.04820448886603117, "loss/crossentropy": 3.1825570702552795, "loss/logits": 1.2594711065292359, "step": 2830 }, { "epoch": 0.0284, "grad_norm": 7.40625, "grad_norm_var": 0.32847900390625, "learning_rate": 0.0003, "loss": 16.3758, "loss/aux_loss": 0.04820435829460621, "loss/crossentropy": 3.3334421873092652, "loss/logits": 1.3914376556873322, "step": 2840 }, { "epoch": 0.0285, "grad_norm": 8.0, "grad_norm_var": 0.41412353515625, "learning_rate": 0.0003, "loss": 16.3215, "loss/aux_loss": 0.048213552497327325, "loss/crossentropy": 3.167293357849121, "loss/logits": 1.3143129229545594, "step": 2850 }, { "epoch": 0.0286, "grad_norm": 8.5625, "grad_norm_var": 0.34501546223958335, "learning_rate": 0.0003, "loss": 16.343, "loss/aux_loss": 0.048203857988119124, "loss/crossentropy": 3.168602633476257, "loss/logits": 1.3595575094223022, "step": 2860 }, { "epoch": 0.0287, "grad_norm": 7.1875, "grad_norm_var": 26.682645670572917, "learning_rate": 0.0003, "loss": 16.2912, "loss/aux_loss": 0.04821203649044037, "loss/crossentropy": 3.1967454075813295, "loss/logits": 1.3017989635467528, "step": 2870 }, { "epoch": 0.0288, "grad_norm": 7.40625, "grad_norm_var": 0.23925374348958334, "learning_rate": 0.0003, "loss": 16.4016, "loss/aux_loss": 0.04820753578096628, "loss/crossentropy": 3.1777406215667723, "loss/logits": 1.3188459992408752, "step": 2880 }, { "epoch": 0.0289, "grad_norm": 8.5, "grad_norm_var": 0.16041259765625, "learning_rate": 0.0003, "loss": 16.317, "loss/aux_loss": 0.048205715417861936, "loss/crossentropy": 3.2767266154289247, "loss/logits": 1.2999807298183441, "step": 2890 }, { "epoch": 0.029, "grad_norm": 7.53125, "grad_norm_var": 0.20714518229166667, "learning_rate": 0.0003, "loss": 16.2915, "loss/aux_loss": 0.04820862989872694, "loss/crossentropy": 3.1509845733642576, "loss/logits": 1.2746458113193513, "step": 2900 }, { "epoch": 0.0291, "grad_norm": 9.0625, "grad_norm_var": 0.45621337890625, "learning_rate": 0.0003, "loss": 16.3776, "loss/aux_loss": 0.0481941731646657, "loss/crossentropy": 3.201290011405945, "loss/logits": 1.2829424917697907, "step": 2910 }, { "epoch": 0.0292, "grad_norm": 7.34375, "grad_norm_var": 0.30279541015625, "learning_rate": 0.0003, "loss": 16.4304, "loss/aux_loss": 0.048205789737403394, "loss/crossentropy": 3.203648090362549, "loss/logits": 1.3133781254291534, "step": 2920 }, { "epoch": 0.0293, "grad_norm": 8.0, "grad_norm_var": 0.14918212890625, "learning_rate": 0.0003, "loss": 16.2822, "loss/aux_loss": 0.048185784742236136, "loss/crossentropy": 3.1988660097122192, "loss/logits": 1.2729051291942597, "step": 2930 }, { "epoch": 0.0294, "grad_norm": 6.6875, "grad_norm_var": 0.37200520833333334, "learning_rate": 0.0003, "loss": 16.0591, "loss/aux_loss": 0.04820053558796644, "loss/crossentropy": 3.2543280601501463, "loss/logits": 1.2817346930503846, "step": 2940 }, { "epoch": 0.0295, "grad_norm": 8.875, "grad_norm_var": 0.5007161458333333, "learning_rate": 0.0003, "loss": 16.1596, "loss/aux_loss": 0.048183665983378886, "loss/crossentropy": 3.2304122924804686, "loss/logits": 1.3286712884902954, "step": 2950 }, { "epoch": 0.0296, "grad_norm": 7.8125, "grad_norm_var": 13.414176432291667, "learning_rate": 0.0003, "loss": 16.1869, "loss/aux_loss": 0.048213465884327886, "loss/crossentropy": 3.363071584701538, "loss/logits": 1.3079668641090394, "step": 2960 }, { "epoch": 0.0297, "grad_norm": 7.375, "grad_norm_var": 14.020833333333334, "learning_rate": 0.0003, "loss": 15.9104, "loss/aux_loss": 0.048193281330168244, "loss/crossentropy": 3.13505756855011, "loss/logits": 1.331637018918991, "step": 2970 }, { "epoch": 0.0298, "grad_norm": 7.40625, "grad_norm_var": 0.18229166666666666, "learning_rate": 0.0003, "loss": 16.3034, "loss/aux_loss": 0.04818958211690187, "loss/crossentropy": 3.2140108823776243, "loss/logits": 1.3344007432460785, "step": 2980 }, { "epoch": 0.0299, "grad_norm": 7.46875, "grad_norm_var": 0.428369140625, "learning_rate": 0.0003, "loss": 16.2893, "loss/aux_loss": 0.048207861743867396, "loss/crossentropy": 3.2560169219970705, "loss/logits": 1.279381561279297, "step": 2990 }, { "epoch": 0.03, "grad_norm": 8.75, "grad_norm_var": 2.956734212239583, "learning_rate": 0.0003, "loss": 16.2927, "loss/aux_loss": 0.04818707294762135, "loss/crossentropy": 3.217176949977875, "loss/logits": 1.3047203302383423, "step": 3000 }, { "epoch": 0.0301, "grad_norm": 9.125, "grad_norm_var": 0.6278645833333333, "learning_rate": 0.0003, "loss": 16.2956, "loss/aux_loss": 0.048182461969554426, "loss/crossentropy": 3.2268913865089415, "loss/logits": 1.2893227458000183, "step": 3010 }, { "epoch": 0.0302, "grad_norm": 7.0625, "grad_norm_var": 0.35689697265625, "learning_rate": 0.0003, "loss": 15.8782, "loss/aux_loss": 0.04818679504096508, "loss/crossentropy": 3.0483377814292907, "loss/logits": 1.3037330031394958, "step": 3020 }, { "epoch": 0.0303, "grad_norm": 6.65625, "grad_norm_var": 0.13655192057291668, "learning_rate": 0.0003, "loss": 16.2123, "loss/aux_loss": 0.04818045124411583, "loss/crossentropy": 3.0745912194252014, "loss/logits": 1.2567619979381561, "step": 3030 }, { "epoch": 0.0304, "grad_norm": 7.3125, "grad_norm_var": 0.1279296875, "learning_rate": 0.0003, "loss": 16.2275, "loss/aux_loss": 0.04818618576973677, "loss/crossentropy": 3.313616728782654, "loss/logits": 1.3123571872711182, "step": 3040 }, { "epoch": 0.0305, "grad_norm": 6.8125, "grad_norm_var": 0.306640625, "learning_rate": 0.0003, "loss": 15.9426, "loss/aux_loss": 0.048186902329325675, "loss/crossentropy": 3.066525948047638, "loss/logits": 1.2751049637794494, "step": 3050 }, { "epoch": 0.0306, "grad_norm": 7.53125, "grad_norm_var": 9.796805826822917, "learning_rate": 0.0003, "loss": 16.1247, "loss/aux_loss": 0.048196819610893726, "loss/crossentropy": 3.2423496723175047, "loss/logits": 1.305676233768463, "step": 3060 }, { "epoch": 0.0307, "grad_norm": 6.78125, "grad_norm_var": 2.33648681640625, "learning_rate": 0.0003, "loss": 15.9665, "loss/aux_loss": 0.04817989952862263, "loss/crossentropy": 3.178650379180908, "loss/logits": 1.2622127085924149, "step": 3070 }, { "epoch": 0.0308, "grad_norm": 8.125, "grad_norm_var": 0.16197916666666667, "learning_rate": 0.0003, "loss": 15.8411, "loss/aux_loss": 0.048175792768597604, "loss/crossentropy": 3.2178627133369444, "loss/logits": 1.2681122601032258, "step": 3080 }, { "epoch": 0.0309, "grad_norm": 7.90625, "grad_norm_var": 0.19068603515625, "learning_rate": 0.0003, "loss": 15.9343, "loss/aux_loss": 0.04819290656596422, "loss/crossentropy": 3.0956594944000244, "loss/logits": 1.2836836636066438, "step": 3090 }, { "epoch": 0.031, "grad_norm": 6.8125, "grad_norm_var": 0.12760009765625, "learning_rate": 0.0003, "loss": 16.1508, "loss/aux_loss": 0.04819896165281534, "loss/crossentropy": 3.242776608467102, "loss/logits": 1.260975569486618, "step": 3100 }, { "epoch": 0.0311, "grad_norm": 6.75, "grad_norm_var": 0.20302327473958334, "learning_rate": 0.0003, "loss": 15.8037, "loss/aux_loss": 0.048188280686736105, "loss/crossentropy": 3.0742504239082336, "loss/logits": 1.247715598344803, "step": 3110 }, { "epoch": 0.0312, "grad_norm": 7.5, "grad_norm_var": 0.09726155598958333, "learning_rate": 0.0003, "loss": 16.1331, "loss/aux_loss": 0.048201543465256694, "loss/crossentropy": 3.1597721457481383, "loss/logits": 1.2908858835697175, "step": 3120 }, { "epoch": 0.0313, "grad_norm": 7.6875, "grad_norm_var": 0.5277303059895834, "learning_rate": 0.0003, "loss": 15.8831, "loss/aux_loss": 0.04819868616759777, "loss/crossentropy": 3.1610820293426514, "loss/logits": 1.2815950632095336, "step": 3130 }, { "epoch": 0.0314, "grad_norm": 7.3125, "grad_norm_var": 0.3472493489583333, "learning_rate": 0.0003, "loss": 16.038, "loss/aux_loss": 0.048184423707425594, "loss/crossentropy": 3.1954041719436646, "loss/logits": 1.278364223241806, "step": 3140 }, { "epoch": 0.0315, "grad_norm": 7.1875, "grad_norm_var": 0.039567057291666666, "learning_rate": 0.0003, "loss": 15.916, "loss/aux_loss": 0.048186035640537736, "loss/crossentropy": 3.1560078144073485, "loss/logits": 1.3042196780443192, "step": 3150 }, { "epoch": 0.0316, "grad_norm": 7.75, "grad_norm_var": 0.35128580729166664, "learning_rate": 0.0003, "loss": 15.9454, "loss/aux_loss": 0.04819800220429897, "loss/crossentropy": 3.1910813629627226, "loss/logits": 1.2715374946594238, "step": 3160 }, { "epoch": 0.0317, "grad_norm": 8.4375, "grad_norm_var": 0.6532389322916666, "learning_rate": 0.0003, "loss": 15.6996, "loss/aux_loss": 0.048211091198027134, "loss/crossentropy": 3.116113305091858, "loss/logits": 1.2663143903017045, "step": 3170 }, { "epoch": 0.0318, "grad_norm": 7.53125, "grad_norm_var": 5.35582275390625, "learning_rate": 0.0003, "loss": 16.1554, "loss/aux_loss": 0.048189323768019675, "loss/crossentropy": 3.1312987327575685, "loss/logits": 1.2820051074028016, "step": 3180 }, { "epoch": 0.0319, "grad_norm": 7.65625, "grad_norm_var": 5.358837890625, "learning_rate": 0.0003, "loss": 15.9255, "loss/aux_loss": 0.04818729739636183, "loss/crossentropy": 3.0685723185539246, "loss/logits": 1.2486902892589569, "step": 3190 }, { "epoch": 0.032, "grad_norm": 7.09375, "grad_norm_var": 0.20826416015625, "learning_rate": 0.0003, "loss": 15.8965, "loss/aux_loss": 0.04819039478898048, "loss/crossentropy": 3.235087752342224, "loss/logits": 1.253222393989563, "step": 3200 }, { "epoch": 0.0321, "grad_norm": 7.53125, "grad_norm_var": 0.9037760416666667, "learning_rate": 0.0003, "loss": 16.0786, "loss/aux_loss": 0.0481830982491374, "loss/crossentropy": 3.1696943759918215, "loss/logits": 1.2844607293605805, "step": 3210 }, { "epoch": 0.0322, "grad_norm": 7.4375, "grad_norm_var": 0.9649739583333333, "learning_rate": 0.0003, "loss": 15.7072, "loss/aux_loss": 0.04817593917250633, "loss/crossentropy": 3.0222031831741334, "loss/logits": 1.2671060264110565, "step": 3220 }, { "epoch": 0.0323, "grad_norm": 7.4375, "grad_norm_var": 0.08889567057291667, "learning_rate": 0.0003, "loss": 15.7409, "loss/aux_loss": 0.04818395711481571, "loss/crossentropy": 3.200468099117279, "loss/logits": 1.3267085552215576, "step": 3230 }, { "epoch": 0.0324, "grad_norm": 10.4375, "grad_norm_var": 188.03019205729166, "learning_rate": 0.0003, "loss": 15.8984, "loss/aux_loss": 0.0481902739033103, "loss/crossentropy": 3.126142477989197, "loss/logits": 1.2616908073425293, "step": 3240 }, { "epoch": 0.0325, "grad_norm": 7.34375, "grad_norm_var": 188.55543212890626, "learning_rate": 0.0003, "loss": 15.9279, "loss/aux_loss": 0.048199089244008064, "loss/crossentropy": 3.1426218867301943, "loss/logits": 1.2768731236457824, "step": 3250 }, { "epoch": 0.0326, "grad_norm": 7.0, "grad_norm_var": 0.15623372395833332, "learning_rate": 0.0003, "loss": 15.6804, "loss/aux_loss": 0.04819824192672968, "loss/crossentropy": 3.130205762386322, "loss/logits": 1.2509568214416504, "step": 3260 }, { "epoch": 0.0327, "grad_norm": 7.03125, "grad_norm_var": 0.19342041015625, "learning_rate": 0.0003, "loss": 15.7794, "loss/aux_loss": 0.048181666433811186, "loss/crossentropy": 3.203247845172882, "loss/logits": 1.248792153596878, "step": 3270 }, { "epoch": 0.0328, "grad_norm": 6.90625, "grad_norm_var": 0.3446451822916667, "learning_rate": 0.0003, "loss": 15.6809, "loss/aux_loss": 0.04819200746715069, "loss/crossentropy": 3.2286102890968325, "loss/logits": 1.2171394854784012, "step": 3280 }, { "epoch": 0.0329, "grad_norm": 6.75, "grad_norm_var": 0.19798177083333332, "learning_rate": 0.0003, "loss": 15.677, "loss/aux_loss": 0.048186296969652175, "loss/crossentropy": 3.0580495953559876, "loss/logits": 1.2661554515361786, "step": 3290 }, { "epoch": 0.033, "grad_norm": 7.0, "grad_norm_var": 76.57779541015626, "learning_rate": 0.0003, "loss": 15.86, "loss/aux_loss": 0.04821221027523279, "loss/crossentropy": 2.969318687915802, "loss/logits": 1.2122164875268937, "step": 3300 }, { "epoch": 0.0331, "grad_norm": 7.09375, "grad_norm_var": 0.06282145182291667, "learning_rate": 0.0003, "loss": 15.7174, "loss/aux_loss": 0.048201913200318816, "loss/crossentropy": 3.2775181770324706, "loss/logits": 1.2739900410175324, "step": 3310 }, { "epoch": 0.0332, "grad_norm": 7.375, "grad_norm_var": 0.08489583333333334, "learning_rate": 0.0003, "loss": 15.6614, "loss/aux_loss": 0.048193711787462234, "loss/crossentropy": 3.079313504695892, "loss/logits": 1.2485756576061249, "step": 3320 }, { "epoch": 0.0333, "grad_norm": 7.125, "grad_norm_var": 0.068994140625, "learning_rate": 0.0003, "loss": 15.8517, "loss/aux_loss": 0.048206409066915513, "loss/crossentropy": 3.030042564868927, "loss/logits": 1.1876587241888046, "step": 3330 }, { "epoch": 0.0334, "grad_norm": 7.625, "grad_norm_var": 0.196337890625, "learning_rate": 0.0003, "loss": 15.6156, "loss/aux_loss": 0.04819293972104788, "loss/crossentropy": 3.024798274040222, "loss/logits": 1.2279581785202027, "step": 3340 }, { "epoch": 0.0335, "grad_norm": 8.1875, "grad_norm_var": 0.192822265625, "learning_rate": 0.0003, "loss": 15.6715, "loss/aux_loss": 0.048190113715827466, "loss/crossentropy": 3.1095346808433533, "loss/logits": 1.2218665778636932, "step": 3350 }, { "epoch": 0.0336, "grad_norm": 7.875, "grad_norm_var": 0.8559733072916667, "learning_rate": 0.0003, "loss": 15.4338, "loss/aux_loss": 0.04819390587508678, "loss/crossentropy": 3.0634355664253237, "loss/logits": 1.2227939546108246, "step": 3360 }, { "epoch": 0.0337, "grad_norm": 6.84375, "grad_norm_var": 0.8440104166666667, "learning_rate": 0.0003, "loss": 15.6555, "loss/aux_loss": 0.04817529227584601, "loss/crossentropy": 3.2624236226081846, "loss/logits": 1.2496430993080139, "step": 3370 }, { "epoch": 0.0338, "grad_norm": 7.25, "grad_norm_var": 0.08365478515625, "learning_rate": 0.0003, "loss": 15.7475, "loss/aux_loss": 0.048181839287281036, "loss/crossentropy": 3.000634413957596, "loss/logits": 1.2105579853057862, "step": 3380 }, { "epoch": 0.0339, "grad_norm": 7.71875, "grad_norm_var": 0.118212890625, "learning_rate": 0.0003, "loss": 15.878, "loss/aux_loss": 0.04818106349557638, "loss/crossentropy": 3.1646122694015504, "loss/logits": 1.2451474606990813, "step": 3390 }, { "epoch": 0.034, "grad_norm": 8.125, "grad_norm_var": 0.15712483723958334, "learning_rate": 0.0003, "loss": 15.5974, "loss/aux_loss": 0.04819212630391121, "loss/crossentropy": 3.0883402824401855, "loss/logits": 1.231631088256836, "step": 3400 }, { "epoch": 0.0341, "grad_norm": 7.90625, "grad_norm_var": 43.145894368489586, "learning_rate": 0.0003, "loss": 15.6349, "loss/aux_loss": 0.048203271254897116, "loss/crossentropy": 3.0851239562034607, "loss/logits": 1.2473519384860992, "step": 3410 }, { "epoch": 0.0342, "grad_norm": 8.125, "grad_norm_var": 42.346354166666664, "learning_rate": 0.0003, "loss": 15.6873, "loss/aux_loss": 0.04819300062954426, "loss/crossentropy": 3.2211881279945374, "loss/logits": 1.2318155229091645, "step": 3420 }, { "epoch": 0.0343, "grad_norm": 7.28125, "grad_norm_var": 0.29765218098958335, "learning_rate": 0.0003, "loss": 15.6778, "loss/aux_loss": 0.04818199146538973, "loss/crossentropy": 3.163746166229248, "loss/logits": 1.2294625520706177, "step": 3430 }, { "epoch": 0.0344, "grad_norm": 6.875, "grad_norm_var": 0.31027018229166664, "learning_rate": 0.0003, "loss": 15.859, "loss/aux_loss": 0.04818481933325529, "loss/crossentropy": 3.2157267451286318, "loss/logits": 1.2203275740146637, "step": 3440 }, { "epoch": 0.0345, "grad_norm": 7.46875, "grad_norm_var": 0.4372355143229167, "learning_rate": 0.0003, "loss": 15.4943, "loss/aux_loss": 0.04818508345633745, "loss/crossentropy": 2.981385588645935, "loss/logits": 1.2513148784637451, "step": 3450 }, { "epoch": 0.0346, "grad_norm": 7.96875, "grad_norm_var": 0.27421875, "learning_rate": 0.0003, "loss": 15.6128, "loss/aux_loss": 0.048185240291059014, "loss/crossentropy": 3.065082919597626, "loss/logits": 1.2668458700180054, "step": 3460 }, { "epoch": 0.0347, "grad_norm": 7.34375, "grad_norm_var": 0.0998046875, "learning_rate": 0.0003, "loss": 15.6479, "loss/aux_loss": 0.04818276725709438, "loss/crossentropy": 3.196527397632599, "loss/logits": 1.2704426288604735, "step": 3470 }, { "epoch": 0.0348, "grad_norm": 7.53125, "grad_norm_var": 77.90299072265626, "learning_rate": 0.0003, "loss": 15.213, "loss/aux_loss": 0.04820560179650783, "loss/crossentropy": 3.0156648635864256, "loss/logits": 1.1708497077226638, "step": 3480 }, { "epoch": 0.0349, "grad_norm": 6.5, "grad_norm_var": 0.52232666015625, "learning_rate": 0.0003, "loss": 15.5807, "loss/aux_loss": 0.04817814268171787, "loss/crossentropy": 3.3053033113479615, "loss/logits": 1.2551276683807373, "step": 3490 }, { "epoch": 0.035, "grad_norm": 7.25, "grad_norm_var": 0.22096354166666668, "learning_rate": 0.0003, "loss": 15.4642, "loss/aux_loss": 0.048177143558859825, "loss/crossentropy": 3.1717012524604797, "loss/logits": 1.2090398788452148, "step": 3500 }, { "epoch": 0.0351, "grad_norm": 7.34375, "grad_norm_var": 0.17125244140625, "learning_rate": 0.0003, "loss": 15.469, "loss/aux_loss": 0.04816991053521633, "loss/crossentropy": 3.0794217944145204, "loss/logits": 1.2378638923168181, "step": 3510 }, { "epoch": 0.0352, "grad_norm": 7.34375, "grad_norm_var": 0.5747233072916667, "learning_rate": 0.0003, "loss": 15.3886, "loss/aux_loss": 0.04817315954715014, "loss/crossentropy": 3.1408966541290284, "loss/logits": 1.1839520275592803, "step": 3520 }, { "epoch": 0.0353, "grad_norm": 8.375, "grad_norm_var": 0.579541015625, "learning_rate": 0.0003, "loss": 15.4735, "loss/aux_loss": 0.04817787241190672, "loss/crossentropy": 3.1789328932762144, "loss/logits": 1.241087591648102, "step": 3530 }, { "epoch": 0.0354, "grad_norm": 7.1875, "grad_norm_var": 0.35953369140625, "learning_rate": 0.0003, "loss": 15.4254, "loss/aux_loss": 0.04818295389413833, "loss/crossentropy": 3.1350058197975157, "loss/logits": 1.218758872151375, "step": 3540 }, { "epoch": 0.0355, "grad_norm": 7.5, "grad_norm_var": 0.14218343098958333, "learning_rate": 0.0003, "loss": 15.4481, "loss/aux_loss": 0.048182402923703196, "loss/crossentropy": 3.1021770238876343, "loss/logits": 1.157341206073761, "step": 3550 }, { "epoch": 0.0356, "grad_norm": 7.5625, "grad_norm_var": 0.220947265625, "learning_rate": 0.0003, "loss": 15.6278, "loss/aux_loss": 0.048180959187448025, "loss/crossentropy": 3.1709139943122864, "loss/logits": 1.2263819336891175, "step": 3560 }, { "epoch": 0.0357, "grad_norm": 7.375, "grad_norm_var": 0.3221638997395833, "learning_rate": 0.0003, "loss": 15.3394, "loss/aux_loss": 0.04818211700767279, "loss/crossentropy": 3.2739925384521484, "loss/logits": 1.2379388093948365, "step": 3570 }, { "epoch": 0.0358, "grad_norm": 6.90625, "grad_norm_var": 0.8153605143229167, "learning_rate": 0.0003, "loss": 15.5107, "loss/aux_loss": 0.04817323740571737, "loss/crossentropy": 3.136904263496399, "loss/logits": 1.2624209761619567, "step": 3580 }, { "epoch": 0.0359, "grad_norm": 7.4375, "grad_norm_var": 0.128125, "learning_rate": 0.0003, "loss": 15.654, "loss/aux_loss": 0.04819139763712883, "loss/crossentropy": 3.1375385880470277, "loss/logits": 1.2516057163476944, "step": 3590 }, { "epoch": 0.036, "grad_norm": 6.75, "grad_norm_var": 0.09338785807291666, "learning_rate": 0.0003, "loss": 15.5777, "loss/aux_loss": 0.04818712417036295, "loss/crossentropy": 3.3446964859962462, "loss/logits": 1.214635932445526, "step": 3600 }, { "epoch": 0.0361, "grad_norm": 7.46875, "grad_norm_var": 0.10930582682291666, "learning_rate": 0.0003, "loss": 15.4192, "loss/aux_loss": 0.04818251971155405, "loss/crossentropy": 3.177507519721985, "loss/logits": 1.2218758046627045, "step": 3610 }, { "epoch": 0.0362, "grad_norm": 8.75, "grad_norm_var": 0.21578369140625, "learning_rate": 0.0003, "loss": 15.6698, "loss/aux_loss": 0.048180416226387024, "loss/crossentropy": 3.1352601170539858, "loss/logits": 1.2688252985477448, "step": 3620 }, { "epoch": 0.0363, "grad_norm": 7.75, "grad_norm_var": 0.240625, "learning_rate": 0.0003, "loss": 15.4033, "loss/aux_loss": 0.04819142427295446, "loss/crossentropy": 3.1684580206871034, "loss/logits": 1.2518825322389602, "step": 3630 }, { "epoch": 0.0364, "grad_norm": 7.25, "grad_norm_var": 0.072509765625, "learning_rate": 0.0003, "loss": 15.3378, "loss/aux_loss": 0.04819103125482797, "loss/crossentropy": 3.210425066947937, "loss/logits": 1.2049184322357178, "step": 3640 }, { "epoch": 0.0365, "grad_norm": 7.0625, "grad_norm_var": 49.72636311848958, "learning_rate": 0.0003, "loss": 15.2234, "loss/aux_loss": 0.048194903507828714, "loss/crossentropy": 3.2832493662834166, "loss/logits": 1.221068474650383, "step": 3650 }, { "epoch": 0.0366, "grad_norm": 9.125, "grad_norm_var": 0.3234375, "learning_rate": 0.0003, "loss": 15.3403, "loss/aux_loss": 0.04818172939121723, "loss/crossentropy": 3.280470097064972, "loss/logits": 1.229696273803711, "step": 3660 }, { "epoch": 0.0367, "grad_norm": 7.5, "grad_norm_var": 0.39843343098958334, "learning_rate": 0.0003, "loss": 15.3494, "loss/aux_loss": 0.048176801204681395, "loss/crossentropy": 3.0316362023353576, "loss/logits": 1.2116109132766724, "step": 3670 }, { "epoch": 0.0368, "grad_norm": 7.84375, "grad_norm_var": 0.3272786458333333, "learning_rate": 0.0003, "loss": 15.4476, "loss/aux_loss": 0.04817160293459892, "loss/crossentropy": 3.2188750505447388, "loss/logits": 1.2286329954862594, "step": 3680 }, { "epoch": 0.0369, "grad_norm": 8.5625, "grad_norm_var": 0.23899332682291666, "learning_rate": 0.0003, "loss": 15.3283, "loss/aux_loss": 0.04817473813891411, "loss/crossentropy": 3.1428863406181335, "loss/logits": 1.2700409144163132, "step": 3690 }, { "epoch": 0.037, "grad_norm": 7.375, "grad_norm_var": 0.1796875, "learning_rate": 0.0003, "loss": 15.2114, "loss/aux_loss": 0.048194908909499644, "loss/crossentropy": 2.9049007534980773, "loss/logits": 1.209693717956543, "step": 3700 }, { "epoch": 0.0371, "grad_norm": 7.15625, "grad_norm_var": 4.603153483072917, "learning_rate": 0.0003, "loss": 15.4273, "loss/aux_loss": 0.04818948246538639, "loss/crossentropy": 3.1089539527893066, "loss/logits": 1.231841367483139, "step": 3710 }, { "epoch": 0.0372, "grad_norm": 7.5625, "grad_norm_var": 0.122119140625, "learning_rate": 0.0003, "loss": 15.3983, "loss/aux_loss": 0.04818321000784635, "loss/crossentropy": 2.9950480341911314, "loss/logits": 1.1607284903526307, "step": 3720 }, { "epoch": 0.0373, "grad_norm": 7.4375, "grad_norm_var": 0.15904947916666667, "learning_rate": 0.0003, "loss": 15.1891, "loss/aux_loss": 0.04817815236747265, "loss/crossentropy": 3.161143660545349, "loss/logits": 1.238188961148262, "step": 3730 }, { "epoch": 0.0374, "grad_norm": 7.03125, "grad_norm_var": 0.3502604166666667, "learning_rate": 0.0003, "loss": 15.3511, "loss/aux_loss": 0.04818747155368328, "loss/crossentropy": 3.086538052558899, "loss/logits": 1.1786428213119506, "step": 3740 }, { "epoch": 0.0375, "grad_norm": 7.46875, "grad_norm_var": 0.11708577473958333, "learning_rate": 0.0003, "loss": 15.4379, "loss/aux_loss": 0.04817507416009903, "loss/crossentropy": 3.203218102455139, "loss/logits": 1.2379136860370636, "step": 3750 }, { "epoch": 0.0376, "grad_norm": 7.03125, "grad_norm_var": 0.09334309895833333, "learning_rate": 0.0003, "loss": 15.1209, "loss/aux_loss": 0.048195258155465125, "loss/crossentropy": 2.8767175674438477, "loss/logits": 1.135795423388481, "step": 3760 }, { "epoch": 0.0377, "grad_norm": 8.0, "grad_norm_var": 0.5704386393229167, "learning_rate": 0.0003, "loss": 15.1931, "loss/aux_loss": 0.04818210508674383, "loss/crossentropy": 3.0543838024139403, "loss/logits": 1.1589192599058151, "step": 3770 }, { "epoch": 0.0378, "grad_norm": 8.6875, "grad_norm_var": 0.9666015625, "learning_rate": 0.0003, "loss": 15.3202, "loss/aux_loss": 0.04818203579634428, "loss/crossentropy": 3.1746195673942568, "loss/logits": 1.2106265246868133, "step": 3780 }, { "epoch": 0.0379, "grad_norm": 9.125, "grad_norm_var": 54.760921223958334, "learning_rate": 0.0003, "loss": 15.0472, "loss/aux_loss": 0.04817959927022457, "loss/crossentropy": 3.1303970336914064, "loss/logits": 1.1739704608917236, "step": 3790 }, { "epoch": 0.038, "grad_norm": 7.28125, "grad_norm_var": 55.79407552083333, "learning_rate": 0.0003, "loss": 15.335, "loss/aux_loss": 0.048162929527461526, "loss/crossentropy": 3.1775804996490478, "loss/logits": 1.2157859086990357, "step": 3800 }, { "epoch": 0.0381, "grad_norm": 7.4375, "grad_norm_var": 0.09794514973958333, "learning_rate": 0.0003, "loss": 15.4016, "loss/aux_loss": 0.048171533085405824, "loss/crossentropy": 3.184191071987152, "loss/logits": 1.200016838312149, "step": 3810 }, { "epoch": 0.0382, "grad_norm": 8.125, "grad_norm_var": 0.15972900390625, "learning_rate": 0.0003, "loss": 15.2867, "loss/aux_loss": 0.048174711503088476, "loss/crossentropy": 3.093715155124664, "loss/logits": 1.1898449569940568, "step": 3820 }, { "epoch": 0.0383, "grad_norm": 7.53125, "grad_norm_var": 0.28216145833333334, "learning_rate": 0.0003, "loss": 15.179, "loss/aux_loss": 0.04817448034882545, "loss/crossentropy": 3.1919341683387756, "loss/logits": 1.2327097624540329, "step": 3830 }, { "epoch": 0.0384, "grad_norm": 16.25, "grad_norm_var": 4.76041259765625, "learning_rate": 0.0003, "loss": 15.1992, "loss/aux_loss": 0.048165909759700296, "loss/crossentropy": 3.132761836051941, "loss/logits": 1.1952956855297088, "step": 3840 }, { "epoch": 0.0385, "grad_norm": 7.96875, "grad_norm_var": 5.12109375, "learning_rate": 0.0003, "loss": 15.2487, "loss/aux_loss": 0.0481891430914402, "loss/crossentropy": 2.9917232036590575, "loss/logits": 1.1668372660875321, "step": 3850 }, { "epoch": 0.0386, "grad_norm": 7.9375, "grad_norm_var": 0.13531494140625, "learning_rate": 0.0003, "loss": 15.1914, "loss/aux_loss": 0.0481577729806304, "loss/crossentropy": 3.071371626853943, "loss/logits": 1.2018966376781464, "step": 3860 }, { "epoch": 0.0387, "grad_norm": 7.46875, "grad_norm_var": 0.12994791666666666, "learning_rate": 0.0003, "loss": 15.0261, "loss/aux_loss": 0.04815917555242777, "loss/crossentropy": 3.0264050543308256, "loss/logits": 1.1740915864706039, "step": 3870 }, { "epoch": 0.0388, "grad_norm": 7.4375, "grad_norm_var": 0.2575154622395833, "learning_rate": 0.0003, "loss": 15.1013, "loss/aux_loss": 0.04817144125699997, "loss/crossentropy": 3.0360918402671815, "loss/logits": 1.152667647600174, "step": 3880 }, { "epoch": 0.0389, "grad_norm": 8.1875, "grad_norm_var": 0.21964518229166666, "learning_rate": 0.0003, "loss": 15.0667, "loss/aux_loss": 0.04816736020147801, "loss/crossentropy": 3.1190317153930662, "loss/logits": 1.185880446434021, "step": 3890 }, { "epoch": 0.039, "grad_norm": 9.0625, "grad_norm_var": 0.39088134765625, "learning_rate": 0.0003, "loss": 15.1793, "loss/aux_loss": 0.04816778711974621, "loss/crossentropy": 3.00163277387619, "loss/logits": 1.1773586809635161, "step": 3900 }, { "epoch": 0.0391, "grad_norm": 7.65625, "grad_norm_var": 0.35870768229166666, "learning_rate": 0.0003, "loss": 15.1278, "loss/aux_loss": 0.04818731751292944, "loss/crossentropy": 2.939511752128601, "loss/logits": 1.2299345314502717, "step": 3910 }, { "epoch": 0.0392, "grad_norm": 7.5, "grad_norm_var": 0.09436442057291666, "learning_rate": 0.0003, "loss": 15.0897, "loss/aux_loss": 0.0481788320466876, "loss/crossentropy": 3.0653002142906187, "loss/logits": 1.1968173742294312, "step": 3920 }, { "epoch": 0.0393, "grad_norm": 7.28125, "grad_norm_var": 0.16145426432291668, "learning_rate": 0.0003, "loss": 15.0164, "loss/aux_loss": 0.04818780794739723, "loss/crossentropy": 2.995146155357361, "loss/logits": 1.155534029006958, "step": 3930 }, { "epoch": 0.0394, "grad_norm": 7.78125, "grad_norm_var": 0.11588134765625, "learning_rate": 0.0003, "loss": 15.0909, "loss/aux_loss": 0.04817194156348705, "loss/crossentropy": 2.9778522849082947, "loss/logits": 1.188610589504242, "step": 3940 }, { "epoch": 0.0395, "grad_norm": 8.5, "grad_norm_var": 0.22825520833333332, "learning_rate": 0.0003, "loss": 14.9485, "loss/aux_loss": 0.04817061126232147, "loss/crossentropy": 3.0706464409828187, "loss/logits": 1.1276392668485642, "step": 3950 }, { "epoch": 0.0396, "grad_norm": 7.59375, "grad_norm_var": 0.1765625, "learning_rate": 0.0003, "loss": 15.1761, "loss/aux_loss": 0.04817082397639751, "loss/crossentropy": 3.0294368386268617, "loss/logits": 1.1730817139148713, "step": 3960 }, { "epoch": 0.0397, "grad_norm": 7.78125, "grad_norm_var": 0.37659098307291666, "learning_rate": 0.0003, "loss": 14.9544, "loss/aux_loss": 0.048177217692136766, "loss/crossentropy": 3.003262734413147, "loss/logits": 1.1717498630285264, "step": 3970 }, { "epoch": 0.0398, "grad_norm": 7.59375, "grad_norm_var": 0.46226806640625, "learning_rate": 0.0003, "loss": 15.0399, "loss/aux_loss": 0.04816052261739969, "loss/crossentropy": 3.108014762401581, "loss/logits": 1.1962302416563033, "step": 3980 }, { "epoch": 0.0399, "grad_norm": 8.0625, "grad_norm_var": 0.15533854166666666, "learning_rate": 0.0003, "loss": 15.0799, "loss/aux_loss": 0.04815749432891607, "loss/crossentropy": 3.185481405258179, "loss/logits": 1.2161674737930297, "step": 3990 }, { "epoch": 0.04, "grad_norm": 8.25, "grad_norm_var": 0.155712890625, "learning_rate": 0.0003, "loss": 15.0788, "loss/aux_loss": 0.0481644194573164, "loss/crossentropy": 3.1585240364074707, "loss/logits": 1.1965474605560302, "step": 4000 }, { "epoch": 0.0401, "grad_norm": 7.21875, "grad_norm_var": 0.21419270833333334, "learning_rate": 0.0003, "loss": 15.0548, "loss/aux_loss": 0.04816298447549343, "loss/crossentropy": 3.1302665889263155, "loss/logits": 1.1530128061771392, "step": 4010 }, { "epoch": 0.0402, "grad_norm": 7.53125, "grad_norm_var": 0.13173421223958334, "learning_rate": 0.0003, "loss": 15.0497, "loss/aux_loss": 0.04816345106810331, "loss/crossentropy": 3.142714560031891, "loss/logits": 1.1879263758659362, "step": 4020 }, { "epoch": 0.0403, "grad_norm": 7.6875, "grad_norm_var": 0.06339518229166667, "learning_rate": 0.0003, "loss": 14.9433, "loss/aux_loss": 0.04815952125936747, "loss/crossentropy": 3.0843304634094237, "loss/logits": 1.2199938654899598, "step": 4030 }, { "epoch": 0.0404, "grad_norm": 6.875, "grad_norm_var": 0.15402018229166667, "learning_rate": 0.0003, "loss": 15.19, "loss/aux_loss": 0.04817276708781719, "loss/crossentropy": 3.128165376186371, "loss/logits": 1.1765313237905501, "step": 4040 }, { "epoch": 0.0405, "grad_norm": 30.375, "grad_norm_var": 32.932275390625, "learning_rate": 0.0003, "loss": 14.8461, "loss/aux_loss": 0.048168274387717244, "loss/crossentropy": 3.005221629142761, "loss/logits": 1.1654815077781677, "step": 4050 }, { "epoch": 0.0406, "grad_norm": 7.59375, "grad_norm_var": 32.006754557291664, "learning_rate": 0.0003, "loss": 15.0802, "loss/aux_loss": 0.04817748311907053, "loss/crossentropy": 3.095579755306244, "loss/logits": 1.1656386017799378, "step": 4060 }, { "epoch": 0.0407, "grad_norm": 8.6875, "grad_norm_var": 1.2711873372395834, "learning_rate": 0.0003, "loss": 15.1126, "loss/aux_loss": 0.04816674739122391, "loss/crossentropy": 3.1798906683921815, "loss/logits": 1.207847249507904, "step": 4070 }, { "epoch": 0.0408, "grad_norm": 7.875, "grad_norm_var": 1.26412353515625, "learning_rate": 0.0003, "loss": 15.1471, "loss/aux_loss": 0.04817005805671215, "loss/crossentropy": 3.125558304786682, "loss/logits": 1.1919424772262572, "step": 4080 }, { "epoch": 0.0409, "grad_norm": 7.5625, "grad_norm_var": 0.2809529622395833, "learning_rate": 0.0003, "loss": 14.853, "loss/aux_loss": 0.048167549446225165, "loss/crossentropy": 3.097035455703735, "loss/logits": 1.1527066469192504, "step": 4090 }, { "epoch": 0.041, "grad_norm": 7.4375, "grad_norm_var": 0.690087890625, "learning_rate": 0.0003, "loss": 14.8294, "loss/aux_loss": 0.04817544762045145, "loss/crossentropy": 2.9536795616149902, "loss/logits": 1.1623014092445374, "step": 4100 }, { "epoch": 0.0411, "grad_norm": 7.90625, "grad_norm_var": 1.3970011393229167, "learning_rate": 0.0003, "loss": 15.0244, "loss/aux_loss": 0.048174246400594714, "loss/crossentropy": 3.1965074062347414, "loss/logits": 1.162026983499527, "step": 4110 }, { "epoch": 0.0412, "grad_norm": 8.1875, "grad_norm_var": 0.08495686848958334, "learning_rate": 0.0003, "loss": 14.9105, "loss/aux_loss": 0.048172399029135705, "loss/crossentropy": 3.0823826670646666, "loss/logits": 1.199626660346985, "step": 4120 }, { "epoch": 0.0413, "grad_norm": 7.40625, "grad_norm_var": 0.15282796223958334, "learning_rate": 0.0003, "loss": 15.0299, "loss/aux_loss": 0.048171821609139444, "loss/crossentropy": 3.1277252316474913, "loss/logits": 1.1875766038894653, "step": 4130 }, { "epoch": 0.0414, "grad_norm": 7.875, "grad_norm_var": 0.18984375, "learning_rate": 0.0003, "loss": 14.9988, "loss/aux_loss": 0.04816291127353907, "loss/crossentropy": 2.999015522003174, "loss/logits": 1.1738766431808472, "step": 4140 }, { "epoch": 0.0415, "grad_norm": 7.59375, "grad_norm_var": 0.2892578125, "learning_rate": 0.0003, "loss": 15.1225, "loss/aux_loss": 0.04815590269863605, "loss/crossentropy": 3.1540396094322203, "loss/logits": 1.2201361060142517, "step": 4150 }, { "epoch": 0.0416, "grad_norm": 8.0625, "grad_norm_var": 0.25598551432291666, "learning_rate": 0.0003, "loss": 14.6316, "loss/aux_loss": 0.04817016571760178, "loss/crossentropy": 3.0777355790138246, "loss/logits": 1.148938202857971, "step": 4160 }, { "epoch": 0.0417, "grad_norm": 7.21875, "grad_norm_var": 0.8997395833333334, "learning_rate": 0.0003, "loss": 14.8675, "loss/aux_loss": 0.04817523639649153, "loss/crossentropy": 3.154482841491699, "loss/logits": 1.1807423561811448, "step": 4170 }, { "epoch": 0.0418, "grad_norm": 7.875, "grad_norm_var": 0.19582926432291667, "learning_rate": 0.0003, "loss": 14.7081, "loss/aux_loss": 0.048159117065370086, "loss/crossentropy": 3.1314535260200502, "loss/logits": 1.1639139771461486, "step": 4180 }, { "epoch": 0.0419, "grad_norm": 7.90625, "grad_norm_var": 0.18684488932291668, "learning_rate": 0.0003, "loss": 14.9338, "loss/aux_loss": 0.048169083148241046, "loss/crossentropy": 3.0862425684928896, "loss/logits": 1.1751366287469864, "step": 4190 }, { "epoch": 0.042, "grad_norm": 7.625, "grad_norm_var": 0.11756184895833334, "learning_rate": 0.0003, "loss": 14.87, "loss/aux_loss": 0.04817427862435579, "loss/crossentropy": 3.106311786174774, "loss/logits": 1.1633146226406097, "step": 4200 }, { "epoch": 0.0421, "grad_norm": 7.5625, "grad_norm_var": 13.382124837239584, "learning_rate": 0.0003, "loss": 14.8649, "loss/aux_loss": 0.0481806568801403, "loss/crossentropy": 2.915513515472412, "loss/logits": 1.1206313014030456, "step": 4210 }, { "epoch": 0.0422, "grad_norm": 7.4375, "grad_norm_var": 13.166520182291666, "learning_rate": 0.0003, "loss": 14.8359, "loss/aux_loss": 0.04816134050488472, "loss/crossentropy": 3.0834913730621336, "loss/logits": 1.1465971380472184, "step": 4220 }, { "epoch": 0.0423, "grad_norm": 7.34375, "grad_norm_var": 0.16795247395833332, "learning_rate": 0.0003, "loss": 14.9675, "loss/aux_loss": 0.04815997164696455, "loss/crossentropy": 3.181843435764313, "loss/logits": 1.1813198417425155, "step": 4230 }, { "epoch": 0.0424, "grad_norm": 8.1875, "grad_norm_var": 0.26461181640625, "learning_rate": 0.0003, "loss": 14.9965, "loss/aux_loss": 0.048162421025335786, "loss/crossentropy": 3.0605735301971437, "loss/logits": 1.2021290510892868, "step": 4240 }, { "epoch": 0.0425, "grad_norm": 9.8125, "grad_norm_var": 0.52867431640625, "learning_rate": 0.0003, "loss": 14.8701, "loss/aux_loss": 0.04816258866339922, "loss/crossentropy": 3.0808632254600523, "loss/logits": 1.213870882987976, "step": 4250 }, { "epoch": 0.0426, "grad_norm": 7.3125, "grad_norm_var": 0.35133056640625, "learning_rate": 0.0003, "loss": 14.9504, "loss/aux_loss": 0.04816483333706856, "loss/crossentropy": 3.2533419847488405, "loss/logits": 1.2143194258213044, "step": 4260 }, { "epoch": 0.0427, "grad_norm": 8.25, "grad_norm_var": 0.18931884765625, "learning_rate": 0.0003, "loss": 14.8317, "loss/aux_loss": 0.04816996194422245, "loss/crossentropy": 2.921747499704361, "loss/logits": 1.1640391945838928, "step": 4270 }, { "epoch": 0.0428, "grad_norm": 8.3125, "grad_norm_var": 0.29081624348958335, "learning_rate": 0.0003, "loss": 15.0626, "loss/aux_loss": 0.04816504456102848, "loss/crossentropy": 3.0316648125648498, "loss/logits": 1.1759659737348556, "step": 4280 }, { "epoch": 0.0429, "grad_norm": 7.125, "grad_norm_var": 0.26145833333333335, "learning_rate": 0.0003, "loss": 14.7015, "loss/aux_loss": 0.04817175418138504, "loss/crossentropy": 3.0507488489151, "loss/logits": 1.181325948238373, "step": 4290 }, { "epoch": 0.043, "grad_norm": 7.78125, "grad_norm_var": 0.21343994140625, "learning_rate": 0.0003, "loss": 14.8985, "loss/aux_loss": 0.04816540405154228, "loss/crossentropy": 3.0950448393821715, "loss/logits": 1.2072101056575775, "step": 4300 }, { "epoch": 0.0431, "grad_norm": 8.1875, "grad_norm_var": 0.12737223307291667, "learning_rate": 0.0003, "loss": 14.9397, "loss/aux_loss": 0.048161011561751364, "loss/crossentropy": 2.983470690250397, "loss/logits": 1.1625974208116532, "step": 4310 }, { "epoch": 0.0432, "grad_norm": 8.375, "grad_norm_var": 0.17089436848958334, "learning_rate": 0.0003, "loss": 14.9934, "loss/aux_loss": 0.04816465843468905, "loss/crossentropy": 3.100528526306152, "loss/logits": 1.2168638974428176, "step": 4320 }, { "epoch": 0.0433, "grad_norm": 7.09375, "grad_norm_var": 0.15452067057291666, "learning_rate": 0.0003, "loss": 14.8999, "loss/aux_loss": 0.04815626051276922, "loss/crossentropy": 3.1541409373283384, "loss/logits": 1.1997069358825683, "step": 4330 }, { "epoch": 0.0434, "grad_norm": 7.9375, "grad_norm_var": 1.5087198893229166, "learning_rate": 0.0003, "loss": 14.7317, "loss/aux_loss": 0.04816477261483669, "loss/crossentropy": 3.048540270328522, "loss/logits": 1.2071462273597717, "step": 4340 }, { "epoch": 0.0435, "grad_norm": 7.8125, "grad_norm_var": 0.10623372395833333, "learning_rate": 0.0003, "loss": 14.713, "loss/aux_loss": 0.04816347248852253, "loss/crossentropy": 2.9770607709884644, "loss/logits": 1.1550648272037507, "step": 4350 }, { "epoch": 0.0436, "grad_norm": 8.0625, "grad_norm_var": 0.13566080729166666, "learning_rate": 0.0003, "loss": 15.0412, "loss/aux_loss": 0.04816342815756798, "loss/crossentropy": 2.9482832670211794, "loss/logits": 1.1551517724990845, "step": 4360 }, { "epoch": 0.0437, "grad_norm": 8.0625, "grad_norm_var": 0.10624593098958333, "learning_rate": 0.0003, "loss": 14.8035, "loss/aux_loss": 0.04816410057246685, "loss/crossentropy": 3.0711460292339323, "loss/logits": 1.1584541529417038, "step": 4370 }, { "epoch": 0.0438, "grad_norm": 7.375, "grad_norm_var": 0.20885009765625, "learning_rate": 0.0003, "loss": 14.7389, "loss/aux_loss": 0.0481667784973979, "loss/crossentropy": 2.9650609135627746, "loss/logits": 1.1401590436697007, "step": 4380 }, { "epoch": 0.0439, "grad_norm": 7.75, "grad_norm_var": 0.15338134765625, "learning_rate": 0.0003, "loss": 14.5523, "loss/aux_loss": 0.04817138686776161, "loss/crossentropy": 3.0582551836967466, "loss/logits": 1.0985677868127823, "step": 4390 }, { "epoch": 0.044, "grad_norm": 7.875, "grad_norm_var": 0.12655843098958333, "learning_rate": 0.0003, "loss": 14.7145, "loss/aux_loss": 0.04816395286470652, "loss/crossentropy": 3.0119667410850526, "loss/logits": 1.1839350372552873, "step": 4400 }, { "epoch": 0.0441, "grad_norm": 8.5, "grad_norm_var": 0.13058268229166667, "learning_rate": 0.0003, "loss": 14.7486, "loss/aux_loss": 0.04816783182322979, "loss/crossentropy": 2.9910679340362547, "loss/logits": 1.1801847249269486, "step": 4410 }, { "epoch": 0.0442, "grad_norm": 18.625, "grad_norm_var": 7.329557291666666, "learning_rate": 0.0003, "loss": 14.7453, "loss/aux_loss": 0.04816215075552464, "loss/crossentropy": 2.981612813472748, "loss/logits": 1.183125939965248, "step": 4420 }, { "epoch": 0.0443, "grad_norm": 7.0625, "grad_norm_var": 7.44742431640625, "learning_rate": 0.0003, "loss": 14.626, "loss/aux_loss": 0.04817271661013365, "loss/crossentropy": 2.9835289478302003, "loss/logits": 1.158128410577774, "step": 4430 }, { "epoch": 0.0444, "grad_norm": 7.90625, "grad_norm_var": 0.14615478515625, "learning_rate": 0.0003, "loss": 14.5175, "loss/aux_loss": 0.04816505704075098, "loss/crossentropy": 3.097472053766251, "loss/logits": 1.168840977549553, "step": 4440 }, { "epoch": 0.0445, "grad_norm": 12.4375, "grad_norm_var": 1.418994140625, "learning_rate": 0.0003, "loss": 14.561, "loss/aux_loss": 0.04816557168960571, "loss/crossentropy": 3.1079689621925355, "loss/logits": 1.1443527430295943, "step": 4450 }, { "epoch": 0.0446, "grad_norm": 8.625, "grad_norm_var": 1.31578369140625, "learning_rate": 0.0003, "loss": 14.674, "loss/aux_loss": 0.04816855322569609, "loss/crossentropy": 2.927138316631317, "loss/logits": 1.148129415512085, "step": 4460 }, { "epoch": 0.0447, "grad_norm": 7.75, "grad_norm_var": 0.18723551432291666, "learning_rate": 0.0003, "loss": 14.5317, "loss/aux_loss": 0.04815917164087295, "loss/crossentropy": 3.1104054749011993, "loss/logits": 1.1480105966329575, "step": 4470 }, { "epoch": 0.0448, "grad_norm": 8.25, "grad_norm_var": 0.16847330729166668, "learning_rate": 0.0003, "loss": 14.4452, "loss/aux_loss": 0.048180781118571755, "loss/crossentropy": 2.762672412395477, "loss/logits": 1.0904426872730255, "step": 4480 }, { "epoch": 0.0449, "grad_norm": 7.40625, "grad_norm_var": 0.16334635416666668, "learning_rate": 0.0003, "loss": 14.6466, "loss/aux_loss": 0.0481576981022954, "loss/crossentropy": 3.10465407371521, "loss/logits": 1.152064311504364, "step": 4490 }, { "epoch": 0.045, "grad_norm": 7.125, "grad_norm_var": 0.09772135416666666, "learning_rate": 0.0003, "loss": 14.4935, "loss/aux_loss": 0.048153439350426194, "loss/crossentropy": 2.9880860924720762, "loss/logits": 1.1234510779380797, "step": 4500 }, { "epoch": 0.0451, "grad_norm": 8.1875, "grad_norm_var": 0.47294514973958335, "learning_rate": 0.0003, "loss": 14.5779, "loss/aux_loss": 0.048160174302756785, "loss/crossentropy": 3.0695066869258882, "loss/logits": 1.1693350702524186, "step": 4510 }, { "epoch": 0.0452, "grad_norm": 8.0625, "grad_norm_var": 0.54361572265625, "learning_rate": 0.0003, "loss": 14.5628, "loss/aux_loss": 0.04815432522445917, "loss/crossentropy": 3.1152522921562196, "loss/logits": 1.1614423453807832, "step": 4520 }, { "epoch": 0.0453, "grad_norm": 7.34375, "grad_norm_var": 0.43072509765625, "learning_rate": 0.0003, "loss": 14.578, "loss/aux_loss": 0.04815590996295214, "loss/crossentropy": 3.122402215003967, "loss/logits": 1.191055852174759, "step": 4530 }, { "epoch": 0.0454, "grad_norm": 8.25, "grad_norm_var": 5.175028483072917, "learning_rate": 0.0003, "loss": 14.7617, "loss/aux_loss": 0.048156161420047285, "loss/crossentropy": 3.0762326240539553, "loss/logits": 1.1874168932437896, "step": 4540 }, { "epoch": 0.0455, "grad_norm": 9.125, "grad_norm_var": 1.2244099934895833, "learning_rate": 0.0003, "loss": 14.7042, "loss/aux_loss": 0.04815144389867783, "loss/crossentropy": 3.053194510936737, "loss/logits": 1.1742142677307128, "step": 4550 }, { "epoch": 0.0456, "grad_norm": 8.125, "grad_norm_var": 0.3241170247395833, "learning_rate": 0.0003, "loss": 14.5613, "loss/aux_loss": 0.04815953467041254, "loss/crossentropy": 3.0894832491874693, "loss/logits": 1.123066246509552, "step": 4560 }, { "epoch": 0.0457, "grad_norm": 7.71875, "grad_norm_var": 0.22732747395833333, "learning_rate": 0.0003, "loss": 14.6751, "loss/aux_loss": 0.048164136707782745, "loss/crossentropy": 3.278604805469513, "loss/logits": 1.1765309482812882, "step": 4570 }, { "epoch": 0.0458, "grad_norm": 7.875, "grad_norm_var": 0.2938761393229167, "learning_rate": 0.0003, "loss": 14.5222, "loss/aux_loss": 0.04816136136651039, "loss/crossentropy": 3.122606945037842, "loss/logits": 1.1583560228347778, "step": 4580 }, { "epoch": 0.0459, "grad_norm": 8.25, "grad_norm_var": 0.24804280598958334, "learning_rate": 0.0003, "loss": 14.666, "loss/aux_loss": 0.048167569935321806, "loss/crossentropy": 2.8724692463874817, "loss/logits": 1.1007645279169083, "step": 4590 }, { "epoch": 0.046, "grad_norm": 7.53125, "grad_norm_var": 0.08318684895833334, "learning_rate": 0.0003, "loss": 14.5749, "loss/aux_loss": 0.048159733042120935, "loss/crossentropy": 3.096747946739197, "loss/logits": 1.1529816329479217, "step": 4600 }, { "epoch": 0.0461, "grad_norm": 7.6875, "grad_norm_var": 0.08440348307291666, "learning_rate": 0.0003, "loss": 14.6826, "loss/aux_loss": 0.048148723877966405, "loss/crossentropy": 3.0729199647903442, "loss/logits": 1.2091837465763091, "step": 4610 }, { "epoch": 0.0462, "grad_norm": 8.125, "grad_norm_var": 0.05917561848958333, "learning_rate": 0.0003, "loss": 14.5881, "loss/aux_loss": 0.04815642535686493, "loss/crossentropy": 3.0322453498840334, "loss/logits": 1.1239049285650253, "step": 4620 }, { "epoch": 0.0463, "grad_norm": 7.4375, "grad_norm_var": 0.5194010416666667, "learning_rate": 0.0003, "loss": 14.5686, "loss/aux_loss": 0.048170761205255985, "loss/crossentropy": 3.0770667433738708, "loss/logits": 1.1368163347244262, "step": 4630 }, { "epoch": 0.0464, "grad_norm": 8.125, "grad_norm_var": 0.6132649739583333, "learning_rate": 0.0003, "loss": 14.3923, "loss/aux_loss": 0.048156014271080495, "loss/crossentropy": 3.1633099794387816, "loss/logits": 1.1499724864959717, "step": 4640 }, { "epoch": 0.0465, "grad_norm": 7.96875, "grad_norm_var": 1.9275349934895833, "learning_rate": 0.0003, "loss": 14.3876, "loss/aux_loss": 0.048154591023921965, "loss/crossentropy": 3.066124379634857, "loss/logits": 1.1568672150373458, "step": 4650 }, { "epoch": 0.0466, "grad_norm": 8.625, "grad_norm_var": 7.602848307291667, "learning_rate": 0.0003, "loss": 14.6209, "loss/aux_loss": 0.048150830902159214, "loss/crossentropy": 2.938699722290039, "loss/logits": 1.1516984760761262, "step": 4660 }, { "epoch": 0.0467, "grad_norm": 7.46875, "grad_norm_var": 6.106083170572917, "learning_rate": 0.0003, "loss": 14.6674, "loss/aux_loss": 0.04815071895718574, "loss/crossentropy": 3.0229847908020018, "loss/logits": 1.136454886198044, "step": 4670 }, { "epoch": 0.0468, "grad_norm": 7.5625, "grad_norm_var": 0.10810139973958334, "learning_rate": 0.0003, "loss": 14.8643, "loss/aux_loss": 0.04815872758626938, "loss/crossentropy": 2.9929285645484924, "loss/logits": 1.1554243832826614, "step": 4680 }, { "epoch": 0.0469, "grad_norm": 7.875, "grad_norm_var": 0.1236328125, "learning_rate": 0.0003, "loss": 14.5369, "loss/aux_loss": 0.04814963173121214, "loss/crossentropy": 3.0233195781707765, "loss/logits": 1.1462786018848419, "step": 4690 }, { "epoch": 0.047, "grad_norm": 7.96875, "grad_norm_var": 0.14599202473958334, "learning_rate": 0.0003, "loss": 14.6454, "loss/aux_loss": 0.04816053248941898, "loss/crossentropy": 2.9162982583045958, "loss/logits": 1.1257199048995972, "step": 4700 }, { "epoch": 0.0471, "grad_norm": 8.875, "grad_norm_var": 0.27884114583333336, "learning_rate": 0.0003, "loss": 14.4935, "loss/aux_loss": 0.04815351460129023, "loss/crossentropy": 3.166695535182953, "loss/logits": 1.173658263683319, "step": 4710 }, { "epoch": 0.0472, "grad_norm": 7.4375, "grad_norm_var": 0.27346598307291664, "learning_rate": 0.0003, "loss": 14.4925, "loss/aux_loss": 0.048155609704554084, "loss/crossentropy": 3.00724972486496, "loss/logits": 1.147817325592041, "step": 4720 }, { "epoch": 0.0473, "grad_norm": 7.53125, "grad_norm_var": 0.2710774739583333, "learning_rate": 0.0003, "loss": 14.3945, "loss/aux_loss": 0.04815868772566319, "loss/crossentropy": 3.056724321842194, "loss/logits": 1.1427915573120118, "step": 4730 }, { "epoch": 0.0474, "grad_norm": 7.78125, "grad_norm_var": 0.20940348307291667, "learning_rate": 0.0003, "loss": 14.5173, "loss/aux_loss": 0.048164534568786624, "loss/crossentropy": 2.8747935056686402, "loss/logits": 1.09987430870533, "step": 4740 }, { "epoch": 0.0475, "grad_norm": 8.0, "grad_norm_var": 0.23528645833333334, "learning_rate": 0.0003, "loss": 14.49, "loss/aux_loss": 0.048151783645153046, "loss/crossentropy": 2.8358932733535767, "loss/logits": 1.0616690814495087, "step": 4750 }, { "epoch": 0.0476, "grad_norm": 7.625, "grad_norm_var": 0.36178385416666664, "learning_rate": 0.0003, "loss": 14.4691, "loss/aux_loss": 0.0481619393453002, "loss/crossentropy": 3.1596203804016114, "loss/logits": 1.210114187002182, "step": 4760 }, { "epoch": 0.0477, "grad_norm": 8.0, "grad_norm_var": 0.15462239583333334, "learning_rate": 0.0003, "loss": 14.5602, "loss/aux_loss": 0.048146472126245496, "loss/crossentropy": 3.001967716217041, "loss/logits": 1.096169427037239, "step": 4770 }, { "epoch": 0.0478, "grad_norm": 7.6875, "grad_norm_var": 0.18140869140625, "learning_rate": 0.0003, "loss": 14.4163, "loss/aux_loss": 0.04816320165991783, "loss/crossentropy": 2.876752531528473, "loss/logits": 1.1474198400974274, "step": 4780 }, { "epoch": 0.0479, "grad_norm": 9.0625, "grad_norm_var": 0.24347330729166666, "learning_rate": 0.0003, "loss": 14.2293, "loss/aux_loss": 0.0481548685580492, "loss/crossentropy": 2.9930427193641664, "loss/logits": 1.1371810525655746, "step": 4790 }, { "epoch": 0.048, "grad_norm": 8.6875, "grad_norm_var": 0.22493082682291668, "learning_rate": 0.0003, "loss": 14.6348, "loss/aux_loss": 0.048159336857497695, "loss/crossentropy": 3.0536611795425417, "loss/logits": 1.1425227701663971, "step": 4800 }, { "epoch": 0.0481, "grad_norm": 8.1875, "grad_norm_var": 0.19347330729166667, "learning_rate": 0.0003, "loss": 14.3975, "loss/aux_loss": 0.04815634544938803, "loss/crossentropy": 2.9617689490318297, "loss/logits": 1.1070881575345992, "step": 4810 }, { "epoch": 0.0482, "grad_norm": 7.6875, "grad_norm_var": 0.27467447916666665, "learning_rate": 0.0003, "loss": 14.404, "loss/aux_loss": 0.04816010873764753, "loss/crossentropy": 2.945191979408264, "loss/logits": 1.1223448246717453, "step": 4820 }, { "epoch": 0.0483, "grad_norm": 8.1875, "grad_norm_var": 0.27291666666666664, "learning_rate": 0.0003, "loss": 14.4795, "loss/aux_loss": 0.048158070631325246, "loss/crossentropy": 2.93973708152771, "loss/logits": 1.1405175089836121, "step": 4830 }, { "epoch": 0.0484, "grad_norm": 7.8125, "grad_norm_var": 0.269384765625, "learning_rate": 0.0003, "loss": 14.6711, "loss/aux_loss": 0.048154968209564684, "loss/crossentropy": 3.004188358783722, "loss/logits": 1.1400604486465453, "step": 4840 }, { "epoch": 0.0485, "grad_norm": 9.375, "grad_norm_var": 1.53570556640625, "learning_rate": 0.0003, "loss": 14.5167, "loss/aux_loss": 0.04815916530787945, "loss/crossentropy": 2.9153899431228636, "loss/logits": 1.1129061222076415, "step": 4850 }, { "epoch": 0.0486, "grad_norm": 7.65625, "grad_norm_var": 1.5292805989583333, "learning_rate": 0.0003, "loss": 14.3376, "loss/aux_loss": 0.04816344752907753, "loss/crossentropy": 3.0024606227874755, "loss/logits": 1.1566831320524216, "step": 4860 }, { "epoch": 0.0487, "grad_norm": 8.0, "grad_norm_var": 0.24295247395833333, "learning_rate": 0.0003, "loss": 14.1824, "loss/aux_loss": 0.0481420386582613, "loss/crossentropy": 2.8804391503334044, "loss/logits": 1.098368188738823, "step": 4870 }, { "epoch": 0.0488, "grad_norm": 7.78125, "grad_norm_var": 0.20292561848958332, "learning_rate": 0.0003, "loss": 14.4489, "loss/aux_loss": 0.04815696161240339, "loss/crossentropy": 2.9788331627845763, "loss/logits": 1.1000428795814514, "step": 4880 }, { "epoch": 0.0489, "grad_norm": 7.71875, "grad_norm_var": 0.38917643229166665, "learning_rate": 0.0003, "loss": 14.6589, "loss/aux_loss": 0.04814700428396464, "loss/crossentropy": 3.02801970243454, "loss/logits": 1.1341104060411453, "step": 4890 }, { "epoch": 0.049, "grad_norm": 8.375, "grad_norm_var": 0.40823160807291664, "learning_rate": 0.0003, "loss": 14.4838, "loss/aux_loss": 0.048148921132087706, "loss/crossentropy": 3.061317926645279, "loss/logits": 1.1178199291229247, "step": 4900 }, { "epoch": 0.0491, "grad_norm": 8.125, "grad_norm_var": 0.31573893229166666, "learning_rate": 0.0003, "loss": 14.3403, "loss/aux_loss": 0.04815245717763901, "loss/crossentropy": 3.0220317125320433, "loss/logits": 1.0949908673763276, "step": 4910 }, { "epoch": 0.0492, "grad_norm": 8.4375, "grad_norm_var": 0.13892822265625, "learning_rate": 0.0003, "loss": 14.4711, "loss/aux_loss": 0.04815434459596872, "loss/crossentropy": 3.0331790328025816, "loss/logits": 1.0993872165679932, "step": 4920 }, { "epoch": 0.0493, "grad_norm": 7.65625, "grad_norm_var": 0.26901041666666664, "learning_rate": 0.0003, "loss": 14.2343, "loss/aux_loss": 0.048155249655246736, "loss/crossentropy": 2.97544447183609, "loss/logits": 1.1062311738729478, "step": 4930 }, { "epoch": 0.0494, "grad_norm": 14.875, "grad_norm_var": 3.21763916015625, "learning_rate": 0.0003, "loss": 14.4118, "loss/aux_loss": 0.048153795301914215, "loss/crossentropy": 3.121039032936096, "loss/logits": 1.1289394974708558, "step": 4940 }, { "epoch": 0.0495, "grad_norm": 7.71875, "grad_norm_var": 3.0484212239583335, "learning_rate": 0.0003, "loss": 14.3417, "loss/aux_loss": 0.04815581478178501, "loss/crossentropy": 3.015563631057739, "loss/logits": 1.1119945228099823, "step": 4950 }, { "epoch": 0.0496, "grad_norm": 13.0625, "grad_norm_var": 3.7396484375, "learning_rate": 0.0003, "loss": 14.3192, "loss/aux_loss": 0.048152280040085316, "loss/crossentropy": 2.8887117922306063, "loss/logits": 1.096293193101883, "step": 4960 }, { "epoch": 0.0497, "grad_norm": 8.0, "grad_norm_var": 3.763016764322917, "learning_rate": 0.0003, "loss": 14.274, "loss/aux_loss": 0.04814924951642752, "loss/crossentropy": 3.0876585960388185, "loss/logits": 1.1102905184030534, "step": 4970 }, { "epoch": 0.0498, "grad_norm": 7.5, "grad_norm_var": 0.10857747395833334, "learning_rate": 0.0003, "loss": 14.2513, "loss/aux_loss": 0.0481419550254941, "loss/crossentropy": 3.1604169964790345, "loss/logits": 1.135578241944313, "step": 4980 }, { "epoch": 0.0499, "grad_norm": 7.84375, "grad_norm_var": 0.10129801432291667, "learning_rate": 0.0003, "loss": 14.1859, "loss/aux_loss": 0.0481536041945219, "loss/crossentropy": 2.9750654339790343, "loss/logits": 1.1200665444135667, "step": 4990 }, { "epoch": 0.05, "grad_norm": 7.875, "grad_norm_var": 0.1974609375, "learning_rate": 0.0003, "loss": 14.2076, "loss/aux_loss": 0.048159463331103325, "loss/crossentropy": 2.9379626870155335, "loss/logits": 1.0767972767353058, "step": 5000 }, { "epoch": 0.0501, "grad_norm": 8.0625, "grad_norm_var": 0.13033854166666667, "learning_rate": 0.0003, "loss": 14.3938, "loss/aux_loss": 0.04816003683954477, "loss/crossentropy": 2.8308571100234987, "loss/logits": 1.1243964433670044, "step": 5010 }, { "epoch": 0.0502, "grad_norm": 8.375, "grad_norm_var": 0.18127848307291666, "learning_rate": 0.0003, "loss": 14.3241, "loss/aux_loss": 0.048151292651891706, "loss/crossentropy": 2.9098775744438172, "loss/logits": 1.1258880913257598, "step": 5020 }, { "epoch": 0.0503, "grad_norm": 7.59375, "grad_norm_var": 0.74088134765625, "learning_rate": 0.0003, "loss": 14.5341, "loss/aux_loss": 0.048141808994114396, "loss/crossentropy": 3.010620355606079, "loss/logits": 1.1356734812259675, "step": 5030 }, { "epoch": 0.0504, "grad_norm": 8.5, "grad_norm_var": 0.177587890625, "learning_rate": 0.0003, "loss": 14.4808, "loss/aux_loss": 0.048152133263647555, "loss/crossentropy": 2.956807887554169, "loss/logits": 1.0851380228996277, "step": 5040 }, { "epoch": 0.0505, "grad_norm": 8.375, "grad_norm_var": 0.22001546223958332, "learning_rate": 0.0003, "loss": 14.4873, "loss/aux_loss": 0.04815137479454279, "loss/crossentropy": 3.0215251445770264, "loss/logits": 1.166229221224785, "step": 5050 }, { "epoch": 0.0506, "grad_norm": 7.53125, "grad_norm_var": 0.19120686848958332, "learning_rate": 0.0003, "loss": 14.2421, "loss/aux_loss": 0.04814422242343426, "loss/crossentropy": 3.028605377674103, "loss/logits": 1.1260013222694396, "step": 5060 }, { "epoch": 0.0507, "grad_norm": 7.4375, "grad_norm_var": 0.13407796223958332, "learning_rate": 0.0003, "loss": 14.4855, "loss/aux_loss": 0.04815457910299301, "loss/crossentropy": 3.0295214653015137, "loss/logits": 1.1266607105731965, "step": 5070 }, { "epoch": 0.0508, "grad_norm": 8.125, "grad_norm_var": 16.2919921875, "learning_rate": 0.0003, "loss": 14.278, "loss/aux_loss": 0.048153937235474585, "loss/crossentropy": 3.0743547797203066, "loss/logits": 1.1398055493831634, "step": 5080 }, { "epoch": 0.0509, "grad_norm": 8.3125, "grad_norm_var": 0.3312459309895833, "learning_rate": 0.0003, "loss": 14.3122, "loss/aux_loss": 0.048145625926554206, "loss/crossentropy": 2.9891109347343443, "loss/logits": 1.144765716791153, "step": 5090 }, { "epoch": 0.051, "grad_norm": 8.375, "grad_norm_var": 0.25514322916666665, "learning_rate": 0.0003, "loss": 14.0619, "loss/aux_loss": 0.048147767595946786, "loss/crossentropy": 2.923846483230591, "loss/logits": 1.104039838910103, "step": 5100 }, { "epoch": 0.0511, "grad_norm": 8.4375, "grad_norm_var": 0.18834228515625, "learning_rate": 0.0003, "loss": 14.3523, "loss/aux_loss": 0.04815386533737183, "loss/crossentropy": 2.9420456171035765, "loss/logits": 1.110900694131851, "step": 5110 }, { "epoch": 0.0512, "grad_norm": 8.4375, "grad_norm_var": 0.348291015625, "learning_rate": 0.0003, "loss": 14.21, "loss/aux_loss": 0.04815777577459812, "loss/crossentropy": 2.9808182954788207, "loss/logits": 1.1418810188770294, "step": 5120 }, { "epoch": 0.0513, "grad_norm": 8.1875, "grad_norm_var": 0.24459635416666667, "learning_rate": 0.0003, "loss": 14.1038, "loss/aux_loss": 0.048149819299578664, "loss/crossentropy": 2.9293219327926634, "loss/logits": 1.151758760213852, "step": 5130 }, { "epoch": 0.0514, "grad_norm": 7.78125, "grad_norm_var": 0.5398274739583333, "learning_rate": 0.0003, "loss": 14.0253, "loss/aux_loss": 0.04813775867223739, "loss/crossentropy": 3.086165702342987, "loss/logits": 1.1042977631092072, "step": 5140 }, { "epoch": 0.0515, "grad_norm": 7.96875, "grad_norm_var": 0.27209879557291666, "learning_rate": 0.0003, "loss": 14.2102, "loss/aux_loss": 0.04814709778875113, "loss/crossentropy": 3.16923828125, "loss/logits": 1.1310043185949326, "step": 5150 }, { "epoch": 0.0516, "grad_norm": 7.53125, "grad_norm_var": 0.10705973307291666, "learning_rate": 0.0003, "loss": 14.2875, "loss/aux_loss": 0.04815742298960686, "loss/crossentropy": 2.9624265909194945, "loss/logits": 1.1060597985982894, "step": 5160 }, { "epoch": 0.0517, "grad_norm": 8.375, "grad_norm_var": 0.072509765625, "learning_rate": 0.0003, "loss": 14.4538, "loss/aux_loss": 0.04815982095897198, "loss/crossentropy": 2.942893236875534, "loss/logits": 1.113595375418663, "step": 5170 }, { "epoch": 0.0518, "grad_norm": 8.125, "grad_norm_var": 0.13105061848958333, "learning_rate": 0.0003, "loss": 14.0981, "loss/aux_loss": 0.04815513715147972, "loss/crossentropy": 3.05008624792099, "loss/logits": 1.1298416316509248, "step": 5180 }, { "epoch": 0.0519, "grad_norm": 7.59375, "grad_norm_var": 0.17190348307291667, "learning_rate": 0.0003, "loss": 14.2019, "loss/aux_loss": 0.04814068842679262, "loss/crossentropy": 2.9983504891395567, "loss/logits": 1.1261755168437957, "step": 5190 }, { "epoch": 0.052, "grad_norm": 8.25, "grad_norm_var": 0.10623372395833333, "learning_rate": 0.0003, "loss": 14.0924, "loss/aux_loss": 0.04815409407019615, "loss/crossentropy": 2.815429699420929, "loss/logits": 1.1029013335704803, "step": 5200 }, { "epoch": 0.0521, "grad_norm": 7.96875, "grad_norm_var": 0.075634765625, "learning_rate": 0.0003, "loss": 14.0729, "loss/aux_loss": 0.04814719296991825, "loss/crossentropy": 3.0798101305961607, "loss/logits": 1.1071963399648665, "step": 5210 }, { "epoch": 0.0522, "grad_norm": 7.5, "grad_norm_var": 0.10396728515625, "learning_rate": 0.0003, "loss": 14.2266, "loss/aux_loss": 0.04813891816884279, "loss/crossentropy": 3.0311917304992675, "loss/logits": 1.1385094463825225, "step": 5220 }, { "epoch": 0.0523, "grad_norm": 7.59375, "grad_norm_var": 0.09295247395833334, "learning_rate": 0.0003, "loss": 14.0168, "loss/aux_loss": 0.04814713895320892, "loss/crossentropy": 2.8070708096027372, "loss/logits": 1.037602314352989, "step": 5230 }, { "epoch": 0.0524, "grad_norm": 8.5625, "grad_norm_var": 0.23748372395833334, "learning_rate": 0.0003, "loss": 14.206, "loss/aux_loss": 0.048150969482958314, "loss/crossentropy": 2.9220390915870667, "loss/logits": 1.1021725416183472, "step": 5240 }, { "epoch": 0.0525, "grad_norm": 7.96875, "grad_norm_var": 0.3568644205729167, "learning_rate": 0.0003, "loss": 14.2243, "loss/aux_loss": 0.048154151812195775, "loss/crossentropy": 3.0725671291351317, "loss/logits": 1.1407492518424989, "step": 5250 }, { "epoch": 0.0526, "grad_norm": 8.25, "grad_norm_var": 1.03599853515625, "learning_rate": 0.0003, "loss": 14.2113, "loss/aux_loss": 0.04815595541149378, "loss/crossentropy": 3.073868250846863, "loss/logits": 1.086431348323822, "step": 5260 }, { "epoch": 0.0527, "grad_norm": 8.6875, "grad_norm_var": 0.8241495768229167, "learning_rate": 0.0003, "loss": 14.2052, "loss/aux_loss": 0.04815474133938551, "loss/crossentropy": 2.942746305465698, "loss/logits": 1.085268846154213, "step": 5270 }, { "epoch": 0.0528, "grad_norm": 8.125, "grad_norm_var": 0.457421875, "learning_rate": 0.0003, "loss": 14.1014, "loss/aux_loss": 0.0481459453701973, "loss/crossentropy": 2.9510623097419737, "loss/logits": 1.086976206302643, "step": 5280 }, { "epoch": 0.0529, "grad_norm": 9.8125, "grad_norm_var": 0.6368448893229167, "learning_rate": 0.0003, "loss": 14.0806, "loss/aux_loss": 0.04814429916441441, "loss/crossentropy": 2.91622234582901, "loss/logits": 1.1365332275629043, "step": 5290 }, { "epoch": 0.053, "grad_norm": 8.125, "grad_norm_var": 0.32281494140625, "learning_rate": 0.0003, "loss": 14.1434, "loss/aux_loss": 0.04813830778002739, "loss/crossentropy": 2.9429489850997923, "loss/logits": 1.1115789502859115, "step": 5300 }, { "epoch": 0.0531, "grad_norm": 7.46875, "grad_norm_var": 0.2721638997395833, "learning_rate": 0.0003, "loss": 14.1278, "loss/aux_loss": 0.04815119802951813, "loss/crossentropy": 3.0424102902412415, "loss/logits": 1.137840673327446, "step": 5310 }, { "epoch": 0.0532, "grad_norm": 8.1875, "grad_norm_var": 0.20178629557291666, "learning_rate": 0.0003, "loss": 14.2472, "loss/aux_loss": 0.048156299628317356, "loss/crossentropy": 2.9693280339241026, "loss/logits": 1.1287171095609665, "step": 5320 }, { "epoch": 0.0533, "grad_norm": 8.25, "grad_norm_var": 0.07245686848958334, "learning_rate": 0.0003, "loss": 14.3863, "loss/aux_loss": 0.04813747089356184, "loss/crossentropy": 3.1067948579788207, "loss/logits": 1.1718181252479554, "step": 5330 }, { "epoch": 0.0534, "grad_norm": 10.875, "grad_norm_var": 0.6001139322916667, "learning_rate": 0.0003, "loss": 14.1949, "loss/aux_loss": 0.0481356767937541, "loss/crossentropy": 2.9768314242362974, "loss/logits": 1.1081100910902024, "step": 5340 }, { "epoch": 0.0535, "grad_norm": 8.0625, "grad_norm_var": 0.6735677083333333, "learning_rate": 0.0003, "loss": 14.1155, "loss/aux_loss": 0.04814883153885603, "loss/crossentropy": 3.0978642463684083, "loss/logits": 1.112101286649704, "step": 5350 }, { "epoch": 0.0536, "grad_norm": 8.375, "grad_norm_var": 0.17733968098958333, "learning_rate": 0.0003, "loss": 14.2689, "loss/aux_loss": 0.048155237548053266, "loss/crossentropy": 2.9267095983028413, "loss/logits": 1.1321902126073837, "step": 5360 }, { "epoch": 0.0537, "grad_norm": 8.1875, "grad_norm_var": 0.123828125, "learning_rate": 0.0003, "loss": 14.0068, "loss/aux_loss": 0.048150830715894696, "loss/crossentropy": 3.0328433394432066, "loss/logits": 1.0583814442157746, "step": 5370 }, { "epoch": 0.0538, "grad_norm": 8.4375, "grad_norm_var": 0.160791015625, "learning_rate": 0.0003, "loss": 14.2637, "loss/aux_loss": 0.04814900886267424, "loss/crossentropy": 2.8612841725349427, "loss/logits": 1.0983431458473205, "step": 5380 }, { "epoch": 0.0539, "grad_norm": 7.75, "grad_norm_var": 0.16663004557291666, "learning_rate": 0.0003, "loss": 14.0972, "loss/aux_loss": 0.04815038740634918, "loss/crossentropy": 2.872392749786377, "loss/logits": 1.062236163020134, "step": 5390 }, { "epoch": 0.054, "grad_norm": 8.5, "grad_norm_var": 0.20662434895833334, "learning_rate": 0.0003, "loss": 14.0276, "loss/aux_loss": 0.048151925951242444, "loss/crossentropy": 2.777138501405716, "loss/logits": 1.043939945101738, "step": 5400 }, { "epoch": 0.0541, "grad_norm": 9.125, "grad_norm_var": 0.20220947265625, "learning_rate": 0.0003, "loss": 14.1898, "loss/aux_loss": 0.04814861789345741, "loss/crossentropy": 2.9948280215263368, "loss/logits": 1.0816247820854188, "step": 5410 }, { "epoch": 0.0542, "grad_norm": 8.125, "grad_norm_var": 0.16243082682291668, "learning_rate": 0.0003, "loss": 14.1748, "loss/aux_loss": 0.04814356118440628, "loss/crossentropy": 2.984057831764221, "loss/logits": 1.1022383213043212, "step": 5420 }, { "epoch": 0.0543, "grad_norm": 8.375, "grad_norm_var": 0.21638997395833334, "learning_rate": 0.0003, "loss": 14.1883, "loss/aux_loss": 0.04814845807850361, "loss/crossentropy": 2.9097337126731873, "loss/logits": 1.0658887088298798, "step": 5430 }, { "epoch": 0.0544, "grad_norm": 46.75, "grad_norm_var": 91.66760660807292, "learning_rate": 0.0003, "loss": 14.2163, "loss/aux_loss": 0.04814211465418339, "loss/crossentropy": 3.150989270210266, "loss/logits": 1.1435310065746307, "step": 5440 }, { "epoch": 0.0545, "grad_norm": 7.90625, "grad_norm_var": 91.24908854166667, "learning_rate": 0.0003, "loss": 14.2637, "loss/aux_loss": 0.04814708679914474, "loss/crossentropy": 3.065591824054718, "loss/logits": 1.1185233294963837, "step": 5450 }, { "epoch": 0.0546, "grad_norm": 8.75, "grad_norm_var": 0.31951497395833334, "learning_rate": 0.0003, "loss": 14.3104, "loss/aux_loss": 0.04814552329480648, "loss/crossentropy": 3.0562121748924254, "loss/logits": 1.1377945810556411, "step": 5460 }, { "epoch": 0.0547, "grad_norm": 7.78125, "grad_norm_var": 0.33352457682291664, "learning_rate": 0.0003, "loss": 14.0194, "loss/aux_loss": 0.048139683343470095, "loss/crossentropy": 3.186306917667389, "loss/logits": 1.0923507630825042, "step": 5470 }, { "epoch": 0.0548, "grad_norm": 8.5, "grad_norm_var": 0.11419270833333334, "learning_rate": 0.0003, "loss": 14.0885, "loss/aux_loss": 0.0481383940204978, "loss/crossentropy": 3.0000529527664184, "loss/logits": 1.0960578143596649, "step": 5480 }, { "epoch": 0.0549, "grad_norm": 8.4375, "grad_norm_var": 0.14837239583333334, "learning_rate": 0.0003, "loss": 14.1311, "loss/aux_loss": 0.048138375580310824, "loss/crossentropy": 3.0034351110458375, "loss/logits": 1.079491952061653, "step": 5490 }, { "epoch": 0.055, "grad_norm": 8.3125, "grad_norm_var": 0.12092692057291667, "learning_rate": 0.0003, "loss": 14.1602, "loss/aux_loss": 0.04813902676105499, "loss/crossentropy": 3.0370962262153625, "loss/logits": 1.071971568465233, "step": 5500 }, { "epoch": 0.0551, "grad_norm": 9.125, "grad_norm_var": 0.159619140625, "learning_rate": 0.0003, "loss": 14.1168, "loss/aux_loss": 0.04815224166959524, "loss/crossentropy": 2.94165198802948, "loss/logits": 1.09517442882061, "step": 5510 }, { "epoch": 0.0552, "grad_norm": 8.75, "grad_norm_var": 0.38424479166666664, "learning_rate": 0.0003, "loss": 14.1283, "loss/aux_loss": 0.048148746229708196, "loss/crossentropy": 2.889024776220322, "loss/logits": 1.0823973000049592, "step": 5520 }, { "epoch": 0.0553, "grad_norm": 7.90625, "grad_norm_var": 0.37433268229166666, "learning_rate": 0.0003, "loss": 14.2453, "loss/aux_loss": 0.04814521931111813, "loss/crossentropy": 2.9829455733299257, "loss/logits": 1.1254934877157212, "step": 5530 }, { "epoch": 0.0554, "grad_norm": 8.1875, "grad_norm_var": 0.23541259765625, "learning_rate": 0.0003, "loss": 14.1698, "loss/aux_loss": 0.04814098011702299, "loss/crossentropy": 2.950876700878143, "loss/logits": 1.1378295987844467, "step": 5540 }, { "epoch": 0.0555, "grad_norm": 7.90625, "grad_norm_var": 0.186572265625, "learning_rate": 0.0003, "loss": 14.1457, "loss/aux_loss": 0.04814466387033463, "loss/crossentropy": 2.9882196366786955, "loss/logits": 1.0835947006940843, "step": 5550 }, { "epoch": 0.0556, "grad_norm": 8.125, "grad_norm_var": 0.25572509765625, "learning_rate": 0.0003, "loss": 14.0392, "loss/aux_loss": 0.04815062917768955, "loss/crossentropy": 2.829884684085846, "loss/logits": 1.0776374101638795, "step": 5560 }, { "epoch": 0.0557, "grad_norm": 20.25, "grad_norm_var": 8.858072916666666, "learning_rate": 0.0003, "loss": 14.0187, "loss/aux_loss": 0.04814136177301407, "loss/crossentropy": 3.0365766048431397, "loss/logits": 1.1214863985776902, "step": 5570 }, { "epoch": 0.0558, "grad_norm": 8.875, "grad_norm_var": 71.89733072916667, "learning_rate": 0.0003, "loss": 14.0507, "loss/aux_loss": 0.048154527135193345, "loss/crossentropy": 2.925475996732712, "loss/logits": 1.0984899312257768, "step": 5580 }, { "epoch": 0.0559, "grad_norm": 9.1875, "grad_norm_var": 68.73677978515624, "learning_rate": 0.0003, "loss": 14.0769, "loss/aux_loss": 0.048137818835675715, "loss/crossentropy": 3.0427648425102234, "loss/logits": 1.1238386183977127, "step": 5590 }, { "epoch": 0.056, "grad_norm": 8.625, "grad_norm_var": 0.35065104166666666, "learning_rate": 0.0003, "loss": 14.018, "loss/aux_loss": 0.04814153481274843, "loss/crossentropy": 2.9993926525115966, "loss/logits": 1.0859254390001296, "step": 5600 }, { "epoch": 0.0561, "grad_norm": 7.84375, "grad_norm_var": 0.24605712890625, "learning_rate": 0.0003, "loss": 14.0132, "loss/aux_loss": 0.04814342502504587, "loss/crossentropy": 3.1588930010795595, "loss/logits": 1.1444143801927567, "step": 5610 }, { "epoch": 0.0562, "grad_norm": 8.3125, "grad_norm_var": 0.13964436848958334, "learning_rate": 0.0003, "loss": 13.9644, "loss/aux_loss": 0.04813936911523342, "loss/crossentropy": 2.9891305387020113, "loss/logits": 1.08986476957798, "step": 5620 }, { "epoch": 0.0563, "grad_norm": 7.71875, "grad_norm_var": 0.15133056640625, "learning_rate": 0.0003, "loss": 14.0305, "loss/aux_loss": 0.04814296532422304, "loss/crossentropy": 3.094134247303009, "loss/logits": 1.1330428838729858, "step": 5630 }, { "epoch": 0.0564, "grad_norm": 8.0, "grad_norm_var": 0.22073160807291667, "learning_rate": 0.0003, "loss": 14.0265, "loss/aux_loss": 0.04816127121448517, "loss/crossentropy": 2.90863493680954, "loss/logits": 1.0908836662769317, "step": 5640 }, { "epoch": 0.0565, "grad_norm": 8.75, "grad_norm_var": 0.20128580729166667, "learning_rate": 0.0003, "loss": 14.1691, "loss/aux_loss": 0.048136289790272714, "loss/crossentropy": 3.045944094657898, "loss/logits": 1.1683479130268097, "step": 5650 }, { "epoch": 0.0566, "grad_norm": 7.59375, "grad_norm_var": 0.221484375, "learning_rate": 0.0003, "loss": 13.8956, "loss/aux_loss": 0.0481420211493969, "loss/crossentropy": 2.9611165285110475, "loss/logits": 1.1002487033605575, "step": 5660 }, { "epoch": 0.0567, "grad_norm": 7.78125, "grad_norm_var": 0.1322265625, "learning_rate": 0.0003, "loss": 13.9722, "loss/aux_loss": 0.04813913106918335, "loss/crossentropy": 2.832181286811829, "loss/logits": 1.085351037979126, "step": 5670 }, { "epoch": 0.0568, "grad_norm": 8.4375, "grad_norm_var": 0.09976806640625, "learning_rate": 0.0003, "loss": 13.9409, "loss/aux_loss": 0.04813998658210039, "loss/crossentropy": 3.0723737359046934, "loss/logits": 1.1221662908792496, "step": 5680 }, { "epoch": 0.0569, "grad_norm": 8.75, "grad_norm_var": 0.18485921223958332, "learning_rate": 0.0003, "loss": 13.9103, "loss/aux_loss": 0.048149769008159635, "loss/crossentropy": 2.8909295797348022, "loss/logits": 1.0669385582208633, "step": 5690 }, { "epoch": 0.057, "grad_norm": 8.1875, "grad_norm_var": 0.164306640625, "learning_rate": 0.0003, "loss": 14.2382, "loss/aux_loss": 0.04814224392175674, "loss/crossentropy": 3.028742825984955, "loss/logits": 1.1198367089033128, "step": 5700 }, { "epoch": 0.0571, "grad_norm": 7.9375, "grad_norm_var": 0.16002197265625, "learning_rate": 0.0003, "loss": 13.9939, "loss/aux_loss": 0.04813466928899288, "loss/crossentropy": 3.091606914997101, "loss/logits": 1.111482274532318, "step": 5710 }, { "epoch": 0.0572, "grad_norm": 8.1875, "grad_norm_var": 0.13919270833333333, "learning_rate": 0.0003, "loss": 13.8898, "loss/aux_loss": 0.04814035035669804, "loss/crossentropy": 2.9719881653785705, "loss/logits": 1.1058259099721908, "step": 5720 }, { "epoch": 0.0573, "grad_norm": 8.0625, "grad_norm_var": 20.40621337890625, "learning_rate": 0.0003, "loss": 14.125, "loss/aux_loss": 0.048147077485918996, "loss/crossentropy": 2.940553843975067, "loss/logits": 1.0541133284568787, "step": 5730 }, { "epoch": 0.0574, "grad_norm": 9.125, "grad_norm_var": 0.126416015625, "learning_rate": 0.0003, "loss": 14.0709, "loss/aux_loss": 0.04814092367887497, "loss/crossentropy": 2.9801509261131285, "loss/logits": 1.1179528176784514, "step": 5740 }, { "epoch": 0.0575, "grad_norm": 8.0625, "grad_norm_var": 0.21633707682291667, "learning_rate": 0.0003, "loss": 13.8988, "loss/aux_loss": 0.048131171986460684, "loss/crossentropy": 3.1363558411598205, "loss/logits": 1.1174342811107636, "step": 5750 }, { "epoch": 0.0576, "grad_norm": 8.1875, "grad_norm_var": 0.42724202473958334, "learning_rate": 0.0003, "loss": 13.8693, "loss/aux_loss": 0.048153743520379065, "loss/crossentropy": 3.0531252682209016, "loss/logits": 1.0992789357900619, "step": 5760 }, { "epoch": 0.0577, "grad_norm": 7.96875, "grad_norm_var": 0.39302978515625, "learning_rate": 0.0003, "loss": 13.758, "loss/aux_loss": 0.048135829716920854, "loss/crossentropy": 3.001027262210846, "loss/logits": 1.0745349794626236, "step": 5770 }, { "epoch": 0.0578, "grad_norm": 8.6875, "grad_norm_var": 0.33866780598958335, "learning_rate": 0.0003, "loss": 13.7585, "loss/aux_loss": 0.04814055394381285, "loss/crossentropy": 2.9369577765464783, "loss/logits": 1.0799493759870529, "step": 5780 }, { "epoch": 0.0579, "grad_norm": 8.1875, "grad_norm_var": 0.6395833333333333, "learning_rate": 0.0003, "loss": 14.1834, "loss/aux_loss": 0.04815905783325434, "loss/crossentropy": 3.069815826416016, "loss/logits": 1.1343096286058425, "step": 5790 }, { "epoch": 0.058, "grad_norm": 8.1875, "grad_norm_var": 0.27545572916666666, "learning_rate": 0.0003, "loss": 13.9437, "loss/aux_loss": 0.04814049322158098, "loss/crossentropy": 2.8448895037174227, "loss/logits": 1.0605036556720733, "step": 5800 }, { "epoch": 0.0581, "grad_norm": 8.625, "grad_norm_var": 0.33121337890625, "learning_rate": 0.0003, "loss": 13.9217, "loss/aux_loss": 0.048141079396009444, "loss/crossentropy": 2.959049415588379, "loss/logits": 1.1173090249300004, "step": 5810 }, { "epoch": 0.0582, "grad_norm": 8.5625, "grad_norm_var": 0.582666015625, "learning_rate": 0.0003, "loss": 13.9177, "loss/aux_loss": 0.04814137741923332, "loss/crossentropy": 2.9781831741333007, "loss/logits": 1.088547134399414, "step": 5820 }, { "epoch": 0.0583, "grad_norm": 8.5, "grad_norm_var": 0.52457275390625, "learning_rate": 0.0003, "loss": 13.8936, "loss/aux_loss": 0.04813809935003519, "loss/crossentropy": 2.920023334026337, "loss/logits": 1.0897945940494538, "step": 5830 }, { "epoch": 0.0584, "grad_norm": 8.5, "grad_norm_var": 0.13635660807291666, "learning_rate": 0.0003, "loss": 13.929, "loss/aux_loss": 0.04814415480941534, "loss/crossentropy": 2.9571971893310547, "loss/logits": 1.0757667511701583, "step": 5840 }, { "epoch": 0.0585, "grad_norm": 8.125, "grad_norm_var": 0.13433837890625, "learning_rate": 0.0003, "loss": 13.9993, "loss/aux_loss": 0.04814188275486231, "loss/crossentropy": 2.9641053080558777, "loss/logits": 1.0629219651222228, "step": 5850 }, { "epoch": 0.0586, "grad_norm": 8.4375, "grad_norm_var": 0.4556640625, "learning_rate": 0.0003, "loss": 13.9047, "loss/aux_loss": 0.04814865179359913, "loss/crossentropy": 2.838666582107544, "loss/logits": 1.08486467897892, "step": 5860 }, { "epoch": 0.0587, "grad_norm": 8.25, "grad_norm_var": 27.4416015625, "learning_rate": 0.0003, "loss": 13.7725, "loss/aux_loss": 0.048147336766123774, "loss/crossentropy": 2.966917932033539, "loss/logits": 1.0681630432605744, "step": 5870 }, { "epoch": 0.0588, "grad_norm": 8.3125, "grad_norm_var": 18.5572265625, "learning_rate": 0.0003, "loss": 13.805, "loss/aux_loss": 0.04814778696745634, "loss/crossentropy": 2.870664370059967, "loss/logits": 1.0493683815002441, "step": 5880 }, { "epoch": 0.0589, "grad_norm": 8.5, "grad_norm_var": 0.5998331705729166, "learning_rate": 0.0003, "loss": 13.8099, "loss/aux_loss": 0.04814050365239382, "loss/crossentropy": 2.922038221359253, "loss/logits": 1.0779344737529755, "step": 5890 }, { "epoch": 0.059, "grad_norm": 9.375, "grad_norm_var": 0.202978515625, "learning_rate": 0.0003, "loss": 13.9627, "loss/aux_loss": 0.04813796691596508, "loss/crossentropy": 3.1271554470062255, "loss/logits": 1.1131070137023926, "step": 5900 }, { "epoch": 0.0591, "grad_norm": 8.625, "grad_norm_var": 0.20284830729166667, "learning_rate": 0.0003, "loss": 13.7172, "loss/aux_loss": 0.048142065107822415, "loss/crossentropy": 2.9341515243053435, "loss/logits": 1.0991775900125504, "step": 5910 }, { "epoch": 0.0592, "grad_norm": 8.375, "grad_norm_var": 0.32784830729166664, "learning_rate": 0.0003, "loss": 13.9532, "loss/aux_loss": 0.04815160110592842, "loss/crossentropy": 2.963713300228119, "loss/logits": 1.0947488635778426, "step": 5920 }, { "epoch": 0.0593, "grad_norm": 8.1875, "grad_norm_var": 0.3666015625, "learning_rate": 0.0003, "loss": 13.8793, "loss/aux_loss": 0.048143844306468966, "loss/crossentropy": 2.8118023216724395, "loss/logits": 1.0895264118909835, "step": 5930 }, { "epoch": 0.0594, "grad_norm": 8.375, "grad_norm_var": 0.08274739583333333, "learning_rate": 0.0003, "loss": 13.7795, "loss/aux_loss": 0.04813539497554302, "loss/crossentropy": 2.8907833218574526, "loss/logits": 1.0538031846284865, "step": 5940 }, { "epoch": 0.0595, "grad_norm": 8.5625, "grad_norm_var": 0.24547119140625, "learning_rate": 0.0003, "loss": 14.0992, "loss/aux_loss": 0.048147369921207425, "loss/crossentropy": 2.9670627653598785, "loss/logits": 1.1421405851840973, "step": 5950 }, { "epoch": 0.0596, "grad_norm": 9.375, "grad_norm_var": 0.67174072265625, "learning_rate": 0.0003, "loss": 13.8134, "loss/aux_loss": 0.04814773909747601, "loss/crossentropy": 2.9220254778862, "loss/logits": 1.0881559133529664, "step": 5960 }, { "epoch": 0.0597, "grad_norm": 8.8125, "grad_norm_var": 68.9384765625, "learning_rate": 0.0003, "loss": 14.0155, "loss/aux_loss": 0.048162427730858326, "loss/crossentropy": 2.977382260560989, "loss/logits": 1.0755089968442917, "step": 5970 }, { "epoch": 0.0598, "grad_norm": 8.0625, "grad_norm_var": 68.26399739583333, "learning_rate": 0.0003, "loss": 14.0255, "loss/aux_loss": 0.048142471350729465, "loss/crossentropy": 2.8615992307662963, "loss/logits": 1.0675591200590133, "step": 5980 }, { "epoch": 0.0599, "grad_norm": 8.6875, "grad_norm_var": 0.53717041015625, "learning_rate": 0.0003, "loss": 13.6566, "loss/aux_loss": 0.048138993233442305, "loss/crossentropy": 3.0715150594711305, "loss/logits": 1.0647211134433747, "step": 5990 }, { "epoch": 0.06, "grad_norm": 8.5, "grad_norm_var": 0.13583577473958333, "learning_rate": 0.0003, "loss": 13.8894, "loss/aux_loss": 0.04813880603760481, "loss/crossentropy": 2.99127779006958, "loss/logits": 1.0782989412546158, "step": 6000 }, { "epoch": 0.0601, "grad_norm": 8.375, "grad_norm_var": 0.13151041666666666, "learning_rate": 0.0003, "loss": 13.9352, "loss/aux_loss": 0.04813908338546753, "loss/crossentropy": 2.9843607366085054, "loss/logits": 1.0743216931819917, "step": 6010 }, { "epoch": 0.0602, "grad_norm": 8.375, "grad_norm_var": 1.2416015625, "learning_rate": 0.0003, "loss": 13.9324, "loss/aux_loss": 0.04814204126596451, "loss/crossentropy": 3.001486176252365, "loss/logits": 1.0827761620283127, "step": 6020 }, { "epoch": 0.0603, "grad_norm": 9.0625, "grad_norm_var": 0.3578125, "learning_rate": 0.0003, "loss": 13.5495, "loss/aux_loss": 0.048134736530482766, "loss/crossentropy": 2.8943534910678865, "loss/logits": 1.089774450659752, "step": 6030 }, { "epoch": 0.0604, "grad_norm": 8.25, "grad_norm_var": 0.21990559895833334, "learning_rate": 0.0003, "loss": 13.91, "loss/aux_loss": 0.0481480710208416, "loss/crossentropy": 3.125103998184204, "loss/logits": 1.1064673095941544, "step": 6040 }, { "epoch": 0.0605, "grad_norm": 9.5625, "grad_norm_var": 0.5637980143229167, "learning_rate": 0.0003, "loss": 13.7693, "loss/aux_loss": 0.04813214130699635, "loss/crossentropy": 2.9569589614868166, "loss/logits": 1.0811177968978882, "step": 6050 }, { "epoch": 0.0606, "grad_norm": 9.0, "grad_norm_var": 0.32428385416666666, "learning_rate": 0.0003, "loss": 13.737, "loss/aux_loss": 0.048133809491991995, "loss/crossentropy": 2.873425018787384, "loss/logits": 1.066912430524826, "step": 6060 }, { "epoch": 0.0607, "grad_norm": 8.8125, "grad_norm_var": 0.12480061848958333, "learning_rate": 0.0003, "loss": 13.7512, "loss/aux_loss": 0.048146062158048154, "loss/crossentropy": 2.874787837266922, "loss/logits": 0.99142906665802, "step": 6070 }, { "epoch": 0.0608, "grad_norm": 9.5625, "grad_norm_var": 0.5244140625, "learning_rate": 0.0003, "loss": 13.7192, "loss/aux_loss": 0.04813813380897045, "loss/crossentropy": 2.8974472165107725, "loss/logits": 1.0416524529457092, "step": 6080 }, { "epoch": 0.0609, "grad_norm": 11.25, "grad_norm_var": 0.8244140625, "learning_rate": 0.0003, "loss": 14.0021, "loss/aux_loss": 0.04813482966274023, "loss/crossentropy": 2.9082088649272917, "loss/logits": 1.1170342415571213, "step": 6090 }, { "epoch": 0.061, "grad_norm": 7.90625, "grad_norm_var": 0.6071248372395833, "learning_rate": 0.0003, "loss": 13.8868, "loss/aux_loss": 0.048123362846672534, "loss/crossentropy": 3.10910404920578, "loss/logits": 1.1065054565668107, "step": 6100 }, { "epoch": 0.0611, "grad_norm": 9.0625, "grad_norm_var": 0.14185791015625, "learning_rate": 0.0003, "loss": 13.9858, "loss/aux_loss": 0.04813080281019211, "loss/crossentropy": 2.9956843733787535, "loss/logits": 1.0740672290325164, "step": 6110 }, { "epoch": 0.0612, "grad_norm": 8.875, "grad_norm_var": 0.11495768229166667, "learning_rate": 0.0003, "loss": 13.8486, "loss/aux_loss": 0.048135568387806416, "loss/crossentropy": 3.0476453006267548, "loss/logits": 1.074926945567131, "step": 6120 }, { "epoch": 0.0613, "grad_norm": 11.0, "grad_norm_var": 44.31295572916667, "learning_rate": 0.0003, "loss": 13.7639, "loss/aux_loss": 0.04814387541264296, "loss/crossentropy": 3.015053462982178, "loss/logits": 1.0650121331214906, "step": 6130 }, { "epoch": 0.0614, "grad_norm": 9.3125, "grad_norm_var": 18.512353515625, "learning_rate": 0.0003, "loss": 13.6984, "loss/aux_loss": 0.04813182633370161, "loss/crossentropy": 2.9327735245227813, "loss/logits": 1.062555307149887, "step": 6140 }, { "epoch": 0.0615, "grad_norm": 7.84375, "grad_norm_var": 0.5481404622395833, "learning_rate": 0.0003, "loss": 13.7344, "loss/aux_loss": 0.048124780878424644, "loss/crossentropy": 2.884230363368988, "loss/logits": 1.0680664718151092, "step": 6150 }, { "epoch": 0.0616, "grad_norm": 8.9375, "grad_norm_var": 0.13401285807291666, "learning_rate": 0.0003, "loss": 14.0286, "loss/aux_loss": 0.04813941400498152, "loss/crossentropy": 2.973123300075531, "loss/logits": 1.0950632393360138, "step": 6160 }, { "epoch": 0.0617, "grad_norm": 7.96875, "grad_norm_var": 0.15256754557291666, "learning_rate": 0.0003, "loss": 13.4949, "loss/aux_loss": 0.04813013020902872, "loss/crossentropy": 3.0717917561531065, "loss/logits": 1.096383735537529, "step": 6170 }, { "epoch": 0.0618, "grad_norm": 8.8125, "grad_norm_var": 0.3224894205729167, "learning_rate": 0.0003, "loss": 13.6958, "loss/aux_loss": 0.04813482668250799, "loss/crossentropy": 2.905612015724182, "loss/logits": 1.0801061391830444, "step": 6180 }, { "epoch": 0.0619, "grad_norm": 8.125, "grad_norm_var": 1.2854166666666667, "learning_rate": 0.0003, "loss": 13.8524, "loss/aux_loss": 0.04814470838755369, "loss/crossentropy": 3.090296733379364, "loss/logits": 1.1191903114318849, "step": 6190 }, { "epoch": 0.062, "grad_norm": 8.5625, "grad_norm_var": 0.2548014322916667, "learning_rate": 0.0003, "loss": 13.8103, "loss/aux_loss": 0.04813280999660492, "loss/crossentropy": 2.9565974533557893, "loss/logits": 1.0762405812740325, "step": 6200 }, { "epoch": 0.0621, "grad_norm": 8.8125, "grad_norm_var": 0.12838541666666667, "learning_rate": 0.0003, "loss": 13.9951, "loss/aux_loss": 0.04814301636070013, "loss/crossentropy": 3.0025951147079466, "loss/logits": 1.094373619556427, "step": 6210 }, { "epoch": 0.0622, "grad_norm": 8.8125, "grad_norm_var": 0.15935872395833334, "learning_rate": 0.0003, "loss": 13.7959, "loss/aux_loss": 0.04813591837882995, "loss/crossentropy": 2.890333390235901, "loss/logits": 1.1366484671831132, "step": 6220 }, { "epoch": 0.0623, "grad_norm": 8.0, "grad_norm_var": 0.483056640625, "learning_rate": 0.0003, "loss": 13.9138, "loss/aux_loss": 0.04814350325614214, "loss/crossentropy": 2.989057755470276, "loss/logits": 1.0995355397462845, "step": 6230 }, { "epoch": 0.0624, "grad_norm": 8.4375, "grad_norm_var": 0.23398030598958333, "learning_rate": 0.0003, "loss": 13.7247, "loss/aux_loss": 0.04812443405389786, "loss/crossentropy": 2.929747235774994, "loss/logits": 1.070769226551056, "step": 6240 }, { "epoch": 0.0625, "grad_norm": 8.9375, "grad_norm_var": 0.19888916015625, "learning_rate": 0.0003, "loss": 13.8281, "loss/aux_loss": 0.04813152626156807, "loss/crossentropy": 2.9640577673912047, "loss/logits": 1.0986740648746491, "step": 6250 }, { "epoch": 0.0626, "grad_norm": 8.8125, "grad_norm_var": 0.21834309895833334, "learning_rate": 0.0003, "loss": 13.693, "loss/aux_loss": 0.048149599321186544, "loss/crossentropy": 3.0804611802101136, "loss/logits": 1.0116279065608977, "step": 6260 }, { "epoch": 0.0627, "grad_norm": 18.0, "grad_norm_var": 5.355322265625, "learning_rate": 0.0003, "loss": 13.5479, "loss/aux_loss": 0.04814065471291542, "loss/crossentropy": 2.9952731311321257, "loss/logits": 1.1052643030881881, "step": 6270 }, { "epoch": 0.0628, "grad_norm": 9.1875, "grad_norm_var": 5.497359212239584, "learning_rate": 0.0003, "loss": 13.7811, "loss/aux_loss": 0.048146970197558404, "loss/crossentropy": 3.019810402393341, "loss/logits": 1.0878846973180771, "step": 6280 }, { "epoch": 0.0629, "grad_norm": 9.375, "grad_norm_var": 0.2244140625, "learning_rate": 0.0003, "loss": 13.7007, "loss/aux_loss": 0.04813486896455288, "loss/crossentropy": 3.0539894580841063, "loss/logits": 1.1094782143831252, "step": 6290 }, { "epoch": 0.063, "grad_norm": 8.75, "grad_norm_var": 0.15198160807291666, "learning_rate": 0.0003, "loss": 13.6997, "loss/aux_loss": 0.048145148530602457, "loss/crossentropy": 2.8999388575553895, "loss/logits": 1.0792778134346008, "step": 6300 }, { "epoch": 0.0631, "grad_norm": 8.5, "grad_norm_var": 111.25478108723958, "learning_rate": 0.0003, "loss": 13.7056, "loss/aux_loss": 0.048147874511778356, "loss/crossentropy": 2.960583436489105, "loss/logits": 1.0709624886512756, "step": 6310 }, { "epoch": 0.0632, "grad_norm": 9.125, "grad_norm_var": 109.709228515625, "learning_rate": 0.0003, "loss": 13.8739, "loss/aux_loss": 0.04814481791108847, "loss/crossentropy": 2.9161580562591554, "loss/logits": 1.0524902671575547, "step": 6320 }, { "epoch": 0.0633, "grad_norm": 8.8125, "grad_norm_var": 0.32526041666666666, "learning_rate": 0.0003, "loss": 13.9297, "loss/aux_loss": 0.048139688558876514, "loss/crossentropy": 3.0711183190345763, "loss/logits": 1.1227669954299926, "step": 6330 }, { "epoch": 0.0634, "grad_norm": 9.0625, "grad_norm_var": 0.3661295572916667, "learning_rate": 0.0003, "loss": 13.6939, "loss/aux_loss": 0.048136590234935286, "loss/crossentropy": 2.935191023349762, "loss/logits": 1.071747088432312, "step": 6340 }, { "epoch": 0.0635, "grad_norm": 8.9375, "grad_norm_var": 0.06131184895833333, "learning_rate": 0.0003, "loss": 13.8008, "loss/aux_loss": 0.04813410900533199, "loss/crossentropy": 3.015982925891876, "loss/logits": 1.0928217798471451, "step": 6350 }, { "epoch": 0.0636, "grad_norm": 8.3125, "grad_norm_var": 0.12537434895833333, "learning_rate": 0.0003, "loss": 13.7574, "loss/aux_loss": 0.0481348292902112, "loss/crossentropy": 2.796919822692871, "loss/logits": 1.064060640335083, "step": 6360 }, { "epoch": 0.0637, "grad_norm": 8.8125, "grad_norm_var": 0.23995768229166667, "learning_rate": 0.0003, "loss": 13.8537, "loss/aux_loss": 0.04814393315464258, "loss/crossentropy": 3.1008806109428404, "loss/logits": 1.0501255184412002, "step": 6370 }, { "epoch": 0.0638, "grad_norm": 8.625, "grad_norm_var": 0.116259765625, "learning_rate": 0.0003, "loss": 13.6849, "loss/aux_loss": 0.048127164505422114, "loss/crossentropy": 2.8864944219589233, "loss/logits": 1.0921163856983185, "step": 6380 }, { "epoch": 0.0639, "grad_norm": 8.625, "grad_norm_var": 76.34386393229167, "learning_rate": 0.0003, "loss": 13.6183, "loss/aux_loss": 0.048148133978247645, "loss/crossentropy": 2.8359330534934997, "loss/logits": 1.0765960454940795, "step": 6390 }, { "epoch": 0.064, "grad_norm": 8.3125, "grad_norm_var": 0.277587890625, "learning_rate": 0.0003, "loss": 13.6039, "loss/aux_loss": 0.04812461007386446, "loss/crossentropy": 2.999741852283478, "loss/logits": 1.0552677452564239, "step": 6400 }, { "epoch": 0.0641, "grad_norm": 8.8125, "grad_norm_var": 0.2833333333333333, "learning_rate": 0.0003, "loss": 13.7492, "loss/aux_loss": 0.04812895692884922, "loss/crossentropy": 3.226720857620239, "loss/logits": 1.0827998757362365, "step": 6410 }, { "epoch": 0.0642, "grad_norm": 8.6875, "grad_norm_var": 0.13553059895833333, "learning_rate": 0.0003, "loss": 13.7557, "loss/aux_loss": 0.04812961965799332, "loss/crossentropy": 2.979693388938904, "loss/logits": 1.0688404828310012, "step": 6420 }, { "epoch": 0.0643, "grad_norm": 8.5, "grad_norm_var": 0.28761393229166665, "learning_rate": 0.0003, "loss": 13.6565, "loss/aux_loss": 0.04814809542149305, "loss/crossentropy": 2.8194834649562837, "loss/logits": 1.0419757306575774, "step": 6430 }, { "epoch": 0.0644, "grad_norm": 8.5, "grad_norm_var": 0.25193684895833335, "learning_rate": 0.0003, "loss": 13.7414, "loss/aux_loss": 0.048131111077964306, "loss/crossentropy": 2.894507110118866, "loss/logits": 1.0287611424922942, "step": 6440 }, { "epoch": 0.0645, "grad_norm": 8.25, "grad_norm_var": 23.7390625, "learning_rate": 0.0003, "loss": 13.6572, "loss/aux_loss": 0.04814545251429081, "loss/crossentropy": 2.9162492036819456, "loss/logits": 1.0510101735591888, "step": 6450 }, { "epoch": 0.0646, "grad_norm": 8.0625, "grad_norm_var": 0.353369140625, "learning_rate": 0.0003, "loss": 13.6879, "loss/aux_loss": 0.04813457876443863, "loss/crossentropy": 3.00765939950943, "loss/logits": 1.0992391586303711, "step": 6460 }, { "epoch": 0.0647, "grad_norm": 9.0, "grad_norm_var": 0.2699055989583333, "learning_rate": 0.0003, "loss": 13.7296, "loss/aux_loss": 0.04814710468053818, "loss/crossentropy": 2.933130156993866, "loss/logits": 1.0464091002941132, "step": 6470 }, { "epoch": 0.0648, "grad_norm": 8.75, "grad_norm_var": 0.21276041666666667, "learning_rate": 0.0003, "loss": 13.5071, "loss/aux_loss": 0.0481443403288722, "loss/crossentropy": 2.893195056915283, "loss/logits": 1.0626126766204833, "step": 6480 }, { "epoch": 0.0649, "grad_norm": 9.875, "grad_norm_var": 0.32551676432291665, "learning_rate": 0.0003, "loss": 13.7133, "loss/aux_loss": 0.04813146814703941, "loss/crossentropy": 3.026914322376251, "loss/logits": 1.1285286754369737, "step": 6490 }, { "epoch": 0.065, "grad_norm": 8.8125, "grad_norm_var": 0.19479166666666667, "learning_rate": 0.0003, "loss": 13.7583, "loss/aux_loss": 0.048133007064461705, "loss/crossentropy": 3.017659032344818, "loss/logits": 1.064196562767029, "step": 6500 }, { "epoch": 0.0651, "grad_norm": 9.5625, "grad_norm_var": 0.2625, "learning_rate": 0.0003, "loss": 13.6455, "loss/aux_loss": 0.04813391268253327, "loss/crossentropy": 2.9791279196739198, "loss/logits": 1.068275386095047, "step": 6510 }, { "epoch": 0.0652, "grad_norm": 8.0, "grad_norm_var": 0.26712239583333336, "learning_rate": 0.0003, "loss": 13.7033, "loss/aux_loss": 0.04812156092375517, "loss/crossentropy": 2.9347316145896913, "loss/logits": 1.0693759769201279, "step": 6520 }, { "epoch": 0.0653, "grad_norm": 8.125, "grad_norm_var": 0.15130208333333334, "learning_rate": 0.0003, "loss": 13.4842, "loss/aux_loss": 0.04813071470707655, "loss/crossentropy": 3.036532533168793, "loss/logits": 1.07112657725811, "step": 6530 }, { "epoch": 0.0654, "grad_norm": 8.4375, "grad_norm_var": 0.08318684895833334, "learning_rate": 0.0003, "loss": 13.6518, "loss/aux_loss": 0.04812001138925552, "loss/crossentropy": 3.02432986497879, "loss/logits": 1.0842196673154831, "step": 6540 }, { "epoch": 0.0655, "grad_norm": 8.1875, "grad_norm_var": 0.14270833333333333, "learning_rate": 0.0003, "loss": 13.5997, "loss/aux_loss": 0.04814151749014854, "loss/crossentropy": 2.8338264346122743, "loss/logits": 1.0342780292034148, "step": 6550 }, { "epoch": 0.0656, "grad_norm": 8.3125, "grad_norm_var": 0.245166015625, "learning_rate": 0.0003, "loss": 13.6701, "loss/aux_loss": 0.04813636671751738, "loss/crossentropy": 2.8755680441856386, "loss/logits": 1.0737797766923904, "step": 6560 }, { "epoch": 0.0657, "grad_norm": 9.3125, "grad_norm_var": 5.57515869140625, "learning_rate": 0.0003, "loss": 13.7123, "loss/aux_loss": 0.048146852850914, "loss/crossentropy": 2.824841636419296, "loss/logits": 1.0282084316015243, "step": 6570 }, { "epoch": 0.0658, "grad_norm": 9.75, "grad_norm_var": 4.969559733072916, "learning_rate": 0.0003, "loss": 13.5714, "loss/aux_loss": 0.04814463872462511, "loss/crossentropy": 2.800820177793503, "loss/logits": 1.0070704787969589, "step": 6580 }, { "epoch": 0.0659, "grad_norm": 8.9375, "grad_norm_var": 0.8249837239583333, "learning_rate": 0.0003, "loss": 13.8031, "loss/aux_loss": 0.04813964460045099, "loss/crossentropy": 2.9267017126083372, "loss/logits": 1.0605516761541367, "step": 6590 }, { "epoch": 0.066, "grad_norm": 8.625, "grad_norm_var": 0.20974934895833333, "learning_rate": 0.0003, "loss": 13.6457, "loss/aux_loss": 0.04813900291919708, "loss/crossentropy": 2.8911925733089445, "loss/logits": 1.0161655098199844, "step": 6600 }, { "epoch": 0.0661, "grad_norm": 9.375, "grad_norm_var": 0.08899739583333334, "learning_rate": 0.0003, "loss": 13.6958, "loss/aux_loss": 0.048127881996333596, "loss/crossentropy": 3.0655489921569825, "loss/logits": 1.0680898874998093, "step": 6610 }, { "epoch": 0.0662, "grad_norm": 8.4375, "grad_norm_var": 0.26223958333333336, "learning_rate": 0.0003, "loss": 13.6129, "loss/aux_loss": 0.04813065193593502, "loss/crossentropy": 3.0288997888565063, "loss/logits": 1.0894548326730729, "step": 6620 }, { "epoch": 0.0663, "grad_norm": 8.8125, "grad_norm_var": 0.22980143229166666, "learning_rate": 0.0003, "loss": 13.5437, "loss/aux_loss": 0.04813928250223398, "loss/crossentropy": 2.8766731202602385, "loss/logits": 1.0580507218837738, "step": 6630 }, { "epoch": 0.0664, "grad_norm": 9.25, "grad_norm_var": 0.6728515625, "learning_rate": 0.0003, "loss": 13.5473, "loss/aux_loss": 0.04812793843448162, "loss/crossentropy": 3.049793744087219, "loss/logits": 1.0934020727872849, "step": 6640 }, { "epoch": 0.0665, "grad_norm": 8.9375, "grad_norm_var": 0.7403483072916667, "learning_rate": 0.0003, "loss": 13.6928, "loss/aux_loss": 0.04813400413841009, "loss/crossentropy": 3.0123080134391786, "loss/logits": 1.0466331481933593, "step": 6650 }, { "epoch": 0.0666, "grad_norm": 8.3125, "grad_norm_var": 0.11365559895833334, "learning_rate": 0.0003, "loss": 13.6429, "loss/aux_loss": 0.04813118148595095, "loss/crossentropy": 2.941323435306549, "loss/logits": 1.0748949706554414, "step": 6660 }, { "epoch": 0.0667, "grad_norm": 10.5, "grad_norm_var": 22.969010416666666, "learning_rate": 0.0003, "loss": 13.6803, "loss/aux_loss": 0.04812760762870312, "loss/crossentropy": 2.996757823228836, "loss/logits": 1.0776958972215653, "step": 6670 }, { "epoch": 0.0668, "grad_norm": 8.3125, "grad_norm_var": 22.934830729166666, "learning_rate": 0.0003, "loss": 13.4982, "loss/aux_loss": 0.04812924452126026, "loss/crossentropy": 3.0282610774040224, "loss/logits": 1.0759372055530547, "step": 6680 }, { "epoch": 0.0669, "grad_norm": 8.5625, "grad_norm_var": 0.09557291666666666, "learning_rate": 0.0003, "loss": 13.5785, "loss/aux_loss": 0.04813319090753794, "loss/crossentropy": 3.0203604459762574, "loss/logits": 1.0731590211391449, "step": 6690 }, { "epoch": 0.067, "grad_norm": 9.1875, "grad_norm_var": 0.20358072916666667, "learning_rate": 0.0003, "loss": 13.567, "loss/aux_loss": 0.04814446251839399, "loss/crossentropy": 2.9507773220539093, "loss/logits": 1.070712435245514, "step": 6700 }, { "epoch": 0.0671, "grad_norm": 9.3125, "grad_norm_var": 0.06041666666666667, "learning_rate": 0.0003, "loss": 13.7238, "loss/aux_loss": 0.04814040027558804, "loss/crossentropy": 2.8811200976371767, "loss/logits": 1.0566608518362046, "step": 6710 }, { "epoch": 0.0672, "grad_norm": 8.4375, "grad_norm_var": 0.111962890625, "learning_rate": 0.0003, "loss": 13.5595, "loss/aux_loss": 0.048139039613306525, "loss/crossentropy": 3.0208721280097963, "loss/logits": 1.0752500742673874, "step": 6720 }, { "epoch": 0.0673, "grad_norm": 10.25, "grad_norm_var": 0.47317708333333336, "learning_rate": 0.0003, "loss": 13.5703, "loss/aux_loss": 0.04813670702278614, "loss/crossentropy": 3.0942620396614076, "loss/logits": 1.0718396067619325, "step": 6730 }, { "epoch": 0.0674, "grad_norm": 8.5625, "grad_norm_var": 0.3111979166666667, "learning_rate": 0.0003, "loss": 13.7648, "loss/aux_loss": 0.048137816973030566, "loss/crossentropy": 3.032657301425934, "loss/logits": 1.0898617118597032, "step": 6740 }, { "epoch": 0.0675, "grad_norm": 8.6875, "grad_norm_var": 0.3921875, "learning_rate": 0.0003, "loss": 13.682, "loss/aux_loss": 0.048137097433209416, "loss/crossentropy": 3.14407594203949, "loss/logits": 1.0231775403022767, "step": 6750 }, { "epoch": 0.0676, "grad_norm": 8.3125, "grad_norm_var": 0.31942952473958336, "learning_rate": 0.0003, "loss": 13.5635, "loss/aux_loss": 0.048129927739501, "loss/crossentropy": 3.025140118598938, "loss/logits": 1.051462560892105, "step": 6760 }, { "epoch": 0.0677, "grad_norm": 9.1875, "grad_norm_var": 0.3827433268229167, "learning_rate": 0.0003, "loss": 13.4952, "loss/aux_loss": 0.04813184943050146, "loss/crossentropy": 3.036393105983734, "loss/logits": 1.0355240046977996, "step": 6770 }, { "epoch": 0.0678, "grad_norm": 9.6875, "grad_norm_var": 0.45315348307291664, "learning_rate": 0.0003, "loss": 13.6149, "loss/aux_loss": 0.048135831765830514, "loss/crossentropy": 3.0204987287521363, "loss/logits": 1.0412966758012772, "step": 6780 }, { "epoch": 0.0679, "grad_norm": 9.375, "grad_norm_var": 0.19273681640625, "learning_rate": 0.0003, "loss": 13.5112, "loss/aux_loss": 0.048138886131346224, "loss/crossentropy": 2.7807726860046387, "loss/logits": 1.0488616794347763, "step": 6790 }, { "epoch": 0.068, "grad_norm": 8.8125, "grad_norm_var": 0.2384765625, "learning_rate": 0.0003, "loss": 13.8943, "loss/aux_loss": 0.048131432943046094, "loss/crossentropy": 3.068627381324768, "loss/logits": 1.097953936457634, "step": 6800 }, { "epoch": 0.0681, "grad_norm": 8.6875, "grad_norm_var": 0.16027018229166667, "learning_rate": 0.0003, "loss": 13.4602, "loss/aux_loss": 0.04812794364988804, "loss/crossentropy": 2.918659710884094, "loss/logits": 1.0384095519781114, "step": 6810 }, { "epoch": 0.0682, "grad_norm": 9.4375, "grad_norm_var": 0.13854166666666667, "learning_rate": 0.0003, "loss": 13.5068, "loss/aux_loss": 0.04812986459583044, "loss/crossentropy": 2.9638909816741945, "loss/logits": 1.0766464948654175, "step": 6820 }, { "epoch": 0.0683, "grad_norm": 8.6875, "grad_norm_var": 0.30514322916666664, "learning_rate": 0.0003, "loss": 13.4987, "loss/aux_loss": 0.04813037347048521, "loss/crossentropy": 2.929929780960083, "loss/logits": 1.0736204475164413, "step": 6830 }, { "epoch": 0.0684, "grad_norm": 8.625, "grad_norm_var": 0.1087890625, "learning_rate": 0.0003, "loss": 13.5562, "loss/aux_loss": 0.048125201091170314, "loss/crossentropy": 2.925602376461029, "loss/logits": 1.0603425681591034, "step": 6840 }, { "epoch": 0.0685, "grad_norm": 10.625, "grad_norm_var": 32.565608723958334, "learning_rate": 0.0003, "loss": 13.612, "loss/aux_loss": 0.048129927739501, "loss/crossentropy": 2.9750213265419005, "loss/logits": 1.0510219603776931, "step": 6850 }, { "epoch": 0.0686, "grad_norm": 8.4375, "grad_norm_var": 32.53943684895833, "learning_rate": 0.0003, "loss": 13.5853, "loss/aux_loss": 0.048130680806934834, "loss/crossentropy": 2.939654362201691, "loss/logits": 1.081050756573677, "step": 6860 }, { "epoch": 0.0687, "grad_norm": 10.25, "grad_norm_var": 2.6824055989583333, "learning_rate": 0.0003, "loss": 13.6429, "loss/aux_loss": 0.048120100237429145, "loss/crossentropy": 3.081457090377808, "loss/logits": 1.042839017510414, "step": 6870 }, { "epoch": 0.0688, "grad_norm": 8.875, "grad_norm_var": 0.37109375, "learning_rate": 0.0003, "loss": 13.5743, "loss/aux_loss": 0.04812851026654243, "loss/crossentropy": 2.9788452863693236, "loss/logits": 1.082206028699875, "step": 6880 }, { "epoch": 0.0689, "grad_norm": 8.8125, "grad_norm_var": 0.12545572916666667, "learning_rate": 0.0003, "loss": 13.3157, "loss/aux_loss": 0.048133143596351145, "loss/crossentropy": 2.8911037921905516, "loss/logits": 1.064788919687271, "step": 6890 }, { "epoch": 0.069, "grad_norm": 9.1875, "grad_norm_var": 0.22213541666666667, "learning_rate": 0.0003, "loss": 13.6541, "loss/aux_loss": 0.04813574869185686, "loss/crossentropy": 3.002419984340668, "loss/logits": 1.031218209862709, "step": 6900 }, { "epoch": 0.0691, "grad_norm": 10.0, "grad_norm_var": 0.33274739583333335, "learning_rate": 0.0003, "loss": 13.4968, "loss/aux_loss": 0.048126774840056896, "loss/crossentropy": 2.7656411051750185, "loss/logits": 1.0477019995450974, "step": 6910 }, { "epoch": 0.0692, "grad_norm": 8.25, "grad_norm_var": 0.7972493489583333, "learning_rate": 0.0003, "loss": 13.4553, "loss/aux_loss": 0.04813210777938366, "loss/crossentropy": 2.996220147609711, "loss/logits": 1.076385298371315, "step": 6920 }, { "epoch": 0.0693, "grad_norm": 8.375, "grad_norm_var": 0.7956990559895833, "learning_rate": 0.0003, "loss": 13.6957, "loss/aux_loss": 0.04813826754689217, "loss/crossentropy": 3.052574133872986, "loss/logits": 1.0720904529094697, "step": 6930 }, { "epoch": 0.0694, "grad_norm": 9.5, "grad_norm_var": 0.28631184895833334, "learning_rate": 0.0003, "loss": 13.2369, "loss/aux_loss": 0.048143592104315756, "loss/crossentropy": 2.9044690668582915, "loss/logits": 1.0308396130800248, "step": 6940 }, { "epoch": 0.0695, "grad_norm": 9.0, "grad_norm_var": 7.226416015625, "learning_rate": 0.0003, "loss": 13.4, "loss/aux_loss": 0.04815136883407831, "loss/crossentropy": 2.968579125404358, "loss/logits": 1.0896869003772736, "step": 6950 }, { "epoch": 0.0696, "grad_norm": 8.9375, "grad_norm_var": 0.508447265625, "learning_rate": 0.0003, "loss": 13.505, "loss/aux_loss": 0.048129218816757205, "loss/crossentropy": 2.9993494272232057, "loss/logits": 1.0352261871099473, "step": 6960 }, { "epoch": 0.0697, "grad_norm": 9.875, "grad_norm_var": 0.6348307291666667, "learning_rate": 0.0003, "loss": 13.6564, "loss/aux_loss": 0.04812279660254717, "loss/crossentropy": 2.970927131175995, "loss/logits": 1.0954313904047013, "step": 6970 }, { "epoch": 0.0698, "grad_norm": 9.5, "grad_norm_var": 0.2203125, "learning_rate": 0.0003, "loss": 13.5257, "loss/aux_loss": 0.04813241846859455, "loss/crossentropy": 3.085184133052826, "loss/logits": 1.0751633316278457, "step": 6980 }, { "epoch": 0.0699, "grad_norm": 9.625, "grad_norm_var": 0.24140625, "learning_rate": 0.0003, "loss": 13.5025, "loss/aux_loss": 0.048134620860219, "loss/crossentropy": 2.9781801462173463, "loss/logits": 1.073571789264679, "step": 6990 }, { "epoch": 0.07, "grad_norm": 9.0, "grad_norm_var": 0.211572265625, "learning_rate": 0.0003, "loss": 13.3884, "loss/aux_loss": 0.048129561357200146, "loss/crossentropy": 2.836885952949524, "loss/logits": 1.0355841994285584, "step": 7000 }, { "epoch": 0.0701, "grad_norm": 8.9375, "grad_norm_var": 0.05572916666666667, "learning_rate": 0.0003, "loss": 13.3751, "loss/aux_loss": 0.04813597537577152, "loss/crossentropy": 3.124106729030609, "loss/logits": 1.087377232313156, "step": 7010 }, { "epoch": 0.0702, "grad_norm": 10.0625, "grad_norm_var": 0.28932291666666665, "learning_rate": 0.0003, "loss": 13.6506, "loss/aux_loss": 0.04812758322805166, "loss/crossentropy": 3.0415536522865296, "loss/logits": 1.0985166609287262, "step": 7020 }, { "epoch": 0.0703, "grad_norm": 9.125, "grad_norm_var": 0.32928059895833334, "learning_rate": 0.0003, "loss": 13.5101, "loss/aux_loss": 0.048123250156641005, "loss/crossentropy": 2.903241181373596, "loss/logits": 1.0201388955116273, "step": 7030 }, { "epoch": 0.0704, "grad_norm": 11.0625, "grad_norm_var": 0.6508951822916667, "learning_rate": 0.0003, "loss": 13.5161, "loss/aux_loss": 0.048136004246771336, "loss/crossentropy": 2.9222341775894165, "loss/logits": 1.0136090040206909, "step": 7040 }, { "epoch": 0.0705, "grad_norm": 8.875, "grad_norm_var": 0.5880208333333333, "learning_rate": 0.0003, "loss": 13.5017, "loss/aux_loss": 0.048121250979602334, "loss/crossentropy": 2.9636539459228515, "loss/logits": 1.079690435528755, "step": 7050 }, { "epoch": 0.0706, "grad_norm": 9.0625, "grad_norm_var": 0.231884765625, "learning_rate": 0.0003, "loss": 13.4343, "loss/aux_loss": 0.048114245571196076, "loss/crossentropy": 3.0601498603820803, "loss/logits": 1.066567412018776, "step": 7060 }, { "epoch": 0.0707, "grad_norm": 8.375, "grad_norm_var": 0.17994791666666668, "learning_rate": 0.0003, "loss": 13.3876, "loss/aux_loss": 0.048132015578448775, "loss/crossentropy": 2.9693622946739198, "loss/logits": 1.0554438531398773, "step": 7070 }, { "epoch": 0.0708, "grad_norm": 9.1875, "grad_norm_var": 0.4327473958333333, "learning_rate": 0.0003, "loss": 13.495, "loss/aux_loss": 0.04813415054231882, "loss/crossentropy": 2.8446732878685, "loss/logits": 1.0407306522130966, "step": 7080 }, { "epoch": 0.0709, "grad_norm": 9.1875, "grad_norm_var": 0.16599934895833332, "learning_rate": 0.0003, "loss": 13.4971, "loss/aux_loss": 0.04813690483570099, "loss/crossentropy": 2.961193633079529, "loss/logits": 1.04742229282856, "step": 7090 }, { "epoch": 0.071, "grad_norm": 9.1875, "grad_norm_var": 0.207666015625, "learning_rate": 0.0003, "loss": 13.3663, "loss/aux_loss": 0.04812674857676029, "loss/crossentropy": 2.898916572332382, "loss/logits": 1.0212170660495759, "step": 7100 }, { "epoch": 0.0711, "grad_norm": 9.875, "grad_norm_var": 0.1853515625, "learning_rate": 0.0003, "loss": 13.5333, "loss/aux_loss": 0.0481316477060318, "loss/crossentropy": 2.9347579002380373, "loss/logits": 0.9982910871505737, "step": 7110 }, { "epoch": 0.0712, "grad_norm": 8.5, "grad_norm_var": 0.17381184895833332, "learning_rate": 0.0003, "loss": 13.4628, "loss/aux_loss": 0.048122935183346274, "loss/crossentropy": 2.993864929676056, "loss/logits": 1.0623649686574936, "step": 7120 }, { "epoch": 0.0713, "grad_norm": 9.625, "grad_norm_var": 6.217643229166667, "learning_rate": 0.0003, "loss": 13.3297, "loss/aux_loss": 0.048131784237921235, "loss/crossentropy": 2.932550811767578, "loss/logits": 1.0577648341655732, "step": 7130 }, { "epoch": 0.0714, "grad_norm": 11.3125, "grad_norm_var": 18.408072916666665, "learning_rate": 0.0003, "loss": 13.6702, "loss/aux_loss": 0.04816462509334087, "loss/crossentropy": 3.037775385379791, "loss/logits": 1.0722199440002442, "step": 7140 }, { "epoch": 0.0715, "grad_norm": 10.125, "grad_norm_var": 5.566910807291666, "learning_rate": 0.0003, "loss": 13.5737, "loss/aux_loss": 0.048116024024784564, "loss/crossentropy": 2.9626861453056335, "loss/logits": 1.0311239361763, "step": 7150 }, { "epoch": 0.0716, "grad_norm": 10.375, "grad_norm_var": 2.6786295572916665, "learning_rate": 0.0003, "loss": 13.4786, "loss/aux_loss": 0.04811809528619051, "loss/crossentropy": 2.9172492921352386, "loss/logits": 1.056332242488861, "step": 7160 }, { "epoch": 0.0717, "grad_norm": 9.8125, "grad_norm_var": 2.5603515625, "learning_rate": 0.0003, "loss": 13.4434, "loss/aux_loss": 0.04812202490866184, "loss/crossentropy": 3.057690107822418, "loss/logits": 1.0905145525932312, "step": 7170 }, { "epoch": 0.0718, "grad_norm": 11.8125, "grad_norm_var": 26.733707682291666, "learning_rate": 0.0003, "loss": 13.5006, "loss/aux_loss": 0.04812044147402048, "loss/crossentropy": 2.9608686804771422, "loss/logits": 1.0904987782239914, "step": 7180 }, { "epoch": 0.0719, "grad_norm": 9.625, "grad_norm_var": 25.960921223958334, "learning_rate": 0.0003, "loss": 13.4463, "loss/aux_loss": 0.048137583397328855, "loss/crossentropy": 2.895015776157379, "loss/logits": 1.0172715038061142, "step": 7190 }, { "epoch": 0.072, "grad_norm": 8.25, "grad_norm_var": 0.31417643229166664, "learning_rate": 0.0003, "loss": 13.5728, "loss/aux_loss": 0.04812595229595899, "loss/crossentropy": 2.8322587251663207, "loss/logits": 1.015550658106804, "step": 7200 }, { "epoch": 0.0721, "grad_norm": 9.875, "grad_norm_var": 0.2728515625, "learning_rate": 0.0003, "loss": 13.6334, "loss/aux_loss": 0.048122029192745684, "loss/crossentropy": 3.037608253955841, "loss/logits": 1.059059676527977, "step": 7210 }, { "epoch": 0.0722, "grad_norm": 10.5625, "grad_norm_var": 0.31951497395833334, "learning_rate": 0.0003, "loss": 13.4644, "loss/aux_loss": 0.04813063070178032, "loss/crossentropy": 2.9392677783966064, "loss/logits": 1.026180136203766, "step": 7220 }, { "epoch": 0.0723, "grad_norm": 8.5, "grad_norm_var": 0.5988118489583333, "learning_rate": 0.0003, "loss": 13.4826, "loss/aux_loss": 0.04812701418995857, "loss/crossentropy": 2.9874269366264343, "loss/logits": 1.0393612265586853, "step": 7230 }, { "epoch": 0.0724, "grad_norm": 8.4375, "grad_norm_var": 0.6067708333333334, "learning_rate": 0.0003, "loss": 13.2889, "loss/aux_loss": 0.04812222328037023, "loss/crossentropy": 2.7734349012374877, "loss/logits": 1.0329697102308273, "step": 7240 }, { "epoch": 0.0725, "grad_norm": 8.9375, "grad_norm_var": 0.18592122395833333, "learning_rate": 0.0003, "loss": 13.5025, "loss/aux_loss": 0.04812582526355982, "loss/crossentropy": 2.9962441444396974, "loss/logits": 1.024059322476387, "step": 7250 }, { "epoch": 0.0726, "grad_norm": 9.4375, "grad_norm_var": 0.06796875, "learning_rate": 0.0003, "loss": 13.2807, "loss/aux_loss": 0.048132246173918244, "loss/crossentropy": 2.820746290683746, "loss/logits": 1.0364280879497527, "step": 7260 }, { "epoch": 0.0727, "grad_norm": 8.5625, "grad_norm_var": 0.19034830729166666, "learning_rate": 0.0003, "loss": 13.5448, "loss/aux_loss": 0.048121783323585986, "loss/crossentropy": 3.010144531726837, "loss/logits": 1.0459368169307708, "step": 7270 }, { "epoch": 0.0728, "grad_norm": 9.0, "grad_norm_var": 5.168733723958334, "learning_rate": 0.0003, "loss": 13.3248, "loss/aux_loss": 0.048125391267240046, "loss/crossentropy": 2.889055919647217, "loss/logits": 1.0278723955154419, "step": 7280 }, { "epoch": 0.0729, "grad_norm": 9.875, "grad_norm_var": 5.200634765625, "learning_rate": 0.0003, "loss": 13.2574, "loss/aux_loss": 0.04812876787036657, "loss/crossentropy": 2.8185496270656585, "loss/logits": 1.0287230491638184, "step": 7290 }, { "epoch": 0.073, "grad_norm": 8.875, "grad_norm_var": 0.32233072916666666, "learning_rate": 0.0003, "loss": 13.3965, "loss/aux_loss": 0.04812293406575918, "loss/crossentropy": 2.804865860939026, "loss/logits": 1.0147784382104874, "step": 7300 }, { "epoch": 0.0731, "grad_norm": 8.5, "grad_norm_var": 0.18904622395833334, "learning_rate": 0.0003, "loss": 13.3527, "loss/aux_loss": 0.04813094306737185, "loss/crossentropy": 2.918304455280304, "loss/logits": 1.079040315747261, "step": 7310 }, { "epoch": 0.0732, "grad_norm": 10.8125, "grad_norm_var": 49.551936848958334, "learning_rate": 0.0003, "loss": 13.3824, "loss/aux_loss": 0.04812980853021145, "loss/crossentropy": 2.9288637161254885, "loss/logits": 1.0903980165719986, "step": 7320 }, { "epoch": 0.0733, "grad_norm": 9.0, "grad_norm_var": 49.57076822916667, "learning_rate": 0.0003, "loss": 13.2886, "loss/aux_loss": 0.04812570326030254, "loss/crossentropy": 3.002810549736023, "loss/logits": 1.0841778188943862, "step": 7330 }, { "epoch": 0.0734, "grad_norm": 8.9375, "grad_norm_var": 3.1300618489583334, "learning_rate": 0.0003, "loss": 13.5399, "loss/aux_loss": 0.04813100174069405, "loss/crossentropy": 2.796613943576813, "loss/logits": 1.0452454775571822, "step": 7340 }, { "epoch": 0.0735, "grad_norm": 9.5, "grad_norm_var": 8.736442057291667, "learning_rate": 0.0003, "loss": 13.5117, "loss/aux_loss": 0.04813167788088322, "loss/crossentropy": 2.962803506851196, "loss/logits": 1.0180111587047578, "step": 7350 }, { "epoch": 0.0736, "grad_norm": 8.5625, "grad_norm_var": 7.845768229166667, "learning_rate": 0.0003, "loss": 13.3754, "loss/aux_loss": 0.04813878424465656, "loss/crossentropy": 2.81993590593338, "loss/logits": 1.0155292719602584, "step": 7360 }, { "epoch": 0.0737, "grad_norm": 9.875, "grad_norm_var": 0.30514322916666664, "learning_rate": 0.0003, "loss": 13.3653, "loss/aux_loss": 0.04812592975795269, "loss/crossentropy": 2.8093133509159087, "loss/logits": 1.0710052281618119, "step": 7370 }, { "epoch": 0.0738, "grad_norm": 8.6875, "grad_norm_var": 0.24295247395833333, "learning_rate": 0.0003, "loss": 13.5517, "loss/aux_loss": 0.04812061432749033, "loss/crossentropy": 2.838201379776001, "loss/logits": 1.0604495793581008, "step": 7380 }, { "epoch": 0.0739, "grad_norm": 9.4375, "grad_norm_var": 0.20167643229166668, "learning_rate": 0.0003, "loss": 13.5058, "loss/aux_loss": 0.048129613324999806, "loss/crossentropy": 3.0717398285865785, "loss/logits": 1.068494337797165, "step": 7390 }, { "epoch": 0.074, "grad_norm": 9.0, "grad_norm_var": 0.10050455729166667, "learning_rate": 0.0003, "loss": 13.3541, "loss/aux_loss": 0.048131177015602586, "loss/crossentropy": 2.886682081222534, "loss/logits": 1.030799898505211, "step": 7400 }, { "epoch": 0.0741, "grad_norm": 8.75, "grad_norm_var": 0.44505208333333335, "learning_rate": 0.0003, "loss": 13.336, "loss/aux_loss": 0.04812990296632051, "loss/crossentropy": 2.886744201183319, "loss/logits": 1.0507986098527908, "step": 7410 }, { "epoch": 0.0742, "grad_norm": 9.5625, "grad_norm_var": 0.338525390625, "learning_rate": 0.0003, "loss": 13.5748, "loss/aux_loss": 0.04813168831169605, "loss/crossentropy": 2.8818042397499086, "loss/logits": 1.0512411534786223, "step": 7420 }, { "epoch": 0.0743, "grad_norm": 8.6875, "grad_norm_var": 0.3155598958333333, "learning_rate": 0.0003, "loss": 13.4908, "loss/aux_loss": 0.04812338091433048, "loss/crossentropy": 2.8881842494010925, "loss/logits": 1.0324068903923034, "step": 7430 }, { "epoch": 0.0744, "grad_norm": 9.6875, "grad_norm_var": 0.24993489583333334, "learning_rate": 0.0003, "loss": 13.3201, "loss/aux_loss": 0.04813193250447512, "loss/crossentropy": 2.707760387659073, "loss/logits": 1.041485771536827, "step": 7440 }, { "epoch": 0.0745, "grad_norm": 9.3125, "grad_norm_var": 0.5744140625, "learning_rate": 0.0003, "loss": 13.1496, "loss/aux_loss": 0.04813508708029986, "loss/crossentropy": 2.864105689525604, "loss/logits": 0.9952063351869583, "step": 7450 }, { "epoch": 0.0746, "grad_norm": 9.1875, "grad_norm_var": 0.17433268229166668, "learning_rate": 0.0003, "loss": 13.6981, "loss/aux_loss": 0.04812800846993923, "loss/crossentropy": 3.035504865646362, "loss/logits": 1.0508066952228545, "step": 7460 }, { "epoch": 0.0747, "grad_norm": 8.75, "grad_norm_var": 0.115869140625, "learning_rate": 0.0003, "loss": 13.3485, "loss/aux_loss": 0.048122041299939154, "loss/crossentropy": 2.7956653356552126, "loss/logits": 1.0159823626279831, "step": 7470 }, { "epoch": 0.0748, "grad_norm": 9.0, "grad_norm_var": 0.3651041666666667, "learning_rate": 0.0003, "loss": 13.4521, "loss/aux_loss": 0.048129369504749775, "loss/crossentropy": 2.969923257827759, "loss/logits": 1.0386756300926208, "step": 7480 }, { "epoch": 0.0749, "grad_norm": 9.125, "grad_norm_var": 1.2169270833333334, "learning_rate": 0.0003, "loss": 13.3878, "loss/aux_loss": 0.04812902975827456, "loss/crossentropy": 3.011993145942688, "loss/logits": 1.042042750120163, "step": 7490 }, { "epoch": 0.075, "grad_norm": 9.0, "grad_norm_var": 0.242431640625, "learning_rate": 0.0003, "loss": 13.4144, "loss/aux_loss": 0.04812177959829569, "loss/crossentropy": 3.07977237701416, "loss/logits": 1.0765836715698243, "step": 7500 }, { "epoch": 0.0751, "grad_norm": 9.75, "grad_norm_var": 0.33645833333333336, "learning_rate": 0.0003, "loss": 13.4444, "loss/aux_loss": 0.04811902064830065, "loss/crossentropy": 2.9798253655433653, "loss/logits": 1.0538707852363587, "step": 7510 }, { "epoch": 0.0752, "grad_norm": 9.0, "grad_norm_var": 0.18904622395833334, "learning_rate": 0.0003, "loss": 13.3292, "loss/aux_loss": 0.048129689320921897, "loss/crossentropy": 2.9677703261375425, "loss/logits": 1.0421554505825044, "step": 7520 }, { "epoch": 0.0753, "grad_norm": 9.0625, "grad_norm_var": 0.10859375, "learning_rate": 0.0003, "loss": 13.1599, "loss/aux_loss": 0.0481248639523983, "loss/crossentropy": 2.9143801808357237, "loss/logits": 1.0279333680868148, "step": 7530 }, { "epoch": 0.0754, "grad_norm": 12.0625, "grad_norm_var": 0.9742024739583334, "learning_rate": 0.0003, "loss": 13.2219, "loss/aux_loss": 0.048115496151149274, "loss/crossentropy": 2.980224275588989, "loss/logits": 1.039728471636772, "step": 7540 }, { "epoch": 0.0755, "grad_norm": 9.125, "grad_norm_var": 0.9067057291666667, "learning_rate": 0.0003, "loss": 13.3821, "loss/aux_loss": 0.04812954906374216, "loss/crossentropy": 2.9692449450492857, "loss/logits": 1.047850751876831, "step": 7550 }, { "epoch": 0.0756, "grad_norm": 9.3125, "grad_norm_var": 0.07805989583333334, "learning_rate": 0.0003, "loss": 13.3762, "loss/aux_loss": 0.048129818961024286, "loss/crossentropy": 2.990733635425568, "loss/logits": 1.035420474410057, "step": 7560 }, { "epoch": 0.0757, "grad_norm": 9.375, "grad_norm_var": 5.944514973958333, "learning_rate": 0.0003, "loss": 13.5087, "loss/aux_loss": 0.048126287385821344, "loss/crossentropy": 3.0128140330314634, "loss/logits": 1.0564094483852386, "step": 7570 }, { "epoch": 0.0758, "grad_norm": 9.1875, "grad_norm_var": 0.4749348958333333, "learning_rate": 0.0003, "loss": 13.2836, "loss/aux_loss": 0.048132631182670596, "loss/crossentropy": 2.837554985284805, "loss/logits": 1.0422370553016662, "step": 7580 }, { "epoch": 0.0759, "grad_norm": 9.0625, "grad_norm_var": 0.45388997395833336, "learning_rate": 0.0003, "loss": 13.1942, "loss/aux_loss": 0.048142952285706996, "loss/crossentropy": 2.6931034505367277, "loss/logits": 0.96292115598917, "step": 7590 }, { "epoch": 0.076, "grad_norm": 22.125, "grad_norm_var": 10.807275390625, "learning_rate": 0.0003, "loss": 13.1736, "loss/aux_loss": 0.04812065456062555, "loss/crossentropy": 2.9220955312252044, "loss/logits": 1.0179326832294464, "step": 7600 }, { "epoch": 0.0761, "grad_norm": 10.125, "grad_norm_var": 10.003629557291667, "learning_rate": 0.0003, "loss": 13.4376, "loss/aux_loss": 0.04812586084008217, "loss/crossentropy": 3.021818733215332, "loss/logits": 1.0483725011348723, "step": 7610 }, { "epoch": 0.0762, "grad_norm": 8.6875, "grad_norm_var": 1.119384765625, "learning_rate": 0.0003, "loss": 13.2549, "loss/aux_loss": 0.04812179896980524, "loss/crossentropy": 2.780340301990509, "loss/logits": 1.0378026425838471, "step": 7620 }, { "epoch": 0.0763, "grad_norm": 9.875, "grad_norm_var": 1.0983723958333333, "learning_rate": 0.0003, "loss": 13.22, "loss/aux_loss": 0.04812696985900402, "loss/crossentropy": 2.8523794054985045, "loss/logits": 1.0297119617462158, "step": 7630 }, { "epoch": 0.0764, "grad_norm": 9.0, "grad_norm_var": 4.044645182291666, "learning_rate": 0.0003, "loss": 13.391, "loss/aux_loss": 0.04813089091330767, "loss/crossentropy": 2.98396714925766, "loss/logits": 1.0715709984302522, "step": 7640 }, { "epoch": 0.0765, "grad_norm": 8.8125, "grad_norm_var": 4.178759765625, "learning_rate": 0.0003, "loss": 13.1596, "loss/aux_loss": 0.048119811527431014, "loss/crossentropy": 2.919600564241409, "loss/logits": 0.9985195219516754, "step": 7650 }, { "epoch": 0.0766, "grad_norm": 10.3125, "grad_norm_var": 0.21808268229166666, "learning_rate": 0.0003, "loss": 13.4931, "loss/aux_loss": 0.048118037171661854, "loss/crossentropy": 2.9762576520442963, "loss/logits": 1.0814913272857667, "step": 7660 }, { "epoch": 0.0767, "grad_norm": 9.6875, "grad_norm_var": 0.18014322916666667, "learning_rate": 0.0003, "loss": 13.3496, "loss/aux_loss": 0.04812421500682831, "loss/crossentropy": 2.9494404554367066, "loss/logits": 1.0210811465978622, "step": 7670 }, { "epoch": 0.0768, "grad_norm": 9.5625, "grad_norm_var": 0.19524739583333334, "learning_rate": 0.0003, "loss": 13.3235, "loss/aux_loss": 0.04811930097639561, "loss/crossentropy": 2.8925601482391357, "loss/logits": 1.0253148704767228, "step": 7680 }, { "epoch": 0.0769, "grad_norm": 9.5, "grad_norm_var": 0.3277180989583333, "learning_rate": 0.0003, "loss": 13.4136, "loss/aux_loss": 0.04813026450574398, "loss/crossentropy": 2.9719626665115357, "loss/logits": 1.1240254521369935, "step": 7690 }, { "epoch": 0.077, "grad_norm": 9.3125, "grad_norm_var": 0.28932291666666665, "learning_rate": 0.0003, "loss": 13.3069, "loss/aux_loss": 0.048117564991116524, "loss/crossentropy": 2.9367773652076723, "loss/logits": 1.0887930393218994, "step": 7700 }, { "epoch": 0.0771, "grad_norm": 9.3125, "grad_norm_var": 0.16608072916666666, "learning_rate": 0.0003, "loss": 13.4119, "loss/aux_loss": 0.04813154824078083, "loss/crossentropy": 2.821038991212845, "loss/logits": 1.0327024161815643, "step": 7710 }, { "epoch": 0.0772, "grad_norm": 9.6875, "grad_norm_var": 0.1103515625, "learning_rate": 0.0003, "loss": 13.3759, "loss/aux_loss": 0.04812257084995508, "loss/crossentropy": 2.922965955734253, "loss/logits": 1.0887499898672104, "step": 7720 }, { "epoch": 0.0773, "grad_norm": 8.75, "grad_norm_var": 0.39212239583333336, "learning_rate": 0.0003, "loss": 13.1938, "loss/aux_loss": 0.04812620896846056, "loss/crossentropy": 2.873304957151413, "loss/logits": 1.0040156990289688, "step": 7730 }, { "epoch": 0.0774, "grad_norm": 9.75, "grad_norm_var": 0.46608072916666665, "learning_rate": 0.0003, "loss": 13.5227, "loss/aux_loss": 0.048138899728655815, "loss/crossentropy": 2.9522013902664184, "loss/logits": 1.0542542576789855, "step": 7740 }, { "epoch": 0.0775, "grad_norm": 8.9375, "grad_norm_var": 0.5067057291666667, "learning_rate": 0.0003, "loss": 13.2829, "loss/aux_loss": 0.04812390860170126, "loss/crossentropy": 2.9055544257164003, "loss/logits": 1.0098161727190018, "step": 7750 }, { "epoch": 0.0776, "grad_norm": 9.0625, "grad_norm_var": 0.22941080729166666, "learning_rate": 0.0003, "loss": 13.3498, "loss/aux_loss": 0.04813548941165209, "loss/crossentropy": 2.980088174343109, "loss/logits": 1.0641280621290208, "step": 7760 }, { "epoch": 0.0777, "grad_norm": 10.5, "grad_norm_var": 0.3078125, "learning_rate": 0.0003, "loss": 13.3331, "loss/aux_loss": 0.048122762329876424, "loss/crossentropy": 2.9135417342185974, "loss/logits": 1.0203843981027603, "step": 7770 }, { "epoch": 0.0778, "grad_norm": 9.25, "grad_norm_var": 7.404150390625, "learning_rate": 0.0003, "loss": 13.2873, "loss/aux_loss": 0.04813399352133274, "loss/crossentropy": 2.841181445121765, "loss/logits": 1.055801859498024, "step": 7780 }, { "epoch": 0.0779, "grad_norm": 10.6875, "grad_norm_var": 6.986197916666667, "learning_rate": 0.0003, "loss": 13.1383, "loss/aux_loss": 0.04812779631465673, "loss/crossentropy": 2.8845179080963135, "loss/logits": 1.0134330958127975, "step": 7790 }, { "epoch": 0.078, "grad_norm": 34.0, "grad_norm_var": 37.05651041666667, "learning_rate": 0.0003, "loss": 13.2773, "loss/aux_loss": 0.04813188221305609, "loss/crossentropy": 2.9372805774211885, "loss/logits": 0.9858429193496704, "step": 7800 }, { "epoch": 0.0781, "grad_norm": 8.5625, "grad_norm_var": 37.1462890625, "learning_rate": 0.0003, "loss": 13.3157, "loss/aux_loss": 0.04813784416764975, "loss/crossentropy": 2.767670226097107, "loss/logits": 1.0314590692520142, "step": 7810 }, { "epoch": 0.0782, "grad_norm": 9.1875, "grad_norm_var": 0.728759765625, "learning_rate": 0.0003, "loss": 13.071, "loss/aux_loss": 0.048109129257500174, "loss/crossentropy": 2.9309176981449125, "loss/logits": 1.0385325998067856, "step": 7820 }, { "epoch": 0.0783, "grad_norm": 9.6875, "grad_norm_var": 0.3277180989583333, "learning_rate": 0.0003, "loss": 13.1624, "loss/aux_loss": 0.04811421576887369, "loss/crossentropy": 2.97960284948349, "loss/logits": 1.0207297384738923, "step": 7830 }, { "epoch": 0.0784, "grad_norm": 9.9375, "grad_norm_var": 1.3139973958333333, "learning_rate": 0.0003, "loss": 13.1052, "loss/aux_loss": 0.048128409497439864, "loss/crossentropy": 2.8389533042907713, "loss/logits": 1.035956397652626, "step": 7840 }, { "epoch": 0.0785, "grad_norm": 9.125, "grad_norm_var": 0.7958333333333333, "learning_rate": 0.0003, "loss": 13.2657, "loss/aux_loss": 0.04812541268765926, "loss/crossentropy": 2.9305792689323424, "loss/logits": 1.0182174772024155, "step": 7850 }, { "epoch": 0.0786, "grad_norm": 8.9375, "grad_norm_var": 0.4175618489583333, "learning_rate": 0.0003, "loss": 13.3332, "loss/aux_loss": 0.0481265714392066, "loss/crossentropy": 3.0935042262077332, "loss/logits": 1.0543415069580078, "step": 7860 }, { "epoch": 0.0787, "grad_norm": 9.3125, "grad_norm_var": 0.5536295572916666, "learning_rate": 0.0003, "loss": 13.2632, "loss/aux_loss": 0.048121869936585425, "loss/crossentropy": 2.859709286689758, "loss/logits": 1.0271731585264205, "step": 7870 }, { "epoch": 0.0788, "grad_norm": 9.375, "grad_norm_var": 0.32180989583333336, "learning_rate": 0.0003, "loss": 13.3159, "loss/aux_loss": 0.04812927972525358, "loss/crossentropy": 2.855903148651123, "loss/logits": 1.0336874067783355, "step": 7880 }, { "epoch": 0.0789, "grad_norm": 9.75, "grad_norm_var": 0.236572265625, "learning_rate": 0.0003, "loss": 13.2051, "loss/aux_loss": 0.048123538866639136, "loss/crossentropy": 2.887688386440277, "loss/logits": 1.0365424662828446, "step": 7890 }, { "epoch": 0.079, "grad_norm": 11.25, "grad_norm_var": 0.37473958333333335, "learning_rate": 0.0003, "loss": 13.265, "loss/aux_loss": 0.04812454991042614, "loss/crossentropy": 2.9836980283260344, "loss/logits": 1.014005294442177, "step": 7900 }, { "epoch": 0.0791, "grad_norm": 9.5, "grad_norm_var": 0.369384765625, "learning_rate": 0.0003, "loss": 13.2299, "loss/aux_loss": 0.048119301721453664, "loss/crossentropy": 3.002364158630371, "loss/logits": 1.0333095729351043, "step": 7910 }, { "epoch": 0.0792, "grad_norm": 8.8125, "grad_norm_var": 0.18326822916666666, "learning_rate": 0.0003, "loss": 13.2434, "loss/aux_loss": 0.048124428279697894, "loss/crossentropy": 2.854734891653061, "loss/logits": 1.014777159690857, "step": 7920 }, { "epoch": 0.0793, "grad_norm": 9.5, "grad_norm_var": 0.18014322916666667, "learning_rate": 0.0003, "loss": 13.3392, "loss/aux_loss": 0.04812102187424898, "loss/crossentropy": 2.9866424322128298, "loss/logits": 1.04144589304924, "step": 7930 }, { "epoch": 0.0794, "grad_norm": 8.9375, "grad_norm_var": 6.890625, "learning_rate": 0.0003, "loss": 13.2693, "loss/aux_loss": 0.048126323707401754, "loss/crossentropy": 2.918100368976593, "loss/logits": 1.0156398355960845, "step": 7940 }, { "epoch": 0.0795, "grad_norm": 9.0625, "grad_norm_var": 0.5878743489583333, "learning_rate": 0.0003, "loss": 13.0763, "loss/aux_loss": 0.0481284249573946, "loss/crossentropy": 2.915715491771698, "loss/logits": 1.0164682030677796, "step": 7950 }, { "epoch": 0.0796, "grad_norm": 9.25, "grad_norm_var": 0.1791015625, "learning_rate": 0.0003, "loss": 13.0751, "loss/aux_loss": 0.04813295528292656, "loss/crossentropy": 2.8354081392288206, "loss/logits": 0.978666540980339, "step": 7960 }, { "epoch": 0.0797, "grad_norm": 8.8125, "grad_norm_var": 0.2955729166666667, "learning_rate": 0.0003, "loss": 13.3034, "loss/aux_loss": 0.0481256989762187, "loss/crossentropy": 2.8299093306064607, "loss/logits": 1.0147087454795838, "step": 7970 }, { "epoch": 0.0798, "grad_norm": 9.0625, "grad_norm_var": 0.25286458333333334, "learning_rate": 0.0003, "loss": 13.1301, "loss/aux_loss": 0.04812098871916533, "loss/crossentropy": 2.9957703232765196, "loss/logits": 1.0261031478643416, "step": 7980 }, { "epoch": 0.0799, "grad_norm": 9.25, "grad_norm_var": 0.09308268229166666, "learning_rate": 0.0003, "loss": 13.1294, "loss/aux_loss": 0.048122938722372055, "loss/crossentropy": 2.8062502324581144, "loss/logits": 1.0008294701576232, "step": 7990 }, { "epoch": 0.08, "grad_norm": 9.3125, "grad_norm_var": 0.074072265625, "learning_rate": 0.0003, "loss": 13.2716, "loss/aux_loss": 0.04812256768345833, "loss/crossentropy": 2.9094210386276247, "loss/logits": 1.0217216283082962, "step": 8000 }, { "epoch": 0.0801, "grad_norm": 9.25, "grad_norm_var": 0.5249348958333333, "learning_rate": 0.0003, "loss": 13.2679, "loss/aux_loss": 0.048118163272738455, "loss/crossentropy": 2.896920144557953, "loss/logits": 1.0116204470396042, "step": 8010 }, { "epoch": 0.0802, "grad_norm": 9.125, "grad_norm_var": 0.2696451822916667, "learning_rate": 0.0003, "loss": 13.1435, "loss/aux_loss": 0.048120760917663576, "loss/crossentropy": 2.954100179672241, "loss/logits": 1.0261077135801315, "step": 8020 }, { "epoch": 0.0803, "grad_norm": 10.375, "grad_norm_var": 590.15078125, "learning_rate": 0.0003, "loss": 13.296, "loss/aux_loss": 0.04813782777637243, "loss/crossentropy": 2.9724114894866944, "loss/logits": 1.0246656686067581, "step": 8030 }, { "epoch": 0.0804, "grad_norm": 9.3125, "grad_norm_var": 588.4619140625, "learning_rate": 0.0003, "loss": 13.3131, "loss/aux_loss": 0.04812064114958048, "loss/crossentropy": 2.873293662071228, "loss/logits": 1.0408964782953263, "step": 8040 }, { "epoch": 0.0805, "grad_norm": 9.5, "grad_norm_var": 0.21927083333333333, "learning_rate": 0.0003, "loss": 13.4035, "loss/aux_loss": 0.04812377672642469, "loss/crossentropy": 3.0599602937698362, "loss/logits": 1.0257072687149047, "step": 8050 }, { "epoch": 0.0806, "grad_norm": 9.125, "grad_norm_var": 1.2195149739583333, "learning_rate": 0.0003, "loss": 13.1163, "loss/aux_loss": 0.0481159932911396, "loss/crossentropy": 2.8620250105857847, "loss/logits": 0.9948093295097351, "step": 8060 }, { "epoch": 0.0807, "grad_norm": 9.0, "grad_norm_var": 18.907405598958334, "learning_rate": 0.0003, "loss": 13.4342, "loss/aux_loss": 0.048125050216913226, "loss/crossentropy": 2.892456221580505, "loss/logits": 1.0422109365463257, "step": 8070 }, { "epoch": 0.0808, "grad_norm": 10.5, "grad_norm_var": 6.577457682291667, "learning_rate": 0.0003, "loss": 13.336, "loss/aux_loss": 0.04813191127032042, "loss/crossentropy": 2.735273379087448, "loss/logits": 1.033458188176155, "step": 8080 }, { "epoch": 0.0809, "grad_norm": 9.125, "grad_norm_var": 54.12708333333333, "learning_rate": 0.0003, "loss": 13.3489, "loss/aux_loss": 0.04812964014708996, "loss/crossentropy": 2.843802607059479, "loss/logits": 1.0632713794708253, "step": 8090 }, { "epoch": 0.081, "grad_norm": 9.625, "grad_norm_var": 0.4376139322916667, "learning_rate": 0.0003, "loss": 13.0564, "loss/aux_loss": 0.04813598971813917, "loss/crossentropy": 2.841505432128906, "loss/logits": 1.0077117711305619, "step": 8100 }, { "epoch": 0.0811, "grad_norm": 9.5, "grad_norm_var": 0.24088541666666666, "learning_rate": 0.0003, "loss": 13.1167, "loss/aux_loss": 0.04812768436968327, "loss/crossentropy": 2.8955862760543822, "loss/logits": 0.997417938709259, "step": 8110 }, { "epoch": 0.0812, "grad_norm": 9.875, "grad_norm_var": 0.233837890625, "learning_rate": 0.0003, "loss": 13.1671, "loss/aux_loss": 0.04812203329056501, "loss/crossentropy": 2.961570382118225, "loss/logits": 1.0406351834535599, "step": 8120 }, { "epoch": 0.0813, "grad_norm": 9.375, "grad_norm_var": 0.32864583333333336, "learning_rate": 0.0003, "loss": 13.2364, "loss/aux_loss": 0.04812574498355389, "loss/crossentropy": 2.8037814855575562, "loss/logits": 1.0229216545820237, "step": 8130 }, { "epoch": 0.0814, "grad_norm": 9.6875, "grad_norm_var": 0.20677083333333332, "learning_rate": 0.0003, "loss": 13.1272, "loss/aux_loss": 0.04812443684786558, "loss/crossentropy": 2.901040017604828, "loss/logits": 1.0405553728342056, "step": 8140 }, { "epoch": 0.0815, "grad_norm": 9.1875, "grad_norm_var": 0.42962239583333334, "learning_rate": 0.0003, "loss": 13.2754, "loss/aux_loss": 0.04811316020786762, "loss/crossentropy": 3.041601026058197, "loss/logits": 1.049459946155548, "step": 8150 }, { "epoch": 0.0816, "grad_norm": 9.125, "grad_norm_var": 0.41354166666666664, "learning_rate": 0.0003, "loss": 13.1756, "loss/aux_loss": 0.048122165724635124, "loss/crossentropy": 2.9084804534912108, "loss/logits": 1.028309690952301, "step": 8160 }, { "epoch": 0.0817, "grad_norm": 9.625, "grad_norm_var": 0.23566080729166666, "learning_rate": 0.0003, "loss": 13.2451, "loss/aux_loss": 0.04811309780925512, "loss/crossentropy": 2.7368631422519685, "loss/logits": 1.002194732427597, "step": 8170 }, { "epoch": 0.0818, "grad_norm": 9.8125, "grad_norm_var": 0.187744140625, "learning_rate": 0.0003, "loss": 13.0183, "loss/aux_loss": 0.0481279119849205, "loss/crossentropy": 2.908668839931488, "loss/logits": 1.0175655782222748, "step": 8180 }, { "epoch": 0.0819, "grad_norm": 9.0, "grad_norm_var": 0.21927083333333333, "learning_rate": 0.0003, "loss": 13.3305, "loss/aux_loss": 0.0481220519170165, "loss/crossentropy": 2.986106610298157, "loss/logits": 1.025682133436203, "step": 8190 }, { "epoch": 0.082, "grad_norm": 9.125, "grad_norm_var": 0.20115559895833332, "learning_rate": 0.0003, "loss": 13.0415, "loss/aux_loss": 0.04811887349933386, "loss/crossentropy": 2.9409547805786134, "loss/logits": 0.9966282039880753, "step": 8200 }, { "epoch": 0.0821, "grad_norm": 9.6875, "grad_norm_var": 9.998811848958333, "learning_rate": 0.0003, "loss": 13.2195, "loss/aux_loss": 0.04813245311379433, "loss/crossentropy": 2.8335422039031983, "loss/logits": 0.9939737856388092, "step": 8210 }, { "epoch": 0.0822, "grad_norm": 9.4375, "grad_norm_var": 9.541129557291667, "learning_rate": 0.0003, "loss": 13.2792, "loss/aux_loss": 0.04812641255557537, "loss/crossentropy": 2.935366129875183, "loss/logits": 0.9935110956430435, "step": 8220 }, { "epoch": 0.0823, "grad_norm": 10.4375, "grad_norm_var": 1.395947265625, "learning_rate": 0.0003, "loss": 13.4096, "loss/aux_loss": 0.048128544352948666, "loss/crossentropy": 3.0038156509399414, "loss/logits": 1.00513653755188, "step": 8230 }, { "epoch": 0.0824, "grad_norm": 11.1875, "grad_norm_var": 1.0632649739583333, "learning_rate": 0.0003, "loss": 13.2308, "loss/aux_loss": 0.048123649694025515, "loss/crossentropy": 2.852214002609253, "loss/logits": 0.9939478904008865, "step": 8240 }, { "epoch": 0.0825, "grad_norm": 9.875, "grad_norm_var": 1.1158854166666667, "learning_rate": 0.0003, "loss": 13.1413, "loss/aux_loss": 0.0481396097689867, "loss/crossentropy": 2.907454788684845, "loss/logits": 1.0276815801858903, "step": 8250 }, { "epoch": 0.0826, "grad_norm": 9.8125, "grad_norm_var": 0.37224934895833334, "learning_rate": 0.0003, "loss": 13.1788, "loss/aux_loss": 0.04811225663870573, "loss/crossentropy": 2.921643829345703, "loss/logits": 1.0124903351068497, "step": 8260 }, { "epoch": 0.0827, "grad_norm": 10.125, "grad_norm_var": 1.4671875, "learning_rate": 0.0003, "loss": 13.2313, "loss/aux_loss": 0.0481251984834671, "loss/crossentropy": 2.755663204193115, "loss/logits": 1.007522416114807, "step": 8270 }, { "epoch": 0.0828, "grad_norm": 10.125, "grad_norm_var": 1.0781087239583333, "learning_rate": 0.0003, "loss": 13.1676, "loss/aux_loss": 0.048118762858212, "loss/crossentropy": 2.948284614086151, "loss/logits": 1.0121029019355774, "step": 8280 }, { "epoch": 0.0829, "grad_norm": 11.4375, "grad_norm_var": 0.3949055989583333, "learning_rate": 0.0003, "loss": 13.1264, "loss/aux_loss": 0.04811645671725273, "loss/crossentropy": 3.065406286716461, "loss/logits": 1.0118898630142212, "step": 8290 }, { "epoch": 0.083, "grad_norm": 10.6875, "grad_norm_var": 3.4449055989583335, "learning_rate": 0.0003, "loss": 13.2246, "loss/aux_loss": 0.04812601022422314, "loss/crossentropy": 2.9024933516979217, "loss/logits": 1.0029625982046126, "step": 8300 }, { "epoch": 0.0831, "grad_norm": 10.625, "grad_norm_var": 0.6285807291666666, "learning_rate": 0.0003, "loss": 13.2829, "loss/aux_loss": 0.04812156446278095, "loss/crossentropy": 3.0828394651412965, "loss/logits": 1.098636594414711, "step": 8310 }, { "epoch": 0.0832, "grad_norm": 8.6875, "grad_norm_var": 0.43748372395833335, "learning_rate": 0.0003, "loss": 13.0539, "loss/aux_loss": 0.048117737844586374, "loss/crossentropy": 2.9925745487213136, "loss/logits": 1.0268135398626328, "step": 8320 }, { "epoch": 0.0833, "grad_norm": 8.6875, "grad_norm_var": 0.29010416666666666, "learning_rate": 0.0003, "loss": 12.9795, "loss/aux_loss": 0.048119370639324185, "loss/crossentropy": 2.8304718136787415, "loss/logits": 0.9774332970380784, "step": 8330 }, { "epoch": 0.0834, "grad_norm": 10.75, "grad_norm_var": 2.7471354166666666, "learning_rate": 0.0003, "loss": 13.2265, "loss/aux_loss": 0.04812915232032537, "loss/crossentropy": 2.805083268880844, "loss/logits": 1.0131800711154937, "step": 8340 }, { "epoch": 0.0835, "grad_norm": 10.25, "grad_norm_var": 2.8114583333333334, "learning_rate": 0.0003, "loss": 13.1295, "loss/aux_loss": 0.04811917226761579, "loss/crossentropy": 3.0019118428230285, "loss/logits": 1.0275006771087647, "step": 8350 }, { "epoch": 0.0836, "grad_norm": 10.0625, "grad_norm_var": 0.3181640625, "learning_rate": 0.0003, "loss": 13.2826, "loss/aux_loss": 0.048113813251256944, "loss/crossentropy": 2.8711145401000975, "loss/logits": 1.0059622257947922, "step": 8360 }, { "epoch": 0.0837, "grad_norm": 9.1875, "grad_norm_var": 0.19607747395833333, "learning_rate": 0.0003, "loss": 13.2408, "loss/aux_loss": 0.04812101162970066, "loss/crossentropy": 3.0431210875511168, "loss/logits": 1.0434471309185027, "step": 8370 }, { "epoch": 0.0838, "grad_norm": 9.75, "grad_norm_var": 0.2228515625, "learning_rate": 0.0003, "loss": 13.0946, "loss/aux_loss": 0.04811821822077036, "loss/crossentropy": 2.918779957294464, "loss/logits": 1.0080697566270829, "step": 8380 }, { "epoch": 0.0839, "grad_norm": 9.125, "grad_norm_var": 113.34347330729166, "learning_rate": 0.0003, "loss": 12.9875, "loss/aux_loss": 0.04812317434698343, "loss/crossentropy": 2.8696110606193543, "loss/logits": 1.021797129511833, "step": 8390 }, { "epoch": 0.084, "grad_norm": 9.5625, "grad_norm_var": 113.546728515625, "learning_rate": 0.0003, "loss": 13.0979, "loss/aux_loss": 0.048132005520164965, "loss/crossentropy": 2.8620001435279847, "loss/logits": 1.0312078952789308, "step": 8400 }, { "epoch": 0.0841, "grad_norm": 8.8125, "grad_norm_var": 0.22537434895833333, "learning_rate": 0.0003, "loss": 13.1934, "loss/aux_loss": 0.048122233152389525, "loss/crossentropy": 2.7849833965301514, "loss/logits": 1.0247801810503006, "step": 8410 }, { "epoch": 0.0842, "grad_norm": 8.5, "grad_norm_var": 0.118994140625, "learning_rate": 0.0003, "loss": 13.2404, "loss/aux_loss": 0.048128989338874814, "loss/crossentropy": 2.9442156195640563, "loss/logits": 1.0153923511505127, "step": 8420 }, { "epoch": 0.0843, "grad_norm": 9.8125, "grad_norm_var": 0.2554524739583333, "learning_rate": 0.0003, "loss": 13.0926, "loss/aux_loss": 0.04813152328133583, "loss/crossentropy": 2.9320022106170653, "loss/logits": 1.0175166606903077, "step": 8430 }, { "epoch": 0.0844, "grad_norm": 9.875, "grad_norm_var": 5.853238932291666, "learning_rate": 0.0003, "loss": 13.2954, "loss/aux_loss": 0.04813098907470703, "loss/crossentropy": 2.9264755129814146, "loss/logits": 1.0203221708536148, "step": 8440 }, { "epoch": 0.0845, "grad_norm": 9.375, "grad_norm_var": 6.127197265625, "learning_rate": 0.0003, "loss": 13.1144, "loss/aux_loss": 0.04811909180134535, "loss/crossentropy": 2.916581463813782, "loss/logits": 0.9956386595964432, "step": 8450 }, { "epoch": 0.0846, "grad_norm": 8.75, "grad_norm_var": 0.160009765625, "learning_rate": 0.0003, "loss": 13.178, "loss/aux_loss": 0.04812520742416382, "loss/crossentropy": 2.9652814626693726, "loss/logits": 1.02629674077034, "step": 8460 }, { "epoch": 0.0847, "grad_norm": 9.4375, "grad_norm_var": 0.116650390625, "learning_rate": 0.0003, "loss": 13.1988, "loss/aux_loss": 0.048121622577309606, "loss/crossentropy": 2.971061831712723, "loss/logits": 1.0064853310585022, "step": 8470 }, { "epoch": 0.0848, "grad_norm": 10.5625, "grad_norm_var": 0.15701497395833333, "learning_rate": 0.0003, "loss": 13.1627, "loss/aux_loss": 0.048127346113324164, "loss/crossentropy": 2.9350649237632753, "loss/logits": 0.9827002599835396, "step": 8480 }, { "epoch": 0.0849, "grad_norm": 8.25, "grad_norm_var": 0.310009765625, "learning_rate": 0.0003, "loss": 12.9945, "loss/aux_loss": 0.048121594451367856, "loss/crossentropy": 2.8912373781204224, "loss/logits": 0.9815872967243194, "step": 8490 }, { "epoch": 0.085, "grad_norm": 10.125, "grad_norm_var": 0.8640625, "learning_rate": 0.0003, "loss": 13.3333, "loss/aux_loss": 0.04812703672796488, "loss/crossentropy": 3.0496490001678467, "loss/logits": 1.0207342118024827, "step": 8500 }, { "epoch": 0.0851, "grad_norm": 9.6875, "grad_norm_var": 16.475244140625, "learning_rate": 0.0003, "loss": 13.2066, "loss/aux_loss": 0.048138654045760634, "loss/crossentropy": 2.858214247226715, "loss/logits": 1.0198309272527695, "step": 8510 }, { "epoch": 0.0852, "grad_norm": 9.375, "grad_norm_var": 16.990738932291666, "learning_rate": 0.0003, "loss": 12.9225, "loss/aux_loss": 0.04811470378190279, "loss/crossentropy": 2.6653155386447906, "loss/logits": 0.9890996038913726, "step": 8520 }, { "epoch": 0.0853, "grad_norm": 10.0625, "grad_norm_var": 0.29217122395833334, "learning_rate": 0.0003, "loss": 13.0286, "loss/aux_loss": 0.04812881331890821, "loss/crossentropy": 2.818387824296951, "loss/logits": 1.0328501909971237, "step": 8530 }, { "epoch": 0.0854, "grad_norm": 9.9375, "grad_norm_var": 0.4891764322916667, "learning_rate": 0.0003, "loss": 12.9661, "loss/aux_loss": 0.04812416769564152, "loss/crossentropy": 2.7885517358779905, "loss/logits": 1.0058355391025544, "step": 8540 }, { "epoch": 0.0855, "grad_norm": 9.3125, "grad_norm_var": 0.6669108072916666, "learning_rate": 0.0003, "loss": 13.1039, "loss/aux_loss": 0.04812269229441881, "loss/crossentropy": 2.9664511680603027, "loss/logits": 1.0393647104501724, "step": 8550 }, { "epoch": 0.0856, "grad_norm": 9.0625, "grad_norm_var": 0.323681640625, "learning_rate": 0.0003, "loss": 13.211, "loss/aux_loss": 0.04812395125627518, "loss/crossentropy": 2.9345888257026673, "loss/logits": 1.017241859436035, "step": 8560 }, { "epoch": 0.0857, "grad_norm": 9.625, "grad_norm_var": 37.13743489583333, "learning_rate": 0.0003, "loss": 13.1819, "loss/aux_loss": 0.04813410099595785, "loss/crossentropy": 2.9240545988082887, "loss/logits": 1.0288849472999573, "step": 8570 }, { "epoch": 0.0858, "grad_norm": 10.5, "grad_norm_var": 0.5333333333333333, "learning_rate": 0.0003, "loss": 13.2227, "loss/aux_loss": 0.048121962882578376, "loss/crossentropy": 2.855889308452606, "loss/logits": 1.0060656636953353, "step": 8580 }, { "epoch": 0.0859, "grad_norm": 12.0625, "grad_norm_var": 0.6429524739583333, "learning_rate": 0.0003, "loss": 13.1099, "loss/aux_loss": 0.048110452853143214, "loss/crossentropy": 2.8173590660095216, "loss/logits": 1.0144326239824295, "step": 8590 }, { "epoch": 0.086, "grad_norm": 90.5, "grad_norm_var": 409.363134765625, "learning_rate": 0.0003, "loss": 13.2486, "loss/aux_loss": 0.048120449669659136, "loss/crossentropy": 2.889569455385208, "loss/logits": 1.0104403495788574, "step": 8600 }, { "epoch": 0.0861, "grad_norm": 10.0625, "grad_norm_var": 405.5660807291667, "learning_rate": 0.0003, "loss": 12.9023, "loss/aux_loss": 0.04813555497676134, "loss/crossentropy": 2.891407001018524, "loss/logits": 0.9928454220294952, "step": 8610 }, { "epoch": 0.0862, "grad_norm": 10.3125, "grad_norm_var": 1.1833333333333333, "learning_rate": 0.0003, "loss": 13.1671, "loss/aux_loss": 0.048119562491774556, "loss/crossentropy": 2.9349429488182066, "loss/logits": 1.0234294265508652, "step": 8620 }, { "epoch": 0.0863, "grad_norm": 9.8125, "grad_norm_var": 1.1183430989583334, "learning_rate": 0.0003, "loss": 13.1522, "loss/aux_loss": 0.048119149915874, "loss/crossentropy": 2.953269922733307, "loss/logits": 1.029870542883873, "step": 8630 }, { "epoch": 0.0864, "grad_norm": 21.625, "grad_norm_var": 8.992952473958333, "learning_rate": 0.0003, "loss": 13.223, "loss/aux_loss": 0.048137610964477065, "loss/crossentropy": 2.854456979036331, "loss/logits": 1.0149786740541458, "step": 8640 }, { "epoch": 0.0865, "grad_norm": 9.125, "grad_norm_var": 8.875455729166667, "learning_rate": 0.0003, "loss": 12.8707, "loss/aux_loss": 0.048116568848490714, "loss/crossentropy": 2.9507800936698914, "loss/logits": 1.0633498966693877, "step": 8650 }, { "epoch": 0.0866, "grad_norm": 9.75, "grad_norm_var": 0.17433268229166668, "learning_rate": 0.0003, "loss": 13.2224, "loss/aux_loss": 0.048123492300510405, "loss/crossentropy": 2.8969777107238768, "loss/logits": 0.9931401669979095, "step": 8660 }, { "epoch": 0.0867, "grad_norm": 9.0, "grad_norm_var": 8.690885416666667, "learning_rate": 0.0003, "loss": 13.1396, "loss/aux_loss": 0.048116271197795865, "loss/crossentropy": 2.907929790019989, "loss/logits": 1.0244786828756332, "step": 8670 }, { "epoch": 0.0868, "grad_norm": 9.625, "grad_norm_var": 8.547135416666666, "learning_rate": 0.0003, "loss": 12.9869, "loss/aux_loss": 0.04812566060572863, "loss/crossentropy": 2.813516306877136, "loss/logits": 0.9591111838817596, "step": 8680 }, { "epoch": 0.0869, "grad_norm": 10.5625, "grad_norm_var": 0.22706705729166668, "learning_rate": 0.0003, "loss": 13.0031, "loss/aux_loss": 0.04811654146760702, "loss/crossentropy": 2.7499096274375914, "loss/logits": 0.9870797544717789, "step": 8690 }, { "epoch": 0.087, "grad_norm": 9.625, "grad_norm_var": 0.4505208333333333, "learning_rate": 0.0003, "loss": 13.0701, "loss/aux_loss": 0.04811768177896738, "loss/crossentropy": 2.812624078989029, "loss/logits": 0.9721063941717147, "step": 8700 }, { "epoch": 0.0871, "grad_norm": 9.6875, "grad_norm_var": 0.13357747395833333, "learning_rate": 0.0003, "loss": 13.2135, "loss/aux_loss": 0.048116148076951505, "loss/crossentropy": 2.9922009468078614, "loss/logits": 1.015498149394989, "step": 8710 }, { "epoch": 0.0872, "grad_norm": 9.75, "grad_norm_var": 0.13326822916666667, "learning_rate": 0.0003, "loss": 13.0671, "loss/aux_loss": 0.048121779784560205, "loss/crossentropy": 2.827225810289383, "loss/logits": 0.9838965624570847, "step": 8720 }, { "epoch": 0.0873, "grad_norm": 9.6875, "grad_norm_var": 0.26764322916666666, "learning_rate": 0.0003, "loss": 13.1019, "loss/aux_loss": 0.04811729565262794, "loss/crossentropy": 2.959608232975006, "loss/logits": 0.9972497940063476, "step": 8730 }, { "epoch": 0.0874, "grad_norm": 9.3125, "grad_norm_var": 0.16920572916666668, "learning_rate": 0.0003, "loss": 12.7734, "loss/aux_loss": 0.048125034943223, "loss/crossentropy": 2.6938082754611967, "loss/logits": 0.9462698817253112, "step": 8740 }, { "epoch": 0.0875, "grad_norm": 9.5625, "grad_norm_var": 0.21560872395833333, "learning_rate": 0.0003, "loss": 13.0511, "loss/aux_loss": 0.04812650829553604, "loss/crossentropy": 2.7083167552948, "loss/logits": 0.989093354344368, "step": 8750 }, { "epoch": 0.0876, "grad_norm": 9.625, "grad_norm_var": 0.198291015625, "learning_rate": 0.0003, "loss": 12.9391, "loss/aux_loss": 0.0481193732470274, "loss/crossentropy": 2.9418309926986694, "loss/logits": 0.9726715385913849, "step": 8760 }, { "epoch": 0.0877, "grad_norm": 9.375, "grad_norm_var": 0.22213541666666667, "learning_rate": 0.0003, "loss": 13.1485, "loss/aux_loss": 0.048114814795553684, "loss/crossentropy": 2.9309451580047607, "loss/logits": 1.0644985824823379, "step": 8770 }, { "epoch": 0.0878, "grad_norm": 9.75, "grad_norm_var": 92.121728515625, "learning_rate": 0.0003, "loss": 13.0324, "loss/aux_loss": 0.04812544099986553, "loss/crossentropy": 2.960424965620041, "loss/logits": 0.9863006621599197, "step": 8780 }, { "epoch": 0.0879, "grad_norm": 9.125, "grad_norm_var": 0.211572265625, "learning_rate": 0.0003, "loss": 13.2247, "loss/aux_loss": 0.048114399425685406, "loss/crossentropy": 2.848669397830963, "loss/logits": 1.0225479423999786, "step": 8790 }, { "epoch": 0.088, "grad_norm": 9.125, "grad_norm_var": 0.250634765625, "learning_rate": 0.0003, "loss": 13.0081, "loss/aux_loss": 0.0481090260669589, "loss/crossentropy": 3.036766457557678, "loss/logits": 1.005482006072998, "step": 8800 }, { "epoch": 0.0881, "grad_norm": 9.75, "grad_norm_var": 0.10545247395833333, "learning_rate": 0.0003, "loss": 13.0766, "loss/aux_loss": 0.048118382692337036, "loss/crossentropy": 2.9599334478378294, "loss/logits": 1.022737380862236, "step": 8810 }, { "epoch": 0.0882, "grad_norm": 9.6875, "grad_norm_var": 0.33639322916666664, "learning_rate": 0.0003, "loss": 13.0431, "loss/aux_loss": 0.04812664575874805, "loss/crossentropy": 2.761233627796173, "loss/logits": 1.0341258555650712, "step": 8820 }, { "epoch": 0.0883, "grad_norm": 10.125, "grad_norm_var": 0.3087076822916667, "learning_rate": 0.0003, "loss": 12.9998, "loss/aux_loss": 0.048121739737689496, "loss/crossentropy": 2.9523282289505004, "loss/logits": 1.0163812279701232, "step": 8830 }, { "epoch": 0.0884, "grad_norm": 10.25, "grad_norm_var": 49.53359375, "learning_rate": 0.0003, "loss": 13.0433, "loss/aux_loss": 0.048123051226139066, "loss/crossentropy": 2.9619523882865906, "loss/logits": 1.0154429644346237, "step": 8840 }, { "epoch": 0.0885, "grad_norm": 9.0625, "grad_norm_var": 48.044010416666666, "learning_rate": 0.0003, "loss": 13.0594, "loss/aux_loss": 0.048118606954813, "loss/crossentropy": 2.768035900592804, "loss/logits": 1.0090958893299102, "step": 8850 }, { "epoch": 0.0886, "grad_norm": 9.625, "grad_norm_var": 0.24021809895833332, "learning_rate": 0.0003, "loss": 13.0136, "loss/aux_loss": 0.04812074787914753, "loss/crossentropy": 2.864524757862091, "loss/logits": 0.9908095836639405, "step": 8860 }, { "epoch": 0.0887, "grad_norm": 10.75, "grad_norm_var": 0.5041015625, "learning_rate": 0.0003, "loss": 13.0489, "loss/aux_loss": 0.04811746664345264, "loss/crossentropy": 2.8981815814971923, "loss/logits": 0.9988605201244354, "step": 8870 }, { "epoch": 0.0888, "grad_norm": 9.3125, "grad_norm_var": 0.303125, "learning_rate": 0.0003, "loss": 13.0495, "loss/aux_loss": 0.04811958447098732, "loss/crossentropy": 2.953895443677902, "loss/logits": 0.9865518122911453, "step": 8880 }, { "epoch": 0.0889, "grad_norm": 9.5625, "grad_norm_var": 0.25636393229166665, "learning_rate": 0.0003, "loss": 13.0068, "loss/aux_loss": 0.048110640980303286, "loss/crossentropy": 2.844312059879303, "loss/logits": 0.999048775434494, "step": 8890 }, { "epoch": 0.089, "grad_norm": 9.125, "grad_norm_var": 0.3324055989583333, "learning_rate": 0.0003, "loss": 13.1248, "loss/aux_loss": 0.048114532604813576, "loss/crossentropy": 2.8329219222068787, "loss/logits": 1.0227496713399886, "step": 8900 }, { "epoch": 0.0891, "grad_norm": 9.4375, "grad_norm_var": 0.695166015625, "learning_rate": 0.0003, "loss": 13.2069, "loss/aux_loss": 0.048122035898268224, "loss/crossentropy": 2.9590111494064333, "loss/logits": 1.020371201634407, "step": 8910 }, { "epoch": 0.0892, "grad_norm": 8.625, "grad_norm_var": 0.4691243489583333, "learning_rate": 0.0003, "loss": 12.9333, "loss/aux_loss": 0.04812424685806036, "loss/crossentropy": 2.803478956222534, "loss/logits": 0.9767372757196426, "step": 8920 }, { "epoch": 0.0893, "grad_norm": 9.0625, "grad_norm_var": 0.2598958333333333, "learning_rate": 0.0003, "loss": 13.2306, "loss/aux_loss": 0.04812055286020041, "loss/crossentropy": 2.90518371462822, "loss/logits": 0.9993892073631286, "step": 8930 }, { "epoch": 0.0894, "grad_norm": 10.0, "grad_norm_var": 0.15462239583333334, "learning_rate": 0.0003, "loss": 13.2241, "loss/aux_loss": 0.04811331238597631, "loss/crossentropy": 2.9701803803443907, "loss/logits": 0.9940306186676026, "step": 8940 }, { "epoch": 0.0895, "grad_norm": 10.375, "grad_norm_var": 0.246337890625, "learning_rate": 0.0003, "loss": 13.0112, "loss/aux_loss": 0.04811495840549469, "loss/crossentropy": 3.0566563248634337, "loss/logits": 1.0184517830610276, "step": 8950 }, { "epoch": 0.0896, "grad_norm": 9.4375, "grad_norm_var": 4.417952473958334, "learning_rate": 0.0003, "loss": 12.9823, "loss/aux_loss": 0.04812169056385755, "loss/crossentropy": 2.948707568645477, "loss/logits": 0.9956671565771102, "step": 8960 }, { "epoch": 0.0897, "grad_norm": 9.125, "grad_norm_var": 4.491520182291667, "learning_rate": 0.0003, "loss": 13.0456, "loss/aux_loss": 0.04811549689620733, "loss/crossentropy": 2.8826889276504515, "loss/logits": 1.0055119961500167, "step": 8970 }, { "epoch": 0.0898, "grad_norm": 9.5625, "grad_norm_var": 4.914306640625, "learning_rate": 0.0003, "loss": 13.3204, "loss/aux_loss": 0.04811891969293356, "loss/crossentropy": 3.0966086268424986, "loss/logits": 1.0039119273424149, "step": 8980 }, { "epoch": 0.0899, "grad_norm": 9.5625, "grad_norm_var": 0.9282389322916667, "learning_rate": 0.0003, "loss": 13.0716, "loss/aux_loss": 0.04812802001833916, "loss/crossentropy": 2.773033380508423, "loss/logits": 1.0084805130958556, "step": 8990 }, { "epoch": 0.09, "grad_norm": 10.1875, "grad_norm_var": 0.8994791666666667, "learning_rate": 0.0003, "loss": 13.2718, "loss/aux_loss": 0.0481186056509614, "loss/crossentropy": 2.934514182806015, "loss/logits": 1.0663816720247268, "step": 9000 }, { "epoch": 0.0901, "grad_norm": 9.1875, "grad_norm_var": 0.2916015625, "learning_rate": 0.0003, "loss": 13.0098, "loss/aux_loss": 0.04811523351818323, "loss/crossentropy": 2.8817059993743896, "loss/logits": 1.0227952599525452, "step": 9010 }, { "epoch": 0.0902, "grad_norm": 10.8125, "grad_norm_var": 1.7858723958333333, "learning_rate": 0.0003, "loss": 12.9215, "loss/aux_loss": 0.04812025129795074, "loss/crossentropy": 2.9280009150505064, "loss/logits": 1.008087882399559, "step": 9020 }, { "epoch": 0.0903, "grad_norm": 10.625, "grad_norm_var": 1.5988932291666667, "learning_rate": 0.0003, "loss": 12.9675, "loss/aux_loss": 0.048126825504004954, "loss/crossentropy": 2.752195543050766, "loss/logits": 0.938539656996727, "step": 9030 }, { "epoch": 0.0904, "grad_norm": 9.6875, "grad_norm_var": 0.17838541666666666, "learning_rate": 0.0003, "loss": 13.1815, "loss/aux_loss": 0.04811705574393273, "loss/crossentropy": 3.056387519836426, "loss/logits": 1.0307760834693909, "step": 9040 }, { "epoch": 0.0905, "grad_norm": 9.5625, "grad_norm_var": 0.14724934895833333, "learning_rate": 0.0003, "loss": 13.2356, "loss/aux_loss": 0.04812313225120306, "loss/crossentropy": 2.984651046991348, "loss/logits": 1.0027798056602477, "step": 9050 }, { "epoch": 0.0906, "grad_norm": 8.8125, "grad_norm_var": 0.325634765625, "learning_rate": 0.0003, "loss": 13.0133, "loss/aux_loss": 0.04811926949769259, "loss/crossentropy": 2.916082763671875, "loss/logits": 0.9860832780599594, "step": 9060 }, { "epoch": 0.0907, "grad_norm": 10.8125, "grad_norm_var": 0.266259765625, "learning_rate": 0.0003, "loss": 12.9402, "loss/aux_loss": 0.048123452626168725, "loss/crossentropy": 2.843355292081833, "loss/logits": 0.9923195570707322, "step": 9070 }, { "epoch": 0.0908, "grad_norm": 9.625, "grad_norm_var": 0.5714680989583333, "learning_rate": 0.0003, "loss": 12.7962, "loss/aux_loss": 0.04811744131147862, "loss/crossentropy": 2.929332971572876, "loss/logits": 1.011452180147171, "step": 9080 }, { "epoch": 0.0909, "grad_norm": 10.4375, "grad_norm_var": 0.22369791666666666, "learning_rate": 0.0003, "loss": 13.0572, "loss/aux_loss": 0.04812127202749252, "loss/crossentropy": 2.9542043566703797, "loss/logits": 0.9913775563240051, "step": 9090 }, { "epoch": 0.091, "grad_norm": 10.0, "grad_norm_var": 0.4495930989583333, "learning_rate": 0.0003, "loss": 13.0991, "loss/aux_loss": 0.048116521537303926, "loss/crossentropy": 2.845492494106293, "loss/logits": 1.0074622273445129, "step": 9100 }, { "epoch": 0.0911, "grad_norm": 10.0625, "grad_norm_var": 0.633837890625, "learning_rate": 0.0003, "loss": 12.9897, "loss/aux_loss": 0.048106766492128375, "loss/crossentropy": 2.902200919389725, "loss/logits": 1.0262346029281617, "step": 9110 }, { "epoch": 0.0912, "grad_norm": 10.375, "grad_norm_var": 5.352718098958333, "learning_rate": 0.0003, "loss": 13.0402, "loss/aux_loss": 0.04812852665781975, "loss/crossentropy": 2.9274023175239563, "loss/logits": 0.9989449590444565, "step": 9120 }, { "epoch": 0.0913, "grad_norm": 10.125, "grad_norm_var": 5.371077473958334, "learning_rate": 0.0003, "loss": 13.1259, "loss/aux_loss": 0.048123916052281855, "loss/crossentropy": 2.957271945476532, "loss/logits": 1.0392587214708329, "step": 9130 }, { "epoch": 0.0914, "grad_norm": 11.0, "grad_norm_var": 0.8235514322916667, "learning_rate": 0.0003, "loss": 12.8346, "loss/aux_loss": 0.04811377823352814, "loss/crossentropy": 2.7599571704864503, "loss/logits": 0.9800522536039352, "step": 9140 }, { "epoch": 0.0915, "grad_norm": 9.0625, "grad_norm_var": 14.900764973958333, "learning_rate": 0.0003, "loss": 13.0363, "loss/aux_loss": 0.048123881407082084, "loss/crossentropy": 2.8104595303535462, "loss/logits": 0.9722192943096161, "step": 9150 }, { "epoch": 0.0916, "grad_norm": 10.8125, "grad_norm_var": 14.4884765625, "learning_rate": 0.0003, "loss": 13.0768, "loss/aux_loss": 0.04811329320073128, "loss/crossentropy": 2.85021288394928, "loss/logits": 1.0288948625326158, "step": 9160 }, { "epoch": 0.0917, "grad_norm": 9.25, "grad_norm_var": 0.5411295572916667, "learning_rate": 0.0003, "loss": 13.0842, "loss/aux_loss": 0.04812074415385723, "loss/crossentropy": 2.9284089267253877, "loss/logits": 1.0179531484842301, "step": 9170 }, { "epoch": 0.0918, "grad_norm": 9.9375, "grad_norm_var": 0.6243326822916667, "learning_rate": 0.0003, "loss": 13.0484, "loss/aux_loss": 0.04811491388827562, "loss/crossentropy": 2.8634442031383514, "loss/logits": 0.988609355688095, "step": 9180 }, { "epoch": 0.0919, "grad_norm": 15.25, "grad_norm_var": 2.2025390625, "learning_rate": 0.0003, "loss": 12.9292, "loss/aux_loss": 0.04811589177697897, "loss/crossentropy": 3.035586249828339, "loss/logits": 1.017078360915184, "step": 9190 }, { "epoch": 0.092, "grad_norm": 10.3125, "grad_norm_var": 2.242447916666667, "learning_rate": 0.0003, "loss": 13.0149, "loss/aux_loss": 0.04812207706272602, "loss/crossentropy": 2.962714272737503, "loss/logits": 0.9997862339019775, "step": 9200 }, { "epoch": 0.0921, "grad_norm": 11.625, "grad_norm_var": 0.5791666666666667, "learning_rate": 0.0003, "loss": 12.9794, "loss/aux_loss": 0.04811257142573595, "loss/crossentropy": 2.904304379224777, "loss/logits": 0.9970894068479538, "step": 9210 }, { "epoch": 0.0922, "grad_norm": 9.75, "grad_norm_var": 0.455322265625, "learning_rate": 0.0003, "loss": 12.997, "loss/aux_loss": 0.048116791248321536, "loss/crossentropy": 2.9704554200172426, "loss/logits": 1.009730476140976, "step": 9220 }, { "epoch": 0.0923, "grad_norm": 9.3125, "grad_norm_var": 0.22263997395833332, "learning_rate": 0.0003, "loss": 13.0321, "loss/aux_loss": 0.04812146797776222, "loss/crossentropy": 2.927646744251251, "loss/logits": 0.9740961879491806, "step": 9230 }, { "epoch": 0.0924, "grad_norm": 10.25, "grad_norm_var": 0.6384765625, "learning_rate": 0.0003, "loss": 13.0843, "loss/aux_loss": 0.048116331547498704, "loss/crossentropy": 2.9682451248168946, "loss/logits": 1.0054449021816254, "step": 9240 }, { "epoch": 0.0925, "grad_norm": 9.25, "grad_norm_var": 0.81796875, "learning_rate": 0.0003, "loss": 12.8984, "loss/aux_loss": 0.0481177942827344, "loss/crossentropy": 2.9605862140655517, "loss/logits": 0.9988209009170532, "step": 9250 }, { "epoch": 0.0926, "grad_norm": 9.9375, "grad_norm_var": 7.1212890625, "learning_rate": 0.0003, "loss": 13.126, "loss/aux_loss": 0.04811736159026623, "loss/crossentropy": 2.968954026699066, "loss/logits": 0.9968051850795746, "step": 9260 }, { "epoch": 0.0927, "grad_norm": 10.5, "grad_norm_var": 7.165999348958334, "learning_rate": 0.0003, "loss": 12.9006, "loss/aux_loss": 0.04812134802341461, "loss/crossentropy": 2.95685738325119, "loss/logits": 1.017841598391533, "step": 9270 }, { "epoch": 0.0928, "grad_norm": 10.1875, "grad_norm_var": 302.6844889322917, "learning_rate": 0.0003, "loss": 13.0568, "loss/aux_loss": 0.04813589584082365, "loss/crossentropy": 2.8788455188274384, "loss/logits": 0.998471787571907, "step": 9280 }, { "epoch": 0.0929, "grad_norm": 9.8125, "grad_norm_var": 304.08396809895834, "learning_rate": 0.0003, "loss": 12.9694, "loss/aux_loss": 0.04811546951532364, "loss/crossentropy": 2.8760639309883116, "loss/logits": 0.9897254168987274, "step": 9290 }, { "epoch": 0.093, "grad_norm": 10.0625, "grad_norm_var": 0.14270833333333333, "learning_rate": 0.0003, "loss": 12.875, "loss/aux_loss": 0.0481147637590766, "loss/crossentropy": 2.9088239908218383, "loss/logits": 0.9841889888048172, "step": 9300 }, { "epoch": 0.0931, "grad_norm": 9.8125, "grad_norm_var": 0.21131184895833333, "learning_rate": 0.0003, "loss": 12.9544, "loss/aux_loss": 0.04811472594738007, "loss/crossentropy": 3.0154574632644655, "loss/logits": 0.9868688434362411, "step": 9310 }, { "epoch": 0.0932, "grad_norm": 10.1875, "grad_norm_var": 0.297509765625, "learning_rate": 0.0003, "loss": 13.0221, "loss/aux_loss": 0.04811571668833494, "loss/crossentropy": 2.744140291213989, "loss/logits": 0.9741410970687866, "step": 9320 }, { "epoch": 0.0933, "grad_norm": 9.625, "grad_norm_var": 0.41354166666666664, "learning_rate": 0.0003, "loss": 13.0847, "loss/aux_loss": 0.04812242966145277, "loss/crossentropy": 2.8483268916606903, "loss/logits": 1.0017479300498962, "step": 9330 }, { "epoch": 0.0934, "grad_norm": 9.5, "grad_norm_var": 16.938916015625, "learning_rate": 0.0003, "loss": 12.8932, "loss/aux_loss": 0.048115167394280435, "loss/crossentropy": 2.951818656921387, "loss/logits": 1.034898152947426, "step": 9340 }, { "epoch": 0.0935, "grad_norm": 9.3125, "grad_norm_var": 0.141259765625, "learning_rate": 0.0003, "loss": 13.0014, "loss/aux_loss": 0.048115997575223446, "loss/crossentropy": 2.835058981180191, "loss/logits": 0.9820165306329727, "step": 9350 }, { "epoch": 0.0936, "grad_norm": 10.1875, "grad_norm_var": 0.151806640625, "learning_rate": 0.0003, "loss": 12.8639, "loss/aux_loss": 0.04810713436454535, "loss/crossentropy": 2.9016472816467287, "loss/logits": 1.0063132762908935, "step": 9360 }, { "epoch": 0.0937, "grad_norm": 9.9375, "grad_norm_var": 0.9284993489583333, "learning_rate": 0.0003, "loss": 12.858, "loss/aux_loss": 0.04812375083565712, "loss/crossentropy": 2.984380769729614, "loss/logits": 1.0249317467212677, "step": 9370 }, { "epoch": 0.0938, "grad_norm": 10.5, "grad_norm_var": 0.8635416666666667, "learning_rate": 0.0003, "loss": 12.9143, "loss/aux_loss": 0.0481270782649517, "loss/crossentropy": 3.0072665452957152, "loss/logits": 0.9971794277429581, "step": 9380 }, { "epoch": 0.0939, "grad_norm": 10.0625, "grad_norm_var": 0.22902018229166668, "learning_rate": 0.0003, "loss": 12.9288, "loss/aux_loss": 0.04811225328594446, "loss/crossentropy": 2.952876567840576, "loss/logits": 0.981144642829895, "step": 9390 }, { "epoch": 0.094, "grad_norm": 9.5, "grad_norm_var": 0.20546875, "learning_rate": 0.0003, "loss": 12.9573, "loss/aux_loss": 0.04811703842133284, "loss/crossentropy": 2.9657641530036924, "loss/logits": 1.008799707889557, "step": 9400 }, { "epoch": 0.0941, "grad_norm": 10.125, "grad_norm_var": 0.28020833333333334, "learning_rate": 0.0003, "loss": 13.0344, "loss/aux_loss": 0.04812249001115561, "loss/crossentropy": 2.868061417341232, "loss/logits": 0.9425824016332627, "step": 9410 }, { "epoch": 0.0942, "grad_norm": 9.0625, "grad_norm_var": 0.2880045572916667, "learning_rate": 0.0003, "loss": 12.8889, "loss/aux_loss": 0.04811691902577877, "loss/crossentropy": 2.810444962978363, "loss/logits": 0.9671340584754944, "step": 9420 }, { "epoch": 0.0943, "grad_norm": 9.25, "grad_norm_var": 0.250244140625, "learning_rate": 0.0003, "loss": 12.8051, "loss/aux_loss": 0.04814633168280125, "loss/crossentropy": 2.7531135201454164, "loss/logits": 0.9443521648645401, "step": 9430 }, { "epoch": 0.0944, "grad_norm": 9.5, "grad_norm_var": 0.11144205729166666, "learning_rate": 0.0003, "loss": 12.9351, "loss/aux_loss": 0.04811623003333807, "loss/crossentropy": 2.773250675201416, "loss/logits": 0.9573301702737809, "step": 9440 }, { "epoch": 0.0945, "grad_norm": 10.0625, "grad_norm_var": 0.14998372395833334, "learning_rate": 0.0003, "loss": 12.9976, "loss/aux_loss": 0.048125031776726244, "loss/crossentropy": 2.843584269285202, "loss/logits": 0.9623809665441513, "step": 9450 }, { "epoch": 0.0946, "grad_norm": 10.75, "grad_norm_var": 0.35740559895833335, "learning_rate": 0.0003, "loss": 12.9905, "loss/aux_loss": 0.0481179354712367, "loss/crossentropy": 3.025428628921509, "loss/logits": 1.0071224570274353, "step": 9460 }, { "epoch": 0.0947, "grad_norm": 9.25, "grad_norm_var": 0.340625, "learning_rate": 0.0003, "loss": 12.9006, "loss/aux_loss": 0.0481242848560214, "loss/crossentropy": 2.919004487991333, "loss/logits": 1.0092800080776214, "step": 9470 }, { "epoch": 0.0948, "grad_norm": 9.4375, "grad_norm_var": 0.364306640625, "learning_rate": 0.0003, "loss": 12.8888, "loss/aux_loss": 0.04811065457761288, "loss/crossentropy": 3.0337927043437958, "loss/logits": 0.970859882235527, "step": 9480 }, { "epoch": 0.0949, "grad_norm": 9.4375, "grad_norm_var": 0.2561848958333333, "learning_rate": 0.0003, "loss": 12.969, "loss/aux_loss": 0.048123066686093806, "loss/crossentropy": 2.9421743154525757, "loss/logits": 1.0259678810834885, "step": 9490 }, { "epoch": 0.095, "grad_norm": 9.75, "grad_norm_var": 0.2704264322916667, "learning_rate": 0.0003, "loss": 12.9057, "loss/aux_loss": 0.04810989499092102, "loss/crossentropy": 2.908745914697647, "loss/logits": 1.004162722826004, "step": 9500 }, { "epoch": 0.0951, "grad_norm": 9.8125, "grad_norm_var": 0.354150390625, "learning_rate": 0.0003, "loss": 12.8508, "loss/aux_loss": 0.04811614695936441, "loss/crossentropy": 2.8484590649604797, "loss/logits": 0.9944918006658554, "step": 9510 }, { "epoch": 0.0952, "grad_norm": 10.0625, "grad_norm_var": 0.296728515625, "learning_rate": 0.0003, "loss": 12.8238, "loss/aux_loss": 0.04812110308557749, "loss/crossentropy": 2.9715175151824953, "loss/logits": 0.9781792253255844, "step": 9520 }, { "epoch": 0.0953, "grad_norm": 9.9375, "grad_norm_var": 0.26170247395833335, "learning_rate": 0.0003, "loss": 12.8021, "loss/aux_loss": 0.04812615159898996, "loss/crossentropy": 2.8001496493816376, "loss/logits": 0.943726196885109, "step": 9530 }, { "epoch": 0.0954, "grad_norm": 9.125, "grad_norm_var": 0.2950520833333333, "learning_rate": 0.0003, "loss": 13.0212, "loss/aux_loss": 0.048111764900386336, "loss/crossentropy": 2.9262121081352235, "loss/logits": 1.0509262353181839, "step": 9540 }, { "epoch": 0.0955, "grad_norm": 10.3125, "grad_norm_var": 0.11717122395833333, "learning_rate": 0.0003, "loss": 12.7972, "loss/aux_loss": 0.04811500422656536, "loss/crossentropy": 2.7417452692985536, "loss/logits": 0.963932403922081, "step": 9550 }, { "epoch": 0.0956, "grad_norm": 10.6875, "grad_norm_var": 0.4046223958333333, "learning_rate": 0.0003, "loss": 12.7335, "loss/aux_loss": 0.04812417142093182, "loss/crossentropy": 2.8524417519569396, "loss/logits": 0.9906006306409836, "step": 9560 }, { "epoch": 0.0957, "grad_norm": 10.375, "grad_norm_var": 0.7884765625, "learning_rate": 0.0003, "loss": 12.6479, "loss/aux_loss": 0.048113958537578584, "loss/crossentropy": 2.860063922405243, "loss/logits": 0.9770903497934341, "step": 9570 }, { "epoch": 0.0958, "grad_norm": 9.25, "grad_norm_var": 0.14869791666666668, "learning_rate": 0.0003, "loss": 12.97, "loss/aux_loss": 0.048113430850207806, "loss/crossentropy": 2.7825845539569856, "loss/logits": 0.9913632333278656, "step": 9580 }, { "epoch": 0.0959, "grad_norm": 10.6875, "grad_norm_var": 1.1207682291666667, "learning_rate": 0.0003, "loss": 13.0485, "loss/aux_loss": 0.04811343587934971, "loss/crossentropy": 2.7735751450061796, "loss/logits": 0.9879475176334381, "step": 9590 }, { "epoch": 0.096, "grad_norm": 9.5, "grad_norm_var": 1.1030598958333333, "learning_rate": 0.0003, "loss": 13.0665, "loss/aux_loss": 0.048116713762283325, "loss/crossentropy": 2.8584636390209197, "loss/logits": 0.9740468025207519, "step": 9600 }, { "epoch": 0.0961, "grad_norm": 11.0625, "grad_norm_var": 0.21712239583333334, "learning_rate": 0.0003, "loss": 13.0707, "loss/aux_loss": 0.04812497589737177, "loss/crossentropy": 2.8642295002937317, "loss/logits": 1.0438130795955658, "step": 9610 }, { "epoch": 0.0962, "grad_norm": 10.375, "grad_norm_var": 78.78743489583333, "learning_rate": 0.0003, "loss": 12.9392, "loss/aux_loss": 0.04811538271605968, "loss/crossentropy": 2.8932973623275755, "loss/logits": 1.0000649869441987, "step": 9620 }, { "epoch": 0.0963, "grad_norm": 10.1875, "grad_norm_var": 0.401025390625, "learning_rate": 0.0003, "loss": 12.8071, "loss/aux_loss": 0.04812538847327232, "loss/crossentropy": 2.641858923435211, "loss/logits": 0.9451945751905442, "step": 9630 }, { "epoch": 0.0964, "grad_norm": 9.5625, "grad_norm_var": 0.15206705729166667, "learning_rate": 0.0003, "loss": 12.8081, "loss/aux_loss": 0.04811387863010168, "loss/crossentropy": 2.752705854177475, "loss/logits": 0.9918626010417938, "step": 9640 }, { "epoch": 0.0965, "grad_norm": 10.75, "grad_norm_var": 0.32810872395833335, "learning_rate": 0.0003, "loss": 12.8976, "loss/aux_loss": 0.04811955615878105, "loss/crossentropy": 2.823894906044006, "loss/logits": 0.9711399942636489, "step": 9650 }, { "epoch": 0.0966, "grad_norm": 9.1875, "grad_norm_var": 0.3337890625, "learning_rate": 0.0003, "loss": 12.9509, "loss/aux_loss": 0.0481190113350749, "loss/crossentropy": 2.993069517612457, "loss/logits": 0.9871428191661835, "step": 9660 }, { "epoch": 0.0967, "grad_norm": 9.5625, "grad_norm_var": 0.5012858072916667, "learning_rate": 0.0003, "loss": 12.7698, "loss/aux_loss": 0.048111490719020364, "loss/crossentropy": 2.8160251498222353, "loss/logits": 0.9605364561080932, "step": 9670 }, { "epoch": 0.0968, "grad_norm": 9.25, "grad_norm_var": 0.17667643229166666, "learning_rate": 0.0003, "loss": 12.8989, "loss/aux_loss": 0.04811663068830967, "loss/crossentropy": 2.9415274262428284, "loss/logits": 0.9684463948011398, "step": 9680 }, { "epoch": 0.0969, "grad_norm": 10.5, "grad_norm_var": 53.12389322916667, "learning_rate": 0.0003, "loss": 12.8548, "loss/aux_loss": 0.048127869702875616, "loss/crossentropy": 2.8381851077079774, "loss/logits": 0.9528964549303055, "step": 9690 }, { "epoch": 0.097, "grad_norm": 10.625, "grad_norm_var": 51.1869140625, "learning_rate": 0.0003, "loss": 12.8864, "loss/aux_loss": 0.04811157062649727, "loss/crossentropy": 2.917622911930084, "loss/logits": 1.0014064520597459, "step": 9700 }, { "epoch": 0.0971, "grad_norm": 9.5625, "grad_norm_var": 0.356884765625, "learning_rate": 0.0003, "loss": 12.97, "loss/aux_loss": 0.04812054745852947, "loss/crossentropy": 2.870450019836426, "loss/logits": 0.990039375424385, "step": 9710 }, { "epoch": 0.0972, "grad_norm": 9.75, "grad_norm_var": 0.4280598958333333, "learning_rate": 0.0003, "loss": 12.8376, "loss/aux_loss": 0.048113865032792094, "loss/crossentropy": 2.947874927520752, "loss/logits": 1.01834077835083, "step": 9720 }, { "epoch": 0.0973, "grad_norm": 9.75, "grad_norm_var": 0.202587890625, "learning_rate": 0.0003, "loss": 12.6772, "loss/aux_loss": 0.04811162706464529, "loss/crossentropy": 2.6616825222969056, "loss/logits": 0.922445324063301, "step": 9730 }, { "epoch": 0.0974, "grad_norm": 9.3125, "grad_norm_var": 0.23567708333333334, "learning_rate": 0.0003, "loss": 12.8276, "loss/aux_loss": 0.048115427419543264, "loss/crossentropy": 2.8596638798713685, "loss/logits": 0.9671652972698211, "step": 9740 }, { "epoch": 0.0975, "grad_norm": 9.4375, "grad_norm_var": 0.27980143229166665, "learning_rate": 0.0003, "loss": 12.8823, "loss/aux_loss": 0.048122276365756986, "loss/crossentropy": 2.9232805013656615, "loss/logits": 0.9951166033744812, "step": 9750 }, { "epoch": 0.0976, "grad_norm": 9.375, "grad_norm_var": 0.2384765625, "learning_rate": 0.0003, "loss": 12.8269, "loss/aux_loss": 0.04811736922711134, "loss/crossentropy": 3.0413878917694093, "loss/logits": 1.0016505420207977, "step": 9760 }, { "epoch": 0.0977, "grad_norm": 9.6875, "grad_norm_var": 0.24529622395833334, "learning_rate": 0.0003, "loss": 12.8884, "loss/aux_loss": 0.048109458200633524, "loss/crossentropy": 2.893119239807129, "loss/logits": 1.0159206092357635, "step": 9770 }, { "epoch": 0.0978, "grad_norm": 10.1875, "grad_norm_var": 0.20045572916666668, "learning_rate": 0.0003, "loss": 12.8463, "loss/aux_loss": 0.048116378486156464, "loss/crossentropy": 3.002572274208069, "loss/logits": 1.0260325849056244, "step": 9780 }, { "epoch": 0.0979, "grad_norm": 9.8125, "grad_norm_var": 0.1884765625, "learning_rate": 0.0003, "loss": 12.7471, "loss/aux_loss": 0.04811564590781927, "loss/crossentropy": 2.8663101851940156, "loss/logits": 0.945113542675972, "step": 9790 }, { "epoch": 0.098, "grad_norm": 9.9375, "grad_norm_var": 0.36041666666666666, "learning_rate": 0.0003, "loss": 13.0079, "loss/aux_loss": 0.0481245506554842, "loss/crossentropy": 2.7454223036766052, "loss/logits": 0.9564665943384171, "step": 9800 }, { "epoch": 0.0981, "grad_norm": 9.8125, "grad_norm_var": 0.4278645833333333, "learning_rate": 0.0003, "loss": 12.8525, "loss/aux_loss": 0.04811416696757078, "loss/crossentropy": 2.8844858169555665, "loss/logits": 0.9936564028263092, "step": 9810 }, { "epoch": 0.0982, "grad_norm": 9.5, "grad_norm_var": 0.221728515625, "learning_rate": 0.0003, "loss": 12.8522, "loss/aux_loss": 0.048113705776631835, "loss/crossentropy": 2.937456488609314, "loss/logits": 0.9975145667791366, "step": 9820 }, { "epoch": 0.0983, "grad_norm": 9.1875, "grad_norm_var": 0.23318684895833333, "learning_rate": 0.0003, "loss": 12.948, "loss/aux_loss": 0.04812261760234833, "loss/crossentropy": 2.863471287488937, "loss/logits": 0.9870826095342636, "step": 9830 }, { "epoch": 0.0984, "grad_norm": 9.9375, "grad_norm_var": 0.4388020833333333, "learning_rate": 0.0003, "loss": 12.785, "loss/aux_loss": 0.04811761137098074, "loss/crossentropy": 2.860468626022339, "loss/logits": 0.9974869579076767, "step": 9840 }, { "epoch": 0.0985, "grad_norm": 10.875, "grad_norm_var": 0.39108072916666664, "learning_rate": 0.0003, "loss": 12.9577, "loss/aux_loss": 0.04811706598848105, "loss/crossentropy": 2.8506002187728883, "loss/logits": 1.000709992647171, "step": 9850 }, { "epoch": 0.0986, "grad_norm": 9.0, "grad_norm_var": 0.254931640625, "learning_rate": 0.0003, "loss": 12.5998, "loss/aux_loss": 0.04811438079923391, "loss/crossentropy": 2.918227458000183, "loss/logits": 0.9769401401281357, "step": 9860 }, { "epoch": 0.0987, "grad_norm": 10.9375, "grad_norm_var": 0.31295572916666664, "learning_rate": 0.0003, "loss": 12.979, "loss/aux_loss": 0.04811355788260698, "loss/crossentropy": 2.909677565097809, "loss/logits": 1.0252159029245376, "step": 9870 }, { "epoch": 0.0988, "grad_norm": 10.4375, "grad_norm_var": 0.2921223958333333, "learning_rate": 0.0003, "loss": 12.8383, "loss/aux_loss": 0.04811448734253645, "loss/crossentropy": 2.835783588886261, "loss/logits": 1.0406290709972381, "step": 9880 }, { "epoch": 0.0989, "grad_norm": 10.1875, "grad_norm_var": 1.6822265625, "learning_rate": 0.0003, "loss": 12.7556, "loss/aux_loss": 0.04814105350524187, "loss/crossentropy": 2.7648268580436706, "loss/logits": 0.9558891981840134, "step": 9890 }, { "epoch": 0.099, "grad_norm": 10.4375, "grad_norm_var": 1.9072916666666666, "learning_rate": 0.0003, "loss": 12.7867, "loss/aux_loss": 0.04811670910567045, "loss/crossentropy": 2.68316650390625, "loss/logits": 0.9622927576303482, "step": 9900 }, { "epoch": 0.0991, "grad_norm": 10.125, "grad_norm_var": 0.46608072916666665, "learning_rate": 0.0003, "loss": 12.8684, "loss/aux_loss": 0.04812458418309688, "loss/crossentropy": 2.880593103170395, "loss/logits": 0.9721406042575836, "step": 9910 }, { "epoch": 0.0992, "grad_norm": 10.5, "grad_norm_var": 0.55546875, "learning_rate": 0.0003, "loss": 12.817, "loss/aux_loss": 0.048123272694647314, "loss/crossentropy": 2.6709973573684693, "loss/logits": 0.9354108065366745, "step": 9920 }, { "epoch": 0.0993, "grad_norm": 9.6875, "grad_norm_var": 0.395166015625, "learning_rate": 0.0003, "loss": 12.8799, "loss/aux_loss": 0.04812105931341648, "loss/crossentropy": 2.934725469350815, "loss/logits": 0.9813075840473175, "step": 9930 }, { "epoch": 0.0994, "grad_norm": 9.625, "grad_norm_var": 0.6903483072916666, "learning_rate": 0.0003, "loss": 12.903, "loss/aux_loss": 0.048118251748383044, "loss/crossentropy": 2.8453499555587767, "loss/logits": 0.9796870052814484, "step": 9940 }, { "epoch": 0.0995, "grad_norm": 14.625, "grad_norm_var": 2.387223307291667, "learning_rate": 0.0003, "loss": 12.8252, "loss/aux_loss": 0.048117080517113206, "loss/crossentropy": 2.8250075817108153, "loss/logits": 0.9736212283372879, "step": 9950 }, { "epoch": 0.0996, "grad_norm": 9.875, "grad_norm_var": 1.3822265625, "learning_rate": 0.0003, "loss": 12.6306, "loss/aux_loss": 0.04811842925846577, "loss/crossentropy": 2.854235601425171, "loss/logits": 1.007426416873932, "step": 9960 }, { "epoch": 0.0997, "grad_norm": 9.875, "grad_norm_var": 0.3648274739583333, "learning_rate": 0.0003, "loss": 12.6579, "loss/aux_loss": 0.04811596740037203, "loss/crossentropy": 2.898962616920471, "loss/logits": 0.9763563752174378, "step": 9970 }, { "epoch": 0.0998, "grad_norm": 10.5, "grad_norm_var": 0.172900390625, "learning_rate": 0.0003, "loss": 12.788, "loss/aux_loss": 0.048104763589799406, "loss/crossentropy": 2.9158723652362823, "loss/logits": 1.0095852971076966, "step": 9980 }, { "epoch": 0.0999, "grad_norm": 10.8125, "grad_norm_var": 0.2770182291666667, "learning_rate": 0.0003, "loss": 12.7328, "loss/aux_loss": 0.04811613652855158, "loss/crossentropy": 2.781576532125473, "loss/logits": 1.0038779705762864, "step": 9990 }, { "epoch": 0.1, "grad_norm": 9.6875, "grad_norm_var": 0.6130208333333333, "learning_rate": 0.0003, "loss": 12.8425, "loss/aux_loss": 0.048115148395299914, "loss/crossentropy": 2.7442554593086244, "loss/logits": 0.9685165584087372, "step": 10000 }, { "epoch": 0.1001, "grad_norm": 10.25, "grad_norm_var": 1.0940104166666667, "learning_rate": 0.0003, "loss": 12.7596, "loss/aux_loss": 0.04810796473175287, "loss/crossentropy": 2.8970122635364532, "loss/logits": 0.9651453495025635, "step": 10010 }, { "epoch": 0.1002, "grad_norm": 10.875, "grad_norm_var": 0.8113932291666667, "learning_rate": 0.0003, "loss": 13.0034, "loss/aux_loss": 0.048116610012948514, "loss/crossentropy": 2.872769057750702, "loss/logits": 1.0002406895160676, "step": 10020 }, { "epoch": 0.1003, "grad_norm": 12.1875, "grad_norm_var": 0.6697265625, "learning_rate": 0.0003, "loss": 12.7285, "loss/aux_loss": 0.04810873456299305, "loss/crossentropy": 2.888649785518646, "loss/logits": 0.9968151926994324, "step": 10030 }, { "epoch": 0.1004, "grad_norm": 10.0625, "grad_norm_var": 0.540625, "learning_rate": 0.0003, "loss": 12.8715, "loss/aux_loss": 0.048114927113056184, "loss/crossentropy": 2.9668263673782347, "loss/logits": 1.0093841701745987, "step": 10040 }, { "epoch": 0.1005, "grad_norm": 10.5625, "grad_norm_var": 0.202587890625, "learning_rate": 0.0003, "loss": 12.871, "loss/aux_loss": 0.048109718784689906, "loss/crossentropy": 2.841026210784912, "loss/logits": 0.9876527488231659, "step": 10050 }, { "epoch": 0.1006, "grad_norm": 10.5625, "grad_norm_var": 0.4266764322916667, "learning_rate": 0.0003, "loss": 12.7901, "loss/aux_loss": 0.04810853134840727, "loss/crossentropy": 2.692527735233307, "loss/logits": 0.9895975649356842, "step": 10060 }, { "epoch": 0.1007, "grad_norm": 9.875, "grad_norm_var": 0.4041015625, "learning_rate": 0.0003, "loss": 12.4866, "loss/aux_loss": 0.04811131805181503, "loss/crossentropy": 2.908632504940033, "loss/logits": 0.9596006900072098, "step": 10070 }, { "epoch": 0.1008, "grad_norm": 10.125, "grad_norm_var": 0.3046875, "learning_rate": 0.0003, "loss": 12.7782, "loss/aux_loss": 0.04811954293400049, "loss/crossentropy": 2.901764976978302, "loss/logits": 1.0220121264457702, "step": 10080 }, { "epoch": 0.1009, "grad_norm": 10.8125, "grad_norm_var": 0.45358072916666664, "learning_rate": 0.0003, "loss": 12.8296, "loss/aux_loss": 0.04812715277075767, "loss/crossentropy": 2.7433866381645204, "loss/logits": 0.9685066968202591, "step": 10090 }, { "epoch": 0.101, "grad_norm": 11.6875, "grad_norm_var": 0.4431640625, "learning_rate": 0.0003, "loss": 12.7263, "loss/aux_loss": 0.04812118727713823, "loss/crossentropy": 2.9564905166625977, "loss/logits": 1.0407138913869858, "step": 10100 }, { "epoch": 0.1011, "grad_norm": 11.125, "grad_norm_var": 0.3551432291666667, "learning_rate": 0.0003, "loss": 12.8901, "loss/aux_loss": 0.04811162799596787, "loss/crossentropy": 2.962075352668762, "loss/logits": 1.002569890022278, "step": 10110 }, { "epoch": 0.1012, "grad_norm": 9.75, "grad_norm_var": 0.42185872395833335, "learning_rate": 0.0003, "loss": 12.777, "loss/aux_loss": 0.04811493325978518, "loss/crossentropy": 2.9621083974838256, "loss/logits": 1.0220870167016982, "step": 10120 }, { "epoch": 0.1013, "grad_norm": 70.5, "grad_norm_var": 225.11183268229166, "learning_rate": 0.0003, "loss": 12.8111, "loss/aux_loss": 0.04811273105442524, "loss/crossentropy": 2.8190457224845886, "loss/logits": 0.9978448241949082, "step": 10130 }, { "epoch": 0.1014, "grad_norm": 9.75, "grad_norm_var": 225.50514322916666, "learning_rate": 0.0003, "loss": 12.8321, "loss/aux_loss": 0.0481122450903058, "loss/crossentropy": 2.7599350273609162, "loss/logits": 0.9731186151504516, "step": 10140 }, { "epoch": 0.1015, "grad_norm": 10.6875, "grad_norm_var": 0.35271809895833334, "learning_rate": 0.0003, "loss": 12.7794, "loss/aux_loss": 0.0481121052056551, "loss/crossentropy": 2.918788194656372, "loss/logits": 1.0331996023654937, "step": 10150 }, { "epoch": 0.1016, "grad_norm": 10.0625, "grad_norm_var": 0.2916666666666667, "learning_rate": 0.0003, "loss": 12.9847, "loss/aux_loss": 0.04810354914516211, "loss/crossentropy": 2.9620222568511965, "loss/logits": 0.9895435065031052, "step": 10160 }, { "epoch": 0.1017, "grad_norm": 10.1875, "grad_norm_var": 61.03513997395833, "learning_rate": 0.0003, "loss": 12.905, "loss/aux_loss": 0.048123286291956904, "loss/crossentropy": 2.8044037401676176, "loss/logits": 0.9406631171703339, "step": 10170 }, { "epoch": 0.1018, "grad_norm": 10.5, "grad_norm_var": 0.23118489583333332, "learning_rate": 0.0003, "loss": 12.838, "loss/aux_loss": 0.04811715167015791, "loss/crossentropy": 2.957458180189133, "loss/logits": 0.9943399399518966, "step": 10180 }, { "epoch": 0.1019, "grad_norm": 12.125, "grad_norm_var": 8.0984375, "learning_rate": 0.0003, "loss": 12.8759, "loss/aux_loss": 0.048108558543026446, "loss/crossentropy": 2.867668330669403, "loss/logits": 0.9879345417022705, "step": 10190 }, { "epoch": 0.102, "grad_norm": 10.125, "grad_norm_var": 7.892301432291666, "learning_rate": 0.0003, "loss": 12.9061, "loss/aux_loss": 0.04811856150627136, "loss/crossentropy": 2.870317333936691, "loss/logits": 1.0022278010845185, "step": 10200 }, { "epoch": 0.1021, "grad_norm": 10.125, "grad_norm_var": 0.1166015625, "learning_rate": 0.0003, "loss": 12.7326, "loss/aux_loss": 0.048116784729063514, "loss/crossentropy": 2.678470027446747, "loss/logits": 0.9406646758317947, "step": 10210 }, { "epoch": 0.1022, "grad_norm": 11.0, "grad_norm_var": 1.2061848958333334, "learning_rate": 0.0003, "loss": 12.7671, "loss/aux_loss": 0.04811875224113464, "loss/crossentropy": 2.9115442454814913, "loss/logits": 0.9856185555458069, "step": 10220 }, { "epoch": 0.1023, "grad_norm": 10.3125, "grad_norm_var": 2.476676432291667, "learning_rate": 0.0003, "loss": 12.8463, "loss/aux_loss": 0.048114926740527156, "loss/crossentropy": 2.987521970272064, "loss/logits": 0.9880728483200073, "step": 10230 }, { "epoch": 0.1024, "grad_norm": 9.6875, "grad_norm_var": 0.4239583333333333, "learning_rate": 0.0003, "loss": 12.9066, "loss/aux_loss": 0.04811114761978388, "loss/crossentropy": 2.957077658176422, "loss/logits": 1.0159188747406005, "step": 10240 }, { "epoch": 0.1025, "grad_norm": 10.25, "grad_norm_var": 0.19348958333333333, "learning_rate": 0.0003, "loss": 12.8744, "loss/aux_loss": 0.04811904225498438, "loss/crossentropy": 2.9327427983283996, "loss/logits": 1.014840191602707, "step": 10250 }, { "epoch": 0.1026, "grad_norm": 11.25, "grad_norm_var": 26.683854166666666, "learning_rate": 0.0003, "loss": 12.8864, "loss/aux_loss": 0.04811773095279932, "loss/crossentropy": 2.9283841848373413, "loss/logits": 0.9840510159730911, "step": 10260 }, { "epoch": 0.1027, "grad_norm": 9.6875, "grad_norm_var": 5.591129557291667, "learning_rate": 0.0003, "loss": 12.7431, "loss/aux_loss": 0.04809635002166033, "loss/crossentropy": 2.9701969385147096, "loss/logits": 1.00972381234169, "step": 10270 }, { "epoch": 0.1028, "grad_norm": 9.875, "grad_norm_var": 0.2659993489583333, "learning_rate": 0.0003, "loss": 12.6803, "loss/aux_loss": 0.04810932390391827, "loss/crossentropy": 2.9193489074707033, "loss/logits": 0.9834885329008103, "step": 10280 }, { "epoch": 0.1029, "grad_norm": 9.6875, "grad_norm_var": 0.2775390625, "learning_rate": 0.0003, "loss": 12.8644, "loss/aux_loss": 0.048101594857871535, "loss/crossentropy": 2.9100057601928713, "loss/logits": 0.99757040143013, "step": 10290 }, { "epoch": 0.103, "grad_norm": 10.0625, "grad_norm_var": 0.3106770833333333, "learning_rate": 0.0003, "loss": 12.6547, "loss/aux_loss": 0.04812427274882793, "loss/crossentropy": 2.7211422979831696, "loss/logits": 0.9326405107975007, "step": 10300 }, { "epoch": 0.1031, "grad_norm": 10.75, "grad_norm_var": 0.27029622395833336, "learning_rate": 0.0003, "loss": 12.9066, "loss/aux_loss": 0.04811542592942715, "loss/crossentropy": 2.8410415768623354, "loss/logits": 0.9540263682603836, "step": 10310 }, { "epoch": 0.1032, "grad_norm": 10.0625, "grad_norm_var": 0.152587890625, "learning_rate": 0.0003, "loss": 12.8612, "loss/aux_loss": 0.048105557821691035, "loss/crossentropy": 2.894010055065155, "loss/logits": 0.9962664604187011, "step": 10320 }, { "epoch": 0.1033, "grad_norm": 10.625, "grad_norm_var": 0.5556640625, "learning_rate": 0.0003, "loss": 12.7817, "loss/aux_loss": 0.04811650700867176, "loss/crossentropy": 2.7931331276893614, "loss/logits": 0.9934678196907043, "step": 10330 }, { "epoch": 0.1034, "grad_norm": 10.1875, "grad_norm_var": 0.115087890625, "learning_rate": 0.0003, "loss": 12.6721, "loss/aux_loss": 0.04811523836106062, "loss/crossentropy": 2.8752257347106935, "loss/logits": 0.9926656931638718, "step": 10340 }, { "epoch": 0.1035, "grad_norm": 10.3125, "grad_norm_var": 0.208837890625, "learning_rate": 0.0003, "loss": 12.747, "loss/aux_loss": 0.04811122994869947, "loss/crossentropy": 2.863996922969818, "loss/logits": 0.9837069183588028, "step": 10350 }, { "epoch": 0.1036, "grad_norm": 9.8125, "grad_norm_var": 0.19959309895833333, "learning_rate": 0.0003, "loss": 12.7221, "loss/aux_loss": 0.048110642656683925, "loss/crossentropy": 2.9583349347114565, "loss/logits": 0.9609038531780243, "step": 10360 }, { "epoch": 0.1037, "grad_norm": 11.625, "grad_norm_var": 0.349072265625, "learning_rate": 0.0003, "loss": 12.8269, "loss/aux_loss": 0.04811428822577, "loss/crossentropy": 2.875106942653656, "loss/logits": 0.9508922189474106, "step": 10370 }, { "epoch": 0.1038, "grad_norm": 9.3125, "grad_norm_var": 0.5851399739583333, "learning_rate": 0.0003, "loss": 12.7956, "loss/aux_loss": 0.04812815226614475, "loss/crossentropy": 2.9032375514507294, "loss/logits": 0.9839789032936096, "step": 10380 }, { "epoch": 0.1039, "grad_norm": 10.75, "grad_norm_var": 25.269124348958332, "learning_rate": 0.0003, "loss": 12.8725, "loss/aux_loss": 0.048113728314638136, "loss/crossentropy": 2.811239331960678, "loss/logits": 0.9782313734292984, "step": 10390 }, { "epoch": 0.104, "grad_norm": 10.8125, "grad_norm_var": 25.748811848958333, "learning_rate": 0.0003, "loss": 12.5987, "loss/aux_loss": 0.04810786601155996, "loss/crossentropy": 2.8436325669288633, "loss/logits": 0.9547698825597764, "step": 10400 }, { "epoch": 0.1041, "grad_norm": 10.5625, "grad_norm_var": 0.17962239583333334, "learning_rate": 0.0003, "loss": 12.6758, "loss/aux_loss": 0.04811085946857929, "loss/crossentropy": 2.827778089046478, "loss/logits": 0.9550431787967681, "step": 10410 }, { "epoch": 0.1042, "grad_norm": 9.8125, "grad_norm_var": 0.919384765625, "learning_rate": 0.0003, "loss": 12.8482, "loss/aux_loss": 0.04811614342033863, "loss/crossentropy": 3.0204949617385863, "loss/logits": 0.9995712280273438, "step": 10420 }, { "epoch": 0.1043, "grad_norm": 10.125, "grad_norm_var": 0.5180826822916667, "learning_rate": 0.0003, "loss": 12.6728, "loss/aux_loss": 0.04812094569206238, "loss/crossentropy": 3.012200677394867, "loss/logits": 0.9757115840911865, "step": 10430 }, { "epoch": 0.1044, "grad_norm": 9.75, "grad_norm_var": 0.284619140625, "learning_rate": 0.0003, "loss": 12.7465, "loss/aux_loss": 0.0481198638677597, "loss/crossentropy": 2.8174231171607973, "loss/logits": 0.9774721026420593, "step": 10440 }, { "epoch": 0.1045, "grad_norm": 10.3125, "grad_norm_var": 0.229931640625, "learning_rate": 0.0003, "loss": 12.7017, "loss/aux_loss": 0.04811002127826214, "loss/crossentropy": 2.955516219139099, "loss/logits": 0.9981975615024566, "step": 10450 }, { "epoch": 0.1046, "grad_norm": 10.0625, "grad_norm_var": 0.4632649739583333, "learning_rate": 0.0003, "loss": 12.8212, "loss/aux_loss": 0.048105720058083536, "loss/crossentropy": 2.8493134498596193, "loss/logits": 1.0022914230823516, "step": 10460 }, { "epoch": 0.1047, "grad_norm": 10.4375, "grad_norm_var": 17.641145833333333, "learning_rate": 0.0003, "loss": 12.8144, "loss/aux_loss": 0.04811700396239758, "loss/crossentropy": 2.865918278694153, "loss/logits": 1.011181029677391, "step": 10470 }, { "epoch": 0.1048, "grad_norm": 10.5625, "grad_norm_var": 1.2333333333333334, "learning_rate": 0.0003, "loss": 12.7668, "loss/aux_loss": 0.048107659071683885, "loss/crossentropy": 2.9243461012840273, "loss/logits": 1.0033222287893295, "step": 10480 }, { "epoch": 0.1049, "grad_norm": 10.1875, "grad_norm_var": 0.43645833333333334, "learning_rate": 0.0003, "loss": 12.7835, "loss/aux_loss": 0.04813041500747204, "loss/crossentropy": 2.799642193317413, "loss/logits": 0.9750822395086288, "step": 10490 }, { "epoch": 0.105, "grad_norm": 10.5625, "grad_norm_var": 0.3960774739583333, "learning_rate": 0.0003, "loss": 12.6303, "loss/aux_loss": 0.048102636635303495, "loss/crossentropy": 2.842100405693054, "loss/logits": 0.9673471480607987, "step": 10500 }, { "epoch": 0.1051, "grad_norm": 9.6875, "grad_norm_var": 0.19264322916666668, "learning_rate": 0.0003, "loss": 12.64, "loss/aux_loss": 0.04811730049550533, "loss/crossentropy": 2.735097426176071, "loss/logits": 0.953799894452095, "step": 10510 }, { "epoch": 0.1052, "grad_norm": 9.5, "grad_norm_var": 0.428125, "learning_rate": 0.0003, "loss": 12.767, "loss/aux_loss": 0.04810708742588758, "loss/crossentropy": 3.0010022163391112, "loss/logits": 1.0266100823879243, "step": 10520 }, { "epoch": 0.1053, "grad_norm": 10.0625, "grad_norm_var": 0.37083333333333335, "learning_rate": 0.0003, "loss": 12.6364, "loss/aux_loss": 0.04811326451599598, "loss/crossentropy": 2.8574136972427366, "loss/logits": 0.9649319559335708, "step": 10530 }, { "epoch": 0.1054, "grad_norm": 9.75, "grad_norm_var": 0.5776041666666667, "learning_rate": 0.0003, "loss": 12.8414, "loss/aux_loss": 0.048117882758378985, "loss/crossentropy": 2.8915890574455263, "loss/logits": 1.019568595290184, "step": 10540 }, { "epoch": 0.1055, "grad_norm": 10.125, "grad_norm_var": 0.26027018229166665, "learning_rate": 0.0003, "loss": 12.6664, "loss/aux_loss": 0.04811375327408314, "loss/crossentropy": 2.851538288593292, "loss/logits": 0.9997588336467743, "step": 10550 }, { "epoch": 0.1056, "grad_norm": 10.0, "grad_norm_var": 0.20358072916666667, "learning_rate": 0.0003, "loss": 12.8045, "loss/aux_loss": 0.04811806846410036, "loss/crossentropy": 2.9150373101234437, "loss/logits": 0.9894289702177048, "step": 10560 }, { "epoch": 0.1057, "grad_norm": 10.1875, "grad_norm_var": 0.12083333333333333, "learning_rate": 0.0003, "loss": 12.7669, "loss/aux_loss": 0.048116890527307984, "loss/crossentropy": 2.8022518932819365, "loss/logits": 0.9851718157529831, "step": 10570 }, { "epoch": 0.1058, "grad_norm": 11.3125, "grad_norm_var": 0.23697916666666666, "learning_rate": 0.0003, "loss": 12.9142, "loss/aux_loss": 0.04811734985560179, "loss/crossentropy": 2.915334862470627, "loss/logits": 1.051106184720993, "step": 10580 }, { "epoch": 0.1059, "grad_norm": 9.9375, "grad_norm_var": 0.47068684895833335, "learning_rate": 0.0003, "loss": 12.7202, "loss/aux_loss": 0.04810705240815878, "loss/crossentropy": 2.683490252494812, "loss/logits": 0.9757168561220169, "step": 10590 }, { "epoch": 0.106, "grad_norm": 11.625, "grad_norm_var": 14.690869140625, "learning_rate": 0.0003, "loss": 12.9961, "loss/aux_loss": 0.04812338836491108, "loss/crossentropy": 2.8796960532665254, "loss/logits": 1.0100533604621886, "step": 10600 }, { "epoch": 0.1061, "grad_norm": 10.5625, "grad_norm_var": 0.6958333333333333, "learning_rate": 0.0003, "loss": 12.6616, "loss/aux_loss": 0.04811020065099001, "loss/crossentropy": 2.790817213058472, "loss/logits": 0.9659003525972366, "step": 10610 }, { "epoch": 0.1062, "grad_norm": 10.6875, "grad_norm_var": 0.41171875, "learning_rate": 0.0003, "loss": 12.6359, "loss/aux_loss": 0.04811752960085869, "loss/crossentropy": 2.971816051006317, "loss/logits": 0.9964886039495469, "step": 10620 }, { "epoch": 0.1063, "grad_norm": 10.9375, "grad_norm_var": 0.350244140625, "learning_rate": 0.0003, "loss": 12.842, "loss/aux_loss": 0.04811201822012663, "loss/crossentropy": 2.986808705329895, "loss/logits": 0.9862239271402359, "step": 10630 }, { "epoch": 0.1064, "grad_norm": 10.5, "grad_norm_var": 0.33318684895833334, "learning_rate": 0.0003, "loss": 12.6619, "loss/aux_loss": 0.048111039027571675, "loss/crossentropy": 2.8363620817661284, "loss/logits": 0.9908882945775985, "step": 10640 }, { "epoch": 0.1065, "grad_norm": 10.5625, "grad_norm_var": 0.436962890625, "learning_rate": 0.0003, "loss": 12.5936, "loss/aux_loss": 0.04811495747417212, "loss/crossentropy": 2.7697442412376403, "loss/logits": 0.9236278921365738, "step": 10650 }, { "epoch": 0.1066, "grad_norm": 10.1875, "grad_norm_var": 0.24451497395833333, "learning_rate": 0.0003, "loss": 12.8339, "loss/aux_loss": 0.04810470268130303, "loss/crossentropy": 2.695315259695053, "loss/logits": 0.9890154510736465, "step": 10660 }, { "epoch": 0.1067, "grad_norm": 10.5, "grad_norm_var": 0.371337890625, "learning_rate": 0.0003, "loss": 12.6645, "loss/aux_loss": 0.04811508543789387, "loss/crossentropy": 2.8011455297470094, "loss/logits": 0.9550657361745835, "step": 10670 }, { "epoch": 0.1068, "grad_norm": 10.25, "grad_norm_var": 1.6645833333333333, "learning_rate": 0.0003, "loss": 12.6319, "loss/aux_loss": 0.048115532658994195, "loss/crossentropy": 2.857553493976593, "loss/logits": 0.9820433109998703, "step": 10680 }, { "epoch": 0.1069, "grad_norm": 10.8125, "grad_norm_var": 1.7983723958333333, "learning_rate": 0.0003, "loss": 12.742, "loss/aux_loss": 0.04811100009828806, "loss/crossentropy": 2.743243044614792, "loss/logits": 0.9736525624990463, "step": 10690 }, { "epoch": 0.107, "grad_norm": 9.625, "grad_norm_var": 0.21183268229166666, "learning_rate": 0.0003, "loss": 12.5406, "loss/aux_loss": 0.048111391440033915, "loss/crossentropy": 2.8312166213989256, "loss/logits": 0.9848694235086441, "step": 10700 }, { "epoch": 0.1071, "grad_norm": 11.0, "grad_norm_var": 0.32198893229166664, "learning_rate": 0.0003, "loss": 12.6608, "loss/aux_loss": 0.04810557756572962, "loss/crossentropy": 2.7473709881305695, "loss/logits": 0.9552412897348403, "step": 10710 }, { "epoch": 0.1072, "grad_norm": 10.0625, "grad_norm_var": 0.30271809895833335, "learning_rate": 0.0003, "loss": 12.5961, "loss/aux_loss": 0.048110843263566495, "loss/crossentropy": 2.7141247391700745, "loss/logits": 0.9736550092697144, "step": 10720 }, { "epoch": 0.1073, "grad_norm": 9.3125, "grad_norm_var": 0.1681640625, "learning_rate": 0.0003, "loss": 12.5023, "loss/aux_loss": 0.048104040697216986, "loss/crossentropy": 2.6945619106292726, "loss/logits": 0.9484582245349884, "step": 10730 }, { "epoch": 0.1074, "grad_norm": 10.5, "grad_norm_var": 0.28177083333333336, "learning_rate": 0.0003, "loss": 12.6278, "loss/aux_loss": 0.04810405727475882, "loss/crossentropy": 2.8070200264453886, "loss/logits": 0.9321490287780761, "step": 10740 }, { "epoch": 0.1075, "grad_norm": 9.5, "grad_norm_var": 0.199853515625, "learning_rate": 0.0003, "loss": 12.7025, "loss/aux_loss": 0.04810698907822371, "loss/crossentropy": 2.9952612042427065, "loss/logits": 0.9933030098676682, "step": 10750 }, { "epoch": 0.1076, "grad_norm": 9.875, "grad_norm_var": 0.6831868489583334, "learning_rate": 0.0003, "loss": 12.6858, "loss/aux_loss": 0.048116383515298365, "loss/crossentropy": 2.80760772228241, "loss/logits": 0.9752837151288987, "step": 10760 }, { "epoch": 0.1077, "grad_norm": 10.9375, "grad_norm_var": 0.7341145833333333, "learning_rate": 0.0003, "loss": 12.6131, "loss/aux_loss": 0.04810668155550957, "loss/crossentropy": 2.941310775279999, "loss/logits": 0.9818355232477188, "step": 10770 }, { "epoch": 0.1078, "grad_norm": 9.375, "grad_norm_var": 0.39264322916666666, "learning_rate": 0.0003, "loss": 12.5573, "loss/aux_loss": 0.048113299161195756, "loss/crossentropy": 2.860517716407776, "loss/logits": 0.9614722609519959, "step": 10780 }, { "epoch": 0.1079, "grad_norm": 9.5625, "grad_norm_var": 0.18787434895833333, "learning_rate": 0.0003, "loss": 12.6124, "loss/aux_loss": 0.04811013750731945, "loss/crossentropy": 2.760462909936905, "loss/logits": 0.9715398788452149, "step": 10790 }, { "epoch": 0.108, "grad_norm": 10.75, "grad_norm_var": 0.36248372395833334, "learning_rate": 0.0003, "loss": 12.6471, "loss/aux_loss": 0.04810702111572027, "loss/crossentropy": 2.9887078046798705, "loss/logits": 0.9805259108543396, "step": 10800 }, { "epoch": 0.1081, "grad_norm": 10.6875, "grad_norm_var": 0.2743326822916667, "learning_rate": 0.0003, "loss": 12.653, "loss/aux_loss": 0.048103836551308635, "loss/crossentropy": 2.8864383697509766, "loss/logits": 0.9688400447368621, "step": 10810 }, { "epoch": 0.1082, "grad_norm": 10.5625, "grad_norm_var": 0.5113932291666666, "learning_rate": 0.0003, "loss": 12.7579, "loss/aux_loss": 0.04811310451477766, "loss/crossentropy": 2.869203519821167, "loss/logits": 0.9728466540575027, "step": 10820 }, { "epoch": 0.1083, "grad_norm": 10.125, "grad_norm_var": 0.35703125, "learning_rate": 0.0003, "loss": 12.7857, "loss/aux_loss": 0.04811171405017376, "loss/crossentropy": 2.8055492877960204, "loss/logits": 0.9609241902828216, "step": 10830 }, { "epoch": 0.1084, "grad_norm": 10.1875, "grad_norm_var": 0.5317708333333333, "learning_rate": 0.0003, "loss": 12.6272, "loss/aux_loss": 0.04810267500579357, "loss/crossentropy": 2.8873426795005797, "loss/logits": 0.983044245839119, "step": 10840 }, { "epoch": 0.1085, "grad_norm": 12.375, "grad_norm_var": 1.7999837239583334, "learning_rate": 0.0003, "loss": 12.5848, "loss/aux_loss": 0.0481255043298006, "loss/crossentropy": 2.8434741854667664, "loss/logits": 0.9674687087535858, "step": 10850 }, { "epoch": 0.1086, "grad_norm": 10.875, "grad_norm_var": 1.5106608072916667, "learning_rate": 0.0003, "loss": 12.7505, "loss/aux_loss": 0.04811022691428661, "loss/crossentropy": 2.7955089688301085, "loss/logits": 0.9789073407649994, "step": 10860 }, { "epoch": 0.1087, "grad_norm": 10.8125, "grad_norm_var": 0.23956705729166666, "learning_rate": 0.0003, "loss": 12.7856, "loss/aux_loss": 0.048125218600034714, "loss/crossentropy": 2.7512724816799166, "loss/logits": 0.9434742718935013, "step": 10870 }, { "epoch": 0.1088, "grad_norm": 10.5, "grad_norm_var": 0.4942708333333333, "learning_rate": 0.0003, "loss": 12.8403, "loss/aux_loss": 0.048105237260460856, "loss/crossentropy": 3.1708402156829836, "loss/logits": 0.9876413464546203, "step": 10880 }, { "epoch": 0.1089, "grad_norm": 11.5, "grad_norm_var": 0.288525390625, "learning_rate": 0.0003, "loss": 12.4812, "loss/aux_loss": 0.04810988549143076, "loss/crossentropy": 2.802176779508591, "loss/logits": 0.9748163193464279, "step": 10890 }, { "epoch": 0.109, "grad_norm": 10.3125, "grad_norm_var": 0.287744140625, "learning_rate": 0.0003, "loss": 12.5199, "loss/aux_loss": 0.04810547549277544, "loss/crossentropy": 2.9589218378067015, "loss/logits": 0.9759902417659759, "step": 10900 }, { "epoch": 0.1091, "grad_norm": 11.25, "grad_norm_var": 0.5187337239583333, "learning_rate": 0.0003, "loss": 12.6455, "loss/aux_loss": 0.048122298903763294, "loss/crossentropy": 2.7537475407123564, "loss/logits": 0.968425664305687, "step": 10910 }, { "epoch": 0.1092, "grad_norm": 10.9375, "grad_norm_var": 3.849853515625, "learning_rate": 0.0003, "loss": 12.6097, "loss/aux_loss": 0.04810692425817251, "loss/crossentropy": 2.87729851603508, "loss/logits": 0.988306000828743, "step": 10920 }, { "epoch": 0.1093, "grad_norm": 11.625, "grad_norm_var": 3.958056640625, "learning_rate": 0.0003, "loss": 12.4243, "loss/aux_loss": 0.04811464417725801, "loss/crossentropy": 2.823906672000885, "loss/logits": 0.9596776217222214, "step": 10930 }, { "epoch": 0.1094, "grad_norm": 10.125, "grad_norm_var": 0.39576822916666665, "learning_rate": 0.0003, "loss": 12.6005, "loss/aux_loss": 0.04811041634529829, "loss/crossentropy": 2.8116785049438477, "loss/logits": 1.0132469624280929, "step": 10940 }, { "epoch": 0.1095, "grad_norm": 11.8125, "grad_norm_var": 0.43483072916666665, "learning_rate": 0.0003, "loss": 12.7462, "loss/aux_loss": 0.04810987431555987, "loss/crossentropy": 2.908396100997925, "loss/logits": 1.008555829524994, "step": 10950 }, { "epoch": 0.1096, "grad_norm": 10.1875, "grad_norm_var": 0.51640625, "learning_rate": 0.0003, "loss": 12.6827, "loss/aux_loss": 0.0481119841337204, "loss/crossentropy": 2.865977716445923, "loss/logits": 0.969117721915245, "step": 10960 }, { "epoch": 0.1097, "grad_norm": 10.4375, "grad_norm_var": 0.4945149739583333, "learning_rate": 0.0003, "loss": 12.5983, "loss/aux_loss": 0.048109224624931814, "loss/crossentropy": 2.9365819096565247, "loss/logits": 0.9682926207780838, "step": 10970 }, { "epoch": 0.1098, "grad_norm": 9.875, "grad_norm_var": 0.481494140625, "learning_rate": 0.0003, "loss": 12.6494, "loss/aux_loss": 0.048102812469005586, "loss/crossentropy": 2.924280512332916, "loss/logits": 1.0038185507059096, "step": 10980 }, { "epoch": 0.1099, "grad_norm": 11.1875, "grad_norm_var": 1.7367024739583334, "learning_rate": 0.0003, "loss": 12.9254, "loss/aux_loss": 0.048106090165674686, "loss/crossentropy": 2.9131445050239564, "loss/logits": 0.9844058066606521, "step": 10990 }, { "epoch": 0.11, "grad_norm": 10.1875, "grad_norm_var": 0.44264322916666665, "learning_rate": 0.0003, "loss": 12.5432, "loss/aux_loss": 0.04811704996973276, "loss/crossentropy": 2.814534968137741, "loss/logits": 0.9921759486198425, "step": 11000 }, { "epoch": 0.1101, "grad_norm": 10.5, "grad_norm_var": 0.370947265625, "learning_rate": 0.0003, "loss": 12.7211, "loss/aux_loss": 0.048108032904565334, "loss/crossentropy": 2.7905489981174467, "loss/logits": 0.9878934472799301, "step": 11010 }, { "epoch": 0.1102, "grad_norm": 11.8125, "grad_norm_var": 9.125895182291666, "learning_rate": 0.0003, "loss": 12.6461, "loss/aux_loss": 0.048117210157215595, "loss/crossentropy": 2.8903677582740785, "loss/logits": 0.9732258021831512, "step": 11020 }, { "epoch": 0.1103, "grad_norm": 9.1875, "grad_norm_var": 9.377197265625, "learning_rate": 0.0003, "loss": 12.6081, "loss/aux_loss": 0.04811104368418455, "loss/crossentropy": 2.719060683250427, "loss/logits": 0.9554944217205048, "step": 11030 }, { "epoch": 0.1104, "grad_norm": 10.9375, "grad_norm_var": 0.5860514322916667, "learning_rate": 0.0003, "loss": 12.7441, "loss/aux_loss": 0.04811680149286986, "loss/crossentropy": 2.81855326294899, "loss/logits": 0.976987361907959, "step": 11040 }, { "epoch": 0.1105, "grad_norm": 9.625, "grad_norm_var": 0.6400390625, "learning_rate": 0.0003, "loss": 12.6213, "loss/aux_loss": 0.0481078302487731, "loss/crossentropy": 2.97838671207428, "loss/logits": 0.9554787337779999, "step": 11050 }, { "epoch": 0.1106, "grad_norm": 9.8125, "grad_norm_var": 0.20818684895833334, "learning_rate": 0.0003, "loss": 12.5692, "loss/aux_loss": 0.04810439124703407, "loss/crossentropy": 2.821098101139069, "loss/logits": 0.9252155363559723, "step": 11060 }, { "epoch": 0.1107, "grad_norm": 10.0625, "grad_norm_var": 1.1117024739583334, "learning_rate": 0.0003, "loss": 12.6105, "loss/aux_loss": 0.048108091577887535, "loss/crossentropy": 2.782370573282242, "loss/logits": 0.9402445912361145, "step": 11070 }, { "epoch": 0.1108, "grad_norm": 10.875, "grad_norm_var": 0.4554524739583333, "learning_rate": 0.0003, "loss": 12.7023, "loss/aux_loss": 0.04811048619449139, "loss/crossentropy": 2.903420704603195, "loss/logits": 1.0490208446979523, "step": 11080 }, { "epoch": 0.1109, "grad_norm": 10.1875, "grad_norm_var": 0.378369140625, "learning_rate": 0.0003, "loss": 12.529, "loss/aux_loss": 0.04811018593609333, "loss/crossentropy": 2.9194815278053285, "loss/logits": 1.0058273494243621, "step": 11090 }, { "epoch": 0.111, "grad_norm": 11.625, "grad_norm_var": 0.28587239583333335, "learning_rate": 0.0003, "loss": 12.6302, "loss/aux_loss": 0.04811076112091541, "loss/crossentropy": 2.787671518325806, "loss/logits": 0.9877743035554886, "step": 11100 }, { "epoch": 0.1111, "grad_norm": 10.0625, "grad_norm_var": 0.23982747395833334, "learning_rate": 0.0003, "loss": 12.7033, "loss/aux_loss": 0.04810548275709152, "loss/crossentropy": 2.9209081172943114, "loss/logits": 1.001340913772583, "step": 11110 }, { "epoch": 0.1112, "grad_norm": 9.25, "grad_norm_var": 0.25677083333333334, "learning_rate": 0.0003, "loss": 12.6145, "loss/aux_loss": 0.04811436515301466, "loss/crossentropy": 2.8942541658878325, "loss/logits": 0.9485535502433777, "step": 11120 }, { "epoch": 0.1113, "grad_norm": 9.875, "grad_norm_var": 0.2587890625, "learning_rate": 0.0003, "loss": 12.5613, "loss/aux_loss": 0.048104191198945045, "loss/crossentropy": 2.803705060482025, "loss/logits": 0.9647155731916428, "step": 11130 }, { "epoch": 0.1114, "grad_norm": 10.1875, "grad_norm_var": 0.25729166666666664, "learning_rate": 0.0003, "loss": 12.5791, "loss/aux_loss": 0.04812238048762083, "loss/crossentropy": 2.862819027900696, "loss/logits": 0.9601949125528335, "step": 11140 }, { "epoch": 0.1115, "grad_norm": 11.125, "grad_norm_var": 0.23370768229166666, "learning_rate": 0.0003, "loss": 12.5839, "loss/aux_loss": 0.04810948688536883, "loss/crossentropy": 2.8555395185947416, "loss/logits": 0.9564761400222779, "step": 11150 }, { "epoch": 0.1116, "grad_norm": 10.3125, "grad_norm_var": 0.2872233072916667, "learning_rate": 0.0003, "loss": 12.6588, "loss/aux_loss": 0.0481093930080533, "loss/crossentropy": 2.9155186653137206, "loss/logits": 0.9658820390701294, "step": 11160 }, { "epoch": 0.1117, "grad_norm": 11.1875, "grad_norm_var": 0.20045572916666668, "learning_rate": 0.0003, "loss": 12.777, "loss/aux_loss": 0.048106205835938456, "loss/crossentropy": 2.8000992953777315, "loss/logits": 0.9923090279102326, "step": 11170 }, { "epoch": 0.1118, "grad_norm": 11.75, "grad_norm_var": 0.3815104166666667, "learning_rate": 0.0003, "loss": 12.6665, "loss/aux_loss": 0.048104862496256826, "loss/crossentropy": 2.849539339542389, "loss/logits": 0.973883080482483, "step": 11180 }, { "epoch": 0.1119, "grad_norm": 10.625, "grad_norm_var": 0.39739583333333334, "learning_rate": 0.0003, "loss": 12.4681, "loss/aux_loss": 0.048110177554190156, "loss/crossentropy": 2.8586711943149568, "loss/logits": 1.001354029774666, "step": 11190 }, { "epoch": 0.112, "grad_norm": 19.0, "grad_norm_var": 5.034358723958333, "learning_rate": 0.0003, "loss": 12.638, "loss/aux_loss": 0.04811324365437031, "loss/crossentropy": 2.8593406438827516, "loss/logits": 0.9571125656366348, "step": 11200 }, { "epoch": 0.1121, "grad_norm": 10.4375, "grad_norm_var": 4.9947265625, "learning_rate": 0.0003, "loss": 12.5001, "loss/aux_loss": 0.04811281282454729, "loss/crossentropy": 3.0845739006996156, "loss/logits": 0.9708561331033707, "step": 11210 }, { "epoch": 0.1122, "grad_norm": 10.5625, "grad_norm_var": 0.21243489583333333, "learning_rate": 0.0003, "loss": 12.6438, "loss/aux_loss": 0.04810171201825142, "loss/crossentropy": 2.9789989948272706, "loss/logits": 0.9724924474954605, "step": 11220 }, { "epoch": 0.1123, "grad_norm": 10.625, "grad_norm_var": 0.251416015625, "learning_rate": 0.0003, "loss": 12.8096, "loss/aux_loss": 0.04811390731483698, "loss/crossentropy": 2.897776019573212, "loss/logits": 1.0156351894140243, "step": 11230 }, { "epoch": 0.1124, "grad_norm": 10.6875, "grad_norm_var": 1.221337890625, "learning_rate": 0.0003, "loss": 12.5675, "loss/aux_loss": 0.04811619278043509, "loss/crossentropy": 2.7954994082450866, "loss/logits": 0.9756214946508408, "step": 11240 }, { "epoch": 0.1125, "grad_norm": 10.5625, "grad_norm_var": 0.25983072916666666, "learning_rate": 0.0003, "loss": 12.5928, "loss/aux_loss": 0.04811410661786795, "loss/crossentropy": 2.7457215189933777, "loss/logits": 0.9541913717985153, "step": 11250 }, { "epoch": 0.1126, "grad_norm": 10.5, "grad_norm_var": 0.23084309895833333, "learning_rate": 0.0003, "loss": 12.634, "loss/aux_loss": 0.0481100007891655, "loss/crossentropy": 2.8804012298583985, "loss/logits": 0.9697092175483704, "step": 11260 }, { "epoch": 0.1127, "grad_norm": 12.4375, "grad_norm_var": 0.7613118489583334, "learning_rate": 0.0003, "loss": 12.5108, "loss/aux_loss": 0.04810825977474451, "loss/crossentropy": 2.989805257320404, "loss/logits": 1.0063245952129365, "step": 11270 }, { "epoch": 0.1128, "grad_norm": 10.75, "grad_norm_var": 5.937483723958334, "learning_rate": 0.0003, "loss": 12.7367, "loss/aux_loss": 0.048129872791469096, "loss/crossentropy": 2.910978400707245, "loss/logits": 0.9753200441598893, "step": 11280 }, { "epoch": 0.1129, "grad_norm": 10.25, "grad_norm_var": 0.63359375, "learning_rate": 0.0003, "loss": 12.6325, "loss/aux_loss": 0.048113250732421876, "loss/crossentropy": 2.76353098154068, "loss/logits": 0.966698682308197, "step": 11290 }, { "epoch": 0.113, "grad_norm": 9.9375, "grad_norm_var": 0.629931640625, "learning_rate": 0.0003, "loss": 12.5728, "loss/aux_loss": 0.04810833781957626, "loss/crossentropy": 2.8197692394256593, "loss/logits": 0.9561370402574539, "step": 11300 }, { "epoch": 0.1131, "grad_norm": 10.1875, "grad_norm_var": 0.20364583333333333, "learning_rate": 0.0003, "loss": 12.8143, "loss/aux_loss": 0.04810529686510563, "loss/crossentropy": 2.8484480023384093, "loss/logits": 0.9861988663673401, "step": 11310 }, { "epoch": 0.1132, "grad_norm": 10.75, "grad_norm_var": 0.278369140625, "learning_rate": 0.0003, "loss": 12.6227, "loss/aux_loss": 0.04811426196247339, "loss/crossentropy": 2.833006477355957, "loss/logits": 0.9779454618692398, "step": 11320 }, { "epoch": 0.1133, "grad_norm": 15.25, "grad_norm_var": 1.6007649739583334, "learning_rate": 0.0003, "loss": 12.369, "loss/aux_loss": 0.048116072081029415, "loss/crossentropy": 2.728998416662216, "loss/logits": 0.9195287644863128, "step": 11330 }, { "epoch": 0.1134, "grad_norm": 10.3125, "grad_norm_var": 1.529931640625, "learning_rate": 0.0003, "loss": 12.6997, "loss/aux_loss": 0.04811266250908375, "loss/crossentropy": 2.7394905209541323, "loss/logits": 0.9578628540039062, "step": 11340 }, { "epoch": 0.1135, "grad_norm": 10.125, "grad_norm_var": 0.7983723958333333, "learning_rate": 0.0003, "loss": 12.4926, "loss/aux_loss": 0.04810461904853582, "loss/crossentropy": 2.8682806730270385, "loss/logits": 1.0078667521476745, "step": 11350 }, { "epoch": 0.1136, "grad_norm": 9.625, "grad_norm_var": 0.842431640625, "learning_rate": 0.0003, "loss": 12.688, "loss/aux_loss": 0.0481068329885602, "loss/crossentropy": 2.8801401615142823, "loss/logits": 0.9915682733058929, "step": 11360 }, { "epoch": 0.1137, "grad_norm": 10.3125, "grad_norm_var": 0.3640625, "learning_rate": 0.0003, "loss": 12.449, "loss/aux_loss": 0.048105799593031406, "loss/crossentropy": 2.9282567858695985, "loss/logits": 0.9672238737344742, "step": 11370 }, { "epoch": 0.1138, "grad_norm": 10.5625, "grad_norm_var": 0.28274739583333336, "learning_rate": 0.0003, "loss": 12.5677, "loss/aux_loss": 0.04810627568513155, "loss/crossentropy": 2.8078087627887727, "loss/logits": 0.9780084967613221, "step": 11380 }, { "epoch": 0.1139, "grad_norm": 10.875, "grad_norm_var": 3.143684895833333, "learning_rate": 0.0003, "loss": 12.5506, "loss/aux_loss": 0.048108652047812936, "loss/crossentropy": 2.987882399559021, "loss/logits": 0.972921484708786, "step": 11390 }, { "epoch": 0.114, "grad_norm": 10.1875, "grad_norm_var": 2.8686848958333333, "learning_rate": 0.0003, "loss": 12.6501, "loss/aux_loss": 0.04810910746455192, "loss/crossentropy": 3.1420127391815185, "loss/logits": 1.0203659981489182, "step": 11400 }, { "epoch": 0.1141, "grad_norm": 11.25, "grad_norm_var": 0.37265625, "learning_rate": 0.0003, "loss": 12.6042, "loss/aux_loss": 0.04811469428241253, "loss/crossentropy": 2.939081585407257, "loss/logits": 0.9754122138023377, "step": 11410 }, { "epoch": 0.1142, "grad_norm": 10.9375, "grad_norm_var": 0.5501139322916667, "learning_rate": 0.0003, "loss": 12.4749, "loss/aux_loss": 0.048111149854958055, "loss/crossentropy": 2.8218182921409607, "loss/logits": 0.9850091069936753, "step": 11420 }, { "epoch": 0.1143, "grad_norm": 9.625, "grad_norm_var": 0.5001139322916667, "learning_rate": 0.0003, "loss": 12.7164, "loss/aux_loss": 0.04811033252626658, "loss/crossentropy": 2.908424949645996, "loss/logits": 0.9455067425966263, "step": 11430 }, { "epoch": 0.1144, "grad_norm": 10.625, "grad_norm_var": 0.8135416666666667, "learning_rate": 0.0003, "loss": 12.617, "loss/aux_loss": 0.04811215177178383, "loss/crossentropy": 2.83238645195961, "loss/logits": 0.9909904628992081, "step": 11440 }, { "epoch": 0.1145, "grad_norm": 10.625, "grad_norm_var": 75.9525390625, "learning_rate": 0.0003, "loss": 12.631, "loss/aux_loss": 0.04813397005200386, "loss/crossentropy": 2.8597318053245546, "loss/logits": 0.9930762708187103, "step": 11450 }, { "epoch": 0.1146, "grad_norm": 9.0625, "grad_norm_var": 0.27076822916666665, "learning_rate": 0.0003, "loss": 12.7103, "loss/aux_loss": 0.04811841379851103, "loss/crossentropy": 2.7436058163642882, "loss/logits": 0.9644762337207794, "step": 11460 }, { "epoch": 0.1147, "grad_norm": 9.9375, "grad_norm_var": 0.5945149739583333, "learning_rate": 0.0003, "loss": 12.7142, "loss/aux_loss": 0.04810911137610674, "loss/crossentropy": 2.85939040184021, "loss/logits": 0.9520360499620437, "step": 11470 }, { "epoch": 0.1148, "grad_norm": 10.5, "grad_norm_var": 0.4886555989583333, "learning_rate": 0.0003, "loss": 12.5397, "loss/aux_loss": 0.04810511395335197, "loss/crossentropy": 2.976986992359161, "loss/logits": 0.9621847212314606, "step": 11480 }, { "epoch": 0.1149, "grad_norm": 10.25, "grad_norm_var": 0.429150390625, "learning_rate": 0.0003, "loss": 12.3119, "loss/aux_loss": 0.048113946430385114, "loss/crossentropy": 2.851515471935272, "loss/logits": 0.9701724350452423, "step": 11490 }, { "epoch": 0.115, "grad_norm": 10.875, "grad_norm_var": 0.6905598958333333, "learning_rate": 0.0003, "loss": 12.5609, "loss/aux_loss": 0.048105195350945, "loss/crossentropy": 2.8894742131233215, "loss/logits": 0.9788044720888138, "step": 11500 }, { "epoch": 0.1151, "grad_norm": 10.25, "grad_norm_var": 0.508837890625, "learning_rate": 0.0003, "loss": 12.6204, "loss/aux_loss": 0.04811161942780018, "loss/crossentropy": 2.7808387517929076, "loss/logits": 0.9664526283740997, "step": 11510 }, { "epoch": 0.1152, "grad_norm": 10.6875, "grad_norm_var": 0.2964680989583333, "learning_rate": 0.0003, "loss": 12.5533, "loss/aux_loss": 0.048106780648231505, "loss/crossentropy": 3.0173611760139467, "loss/logits": 0.9817897409200669, "step": 11520 }, { "epoch": 0.1153, "grad_norm": 12.375, "grad_norm_var": 0.4494140625, "learning_rate": 0.0003, "loss": 12.7684, "loss/aux_loss": 0.04811464920639992, "loss/crossentropy": 2.9131483495235444, "loss/logits": 0.9585329800844192, "step": 11530 }, { "epoch": 0.1154, "grad_norm": 10.75, "grad_norm_var": 0.5989420572916667, "learning_rate": 0.0003, "loss": 12.5183, "loss/aux_loss": 0.04811756443232298, "loss/crossentropy": 2.868135905265808, "loss/logits": 0.9498421907424927, "step": 11540 }, { "epoch": 0.1155, "grad_norm": 10.4375, "grad_norm_var": 0.6488932291666667, "learning_rate": 0.0003, "loss": 12.5642, "loss/aux_loss": 0.0480995262041688, "loss/crossentropy": 2.7917768478393556, "loss/logits": 0.9568525284528733, "step": 11550 }, { "epoch": 0.1156, "grad_norm": 10.8125, "grad_norm_var": 0.334228515625, "learning_rate": 0.0003, "loss": 12.4884, "loss/aux_loss": 0.04810049049556255, "loss/crossentropy": 2.9180258393287657, "loss/logits": 0.98627108335495, "step": 11560 }, { "epoch": 0.1157, "grad_norm": 10.9375, "grad_norm_var": 0.5395833333333333, "learning_rate": 0.0003, "loss": 12.627, "loss/aux_loss": 0.0481100594624877, "loss/crossentropy": 2.81765558719635, "loss/logits": 0.9568387240171432, "step": 11570 }, { "epoch": 0.1158, "grad_norm": 9.6875, "grad_norm_var": 0.17472330729166666, "learning_rate": 0.0003, "loss": 12.7317, "loss/aux_loss": 0.04811111558228731, "loss/crossentropy": 2.8285086691379546, "loss/logits": 0.9868262588977814, "step": 11580 }, { "epoch": 0.1159, "grad_norm": 10.5625, "grad_norm_var": 0.332666015625, "learning_rate": 0.0003, "loss": 12.3661, "loss/aux_loss": 0.04810287747532129, "loss/crossentropy": 2.7959317326545716, "loss/logits": 0.9438671588897705, "step": 11590 }, { "epoch": 0.116, "grad_norm": 11.0, "grad_norm_var": 0.46013997395833334, "learning_rate": 0.0003, "loss": 12.5853, "loss/aux_loss": 0.04810447972267866, "loss/crossentropy": 2.96794798374176, "loss/logits": 0.9699487566947937, "step": 11600 }, { "epoch": 0.1161, "grad_norm": 9.9375, "grad_norm_var": 0.2872233072916667, "learning_rate": 0.0003, "loss": 12.5827, "loss/aux_loss": 0.04810726400464773, "loss/crossentropy": 2.7267133831977843, "loss/logits": 0.9387221932411194, "step": 11610 }, { "epoch": 0.1162, "grad_norm": 10.6875, "grad_norm_var": 0.2567057291666667, "learning_rate": 0.0003, "loss": 12.621, "loss/aux_loss": 0.04810353182256222, "loss/crossentropy": 2.8360018610954283, "loss/logits": 0.9253288894891739, "step": 11620 }, { "epoch": 0.1163, "grad_norm": 11.25, "grad_norm_var": 0.2911295572916667, "learning_rate": 0.0003, "loss": 12.5106, "loss/aux_loss": 0.048119408451020716, "loss/crossentropy": 2.794688510894775, "loss/logits": 0.9578901708126069, "step": 11630 }, { "epoch": 0.1164, "grad_norm": 11.375, "grad_norm_var": 0.43722330729166664, "learning_rate": 0.0003, "loss": 12.6836, "loss/aux_loss": 0.04810473546385765, "loss/crossentropy": 2.8710934042930605, "loss/logits": 0.9788650065660477, "step": 11640 }, { "epoch": 0.1165, "grad_norm": 10.0625, "grad_norm_var": 0.3859375, "learning_rate": 0.0003, "loss": 12.6164, "loss/aux_loss": 0.04809908457100391, "loss/crossentropy": 2.807088649272919, "loss/logits": 0.9633297890424728, "step": 11650 }, { "epoch": 0.1166, "grad_norm": 9.6875, "grad_norm_var": 0.3400390625, "learning_rate": 0.0003, "loss": 12.648, "loss/aux_loss": 0.04809819832444191, "loss/crossentropy": 2.9749920845031737, "loss/logits": 1.0201595097780227, "step": 11660 }, { "epoch": 0.1167, "grad_norm": 11.875, "grad_norm_var": 2.988785807291667, "learning_rate": 0.0003, "loss": 12.4478, "loss/aux_loss": 0.04812034796923399, "loss/crossentropy": 2.62253178358078, "loss/logits": 0.9186932921409607, "step": 11670 }, { "epoch": 0.1168, "grad_norm": 11.9375, "grad_norm_var": 12.179622395833333, "learning_rate": 0.0003, "loss": 12.4319, "loss/aux_loss": 0.04812979437410832, "loss/crossentropy": 2.857901084423065, "loss/logits": 0.9803258359432221, "step": 11680 }, { "epoch": 0.1169, "grad_norm": 11.1875, "grad_norm_var": 12.487353515625, "learning_rate": 0.0003, "loss": 12.792, "loss/aux_loss": 0.04811025280505419, "loss/crossentropy": 2.872191935777664, "loss/logits": 0.9423937231302262, "step": 11690 }, { "epoch": 0.117, "grad_norm": 10.5625, "grad_norm_var": 0.14036458333333332, "learning_rate": 0.0003, "loss": 12.5762, "loss/aux_loss": 0.04811386782675982, "loss/crossentropy": 2.741291904449463, "loss/logits": 0.9453139632940293, "step": 11700 }, { "epoch": 0.1171, "grad_norm": 10.75, "grad_norm_var": 0.21795247395833334, "learning_rate": 0.0003, "loss": 12.3318, "loss/aux_loss": 0.04810644257813692, "loss/crossentropy": 2.9782674610614777, "loss/logits": 0.9474372059106827, "step": 11710 }, { "epoch": 0.1172, "grad_norm": 9.9375, "grad_norm_var": 0.3042805989583333, "learning_rate": 0.0003, "loss": 12.6985, "loss/aux_loss": 0.04810058567672968, "loss/crossentropy": 2.759519326686859, "loss/logits": 1.003102535009384, "step": 11720 }, { "epoch": 0.1173, "grad_norm": 10.5625, "grad_norm_var": 2.947379557291667, "learning_rate": 0.0003, "loss": 12.5665, "loss/aux_loss": 0.04811586532741785, "loss/crossentropy": 2.885226249694824, "loss/logits": 0.9729419648647308, "step": 11730 }, { "epoch": 0.1174, "grad_norm": 10.9375, "grad_norm_var": 2.6150390625, "learning_rate": 0.0003, "loss": 12.5392, "loss/aux_loss": 0.04810945596545935, "loss/crossentropy": 2.859631586074829, "loss/logits": 0.9514502733945847, "step": 11740 }, { "epoch": 0.1175, "grad_norm": 11.0625, "grad_norm_var": 0.47701822916666664, "learning_rate": 0.0003, "loss": 12.6717, "loss/aux_loss": 0.048104429990053175, "loss/crossentropy": 2.885056400299072, "loss/logits": 0.9942733883857727, "step": 11750 }, { "epoch": 0.1176, "grad_norm": 10.5, "grad_norm_var": 0.6020182291666667, "learning_rate": 0.0003, "loss": 12.6648, "loss/aux_loss": 0.04809593297541141, "loss/crossentropy": 2.9696100473403932, "loss/logits": 0.9882340937852859, "step": 11760 }, { "epoch": 0.1177, "grad_norm": 11.125, "grad_norm_var": 0.21451822916666666, "learning_rate": 0.0003, "loss": 12.4929, "loss/aux_loss": 0.04809958972036839, "loss/crossentropy": 2.844915008544922, "loss/logits": 0.983967337012291, "step": 11770 }, { "epoch": 0.1178, "grad_norm": 11.0625, "grad_norm_var": 0.245556640625, "learning_rate": 0.0003, "loss": 12.4835, "loss/aux_loss": 0.048105498775839806, "loss/crossentropy": 2.9103447675704954, "loss/logits": 0.9672578752040863, "step": 11780 }, { "epoch": 0.1179, "grad_norm": 10.3125, "grad_norm_var": 0.106884765625, "learning_rate": 0.0003, "loss": 12.599, "loss/aux_loss": 0.04810230545699597, "loss/crossentropy": 2.8057246923446657, "loss/logits": 0.9675930976867676, "step": 11790 }, { "epoch": 0.118, "grad_norm": 11.25, "grad_norm_var": 0.14348958333333334, "learning_rate": 0.0003, "loss": 12.4241, "loss/aux_loss": 0.04811300784349441, "loss/crossentropy": 2.794022238254547, "loss/logits": 0.9394135266542435, "step": 11800 }, { "epoch": 0.1181, "grad_norm": 10.1875, "grad_norm_var": 0.31197916666666664, "learning_rate": 0.0003, "loss": 12.5517, "loss/aux_loss": 0.04811709113419056, "loss/crossentropy": 2.727192759513855, "loss/logits": 0.9235861957073211, "step": 11810 }, { "epoch": 0.1182, "grad_norm": 10.75, "grad_norm_var": 0.4572265625, "learning_rate": 0.0003, "loss": 12.4864, "loss/aux_loss": 0.048109428584575654, "loss/crossentropy": 2.7515438914299013, "loss/logits": 0.9708627730607986, "step": 11820 }, { "epoch": 0.1183, "grad_norm": 10.3125, "grad_norm_var": 0.426025390625, "learning_rate": 0.0003, "loss": 12.4548, "loss/aux_loss": 0.04810903538018465, "loss/crossentropy": 2.889864444732666, "loss/logits": 0.9854389071464539, "step": 11830 }, { "epoch": 0.1184, "grad_norm": 10.1875, "grad_norm_var": 0.20703125, "learning_rate": 0.0003, "loss": 12.6243, "loss/aux_loss": 0.04810080174356699, "loss/crossentropy": 2.85439276099205, "loss/logits": 0.9542811691761017, "step": 11840 }, { "epoch": 0.1185, "grad_norm": 10.5625, "grad_norm_var": 0.21979166666666666, "learning_rate": 0.0003, "loss": 12.6767, "loss/aux_loss": 0.048108363337814805, "loss/crossentropy": 2.8335661768913267, "loss/logits": 0.9943309754133225, "step": 11850 }, { "epoch": 0.1186, "grad_norm": 10.3125, "grad_norm_var": 34.180843098958334, "learning_rate": 0.0003, "loss": 12.4189, "loss/aux_loss": 0.048121347464621066, "loss/crossentropy": 2.8601845264434815, "loss/logits": 0.9276136964559555, "step": 11860 }, { "epoch": 0.1187, "grad_norm": 10.5625, "grad_norm_var": 0.24295247395833333, "learning_rate": 0.0003, "loss": 12.5005, "loss/aux_loss": 0.04811071082949638, "loss/crossentropy": 2.7520765900611877, "loss/logits": 0.9491381376981736, "step": 11870 }, { "epoch": 0.1188, "grad_norm": 10.0, "grad_norm_var": 0.1697265625, "learning_rate": 0.0003, "loss": 12.5049, "loss/aux_loss": 0.048104492016136646, "loss/crossentropy": 3.0041671991348267, "loss/logits": 1.0042832434177398, "step": 11880 }, { "epoch": 0.1189, "grad_norm": 10.5625, "grad_norm_var": 0.12849934895833334, "learning_rate": 0.0003, "loss": 12.5701, "loss/aux_loss": 0.048117601312696934, "loss/crossentropy": 2.745959347486496, "loss/logits": 0.9276633858680725, "step": 11890 }, { "epoch": 0.119, "grad_norm": 10.125, "grad_norm_var": 0.20514322916666666, "learning_rate": 0.0003, "loss": 12.4645, "loss/aux_loss": 0.04810411240905523, "loss/crossentropy": 2.842986249923706, "loss/logits": 0.9896512359380722, "step": 11900 }, { "epoch": 0.1191, "grad_norm": 11.3125, "grad_norm_var": 0.43463541666666666, "learning_rate": 0.0003, "loss": 12.5719, "loss/aux_loss": 0.04811199139803648, "loss/crossentropy": 2.7681937336921694, "loss/logits": 0.9506595671176911, "step": 11910 }, { "epoch": 0.1192, "grad_norm": 11.375, "grad_norm_var": 0.27263997395833334, "learning_rate": 0.0003, "loss": 12.6075, "loss/aux_loss": 0.04810190089046955, "loss/crossentropy": 2.9162669658660887, "loss/logits": 0.9811139643192291, "step": 11920 }, { "epoch": 0.1193, "grad_norm": 10.625, "grad_norm_var": 0.19724934895833332, "learning_rate": 0.0003, "loss": 12.5182, "loss/aux_loss": 0.04810644220560789, "loss/crossentropy": 2.889602208137512, "loss/logits": 0.9657979607582092, "step": 11930 }, { "epoch": 0.1194, "grad_norm": 10.4375, "grad_norm_var": 0.27317708333333335, "learning_rate": 0.0003, "loss": 12.4569, "loss/aux_loss": 0.04809871483594179, "loss/crossentropy": 3.0264495491981505, "loss/logits": 0.9761357963085174, "step": 11940 }, { "epoch": 0.1195, "grad_norm": 10.375, "grad_norm_var": 0.12862955729166667, "learning_rate": 0.0003, "loss": 12.4498, "loss/aux_loss": 0.04811019022017717, "loss/crossentropy": 2.784598481655121, "loss/logits": 0.9536922335624695, "step": 11950 }, { "epoch": 0.1196, "grad_norm": 10.875, "grad_norm_var": 24.016780598958334, "learning_rate": 0.0003, "loss": 12.4946, "loss/aux_loss": 0.04811384323984384, "loss/crossentropy": 2.9115478515625, "loss/logits": 0.9744400382041931, "step": 11960 }, { "epoch": 0.1197, "grad_norm": 11.3125, "grad_norm_var": 0.674072265625, "learning_rate": 0.0003, "loss": 12.5965, "loss/aux_loss": 0.04810780603438616, "loss/crossentropy": 2.894571363925934, "loss/logits": 0.9742325752973556, "step": 11970 }, { "epoch": 0.1198, "grad_norm": 11.6875, "grad_norm_var": 0.30911458333333336, "learning_rate": 0.0003, "loss": 12.6002, "loss/aux_loss": 0.04811006467789412, "loss/crossentropy": 3.06647070646286, "loss/logits": 0.9646219074726105, "step": 11980 }, { "epoch": 0.1199, "grad_norm": 10.1875, "grad_norm_var": 0.217041015625, "learning_rate": 0.0003, "loss": 12.3748, "loss/aux_loss": 0.04810543842613697, "loss/crossentropy": 2.722061502933502, "loss/logits": 0.9802715986967087, "step": 11990 }, { "epoch": 0.12, "grad_norm": 10.0625, "grad_norm_var": 0.42649739583333335, "learning_rate": 0.0003, "loss": 12.5784, "loss/aux_loss": 0.04810118656605482, "loss/crossentropy": 2.83687162399292, "loss/logits": 0.9512553691864014, "step": 12000 }, { "epoch": 0.1201, "grad_norm": 11.125, "grad_norm_var": 0.24733072916666668, "learning_rate": 0.0003, "loss": 12.3364, "loss/aux_loss": 0.04810278117656708, "loss/crossentropy": 2.744813871383667, "loss/logits": 0.9523531854152679, "step": 12010 }, { "epoch": 0.1202, "grad_norm": 10.75, "grad_norm_var": 0.23214518229166667, "learning_rate": 0.0003, "loss": 12.6001, "loss/aux_loss": 0.04810234196484089, "loss/crossentropy": 2.830919635295868, "loss/logits": 0.9435950011014939, "step": 12020 }, { "epoch": 0.1203, "grad_norm": 11.1875, "grad_norm_var": 0.23201497395833334, "learning_rate": 0.0003, "loss": 12.5726, "loss/aux_loss": 0.04810579065233469, "loss/crossentropy": 2.802901232242584, "loss/logits": 0.971744042634964, "step": 12030 }, { "epoch": 0.1204, "grad_norm": 11.1875, "grad_norm_var": 0.30206705729166666, "learning_rate": 0.0003, "loss": 12.6394, "loss/aux_loss": 0.04810426253825426, "loss/crossentropy": 2.839026927947998, "loss/logits": 0.971143838763237, "step": 12040 }, { "epoch": 0.1205, "grad_norm": 10.8125, "grad_norm_var": 0.19895833333333332, "learning_rate": 0.0003, "loss": 12.5642, "loss/aux_loss": 0.04810550380498171, "loss/crossentropy": 2.8231468319892885, "loss/logits": 0.9654949724674224, "step": 12050 }, { "epoch": 0.1206, "grad_norm": 10.6875, "grad_norm_var": 0.17420247395833333, "learning_rate": 0.0003, "loss": 12.3904, "loss/aux_loss": 0.048108019307255744, "loss/crossentropy": 3.012854266166687, "loss/logits": 0.9883525311946869, "step": 12060 }, { "epoch": 0.1207, "grad_norm": 11.125, "grad_norm_var": 0.268603515625, "learning_rate": 0.0003, "loss": 12.5321, "loss/aux_loss": 0.04810563083738088, "loss/crossentropy": 2.70223063826561, "loss/logits": 0.9730505347251892, "step": 12070 }, { "epoch": 0.1208, "grad_norm": 10.875, "grad_norm_var": 0.271728515625, "learning_rate": 0.0003, "loss": 12.5535, "loss/aux_loss": 0.04810192938894033, "loss/crossentropy": 3.0259074330329896, "loss/logits": 0.9854034870862961, "step": 12080 }, { "epoch": 0.1209, "grad_norm": 10.375, "grad_norm_var": 0.17161458333333332, "learning_rate": 0.0003, "loss": 12.765, "loss/aux_loss": 0.04810161255300045, "loss/crossentropy": 2.875988984107971, "loss/logits": 0.9918344229459762, "step": 12090 }, { "epoch": 0.121, "grad_norm": 11.375, "grad_norm_var": 0.19635416666666666, "learning_rate": 0.0003, "loss": 12.623, "loss/aux_loss": 0.048103974759578706, "loss/crossentropy": 2.742973780632019, "loss/logits": 0.9306074976921082, "step": 12100 }, { "epoch": 0.1211, "grad_norm": 10.5625, "grad_norm_var": 0.22024739583333333, "learning_rate": 0.0003, "loss": 12.5789, "loss/aux_loss": 0.04811310023069382, "loss/crossentropy": 2.705821967124939, "loss/logits": 0.9371457666158676, "step": 12110 }, { "epoch": 0.1212, "grad_norm": 11.4375, "grad_norm_var": 0.9910807291666667, "learning_rate": 0.0003, "loss": 12.353, "loss/aux_loss": 0.04810843821614981, "loss/crossentropy": 2.6658570945262907, "loss/logits": 0.9095493495464325, "step": 12120 }, { "epoch": 0.1213, "grad_norm": 10.5625, "grad_norm_var": 1.1050618489583333, "learning_rate": 0.0003, "loss": 12.6082, "loss/aux_loss": 0.04810453653335571, "loss/crossentropy": 2.8400238871574404, "loss/logits": 0.9993251740932465, "step": 12130 }, { "epoch": 0.1214, "grad_norm": 10.9375, "grad_norm_var": 0.3003743489583333, "learning_rate": 0.0003, "loss": 12.293, "loss/aux_loss": 0.04811606556177139, "loss/crossentropy": 2.8960613369941712, "loss/logits": 0.9764822989702224, "step": 12140 }, { "epoch": 0.1215, "grad_norm": 10.25, "grad_norm_var": 0.29244791666666664, "learning_rate": 0.0003, "loss": 12.4496, "loss/aux_loss": 0.048100278712809086, "loss/crossentropy": 3.0011345863342287, "loss/logits": 0.9704394817352295, "step": 12150 }, { "epoch": 0.1216, "grad_norm": 11.3125, "grad_norm_var": 0.4044270833333333, "learning_rate": 0.0003, "loss": 12.5732, "loss/aux_loss": 0.0481085266917944, "loss/crossentropy": 2.7692679166793823, "loss/logits": 0.9632198810577393, "step": 12160 }, { "epoch": 0.1217, "grad_norm": 11.0, "grad_norm_var": 0.24609375, "learning_rate": 0.0003, "loss": 12.4976, "loss/aux_loss": 0.04810815379023552, "loss/crossentropy": 2.8866684079170226, "loss/logits": 0.9551214545965194, "step": 12170 }, { "epoch": 0.1218, "grad_norm": 11.1875, "grad_norm_var": 0.23318684895833333, "learning_rate": 0.0003, "loss": 12.5185, "loss/aux_loss": 0.048101365193724634, "loss/crossentropy": 2.8655909061431886, "loss/logits": 0.9533731818199158, "step": 12180 }, { "epoch": 0.1219, "grad_norm": 11.0, "grad_norm_var": 0.8066243489583333, "learning_rate": 0.0003, "loss": 12.6911, "loss/aux_loss": 0.04811571817845106, "loss/crossentropy": 2.8677718937397003, "loss/logits": 0.976726308465004, "step": 12190 }, { "epoch": 0.122, "grad_norm": 10.4375, "grad_norm_var": 1.0106770833333334, "learning_rate": 0.0003, "loss": 12.5852, "loss/aux_loss": 0.048111325688660146, "loss/crossentropy": 2.756421709060669, "loss/logits": 0.9976501137018203, "step": 12200 }, { "epoch": 0.1221, "grad_norm": 10.3125, "grad_norm_var": 1.3247233072916667, "learning_rate": 0.0003, "loss": 12.5394, "loss/aux_loss": 0.04810780212283135, "loss/crossentropy": 2.9484314799308775, "loss/logits": 0.9888930469751358, "step": 12210 }, { "epoch": 0.1222, "grad_norm": 11.25, "grad_norm_var": 1.724853515625, "learning_rate": 0.0003, "loss": 12.658, "loss/aux_loss": 0.048103698343038556, "loss/crossentropy": 2.8530756711959837, "loss/logits": 0.961388236284256, "step": 12220 }, { "epoch": 0.1223, "grad_norm": 10.375, "grad_norm_var": 0.7403645833333333, "learning_rate": 0.0003, "loss": 12.6152, "loss/aux_loss": 0.048111156560480595, "loss/crossentropy": 2.7180242002010346, "loss/logits": 0.9586433321237564, "step": 12230 }, { "epoch": 0.1224, "grad_norm": 10.1875, "grad_norm_var": 0.7048014322916667, "learning_rate": 0.0003, "loss": 12.4559, "loss/aux_loss": 0.048110079020261765, "loss/crossentropy": 2.9755612432956697, "loss/logits": 0.9319843083620072, "step": 12240 }, { "epoch": 0.1225, "grad_norm": 10.8125, "grad_norm_var": 0.697119140625, "learning_rate": 0.0003, "loss": 12.4664, "loss/aux_loss": 0.048104499280452725, "loss/crossentropy": 2.828293478488922, "loss/logits": 0.9228764444589614, "step": 12250 }, { "epoch": 0.1226, "grad_norm": 10.9375, "grad_norm_var": 0.234228515625, "learning_rate": 0.0003, "loss": 12.6363, "loss/aux_loss": 0.048111573606729505, "loss/crossentropy": 2.9271127223968505, "loss/logits": 0.9774176150560379, "step": 12260 }, { "epoch": 0.1227, "grad_norm": 12.75, "grad_norm_var": 8.608968098958334, "learning_rate": 0.0003, "loss": 12.3807, "loss/aux_loss": 0.048101313598454, "loss/crossentropy": 2.63518745303154, "loss/logits": 0.9886480629444122, "step": 12270 }, { "epoch": 0.1228, "grad_norm": 10.3125, "grad_norm_var": 9.355712890625, "learning_rate": 0.0003, "loss": 12.6403, "loss/aux_loss": 0.04810824524611235, "loss/crossentropy": 2.950111997127533, "loss/logits": 0.9644519031047821, "step": 12280 }, { "epoch": 0.1229, "grad_norm": 11.0, "grad_norm_var": 1.6855305989583333, "learning_rate": 0.0003, "loss": 12.6324, "loss/aux_loss": 0.048108363337814805, "loss/crossentropy": 3.013728940486908, "loss/logits": 0.999145370721817, "step": 12290 }, { "epoch": 0.123, "grad_norm": 10.5, "grad_norm_var": 0.28904622395833335, "learning_rate": 0.0003, "loss": 12.5458, "loss/aux_loss": 0.048100477643311025, "loss/crossentropy": 2.722964417934418, "loss/logits": 0.9335471302270889, "step": 12300 }, { "epoch": 0.1231, "grad_norm": 11.875, "grad_norm_var": 0.453125, "learning_rate": 0.0003, "loss": 12.4757, "loss/aux_loss": 0.04810597654432058, "loss/crossentropy": 2.820869207382202, "loss/logits": 0.9774489820003509, "step": 12310 }, { "epoch": 0.1232, "grad_norm": 10.1875, "grad_norm_var": 0.27316080729166664, "learning_rate": 0.0003, "loss": 12.4025, "loss/aux_loss": 0.04810553044080734, "loss/crossentropy": 2.7451001048088073, "loss/logits": 0.9624879866838455, "step": 12320 }, { "epoch": 0.1233, "grad_norm": 10.6875, "grad_norm_var": 0.25857747395833336, "learning_rate": 0.0003, "loss": 12.4356, "loss/aux_loss": 0.04811267796903849, "loss/crossentropy": 2.917902183532715, "loss/logits": 0.9623291105031967, "step": 12330 }, { "epoch": 0.1234, "grad_norm": 10.8125, "grad_norm_var": 0.24842122395833333, "learning_rate": 0.0003, "loss": 12.5192, "loss/aux_loss": 0.04810788352042437, "loss/crossentropy": 2.9027243733406065, "loss/logits": 0.9865126490592957, "step": 12340 }, { "epoch": 0.1235, "grad_norm": 10.875, "grad_norm_var": 0.14217122395833334, "learning_rate": 0.0003, "loss": 12.6416, "loss/aux_loss": 0.04810071587562561, "loss/crossentropy": 2.8123831510543824, "loss/logits": 0.9990533202886581, "step": 12350 }, { "epoch": 0.1236, "grad_norm": 10.9375, "grad_norm_var": 0.07185872395833333, "learning_rate": 0.0003, "loss": 12.417, "loss/aux_loss": 0.04809997137635946, "loss/crossentropy": 2.8776489377021788, "loss/logits": 0.9396691709756851, "step": 12360 }, { "epoch": 0.1237, "grad_norm": 11.1875, "grad_norm_var": 0.17771809895833332, "learning_rate": 0.0003, "loss": 12.4562, "loss/aux_loss": 0.048106643930077554, "loss/crossentropy": 2.726925420761108, "loss/logits": 0.9575481981039047, "step": 12370 }, { "epoch": 0.1238, "grad_norm": 10.625, "grad_norm_var": 0.34427083333333336, "learning_rate": 0.0003, "loss": 12.5861, "loss/aux_loss": 0.0481028001755476, "loss/crossentropy": 2.8643892288208006, "loss/logits": 0.9604303538799286, "step": 12380 }, { "epoch": 0.1239, "grad_norm": 11.1875, "grad_norm_var": 0.17838541666666666, "learning_rate": 0.0003, "loss": 12.5781, "loss/aux_loss": 0.04810880180448294, "loss/crossentropy": 2.9660362005233765, "loss/logits": 0.9596373349428177, "step": 12390 }, { "epoch": 0.124, "grad_norm": 10.625, "grad_norm_var": 0.1931640625, "learning_rate": 0.0003, "loss": 12.6396, "loss/aux_loss": 0.04811811447143555, "loss/crossentropy": 2.980528914928436, "loss/logits": 0.9617955178022385, "step": 12400 }, { "epoch": 0.1241, "grad_norm": 11.625, "grad_norm_var": 1.4231770833333333, "learning_rate": 0.0003, "loss": 12.6891, "loss/aux_loss": 0.048112993128597736, "loss/crossentropy": 2.9703574776649475, "loss/logits": 0.973251935839653, "step": 12410 }, { "epoch": 0.1242, "grad_norm": 10.375, "grad_norm_var": 1.4962890625, "learning_rate": 0.0003, "loss": 12.3924, "loss/aux_loss": 0.04811142534017563, "loss/crossentropy": 2.6934870958328245, "loss/logits": 0.9628842860460282, "step": 12420 }, { "epoch": 0.1243, "grad_norm": 31.75, "grad_norm_var": 27.332747395833334, "learning_rate": 0.0003, "loss": 12.5895, "loss/aux_loss": 0.04810582157224417, "loss/crossentropy": 2.749896514415741, "loss/logits": 0.9580157309770584, "step": 12430 }, { "epoch": 0.1244, "grad_norm": 10.375, "grad_norm_var": 27.559879557291666, "learning_rate": 0.0003, "loss": 12.5005, "loss/aux_loss": 0.04812322128564119, "loss/crossentropy": 2.8022406458854676, "loss/logits": 0.9580208510160446, "step": 12440 }, { "epoch": 0.1245, "grad_norm": 10.5625, "grad_norm_var": 0.3863118489583333, "learning_rate": 0.0003, "loss": 12.5714, "loss/aux_loss": 0.048095401376485825, "loss/crossentropy": 2.8494678735733032, "loss/logits": 0.9524266660213471, "step": 12450 }, { "epoch": 0.1246, "grad_norm": 10.0, "grad_norm_var": 0.20271809895833334, "learning_rate": 0.0003, "loss": 12.5446, "loss/aux_loss": 0.04811746347695589, "loss/crossentropy": 2.8464840769767763, "loss/logits": 0.9794892787933349, "step": 12460 }, { "epoch": 0.1247, "grad_norm": 10.6875, "grad_norm_var": 0.3634765625, "learning_rate": 0.0003, "loss": 12.5093, "loss/aux_loss": 0.04810458458960056, "loss/crossentropy": 2.8821428060531615, "loss/logits": 0.9548739582300186, "step": 12470 }, { "epoch": 0.1248, "grad_norm": 10.25, "grad_norm_var": 158.55701497395833, "learning_rate": 0.0003, "loss": 12.5456, "loss/aux_loss": 0.04812399763613939, "loss/crossentropy": 2.8613539934158325, "loss/logits": 0.9762499183416367, "step": 12480 }, { "epoch": 0.1249, "grad_norm": 10.8125, "grad_norm_var": 0.5799479166666667, "learning_rate": 0.0003, "loss": 12.5057, "loss/aux_loss": 0.04810742326080799, "loss/crossentropy": 2.865919351577759, "loss/logits": 0.9773925930261612, "step": 12490 }, { "epoch": 0.125, "grad_norm": 10.375, "grad_norm_var": 0.40623372395833335, "learning_rate": 0.0003, "loss": 12.4343, "loss/aux_loss": 0.04811244308948517, "loss/crossentropy": 2.7066974461078646, "loss/logits": 0.9389273285865783, "step": 12500 }, { "epoch": 0.1251, "grad_norm": 11.3125, "grad_norm_var": 0.38670247395833335, "learning_rate": 0.0003, "loss": 12.4011, "loss/aux_loss": 0.04809202216565609, "loss/crossentropy": 2.9026967763900755, "loss/logits": 0.969332093000412, "step": 12510 }, { "epoch": 0.1252, "grad_norm": 10.875, "grad_norm_var": 0.6098958333333333, "learning_rate": 0.0003, "loss": 12.4336, "loss/aux_loss": 0.04811373949050903, "loss/crossentropy": 2.665140724182129, "loss/logits": 0.9239953130483627, "step": 12520 }, { "epoch": 0.1253, "grad_norm": 11.1875, "grad_norm_var": 0.7331868489583333, "learning_rate": 0.0003, "loss": 12.6341, "loss/aux_loss": 0.04810454789549112, "loss/crossentropy": 2.8536665797233582, "loss/logits": 0.9737724870443344, "step": 12530 }, { "epoch": 0.1254, "grad_norm": 10.75, "grad_norm_var": 0.253125, "learning_rate": 0.0003, "loss": 12.2721, "loss/aux_loss": 0.04812637399882078, "loss/crossentropy": 2.623065769672394, "loss/logits": 0.9491947621107102, "step": 12540 }, { "epoch": 0.1255, "grad_norm": 10.3125, "grad_norm_var": 0.291259765625, "learning_rate": 0.0003, "loss": 12.346, "loss/aux_loss": 0.04810612387955189, "loss/crossentropy": 2.8949776351451875, "loss/logits": 0.9749656409025192, "step": 12550 }, { "epoch": 0.1256, "grad_norm": 9.875, "grad_norm_var": 0.20358072916666667, "learning_rate": 0.0003, "loss": 12.39, "loss/aux_loss": 0.048107765056192874, "loss/crossentropy": 2.9021278619766235, "loss/logits": 0.9786544352769851, "step": 12560 }, { "epoch": 0.1257, "grad_norm": 10.4375, "grad_norm_var": 0.23723958333333334, "learning_rate": 0.0003, "loss": 12.377, "loss/aux_loss": 0.04810867067426443, "loss/crossentropy": 2.6891712307929994, "loss/logits": 0.9230633974075317, "step": 12570 }, { "epoch": 0.1258, "grad_norm": 10.5, "grad_norm_var": 0.12057291666666667, "learning_rate": 0.0003, "loss": 12.5812, "loss/aux_loss": 0.04811552707105875, "loss/crossentropy": 2.969293546676636, "loss/logits": 0.9738602817058564, "step": 12580 }, { "epoch": 0.1259, "grad_norm": 10.1875, "grad_norm_var": 0.17498372395833334, "learning_rate": 0.0003, "loss": 12.4027, "loss/aux_loss": 0.04809841345995665, "loss/crossentropy": 2.835331308841705, "loss/logits": 0.9679557770490647, "step": 12590 }, { "epoch": 0.126, "grad_norm": 10.0625, "grad_norm_var": 0.237744140625, "learning_rate": 0.0003, "loss": 12.5829, "loss/aux_loss": 0.048117955774068834, "loss/crossentropy": 2.8491066575050352, "loss/logits": 0.910678106546402, "step": 12600 }, { "epoch": 0.1261, "grad_norm": 11.0625, "grad_norm_var": 0.4869791666666667, "learning_rate": 0.0003, "loss": 12.5276, "loss/aux_loss": 0.04809832703322172, "loss/crossentropy": 2.8928737163543703, "loss/logits": 0.9363324135541916, "step": 12610 }, { "epoch": 0.1262, "grad_norm": 10.8125, "grad_norm_var": 0.4989420572916667, "learning_rate": 0.0003, "loss": 12.4322, "loss/aux_loss": 0.04810363110154867, "loss/crossentropy": 2.8710333466529847, "loss/logits": 0.9906549990177155, "step": 12620 }, { "epoch": 0.1263, "grad_norm": 11.25, "grad_norm_var": 0.16053059895833333, "learning_rate": 0.0003, "loss": 12.4911, "loss/aux_loss": 0.04810148868709803, "loss/crossentropy": 2.8756853461265566, "loss/logits": 0.9440797507762909, "step": 12630 }, { "epoch": 0.1264, "grad_norm": 18.375, "grad_norm_var": 7.615625, "learning_rate": 0.0003, "loss": 12.4571, "loss/aux_loss": 0.04810570180416107, "loss/crossentropy": 2.6758979201316833, "loss/logits": 0.9320230633020401, "step": 12640 }, { "epoch": 0.1265, "grad_norm": 10.3125, "grad_norm_var": 4.074983723958334, "learning_rate": 0.0003, "loss": 12.4666, "loss/aux_loss": 0.048111373744905, "loss/crossentropy": 2.842112112045288, "loss/logits": 0.9325708895921707, "step": 12650 }, { "epoch": 0.1266, "grad_norm": 10.0, "grad_norm_var": 1.2192708333333333, "learning_rate": 0.0003, "loss": 12.2817, "loss/aux_loss": 0.04809804186224938, "loss/crossentropy": 2.9903613328933716, "loss/logits": 0.9794372290372848, "step": 12660 }, { "epoch": 0.1267, "grad_norm": 10.375, "grad_norm_var": 0.468212890625, "learning_rate": 0.0003, "loss": 12.3768, "loss/aux_loss": 0.048095152527093885, "loss/crossentropy": 2.74501034617424, "loss/logits": 0.9036791056394577, "step": 12670 }, { "epoch": 0.1268, "grad_norm": 10.1875, "grad_norm_var": 0.32578125, "learning_rate": 0.0003, "loss": 12.654, "loss/aux_loss": 0.04810344278812408, "loss/crossentropy": 2.962386405467987, "loss/logits": 0.9665306150913239, "step": 12680 }, { "epoch": 0.1269, "grad_norm": 11.75, "grad_norm_var": 0.36183268229166665, "learning_rate": 0.0003, "loss": 12.4222, "loss/aux_loss": 0.04809885267168283, "loss/crossentropy": 2.909253853559494, "loss/logits": 0.9747259318828583, "step": 12690 }, { "epoch": 0.127, "grad_norm": 10.9375, "grad_norm_var": 0.2843098958333333, "learning_rate": 0.0003, "loss": 12.5564, "loss/aux_loss": 0.048109317757189275, "loss/crossentropy": 2.8867732286453247, "loss/logits": 0.9507931470870972, "step": 12700 }, { "epoch": 0.1271, "grad_norm": 12.8125, "grad_norm_var": 0.38409830729166666, "learning_rate": 0.0003, "loss": 12.4902, "loss/aux_loss": 0.04810472708195448, "loss/crossentropy": 2.6750588059425353, "loss/logits": 0.9369807064533233, "step": 12710 }, { "epoch": 0.1272, "grad_norm": 11.375, "grad_norm_var": 0.54609375, "learning_rate": 0.0003, "loss": 12.48, "loss/aux_loss": 0.048105572909116746, "loss/crossentropy": 2.9057799935340882, "loss/logits": 0.9927403450012207, "step": 12720 }, { "epoch": 0.1273, "grad_norm": 10.6875, "grad_norm_var": 0.3148274739583333, "learning_rate": 0.0003, "loss": 12.5244, "loss/aux_loss": 0.04810297396034002, "loss/crossentropy": 2.9415181994438173, "loss/logits": 0.9982303559780121, "step": 12730 }, { "epoch": 0.1274, "grad_norm": 10.5625, "grad_norm_var": 0.1697265625, "learning_rate": 0.0003, "loss": 12.403, "loss/aux_loss": 0.04809920433908701, "loss/crossentropy": 2.903778100013733, "loss/logits": 0.970294651389122, "step": 12740 }, { "epoch": 0.1275, "grad_norm": 10.8125, "grad_norm_var": 0.15050455729166667, "learning_rate": 0.0003, "loss": 12.455, "loss/aux_loss": 0.048093832843005654, "loss/crossentropy": 2.9087541341781615, "loss/logits": 0.9725099325180053, "step": 12750 }, { "epoch": 0.1276, "grad_norm": 10.5625, "grad_norm_var": 0.13274739583333334, "learning_rate": 0.0003, "loss": 12.5525, "loss/aux_loss": 0.04810660276561975, "loss/crossentropy": 2.792685878276825, "loss/logits": 0.9733223885297775, "step": 12760 }, { "epoch": 0.1277, "grad_norm": 11.375, "grad_norm_var": 0.18019205729166668, "learning_rate": 0.0003, "loss": 12.559, "loss/aux_loss": 0.04810384083539247, "loss/crossentropy": 2.8665752828121187, "loss/logits": 0.9338672608137131, "step": 12770 }, { "epoch": 0.1278, "grad_norm": 10.1875, "grad_norm_var": 0.3889973958333333, "learning_rate": 0.0003, "loss": 12.4119, "loss/aux_loss": 0.048106889240443707, "loss/crossentropy": 2.8227752327919005, "loss/logits": 0.9450346022844315, "step": 12780 }, { "epoch": 0.1279, "grad_norm": 11.125, "grad_norm_var": 0.29791666666666666, "learning_rate": 0.0003, "loss": 12.6173, "loss/aux_loss": 0.048102630861103536, "loss/crossentropy": 2.9129024147987366, "loss/logits": 0.9713657557964325, "step": 12790 }, { "epoch": 0.128, "grad_norm": 10.9375, "grad_norm_var": 0.18274739583333333, "learning_rate": 0.0003, "loss": 12.326, "loss/aux_loss": 0.04809645600616932, "loss/crossentropy": 2.8920622408390044, "loss/logits": 0.954812154173851, "step": 12800 }, { "epoch": 0.1281, "grad_norm": 10.9375, "grad_norm_var": 0.17420247395833333, "learning_rate": 0.0003, "loss": 12.2345, "loss/aux_loss": 0.04810621030628681, "loss/crossentropy": 2.7076722204685213, "loss/logits": 0.9177807062864304, "step": 12810 }, { "epoch": 0.1282, "grad_norm": 10.0, "grad_norm_var": 0.6026041666666667, "learning_rate": 0.0003, "loss": 12.4786, "loss/aux_loss": 0.048109995760023595, "loss/crossentropy": 2.8327449679374697, "loss/logits": 0.9546353191137313, "step": 12820 }, { "epoch": 0.1283, "grad_norm": 11.9375, "grad_norm_var": 0.8191243489583333, "learning_rate": 0.0003, "loss": 12.5182, "loss/aux_loss": 0.048099367320537566, "loss/crossentropy": 2.9235777378082277, "loss/logits": 0.9615249812602997, "step": 12830 }, { "epoch": 0.1284, "grad_norm": 11.25, "grad_norm_var": 3.563134765625, "learning_rate": 0.0003, "loss": 12.3886, "loss/aux_loss": 0.04810474757105112, "loss/crossentropy": 2.7467468440532685, "loss/logits": 0.9554690361022949, "step": 12840 }, { "epoch": 0.1285, "grad_norm": 11.625, "grad_norm_var": 0.36666666666666664, "learning_rate": 0.0003, "loss": 12.4959, "loss/aux_loss": 0.04810317847877741, "loss/crossentropy": 2.9639437079429625, "loss/logits": 0.9639540314674377, "step": 12850 }, { "epoch": 0.1286, "grad_norm": 11.6875, "grad_norm_var": 0.5353515625, "learning_rate": 0.0003, "loss": 12.4018, "loss/aux_loss": 0.04811002798378468, "loss/crossentropy": 2.7028500497341157, "loss/logits": 0.9559302479028702, "step": 12860 }, { "epoch": 0.1287, "grad_norm": 11.5, "grad_norm_var": 53.47472330729167, "learning_rate": 0.0003, "loss": 12.5657, "loss/aux_loss": 0.04811037741601467, "loss/crossentropy": 2.924159586429596, "loss/logits": 0.9749170869588852, "step": 12870 }, { "epoch": 0.1288, "grad_norm": 10.4375, "grad_norm_var": 53.731103515625, "learning_rate": 0.0003, "loss": 12.3255, "loss/aux_loss": 0.048108428902924064, "loss/crossentropy": 2.763288676738739, "loss/logits": 0.9064864754676819, "step": 12880 }, { "epoch": 0.1289, "grad_norm": 10.3125, "grad_norm_var": 0.22604166666666667, "learning_rate": 0.0003, "loss": 12.2636, "loss/aux_loss": 0.048106398433446884, "loss/crossentropy": 2.785652810335159, "loss/logits": 0.9184371441602707, "step": 12890 }, { "epoch": 0.129, "grad_norm": 10.625, "grad_norm_var": 0.30514322916666664, "learning_rate": 0.0003, "loss": 12.3059, "loss/aux_loss": 0.048100420646369456, "loss/crossentropy": 2.790884238481522, "loss/logits": 0.9849524915218353, "step": 12900 }, { "epoch": 0.1291, "grad_norm": 11.0625, "grad_norm_var": 0.179150390625, "learning_rate": 0.0003, "loss": 12.4044, "loss/aux_loss": 0.048105467297136786, "loss/crossentropy": 2.791968286037445, "loss/logits": 0.9593799233436584, "step": 12910 }, { "epoch": 0.1292, "grad_norm": 10.25, "grad_norm_var": 0.3337890625, "learning_rate": 0.0003, "loss": 12.4358, "loss/aux_loss": 0.04811060018837452, "loss/crossentropy": 2.9321176767349244, "loss/logits": 0.9812934130430222, "step": 12920 }, { "epoch": 0.1293, "grad_norm": 11.0625, "grad_norm_var": 0.22784830729166666, "learning_rate": 0.0003, "loss": 12.4753, "loss/aux_loss": 0.04810346253216267, "loss/crossentropy": 2.8504304766654966, "loss/logits": 0.9624378353357315, "step": 12930 }, { "epoch": 0.1294, "grad_norm": 18.75, "grad_norm_var": 4.043229166666666, "learning_rate": 0.0003, "loss": 12.5174, "loss/aux_loss": 0.048096288181841376, "loss/crossentropy": 2.861344063282013, "loss/logits": 0.9707422107458115, "step": 12940 }, { "epoch": 0.1295, "grad_norm": 11.1875, "grad_norm_var": 3.975455729166667, "learning_rate": 0.0003, "loss": 12.413, "loss/aux_loss": 0.048110200092196465, "loss/crossentropy": 2.8129209518432616, "loss/logits": 0.9287580490112305, "step": 12950 }, { "epoch": 0.1296, "grad_norm": 11.0, "grad_norm_var": 0.25338541666666664, "learning_rate": 0.0003, "loss": 12.3644, "loss/aux_loss": 0.04810931608080864, "loss/crossentropy": 2.802262395620346, "loss/logits": 0.9195797771215439, "step": 12960 }, { "epoch": 0.1297, "grad_norm": 11.1875, "grad_norm_var": 0.5309895833333333, "learning_rate": 0.0003, "loss": 12.3496, "loss/aux_loss": 0.048108757846057414, "loss/crossentropy": 3.0164557695388794, "loss/logits": 0.9846212476491928, "step": 12970 }, { "epoch": 0.1298, "grad_norm": 10.875, "grad_norm_var": 0.4212890625, "learning_rate": 0.0003, "loss": 12.382, "loss/aux_loss": 0.04810013268142939, "loss/crossentropy": 2.9002213299274446, "loss/logits": 0.9503946632146836, "step": 12980 }, { "epoch": 0.1299, "grad_norm": 10.375, "grad_norm_var": 0.15167643229166666, "learning_rate": 0.0003, "loss": 12.3921, "loss/aux_loss": 0.048111158050596715, "loss/crossentropy": 2.7677676558494566, "loss/logits": 0.921833261847496, "step": 12990 }, { "epoch": 0.13, "grad_norm": 10.75, "grad_norm_var": 0.388525390625, "learning_rate": 0.0003, "loss": 12.3999, "loss/aux_loss": 0.048099796287715435, "loss/crossentropy": 2.906938135623932, "loss/logits": 0.9610762029886246, "step": 13000 }, { "epoch": 0.1301, "grad_norm": 10.0625, "grad_norm_var": 0.36139322916666666, "learning_rate": 0.0003, "loss": 12.6066, "loss/aux_loss": 0.04810947496443987, "loss/crossentropy": 2.892643666267395, "loss/logits": 0.9925059139728546, "step": 13010 }, { "epoch": 0.1302, "grad_norm": 10.875, "grad_norm_var": 0.160400390625, "learning_rate": 0.0003, "loss": 12.2514, "loss/aux_loss": 0.048100730404257774, "loss/crossentropy": 2.7110345482826235, "loss/logits": 0.931193083524704, "step": 13020 }, { "epoch": 0.1303, "grad_norm": 11.6875, "grad_norm_var": 0.546337890625, "learning_rate": 0.0003, "loss": 12.7064, "loss/aux_loss": 0.04810579176992178, "loss/crossentropy": 2.92630649805069, "loss/logits": 0.9686931163072586, "step": 13030 }, { "epoch": 0.1304, "grad_norm": 10.5, "grad_norm_var": 0.7660807291666667, "learning_rate": 0.0003, "loss": 12.1971, "loss/aux_loss": 0.04810760095715523, "loss/crossentropy": 2.778294336795807, "loss/logits": 0.9471270084381104, "step": 13040 }, { "epoch": 0.1305, "grad_norm": 10.1875, "grad_norm_var": 0.5150390625, "learning_rate": 0.0003, "loss": 12.5498, "loss/aux_loss": 0.048095237277448175, "loss/crossentropy": 2.8965494871139525, "loss/logits": 0.9763620316982269, "step": 13050 }, { "epoch": 0.1306, "grad_norm": 10.8125, "grad_norm_var": 0.3341145833333333, "learning_rate": 0.0003, "loss": 12.268, "loss/aux_loss": 0.04810608047991991, "loss/crossentropy": 2.704496759176254, "loss/logits": 0.9139129340648651, "step": 13060 }, { "epoch": 0.1307, "grad_norm": 10.5625, "grad_norm_var": 0.2833333333333333, "learning_rate": 0.0003, "loss": 12.3673, "loss/aux_loss": 0.04811034444719553, "loss/crossentropy": 2.8740296959877014, "loss/logits": 0.9611405491828918, "step": 13070 }, { "epoch": 0.1308, "grad_norm": 10.8125, "grad_norm_var": 0.19138997395833332, "learning_rate": 0.0003, "loss": 12.4817, "loss/aux_loss": 0.048108231462538245, "loss/crossentropy": 2.746237635612488, "loss/logits": 0.9631420075893402, "step": 13080 }, { "epoch": 0.1309, "grad_norm": 10.9375, "grad_norm_var": 0.26484375, "learning_rate": 0.0003, "loss": 12.4288, "loss/aux_loss": 0.048110059648752215, "loss/crossentropy": 2.974073600769043, "loss/logits": 0.964441043138504, "step": 13090 }, { "epoch": 0.131, "grad_norm": 11.75, "grad_norm_var": 0.20130208333333333, "learning_rate": 0.0003, "loss": 12.4907, "loss/aux_loss": 0.04810066521167755, "loss/crossentropy": 2.760193109512329, "loss/logits": 0.9369175344705581, "step": 13100 }, { "epoch": 0.1311, "grad_norm": 11.125, "grad_norm_var": 1.0639973958333333, "learning_rate": 0.0003, "loss": 12.464, "loss/aux_loss": 0.04809979852288961, "loss/crossentropy": 2.846820616722107, "loss/logits": 0.9538910329341889, "step": 13110 }, { "epoch": 0.1312, "grad_norm": 10.5625, "grad_norm_var": 0.8995930989583333, "learning_rate": 0.0003, "loss": 12.4394, "loss/aux_loss": 0.048106100782752036, "loss/crossentropy": 2.9314664363861085, "loss/logits": 0.94806087911129, "step": 13120 }, { "epoch": 0.1313, "grad_norm": 11.375, "grad_norm_var": 0.4202962239583333, "learning_rate": 0.0003, "loss": 12.3422, "loss/aux_loss": 0.0481000566855073, "loss/crossentropy": 2.8791940450668334, "loss/logits": 0.9509881615638733, "step": 13130 }, { "epoch": 0.1314, "grad_norm": 10.125, "grad_norm_var": 0.27615559895833336, "learning_rate": 0.0003, "loss": 12.4757, "loss/aux_loss": 0.048098215088248256, "loss/crossentropy": 2.9675530552864076, "loss/logits": 0.9818042993545533, "step": 13140 }, { "epoch": 0.1315, "grad_norm": 10.625, "grad_norm_var": 0.2618326822916667, "learning_rate": 0.0003, "loss": 12.336, "loss/aux_loss": 0.04810485653579235, "loss/crossentropy": 2.804446077346802, "loss/logits": 0.9385320395231247, "step": 13150 }, { "epoch": 0.1316, "grad_norm": 11.6875, "grad_norm_var": 0.24368489583333333, "learning_rate": 0.0003, "loss": 12.4802, "loss/aux_loss": 0.04809897020459175, "loss/crossentropy": 2.8940049529075624, "loss/logits": 0.9573934972286224, "step": 13160 }, { "epoch": 0.1317, "grad_norm": 10.625, "grad_norm_var": 0.30271809895833335, "learning_rate": 0.0003, "loss": 12.3512, "loss/aux_loss": 0.04811223279684782, "loss/crossentropy": 2.7365167438983917, "loss/logits": 0.9326226800680161, "step": 13170 }, { "epoch": 0.1318, "grad_norm": 10.625, "grad_norm_var": 0.19765625, "learning_rate": 0.0003, "loss": 12.4848, "loss/aux_loss": 0.048102138377726075, "loss/crossentropy": 2.780118942260742, "loss/logits": 0.9540079593658447, "step": 13180 }, { "epoch": 0.1319, "grad_norm": 10.6875, "grad_norm_var": 0.9747233072916667, "learning_rate": 0.0003, "loss": 12.3066, "loss/aux_loss": 0.0481060640886426, "loss/crossentropy": 2.848835837841034, "loss/logits": 0.9211630582809448, "step": 13190 }, { "epoch": 0.132, "grad_norm": 10.6875, "grad_norm_var": 0.92421875, "learning_rate": 0.0003, "loss": 12.4287, "loss/aux_loss": 0.048100389540195465, "loss/crossentropy": 2.8563956737518312, "loss/logits": 0.9615561842918396, "step": 13200 }, { "epoch": 0.1321, "grad_norm": 10.8125, "grad_norm_var": 0.39296875, "learning_rate": 0.0003, "loss": 12.4611, "loss/aux_loss": 0.04809982106089592, "loss/crossentropy": 2.99262011051178, "loss/logits": 0.9975022733211517, "step": 13210 }, { "epoch": 0.1322, "grad_norm": 10.0, "grad_norm_var": 0.5723795572916667, "learning_rate": 0.0003, "loss": 12.3749, "loss/aux_loss": 0.048114350996911526, "loss/crossentropy": 2.715546762943268, "loss/logits": 0.9087715715169906, "step": 13220 }, { "epoch": 0.1323, "grad_norm": 11.4375, "grad_norm_var": 0.8630045572916667, "learning_rate": 0.0003, "loss": 12.3739, "loss/aux_loss": 0.048113764822483064, "loss/crossentropy": 2.8105762124061586, "loss/logits": 0.9538555532693863, "step": 13230 }, { "epoch": 0.1324, "grad_norm": 11.9375, "grad_norm_var": 19.576025390625, "learning_rate": 0.0003, "loss": 12.2992, "loss/aux_loss": 0.04810525067150593, "loss/crossentropy": 2.91482892036438, "loss/logits": 0.9708419471979142, "step": 13240 }, { "epoch": 0.1325, "grad_norm": 11.375, "grad_norm_var": 0.3712076822916667, "learning_rate": 0.0003, "loss": 12.35, "loss/aux_loss": 0.04810534752905369, "loss/crossentropy": 2.8350456237792967, "loss/logits": 0.9504047840833664, "step": 13250 }, { "epoch": 0.1326, "grad_norm": 10.75, "grad_norm_var": 0.229931640625, "learning_rate": 0.0003, "loss": 12.5435, "loss/aux_loss": 0.048105095699429515, "loss/crossentropy": 2.9701404571533203, "loss/logits": 0.9582396388053894, "step": 13260 }, { "epoch": 0.1327, "grad_norm": 10.6875, "grad_norm_var": 0.201416015625, "learning_rate": 0.0003, "loss": 12.3695, "loss/aux_loss": 0.04809423070400953, "loss/crossentropy": 2.981611502170563, "loss/logits": 0.9848940640687942, "step": 13270 }, { "epoch": 0.1328, "grad_norm": 10.8125, "grad_norm_var": 0.204931640625, "learning_rate": 0.0003, "loss": 12.2578, "loss/aux_loss": 0.04810588490217924, "loss/crossentropy": 2.786838227510452, "loss/logits": 0.9377759993076324, "step": 13280 }, { "epoch": 0.1329, "grad_norm": 10.3125, "grad_norm_var": 0.354541015625, "learning_rate": 0.0003, "loss": 12.4597, "loss/aux_loss": 0.04810259565711021, "loss/crossentropy": 2.8435731649398805, "loss/logits": 0.9304347574710846, "step": 13290 }, { "epoch": 0.133, "grad_norm": 10.625, "grad_norm_var": 0.2581868489583333, "learning_rate": 0.0003, "loss": 12.3977, "loss/aux_loss": 0.0481047386303544, "loss/crossentropy": 2.893440508842468, "loss/logits": 0.9740776270627975, "step": 13300 }, { "epoch": 0.1331, "grad_norm": 10.3125, "grad_norm_var": 1.2949055989583333, "learning_rate": 0.0003, "loss": 12.3455, "loss/aux_loss": 0.04810352213680744, "loss/crossentropy": 2.6359397768974304, "loss/logits": 0.9171884417533874, "step": 13310 }, { "epoch": 0.1332, "grad_norm": 12.1875, "grad_norm_var": 1.2700358072916667, "learning_rate": 0.0003, "loss": 12.3154, "loss/aux_loss": 0.0481058057397604, "loss/crossentropy": 2.766212022304535, "loss/logits": 0.9359346807003022, "step": 13320 }, { "epoch": 0.1333, "grad_norm": 10.0, "grad_norm_var": 0.6723307291666667, "learning_rate": 0.0003, "loss": 12.4084, "loss/aux_loss": 0.048116312362253666, "loss/crossentropy": 2.84612637758255, "loss/logits": 0.939299488067627, "step": 13330 }, { "epoch": 0.1334, "grad_norm": 10.8125, "grad_norm_var": 0.555322265625, "learning_rate": 0.0003, "loss": 12.4866, "loss/aux_loss": 0.04810191094875336, "loss/crossentropy": 2.958892011642456, "loss/logits": 0.9613621711730957, "step": 13340 }, { "epoch": 0.1335, "grad_norm": 10.875, "grad_norm_var": 0.30149739583333335, "learning_rate": 0.0003, "loss": 12.4951, "loss/aux_loss": 0.04811299704015255, "loss/crossentropy": 2.949324941635132, "loss/logits": 0.9427460253238678, "step": 13350 }, { "epoch": 0.1336, "grad_norm": 10.0, "grad_norm_var": 0.17420247395833333, "learning_rate": 0.0003, "loss": 12.3633, "loss/aux_loss": 0.04811058808118105, "loss/crossentropy": 2.9030325174331666, "loss/logits": 0.9233207911252975, "step": 13360 }, { "epoch": 0.1337, "grad_norm": 12.625, "grad_norm_var": 0.3277180989583333, "learning_rate": 0.0003, "loss": 12.1642, "loss/aux_loss": 0.04810114298015833, "loss/crossentropy": 2.709307849407196, "loss/logits": 0.9239124625921249, "step": 13370 }, { "epoch": 0.1338, "grad_norm": 10.625, "grad_norm_var": 0.30514322916666664, "learning_rate": 0.0003, "loss": 12.4912, "loss/aux_loss": 0.04811024907976389, "loss/crossentropy": 2.8618651926517487, "loss/logits": 0.9447506815195084, "step": 13380 }, { "epoch": 0.1339, "grad_norm": 11.6875, "grad_norm_var": 0.19166666666666668, "learning_rate": 0.0003, "loss": 12.4706, "loss/aux_loss": 0.04810334574431181, "loss/crossentropy": 2.9332224130630493, "loss/logits": 0.9420458465814591, "step": 13390 }, { "epoch": 0.134, "grad_norm": 10.6875, "grad_norm_var": 0.23748372395833334, "learning_rate": 0.0003, "loss": 12.2177, "loss/aux_loss": 0.04811100345104933, "loss/crossentropy": 2.896355766057968, "loss/logits": 0.9644664227962494, "step": 13400 }, { "epoch": 0.1341, "grad_norm": 11.0, "grad_norm_var": 0.44021809895833336, "learning_rate": 0.0003, "loss": 12.4903, "loss/aux_loss": 0.04810348581522703, "loss/crossentropy": 2.7759104132652284, "loss/logits": 0.9495417177677155, "step": 13410 }, { "epoch": 0.1342, "grad_norm": 10.0, "grad_norm_var": 0.492822265625, "learning_rate": 0.0003, "loss": 12.4067, "loss/aux_loss": 0.04810683950781822, "loss/crossentropy": 2.804324197769165, "loss/logits": 0.9235481023788452, "step": 13420 }, { "epoch": 0.1343, "grad_norm": 10.75, "grad_norm_var": 1.1260416666666666, "learning_rate": 0.0003, "loss": 12.2612, "loss/aux_loss": 0.04810345564037562, "loss/crossentropy": 2.7631209015846254, "loss/logits": 0.966147831082344, "step": 13430 }, { "epoch": 0.1344, "grad_norm": 11.3125, "grad_norm_var": 0.8817545572916666, "learning_rate": 0.0003, "loss": 12.3751, "loss/aux_loss": 0.048112759739160536, "loss/crossentropy": 2.9659682273864747, "loss/logits": 0.9447232961654664, "step": 13440 }, { "epoch": 0.1345, "grad_norm": 11.9375, "grad_norm_var": 0.4984212239583333, "learning_rate": 0.0003, "loss": 12.3162, "loss/aux_loss": 0.048107668198645114, "loss/crossentropy": 2.83985230922699, "loss/logits": 0.9504481822252273, "step": 13450 }, { "epoch": 0.1346, "grad_norm": 10.625, "grad_norm_var": 0.3667805989583333, "learning_rate": 0.0003, "loss": 12.1897, "loss/aux_loss": 0.04810948856174946, "loss/crossentropy": 2.7549479007720947, "loss/logits": 0.9273734211921691, "step": 13460 }, { "epoch": 0.1347, "grad_norm": 12.375, "grad_norm_var": 0.31764322916666665, "learning_rate": 0.0003, "loss": 12.2581, "loss/aux_loss": 0.0481115635484457, "loss/crossentropy": 2.7301569998264315, "loss/logits": 0.9274598181247711, "step": 13470 }, { "epoch": 0.1348, "grad_norm": 11.625, "grad_norm_var": 0.32146809895833334, "learning_rate": 0.0003, "loss": 12.2671, "loss/aux_loss": 0.04809574782848358, "loss/crossentropy": 2.7924930095672607, "loss/logits": 0.9410725235939026, "step": 13480 }, { "epoch": 0.1349, "grad_norm": 12.375, "grad_norm_var": 0.30520833333333336, "learning_rate": 0.0003, "loss": 12.4427, "loss/aux_loss": 0.048100709170103076, "loss/crossentropy": 2.8529832124710084, "loss/logits": 0.964218020439148, "step": 13490 }, { "epoch": 0.135, "grad_norm": 11.6875, "grad_norm_var": 0.48019205729166664, "learning_rate": 0.0003, "loss": 12.5486, "loss/aux_loss": 0.04810782596468925, "loss/crossentropy": 2.767691594362259, "loss/logits": 0.9534753412008286, "step": 13500 }, { "epoch": 0.1351, "grad_norm": 11.0625, "grad_norm_var": 0.3384765625, "learning_rate": 0.0003, "loss": 12.4671, "loss/aux_loss": 0.048108152486383914, "loss/crossentropy": 2.840370202064514, "loss/logits": 0.9720666646957398, "step": 13510 }, { "epoch": 0.1352, "grad_norm": 10.6875, "grad_norm_var": 0.40232747395833335, "learning_rate": 0.0003, "loss": 12.3131, "loss/aux_loss": 0.04810761827975511, "loss/crossentropy": 2.7722171783447265, "loss/logits": 0.919560182094574, "step": 13520 }, { "epoch": 0.1353, "grad_norm": 11.125, "grad_norm_var": 0.27265625, "learning_rate": 0.0003, "loss": 12.3471, "loss/aux_loss": 0.048098478280007836, "loss/crossentropy": 2.7604997634887694, "loss/logits": 0.9506667792797089, "step": 13530 }, { "epoch": 0.1354, "grad_norm": 10.5625, "grad_norm_var": 0.23385416666666667, "learning_rate": 0.0003, "loss": 12.5057, "loss/aux_loss": 0.048096814006567, "loss/crossentropy": 2.93693727850914, "loss/logits": 1.006801837682724, "step": 13540 }, { "epoch": 0.1355, "grad_norm": 10.5, "grad_norm_var": 0.23357747395833334, "learning_rate": 0.0003, "loss": 12.4151, "loss/aux_loss": 0.04810099713504314, "loss/crossentropy": 2.9465499818325043, "loss/logits": 0.9544957995414733, "step": 13550 }, { "epoch": 0.1356, "grad_norm": 11.1875, "grad_norm_var": 0.29322916666666665, "learning_rate": 0.0003, "loss": 12.4663, "loss/aux_loss": 0.04810344949364662, "loss/crossentropy": 2.783466875553131, "loss/logits": 0.982630443572998, "step": 13560 }, { "epoch": 0.1357, "grad_norm": 12.0, "grad_norm_var": 0.36451822916666665, "learning_rate": 0.0003, "loss": 12.3713, "loss/aux_loss": 0.04810951203107834, "loss/crossentropy": 2.66209716796875, "loss/logits": 0.9058698862791061, "step": 13570 }, { "epoch": 0.1358, "grad_norm": 12.125, "grad_norm_var": 0.5755208333333334, "learning_rate": 0.0003, "loss": 12.4161, "loss/aux_loss": 0.04810614287853241, "loss/crossentropy": 2.793833488225937, "loss/logits": 0.908539018034935, "step": 13580 }, { "epoch": 0.1359, "grad_norm": 10.6875, "grad_norm_var": 0.36495768229166664, "learning_rate": 0.0003, "loss": 12.4122, "loss/aux_loss": 0.048112927563488485, "loss/crossentropy": 2.7475938618183138, "loss/logits": 0.9510286509990692, "step": 13590 }, { "epoch": 0.136, "grad_norm": 11.375, "grad_norm_var": 0.23639322916666666, "learning_rate": 0.0003, "loss": 12.2558, "loss/aux_loss": 0.048097974807024005, "loss/crossentropy": 2.8546653985977173, "loss/logits": 0.9764155447483063, "step": 13600 }, { "epoch": 0.1361, "grad_norm": 10.25, "grad_norm_var": 0.2652180989583333, "learning_rate": 0.0003, "loss": 12.3507, "loss/aux_loss": 0.048107730224728584, "loss/crossentropy": 2.7831094443798063, "loss/logits": 0.933989730477333, "step": 13610 }, { "epoch": 0.1362, "grad_norm": 10.8125, "grad_norm_var": 0.27076822916666665, "learning_rate": 0.0003, "loss": 12.4364, "loss/aux_loss": 0.04810614828020334, "loss/crossentropy": 2.8577419936656954, "loss/logits": 0.9404568552970887, "step": 13620 }, { "epoch": 0.1363, "grad_norm": 10.875, "grad_norm_var": 0.12902018229166667, "learning_rate": 0.0003, "loss": 12.2498, "loss/aux_loss": 0.04811039827764034, "loss/crossentropy": 2.732570058107376, "loss/logits": 0.9277025848627091, "step": 13630 }, { "epoch": 0.1364, "grad_norm": 11.9375, "grad_norm_var": 0.34055989583333335, "learning_rate": 0.0003, "loss": 12.2871, "loss/aux_loss": 0.04810627643018961, "loss/crossentropy": 2.8617196679115295, "loss/logits": 0.9472320884466171, "step": 13640 }, { "epoch": 0.1365, "grad_norm": 11.1875, "grad_norm_var": 0.4354166666666667, "learning_rate": 0.0003, "loss": 12.2645, "loss/aux_loss": 0.048111869394779204, "loss/crossentropy": 2.8335381925106047, "loss/logits": 0.9731518387794494, "step": 13650 }, { "epoch": 0.1366, "grad_norm": 10.9375, "grad_norm_var": 0.267822265625, "learning_rate": 0.0003, "loss": 12.1778, "loss/aux_loss": 0.048117116838693616, "loss/crossentropy": 2.772057569026947, "loss/logits": 0.9199839055538177, "step": 13660 }, { "epoch": 0.1367, "grad_norm": 10.8125, "grad_norm_var": 0.24348958333333334, "learning_rate": 0.0003, "loss": 12.2065, "loss/aux_loss": 0.04810203909873963, "loss/crossentropy": 2.8068443894386292, "loss/logits": 0.9232182204723358, "step": 13670 }, { "epoch": 0.1368, "grad_norm": 10.875, "grad_norm_var": 0.48639322916666666, "learning_rate": 0.0003, "loss": 12.304, "loss/aux_loss": 0.048113486543297765, "loss/crossentropy": 2.8098415970802306, "loss/logits": 0.9973312526941299, "step": 13680 }, { "epoch": 0.1369, "grad_norm": 11.5625, "grad_norm_var": 0.769775390625, "learning_rate": 0.0003, "loss": 12.3701, "loss/aux_loss": 0.048099438101053237, "loss/crossentropy": 2.9046237051486967, "loss/logits": 0.9579191863536834, "step": 13690 }, { "epoch": 0.137, "grad_norm": 10.6875, "grad_norm_var": 0.4046875, "learning_rate": 0.0003, "loss": 12.563, "loss/aux_loss": 0.04810395799577236, "loss/crossentropy": 2.983587795495987, "loss/logits": 0.9559501677751541, "step": 13700 }, { "epoch": 0.1371, "grad_norm": 10.25, "grad_norm_var": 0.6278483072916666, "learning_rate": 0.0003, "loss": 12.2734, "loss/aux_loss": 0.04810773227363825, "loss/crossentropy": 2.8191973209381103, "loss/logits": 0.92610003054142, "step": 13710 }, { "epoch": 0.1372, "grad_norm": 11.6875, "grad_norm_var": 0.46066080729166664, "learning_rate": 0.0003, "loss": 12.3531, "loss/aux_loss": 0.04809810016304254, "loss/crossentropy": 2.871203887462616, "loss/logits": 0.9389143049716949, "step": 13720 }, { "epoch": 0.1373, "grad_norm": 11.125, "grad_norm_var": 0.2565104166666667, "learning_rate": 0.0003, "loss": 12.2326, "loss/aux_loss": 0.04810184333473444, "loss/crossentropy": 2.794565808773041, "loss/logits": 0.9445665180683136, "step": 13730 }, { "epoch": 0.1374, "grad_norm": 11.9375, "grad_norm_var": 0.22604166666666667, "learning_rate": 0.0003, "loss": 12.3533, "loss/aux_loss": 0.048102909699082375, "loss/crossentropy": 2.8416384637355803, "loss/logits": 0.9654471457004548, "step": 13740 }, { "epoch": 0.1375, "grad_norm": 10.1875, "grad_norm_var": 0.3582682291666667, "learning_rate": 0.0003, "loss": 12.327, "loss/aux_loss": 0.04810390882194042, "loss/crossentropy": 2.8315653204917908, "loss/logits": 0.9377313375473022, "step": 13750 }, { "epoch": 0.1376, "grad_norm": 10.3125, "grad_norm_var": 0.36769205729166665, "learning_rate": 0.0003, "loss": 12.4527, "loss/aux_loss": 0.04810698907822371, "loss/crossentropy": 2.719471883773804, "loss/logits": 0.9382916927337647, "step": 13760 }, { "epoch": 0.1377, "grad_norm": 11.125, "grad_norm_var": 0.21555989583333332, "learning_rate": 0.0003, "loss": 12.2269, "loss/aux_loss": 0.048097971081733706, "loss/crossentropy": 2.861399304866791, "loss/logits": 0.9493412613868714, "step": 13770 }, { "epoch": 0.1378, "grad_norm": 11.75, "grad_norm_var": 0.6538899739583334, "learning_rate": 0.0003, "loss": 12.3158, "loss/aux_loss": 0.048107971996068956, "loss/crossentropy": 2.840020203590393, "loss/logits": 0.9154278337955475, "step": 13780 }, { "epoch": 0.1379, "grad_norm": 10.6875, "grad_norm_var": 0.37862955729166664, "learning_rate": 0.0003, "loss": 12.3169, "loss/aux_loss": 0.04810533430427313, "loss/crossentropy": 2.7927276849746705, "loss/logits": 0.9583002328872681, "step": 13790 }, { "epoch": 0.138, "grad_norm": 11.0, "grad_norm_var": 0.22161458333333334, "learning_rate": 0.0003, "loss": 12.3317, "loss/aux_loss": 0.04810004401952028, "loss/crossentropy": 2.894696593284607, "loss/logits": 0.9791848719120025, "step": 13800 }, { "epoch": 0.1381, "grad_norm": 11.3125, "grad_norm_var": 0.30201822916666665, "learning_rate": 0.0003, "loss": 12.3781, "loss/aux_loss": 0.04811547808349133, "loss/crossentropy": 2.8908798456192017, "loss/logits": 0.894439697265625, "step": 13810 }, { "epoch": 0.1382, "grad_norm": 12.0, "grad_norm_var": 0.24230143229166667, "learning_rate": 0.0003, "loss": 12.3648, "loss/aux_loss": 0.048103314451873304, "loss/crossentropy": 2.6465035498142244, "loss/logits": 0.9376543581485748, "step": 13820 }, { "epoch": 0.1383, "grad_norm": 11.375, "grad_norm_var": 0.21555989583333332, "learning_rate": 0.0003, "loss": 12.3217, "loss/aux_loss": 0.048104156740009785, "loss/crossentropy": 2.957222414016724, "loss/logits": 0.9343441456556321, "step": 13830 }, { "epoch": 0.1384, "grad_norm": 10.5, "grad_norm_var": 0.33697916666666666, "learning_rate": 0.0003, "loss": 12.1056, "loss/aux_loss": 0.04809317253530025, "loss/crossentropy": 2.6575527429580688, "loss/logits": 0.9211295455694198, "step": 13840 }, { "epoch": 0.1385, "grad_norm": 12.25, "grad_norm_var": 0.41139322916666665, "learning_rate": 0.0003, "loss": 12.3165, "loss/aux_loss": 0.04810005649924278, "loss/crossentropy": 2.8109684944152833, "loss/logits": 0.9039525598287582, "step": 13850 }, { "epoch": 0.1386, "grad_norm": 11.125, "grad_norm_var": 0.30462239583333334, "learning_rate": 0.0003, "loss": 12.4473, "loss/aux_loss": 0.048096515238285065, "loss/crossentropy": 2.7612143099308013, "loss/logits": 0.9181933552026749, "step": 13860 }, { "epoch": 0.1387, "grad_norm": 10.4375, "grad_norm_var": 0.15623372395833332, "learning_rate": 0.0003, "loss": 12.3176, "loss/aux_loss": 0.04810582865029574, "loss/crossentropy": 2.8384499669075014, "loss/logits": 0.9379000872373581, "step": 13870 }, { "epoch": 0.1388, "grad_norm": 11.0625, "grad_norm_var": 0.33839518229166665, "learning_rate": 0.0003, "loss": 12.2429, "loss/aux_loss": 0.04810143150389194, "loss/crossentropy": 2.819534254074097, "loss/logits": 0.9464493721723557, "step": 13880 }, { "epoch": 0.1389, "grad_norm": 10.625, "grad_norm_var": 0.3815104166666667, "learning_rate": 0.0003, "loss": 12.5049, "loss/aux_loss": 0.04809562023729086, "loss/crossentropy": 2.858844381570816, "loss/logits": 0.9540527880191803, "step": 13890 }, { "epoch": 0.139, "grad_norm": 11.5, "grad_norm_var": 0.4141764322916667, "learning_rate": 0.0003, "loss": 12.3641, "loss/aux_loss": 0.04810257162898779, "loss/crossentropy": 2.8486937463283537, "loss/logits": 0.962219363451004, "step": 13900 }, { "epoch": 0.1391, "grad_norm": 10.75, "grad_norm_var": 0.2528483072916667, "learning_rate": 0.0003, "loss": 12.3461, "loss/aux_loss": 0.048107230477035044, "loss/crossentropy": 2.6285312592983248, "loss/logits": 0.9440800845623016, "step": 13910 }, { "epoch": 0.1392, "grad_norm": 11.5, "grad_norm_var": 0.6181640625, "learning_rate": 0.0003, "loss": 12.2632, "loss/aux_loss": 0.048101754114031794, "loss/crossentropy": 2.7042616248130797, "loss/logits": 0.911108523607254, "step": 13920 }, { "epoch": 0.1393, "grad_norm": 10.5, "grad_norm_var": 0.391650390625, "learning_rate": 0.0003, "loss": 12.4164, "loss/aux_loss": 0.048104698024690154, "loss/crossentropy": 2.760655826330185, "loss/logits": 0.9386287301778793, "step": 13930 }, { "epoch": 0.1394, "grad_norm": 11.375, "grad_norm_var": 0.17526041666666667, "learning_rate": 0.0003, "loss": 12.3058, "loss/aux_loss": 0.04810354206711054, "loss/crossentropy": 2.8063110530376436, "loss/logits": 0.9448209255933762, "step": 13940 }, { "epoch": 0.1395, "grad_norm": 10.6875, "grad_norm_var": 0.106494140625, "learning_rate": 0.0003, "loss": 12.3086, "loss/aux_loss": 0.048095726780593394, "loss/crossentropy": 2.791933035850525, "loss/logits": 0.9196500927209854, "step": 13950 }, { "epoch": 0.1396, "grad_norm": 11.125, "grad_norm_var": 0.4676432291666667, "learning_rate": 0.0003, "loss": 12.4126, "loss/aux_loss": 0.04809935782104731, "loss/crossentropy": 2.8098475694656373, "loss/logits": 0.9401834368705749, "step": 13960 }, { "epoch": 0.1397, "grad_norm": 11.75, "grad_norm_var": 0.23430989583333334, "learning_rate": 0.0003, "loss": 11.9704, "loss/aux_loss": 0.04809695854783058, "loss/crossentropy": 2.8572974622249605, "loss/logits": 0.9450656235218048, "step": 13970 }, { "epoch": 0.1398, "grad_norm": 11.5, "grad_norm_var": 0.5817708333333333, "learning_rate": 0.0003, "loss": 12.2933, "loss/aux_loss": 0.04811172261834144, "loss/crossentropy": 2.7184654772281647, "loss/logits": 1.0148230105638505, "step": 13980 }, { "epoch": 0.1399, "grad_norm": 10.3125, "grad_norm_var": 0.6473307291666667, "learning_rate": 0.0003, "loss": 12.3804, "loss/aux_loss": 0.048092580400407314, "loss/crossentropy": 2.8070730805397033, "loss/logits": 0.9323766380548477, "step": 13990 }, { "epoch": 0.14, "grad_norm": 12.4375, "grad_norm_var": 7.025260416666667, "learning_rate": 0.0003, "loss": 12.5511, "loss/aux_loss": 0.048104867339134216, "loss/crossentropy": 2.888067865371704, "loss/logits": 0.9707460403442383, "step": 14000 }, { "epoch": 0.1401, "grad_norm": 10.375, "grad_norm_var": 6.564957682291666, "learning_rate": 0.0003, "loss": 12.2582, "loss/aux_loss": 0.04810222536325455, "loss/crossentropy": 2.671162748336792, "loss/logits": 0.9309888124465943, "step": 14010 }, { "epoch": 0.1402, "grad_norm": 12.25, "grad_norm_var": 0.7313639322916666, "learning_rate": 0.0003, "loss": 12.3619, "loss/aux_loss": 0.04810118060559034, "loss/crossentropy": 2.8578037440776827, "loss/logits": 0.9234833359718323, "step": 14020 }, { "epoch": 0.1403, "grad_norm": 9.875, "grad_norm_var": 0.919775390625, "learning_rate": 0.0003, "loss": 12.2416, "loss/aux_loss": 0.04809358064085245, "loss/crossentropy": 2.9598345518112184, "loss/logits": 0.9572826653718949, "step": 14030 }, { "epoch": 0.1404, "grad_norm": 10.9375, "grad_norm_var": 0.23748372395833334, "learning_rate": 0.0003, "loss": 12.2382, "loss/aux_loss": 0.04810269232839346, "loss/crossentropy": 2.7237884759902955, "loss/logits": 0.9275262981653214, "step": 14040 }, { "epoch": 0.1405, "grad_norm": 13.3125, "grad_norm_var": 0.4171875, "learning_rate": 0.0003, "loss": 12.6322, "loss/aux_loss": 0.04810795094817877, "loss/crossentropy": 2.8176922678947447, "loss/logits": 0.9676473349332809, "step": 14050 }, { "epoch": 0.1406, "grad_norm": 11.0, "grad_norm_var": 0.517431640625, "learning_rate": 0.0003, "loss": 12.3324, "loss/aux_loss": 0.04809824340045452, "loss/crossentropy": 2.830560302734375, "loss/logits": 0.9444521218538284, "step": 14060 }, { "epoch": 0.1407, "grad_norm": 11.1875, "grad_norm_var": 0.38448893229166664, "learning_rate": 0.0003, "loss": 12.2942, "loss/aux_loss": 0.048107242211699486, "loss/crossentropy": 2.8280949234962462, "loss/logits": 0.9159689843654633, "step": 14070 }, { "epoch": 0.1408, "grad_norm": 10.8125, "grad_norm_var": 21.600244140625, "learning_rate": 0.0003, "loss": 12.1401, "loss/aux_loss": 0.048114721104502677, "loss/crossentropy": 2.805542439222336, "loss/logits": 0.9222520262002945, "step": 14080 }, { "epoch": 0.1409, "grad_norm": 10.875, "grad_norm_var": 0.7968587239583333, "learning_rate": 0.0003, "loss": 12.3523, "loss/aux_loss": 0.048097500950098036, "loss/crossentropy": 2.820968973636627, "loss/logits": 0.9572250634431839, "step": 14090 }, { "epoch": 0.141, "grad_norm": 10.375, "grad_norm_var": 0.4398274739583333, "learning_rate": 0.0003, "loss": 12.3109, "loss/aux_loss": 0.04811274372041226, "loss/crossentropy": 2.873396396636963, "loss/logits": 0.9583924978971481, "step": 14100 }, { "epoch": 0.1411, "grad_norm": 11.0, "grad_norm_var": 0.26886393229166666, "learning_rate": 0.0003, "loss": 12.4389, "loss/aux_loss": 0.04809671007096768, "loss/crossentropy": 2.81766881942749, "loss/logits": 0.9768635481595993, "step": 14110 }, { "epoch": 0.1412, "grad_norm": 11.8125, "grad_norm_var": 0.1962890625, "learning_rate": 0.0003, "loss": 12.124, "loss/aux_loss": 0.048110161907970905, "loss/crossentropy": 2.7933003425598146, "loss/logits": 0.9018822848796845, "step": 14120 }, { "epoch": 0.1413, "grad_norm": 11.8125, "grad_norm_var": 0.478369140625, "learning_rate": 0.0003, "loss": 12.3726, "loss/aux_loss": 0.04809680469334125, "loss/crossentropy": 2.961318391561508, "loss/logits": 0.9694911539554596, "step": 14130 }, { "epoch": 0.1414, "grad_norm": 11.0625, "grad_norm_var": 0.30388997395833334, "learning_rate": 0.0003, "loss": 12.3803, "loss/aux_loss": 0.048111779242753984, "loss/crossentropy": 2.7930466175079345, "loss/logits": 0.9387133151292801, "step": 14140 }, { "epoch": 0.1415, "grad_norm": 11.0625, "grad_norm_var": 0.327587890625, "learning_rate": 0.0003, "loss": 12.3241, "loss/aux_loss": 0.048099853470921515, "loss/crossentropy": 2.7507693111896514, "loss/logits": 0.923522162437439, "step": 14150 }, { "epoch": 0.1416, "grad_norm": 9.8125, "grad_norm_var": 0.2556640625, "learning_rate": 0.0003, "loss": 12.1408, "loss/aux_loss": 0.04810427725315094, "loss/crossentropy": 2.591277301311493, "loss/logits": 0.9115919172763824, "step": 14160 }, { "epoch": 0.1417, "grad_norm": 10.75, "grad_norm_var": 0.335009765625, "learning_rate": 0.0003, "loss": 12.3768, "loss/aux_loss": 0.04809947330504656, "loss/crossentropy": 2.9588594675064086, "loss/logits": 0.9485181331634521, "step": 14170 }, { "epoch": 0.1418, "grad_norm": 10.75, "grad_norm_var": 0.44698893229166664, "learning_rate": 0.0003, "loss": 12.4181, "loss/aux_loss": 0.04810207560658455, "loss/crossentropy": 2.9570438385009767, "loss/logits": 0.939887073636055, "step": 14180 }, { "epoch": 0.1419, "grad_norm": 11.8125, "grad_norm_var": 0.3916015625, "learning_rate": 0.0003, "loss": 12.401, "loss/aux_loss": 0.0481014484539628, "loss/crossentropy": 2.912750172615051, "loss/logits": 0.9410306662321091, "step": 14190 }, { "epoch": 0.142, "grad_norm": 11.6875, "grad_norm_var": 0.2747395833333333, "learning_rate": 0.0003, "loss": 12.3249, "loss/aux_loss": 0.04810392800718546, "loss/crossentropy": 2.840937912464142, "loss/logits": 0.9358460456132889, "step": 14200 }, { "epoch": 0.1421, "grad_norm": 10.875, "grad_norm_var": 0.3424479166666667, "learning_rate": 0.0003, "loss": 12.1907, "loss/aux_loss": 0.048108947835862635, "loss/crossentropy": 2.7028416991233826, "loss/logits": 0.9135666370391846, "step": 14210 }, { "epoch": 0.1422, "grad_norm": 10.6875, "grad_norm_var": 0.391650390625, "learning_rate": 0.0003, "loss": 12.2893, "loss/aux_loss": 0.04810764603316784, "loss/crossentropy": 2.7547565340995788, "loss/logits": 0.9308681100606918, "step": 14220 }, { "epoch": 0.1423, "grad_norm": 11.625, "grad_norm_var": 0.134228515625, "learning_rate": 0.0003, "loss": 12.4046, "loss/aux_loss": 0.0481036901473999, "loss/crossentropy": 2.8777437806129456, "loss/logits": 0.9518471479415893, "step": 14230 }, { "epoch": 0.1424, "grad_norm": 11.9375, "grad_norm_var": 0.343603515625, "learning_rate": 0.0003, "loss": 12.2732, "loss/aux_loss": 0.04810591135174036, "loss/crossentropy": 2.758294236660004, "loss/logits": 0.9424175173044205, "step": 14240 }, { "epoch": 0.1425, "grad_norm": 11.0, "grad_norm_var": 0.365087890625, "learning_rate": 0.0003, "loss": 12.2751, "loss/aux_loss": 0.04810457993298769, "loss/crossentropy": 2.7782435297966, "loss/logits": 0.9354342728853225, "step": 14250 }, { "epoch": 0.1426, "grad_norm": 11.1875, "grad_norm_var": 0.2275390625, "learning_rate": 0.0003, "loss": 12.3644, "loss/aux_loss": 0.04809907414019108, "loss/crossentropy": 2.6942915558815, "loss/logits": 0.9781391233205795, "step": 14260 }, { "epoch": 0.1427, "grad_norm": 11.25, "grad_norm_var": 0.15818684895833332, "learning_rate": 0.0003, "loss": 12.2785, "loss/aux_loss": 0.04809475652873516, "loss/crossentropy": 2.8013816356658934, "loss/logits": 0.9540988564491272, "step": 14270 }, { "epoch": 0.1428, "grad_norm": 10.8125, "grad_norm_var": 0.174072265625, "learning_rate": 0.0003, "loss": 12.274, "loss/aux_loss": 0.048094392754137516, "loss/crossentropy": 2.7784948647022247, "loss/logits": 0.9767741382122039, "step": 14280 }, { "epoch": 0.1429, "grad_norm": 12.1875, "grad_norm_var": 0.49412434895833335, "learning_rate": 0.0003, "loss": 12.0625, "loss/aux_loss": 0.04810249712318182, "loss/crossentropy": 2.7680072247982026, "loss/logits": 0.9276201993227005, "step": 14290 }, { "epoch": 0.143, "grad_norm": 10.8125, "grad_norm_var": 0.6501139322916667, "learning_rate": 0.0003, "loss": 12.377, "loss/aux_loss": 0.04810411389917135, "loss/crossentropy": 2.857174110412598, "loss/logits": 0.9727380841970443, "step": 14300 }, { "epoch": 0.1431, "grad_norm": 10.25, "grad_norm_var": 0.37890625, "learning_rate": 0.0003, "loss": 12.3747, "loss/aux_loss": 0.04810033030807972, "loss/crossentropy": 2.8648359537124635, "loss/logits": 0.9480609089136124, "step": 14310 }, { "epoch": 0.1432, "grad_norm": 11.125, "grad_norm_var": 0.42630208333333336, "learning_rate": 0.0003, "loss": 12.3662, "loss/aux_loss": 0.04809709247201681, "loss/crossentropy": 2.898632252216339, "loss/logits": 0.9568489253520965, "step": 14320 }, { "epoch": 0.1433, "grad_norm": 12.375, "grad_norm_var": 0.4400390625, "learning_rate": 0.0003, "loss": 12.0264, "loss/aux_loss": 0.048100917227566244, "loss/crossentropy": 2.740042132139206, "loss/logits": 0.9168058276176453, "step": 14330 }, { "epoch": 0.1434, "grad_norm": 11.5, "grad_norm_var": 0.25909830729166666, "learning_rate": 0.0003, "loss": 12.2735, "loss/aux_loss": 0.048106574639678004, "loss/crossentropy": 2.599212634563446, "loss/logits": 0.9046968847513199, "step": 14340 }, { "epoch": 0.1435, "grad_norm": 11.375, "grad_norm_var": 0.18430989583333332, "learning_rate": 0.0003, "loss": 12.3895, "loss/aux_loss": 0.0481049045920372, "loss/crossentropy": 2.817890876531601, "loss/logits": 0.9239292711019516, "step": 14350 }, { "epoch": 0.1436, "grad_norm": 11.8125, "grad_norm_var": 0.17701822916666668, "learning_rate": 0.0003, "loss": 12.1993, "loss/aux_loss": 0.048096088133752345, "loss/crossentropy": 2.841284441947937, "loss/logits": 0.9078822374343872, "step": 14360 }, { "epoch": 0.1437, "grad_norm": 12.1875, "grad_norm_var": 0.334228515625, "learning_rate": 0.0003, "loss": 12.2995, "loss/aux_loss": 0.04810677636414766, "loss/crossentropy": 2.8144130051136016, "loss/logits": 0.9448065549135208, "step": 14370 }, { "epoch": 0.1438, "grad_norm": 10.75, "grad_norm_var": 0.499462890625, "learning_rate": 0.0003, "loss": 12.3904, "loss/aux_loss": 0.048093562759459016, "loss/crossentropy": 2.7329901337623594, "loss/logits": 0.9095306128263474, "step": 14380 }, { "epoch": 0.1439, "grad_norm": 11.0, "grad_norm_var": 0.3337890625, "learning_rate": 0.0003, "loss": 12.3279, "loss/aux_loss": 0.048103582486510275, "loss/crossentropy": 2.7921825289726256, "loss/logits": 0.8989864021539689, "step": 14390 }, { "epoch": 0.144, "grad_norm": 10.3125, "grad_norm_var": 0.3791015625, "learning_rate": 0.0003, "loss": 12.2123, "loss/aux_loss": 0.04809750877320766, "loss/crossentropy": 2.757075470685959, "loss/logits": 0.9241562187671661, "step": 14400 }, { "epoch": 0.1441, "grad_norm": 11.5625, "grad_norm_var": 0.3485514322916667, "learning_rate": 0.0003, "loss": 12.2931, "loss/aux_loss": 0.04810139331966638, "loss/crossentropy": 2.7922261476516725, "loss/logits": 0.9176064521074295, "step": 14410 }, { "epoch": 0.1442, "grad_norm": 11.875, "grad_norm_var": 0.3316243489583333, "learning_rate": 0.0003, "loss": 12.254, "loss/aux_loss": 0.04809432700276375, "loss/crossentropy": 2.699055606126785, "loss/logits": 0.922983717918396, "step": 14420 }, { "epoch": 0.1443, "grad_norm": 13.25, "grad_norm_var": 0.5127604166666667, "learning_rate": 0.0003, "loss": 12.3069, "loss/aux_loss": 0.04811172112822533, "loss/crossentropy": 2.6716023087501526, "loss/logits": 0.916047015786171, "step": 14430 }, { "epoch": 0.1444, "grad_norm": 12.0, "grad_norm_var": 0.5277180989583333, "learning_rate": 0.0003, "loss": 12.063, "loss/aux_loss": 0.048108107224106786, "loss/crossentropy": 2.722955423593521, "loss/logits": 0.9298226207494735, "step": 14440 }, { "epoch": 0.1445, "grad_norm": 12.0, "grad_norm_var": 0.6900390625, "learning_rate": 0.0003, "loss": 12.4038, "loss/aux_loss": 0.04810595251619816, "loss/crossentropy": 2.758984863758087, "loss/logits": 0.9555283427238465, "step": 14450 }, { "epoch": 0.1446, "grad_norm": 10.5, "grad_norm_var": 1.40625, "learning_rate": 0.0003, "loss": 12.2967, "loss/aux_loss": 0.04809891190379858, "loss/crossentropy": 2.9647063076496125, "loss/logits": 0.9323074251413346, "step": 14460 }, { "epoch": 0.1447, "grad_norm": 11.0625, "grad_norm_var": 0.2816243489583333, "learning_rate": 0.0003, "loss": 12.1597, "loss/aux_loss": 0.048092216812074186, "loss/crossentropy": 2.825933372974396, "loss/logits": 0.9327166765928269, "step": 14470 }, { "epoch": 0.1448, "grad_norm": 11.625, "grad_norm_var": 0.21927083333333333, "learning_rate": 0.0003, "loss": 12.1911, "loss/aux_loss": 0.048105007782578466, "loss/crossentropy": 2.8361350774765013, "loss/logits": 0.9484387129545212, "step": 14480 }, { "epoch": 0.1449, "grad_norm": 12.125, "grad_norm_var": 0.44895833333333335, "learning_rate": 0.0003, "loss": 12.2172, "loss/aux_loss": 0.04810567460954189, "loss/crossentropy": 3.038946294784546, "loss/logits": 0.93484668135643, "step": 14490 }, { "epoch": 0.145, "grad_norm": 12.4375, "grad_norm_var": 0.190625, "learning_rate": 0.0003, "loss": 12.0355, "loss/aux_loss": 0.04809667877852917, "loss/crossentropy": 2.7166079699993135, "loss/logits": 0.9272844552993774, "step": 14500 }, { "epoch": 0.1451, "grad_norm": 10.8125, "grad_norm_var": 0.3563639322916667, "learning_rate": 0.0003, "loss": 12.1147, "loss/aux_loss": 0.04810612741857767, "loss/crossentropy": 2.8721679210662843, "loss/logits": 0.9418263047933578, "step": 14510 }, { "epoch": 0.1452, "grad_norm": 13.75, "grad_norm_var": 0.8372395833333334, "learning_rate": 0.0003, "loss": 12.294, "loss/aux_loss": 0.0481014546006918, "loss/crossentropy": 2.8245011150836943, "loss/logits": 0.9494610846042633, "step": 14520 }, { "epoch": 0.1453, "grad_norm": 12.375, "grad_norm_var": 0.6234375, "learning_rate": 0.0003, "loss": 12.4112, "loss/aux_loss": 0.048096208833158016, "loss/crossentropy": 2.841428017616272, "loss/logits": 0.9481588363647461, "step": 14530 }, { "epoch": 0.1454, "grad_norm": 11.625, "grad_norm_var": 0.2752604166666667, "learning_rate": 0.0003, "loss": 12.3017, "loss/aux_loss": 0.048103177733719346, "loss/crossentropy": 2.8085617065429687, "loss/logits": 0.9153319448232651, "step": 14540 }, { "epoch": 0.1455, "grad_norm": 10.3125, "grad_norm_var": 0.7103515625, "learning_rate": 0.0003, "loss": 12.0693, "loss/aux_loss": 0.04810036141425371, "loss/crossentropy": 2.780474007129669, "loss/logits": 0.9415223807096481, "step": 14550 }, { "epoch": 0.1456, "grad_norm": 13.25, "grad_norm_var": 0.590087890625, "learning_rate": 0.0003, "loss": 12.2025, "loss/aux_loss": 0.04810147602111101, "loss/crossentropy": 2.99658949971199, "loss/logits": 0.9620972305536271, "step": 14560 }, { "epoch": 0.1457, "grad_norm": 11.5625, "grad_norm_var": 0.5958170572916667, "learning_rate": 0.0003, "loss": 12.2627, "loss/aux_loss": 0.04810364861041307, "loss/crossentropy": 2.804142338037491, "loss/logits": 0.9306607961654663, "step": 14570 }, { "epoch": 0.1458, "grad_norm": 11.5625, "grad_norm_var": 0.718212890625, "learning_rate": 0.0003, "loss": 12.2289, "loss/aux_loss": 0.048095517046749595, "loss/crossentropy": 2.887796187400818, "loss/logits": 0.9498396337032318, "step": 14580 }, { "epoch": 0.1459, "grad_norm": 10.4375, "grad_norm_var": 224.5869140625, "learning_rate": 0.0003, "loss": 12.2851, "loss/aux_loss": 0.04810778181999922, "loss/crossentropy": 2.8193862557411196, "loss/logits": 0.9181474059820175, "step": 14590 }, { "epoch": 0.146, "grad_norm": 10.625, "grad_norm_var": 0.9703125, "learning_rate": 0.0003, "loss": 12.3751, "loss/aux_loss": 0.048111128620803356, "loss/crossentropy": 2.8784336388111114, "loss/logits": 0.9728086590766907, "step": 14600 }, { "epoch": 0.1461, "grad_norm": 11.25, "grad_norm_var": 1.16015625, "learning_rate": 0.0003, "loss": 12.307, "loss/aux_loss": 0.0481025354936719, "loss/crossentropy": 2.7547997891902924, "loss/logits": 0.9490894585847854, "step": 14610 }, { "epoch": 0.1462, "grad_norm": 10.5, "grad_norm_var": 0.7546875, "learning_rate": 0.0003, "loss": 12.2052, "loss/aux_loss": 0.04810136090964079, "loss/crossentropy": 2.8345041155815123, "loss/logits": 0.932789009809494, "step": 14620 }, { "epoch": 0.1463, "grad_norm": 10.6875, "grad_norm_var": 0.5806640625, "learning_rate": 0.0003, "loss": 12.3757, "loss/aux_loss": 0.048109573498368266, "loss/crossentropy": 2.812575626373291, "loss/logits": 0.909963321685791, "step": 14630 }, { "epoch": 0.1464, "grad_norm": 10.5625, "grad_norm_var": 0.7407389322916667, "learning_rate": 0.0003, "loss": 12.3286, "loss/aux_loss": 0.04808946587145328, "loss/crossentropy": 3.0018001079559324, "loss/logits": 0.9122880339622498, "step": 14640 }, { "epoch": 0.1465, "grad_norm": 11.5625, "grad_norm_var": 0.2999348958333333, "learning_rate": 0.0003, "loss": 12.2271, "loss/aux_loss": 0.04810205716639757, "loss/crossentropy": 2.813987505435944, "loss/logits": 0.9646568685770035, "step": 14650 }, { "epoch": 0.1466, "grad_norm": 11.375, "grad_norm_var": 0.30130208333333336, "learning_rate": 0.0003, "loss": 12.3144, "loss/aux_loss": 0.04810758735984564, "loss/crossentropy": 2.8299679458141327, "loss/logits": 0.986625736951828, "step": 14660 }, { "epoch": 0.1467, "grad_norm": 11.0625, "grad_norm_var": 0.448291015625, "learning_rate": 0.0003, "loss": 12.1161, "loss/aux_loss": 0.04809447340667248, "loss/crossentropy": 2.72513552904129, "loss/logits": 0.9294513940811158, "step": 14670 }, { "epoch": 0.1468, "grad_norm": 12.1875, "grad_norm_var": 0.6794270833333333, "learning_rate": 0.0003, "loss": 12.3315, "loss/aux_loss": 0.048106583207845686, "loss/crossentropy": 2.876382863521576, "loss/logits": 0.9856162458658219, "step": 14680 }, { "epoch": 0.1469, "grad_norm": 11.625, "grad_norm_var": 0.43483072916666665, "learning_rate": 0.0003, "loss": 12.2266, "loss/aux_loss": 0.04810109194368124, "loss/crossentropy": 2.7526570439338682, "loss/logits": 0.9561353415250778, "step": 14690 }, { "epoch": 0.147, "grad_norm": 10.5, "grad_norm_var": 0.3259765625, "learning_rate": 0.0003, "loss": 12.2576, "loss/aux_loss": 0.04809536933898926, "loss/crossentropy": 2.841619998216629, "loss/logits": 0.9037140548229218, "step": 14700 }, { "epoch": 0.1471, "grad_norm": 10.6875, "grad_norm_var": 0.17081705729166666, "learning_rate": 0.0003, "loss": 12.1134, "loss/aux_loss": 0.04811270516365766, "loss/crossentropy": 2.8084808826446532, "loss/logits": 0.8835839122533798, "step": 14710 }, { "epoch": 0.1472, "grad_norm": 10.6875, "grad_norm_var": 0.15703125, "learning_rate": 0.0003, "loss": 12.362, "loss/aux_loss": 0.04809713140130043, "loss/crossentropy": 2.7635149002075194, "loss/logits": 0.9131356865167618, "step": 14720 }, { "epoch": 0.1473, "grad_norm": 37.75, "grad_norm_var": 43.606754557291666, "learning_rate": 0.0003, "loss": 12.3082, "loss/aux_loss": 0.04809991512447596, "loss/crossentropy": 2.6443506985902787, "loss/logits": 0.9391460686922073, "step": 14730 }, { "epoch": 0.1474, "grad_norm": 10.75, "grad_norm_var": 43.39609375, "learning_rate": 0.0003, "loss": 12.1862, "loss/aux_loss": 0.048105531558394435, "loss/crossentropy": 2.745135086774826, "loss/logits": 0.9022040009498596, "step": 14740 }, { "epoch": 0.1475, "grad_norm": 11.125, "grad_norm_var": 0.849072265625, "learning_rate": 0.0003, "loss": 12.2467, "loss/aux_loss": 0.04809885751456022, "loss/crossentropy": 2.8118128538131715, "loss/logits": 0.9468438357114792, "step": 14750 }, { "epoch": 0.1476, "grad_norm": 10.75, "grad_norm_var": 0.8421223958333334, "learning_rate": 0.0003, "loss": 12.2405, "loss/aux_loss": 0.04809672348201275, "loss/crossentropy": 2.9226966857910157, "loss/logits": 0.9427454113960266, "step": 14760 }, { "epoch": 0.1477, "grad_norm": 10.75, "grad_norm_var": 0.402978515625, "learning_rate": 0.0003, "loss": 12.0438, "loss/aux_loss": 0.04809822179377079, "loss/crossentropy": 2.7584859311580656, "loss/logits": 0.9179711043834686, "step": 14770 }, { "epoch": 0.1478, "grad_norm": 10.75, "grad_norm_var": 0.32941080729166666, "learning_rate": 0.0003, "loss": 12.2503, "loss/aux_loss": 0.04809885416179895, "loss/crossentropy": 2.9167349338531494, "loss/logits": 0.9420515596866608, "step": 14780 }, { "epoch": 0.1479, "grad_norm": 11.625, "grad_norm_var": 8.384098307291667, "learning_rate": 0.0003, "loss": 12.296, "loss/aux_loss": 0.04811673872172832, "loss/crossentropy": 2.6683114945888518, "loss/logits": 0.9089529395103455, "step": 14790 }, { "epoch": 0.148, "grad_norm": 11.0625, "grad_norm_var": 8.616145833333333, "learning_rate": 0.0003, "loss": 12.4053, "loss/aux_loss": 0.048098386451601985, "loss/crossentropy": 2.920657384395599, "loss/logits": 0.954785504937172, "step": 14800 }, { "epoch": 0.1481, "grad_norm": 11.5625, "grad_norm_var": 0.6645182291666667, "learning_rate": 0.0003, "loss": 12.32, "loss/aux_loss": 0.048104028962552545, "loss/crossentropy": 2.8948807239532472, "loss/logits": 0.9660200357437134, "step": 14810 }, { "epoch": 0.1482, "grad_norm": 10.8125, "grad_norm_var": 0.3228515625, "learning_rate": 0.0003, "loss": 12.2445, "loss/aux_loss": 0.04809078220278025, "loss/crossentropy": 2.6194146156311033, "loss/logits": 0.9160060435533524, "step": 14820 }, { "epoch": 0.1483, "grad_norm": 11.5625, "grad_norm_var": 0.3328125, "learning_rate": 0.0003, "loss": 12.2885, "loss/aux_loss": 0.048103746958076954, "loss/crossentropy": 2.754777866601944, "loss/logits": 0.949258816242218, "step": 14830 }, { "epoch": 0.1484, "grad_norm": 11.8125, "grad_norm_var": 19.371077473958334, "learning_rate": 0.0003, "loss": 12.3542, "loss/aux_loss": 0.0480975853279233, "loss/crossentropy": 2.8865331768989564, "loss/logits": 0.9503841429948807, "step": 14840 }, { "epoch": 0.1485, "grad_norm": 11.5625, "grad_norm_var": 19.502978515625, "learning_rate": 0.0003, "loss": 12.2103, "loss/aux_loss": 0.04809964876621962, "loss/crossentropy": 2.8561151921749115, "loss/logits": 0.9401407986879349, "step": 14850 }, { "epoch": 0.1486, "grad_norm": 11.3125, "grad_norm_var": 0.692431640625, "learning_rate": 0.0003, "loss": 12.2275, "loss/aux_loss": 0.04810830354690552, "loss/crossentropy": 2.8200284421443937, "loss/logits": 0.9532903909683228, "step": 14860 }, { "epoch": 0.1487, "grad_norm": 12.4375, "grad_norm_var": 20.823421223958334, "learning_rate": 0.0003, "loss": 12.2273, "loss/aux_loss": 0.04810541290789842, "loss/crossentropy": 2.6964865624904633, "loss/logits": 0.9282492130994797, "step": 14870 }, { "epoch": 0.1488, "grad_norm": 12.4375, "grad_norm_var": 20.870247395833335, "learning_rate": 0.0003, "loss": 12.3184, "loss/aux_loss": 0.04810648560523987, "loss/crossentropy": 2.8158532321453094, "loss/logits": 0.9431109875440598, "step": 14880 }, { "epoch": 0.1489, "grad_norm": 10.5, "grad_norm_var": 0.601025390625, "learning_rate": 0.0003, "loss": 12.2071, "loss/aux_loss": 0.048101908154785634, "loss/crossentropy": 2.803946614265442, "loss/logits": 0.9242048561573029, "step": 14890 }, { "epoch": 0.149, "grad_norm": 10.6875, "grad_norm_var": 0.7249348958333334, "learning_rate": 0.0003, "loss": 12.0886, "loss/aux_loss": 0.04810064677149058, "loss/crossentropy": 2.778617113828659, "loss/logits": 0.9423558235168457, "step": 14900 }, { "epoch": 0.1491, "grad_norm": 11.75, "grad_norm_var": 0.5096354166666667, "learning_rate": 0.0003, "loss": 12.1908, "loss/aux_loss": 0.0481045451015234, "loss/crossentropy": 2.854165458679199, "loss/logits": 0.9379553228616715, "step": 14910 }, { "epoch": 0.1492, "grad_norm": 11.6875, "grad_norm_var": 0.46848958333333335, "learning_rate": 0.0003, "loss": 12.4479, "loss/aux_loss": 0.04809947554022074, "loss/crossentropy": 2.8008215487003327, "loss/logits": 0.9356680005788803, "step": 14920 }, { "epoch": 0.1493, "grad_norm": 11.6875, "grad_norm_var": 0.13587239583333333, "learning_rate": 0.0003, "loss": 12.0882, "loss/aux_loss": 0.04809720925986767, "loss/crossentropy": 2.7476901531219484, "loss/logits": 0.9420498043298722, "step": 14930 }, { "epoch": 0.1494, "grad_norm": 10.9375, "grad_norm_var": 0.208447265625, "learning_rate": 0.0003, "loss": 12.2413, "loss/aux_loss": 0.0480950940400362, "loss/crossentropy": 2.864052379131317, "loss/logits": 0.9272116690874099, "step": 14940 }, { "epoch": 0.1495, "grad_norm": 11.375, "grad_norm_var": 0.311181640625, "learning_rate": 0.0003, "loss": 12.2616, "loss/aux_loss": 0.04809942021965981, "loss/crossentropy": 2.9187353610992433, "loss/logits": 0.9510285437107087, "step": 14950 }, { "epoch": 0.1496, "grad_norm": 11.9375, "grad_norm_var": 47.15206705729167, "learning_rate": 0.0003, "loss": 12.3457, "loss/aux_loss": 0.04811178985983133, "loss/crossentropy": 2.7071347713470457, "loss/logits": 0.9264847010374069, "step": 14960 }, { "epoch": 0.1497, "grad_norm": 10.875, "grad_norm_var": 0.9809895833333333, "learning_rate": 0.0003, "loss": 12.0873, "loss/aux_loss": 0.048096916265785696, "loss/crossentropy": 2.8192372739315035, "loss/logits": 0.961195969581604, "step": 14970 }, { "epoch": 0.1498, "grad_norm": 11.0, "grad_norm_var": 0.7156087239583333, "learning_rate": 0.0003, "loss": 12.113, "loss/aux_loss": 0.04811111818999052, "loss/crossentropy": 2.698056328296661, "loss/logits": 0.9076811641454696, "step": 14980 }, { "epoch": 0.1499, "grad_norm": 12.0, "grad_norm_var": 269.00714518229165, "learning_rate": 0.0003, "loss": 12.3343, "loss/aux_loss": 0.048099159449338916, "loss/crossentropy": 2.7637811422348024, "loss/logits": 0.968485102057457, "step": 14990 }, { "epoch": 0.15, "grad_norm": 11.375, "grad_norm_var": 269.80128580729166, "learning_rate": 0.0003, "loss": 12.4506, "loss/aux_loss": 0.04810250028967857, "loss/crossentropy": 2.8541912317276, "loss/logits": 0.987116688489914, "step": 15000 }, { "epoch": 0.1501, "grad_norm": 10.75, "grad_norm_var": 0.6130208333333333, "learning_rate": 0.0003, "loss": 12.2255, "loss/aux_loss": 0.048107155971229075, "loss/crossentropy": 2.7818135201931, "loss/logits": 0.9167533338069915, "step": 15010 }, { "epoch": 0.1502, "grad_norm": 11.3125, "grad_norm_var": 0.13170572916666667, "learning_rate": 0.0003, "loss": 12.356, "loss/aux_loss": 0.04810172915458679, "loss/crossentropy": 2.940346974134445, "loss/logits": 0.952643695473671, "step": 15020 }, { "epoch": 0.1503, "grad_norm": 12.125, "grad_norm_var": 0.5635416666666667, "learning_rate": 0.0003, "loss": 12.2117, "loss/aux_loss": 0.04809871483594179, "loss/crossentropy": 2.7255462646484374, "loss/logits": 0.9134910553693771, "step": 15030 }, { "epoch": 0.1504, "grad_norm": 12.0, "grad_norm_var": 0.5828125, "learning_rate": 0.0003, "loss": 12.197, "loss/aux_loss": 0.04809676483273506, "loss/crossentropy": 2.9399500370025633, "loss/logits": 0.9335614711046218, "step": 15040 }, { "epoch": 0.1505, "grad_norm": 11.75, "grad_norm_var": 0.296337890625, "learning_rate": 0.0003, "loss": 12.2855, "loss/aux_loss": 0.048100493475794794, "loss/crossentropy": 2.83375204205513, "loss/logits": 0.9076710551977157, "step": 15050 }, { "epoch": 0.1506, "grad_norm": 10.125, "grad_norm_var": 1.1322916666666667, "learning_rate": 0.0003, "loss": 12.1571, "loss/aux_loss": 0.0481040021404624, "loss/crossentropy": 2.9388111233711243, "loss/logits": 0.9607951223850251, "step": 15060 }, { "epoch": 0.1507, "grad_norm": 11.875, "grad_norm_var": 0.3541666666666667, "learning_rate": 0.0003, "loss": 12.4012, "loss/aux_loss": 0.04810583982616663, "loss/crossentropy": 2.875928044319153, "loss/logits": 0.9627457737922669, "step": 15070 }, { "epoch": 0.1508, "grad_norm": 10.6875, "grad_norm_var": 0.32786458333333335, "learning_rate": 0.0003, "loss": 12.2652, "loss/aux_loss": 0.04809319153428078, "loss/crossentropy": 2.9540405869483948, "loss/logits": 0.9330450028181076, "step": 15080 }, { "epoch": 0.1509, "grad_norm": 11.625, "grad_norm_var": 0.32760416666666664, "learning_rate": 0.0003, "loss": 12.1051, "loss/aux_loss": 0.048104870691895486, "loss/crossentropy": 2.8114062428474424, "loss/logits": 0.9364354491233826, "step": 15090 }, { "epoch": 0.151, "grad_norm": 12.0, "grad_norm_var": 36.4619140625, "learning_rate": 0.0003, "loss": 12.2164, "loss/aux_loss": 0.04811019506305456, "loss/crossentropy": 2.859804928302765, "loss/logits": 0.9465476185083389, "step": 15100 }, { "epoch": 0.1511, "grad_norm": 10.9375, "grad_norm_var": 35.42526041666667, "learning_rate": 0.0003, "loss": 12.3094, "loss/aux_loss": 0.04809868466109037, "loss/crossentropy": 2.8438979148864747, "loss/logits": 0.8984570145606995, "step": 15110 }, { "epoch": 0.1512, "grad_norm": 12.3125, "grad_norm_var": 1.280712890625, "learning_rate": 0.0003, "loss": 12.2106, "loss/aux_loss": 0.048102298937737945, "loss/crossentropy": 2.8705235600471495, "loss/logits": 0.9498428493738175, "step": 15120 }, { "epoch": 0.1513, "grad_norm": 10.9375, "grad_norm_var": 1.2218098958333334, "learning_rate": 0.0003, "loss": 12.208, "loss/aux_loss": 0.04809999018907547, "loss/crossentropy": 2.90715229511261, "loss/logits": 0.9575346529483795, "step": 15130 }, { "epoch": 0.1514, "grad_norm": 13.125, "grad_norm_var": 0.46145833333333336, "learning_rate": 0.0003, "loss": 12.2519, "loss/aux_loss": 0.04809638075530529, "loss/crossentropy": 2.7828579187393188, "loss/logits": 0.9503386884927749, "step": 15140 }, { "epoch": 0.1515, "grad_norm": 11.5625, "grad_norm_var": 0.3907389322916667, "learning_rate": 0.0003, "loss": 12.2184, "loss/aux_loss": 0.04810054805129767, "loss/crossentropy": 2.871291196346283, "loss/logits": 0.9457465648651123, "step": 15150 }, { "epoch": 0.1516, "grad_norm": 11.6875, "grad_norm_var": 0.328369140625, "learning_rate": 0.0003, "loss": 12.1969, "loss/aux_loss": 0.04810226745903492, "loss/crossentropy": 2.773683416843414, "loss/logits": 0.9155610114336014, "step": 15160 }, { "epoch": 0.1517, "grad_norm": 10.875, "grad_norm_var": 0.39036458333333335, "learning_rate": 0.0003, "loss": 12.1309, "loss/aux_loss": 0.04810235556215048, "loss/crossentropy": 2.908278775215149, "loss/logits": 0.9158676236867904, "step": 15170 }, { "epoch": 0.1518, "grad_norm": 10.875, "grad_norm_var": 0.24264322916666667, "learning_rate": 0.0003, "loss": 12.1781, "loss/aux_loss": 0.04809516854584217, "loss/crossentropy": 2.737054407596588, "loss/logits": 0.9324061542749404, "step": 15180 }, { "epoch": 0.1519, "grad_norm": 10.6875, "grad_norm_var": 24.646354166666665, "learning_rate": 0.0003, "loss": 12.1106, "loss/aux_loss": 0.048110068589448926, "loss/crossentropy": 2.820358157157898, "loss/logits": 0.937472653388977, "step": 15190 }, { "epoch": 0.152, "grad_norm": 12.1875, "grad_norm_var": 1.719384765625, "learning_rate": 0.0003, "loss": 12.1341, "loss/aux_loss": 0.04811493624001741, "loss/crossentropy": 2.8123875498771667, "loss/logits": 0.915363097190857, "step": 15200 }, { "epoch": 0.1521, "grad_norm": 11.0625, "grad_norm_var": 1.6275390625, "learning_rate": 0.0003, "loss": 12.0856, "loss/aux_loss": 0.048097194731235506, "loss/crossentropy": 2.9831456780433654, "loss/logits": 0.9460885792970657, "step": 15210 }, { "epoch": 0.1522, "grad_norm": 10.25, "grad_norm_var": 0.3004557291666667, "learning_rate": 0.0003, "loss": 11.8513, "loss/aux_loss": 0.04810490664094687, "loss/crossentropy": 2.7776617228984835, "loss/logits": 0.9056734681129456, "step": 15220 }, { "epoch": 0.1523, "grad_norm": 11.25, "grad_norm_var": 0.22263997395833332, "learning_rate": 0.0003, "loss": 12.0605, "loss/aux_loss": 0.04809698183089495, "loss/crossentropy": 2.8233635425567627, "loss/logits": 0.9389057904481888, "step": 15230 }, { "epoch": 0.1524, "grad_norm": 11.6875, "grad_norm_var": 0.4488118489583333, "learning_rate": 0.0003, "loss": 12.1783, "loss/aux_loss": 0.048117165453732014, "loss/crossentropy": 2.66759774684906, "loss/logits": 0.9018658816814422, "step": 15240 }, { "epoch": 0.1525, "grad_norm": 11.75, "grad_norm_var": 0.46295572916666666, "learning_rate": 0.0003, "loss": 12.1848, "loss/aux_loss": 0.048101647198200224, "loss/crossentropy": 2.803697109222412, "loss/logits": 0.9530074447393417, "step": 15250 }, { "epoch": 0.1526, "grad_norm": 10.9375, "grad_norm_var": 0.25930989583333336, "learning_rate": 0.0003, "loss": 12.1231, "loss/aux_loss": 0.04810661189258099, "loss/crossentropy": 2.847998285293579, "loss/logits": 0.9225472122430801, "step": 15260 }, { "epoch": 0.1527, "grad_norm": 11.75, "grad_norm_var": 0.14837239583333334, "learning_rate": 0.0003, "loss": 12.2114, "loss/aux_loss": 0.048101527616381645, "loss/crossentropy": 2.8020704984664917, "loss/logits": 0.9532386660575867, "step": 15270 }, { "epoch": 0.1528, "grad_norm": 11.625, "grad_norm_var": 0.20701497395833332, "learning_rate": 0.0003, "loss": 11.9737, "loss/aux_loss": 0.0480971185490489, "loss/crossentropy": 2.984708344936371, "loss/logits": 0.9079653114080429, "step": 15280 }, { "epoch": 0.1529, "grad_norm": 11.1875, "grad_norm_var": 0.22057291666666667, "learning_rate": 0.0003, "loss": 12.3004, "loss/aux_loss": 0.0480941278859973, "loss/crossentropy": 2.877718913555145, "loss/logits": 0.9893636494874954, "step": 15290 }, { "epoch": 0.153, "grad_norm": 11.0, "grad_norm_var": 0.5619791666666667, "learning_rate": 0.0003, "loss": 12.3488, "loss/aux_loss": 0.04810192976146936, "loss/crossentropy": 2.6575785517692565, "loss/logits": 0.9273690760135651, "step": 15300 }, { "epoch": 0.1531, "grad_norm": 12.1875, "grad_norm_var": 0.675, "learning_rate": 0.0003, "loss": 12.3104, "loss/aux_loss": 0.048097644187510016, "loss/crossentropy": 2.748287373781204, "loss/logits": 0.948896062374115, "step": 15310 }, { "epoch": 0.1532, "grad_norm": 11.125, "grad_norm_var": 0.31953125, "learning_rate": 0.0003, "loss": 12.2604, "loss/aux_loss": 0.04810179900377989, "loss/crossentropy": 2.7924574255943297, "loss/logits": 0.9476647943258285, "step": 15320 }, { "epoch": 0.1533, "grad_norm": 12.0625, "grad_norm_var": 0.2618326822916667, "learning_rate": 0.0003, "loss": 12.1864, "loss/aux_loss": 0.04809467382729053, "loss/crossentropy": 2.8050376057624815, "loss/logits": 0.9532152026891708, "step": 15330 }, { "epoch": 0.1534, "grad_norm": 11.625, "grad_norm_var": 0.29140625, "learning_rate": 0.0003, "loss": 12.347, "loss/aux_loss": 0.04810280818492174, "loss/crossentropy": 2.8835896611213685, "loss/logits": 0.9571986377239228, "step": 15340 }, { "epoch": 0.1535, "grad_norm": 11.1875, "grad_norm_var": 0.18333333333333332, "learning_rate": 0.0003, "loss": 12.2084, "loss/aux_loss": 0.04809752386063337, "loss/crossentropy": 2.749102717638016, "loss/logits": 0.9065598905086517, "step": 15350 }, { "epoch": 0.1536, "grad_norm": 11.4375, "grad_norm_var": 0.21979166666666666, "learning_rate": 0.0003, "loss": 12.0297, "loss/aux_loss": 0.04811021964997053, "loss/crossentropy": 2.9079548954963683, "loss/logits": 0.9038378298282623, "step": 15360 }, { "epoch": 0.1537, "grad_norm": 11.625, "grad_norm_var": 0.38743489583333335, "learning_rate": 0.0003, "loss": 12.2798, "loss/aux_loss": 0.04810038134455681, "loss/crossentropy": 2.696992439031601, "loss/logits": 0.9243462920188904, "step": 15370 }, { "epoch": 0.1538, "grad_norm": 10.6875, "grad_norm_var": 0.419775390625, "learning_rate": 0.0003, "loss": 12.3808, "loss/aux_loss": 0.048097037523984906, "loss/crossentropy": 2.955533170700073, "loss/logits": 0.962461119890213, "step": 15380 }, { "epoch": 0.1539, "grad_norm": 11.625, "grad_norm_var": 0.4515462239583333, "learning_rate": 0.0003, "loss": 12.255, "loss/aux_loss": 0.04808609709143639, "loss/crossentropy": 2.8893189787864686, "loss/logits": 0.9806782245635987, "step": 15390 }, { "epoch": 0.154, "grad_norm": 13.125, "grad_norm_var": 0.5421875, "learning_rate": 0.0003, "loss": 12.2583, "loss/aux_loss": 0.048098246194422246, "loss/crossentropy": 3.016023313999176, "loss/logits": 0.9697432667016983, "step": 15400 }, { "epoch": 0.1541, "grad_norm": 11.1875, "grad_norm_var": 0.5311848958333333, "learning_rate": 0.0003, "loss": 12.2399, "loss/aux_loss": 0.04809315577149391, "loss/crossentropy": 2.803581511974335, "loss/logits": 0.9420624375343323, "step": 15410 }, { "epoch": 0.1542, "grad_norm": 12.1875, "grad_norm_var": 0.371728515625, "learning_rate": 0.0003, "loss": 12.2796, "loss/aux_loss": 0.04809564612805843, "loss/crossentropy": 2.9065362393856047, "loss/logits": 0.9607432782649994, "step": 15420 }, { "epoch": 0.1543, "grad_norm": 11.8125, "grad_norm_var": 0.4650390625, "learning_rate": 0.0003, "loss": 12.2096, "loss/aux_loss": 0.04809183832257986, "loss/crossentropy": 2.902686321735382, "loss/logits": 0.9689311563968659, "step": 15430 }, { "epoch": 0.1544, "grad_norm": 15.0, "grad_norm_var": 3.1499837239583335, "learning_rate": 0.0003, "loss": 12.1702, "loss/aux_loss": 0.04809550140053034, "loss/crossentropy": 2.842085200548172, "loss/logits": 0.9201117724180221, "step": 15440 }, { "epoch": 0.1545, "grad_norm": 12.0, "grad_norm_var": 3.273811848958333, "learning_rate": 0.0003, "loss": 12.4202, "loss/aux_loss": 0.048114397749304774, "loss/crossentropy": 2.8335028886795044, "loss/logits": 0.9513376891613007, "step": 15450 }, { "epoch": 0.1546, "grad_norm": 11.0625, "grad_norm_var": 0.345166015625, "learning_rate": 0.0003, "loss": 12.2035, "loss/aux_loss": 0.04809577390551567, "loss/crossentropy": 2.7710729837417603, "loss/logits": 0.9169834047555924, "step": 15460 }, { "epoch": 0.1547, "grad_norm": 10.5625, "grad_norm_var": 0.6999837239583333, "learning_rate": 0.0003, "loss": 12.0555, "loss/aux_loss": 0.04811387322843075, "loss/crossentropy": 2.7559852480888365, "loss/logits": 0.8992862313985824, "step": 15470 }, { "epoch": 0.1548, "grad_norm": 11.0625, "grad_norm_var": 0.3203125, "learning_rate": 0.0003, "loss": 12.2393, "loss/aux_loss": 0.048102208971977235, "loss/crossentropy": 2.8739076018333436, "loss/logits": 0.8997802734375, "step": 15480 }, { "epoch": 0.1549, "grad_norm": 12.3125, "grad_norm_var": 0.24777018229166667, "learning_rate": 0.0003, "loss": 12.3324, "loss/aux_loss": 0.04809423796832561, "loss/crossentropy": 2.89512904882431, "loss/logits": 0.9646374642848968, "step": 15490 }, { "epoch": 0.155, "grad_norm": 11.6875, "grad_norm_var": 10.139436848958333, "learning_rate": 0.0003, "loss": 12.278, "loss/aux_loss": 0.04810593910515308, "loss/crossentropy": 2.809836542606354, "loss/logits": 0.9548726409673691, "step": 15500 }, { "epoch": 0.1551, "grad_norm": 12.5625, "grad_norm_var": 0.4507649739583333, "learning_rate": 0.0003, "loss": 12.1395, "loss/aux_loss": 0.048092353343963626, "loss/crossentropy": 2.638697361946106, "loss/logits": 0.9311948031187057, "step": 15510 }, { "epoch": 0.1552, "grad_norm": 11.625, "grad_norm_var": 0.28359375, "learning_rate": 0.0003, "loss": 12.2342, "loss/aux_loss": 0.048095655255019666, "loss/crossentropy": 2.988735723495483, "loss/logits": 0.9588959008455277, "step": 15520 }, { "epoch": 0.1553, "grad_norm": 13.375, "grad_norm_var": 150.09296875, "learning_rate": 0.0003, "loss": 12.2716, "loss/aux_loss": 0.04810060281306505, "loss/crossentropy": 2.6777639269828795, "loss/logits": 0.9254604041576385, "step": 15530 }, { "epoch": 0.1554, "grad_norm": 11.375, "grad_norm_var": 0.410791015625, "learning_rate": 0.0003, "loss": 12.256, "loss/aux_loss": 0.04809675142168999, "loss/crossentropy": 2.9493250966072084, "loss/logits": 0.9647281706333161, "step": 15540 }, { "epoch": 0.1555, "grad_norm": 10.125, "grad_norm_var": 0.28951822916666664, "learning_rate": 0.0003, "loss": 12.3685, "loss/aux_loss": 0.048103061877191065, "loss/crossentropy": 2.8565509915351868, "loss/logits": 0.9633009701967239, "step": 15550 }, { "epoch": 0.1556, "grad_norm": 10.875, "grad_norm_var": 0.7700520833333333, "learning_rate": 0.0003, "loss": 12.2548, "loss/aux_loss": 0.04810278750956058, "loss/crossentropy": 2.889665186405182, "loss/logits": 0.9235396683216095, "step": 15560 }, { "epoch": 0.1557, "grad_norm": 11.5, "grad_norm_var": 0.8440104166666667, "learning_rate": 0.0003, "loss": 12.3396, "loss/aux_loss": 0.04810402244329452, "loss/crossentropy": 2.737299156188965, "loss/logits": 0.9336203277111054, "step": 15570 }, { "epoch": 0.1558, "grad_norm": 11.125, "grad_norm_var": 0.8192708333333333, "learning_rate": 0.0003, "loss": 12.1569, "loss/aux_loss": 0.048098998703062536, "loss/crossentropy": 2.78268221616745, "loss/logits": 0.9026233315467834, "step": 15580 }, { "epoch": 0.1559, "grad_norm": 11.6875, "grad_norm_var": 0.13162434895833333, "learning_rate": 0.0003, "loss": 12.2533, "loss/aux_loss": 0.048094463720917704, "loss/crossentropy": 2.901452112197876, "loss/logits": 0.9481937050819397, "step": 15590 }, { "epoch": 0.156, "grad_norm": 10.5625, "grad_norm_var": 0.5333170572916667, "learning_rate": 0.0003, "loss": 12.2767, "loss/aux_loss": 0.048098971135914326, "loss/crossentropy": 2.8950270414352417, "loss/logits": 0.9641896247863769, "step": 15600 }, { "epoch": 0.1561, "grad_norm": 11.625, "grad_norm_var": 0.34178059895833335, "learning_rate": 0.0003, "loss": 11.981, "loss/aux_loss": 0.04809776470065117, "loss/crossentropy": 2.802956283092499, "loss/logits": 0.920897588133812, "step": 15610 }, { "epoch": 0.1562, "grad_norm": 10.4375, "grad_norm_var": 0.1875, "learning_rate": 0.0003, "loss": 12.2279, "loss/aux_loss": 0.04809428732842207, "loss/crossentropy": 2.8816702008247375, "loss/logits": 0.9599309653043747, "step": 15620 }, { "epoch": 0.1563, "grad_norm": 11.5, "grad_norm_var": 0.2686848958333333, "learning_rate": 0.0003, "loss": 12.1109, "loss/aux_loss": 0.04809434395283461, "loss/crossentropy": 2.7398535430431368, "loss/logits": 0.8981555104255676, "step": 15630 }, { "epoch": 0.1564, "grad_norm": 10.875, "grad_norm_var": 0.31451822916666666, "learning_rate": 0.0003, "loss": 12.081, "loss/aux_loss": 0.04809578433632851, "loss/crossentropy": 2.823803460597992, "loss/logits": 0.937409034371376, "step": 15640 }, { "epoch": 0.1565, "grad_norm": 12.0625, "grad_norm_var": 0.3103515625, "learning_rate": 0.0003, "loss": 12.0933, "loss/aux_loss": 0.04810278974473477, "loss/crossentropy": 2.9505991697311402, "loss/logits": 0.9344629585742951, "step": 15650 }, { "epoch": 0.1566, "grad_norm": 11.25, "grad_norm_var": 0.3744140625, "learning_rate": 0.0003, "loss": 12.1932, "loss/aux_loss": 0.04810452219098806, "loss/crossentropy": 2.92813218832016, "loss/logits": 0.9079012930393219, "step": 15660 }, { "epoch": 0.1567, "grad_norm": 11.125, "grad_norm_var": 0.23697916666666666, "learning_rate": 0.0003, "loss": 12.1024, "loss/aux_loss": 0.04809817671775818, "loss/crossentropy": 2.5720150113105773, "loss/logits": 0.9325621664524079, "step": 15670 }, { "epoch": 0.1568, "grad_norm": 12.375, "grad_norm_var": 0.33515625, "learning_rate": 0.0003, "loss": 12.0574, "loss/aux_loss": 0.04810387324541807, "loss/crossentropy": 2.8633701324462892, "loss/logits": 0.9684804528951645, "step": 15680 }, { "epoch": 0.1569, "grad_norm": 11.1875, "grad_norm_var": 0.33006184895833335, "learning_rate": 0.0003, "loss": 12.3274, "loss/aux_loss": 0.04809674210846424, "loss/crossentropy": 2.86969450712204, "loss/logits": 0.9554843038320542, "step": 15690 }, { "epoch": 0.157, "grad_norm": 10.125, "grad_norm_var": 0.33839518229166665, "learning_rate": 0.0003, "loss": 12.1372, "loss/aux_loss": 0.0481055686250329, "loss/crossentropy": 2.640603184700012, "loss/logits": 0.8951573967933655, "step": 15700 }, { "epoch": 0.1571, "grad_norm": 12.5, "grad_norm_var": 0.44244791666666666, "learning_rate": 0.0003, "loss": 12.2587, "loss/aux_loss": 0.04809990283101797, "loss/crossentropy": 2.864989972114563, "loss/logits": 0.9769071489572525, "step": 15710 }, { "epoch": 0.1572, "grad_norm": 11.4375, "grad_norm_var": 0.28515625, "learning_rate": 0.0003, "loss": 12.0975, "loss/aux_loss": 0.0480915404856205, "loss/crossentropy": 2.7617894768714906, "loss/logits": 0.9289013177156449, "step": 15720 }, { "epoch": 0.1573, "grad_norm": 11.3125, "grad_norm_var": 0.19816080729166666, "learning_rate": 0.0003, "loss": 12.2621, "loss/aux_loss": 0.0480972645804286, "loss/crossentropy": 2.8848094820976256, "loss/logits": 0.9512290894985199, "step": 15730 }, { "epoch": 0.1574, "grad_norm": 12.0, "grad_norm_var": 0.5363118489583333, "learning_rate": 0.0003, "loss": 12.2609, "loss/aux_loss": 0.04810263868421316, "loss/crossentropy": 2.94705730676651, "loss/logits": 0.9353223860263824, "step": 15740 }, { "epoch": 0.1575, "grad_norm": 10.875, "grad_norm_var": 0.45364583333333336, "learning_rate": 0.0003, "loss": 12.0892, "loss/aux_loss": 0.048098064586520196, "loss/crossentropy": 2.923324429988861, "loss/logits": 0.9398457109928131, "step": 15750 }, { "epoch": 0.1576, "grad_norm": 11.375, "grad_norm_var": 0.22135416666666666, "learning_rate": 0.0003, "loss": 12.0815, "loss/aux_loss": 0.048105406761169436, "loss/crossentropy": 2.720612233877182, "loss/logits": 0.8936576157808304, "step": 15760 }, { "epoch": 0.1577, "grad_norm": 11.4375, "grad_norm_var": 0.3462890625, "learning_rate": 0.0003, "loss": 12.2061, "loss/aux_loss": 0.04810209292918444, "loss/crossentropy": 2.7888991832733154, "loss/logits": 0.9135987132787704, "step": 15770 }, { "epoch": 0.1578, "grad_norm": 11.875, "grad_norm_var": 0.219775390625, "learning_rate": 0.0003, "loss": 12.2186, "loss/aux_loss": 0.048100620880723, "loss/crossentropy": 2.94092253446579, "loss/logits": 0.9436818659305573, "step": 15780 }, { "epoch": 0.1579, "grad_norm": 12.6875, "grad_norm_var": 0.28566080729166665, "learning_rate": 0.0003, "loss": 12.2812, "loss/aux_loss": 0.04810180887579918, "loss/crossentropy": 2.6856593787670135, "loss/logits": 0.890146228671074, "step": 15790 }, { "epoch": 0.158, "grad_norm": 10.75, "grad_norm_var": 0.4161458333333333, "learning_rate": 0.0003, "loss": 12.1282, "loss/aux_loss": 0.04810675587505102, "loss/crossentropy": 2.824471127986908, "loss/logits": 0.8990681618452072, "step": 15800 }, { "epoch": 0.1581, "grad_norm": 11.25, "grad_norm_var": 0.27706705729166664, "learning_rate": 0.0003, "loss": 12.2103, "loss/aux_loss": 0.04810557030141353, "loss/crossentropy": 2.780168378353119, "loss/logits": 0.9444521903991699, "step": 15810 }, { "epoch": 0.1582, "grad_norm": 11.5625, "grad_norm_var": 0.5145670572916666, "learning_rate": 0.0003, "loss": 12.1687, "loss/aux_loss": 0.04809657074511051, "loss/crossentropy": 2.8092296421527863, "loss/logits": 0.9068554252386093, "step": 15820 }, { "epoch": 0.1583, "grad_norm": 11.25, "grad_norm_var": 0.6791015625, "learning_rate": 0.0003, "loss": 11.9178, "loss/aux_loss": 0.04810602068901062, "loss/crossentropy": 2.8012136101722716, "loss/logits": 0.8847994655370712, "step": 15830 }, { "epoch": 0.1584, "grad_norm": 10.875, "grad_norm_var": 0.4574055989583333, "learning_rate": 0.0003, "loss": 12.1666, "loss/aux_loss": 0.04810144230723381, "loss/crossentropy": 2.838590919971466, "loss/logits": 0.9268758982419968, "step": 15840 }, { "epoch": 0.1585, "grad_norm": 12.1875, "grad_norm_var": 18.598893229166666, "learning_rate": 0.0003, "loss": 12.1954, "loss/aux_loss": 0.04810195360332727, "loss/crossentropy": 2.9161871790885927, "loss/logits": 0.9759931951761246, "step": 15850 }, { "epoch": 0.1586, "grad_norm": 12.25, "grad_norm_var": 0.5593587239583333, "learning_rate": 0.0003, "loss": 12.1365, "loss/aux_loss": 0.04810574501752853, "loss/crossentropy": 2.8291844010353087, "loss/logits": 0.9489723861217498, "step": 15860 }, { "epoch": 0.1587, "grad_norm": 11.3125, "grad_norm_var": 0.30050455729166664, "learning_rate": 0.0003, "loss": 12.0479, "loss/aux_loss": 0.04810310564935207, "loss/crossentropy": 2.7364535570144652, "loss/logits": 0.9301041215658188, "step": 15870 }, { "epoch": 0.1588, "grad_norm": 11.6875, "grad_norm_var": 0.43865559895833334, "learning_rate": 0.0003, "loss": 12.1076, "loss/aux_loss": 0.04809244927018881, "loss/crossentropy": 2.7394683599472045, "loss/logits": 0.9101533353328705, "step": 15880 }, { "epoch": 0.1589, "grad_norm": 11.5625, "grad_norm_var": 0.44453125, "learning_rate": 0.0003, "loss": 12.134, "loss/aux_loss": 0.04809813145548105, "loss/crossentropy": 2.774601572751999, "loss/logits": 0.9330229997634888, "step": 15890 }, { "epoch": 0.159, "grad_norm": 10.4375, "grad_norm_var": 1.927978515625, "learning_rate": 0.0003, "loss": 12.1558, "loss/aux_loss": 0.048109999299049376, "loss/crossentropy": 2.6731720924377442, "loss/logits": 0.8847457319498062, "step": 15900 }, { "epoch": 0.1591, "grad_norm": 11.875, "grad_norm_var": 1.7113932291666667, "learning_rate": 0.0003, "loss": 12.1998, "loss/aux_loss": 0.04809703305363655, "loss/crossentropy": 2.887303102016449, "loss/logits": 0.9333222597837448, "step": 15910 }, { "epoch": 0.1592, "grad_norm": 11.4375, "grad_norm_var": 0.3822265625, "learning_rate": 0.0003, "loss": 12.2916, "loss/aux_loss": 0.04809732548892498, "loss/crossentropy": 2.969210720062256, "loss/logits": 0.9471912950277328, "step": 15920 }, { "epoch": 0.1593, "grad_norm": 10.4375, "grad_norm_var": 0.4900390625, "learning_rate": 0.0003, "loss": 12.078, "loss/aux_loss": 0.0480979910120368, "loss/crossentropy": 2.7797336280345917, "loss/logits": 0.8907982796430588, "step": 15930 }, { "epoch": 0.1594, "grad_norm": 10.4375, "grad_norm_var": 0.3692708333333333, "learning_rate": 0.0003, "loss": 12.0424, "loss/aux_loss": 0.048096719570457934, "loss/crossentropy": 2.7327013194561003, "loss/logits": 0.898812472820282, "step": 15940 }, { "epoch": 0.1595, "grad_norm": 15.3125, "grad_norm_var": 124.77394205729166, "learning_rate": 0.0003, "loss": 12.2458, "loss/aux_loss": 0.04809980187565088, "loss/crossentropy": 2.7992530286312105, "loss/logits": 0.9126897126436233, "step": 15950 }, { "epoch": 0.1596, "grad_norm": 11.0, "grad_norm_var": 125.29425455729167, "learning_rate": 0.0003, "loss": 12.3109, "loss/aux_loss": 0.04810700826346874, "loss/crossentropy": 2.7022558569908144, "loss/logits": 0.965540987253189, "step": 15960 }, { "epoch": 0.1597, "grad_norm": 11.0625, "grad_norm_var": 0.098291015625, "learning_rate": 0.0003, "loss": 11.9925, "loss/aux_loss": 0.04811773002147675, "loss/crossentropy": 2.837008905410767, "loss/logits": 0.9231843024492263, "step": 15970 }, { "epoch": 0.1598, "grad_norm": 11.0625, "grad_norm_var": 0.2630208333333333, "learning_rate": 0.0003, "loss": 12.165, "loss/aux_loss": 0.04810309894382954, "loss/crossentropy": 2.728328824043274, "loss/logits": 0.9210437297821045, "step": 15980 }, { "epoch": 0.1599, "grad_norm": 11.0, "grad_norm_var": 0.19295247395833334, "learning_rate": 0.0003, "loss": 12.4097, "loss/aux_loss": 0.048101219907402994, "loss/crossentropy": 2.9919423699378966, "loss/logits": 0.926442277431488, "step": 15990 }, { "epoch": 0.16, "grad_norm": 10.875, "grad_norm_var": 0.126025390625, "learning_rate": 0.0003, "loss": 12.2974, "loss/aux_loss": 0.04809750020503998, "loss/crossentropy": 2.9499990582466125, "loss/logits": 0.987061333656311, "step": 16000 }, { "epoch": 0.1601, "grad_norm": 12.4375, "grad_norm_var": 0.20911458333333333, "learning_rate": 0.0003, "loss": 12.2875, "loss/aux_loss": 0.04809744451195001, "loss/crossentropy": 2.8682973265647886, "loss/logits": 0.9318195432424545, "step": 16010 }, { "epoch": 0.1602, "grad_norm": 11.5625, "grad_norm_var": 0.2109375, "learning_rate": 0.0003, "loss": 12.1403, "loss/aux_loss": 0.0480994550511241, "loss/crossentropy": 2.770287108421326, "loss/logits": 0.8830248892307282, "step": 16020 }, { "epoch": 0.1603, "grad_norm": 11.0625, "grad_norm_var": 0.4239420572916667, "learning_rate": 0.0003, "loss": 12.1359, "loss/aux_loss": 0.04810148365795612, "loss/crossentropy": 2.974120169878006, "loss/logits": 0.9349057674407959, "step": 16030 }, { "epoch": 0.1604, "grad_norm": 10.8125, "grad_norm_var": 0.47734375, "learning_rate": 0.0003, "loss": 12.0298, "loss/aux_loss": 0.048095306381583214, "loss/crossentropy": 2.8043901443481447, "loss/logits": 0.9003081053495408, "step": 16040 }, { "epoch": 0.1605, "grad_norm": 11.6875, "grad_norm_var": 0.278759765625, "learning_rate": 0.0003, "loss": 12.2773, "loss/aux_loss": 0.04809126928448677, "loss/crossentropy": 2.9143420457839966, "loss/logits": 0.9333951026201248, "step": 16050 }, { "epoch": 0.1606, "grad_norm": 10.9375, "grad_norm_var": 0.4009765625, "learning_rate": 0.0003, "loss": 12.0947, "loss/aux_loss": 0.048095472529530524, "loss/crossentropy": 2.952311336994171, "loss/logits": 0.9581076145172119, "step": 16060 }, { "epoch": 0.1607, "grad_norm": 11.125, "grad_norm_var": 0.2712890625, "learning_rate": 0.0003, "loss": 12.2128, "loss/aux_loss": 0.048100481182336806, "loss/crossentropy": 2.8516422152519225, "loss/logits": 0.9667297631502152, "step": 16070 }, { "epoch": 0.1608, "grad_norm": 11.8125, "grad_norm_var": 0.1984375, "learning_rate": 0.0003, "loss": 12.2553, "loss/aux_loss": 0.04809715617448092, "loss/crossentropy": 2.77035049200058, "loss/logits": 0.9237784296274185, "step": 16080 }, { "epoch": 0.1609, "grad_norm": 11.3125, "grad_norm_var": 0.3114420572916667, "learning_rate": 0.0003, "loss": 12.0383, "loss/aux_loss": 0.04809458721429109, "loss/crossentropy": 2.816128599643707, "loss/logits": 0.9450860530138016, "step": 16090 }, { "epoch": 0.161, "grad_norm": 11.4375, "grad_norm_var": 0.33839518229166665, "learning_rate": 0.0003, "loss": 12.1705, "loss/aux_loss": 0.048097101412713526, "loss/crossentropy": 2.6644342601299287, "loss/logits": 0.9285436570644379, "step": 16100 }, { "epoch": 0.1611, "grad_norm": 11.9375, "grad_norm_var": 0.247900390625, "learning_rate": 0.0003, "loss": 12.3188, "loss/aux_loss": 0.0481021337211132, "loss/crossentropy": 2.8370134472846984, "loss/logits": 0.9432329386472702, "step": 16110 }, { "epoch": 0.1612, "grad_norm": 11.375, "grad_norm_var": 0.17928059895833334, "learning_rate": 0.0003, "loss": 12.2744, "loss/aux_loss": 0.04810014273971319, "loss/crossentropy": 2.829314595460892, "loss/logits": 0.9058397889137269, "step": 16120 }, { "epoch": 0.1613, "grad_norm": 11.625, "grad_norm_var": 0.3004557291666667, "learning_rate": 0.0003, "loss": 12.1699, "loss/aux_loss": 0.048092078790068625, "loss/crossentropy": 2.7174839854240416, "loss/logits": 0.9196837037801743, "step": 16130 }, { "epoch": 0.1614, "grad_norm": 10.625, "grad_norm_var": 0.5082682291666667, "learning_rate": 0.0003, "loss": 12.1341, "loss/aux_loss": 0.04810208380222321, "loss/crossentropy": 2.823376166820526, "loss/logits": 0.9344120264053345, "step": 16140 }, { "epoch": 0.1615, "grad_norm": 10.75, "grad_norm_var": 0.36847330729166666, "learning_rate": 0.0003, "loss": 12.1209, "loss/aux_loss": 0.04809723366051912, "loss/crossentropy": 2.7950705885887146, "loss/logits": 0.9158975452184677, "step": 16150 }, { "epoch": 0.1616, "grad_norm": 11.25, "grad_norm_var": 0.3140625, "learning_rate": 0.0003, "loss": 12.1399, "loss/aux_loss": 0.04809668511152267, "loss/crossentropy": 2.8807433605194093, "loss/logits": 0.9192746669054032, "step": 16160 }, { "epoch": 0.1617, "grad_norm": 13.375, "grad_norm_var": 0.4009765625, "learning_rate": 0.0003, "loss": 12.2518, "loss/aux_loss": 0.04810331519693136, "loss/crossentropy": 2.7547273516654966, "loss/logits": 0.9467386364936828, "step": 16170 }, { "epoch": 0.1618, "grad_norm": 11.0625, "grad_norm_var": 0.496875, "learning_rate": 0.0003, "loss": 12.2401, "loss/aux_loss": 0.04808684252202511, "loss/crossentropy": 2.847959554195404, "loss/logits": 0.9240302503108978, "step": 16180 }, { "epoch": 0.1619, "grad_norm": 11.0625, "grad_norm_var": 0.312353515625, "learning_rate": 0.0003, "loss": 12.2344, "loss/aux_loss": 0.048099389672279357, "loss/crossentropy": 2.845101058483124, "loss/logits": 0.942423290014267, "step": 16190 }, { "epoch": 0.162, "grad_norm": 10.6875, "grad_norm_var": 0.22029622395833334, "learning_rate": 0.0003, "loss": 11.9915, "loss/aux_loss": 0.04809580724686384, "loss/crossentropy": 2.7099331617355347, "loss/logits": 0.9017234027385712, "step": 16200 }, { "epoch": 0.1621, "grad_norm": 10.9375, "grad_norm_var": 0.5179524739583333, "learning_rate": 0.0003, "loss": 11.9946, "loss/aux_loss": 0.048097424954175946, "loss/crossentropy": 2.884207457304001, "loss/logits": 0.8981190234422683, "step": 16210 }, { "epoch": 0.1622, "grad_norm": 11.875, "grad_norm_var": 0.238134765625, "learning_rate": 0.0003, "loss": 12.2356, "loss/aux_loss": 0.04810638912022114, "loss/crossentropy": 2.747766560316086, "loss/logits": 0.9162409037351609, "step": 16220 }, { "epoch": 0.1623, "grad_norm": 10.5, "grad_norm_var": 0.13644205729166667, "learning_rate": 0.0003, "loss": 11.9741, "loss/aux_loss": 0.04809376634657383, "loss/crossentropy": 2.8902024030685425, "loss/logits": 0.936252373456955, "step": 16230 }, { "epoch": 0.1624, "grad_norm": 11.8125, "grad_norm_var": 0.366259765625, "learning_rate": 0.0003, "loss": 12.3348, "loss/aux_loss": 0.048098774440586564, "loss/crossentropy": 2.803633749485016, "loss/logits": 0.973576670885086, "step": 16240 }, { "epoch": 0.1625, "grad_norm": 12.1875, "grad_norm_var": 0.278369140625, "learning_rate": 0.0003, "loss": 12.0583, "loss/aux_loss": 0.04810395650565624, "loss/crossentropy": 2.8404315412044525, "loss/logits": 0.9931963056325912, "step": 16250 }, { "epoch": 0.1626, "grad_norm": 12.375, "grad_norm_var": 0.3738932291666667, "learning_rate": 0.0003, "loss": 12.1916, "loss/aux_loss": 0.048100400157272814, "loss/crossentropy": 2.768079376220703, "loss/logits": 0.9406552851200104, "step": 16260 }, { "epoch": 0.1627, "grad_norm": 11.3125, "grad_norm_var": 0.28899739583333334, "learning_rate": 0.0003, "loss": 12.0582, "loss/aux_loss": 0.04809861071407795, "loss/crossentropy": 2.713436472415924, "loss/logits": 0.9488321393728256, "step": 16270 }, { "epoch": 0.1628, "grad_norm": 11.375, "grad_norm_var": 0.380322265625, "learning_rate": 0.0003, "loss": 12.0068, "loss/aux_loss": 0.04809640850871801, "loss/crossentropy": 2.780947434902191, "loss/logits": 0.9337312400341033, "step": 16280 }, { "epoch": 0.1629, "grad_norm": 11.875, "grad_norm_var": 0.439306640625, "learning_rate": 0.0003, "loss": 12.1232, "loss/aux_loss": 0.04809816125780344, "loss/crossentropy": 2.8880489349365233, "loss/logits": 0.929016700387001, "step": 16290 }, { "epoch": 0.163, "grad_norm": 11.6875, "grad_norm_var": 0.3094889322916667, "learning_rate": 0.0003, "loss": 11.988, "loss/aux_loss": 0.048106462322175504, "loss/crossentropy": 2.9447537541389464, "loss/logits": 0.9289344936609268, "step": 16300 }, { "epoch": 0.1631, "grad_norm": 11.5625, "grad_norm_var": 19.153059895833334, "learning_rate": 0.0003, "loss": 12.2841, "loss/aux_loss": 0.04810431189835072, "loss/crossentropy": 2.7848674178123476, "loss/logits": 0.9472992300987244, "step": 16310 }, { "epoch": 0.1632, "grad_norm": 12.0, "grad_norm_var": 17.702604166666667, "learning_rate": 0.0003, "loss": 12.2275, "loss/aux_loss": 0.04810850899666548, "loss/crossentropy": 2.7362507581710815, "loss/logits": 0.9090913355350494, "step": 16320 }, { "epoch": 0.1633, "grad_norm": 10.5, "grad_norm_var": 1.0449055989583333, "learning_rate": 0.0003, "loss": 12.1755, "loss/aux_loss": 0.04809928461909294, "loss/crossentropy": 2.870719301700592, "loss/logits": 0.953993484377861, "step": 16330 }, { "epoch": 0.1634, "grad_norm": 11.9375, "grad_norm_var": 0.270166015625, "learning_rate": 0.0003, "loss": 12.1613, "loss/aux_loss": 0.04809772912412882, "loss/crossentropy": 2.730927813053131, "loss/logits": 0.9100559711456299, "step": 16340 }, { "epoch": 0.1635, "grad_norm": 12.625, "grad_norm_var": 0.6403483072916667, "learning_rate": 0.0003, "loss": 11.9033, "loss/aux_loss": 0.04809962585568428, "loss/crossentropy": 2.6463473558425905, "loss/logits": 0.8951964765787125, "step": 16350 }, { "epoch": 0.1636, "grad_norm": 11.0, "grad_norm_var": 0.9075520833333334, "learning_rate": 0.0003, "loss": 12.2224, "loss/aux_loss": 0.04809305313974619, "loss/crossentropy": 2.7235575318336487, "loss/logits": 0.9235330730676651, "step": 16360 }, { "epoch": 0.1637, "grad_norm": 11.125, "grad_norm_var": 0.47433268229166664, "learning_rate": 0.0003, "loss": 12.0492, "loss/aux_loss": 0.04809541571885347, "loss/crossentropy": 2.8656546056270598, "loss/logits": 0.9521847158670426, "step": 16370 }, { "epoch": 0.1638, "grad_norm": 11.625, "grad_norm_var": 0.34256184895833336, "learning_rate": 0.0003, "loss": 12.1038, "loss/aux_loss": 0.048089108802378176, "loss/crossentropy": 2.8255446553230286, "loss/logits": 0.8946599334478378, "step": 16380 }, { "epoch": 0.1639, "grad_norm": 11.75, "grad_norm_var": 0.49581705729166664, "learning_rate": 0.0003, "loss": 12.1369, "loss/aux_loss": 0.04810045957565308, "loss/crossentropy": 2.6379716813564302, "loss/logits": 0.9489578425884246, "step": 16390 }, { "epoch": 0.164, "grad_norm": 10.4375, "grad_norm_var": 0.5274576822916667, "learning_rate": 0.0003, "loss": 12.1495, "loss/aux_loss": 0.0480981033295393, "loss/crossentropy": 2.691847151517868, "loss/logits": 0.9181078314781189, "step": 16400 }, { "epoch": 0.1641, "grad_norm": 11.25, "grad_norm_var": 6.025, "learning_rate": 0.0003, "loss": 12.1718, "loss/aux_loss": 0.048096432350575924, "loss/crossentropy": 2.829243075847626, "loss/logits": 0.9195181250572204, "step": 16410 }, { "epoch": 0.1642, "grad_norm": 11.25, "grad_norm_var": 2.2655598958333334, "learning_rate": 0.0003, "loss": 12.1507, "loss/aux_loss": 0.04810443092137575, "loss/crossentropy": 2.794591999053955, "loss/logits": 0.9034171938896179, "step": 16420 }, { "epoch": 0.1643, "grad_norm": 12.3125, "grad_norm_var": 0.42604166666666665, "learning_rate": 0.0003, "loss": 12.0589, "loss/aux_loss": 0.0480903310701251, "loss/crossentropy": 2.806131112575531, "loss/logits": 0.9333689689636231, "step": 16430 }, { "epoch": 0.1644, "grad_norm": 12.875, "grad_norm_var": 0.47291666666666665, "learning_rate": 0.0003, "loss": 12.1076, "loss/aux_loss": 0.04809165094047785, "loss/crossentropy": 3.009689784049988, "loss/logits": 0.9455327719449997, "step": 16440 }, { "epoch": 0.1645, "grad_norm": 12.3125, "grad_norm_var": 0.6640462239583333, "learning_rate": 0.0003, "loss": 12.1808, "loss/aux_loss": 0.04810215122997761, "loss/crossentropy": 2.7933058738708496, "loss/logits": 0.8973431855440139, "step": 16450 }, { "epoch": 0.1646, "grad_norm": 12.5625, "grad_norm_var": 0.501025390625, "learning_rate": 0.0003, "loss": 11.9219, "loss/aux_loss": 0.04809769950807095, "loss/crossentropy": 2.6854580640792847, "loss/logits": 0.9056837558746338, "step": 16460 }, { "epoch": 0.1647, "grad_norm": 12.5, "grad_norm_var": 0.2234375, "learning_rate": 0.0003, "loss": 12.1113, "loss/aux_loss": 0.048105937987565996, "loss/crossentropy": 2.7549439489841463, "loss/logits": 0.917845630645752, "step": 16470 }, { "epoch": 0.1648, "grad_norm": 11.75, "grad_norm_var": 0.2669270833333333, "learning_rate": 0.0003, "loss": 12.1535, "loss/aux_loss": 0.048096487298607826, "loss/crossentropy": 2.927453136444092, "loss/logits": 0.9296642661094665, "step": 16480 }, { "epoch": 0.1649, "grad_norm": 11.4375, "grad_norm_var": 0.35442708333333334, "learning_rate": 0.0003, "loss": 12.0322, "loss/aux_loss": 0.048098070360720155, "loss/crossentropy": 2.93691543340683, "loss/logits": 0.9586718380451202, "step": 16490 }, { "epoch": 0.165, "grad_norm": 12.0625, "grad_norm_var": 0.5493326822916667, "learning_rate": 0.0003, "loss": 12.214, "loss/aux_loss": 0.04809797964990139, "loss/crossentropy": 2.7319608986377717, "loss/logits": 0.9313073545694351, "step": 16500 }, { "epoch": 0.1651, "grad_norm": 11.0625, "grad_norm_var": 17.5791015625, "learning_rate": 0.0003, "loss": 12.113, "loss/aux_loss": 0.048102827928960326, "loss/crossentropy": 2.8502477288246153, "loss/logits": 0.9453782886266708, "step": 16510 }, { "epoch": 0.1652, "grad_norm": 10.875, "grad_norm_var": 0.16378580729166667, "learning_rate": 0.0003, "loss": 12.0053, "loss/aux_loss": 0.04810033868998289, "loss/crossentropy": 2.5949636459350587, "loss/logits": 0.8820204049348831, "step": 16520 }, { "epoch": 0.1653, "grad_norm": 11.125, "grad_norm_var": 0.48943684895833334, "learning_rate": 0.0003, "loss": 12.1139, "loss/aux_loss": 0.04809820037335157, "loss/crossentropy": 2.789997029304504, "loss/logits": 0.9239853471517563, "step": 16530 }, { "epoch": 0.1654, "grad_norm": 12.8125, "grad_norm_var": 0.37180989583333335, "learning_rate": 0.0003, "loss": 12.3409, "loss/aux_loss": 0.04809273220598698, "loss/crossentropy": 2.9401179909706117, "loss/logits": 0.9572012543678283, "step": 16540 }, { "epoch": 0.1655, "grad_norm": 11.375, "grad_norm_var": 0.5218587239583333, "learning_rate": 0.0003, "loss": 12.111, "loss/aux_loss": 0.04809644967317581, "loss/crossentropy": 2.735247939825058, "loss/logits": 0.9140194296836853, "step": 16550 }, { "epoch": 0.1656, "grad_norm": 13.4375, "grad_norm_var": 0.6286295572916667, "learning_rate": 0.0003, "loss": 12.1088, "loss/aux_loss": 0.048106629587709906, "loss/crossentropy": 2.5925142049789427, "loss/logits": 0.8779049098491669, "step": 16560 }, { "epoch": 0.1657, "grad_norm": 11.3125, "grad_norm_var": 0.7202473958333333, "learning_rate": 0.0003, "loss": 12.0626, "loss/aux_loss": 0.048098241165280345, "loss/crossentropy": 2.78861083984375, "loss/logits": 0.9169972121715546, "step": 16570 }, { "epoch": 0.1658, "grad_norm": 12.1875, "grad_norm_var": 0.7563639322916667, "learning_rate": 0.0003, "loss": 12.0808, "loss/aux_loss": 0.04810166098177433, "loss/crossentropy": 2.8476951360702514, "loss/logits": 0.9255498439073563, "step": 16580 }, { "epoch": 0.1659, "grad_norm": 11.1875, "grad_norm_var": 0.8036458333333333, "learning_rate": 0.0003, "loss": 11.9856, "loss/aux_loss": 0.048094157315790656, "loss/crossentropy": 2.7307428240776064, "loss/logits": 0.9049693077802659, "step": 16590 }, { "epoch": 0.166, "grad_norm": 11.4375, "grad_norm_var": 0.491650390625, "learning_rate": 0.0003, "loss": 12.0904, "loss/aux_loss": 0.048098260350525376, "loss/crossentropy": 2.7222547829151154, "loss/logits": 0.9318049371242523, "step": 16600 }, { "epoch": 0.1661, "grad_norm": 12.9375, "grad_norm_var": 0.5280598958333333, "learning_rate": 0.0003, "loss": 12.0952, "loss/aux_loss": 0.04809851739555597, "loss/crossentropy": 2.6276703774929047, "loss/logits": 0.8886691600084304, "step": 16610 }, { "epoch": 0.1662, "grad_norm": 11.9375, "grad_norm_var": 0.40260416666666665, "learning_rate": 0.0003, "loss": 12.1522, "loss/aux_loss": 0.048096096701920034, "loss/crossentropy": 2.756567734479904, "loss/logits": 0.9005006104707718, "step": 16620 }, { "epoch": 0.1663, "grad_norm": 12.25, "grad_norm_var": 0.397119140625, "learning_rate": 0.0003, "loss": 12.0698, "loss/aux_loss": 0.04808939378708601, "loss/crossentropy": 2.8644691705703735, "loss/logits": 0.9494952738285065, "step": 16630 }, { "epoch": 0.1664, "grad_norm": 10.875, "grad_norm_var": 0.304150390625, "learning_rate": 0.0003, "loss": 11.9855, "loss/aux_loss": 0.04809617009013891, "loss/crossentropy": 2.8036171019077303, "loss/logits": 0.9411976546049118, "step": 16640 }, { "epoch": 0.1665, "grad_norm": 11.8125, "grad_norm_var": 1.0863932291666667, "learning_rate": 0.0003, "loss": 12.1059, "loss/aux_loss": 0.04810191765427589, "loss/crossentropy": 2.8542271971702577, "loss/logits": 0.941945058107376, "step": 16650 }, { "epoch": 0.1666, "grad_norm": 11.6875, "grad_norm_var": 0.356103515625, "learning_rate": 0.0003, "loss": 12.1667, "loss/aux_loss": 0.048105718195438386, "loss/crossentropy": 2.759011608362198, "loss/logits": 0.911252424120903, "step": 16660 }, { "epoch": 0.1667, "grad_norm": 10.5, "grad_norm_var": 0.34140625, "learning_rate": 0.0003, "loss": 12.019, "loss/aux_loss": 0.048089655488729476, "loss/crossentropy": 2.7977048456668854, "loss/logits": 0.9163706332445145, "step": 16670 }, { "epoch": 0.1668, "grad_norm": 11.8125, "grad_norm_var": 0.4, "learning_rate": 0.0003, "loss": 12.0449, "loss/aux_loss": 0.04810653738677502, "loss/crossentropy": 2.8017389357089995, "loss/logits": 0.9295397102832794, "step": 16680 }, { "epoch": 0.1669, "grad_norm": 10.5625, "grad_norm_var": 0.34420572916666664, "learning_rate": 0.0003, "loss": 12.1962, "loss/aux_loss": 0.04809100721031427, "loss/crossentropy": 2.8505070567131043, "loss/logits": 0.9185640811920166, "step": 16690 }, { "epoch": 0.167, "grad_norm": 12.8125, "grad_norm_var": 1.0841145833333334, "learning_rate": 0.0003, "loss": 12.0777, "loss/aux_loss": 0.048104220815002915, "loss/crossentropy": 2.6532647252082824, "loss/logits": 0.9005499392747879, "step": 16700 }, { "epoch": 0.1671, "grad_norm": 11.25, "grad_norm_var": 1.1744140625, "learning_rate": 0.0003, "loss": 11.9404, "loss/aux_loss": 0.04809899311512709, "loss/crossentropy": 2.7995954275131227, "loss/logits": 0.906044989824295, "step": 16710 }, { "epoch": 0.1672, "grad_norm": 11.625, "grad_norm_var": 0.7273274739583333, "learning_rate": 0.0003, "loss": 11.93, "loss/aux_loss": 0.04809800013899803, "loss/crossentropy": 2.867034387588501, "loss/logits": 0.908442784845829, "step": 16720 }, { "epoch": 0.1673, "grad_norm": 12.5, "grad_norm_var": 0.760400390625, "learning_rate": 0.0003, "loss": 11.9833, "loss/aux_loss": 0.048098081909120086, "loss/crossentropy": 2.7534588992595674, "loss/logits": 0.9382378399372101, "step": 16730 }, { "epoch": 0.1674, "grad_norm": 12.0, "grad_norm_var": 0.29791666666666666, "learning_rate": 0.0003, "loss": 12.1122, "loss/aux_loss": 0.04809783697128296, "loss/crossentropy": 2.8023226737976072, "loss/logits": 0.9453730881214142, "step": 16740 }, { "epoch": 0.1675, "grad_norm": 11.3125, "grad_norm_var": 0.24816080729166667, "learning_rate": 0.0003, "loss": 12.0329, "loss/aux_loss": 0.04809421058744192, "loss/crossentropy": 2.7011972665786743, "loss/logits": 0.9254505336284637, "step": 16750 }, { "epoch": 0.1676, "grad_norm": 11.0625, "grad_norm_var": 0.3059895833333333, "learning_rate": 0.0003, "loss": 12.1991, "loss/aux_loss": 0.048098991997539996, "loss/crossentropy": 2.923702526092529, "loss/logits": 0.9767153590917588, "step": 16760 }, { "epoch": 0.1677, "grad_norm": 11.0, "grad_norm_var": 0.4400390625, "learning_rate": 0.0003, "loss": 12.2559, "loss/aux_loss": 0.048096513748168944, "loss/crossentropy": 2.9745861649513246, "loss/logits": 0.9630038678646088, "step": 16770 }, { "epoch": 0.1678, "grad_norm": 10.625, "grad_norm_var": 0.424072265625, "learning_rate": 0.0003, "loss": 12.0234, "loss/aux_loss": 0.048099739477038383, "loss/crossentropy": 2.6956757068634034, "loss/logits": 0.9454267978668213, "step": 16780 }, { "epoch": 0.1679, "grad_norm": 12.0, "grad_norm_var": 0.2275390625, "learning_rate": 0.0003, "loss": 12.1579, "loss/aux_loss": 0.04809470549225807, "loss/crossentropy": 2.8803565382957457, "loss/logits": 0.9466471463441849, "step": 16790 }, { "epoch": 0.168, "grad_norm": 11.0625, "grad_norm_var": 0.3606770833333333, "learning_rate": 0.0003, "loss": 12.0702, "loss/aux_loss": 0.048100711591541764, "loss/crossentropy": 2.769081395864487, "loss/logits": 0.9133290886878968, "step": 16800 }, { "epoch": 0.1681, "grad_norm": 11.3125, "grad_norm_var": 0.29542643229166665, "learning_rate": 0.0003, "loss": 12.026, "loss/aux_loss": 0.04810358509421349, "loss/crossentropy": 2.6967382431030273, "loss/logits": 0.9303892910480499, "step": 16810 }, { "epoch": 0.1682, "grad_norm": 12.0, "grad_norm_var": 0.1791015625, "learning_rate": 0.0003, "loss": 12.2407, "loss/aux_loss": 0.04810531884431839, "loss/crossentropy": 2.7483027279376984, "loss/logits": 0.9215909510850906, "step": 16820 }, { "epoch": 0.1683, "grad_norm": 12.4375, "grad_norm_var": 0.4901041666666667, "learning_rate": 0.0003, "loss": 12.0529, "loss/aux_loss": 0.04809485897421837, "loss/crossentropy": 2.6491159200668335, "loss/logits": 0.8965664654970169, "step": 16830 }, { "epoch": 0.1684, "grad_norm": 11.9375, "grad_norm_var": 0.496728515625, "learning_rate": 0.0003, "loss": 12.0189, "loss/aux_loss": 0.04810217395424843, "loss/crossentropy": 2.8553712725639344, "loss/logits": 0.9247318297624588, "step": 16840 }, { "epoch": 0.1685, "grad_norm": 11.6875, "grad_norm_var": 0.19192708333333333, "learning_rate": 0.0003, "loss": 12.2266, "loss/aux_loss": 0.04810085538774729, "loss/crossentropy": 2.841351580619812, "loss/logits": 0.9316177189350128, "step": 16850 }, { "epoch": 0.1686, "grad_norm": 12.1875, "grad_norm_var": 0.28097330729166664, "learning_rate": 0.0003, "loss": 12.166, "loss/aux_loss": 0.04810477644205093, "loss/crossentropy": 2.816389191150665, "loss/logits": 0.9328649133443833, "step": 16860 }, { "epoch": 0.1687, "grad_norm": 11.25, "grad_norm_var": 0.2598958333333333, "learning_rate": 0.0003, "loss": 12.0411, "loss/aux_loss": 0.04809478260576725, "loss/crossentropy": 2.9236293196678163, "loss/logits": 0.9430270612239837, "step": 16870 }, { "epoch": 0.1688, "grad_norm": 11.0, "grad_norm_var": 0.2899576822916667, "learning_rate": 0.0003, "loss": 11.9808, "loss/aux_loss": 0.0481045238673687, "loss/crossentropy": 2.7350330710411073, "loss/logits": 0.8710766971111298, "step": 16880 }, { "epoch": 0.1689, "grad_norm": 11.3125, "grad_norm_var": 0.5572265625, "learning_rate": 0.0003, "loss": 12.021, "loss/aux_loss": 0.048098239861428735, "loss/crossentropy": 2.7996289134025574, "loss/logits": 0.8801421314477921, "step": 16890 }, { "epoch": 0.169, "grad_norm": 11.4375, "grad_norm_var": 0.31573893229166666, "learning_rate": 0.0003, "loss": 12.0939, "loss/aux_loss": 0.048104897141456604, "loss/crossentropy": 2.7211228966712953, "loss/logits": 0.9303423374891281, "step": 16900 }, { "epoch": 0.1691, "grad_norm": 12.5, "grad_norm_var": 0.6618326822916667, "learning_rate": 0.0003, "loss": 12.0377, "loss/aux_loss": 0.04810284618288278, "loss/crossentropy": 2.7154513716697695, "loss/logits": 0.9080936968326568, "step": 16910 }, { "epoch": 0.1692, "grad_norm": 12.5625, "grad_norm_var": 0.7940104166666667, "learning_rate": 0.0003, "loss": 12.0661, "loss/aux_loss": 0.0480948593467474, "loss/crossentropy": 2.756969064474106, "loss/logits": 0.9487773150205612, "step": 16920 }, { "epoch": 0.1693, "grad_norm": 10.5625, "grad_norm_var": 0.402197265625, "learning_rate": 0.0003, "loss": 12.1444, "loss/aux_loss": 0.04810402132570744, "loss/crossentropy": 2.7484578788280487, "loss/logits": 0.9173682719469071, "step": 16930 }, { "epoch": 0.1694, "grad_norm": 12.125, "grad_norm_var": 0.2416015625, "learning_rate": 0.0003, "loss": 12.0027, "loss/aux_loss": 0.04809407070279122, "loss/crossentropy": 2.719779831171036, "loss/logits": 0.9158334016799927, "step": 16940 }, { "epoch": 0.1695, "grad_norm": 12.3125, "grad_norm_var": 0.28203125, "learning_rate": 0.0003, "loss": 12.0734, "loss/aux_loss": 0.04810118954628706, "loss/crossentropy": 2.806976354122162, "loss/logits": 0.8893602877855301, "step": 16950 }, { "epoch": 0.1696, "grad_norm": 11.25, "grad_norm_var": 1.8876139322916667, "learning_rate": 0.0003, "loss": 12.0654, "loss/aux_loss": 0.04809729289263487, "loss/crossentropy": 2.8548884272575377, "loss/logits": 0.9692226439714432, "step": 16960 }, { "epoch": 0.1697, "grad_norm": 10.5625, "grad_norm_var": 0.314697265625, "learning_rate": 0.0003, "loss": 12.1883, "loss/aux_loss": 0.048106925748288634, "loss/crossentropy": 2.9143474459648133, "loss/logits": 0.9318220674991607, "step": 16970 }, { "epoch": 0.1698, "grad_norm": 12.625, "grad_norm_var": 1.1700358072916666, "learning_rate": 0.0003, "loss": 11.9753, "loss/aux_loss": 0.04809020813554525, "loss/crossentropy": 2.8300904273986816, "loss/logits": 0.9694455862045288, "step": 16980 }, { "epoch": 0.1699, "grad_norm": 11.4375, "grad_norm_var": 0.403125, "learning_rate": 0.0003, "loss": 12.2506, "loss/aux_loss": 0.048101211339235304, "loss/crossentropy": 2.794898247718811, "loss/logits": 0.9496973544359207, "step": 16990 }, { "epoch": 0.17, "grad_norm": 12.0625, "grad_norm_var": 0.43333333333333335, "learning_rate": 0.0003, "loss": 12.101, "loss/aux_loss": 0.04809295553714037, "loss/crossentropy": 3.0219761967658996, "loss/logits": 0.9177909851074219, "step": 17000 }, { "epoch": 0.1701, "grad_norm": 15.75, "grad_norm_var": 96.87667643229166, "learning_rate": 0.0003, "loss": 12.0937, "loss/aux_loss": 0.0481051966547966, "loss/crossentropy": 2.9502204298973083, "loss/logits": 0.9211991935968399, "step": 17010 }, { "epoch": 0.1702, "grad_norm": 11.75, "grad_norm_var": 2.8739420572916665, "learning_rate": 0.0003, "loss": 12.0823, "loss/aux_loss": 0.04810426589101553, "loss/crossentropy": 2.6327461183071135, "loss/logits": 0.923973485827446, "step": 17020 }, { "epoch": 0.1703, "grad_norm": 11.375, "grad_norm_var": 1.0531087239583334, "learning_rate": 0.0003, "loss": 12.0928, "loss/aux_loss": 0.04810360558331013, "loss/crossentropy": 3.002879500389099, "loss/logits": 0.9532357782125473, "step": 17030 }, { "epoch": 0.1704, "grad_norm": 11.1875, "grad_norm_var": 0.63046875, "learning_rate": 0.0003, "loss": 12.0248, "loss/aux_loss": 0.0480900751426816, "loss/crossentropy": 2.7537549138069153, "loss/logits": 0.9276573568582535, "step": 17040 }, { "epoch": 0.1705, "grad_norm": 13.375, "grad_norm_var": 1.6624837239583334, "learning_rate": 0.0003, "loss": 12.0054, "loss/aux_loss": 0.04810062348842621, "loss/crossentropy": 2.884317523241043, "loss/logits": 0.9326794624328614, "step": 17050 }, { "epoch": 0.1706, "grad_norm": 11.5, "grad_norm_var": 1.736181640625, "learning_rate": 0.0003, "loss": 11.8953, "loss/aux_loss": 0.04811934363096952, "loss/crossentropy": 2.6659162402153016, "loss/logits": 0.8868398576974869, "step": 17060 }, { "epoch": 0.1707, "grad_norm": 11.875, "grad_norm_var": 0.5770833333333333, "learning_rate": 0.0003, "loss": 12.0149, "loss/aux_loss": 0.04809251334518194, "loss/crossentropy": 2.749342954158783, "loss/logits": 0.9031396269798279, "step": 17070 }, { "epoch": 0.1708, "grad_norm": 11.875, "grad_norm_var": 0.2526041666666667, "learning_rate": 0.0003, "loss": 12.1526, "loss/aux_loss": 0.048104824125766756, "loss/crossentropy": 2.856028115749359, "loss/logits": 0.959146237373352, "step": 17080 }, { "epoch": 0.1709, "grad_norm": 11.625, "grad_norm_var": 0.1916015625, "learning_rate": 0.0003, "loss": 12.0983, "loss/aux_loss": 0.04809093903750181, "loss/crossentropy": 2.8145798802375794, "loss/logits": 0.9006113916635513, "step": 17090 }, { "epoch": 0.171, "grad_norm": 12.3125, "grad_norm_var": 0.22337239583333332, "learning_rate": 0.0003, "loss": 12.1089, "loss/aux_loss": 0.04809470176696777, "loss/crossentropy": 2.878612220287323, "loss/logits": 0.9135033786296844, "step": 17100 }, { "epoch": 0.1711, "grad_norm": 11.375, "grad_norm_var": 54.241129557291664, "learning_rate": 0.0003, "loss": 12.0472, "loss/aux_loss": 0.048102441057562825, "loss/crossentropy": 2.811024880409241, "loss/logits": 0.8989885419607162, "step": 17110 }, { "epoch": 0.1712, "grad_norm": 11.5, "grad_norm_var": 0.40206705729166664, "learning_rate": 0.0003, "loss": 12.1958, "loss/aux_loss": 0.048090110532939434, "loss/crossentropy": 2.8124298572540285, "loss/logits": 0.9289597928524017, "step": 17120 }, { "epoch": 0.1713, "grad_norm": 11.625, "grad_norm_var": 0.2353515625, "learning_rate": 0.0003, "loss": 11.8911, "loss/aux_loss": 0.04809228479862213, "loss/crossentropy": 2.876737803220749, "loss/logits": 0.9436014890670776, "step": 17130 }, { "epoch": 0.1714, "grad_norm": 11.375, "grad_norm_var": 0.2886555989583333, "learning_rate": 0.0003, "loss": 11.9557, "loss/aux_loss": 0.048099903389811516, "loss/crossentropy": 2.8804137110710144, "loss/logits": 0.938829579949379, "step": 17140 }, { "epoch": 0.1715, "grad_norm": 10.6875, "grad_norm_var": 0.260791015625, "learning_rate": 0.0003, "loss": 11.852, "loss/aux_loss": 0.04809135273098945, "loss/crossentropy": 2.7023903012275694, "loss/logits": 0.8962929219007492, "step": 17150 }, { "epoch": 0.1716, "grad_norm": 11.5625, "grad_norm_var": 0.3447265625, "learning_rate": 0.0003, "loss": 12.158, "loss/aux_loss": 0.048096888884902, "loss/crossentropy": 2.827605813741684, "loss/logits": 0.9441530287265778, "step": 17160 }, { "epoch": 0.1717, "grad_norm": 13.5, "grad_norm_var": 0.7931640625, "learning_rate": 0.0003, "loss": 12.1546, "loss/aux_loss": 0.048093185387551786, "loss/crossentropy": 2.814880883693695, "loss/logits": 0.9108005404472351, "step": 17170 }, { "epoch": 0.1718, "grad_norm": 11.625, "grad_norm_var": 3.0380208333333334, "learning_rate": 0.0003, "loss": 11.962, "loss/aux_loss": 0.04810141772031784, "loss/crossentropy": 2.7444641530513763, "loss/logits": 0.9487886667251587, "step": 17180 }, { "epoch": 0.1719, "grad_norm": 11.6875, "grad_norm_var": 2.831770833333333, "learning_rate": 0.0003, "loss": 12.1348, "loss/aux_loss": 0.04809289593249559, "loss/crossentropy": 2.7105092108249664, "loss/logits": 0.9182222783565521, "step": 17190 }, { "epoch": 0.172, "grad_norm": 11.375, "grad_norm_var": 0.474853515625, "learning_rate": 0.0003, "loss": 12.1075, "loss/aux_loss": 0.048097463138401506, "loss/crossentropy": 2.8113415241241455, "loss/logits": 0.9427078306674957, "step": 17200 }, { "epoch": 0.1721, "grad_norm": 12.375, "grad_norm_var": 0.2786458333333333, "learning_rate": 0.0003, "loss": 12.14, "loss/aux_loss": 0.04809903036803007, "loss/crossentropy": 2.9176873922348023, "loss/logits": 0.9191664904356003, "step": 17210 }, { "epoch": 0.1722, "grad_norm": 10.3125, "grad_norm_var": 0.5504557291666666, "learning_rate": 0.0003, "loss": 12.0308, "loss/aux_loss": 0.04809430036693811, "loss/crossentropy": 2.64280886054039, "loss/logits": 0.8799058675765992, "step": 17220 }, { "epoch": 0.1723, "grad_norm": 11.3125, "grad_norm_var": 0.4051432291666667, "learning_rate": 0.0003, "loss": 12.1222, "loss/aux_loss": 0.048100571148097515, "loss/crossentropy": 2.9199374198913572, "loss/logits": 0.9405399680137634, "step": 17230 }, { "epoch": 0.1724, "grad_norm": 11.1875, "grad_norm_var": 0.29464518229166664, "learning_rate": 0.0003, "loss": 12.2574, "loss/aux_loss": 0.04809443484991789, "loss/crossentropy": 2.7939966559410094, "loss/logits": 0.9183706283569336, "step": 17240 }, { "epoch": 0.1725, "grad_norm": 12.0, "grad_norm_var": 0.2562337239583333, "learning_rate": 0.0003, "loss": 12.0131, "loss/aux_loss": 0.048107451759278774, "loss/crossentropy": 2.779514318704605, "loss/logits": 0.9068025201559067, "step": 17250 }, { "epoch": 0.1726, "grad_norm": 11.8125, "grad_norm_var": 0.39108072916666664, "learning_rate": 0.0003, "loss": 11.9615, "loss/aux_loss": 0.04809543266892433, "loss/crossentropy": 2.817984676361084, "loss/logits": 0.9144764870405198, "step": 17260 }, { "epoch": 0.1727, "grad_norm": 11.625, "grad_norm_var": 2.011458333333333, "learning_rate": 0.0003, "loss": 11.9494, "loss/aux_loss": 0.04809968285262585, "loss/crossentropy": 2.9492964446544647, "loss/logits": 0.9344431668519974, "step": 17270 }, { "epoch": 0.1728, "grad_norm": 13.0, "grad_norm_var": 2.0067545572916665, "learning_rate": 0.0003, "loss": 11.9271, "loss/aux_loss": 0.04809718765318394, "loss/crossentropy": 2.7608347654342653, "loss/logits": 0.9011356472969055, "step": 17280 }, { "epoch": 0.1729, "grad_norm": 11.8125, "grad_norm_var": 0.7468098958333333, "learning_rate": 0.0003, "loss": 12.0838, "loss/aux_loss": 0.048104557767510416, "loss/crossentropy": 2.7879473209381103, "loss/logits": 0.9154693454504013, "step": 17290 }, { "epoch": 0.173, "grad_norm": 11.625, "grad_norm_var": 0.749072265625, "learning_rate": 0.0003, "loss": 11.9908, "loss/aux_loss": 0.048098857142031194, "loss/crossentropy": 2.6744504272937775, "loss/logits": 0.8712642341852188, "step": 17300 }, { "epoch": 0.1731, "grad_norm": 12.0625, "grad_norm_var": 0.38671875, "learning_rate": 0.0003, "loss": 12.1573, "loss/aux_loss": 0.048095573857426646, "loss/crossentropy": 3.0483207941055297, "loss/logits": 0.93597452044487, "step": 17310 }, { "epoch": 0.1732, "grad_norm": 12.5625, "grad_norm_var": 0.15818684895833332, "learning_rate": 0.0003, "loss": 11.8331, "loss/aux_loss": 0.04809464327991009, "loss/crossentropy": 2.7563810288906097, "loss/logits": 0.8930452913045883, "step": 17320 }, { "epoch": 0.1733, "grad_norm": 11.875, "grad_norm_var": 50.563395182291664, "learning_rate": 0.0003, "loss": 12.0362, "loss/aux_loss": 0.048101813159883024, "loss/crossentropy": 2.807816767692566, "loss/logits": 0.9140335559844971, "step": 17330 }, { "epoch": 0.1734, "grad_norm": 12.125, "grad_norm_var": 50.9265625, "learning_rate": 0.0003, "loss": 11.9131, "loss/aux_loss": 0.048089164309203625, "loss/crossentropy": 2.7212966203689577, "loss/logits": 0.9433120638132095, "step": 17340 }, { "epoch": 0.1735, "grad_norm": 12.25, "grad_norm_var": 0.15402018229166667, "learning_rate": 0.0003, "loss": 12.1065, "loss/aux_loss": 0.04809843562543392, "loss/crossentropy": 2.6257729053497316, "loss/logits": 0.8846357733011245, "step": 17350 }, { "epoch": 0.1736, "grad_norm": 12.0625, "grad_norm_var": 0.445166015625, "learning_rate": 0.0003, "loss": 12.0886, "loss/aux_loss": 0.04810376763343811, "loss/crossentropy": 2.8265872836112975, "loss/logits": 0.9571549206972122, "step": 17360 }, { "epoch": 0.1737, "grad_norm": 11.0625, "grad_norm_var": 0.4348958333333333, "learning_rate": 0.0003, "loss": 12.0218, "loss/aux_loss": 0.04809571448713541, "loss/crossentropy": 2.695615494251251, "loss/logits": 0.9150578171014786, "step": 17370 }, { "epoch": 0.1738, "grad_norm": 11.8125, "grad_norm_var": 0.46139322916666664, "learning_rate": 0.0003, "loss": 12.2693, "loss/aux_loss": 0.04809955190867186, "loss/crossentropy": 2.6700818240642548, "loss/logits": 0.9094936668872833, "step": 17380 }, { "epoch": 0.1739, "grad_norm": 11.1875, "grad_norm_var": 0.7393229166666667, "learning_rate": 0.0003, "loss": 12.0212, "loss/aux_loss": 0.048098945058882236, "loss/crossentropy": 2.7970273315906526, "loss/logits": 0.8984217762947082, "step": 17390 }, { "epoch": 0.174, "grad_norm": 12.8125, "grad_norm_var": 1.09375, "learning_rate": 0.0003, "loss": 12.0838, "loss/aux_loss": 0.048093376122415064, "loss/crossentropy": 2.8114991784095764, "loss/logits": 0.8884566456079483, "step": 17400 }, { "epoch": 0.1741, "grad_norm": 12.375, "grad_norm_var": 1.1936848958333333, "learning_rate": 0.0003, "loss": 12.0995, "loss/aux_loss": 0.04810470137745142, "loss/crossentropy": 2.7583046913146974, "loss/logits": 0.9460157155990601, "step": 17410 }, { "epoch": 0.1742, "grad_norm": 25.875, "grad_norm_var": 12.618733723958334, "learning_rate": 0.0003, "loss": 12.1705, "loss/aux_loss": 0.04809574950486421, "loss/crossentropy": 2.821639972925186, "loss/logits": 0.920597642660141, "step": 17420 }, { "epoch": 0.1743, "grad_norm": 12.0, "grad_norm_var": 13.774593098958333, "learning_rate": 0.0003, "loss": 12.1343, "loss/aux_loss": 0.04809047318994999, "loss/crossentropy": 2.9120493054389955, "loss/logits": 0.9170797854661942, "step": 17430 }, { "epoch": 0.1744, "grad_norm": 12.4375, "grad_norm_var": 2.804541015625, "learning_rate": 0.0003, "loss": 12.0075, "loss/aux_loss": 0.0480960488319397, "loss/crossentropy": 2.7624635457992555, "loss/logits": 0.8919235855340958, "step": 17440 }, { "epoch": 0.1745, "grad_norm": 11.6875, "grad_norm_var": 0.469775390625, "learning_rate": 0.0003, "loss": 12.1544, "loss/aux_loss": 0.048093258403241634, "loss/crossentropy": 2.8480118989944456, "loss/logits": 0.9208219617605209, "step": 17450 }, { "epoch": 0.1746, "grad_norm": 10.875, "grad_norm_var": 0.167041015625, "learning_rate": 0.0003, "loss": 12.038, "loss/aux_loss": 0.048100156150758265, "loss/crossentropy": 2.81412872672081, "loss/logits": 0.925744378566742, "step": 17460 }, { "epoch": 0.1747, "grad_norm": 11.1875, "grad_norm_var": 0.22317708333333333, "learning_rate": 0.0003, "loss": 12.0594, "loss/aux_loss": 0.04808522202074528, "loss/crossentropy": 2.782361996173859, "loss/logits": 0.9385877996683121, "step": 17470 }, { "epoch": 0.1748, "grad_norm": 12.3125, "grad_norm_var": 23.0869140625, "learning_rate": 0.0003, "loss": 12.1986, "loss/aux_loss": 0.04810808375477791, "loss/crossentropy": 2.819118005037308, "loss/logits": 0.9407922476530075, "step": 17480 }, { "epoch": 0.1749, "grad_norm": 11.5625, "grad_norm_var": 22.847119140625, "learning_rate": 0.0003, "loss": 11.9801, "loss/aux_loss": 0.04809485077857971, "loss/crossentropy": 2.8491112112998964, "loss/logits": 0.9401687920093537, "step": 17490 }, { "epoch": 0.175, "grad_norm": 12.6875, "grad_norm_var": 0.262744140625, "learning_rate": 0.0003, "loss": 11.9604, "loss/aux_loss": 0.04808875843882561, "loss/crossentropy": 2.8413546562194822, "loss/logits": 0.9534878820180893, "step": 17500 }, { "epoch": 0.1751, "grad_norm": 11.4375, "grad_norm_var": 0.3395182291666667, "learning_rate": 0.0003, "loss": 12.1045, "loss/aux_loss": 0.048095555044710636, "loss/crossentropy": 2.7935108840465546, "loss/logits": 0.9017595887184143, "step": 17510 }, { "epoch": 0.1752, "grad_norm": 11.0, "grad_norm_var": 0.33274739583333335, "learning_rate": 0.0003, "loss": 12.0272, "loss/aux_loss": 0.04809608049690724, "loss/crossentropy": 2.933014285564423, "loss/logits": 0.9154089689254761, "step": 17520 }, { "epoch": 0.1753, "grad_norm": 11.8125, "grad_norm_var": 0.2848307291666667, "learning_rate": 0.0003, "loss": 12.0961, "loss/aux_loss": 0.04810178428888321, "loss/crossentropy": 2.8541224718093874, "loss/logits": 0.9548793703317642, "step": 17530 }, { "epoch": 0.1754, "grad_norm": 10.6875, "grad_norm_var": 0.43307291666666664, "learning_rate": 0.0003, "loss": 12.0888, "loss/aux_loss": 0.04808586481958628, "loss/crossentropy": 2.7221501886844637, "loss/logits": 0.9254509091377259, "step": 17540 }, { "epoch": 0.1755, "grad_norm": 10.9375, "grad_norm_var": 0.595556640625, "learning_rate": 0.0003, "loss": 11.8596, "loss/aux_loss": 0.04811476822942495, "loss/crossentropy": 2.8995654344558717, "loss/logits": 0.9064432740211487, "step": 17550 }, { "epoch": 0.1756, "grad_norm": 12.3125, "grad_norm_var": 0.382666015625, "learning_rate": 0.0003, "loss": 11.7278, "loss/aux_loss": 0.048095325380563735, "loss/crossentropy": 2.797735607624054, "loss/logits": 0.8821221351623535, "step": 17560 }, { "epoch": 0.1757, "grad_norm": 11.125, "grad_norm_var": 0.3997395833333333, "learning_rate": 0.0003, "loss": 11.8596, "loss/aux_loss": 0.04810337759554386, "loss/crossentropy": 2.7168959975242615, "loss/logits": 0.8649186968803406, "step": 17570 }, { "epoch": 0.1758, "grad_norm": 12.0, "grad_norm_var": 0.5856608072916667, "learning_rate": 0.0003, "loss": 12.1123, "loss/aux_loss": 0.0481048546731472, "loss/crossentropy": 2.896924364566803, "loss/logits": 0.9432176023721695, "step": 17580 }, { "epoch": 0.1759, "grad_norm": 11.3125, "grad_norm_var": 0.663916015625, "learning_rate": 0.0003, "loss": 12.1182, "loss/aux_loss": 0.048101380653679374, "loss/crossentropy": 2.7174128890037537, "loss/logits": 0.8965833187103271, "step": 17590 }, { "epoch": 0.176, "grad_norm": 12.0625, "grad_norm_var": 33.6681640625, "learning_rate": 0.0003, "loss": 12.0515, "loss/aux_loss": 0.04809325095266104, "loss/crossentropy": 2.6860816717147826, "loss/logits": 0.9306042581796646, "step": 17600 }, { "epoch": 0.1761, "grad_norm": 11.0, "grad_norm_var": 32.864583333333336, "learning_rate": 0.0003, "loss": 12.0491, "loss/aux_loss": 0.04810206014662981, "loss/crossentropy": 2.795276200771332, "loss/logits": 0.9053617566823959, "step": 17610 }, { "epoch": 0.1762, "grad_norm": 11.0625, "grad_norm_var": 0.749462890625, "learning_rate": 0.0003, "loss": 11.947, "loss/aux_loss": 0.04809454921633005, "loss/crossentropy": 2.6671301662921905, "loss/logits": 0.9019128113985062, "step": 17620 }, { "epoch": 0.1763, "grad_norm": 12.75, "grad_norm_var": 0.48318684895833336, "learning_rate": 0.0003, "loss": 12.1508, "loss/aux_loss": 0.04808996580541134, "loss/crossentropy": 2.8986705422401426, "loss/logits": 0.9507706761360168, "step": 17630 }, { "epoch": 0.1764, "grad_norm": 11.75, "grad_norm_var": 1.9614420572916667, "learning_rate": 0.0003, "loss": 11.9558, "loss/aux_loss": 0.04809357337653637, "loss/crossentropy": 2.818922591209412, "loss/logits": 0.890010553598404, "step": 17640 }, { "epoch": 0.1765, "grad_norm": 11.25, "grad_norm_var": 2.1322265625, "learning_rate": 0.0003, "loss": 11.9349, "loss/aux_loss": 0.04809540584683418, "loss/crossentropy": 2.8297097086906433, "loss/logits": 0.9366719990968704, "step": 17650 }, { "epoch": 0.1766, "grad_norm": 12.4375, "grad_norm_var": 0.9899576822916667, "learning_rate": 0.0003, "loss": 12.1478, "loss/aux_loss": 0.048089148849248885, "loss/crossentropy": 2.843839108943939, "loss/logits": 0.9119983077049255, "step": 17660 }, { "epoch": 0.1767, "grad_norm": 12.0, "grad_norm_var": 0.7913899739583333, "learning_rate": 0.0003, "loss": 12.1032, "loss/aux_loss": 0.04810617808252573, "loss/crossentropy": 2.8277599930763246, "loss/logits": 0.9263883680105209, "step": 17670 }, { "epoch": 0.1768, "grad_norm": 13.5625, "grad_norm_var": 0.5104166666666666, "learning_rate": 0.0003, "loss": 12.0376, "loss/aux_loss": 0.0480927873402834, "loss/crossentropy": 2.8370666086673735, "loss/logits": 0.9011499643325805, "step": 17680 }, { "epoch": 0.1769, "grad_norm": 12.6875, "grad_norm_var": 0.5216145833333333, "learning_rate": 0.0003, "loss": 11.9335, "loss/aux_loss": 0.048093126900494096, "loss/crossentropy": 2.7675021648406983, "loss/logits": 0.8972540199756622, "step": 17690 }, { "epoch": 0.177, "grad_norm": 11.75, "grad_norm_var": 0.34739583333333335, "learning_rate": 0.0003, "loss": 11.8338, "loss/aux_loss": 0.04809641428291798, "loss/crossentropy": 2.691696697473526, "loss/logits": 0.8934990376234054, "step": 17700 }, { "epoch": 0.1771, "grad_norm": 12.125, "grad_norm_var": 0.236572265625, "learning_rate": 0.0003, "loss": 11.9839, "loss/aux_loss": 0.04809584002941847, "loss/crossentropy": 2.9292188465595244, "loss/logits": 0.9080984503030777, "step": 17710 }, { "epoch": 0.1772, "grad_norm": 13.25, "grad_norm_var": 0.27786458333333336, "learning_rate": 0.0003, "loss": 12.0181, "loss/aux_loss": 0.048103841580450536, "loss/crossentropy": 2.6175199866294863, "loss/logits": 0.9136331707239151, "step": 17720 }, { "epoch": 0.1773, "grad_norm": 12.75, "grad_norm_var": 0.5515625, "learning_rate": 0.0003, "loss": 11.9502, "loss/aux_loss": 0.04809474535286427, "loss/crossentropy": 2.9119593143463134, "loss/logits": 0.9304135531187058, "step": 17730 }, { "epoch": 0.1774, "grad_norm": 11.5, "grad_norm_var": 0.445947265625, "learning_rate": 0.0003, "loss": 12.0916, "loss/aux_loss": 0.0480996023863554, "loss/crossentropy": 2.8041651487350463, "loss/logits": 0.9179874926805496, "step": 17740 }, { "epoch": 0.1775, "grad_norm": 11.9375, "grad_norm_var": 0.238916015625, "learning_rate": 0.0003, "loss": 11.9421, "loss/aux_loss": 0.04809048194438219, "loss/crossentropy": 2.9143458247184753, "loss/logits": 0.9369097352027893, "step": 17750 }, { "epoch": 0.1776, "grad_norm": 12.875, "grad_norm_var": 0.49993489583333334, "learning_rate": 0.0003, "loss": 12.0297, "loss/aux_loss": 0.04809221494942904, "loss/crossentropy": 2.77188703417778, "loss/logits": 0.8611804962158203, "step": 17760 }, { "epoch": 0.1777, "grad_norm": 19.0, "grad_norm_var": 3.628889973958333, "learning_rate": 0.0003, "loss": 12.1735, "loss/aux_loss": 0.04809418804943562, "loss/crossentropy": 2.866736590862274, "loss/logits": 0.9347006261348725, "step": 17770 }, { "epoch": 0.1778, "grad_norm": 11.125, "grad_norm_var": 3.622509765625, "learning_rate": 0.0003, "loss": 11.8912, "loss/aux_loss": 0.048098650947213176, "loss/crossentropy": 2.8250136613845824, "loss/logits": 0.945642602443695, "step": 17780 }, { "epoch": 0.1779, "grad_norm": 11.125, "grad_norm_var": 0.2462890625, "learning_rate": 0.0003, "loss": 12.0138, "loss/aux_loss": 0.0481024345383048, "loss/crossentropy": 2.750953811407089, "loss/logits": 0.8871120661497116, "step": 17790 }, { "epoch": 0.178, "grad_norm": 12.25, "grad_norm_var": 109.72862955729167, "learning_rate": 0.0003, "loss": 11.8782, "loss/aux_loss": 0.048107188753783704, "loss/crossentropy": 2.9277958452701567, "loss/logits": 0.9459708213806153, "step": 17800 }, { "epoch": 0.1781, "grad_norm": 13.8125, "grad_norm_var": 3.061393229166667, "learning_rate": 0.0003, "loss": 12.0301, "loss/aux_loss": 0.048091620206832886, "loss/crossentropy": 2.8570632517337797, "loss/logits": 0.9262526482343674, "step": 17810 }, { "epoch": 0.1782, "grad_norm": 10.5625, "grad_norm_var": 3.207275390625, "learning_rate": 0.0003, "loss": 12.0702, "loss/aux_loss": 0.0481030935421586, "loss/crossentropy": 2.7938737750053404, "loss/logits": 0.9401546657085419, "step": 17820 }, { "epoch": 0.1783, "grad_norm": 11.875, "grad_norm_var": 2.024853515625, "learning_rate": 0.0003, "loss": 11.9894, "loss/aux_loss": 0.04810605850070715, "loss/crossentropy": 2.67935990691185, "loss/logits": 0.8830744028091431, "step": 17830 }, { "epoch": 0.1784, "grad_norm": 12.8125, "grad_norm_var": 4.722900390625, "learning_rate": 0.0003, "loss": 12.0205, "loss/aux_loss": 0.04809609260410071, "loss/crossentropy": 2.708746474981308, "loss/logits": 0.9178021907806396, "step": 17840 }, { "epoch": 0.1785, "grad_norm": 11.1875, "grad_norm_var": 4.58671875, "learning_rate": 0.0003, "loss": 12.012, "loss/aux_loss": 0.04809623472392559, "loss/crossentropy": 2.7406187474727632, "loss/logits": 0.9204235941171646, "step": 17850 }, { "epoch": 0.1786, "grad_norm": 12.0625, "grad_norm_var": 0.156103515625, "learning_rate": 0.0003, "loss": 12.0759, "loss/aux_loss": 0.04808931238949299, "loss/crossentropy": 2.822909486293793, "loss/logits": 0.9199528455734253, "step": 17860 }, { "epoch": 0.1787, "grad_norm": 11.75, "grad_norm_var": 0.38331705729166665, "learning_rate": 0.0003, "loss": 12.0255, "loss/aux_loss": 0.048098467849195005, "loss/crossentropy": 2.9027469515800477, "loss/logits": 0.9527244418859482, "step": 17870 }, { "epoch": 0.1788, "grad_norm": 11.375, "grad_norm_var": 0.349462890625, "learning_rate": 0.0003, "loss": 11.9763, "loss/aux_loss": 0.048094440065324304, "loss/crossentropy": 2.8290345549583433, "loss/logits": 0.965818139910698, "step": 17880 }, { "epoch": 0.1789, "grad_norm": 12.0625, "grad_norm_var": 15.248681640625, "learning_rate": 0.0003, "loss": 11.9306, "loss/aux_loss": 0.048097760416567326, "loss/crossentropy": 2.7043901085853577, "loss/logits": 0.8894819289445877, "step": 17890 }, { "epoch": 0.179, "grad_norm": 12.25, "grad_norm_var": 14.694791666666667, "learning_rate": 0.0003, "loss": 12.1708, "loss/aux_loss": 0.04809524416923523, "loss/crossentropy": 2.865119767189026, "loss/logits": 0.9330274909734726, "step": 17900 }, { "epoch": 0.1791, "grad_norm": 12.0625, "grad_norm_var": 0.23487955729166668, "learning_rate": 0.0003, "loss": 11.9219, "loss/aux_loss": 0.04810286946594715, "loss/crossentropy": 2.8617121458053587, "loss/logits": 0.9211963266134262, "step": 17910 }, { "epoch": 0.1792, "grad_norm": 13.125, "grad_norm_var": 0.25826822916666664, "learning_rate": 0.0003, "loss": 12.1026, "loss/aux_loss": 0.048097353614866736, "loss/crossentropy": 2.9374179244041443, "loss/logits": 0.946164458990097, "step": 17920 }, { "epoch": 0.1793, "grad_norm": 11.6875, "grad_norm_var": 0.9634765625, "learning_rate": 0.0003, "loss": 11.9333, "loss/aux_loss": 0.04809586051851511, "loss/crossentropy": 2.804642015695572, "loss/logits": 0.884665310382843, "step": 17930 }, { "epoch": 0.1794, "grad_norm": 11.9375, "grad_norm_var": 0.7452473958333333, "learning_rate": 0.0003, "loss": 12.1827, "loss/aux_loss": 0.04808973409235477, "loss/crossentropy": 2.782781344652176, "loss/logits": 0.9129390954971314, "step": 17940 }, { "epoch": 0.1795, "grad_norm": 12.0625, "grad_norm_var": 0.424853515625, "learning_rate": 0.0003, "loss": 11.9763, "loss/aux_loss": 0.04808917623013258, "loss/crossentropy": 2.7897274017333986, "loss/logits": 0.8996834605932236, "step": 17950 }, { "epoch": 0.1796, "grad_norm": 11.625, "grad_norm_var": 0.5186848958333333, "learning_rate": 0.0003, "loss": 12.0981, "loss/aux_loss": 0.048086441680788995, "loss/crossentropy": 2.7038078784942625, "loss/logits": 0.8972001552581788, "step": 17960 }, { "epoch": 0.1797, "grad_norm": 11.0625, "grad_norm_var": 0.19088541666666667, "learning_rate": 0.0003, "loss": 12.2017, "loss/aux_loss": 0.04809106402099132, "loss/crossentropy": 2.738478219509125, "loss/logits": 0.9567953556776047, "step": 17970 }, { "epoch": 0.1798, "grad_norm": 11.25, "grad_norm_var": 0.3150390625, "learning_rate": 0.0003, "loss": 12.2132, "loss/aux_loss": 0.04809019956737757, "loss/crossentropy": 2.894696664810181, "loss/logits": 0.9616926342248917, "step": 17980 }, { "epoch": 0.1799, "grad_norm": 11.9375, "grad_norm_var": 0.292041015625, "learning_rate": 0.0003, "loss": 12.0304, "loss/aux_loss": 0.048091168701648715, "loss/crossentropy": 2.8822677552700045, "loss/logits": 0.9124285817146301, "step": 17990 }, { "epoch": 0.18, "grad_norm": 11.375, "grad_norm_var": 0.322119140625, "learning_rate": 0.0003, "loss": 11.8724, "loss/aux_loss": 0.04809627775102854, "loss/crossentropy": 2.790885365009308, "loss/logits": 0.9168848097324371, "step": 18000 }, { "epoch": 0.1801, "grad_norm": 11.0, "grad_norm_var": 0.28318684895833335, "learning_rate": 0.0003, "loss": 12.1473, "loss/aux_loss": 0.048096579127013685, "loss/crossentropy": 2.8502917110919954, "loss/logits": 0.9435136646032334, "step": 18010 }, { "epoch": 0.1802, "grad_norm": 11.0, "grad_norm_var": 0.3889973958333333, "learning_rate": 0.0003, "loss": 11.9018, "loss/aux_loss": 0.04810248874127865, "loss/crossentropy": 2.7946541905403137, "loss/logits": 0.9097151190042496, "step": 18020 }, { "epoch": 0.1803, "grad_norm": 11.6875, "grad_norm_var": 0.19581705729166668, "learning_rate": 0.0003, "loss": 11.9168, "loss/aux_loss": 0.048086031526327136, "loss/crossentropy": 2.8708603501319887, "loss/logits": 0.9378985464572906, "step": 18030 }, { "epoch": 0.1804, "grad_norm": 11.25, "grad_norm_var": 0.4671223958333333, "learning_rate": 0.0003, "loss": 12.0196, "loss/aux_loss": 0.04809171762317419, "loss/crossentropy": 2.757344883680344, "loss/logits": 0.9077586501836776, "step": 18040 }, { "epoch": 0.1805, "grad_norm": 11.6875, "grad_norm_var": 0.5227701822916667, "learning_rate": 0.0003, "loss": 12.1929, "loss/aux_loss": 0.04808536898344755, "loss/crossentropy": 2.9499477982521056, "loss/logits": 0.938400462269783, "step": 18050 }, { "epoch": 0.1806, "grad_norm": 11.125, "grad_norm_var": 0.259619140625, "learning_rate": 0.0003, "loss": 11.8522, "loss/aux_loss": 0.04810300972312689, "loss/crossentropy": 2.7223174929618836, "loss/logits": 0.9064554870128632, "step": 18060 }, { "epoch": 0.1807, "grad_norm": 11.1875, "grad_norm_var": 0.19680989583333333, "learning_rate": 0.0003, "loss": 12.0087, "loss/aux_loss": 0.048086580634117124, "loss/crossentropy": 2.81934916973114, "loss/logits": 0.932407483458519, "step": 18070 }, { "epoch": 0.1808, "grad_norm": 12.5625, "grad_norm_var": 0.4161295572916667, "learning_rate": 0.0003, "loss": 11.9218, "loss/aux_loss": 0.048097232170403, "loss/crossentropy": 2.783638632297516, "loss/logits": 0.9202796012163162, "step": 18080 }, { "epoch": 0.1809, "grad_norm": 12.0, "grad_norm_var": 4.4009765625, "learning_rate": 0.0003, "loss": 12.0502, "loss/aux_loss": 0.04810140375047922, "loss/crossentropy": 2.8965428352355955, "loss/logits": 0.9300930172204971, "step": 18090 }, { "epoch": 0.181, "grad_norm": 11.9375, "grad_norm_var": 4.006770833333333, "learning_rate": 0.0003, "loss": 12.2106, "loss/aux_loss": 0.04809464998543263, "loss/crossentropy": 2.82760112285614, "loss/logits": 0.9162612468004226, "step": 18100 }, { "epoch": 0.1811, "grad_norm": 11.1875, "grad_norm_var": 0.7079264322916666, "learning_rate": 0.0003, "loss": 11.9327, "loss/aux_loss": 0.04809616301208734, "loss/crossentropy": 2.9348750352859496, "loss/logits": 0.908600127696991, "step": 18110 }, { "epoch": 0.1812, "grad_norm": 11.3125, "grad_norm_var": 0.5577473958333333, "learning_rate": 0.0003, "loss": 12.0074, "loss/aux_loss": 0.0480873541906476, "loss/crossentropy": 2.8367616474628448, "loss/logits": 0.9282374233007431, "step": 18120 }, { "epoch": 0.1813, "grad_norm": 12.125, "grad_norm_var": 0.3374837239583333, "learning_rate": 0.0003, "loss": 12.0499, "loss/aux_loss": 0.04810123294591904, "loss/crossentropy": 2.811716413497925, "loss/logits": 0.8937458395957947, "step": 18130 }, { "epoch": 0.1814, "grad_norm": 11.9375, "grad_norm_var": 0.340478515625, "learning_rate": 0.0003, "loss": 12.0357, "loss/aux_loss": 0.04809012711048126, "loss/crossentropy": 2.7347005784511564, "loss/logits": 0.8987887173891067, "step": 18140 }, { "epoch": 0.1815, "grad_norm": 10.9375, "grad_norm_var": 0.4512858072916667, "learning_rate": 0.0003, "loss": 12.0446, "loss/aux_loss": 0.048095178604125974, "loss/crossentropy": 2.792975926399231, "loss/logits": 0.936535793542862, "step": 18150 }, { "epoch": 0.1816, "grad_norm": 11.9375, "grad_norm_var": 0.1978515625, "learning_rate": 0.0003, "loss": 12.1014, "loss/aux_loss": 0.048092770390212536, "loss/crossentropy": 2.6440272629261017, "loss/logits": 0.9145908206701279, "step": 18160 }, { "epoch": 0.1817, "grad_norm": 11.0, "grad_norm_var": 0.27858072916666665, "learning_rate": 0.0003, "loss": 11.801, "loss/aux_loss": 0.04809875432401896, "loss/crossentropy": 2.7408132016658784, "loss/logits": 0.8750650644302368, "step": 18170 }, { "epoch": 0.1818, "grad_norm": 11.4375, "grad_norm_var": 1.0853515625, "learning_rate": 0.0003, "loss": 12.1041, "loss/aux_loss": 0.04810262303799391, "loss/crossentropy": 2.7091507375240327, "loss/logits": 0.9323061019182205, "step": 18180 }, { "epoch": 0.1819, "grad_norm": 11.8125, "grad_norm_var": 1.352197265625, "learning_rate": 0.0003, "loss": 12.1093, "loss/aux_loss": 0.04809154383838177, "loss/crossentropy": 2.9365743041038512, "loss/logits": 0.9312824219465256, "step": 18190 }, { "epoch": 0.182, "grad_norm": 11.3125, "grad_norm_var": 0.7645670572916666, "learning_rate": 0.0003, "loss": 11.9302, "loss/aux_loss": 0.04810417983680963, "loss/crossentropy": 2.696337890625, "loss/logits": 0.9121669709682465, "step": 18200 }, { "epoch": 0.1821, "grad_norm": 11.5, "grad_norm_var": 0.37180989583333335, "learning_rate": 0.0003, "loss": 12.1331, "loss/aux_loss": 0.04809307269752026, "loss/crossentropy": 2.718644219636917, "loss/logits": 0.9004943788051605, "step": 18210 }, { "epoch": 0.1822, "grad_norm": 10.8125, "grad_norm_var": 0.5102701822916667, "learning_rate": 0.0003, "loss": 11.953, "loss/aux_loss": 0.04809835311025381, "loss/crossentropy": 2.907946026325226, "loss/logits": 0.9023657441139221, "step": 18220 }, { "epoch": 0.1823, "grad_norm": 10.9375, "grad_norm_var": 0.429541015625, "learning_rate": 0.0003, "loss": 11.7626, "loss/aux_loss": 0.04809815175831318, "loss/crossentropy": 2.7833638072013853, "loss/logits": 0.9019864350557327, "step": 18230 }, { "epoch": 0.1824, "grad_norm": 12.0, "grad_norm_var": 0.9494791666666667, "learning_rate": 0.0003, "loss": 12.1544, "loss/aux_loss": 0.04810196273028851, "loss/crossentropy": 2.856829822063446, "loss/logits": 0.8836144953966141, "step": 18240 }, { "epoch": 0.1825, "grad_norm": 12.0, "grad_norm_var": 0.853125, "learning_rate": 0.0003, "loss": 11.7924, "loss/aux_loss": 0.04810160342603922, "loss/crossentropy": 2.8368868112564085, "loss/logits": 0.9327179700136184, "step": 18250 }, { "epoch": 0.1826, "grad_norm": 11.5625, "grad_norm_var": 0.25323893229166666, "learning_rate": 0.0003, "loss": 12.1253, "loss/aux_loss": 0.04810277093201876, "loss/crossentropy": 2.7802767038345335, "loss/logits": 0.8940910458564758, "step": 18260 }, { "epoch": 0.1827, "grad_norm": 12.0, "grad_norm_var": 0.2759765625, "learning_rate": 0.0003, "loss": 11.9725, "loss/aux_loss": 0.04809536635875702, "loss/crossentropy": 2.845566821098328, "loss/logits": 0.9107513338327408, "step": 18270 }, { "epoch": 0.1828, "grad_norm": 11.375, "grad_norm_var": 0.4071451822916667, "learning_rate": 0.0003, "loss": 11.985, "loss/aux_loss": 0.04809177704155445, "loss/crossentropy": 2.708817595243454, "loss/logits": 0.8630902379751205, "step": 18280 }, { "epoch": 0.1829, "grad_norm": 11.5, "grad_norm_var": 0.27708333333333335, "learning_rate": 0.0003, "loss": 11.8368, "loss/aux_loss": 0.04809412229806185, "loss/crossentropy": 2.6290226101875307, "loss/logits": 0.927997687458992, "step": 18290 }, { "epoch": 0.183, "grad_norm": 11.0, "grad_norm_var": 0.5557291666666667, "learning_rate": 0.0003, "loss": 12.079, "loss/aux_loss": 0.04809760414063931, "loss/crossentropy": 2.820905792713165, "loss/logits": 0.9419155091047287, "step": 18300 }, { "epoch": 0.1831, "grad_norm": 21.875, "grad_norm_var": 389.9878743489583, "learning_rate": 0.0003, "loss": 12.2556, "loss/aux_loss": 0.04809573795646429, "loss/crossentropy": 2.8808292627334593, "loss/logits": 0.9127176314592361, "step": 18310 }, { "epoch": 0.1832, "grad_norm": 11.5625, "grad_norm_var": 6.450374348958333, "learning_rate": 0.0003, "loss": 11.9252, "loss/aux_loss": 0.04810270164161921, "loss/crossentropy": 2.638647198677063, "loss/logits": 0.8678022742271423, "step": 18320 }, { "epoch": 0.1833, "grad_norm": 12.5, "grad_norm_var": 0.24576822916666666, "learning_rate": 0.0003, "loss": 11.9287, "loss/aux_loss": 0.048086739145219326, "loss/crossentropy": 2.7931796431541445, "loss/logits": 0.9423355519771576, "step": 18330 }, { "epoch": 0.1834, "grad_norm": 11.5625, "grad_norm_var": 0.28878580729166664, "learning_rate": 0.0003, "loss": 11.8476, "loss/aux_loss": 0.04810136705636978, "loss/crossentropy": 2.7460675835609436, "loss/logits": 0.9129256516695022, "step": 18340 }, { "epoch": 0.1835, "grad_norm": 11.625, "grad_norm_var": 0.39264322916666666, "learning_rate": 0.0003, "loss": 12.1094, "loss/aux_loss": 0.04808819629251957, "loss/crossentropy": 2.9586320996284483, "loss/logits": 0.926827785372734, "step": 18350 }, { "epoch": 0.1836, "grad_norm": 12.25, "grad_norm_var": 0.271728515625, "learning_rate": 0.0003, "loss": 12.0495, "loss/aux_loss": 0.04808443989604712, "loss/crossentropy": 2.8124279379844666, "loss/logits": 0.929901072382927, "step": 18360 }, { "epoch": 0.1837, "grad_norm": 11.5, "grad_norm_var": 0.550244140625, "learning_rate": 0.0003, "loss": 12.1857, "loss/aux_loss": 0.04809539690613747, "loss/crossentropy": 2.9786995530128477, "loss/logits": 0.9320331394672394, "step": 18370 }, { "epoch": 0.1838, "grad_norm": 13.0625, "grad_norm_var": 0.5140625, "learning_rate": 0.0003, "loss": 11.9399, "loss/aux_loss": 0.048090987093746665, "loss/crossentropy": 2.921971356868744, "loss/logits": 0.9322270661592483, "step": 18380 }, { "epoch": 0.1839, "grad_norm": 11.9375, "grad_norm_var": 0.30416666666666664, "learning_rate": 0.0003, "loss": 12.0165, "loss/aux_loss": 0.04809644818305969, "loss/crossentropy": 2.7656728088855744, "loss/logits": 0.8962883800268173, "step": 18390 }, { "epoch": 0.184, "grad_norm": 11.25, "grad_norm_var": 0.3473307291666667, "learning_rate": 0.0003, "loss": 11.9019, "loss/aux_loss": 0.048087817057967185, "loss/crossentropy": 2.9055333137512207, "loss/logits": 0.903611746430397, "step": 18400 }, { "epoch": 0.1841, "grad_norm": 11.125, "grad_norm_var": 0.43697916666666664, "learning_rate": 0.0003, "loss": 11.9844, "loss/aux_loss": 0.04810544457286596, "loss/crossentropy": 2.788058453798294, "loss/logits": 0.9051843047142029, "step": 18410 }, { "epoch": 0.1842, "grad_norm": 11.4375, "grad_norm_var": 0.15729166666666666, "learning_rate": 0.0003, "loss": 11.8739, "loss/aux_loss": 0.0480987248942256, "loss/crossentropy": 2.725960999727249, "loss/logits": 0.9009216666221619, "step": 18420 }, { "epoch": 0.1843, "grad_norm": 11.5, "grad_norm_var": 0.0900390625, "learning_rate": 0.0003, "loss": 12.0252, "loss/aux_loss": 0.04809308275580406, "loss/crossentropy": 2.8324514091014863, "loss/logits": 0.9101878136396409, "step": 18430 }, { "epoch": 0.1844, "grad_norm": 12.1875, "grad_norm_var": 0.235791015625, "learning_rate": 0.0003, "loss": 12.037, "loss/aux_loss": 0.04810200035572052, "loss/crossentropy": 2.7521001577377318, "loss/logits": 0.8864558875560761, "step": 18440 }, { "epoch": 0.1845, "grad_norm": 11.75, "grad_norm_var": 0.21951497395833333, "learning_rate": 0.0003, "loss": 11.9625, "loss/aux_loss": 0.04809632524847984, "loss/crossentropy": 2.670381647348404, "loss/logits": 0.9308747231960297, "step": 18450 }, { "epoch": 0.1846, "grad_norm": 11.75, "grad_norm_var": 7.765999348958333, "learning_rate": 0.0003, "loss": 11.9008, "loss/aux_loss": 0.04809861965477467, "loss/crossentropy": 2.729556679725647, "loss/logits": 0.9098370641469955, "step": 18460 }, { "epoch": 0.1847, "grad_norm": 11.1875, "grad_norm_var": 0.390087890625, "learning_rate": 0.0003, "loss": 11.92, "loss/aux_loss": 0.04810507521033287, "loss/crossentropy": 2.72969531416893, "loss/logits": 0.8956705331802368, "step": 18470 }, { "epoch": 0.1848, "grad_norm": 11.375, "grad_norm_var": 0.366650390625, "learning_rate": 0.0003, "loss": 12.1006, "loss/aux_loss": 0.048082930594682695, "loss/crossentropy": 2.878212571144104, "loss/logits": 0.9402207374572754, "step": 18480 }, { "epoch": 0.1849, "grad_norm": 12.375, "grad_norm_var": 0.27316080729166664, "learning_rate": 0.0003, "loss": 11.9182, "loss/aux_loss": 0.048104613274335864, "loss/crossentropy": 2.8093533754348754, "loss/logits": 0.9267432987689972, "step": 18490 }, { "epoch": 0.185, "grad_norm": 11.3125, "grad_norm_var": 0.45089518229166664, "learning_rate": 0.0003, "loss": 12.0427, "loss/aux_loss": 0.04809886794537306, "loss/crossentropy": 2.8468139350414274, "loss/logits": 0.9497668504714966, "step": 18500 }, { "epoch": 0.1851, "grad_norm": 11.875, "grad_norm_var": 0.48409830729166664, "learning_rate": 0.0003, "loss": 12.2465, "loss/aux_loss": 0.04809215907007456, "loss/crossentropy": 2.7852579593658446, "loss/logits": 0.9196423381567002, "step": 18510 }, { "epoch": 0.1852, "grad_norm": 11.875, "grad_norm_var": 0.42701822916666665, "learning_rate": 0.0003, "loss": 11.7951, "loss/aux_loss": 0.04809624664485455, "loss/crossentropy": 2.7887323558330537, "loss/logits": 0.9103799790143967, "step": 18520 }, { "epoch": 0.1853, "grad_norm": 11.6875, "grad_norm_var": 0.4661458333333333, "learning_rate": 0.0003, "loss": 12.0625, "loss/aux_loss": 0.04808788318186998, "loss/crossentropy": 2.808869343996048, "loss/logits": 0.9083440005779266, "step": 18530 }, { "epoch": 0.1854, "grad_norm": 10.8125, "grad_norm_var": 0.3947265625, "learning_rate": 0.0003, "loss": 11.8269, "loss/aux_loss": 0.04810307510197163, "loss/crossentropy": 2.657493585348129, "loss/logits": 0.9473574429750442, "step": 18540 }, { "epoch": 0.1855, "grad_norm": 11.75, "grad_norm_var": 0.34036458333333336, "learning_rate": 0.0003, "loss": 11.9915, "loss/aux_loss": 0.048092805035412314, "loss/crossentropy": 2.809315764904022, "loss/logits": 0.9260794132947922, "step": 18550 }, { "epoch": 0.1856, "grad_norm": 12.0625, "grad_norm_var": 0.19920247395833332, "learning_rate": 0.0003, "loss": 11.99, "loss/aux_loss": 0.04810086619108915, "loss/crossentropy": 2.834631139039993, "loss/logits": 0.8880037814378738, "step": 18560 }, { "epoch": 0.1857, "grad_norm": 11.5625, "grad_norm_var": 0.27447916666666666, "learning_rate": 0.0003, "loss": 12.0215, "loss/aux_loss": 0.04809237774461508, "loss/crossentropy": 2.82566694021225, "loss/logits": 0.9089670658111573, "step": 18570 }, { "epoch": 0.1858, "grad_norm": 11.25, "grad_norm_var": 0.4634765625, "learning_rate": 0.0003, "loss": 11.9482, "loss/aux_loss": 0.048088868334889415, "loss/crossentropy": 2.8229294657707213, "loss/logits": 0.9378566771745682, "step": 18580 }, { "epoch": 0.1859, "grad_norm": 12.5, "grad_norm_var": 0.2718587239583333, "learning_rate": 0.0003, "loss": 11.8611, "loss/aux_loss": 0.04808779731392861, "loss/crossentropy": 2.7925811648368835, "loss/logits": 0.9152165412902832, "step": 18590 }, { "epoch": 0.186, "grad_norm": 12.875, "grad_norm_var": 0.2869140625, "learning_rate": 0.0003, "loss": 11.9566, "loss/aux_loss": 0.04808272887021303, "loss/crossentropy": 2.84689114689827, "loss/logits": 0.9415480852127075, "step": 18600 }, { "epoch": 0.1861, "grad_norm": 12.625, "grad_norm_var": 0.47389322916666665, "learning_rate": 0.0003, "loss": 12.017, "loss/aux_loss": 0.0480954147875309, "loss/crossentropy": 2.9054375171661375, "loss/logits": 0.9320352107286454, "step": 18610 }, { "epoch": 0.1862, "grad_norm": 12.8125, "grad_norm_var": 0.4014973958333333, "learning_rate": 0.0003, "loss": 11.9752, "loss/aux_loss": 0.04809523019939661, "loss/crossentropy": 2.8620842158794404, "loss/logits": 0.9115070551633835, "step": 18620 }, { "epoch": 0.1863, "grad_norm": 11.4375, "grad_norm_var": 0.3343587239583333, "learning_rate": 0.0003, "loss": 11.9462, "loss/aux_loss": 0.04809815548360348, "loss/crossentropy": 2.799818730354309, "loss/logits": 0.9180498957633972, "step": 18630 }, { "epoch": 0.1864, "grad_norm": 10.75, "grad_norm_var": 0.3402180989583333, "learning_rate": 0.0003, "loss": 12.1313, "loss/aux_loss": 0.048090537264943126, "loss/crossentropy": 2.9112443208694456, "loss/logits": 0.9628580302000046, "step": 18640 }, { "epoch": 0.1865, "grad_norm": 12.0625, "grad_norm_var": 0.3275390625, "learning_rate": 0.0003, "loss": 11.8097, "loss/aux_loss": 0.04809784088283777, "loss/crossentropy": 2.7898995995521547, "loss/logits": 0.8758876919746399, "step": 18650 }, { "epoch": 0.1866, "grad_norm": 11.25, "grad_norm_var": 0.355712890625, "learning_rate": 0.0003, "loss": 11.9504, "loss/aux_loss": 0.04809091240167618, "loss/crossentropy": 2.883331334590912, "loss/logits": 0.9137732326984406, "step": 18660 }, { "epoch": 0.1867, "grad_norm": 10.875, "grad_norm_var": 0.2604166666666667, "learning_rate": 0.0003, "loss": 11.9183, "loss/aux_loss": 0.04810288343578577, "loss/crossentropy": 2.827151381969452, "loss/logits": 0.9118337035179138, "step": 18670 }, { "epoch": 0.1868, "grad_norm": 12.5, "grad_norm_var": 0.5574055989583333, "learning_rate": 0.0003, "loss": 11.7425, "loss/aux_loss": 0.048099182173609735, "loss/crossentropy": 2.751882565021515, "loss/logits": 0.915845838189125, "step": 18680 }, { "epoch": 0.1869, "grad_norm": 18.25, "grad_norm_var": 2.970572916666667, "learning_rate": 0.0003, "loss": 11.9627, "loss/aux_loss": 0.04809111282229424, "loss/crossentropy": 2.8157127261161805, "loss/logits": 0.8805839955806732, "step": 18690 }, { "epoch": 0.187, "grad_norm": 11.75, "grad_norm_var": 6.431103515625, "learning_rate": 0.0003, "loss": 12.0129, "loss/aux_loss": 0.04809790011495352, "loss/crossentropy": 2.9344887137413025, "loss/logits": 0.9546463966369629, "step": 18700 }, { "epoch": 0.1871, "grad_norm": 13.0, "grad_norm_var": 4.466927083333333, "learning_rate": 0.0003, "loss": 11.8991, "loss/aux_loss": 0.048103202134370804, "loss/crossentropy": 2.6000198304653166, "loss/logits": 0.8983616352081298, "step": 18710 }, { "epoch": 0.1872, "grad_norm": 10.75, "grad_norm_var": 1.0605305989583333, "learning_rate": 0.0003, "loss": 11.8265, "loss/aux_loss": 0.048097947239875795, "loss/crossentropy": 2.74048707485199, "loss/logits": 0.9315300911664963, "step": 18720 }, { "epoch": 0.1873, "grad_norm": 11.0625, "grad_norm_var": 0.9038899739583334, "learning_rate": 0.0003, "loss": 11.8877, "loss/aux_loss": 0.04809299129992724, "loss/crossentropy": 2.6987858176231385, "loss/logits": 0.8788728475570678, "step": 18730 }, { "epoch": 0.1874, "grad_norm": 10.875, "grad_norm_var": 1.0518229166666666, "learning_rate": 0.0003, "loss": 11.9914, "loss/aux_loss": 0.04809512048959732, "loss/crossentropy": 2.7466448664665224, "loss/logits": 0.9056605339050293, "step": 18740 }, { "epoch": 0.1875, "grad_norm": 11.375, "grad_norm_var": 0.5421223958333333, "learning_rate": 0.0003, "loss": 12.0469, "loss/aux_loss": 0.04808540716767311, "loss/crossentropy": 2.8385793924331666, "loss/logits": 0.9397071808576584, "step": 18750 }, { "epoch": 0.1876, "grad_norm": 12.125, "grad_norm_var": 0.2952962239583333, "learning_rate": 0.0003, "loss": 11.9745, "loss/aux_loss": 0.04808508455753326, "loss/crossentropy": 2.7859063267707826, "loss/logits": 0.9249111205339432, "step": 18760 }, { "epoch": 0.1877, "grad_norm": 11.8125, "grad_norm_var": 0.253369140625, "learning_rate": 0.0003, "loss": 11.9344, "loss/aux_loss": 0.048101603612303735, "loss/crossentropy": 2.96168977022171, "loss/logits": 0.9427939087152482, "step": 18770 }, { "epoch": 0.1878, "grad_norm": 12.125, "grad_norm_var": 0.23474934895833333, "learning_rate": 0.0003, "loss": 11.8409, "loss/aux_loss": 0.04809587094932795, "loss/crossentropy": 2.6832815647125243, "loss/logits": 0.8893307328224183, "step": 18780 }, { "epoch": 0.1879, "grad_norm": 12.9375, "grad_norm_var": 0.46925455729166665, "learning_rate": 0.0003, "loss": 11.8433, "loss/aux_loss": 0.048098374903202054, "loss/crossentropy": 2.973353409767151, "loss/logits": 0.9185240358114243, "step": 18790 }, { "epoch": 0.188, "grad_norm": 11.3125, "grad_norm_var": 0.385009765625, "learning_rate": 0.0003, "loss": 11.8279, "loss/aux_loss": 0.04809072986245155, "loss/crossentropy": 2.883065390586853, "loss/logits": 0.8887928575277328, "step": 18800 }, { "epoch": 0.1881, "grad_norm": 11.4375, "grad_norm_var": 0.1421875, "learning_rate": 0.0003, "loss": 12.0192, "loss/aux_loss": 0.04809522368013859, "loss/crossentropy": 2.8153730273246764, "loss/logits": 0.8990904957056045, "step": 18810 }, { "epoch": 0.1882, "grad_norm": 12.125, "grad_norm_var": 0.44524739583333334, "learning_rate": 0.0003, "loss": 12.0603, "loss/aux_loss": 0.04809008222073317, "loss/crossentropy": 2.917261230945587, "loss/logits": 0.9583010584115982, "step": 18820 }, { "epoch": 0.1883, "grad_norm": 12.125, "grad_norm_var": 0.4032389322916667, "learning_rate": 0.0003, "loss": 12.0391, "loss/aux_loss": 0.04809148814529181, "loss/crossentropy": 2.7239452958106996, "loss/logits": 0.9369824826717377, "step": 18830 }, { "epoch": 0.1884, "grad_norm": 12.375, "grad_norm_var": 0.9059895833333333, "learning_rate": 0.0003, "loss": 11.9066, "loss/aux_loss": 0.04810024816542864, "loss/crossentropy": 2.6165760159492493, "loss/logits": 0.8547280013561249, "step": 18840 }, { "epoch": 0.1885, "grad_norm": 17.25, "grad_norm_var": 2.301497395833333, "learning_rate": 0.0003, "loss": 11.9055, "loss/aux_loss": 0.048094492405653, "loss/crossentropy": 3.0237093448638914, "loss/logits": 0.9564484775066375, "step": 18850 }, { "epoch": 0.1886, "grad_norm": 11.4375, "grad_norm_var": 2.128059895833333, "learning_rate": 0.0003, "loss": 12.0438, "loss/aux_loss": 0.04808449726551771, "loss/crossentropy": 2.8197420120239256, "loss/logits": 0.9342156380414963, "step": 18860 }, { "epoch": 0.1887, "grad_norm": 11.1875, "grad_norm_var": 0.26691080729166666, "learning_rate": 0.0003, "loss": 11.826, "loss/aux_loss": 0.04809930399060249, "loss/crossentropy": 2.7798034250736237, "loss/logits": 0.8755748480558395, "step": 18870 }, { "epoch": 0.1888, "grad_norm": 12.0625, "grad_norm_var": 0.35545247395833335, "learning_rate": 0.0003, "loss": 11.6886, "loss/aux_loss": 0.04809543527662754, "loss/crossentropy": 2.8541658937931063, "loss/logits": 0.892428070306778, "step": 18880 }, { "epoch": 0.1889, "grad_norm": 11.9375, "grad_norm_var": 0.32224934895833335, "learning_rate": 0.0003, "loss": 11.7692, "loss/aux_loss": 0.04810644872486591, "loss/crossentropy": 2.7232487499713898, "loss/logits": 0.9081037282943726, "step": 18890 }, { "epoch": 0.189, "grad_norm": 11.375, "grad_norm_var": 0.2938639322916667, "learning_rate": 0.0003, "loss": 11.9456, "loss/aux_loss": 0.04808319099247456, "loss/crossentropy": 2.5981625437736513, "loss/logits": 0.8822880685329437, "step": 18900 }, { "epoch": 0.1891, "grad_norm": 11.8125, "grad_norm_var": 0.36456705729166666, "learning_rate": 0.0003, "loss": 11.9909, "loss/aux_loss": 0.048096727766096595, "loss/crossentropy": 2.718340504169464, "loss/logits": 0.8984918922185898, "step": 18910 }, { "epoch": 0.1892, "grad_norm": 12.5, "grad_norm_var": 0.4791015625, "learning_rate": 0.0003, "loss": 12.0497, "loss/aux_loss": 0.048100071772933004, "loss/crossentropy": 2.914843189716339, "loss/logits": 0.9236764490604401, "step": 18920 }, { "epoch": 0.1893, "grad_norm": 11.1875, "grad_norm_var": 0.5247395833333334, "learning_rate": 0.0003, "loss": 11.8306, "loss/aux_loss": 0.0480832876637578, "loss/crossentropy": 2.8327117800712585, "loss/logits": 0.9310741007328034, "step": 18930 }, { "epoch": 0.1894, "grad_norm": 10.875, "grad_norm_var": 0.3282389322916667, "learning_rate": 0.0003, "loss": 11.8084, "loss/aux_loss": 0.048092326149344444, "loss/crossentropy": 2.8622347712516785, "loss/logits": 0.8887595921754837, "step": 18940 }, { "epoch": 0.1895, "grad_norm": 12.3125, "grad_norm_var": 0.4891764322916667, "learning_rate": 0.0003, "loss": 11.9539, "loss/aux_loss": 0.048091747984290126, "loss/crossentropy": 2.8625791549682615, "loss/logits": 0.9395757526159286, "step": 18950 }, { "epoch": 0.1896, "grad_norm": 12.0625, "grad_norm_var": 0.24073893229166668, "learning_rate": 0.0003, "loss": 11.8872, "loss/aux_loss": 0.04809561818838119, "loss/crossentropy": 2.93541020154953, "loss/logits": 0.9296731561422348, "step": 18960 }, { "epoch": 0.1897, "grad_norm": 11.0625, "grad_norm_var": 0.5176432291666667, "learning_rate": 0.0003, "loss": 11.926, "loss/aux_loss": 0.04809174351394176, "loss/crossentropy": 2.85881884098053, "loss/logits": 0.9174812495708465, "step": 18970 }, { "epoch": 0.1898, "grad_norm": 12.8125, "grad_norm_var": 0.49375, "learning_rate": 0.0003, "loss": 11.9255, "loss/aux_loss": 0.04810503609478474, "loss/crossentropy": 2.7044818341732024, "loss/logits": 0.9038681089878082, "step": 18980 }, { "epoch": 0.1899, "grad_norm": 12.3125, "grad_norm_var": 0.323291015625, "learning_rate": 0.0003, "loss": 11.934, "loss/aux_loss": 0.04808723460882902, "loss/crossentropy": 2.8215215682983397, "loss/logits": 0.8740798741579056, "step": 18990 }, { "epoch": 0.19, "grad_norm": 12.5625, "grad_norm_var": 0.5848795572916666, "learning_rate": 0.0003, "loss": 11.9942, "loss/aux_loss": 0.04809704348444939, "loss/crossentropy": 2.9540371537208556, "loss/logits": 0.9380867898464202, "step": 19000 }, { "epoch": 0.1901, "grad_norm": 11.875, "grad_norm_var": 35.141927083333336, "learning_rate": 0.0003, "loss": 12.117, "loss/aux_loss": 0.04809831455349922, "loss/crossentropy": 2.848419559001923, "loss/logits": 0.9661454766988754, "step": 19010 }, { "epoch": 0.1902, "grad_norm": 11.75, "grad_norm_var": 1.568212890625, "learning_rate": 0.0003, "loss": 12.1233, "loss/aux_loss": 0.048095112666487694, "loss/crossentropy": 2.914617598056793, "loss/logits": 0.940568807721138, "step": 19020 }, { "epoch": 0.1903, "grad_norm": 12.75, "grad_norm_var": 1.885791015625, "learning_rate": 0.0003, "loss": 12.0781, "loss/aux_loss": 0.04809290152043104, "loss/crossentropy": 2.9060685276985168, "loss/logits": 0.9084505170583725, "step": 19030 }, { "epoch": 0.1904, "grad_norm": 12.0625, "grad_norm_var": 0.547509765625, "learning_rate": 0.0003, "loss": 11.8626, "loss/aux_loss": 0.04809753466397524, "loss/crossentropy": 2.8450966238975526, "loss/logits": 0.9347045987844467, "step": 19040 }, { "epoch": 0.1905, "grad_norm": 11.75, "grad_norm_var": 0.421728515625, "learning_rate": 0.0003, "loss": 11.786, "loss/aux_loss": 0.04809632711112499, "loss/crossentropy": 2.6032280802726744, "loss/logits": 0.8754363477230072, "step": 19050 }, { "epoch": 0.1906, "grad_norm": 12.4375, "grad_norm_var": 0.266650390625, "learning_rate": 0.0003, "loss": 11.9427, "loss/aux_loss": 0.048089970275759696, "loss/crossentropy": 2.8427577376365663, "loss/logits": 0.9292992860078811, "step": 19060 }, { "epoch": 0.1907, "grad_norm": 11.9375, "grad_norm_var": 0.29099934895833335, "learning_rate": 0.0003, "loss": 12.0185, "loss/aux_loss": 0.04809428974986076, "loss/crossentropy": 2.9434940934181215, "loss/logits": 0.8928971856832504, "step": 19070 }, { "epoch": 0.1908, "grad_norm": 11.6875, "grad_norm_var": 0.7889973958333333, "learning_rate": 0.0003, "loss": 11.742, "loss/aux_loss": 0.048089478723704816, "loss/crossentropy": 2.7458222687244414, "loss/logits": 0.9186016976833343, "step": 19080 }, { "epoch": 0.1909, "grad_norm": 11.0625, "grad_norm_var": 1.5890462239583334, "learning_rate": 0.0003, "loss": 12.0539, "loss/aux_loss": 0.048099744878709313, "loss/crossentropy": 2.829107737541199, "loss/logits": 0.9015246391296386, "step": 19090 }, { "epoch": 0.191, "grad_norm": 11.375, "grad_norm_var": 1.597900390625, "learning_rate": 0.0003, "loss": 11.939, "loss/aux_loss": 0.04808116909116507, "loss/crossentropy": 2.874987268447876, "loss/logits": 0.9092469424009323, "step": 19100 }, { "epoch": 0.1911, "grad_norm": 11.625, "grad_norm_var": 0.36912434895833335, "learning_rate": 0.0003, "loss": 11.9227, "loss/aux_loss": 0.04809539634734392, "loss/crossentropy": 2.7079729199409486, "loss/logits": 0.9122515827417373, "step": 19110 }, { "epoch": 0.1912, "grad_norm": 11.1875, "grad_norm_var": 0.3335774739583333, "learning_rate": 0.0003, "loss": 12.094, "loss/aux_loss": 0.0480892339721322, "loss/crossentropy": 2.6642766416072847, "loss/logits": 0.9355534881353378, "step": 19120 }, { "epoch": 0.1913, "grad_norm": 12.75, "grad_norm_var": 0.28631184895833334, "learning_rate": 0.0003, "loss": 11.8506, "loss/aux_loss": 0.04809222798794508, "loss/crossentropy": 2.581315791606903, "loss/logits": 0.8726950109004974, "step": 19130 }, { "epoch": 0.1914, "grad_norm": 11.9375, "grad_norm_var": 0.19993489583333332, "learning_rate": 0.0003, "loss": 11.9463, "loss/aux_loss": 0.0480927174910903, "loss/crossentropy": 2.8361901879310607, "loss/logits": 0.93881676197052, "step": 19140 }, { "epoch": 0.1915, "grad_norm": 12.5625, "grad_norm_var": 0.3233723958333333, "learning_rate": 0.0003, "loss": 11.8851, "loss/aux_loss": 0.04809240084141493, "loss/crossentropy": 2.772093939781189, "loss/logits": 0.895565664768219, "step": 19150 }, { "epoch": 0.1916, "grad_norm": 12.8125, "grad_norm_var": 0.42941080729166664, "learning_rate": 0.0003, "loss": 11.8773, "loss/aux_loss": 0.04808843210339546, "loss/crossentropy": 2.7380272090435027, "loss/logits": 0.8849580556154251, "step": 19160 }, { "epoch": 0.1917, "grad_norm": 11.375, "grad_norm_var": 0.6075520833333333, "learning_rate": 0.0003, "loss": 12.0319, "loss/aux_loss": 0.04809383936226368, "loss/crossentropy": 2.679046392440796, "loss/logits": 0.9247982114553451, "step": 19170 }, { "epoch": 0.1918, "grad_norm": 12.5625, "grad_norm_var": 0.51875, "learning_rate": 0.0003, "loss": 11.9646, "loss/aux_loss": 0.0480990482494235, "loss/crossentropy": 2.8308603882789614, "loss/logits": 0.9139487504959106, "step": 19180 }, { "epoch": 0.1919, "grad_norm": 12.1875, "grad_norm_var": 0.8937337239583333, "learning_rate": 0.0003, "loss": 11.9498, "loss/aux_loss": 0.04809563048183918, "loss/crossentropy": 2.868353658914566, "loss/logits": 0.9112594306468964, "step": 19190 }, { "epoch": 0.192, "grad_norm": 11.75, "grad_norm_var": 178.2056640625, "learning_rate": 0.0003, "loss": 11.9624, "loss/aux_loss": 0.04809640198945999, "loss/crossentropy": 2.875636076927185, "loss/logits": 0.9370063930749893, "step": 19200 }, { "epoch": 0.1921, "grad_norm": 11.625, "grad_norm_var": 0.4247233072916667, "learning_rate": 0.0003, "loss": 11.9338, "loss/aux_loss": 0.048091776110231875, "loss/crossentropy": 2.885943067073822, "loss/logits": 0.9205142021179199, "step": 19210 }, { "epoch": 0.1922, "grad_norm": 11.3125, "grad_norm_var": 0.292431640625, "learning_rate": 0.0003, "loss": 11.841, "loss/aux_loss": 0.04809496812522411, "loss/crossentropy": 2.5971029341220855, "loss/logits": 0.8990915536880493, "step": 19220 }, { "epoch": 0.1923, "grad_norm": 11.6875, "grad_norm_var": 0.3155598958333333, "learning_rate": 0.0003, "loss": 11.9581, "loss/aux_loss": 0.048088048957288264, "loss/crossentropy": 2.8906711101531983, "loss/logits": 0.9080457538366318, "step": 19230 }, { "epoch": 0.1924, "grad_norm": 12.8125, "grad_norm_var": 0.363134765625, "learning_rate": 0.0003, "loss": 11.9134, "loss/aux_loss": 0.04809061922132969, "loss/crossentropy": 2.7267106890678408, "loss/logits": 0.8822133630514145, "step": 19240 }, { "epoch": 0.1925, "grad_norm": 12.4375, "grad_norm_var": 0.18854166666666666, "learning_rate": 0.0003, "loss": 11.9539, "loss/aux_loss": 0.04809251707047224, "loss/crossentropy": 2.834822082519531, "loss/logits": 0.9051540076732636, "step": 19250 }, { "epoch": 0.1926, "grad_norm": 12.1875, "grad_norm_var": 0.3575520833333333, "learning_rate": 0.0003, "loss": 11.8532, "loss/aux_loss": 0.04809150565415621, "loss/crossentropy": 2.673792243003845, "loss/logits": 0.8930087149143219, "step": 19260 }, { "epoch": 0.1927, "grad_norm": 11.625, "grad_norm_var": 0.376025390625, "learning_rate": 0.0003, "loss": 11.8667, "loss/aux_loss": 0.04809306338429451, "loss/crossentropy": 2.8607924938201905, "loss/logits": 0.8900872558355332, "step": 19270 }, { "epoch": 0.1928, "grad_norm": 12.125, "grad_norm_var": 0.15714518229166666, "learning_rate": 0.0003, "loss": 11.9148, "loss/aux_loss": 0.04807972889393568, "loss/crossentropy": 2.803767132759094, "loss/logits": 0.9348350763320923, "step": 19280 }, { "epoch": 0.1929, "grad_norm": 11.125, "grad_norm_var": 0.41067708333333336, "learning_rate": 0.0003, "loss": 11.8139, "loss/aux_loss": 0.04809964876621962, "loss/crossentropy": 2.7644663214683534, "loss/logits": 0.9412413388490677, "step": 19290 }, { "epoch": 0.193, "grad_norm": 11.9375, "grad_norm_var": 15.3375, "learning_rate": 0.0003, "loss": 11.8668, "loss/aux_loss": 0.04809567742049694, "loss/crossentropy": 2.687431216239929, "loss/logits": 0.8960238516330719, "step": 19300 }, { "epoch": 0.1931, "grad_norm": 12.5, "grad_norm_var": 14.444645182291667, "learning_rate": 0.0003, "loss": 12.0495, "loss/aux_loss": 0.04808875881135464, "loss/crossentropy": 2.8696415305137633, "loss/logits": 0.9359003514051437, "step": 19310 }, { "epoch": 0.1932, "grad_norm": 14.25, "grad_norm_var": 0.6895182291666667, "learning_rate": 0.0003, "loss": 11.8149, "loss/aux_loss": 0.048093396797776224, "loss/crossentropy": 2.7438224017620088, "loss/logits": 0.9250198155641556, "step": 19320 }, { "epoch": 0.1933, "grad_norm": 11.5625, "grad_norm_var": 1.2317057291666667, "learning_rate": 0.0003, "loss": 11.9555, "loss/aux_loss": 0.0480903297662735, "loss/crossentropy": 2.8425944447517395, "loss/logits": 0.9191201657056809, "step": 19330 }, { "epoch": 0.1934, "grad_norm": 13.375, "grad_norm_var": 0.6511555989583333, "learning_rate": 0.0003, "loss": 11.8307, "loss/aux_loss": 0.04809546768665314, "loss/crossentropy": 2.794313246011734, "loss/logits": 0.8963600903749466, "step": 19340 }, { "epoch": 0.1935, "grad_norm": 11.6875, "grad_norm_var": 4.347135416666666, "learning_rate": 0.0003, "loss": 11.9059, "loss/aux_loss": 0.048097337037324904, "loss/crossentropy": 2.8060832381248475, "loss/logits": 0.923801937699318, "step": 19350 }, { "epoch": 0.1936, "grad_norm": 11.25, "grad_norm_var": 0.4930826822916667, "learning_rate": 0.0003, "loss": 12.0551, "loss/aux_loss": 0.048092464171350005, "loss/crossentropy": 2.737381660938263, "loss/logits": 0.9238520950078964, "step": 19360 }, { "epoch": 0.1937, "grad_norm": 12.0, "grad_norm_var": 2.992431640625, "learning_rate": 0.0003, "loss": 11.9236, "loss/aux_loss": 0.04809140842407942, "loss/crossentropy": 2.7283401012420656, "loss/logits": 0.9115884095430374, "step": 19370 }, { "epoch": 0.1938, "grad_norm": 13.25, "grad_norm_var": 3.064306640625, "learning_rate": 0.0003, "loss": 11.9373, "loss/aux_loss": 0.04809054136276245, "loss/crossentropy": 2.9607500314712523, "loss/logits": 0.9213700443506241, "step": 19380 }, { "epoch": 0.1939, "grad_norm": 12.5, "grad_norm_var": 0.5308430989583334, "learning_rate": 0.0003, "loss": 12.0326, "loss/aux_loss": 0.04808128047734499, "loss/crossentropy": 2.966229736804962, "loss/logits": 0.9744098156690597, "step": 19390 }, { "epoch": 0.194, "grad_norm": 11.75, "grad_norm_var": 8.039697265625, "learning_rate": 0.0003, "loss": 11.8697, "loss/aux_loss": 0.04811720736324787, "loss/crossentropy": 2.8290202260017394, "loss/logits": 0.9148620575666427, "step": 19400 }, { "epoch": 0.1941, "grad_norm": 12.0625, "grad_norm_var": 0.25983072916666666, "learning_rate": 0.0003, "loss": 11.81, "loss/aux_loss": 0.0480983579531312, "loss/crossentropy": 2.618745720386505, "loss/logits": 0.8423079371452331, "step": 19410 }, { "epoch": 0.1942, "grad_norm": 11.875, "grad_norm_var": 15.804671223958334, "learning_rate": 0.0003, "loss": 11.8575, "loss/aux_loss": 0.048095330409705636, "loss/crossentropy": 2.8630192160606383, "loss/logits": 0.9032826870679855, "step": 19420 }, { "epoch": 0.1943, "grad_norm": 12.3125, "grad_norm_var": 0.23776041666666667, "learning_rate": 0.0003, "loss": 11.8413, "loss/aux_loss": 0.04809598363935948, "loss/crossentropy": 2.714806389808655, "loss/logits": 0.8757014304399491, "step": 19430 }, { "epoch": 0.1944, "grad_norm": 12.0, "grad_norm_var": 0.3473307291666667, "learning_rate": 0.0003, "loss": 11.9041, "loss/aux_loss": 0.04809326659888029, "loss/crossentropy": 2.81378653049469, "loss/logits": 0.9226094603538513, "step": 19440 }, { "epoch": 0.1945, "grad_norm": 12.9375, "grad_norm_var": 0.469384765625, "learning_rate": 0.0003, "loss": 12.0162, "loss/aux_loss": 0.04810178130865097, "loss/crossentropy": 2.7176017642021177, "loss/logits": 0.8890448838472367, "step": 19450 }, { "epoch": 0.1946, "grad_norm": 12.25, "grad_norm_var": 0.7030598958333333, "learning_rate": 0.0003, "loss": 11.8237, "loss/aux_loss": 0.048099988326430324, "loss/crossentropy": 2.578825032711029, "loss/logits": 0.8693708449602127, "step": 19460 }, { "epoch": 0.1947, "grad_norm": 11.875, "grad_norm_var": 0.24191080729166667, "learning_rate": 0.0003, "loss": 11.9591, "loss/aux_loss": 0.04810124989598989, "loss/crossentropy": 2.919858819246292, "loss/logits": 0.9405999302864074, "step": 19470 }, { "epoch": 0.1948, "grad_norm": 11.3125, "grad_norm_var": 0.07786458333333333, "learning_rate": 0.0003, "loss": 11.876, "loss/aux_loss": 0.04809066876769066, "loss/crossentropy": 2.830302083492279, "loss/logits": 0.9027499586343766, "step": 19480 }, { "epoch": 0.1949, "grad_norm": 11.8125, "grad_norm_var": 2.059830729166667, "learning_rate": 0.0003, "loss": 11.879, "loss/aux_loss": 0.04809845667332411, "loss/crossentropy": 2.846599793434143, "loss/logits": 0.8904654294252395, "step": 19490 }, { "epoch": 0.195, "grad_norm": 11.6875, "grad_norm_var": 1.7852701822916666, "learning_rate": 0.0003, "loss": 11.9823, "loss/aux_loss": 0.048098682425916195, "loss/crossentropy": 2.6993426620960235, "loss/logits": 0.9028889060020446, "step": 19500 }, { "epoch": 0.1951, "grad_norm": 11.5, "grad_norm_var": 0.4869140625, "learning_rate": 0.0003, "loss": 11.8731, "loss/aux_loss": 0.04808852486312389, "loss/crossentropy": 2.9450518250465394, "loss/logits": 0.923612329363823, "step": 19510 }, { "epoch": 0.1952, "grad_norm": 11.9375, "grad_norm_var": 0.6424479166666667, "learning_rate": 0.0003, "loss": 11.9629, "loss/aux_loss": 0.048087956570088866, "loss/crossentropy": 2.7310150384902956, "loss/logits": 0.8867404013872147, "step": 19520 }, { "epoch": 0.1953, "grad_norm": 11.6875, "grad_norm_var": 0.7098307291666667, "learning_rate": 0.0003, "loss": 11.9431, "loss/aux_loss": 0.04809447377920151, "loss/crossentropy": 2.7376580953598024, "loss/logits": 0.9287895351648331, "step": 19530 }, { "epoch": 0.1954, "grad_norm": 11.9375, "grad_norm_var": 0.45078125, "learning_rate": 0.0003, "loss": 11.9501, "loss/aux_loss": 0.04808611460030079, "loss/crossentropy": 2.8004270434379577, "loss/logits": 0.9067860126495362, "step": 19540 }, { "epoch": 0.1955, "grad_norm": 11.6875, "grad_norm_var": 0.2916015625, "learning_rate": 0.0003, "loss": 11.8895, "loss/aux_loss": 0.048095055297017096, "loss/crossentropy": 2.7705915451049803, "loss/logits": 0.8861099511384964, "step": 19550 }, { "epoch": 0.1956, "grad_norm": 11.3125, "grad_norm_var": 0.3238118489583333, "learning_rate": 0.0003, "loss": 11.8624, "loss/aux_loss": 0.048082451336085796, "loss/crossentropy": 2.7829070925712585, "loss/logits": 0.9209084331989288, "step": 19560 }, { "epoch": 0.1957, "grad_norm": 11.6875, "grad_norm_var": 48.423893229166666, "learning_rate": 0.0003, "loss": 11.8985, "loss/aux_loss": 0.048102785088121894, "loss/crossentropy": 2.795788884162903, "loss/logits": 0.9219643086194992, "step": 19570 }, { "epoch": 0.1958, "grad_norm": 12.0, "grad_norm_var": 0.17649739583333332, "learning_rate": 0.0003, "loss": 11.8718, "loss/aux_loss": 0.04808918442577124, "loss/crossentropy": 2.6786233842372895, "loss/logits": 0.8851831436157227, "step": 19580 }, { "epoch": 0.1959, "grad_norm": 12.4375, "grad_norm_var": 0.28958333333333336, "learning_rate": 0.0003, "loss": 11.9507, "loss/aux_loss": 0.048088375851511954, "loss/crossentropy": 2.7911401748657227, "loss/logits": 0.9245690137147904, "step": 19590 }, { "epoch": 0.196, "grad_norm": 12.0, "grad_norm_var": 0.4019368489583333, "learning_rate": 0.0003, "loss": 12.0528, "loss/aux_loss": 0.04808861836791038, "loss/crossentropy": 2.89783319234848, "loss/logits": 0.9419608056545258, "step": 19600 }, { "epoch": 0.1961, "grad_norm": 12.25, "grad_norm_var": 4.918733723958334, "learning_rate": 0.0003, "loss": 11.9651, "loss/aux_loss": 0.048108036443591115, "loss/crossentropy": 2.7094775795936585, "loss/logits": 0.8831329464912414, "step": 19610 }, { "epoch": 0.1962, "grad_norm": 13.75, "grad_norm_var": 0.55234375, "learning_rate": 0.0003, "loss": 11.9267, "loss/aux_loss": 0.048093420639634135, "loss/crossentropy": 2.6996466517448425, "loss/logits": 0.8816385596990586, "step": 19620 }, { "epoch": 0.1963, "grad_norm": 11.875, "grad_norm_var": 0.6469889322916667, "learning_rate": 0.0003, "loss": 12.072, "loss/aux_loss": 0.0480880755931139, "loss/crossentropy": 2.9747036695480347, "loss/logits": 0.9532156825065613, "step": 19630 }, { "epoch": 0.1964, "grad_norm": 11.3125, "grad_norm_var": 0.18776041666666668, "learning_rate": 0.0003, "loss": 11.9037, "loss/aux_loss": 0.04809441566467285, "loss/crossentropy": 2.816385340690613, "loss/logits": 0.8943806827068329, "step": 19640 }, { "epoch": 0.1965, "grad_norm": 11.6875, "grad_norm_var": 0.3033854166666667, "learning_rate": 0.0003, "loss": 11.8965, "loss/aux_loss": 0.04809524808079004, "loss/crossentropy": 2.786463499069214, "loss/logits": 0.8947435468435287, "step": 19650 }, { "epoch": 0.1966, "grad_norm": 11.5, "grad_norm_var": 0.31443684895833335, "learning_rate": 0.0003, "loss": 11.9229, "loss/aux_loss": 0.04809077382087708, "loss/crossentropy": 2.635916793346405, "loss/logits": 0.8936503291130066, "step": 19660 }, { "epoch": 0.1967, "grad_norm": 11.9375, "grad_norm_var": 74.60792643229166, "learning_rate": 0.0003, "loss": 11.9338, "loss/aux_loss": 0.04811822287738323, "loss/crossentropy": 2.851349139213562, "loss/logits": 0.9552539438009262, "step": 19670 }, { "epoch": 0.1968, "grad_norm": 13.3125, "grad_norm_var": 539.7253743489583, "learning_rate": 0.0003, "loss": 11.8808, "loss/aux_loss": 0.04809991996735334, "loss/crossentropy": 2.686124062538147, "loss/logits": 0.8603927254676819, "step": 19680 }, { "epoch": 0.1969, "grad_norm": 11.125, "grad_norm_var": 2.3004557291666665, "learning_rate": 0.0003, "loss": 12.0303, "loss/aux_loss": 0.048102331534028056, "loss/crossentropy": 2.856088125705719, "loss/logits": 0.8935580879449845, "step": 19690 }, { "epoch": 0.197, "grad_norm": 12.4375, "grad_norm_var": 0.290625, "learning_rate": 0.0003, "loss": 11.6799, "loss/aux_loss": 0.04809893500059843, "loss/crossentropy": 2.4471123695373533, "loss/logits": 0.8147686392068862, "step": 19700 }, { "epoch": 0.1971, "grad_norm": 15.0, "grad_norm_var": 36.69993489583333, "learning_rate": 0.0003, "loss": 11.9236, "loss/aux_loss": 0.048108641244471076, "loss/crossentropy": 2.7717926442623138, "loss/logits": 0.9015519857406616, "step": 19710 }, { "epoch": 0.1972, "grad_norm": 11.4375, "grad_norm_var": 35.6734375, "learning_rate": 0.0003, "loss": 11.9069, "loss/aux_loss": 0.04808454010635614, "loss/crossentropy": 2.759740972518921, "loss/logits": 0.916330274939537, "step": 19720 }, { "epoch": 0.1973, "grad_norm": 11.8125, "grad_norm_var": 0.5555826822916666, "learning_rate": 0.0003, "loss": 11.8078, "loss/aux_loss": 0.04809134602546692, "loss/crossentropy": 2.824735289812088, "loss/logits": 0.915794974565506, "step": 19730 }, { "epoch": 0.1974, "grad_norm": 11.9375, "grad_norm_var": 0.36354166666666665, "learning_rate": 0.0003, "loss": 12.0126, "loss/aux_loss": 0.04809140507131815, "loss/crossentropy": 2.8288570284843444, "loss/logits": 0.899117162823677, "step": 19740 }, { "epoch": 0.1975, "grad_norm": 12.8125, "grad_norm_var": 0.6856608072916667, "learning_rate": 0.0003, "loss": 12.0629, "loss/aux_loss": 0.04809539392590523, "loss/crossentropy": 2.808449399471283, "loss/logits": 0.9411624908447266, "step": 19750 }, { "epoch": 0.1976, "grad_norm": 12.0625, "grad_norm_var": 0.8481608072916667, "learning_rate": 0.0003, "loss": 11.8754, "loss/aux_loss": 0.04808392804116011, "loss/crossentropy": 2.8559494376182557, "loss/logits": 0.8965466380119324, "step": 19760 }, { "epoch": 0.1977, "grad_norm": 12.25, "grad_norm_var": 0.3729166666666667, "learning_rate": 0.0003, "loss": 11.7598, "loss/aux_loss": 0.048097134567797184, "loss/crossentropy": 2.6028787732124328, "loss/logits": 0.883382824063301, "step": 19770 }, { "epoch": 0.1978, "grad_norm": 12.4375, "grad_norm_var": 0.375, "learning_rate": 0.0003, "loss": 12.0172, "loss/aux_loss": 0.0480909226462245, "loss/crossentropy": 2.745894658565521, "loss/logits": 0.8921247333288193, "step": 19780 }, { "epoch": 0.1979, "grad_norm": 11.875, "grad_norm_var": 0.14869791666666668, "learning_rate": 0.0003, "loss": 11.9666, "loss/aux_loss": 0.04809285439550877, "loss/crossentropy": 2.7850330114364623, "loss/logits": 0.912046593427658, "step": 19790 }, { "epoch": 0.198, "grad_norm": 11.3125, "grad_norm_var": 0.4832682291666667, "learning_rate": 0.0003, "loss": 11.9582, "loss/aux_loss": 0.048090188205242156, "loss/crossentropy": 2.5545909225940706, "loss/logits": 0.8726184368133545, "step": 19800 }, { "epoch": 0.1981, "grad_norm": 12.1875, "grad_norm_var": 0.6088541666666667, "learning_rate": 0.0003, "loss": 11.8976, "loss/aux_loss": 0.04808424971997738, "loss/crossentropy": 2.764713633060455, "loss/logits": 0.9010277688503265, "step": 19810 }, { "epoch": 0.1982, "grad_norm": 11.75, "grad_norm_var": 0.27005208333333336, "learning_rate": 0.0003, "loss": 11.933, "loss/aux_loss": 0.048095231875777245, "loss/crossentropy": 2.8057524442672728, "loss/logits": 0.912197208404541, "step": 19820 }, { "epoch": 0.1983, "grad_norm": 11.5625, "grad_norm_var": 0.286962890625, "learning_rate": 0.0003, "loss": 11.8571, "loss/aux_loss": 0.048092585243284705, "loss/crossentropy": 2.7631799936294557, "loss/logits": 0.9289673507213593, "step": 19830 }, { "epoch": 0.1984, "grad_norm": 11.3125, "grad_norm_var": 0.17291666666666666, "learning_rate": 0.0003, "loss": 11.7907, "loss/aux_loss": 0.04810852501541376, "loss/crossentropy": 2.6866043627262117, "loss/logits": 0.905146250128746, "step": 19840 }, { "epoch": 0.1985, "grad_norm": 11.5, "grad_norm_var": 0.29816080729166666, "learning_rate": 0.0003, "loss": 11.8136, "loss/aux_loss": 0.048087861575186255, "loss/crossentropy": 2.9205057621002197, "loss/logits": 0.8634889364242554, "step": 19850 }, { "epoch": 0.1986, "grad_norm": 12.0, "grad_norm_var": 0.5166015625, "learning_rate": 0.0003, "loss": 12.0486, "loss/aux_loss": 0.0480953972786665, "loss/crossentropy": 2.9846151471138, "loss/logits": 0.929237163066864, "step": 19860 }, { "epoch": 0.1987, "grad_norm": 13.0625, "grad_norm_var": 0.6348307291666667, "learning_rate": 0.0003, "loss": 11.9695, "loss/aux_loss": 0.0481043117120862, "loss/crossentropy": 2.7358814120292663, "loss/logits": 0.8989768654108048, "step": 19870 }, { "epoch": 0.1988, "grad_norm": 13.5625, "grad_norm_var": 4.075260416666667, "learning_rate": 0.0003, "loss": 12.0024, "loss/aux_loss": 0.04810099173337221, "loss/crossentropy": 2.8778868198394774, "loss/logits": 0.945749819278717, "step": 19880 }, { "epoch": 0.1989, "grad_norm": 12.5, "grad_norm_var": 0.3583170572916667, "learning_rate": 0.0003, "loss": 11.8718, "loss/aux_loss": 0.04808882363140583, "loss/crossentropy": 2.848455381393433, "loss/logits": 0.9049903869628906, "step": 19890 }, { "epoch": 0.199, "grad_norm": 11.4375, "grad_norm_var": 0.24680989583333332, "learning_rate": 0.0003, "loss": 11.8808, "loss/aux_loss": 0.04809690471738577, "loss/crossentropy": 2.8279551804065703, "loss/logits": 0.9005701452493667, "step": 19900 }, { "epoch": 0.1991, "grad_norm": 11.5, "grad_norm_var": 0.353369140625, "learning_rate": 0.0003, "loss": 11.9659, "loss/aux_loss": 0.04810180887579918, "loss/crossentropy": 2.8082579135894776, "loss/logits": 0.9061174720525742, "step": 19910 }, { "epoch": 0.1992, "grad_norm": 13.25, "grad_norm_var": 0.4071451822916667, "learning_rate": 0.0003, "loss": 11.7249, "loss/aux_loss": 0.0480797978118062, "loss/crossentropy": 2.827571380138397, "loss/logits": 0.8759961783885956, "step": 19920 }, { "epoch": 0.1993, "grad_norm": 11.875, "grad_norm_var": 0.49192708333333335, "learning_rate": 0.0003, "loss": 12.0004, "loss/aux_loss": 0.04808774013072252, "loss/crossentropy": 2.8538602709770204, "loss/logits": 0.9502677023410797, "step": 19930 }, { "epoch": 0.1994, "grad_norm": 12.5, "grad_norm_var": 0.41868489583333335, "learning_rate": 0.0003, "loss": 11.6786, "loss/aux_loss": 0.04809605274349451, "loss/crossentropy": 2.667774814367294, "loss/logits": 0.8690688908100128, "step": 19940 }, { "epoch": 0.1995, "grad_norm": 12.4375, "grad_norm_var": 0.18214518229166668, "learning_rate": 0.0003, "loss": 11.7086, "loss/aux_loss": 0.04809432104229927, "loss/crossentropy": 2.7703699648380278, "loss/logits": 0.8963902860879898, "step": 19950 }, { "epoch": 0.1996, "grad_norm": 11.6875, "grad_norm_var": 0.4400390625, "learning_rate": 0.0003, "loss": 11.8438, "loss/aux_loss": 0.04808981604874134, "loss/crossentropy": 2.852495664358139, "loss/logits": 0.9264784097671509, "step": 19960 }, { "epoch": 0.1997, "grad_norm": 12.5625, "grad_norm_var": 0.5706868489583333, "learning_rate": 0.0003, "loss": 11.8371, "loss/aux_loss": 0.04808941353112459, "loss/crossentropy": 2.6668283939361572, "loss/logits": 0.8973574638366699, "step": 19970 }, { "epoch": 0.1998, "grad_norm": 11.9375, "grad_norm_var": 0.645556640625, "learning_rate": 0.0003, "loss": 12.0197, "loss/aux_loss": 0.04809361547231674, "loss/crossentropy": 2.7544242978096007, "loss/logits": 0.9390354514122009, "step": 19980 }, { "epoch": 0.1999, "grad_norm": 13.5, "grad_norm_var": 0.3184733072916667, "learning_rate": 0.0003, "loss": 11.998, "loss/aux_loss": 0.04809475131332874, "loss/crossentropy": 2.8074730575084685, "loss/logits": 0.8892990052700043, "step": 19990 }, { "epoch": 0.2, "grad_norm": 12.375, "grad_norm_var": 0.36795247395833336, "learning_rate": 0.0003, "loss": 11.8505, "loss/aux_loss": 0.04809391163289547, "loss/crossentropy": 2.663703387975693, "loss/logits": 0.8925030082464218, "step": 20000 }, { "epoch": 0.2001, "grad_norm": 13.0625, "grad_norm_var": 0.281494140625, "learning_rate": 0.0003, "loss": 11.9197, "loss/aux_loss": 0.048094058968126775, "loss/crossentropy": 2.886281728744507, "loss/logits": 0.8830851048231125, "step": 20010 }, { "epoch": 0.2002, "grad_norm": 11.75, "grad_norm_var": 0.480322265625, "learning_rate": 0.0003, "loss": 11.8763, "loss/aux_loss": 0.04809100497514009, "loss/crossentropy": 2.7841232419013977, "loss/logits": 0.8877080678939819, "step": 20020 }, { "epoch": 0.2003, "grad_norm": 11.125, "grad_norm_var": 3.560660807291667, "learning_rate": 0.0003, "loss": 11.629, "loss/aux_loss": 0.048091770894825456, "loss/crossentropy": 2.6575345516204836, "loss/logits": 0.8857085227966308, "step": 20030 }, { "epoch": 0.2004, "grad_norm": 12.8125, "grad_norm_var": 3.2007649739583335, "learning_rate": 0.0003, "loss": 11.8182, "loss/aux_loss": 0.0480912720784545, "loss/crossentropy": 2.7816062927246095, "loss/logits": 0.8824679642915726, "step": 20040 }, { "epoch": 0.2005, "grad_norm": 11.375, "grad_norm_var": 0.8089680989583333, "learning_rate": 0.0003, "loss": 11.8584, "loss/aux_loss": 0.048090060241520405, "loss/crossentropy": 2.7619201481342315, "loss/logits": 0.8898406118154526, "step": 20050 }, { "epoch": 0.2006, "grad_norm": 11.5625, "grad_norm_var": 0.8502604166666666, "learning_rate": 0.0003, "loss": 11.7573, "loss/aux_loss": 0.04808730930089951, "loss/crossentropy": 2.752053952217102, "loss/logits": 0.9270232617855072, "step": 20060 }, { "epoch": 0.2007, "grad_norm": 12.3125, "grad_norm_var": 0.26287434895833334, "learning_rate": 0.0003, "loss": 11.9228, "loss/aux_loss": 0.04809353221207857, "loss/crossentropy": 2.749705493450165, "loss/logits": 0.9575551152229309, "step": 20070 }, { "epoch": 0.2008, "grad_norm": 12.25, "grad_norm_var": 0.28489583333333335, "learning_rate": 0.0003, "loss": 11.7852, "loss/aux_loss": 0.04809745699167252, "loss/crossentropy": 2.6136541962623596, "loss/logits": 0.9018281251192093, "step": 20080 }, { "epoch": 0.2009, "grad_norm": 11.375, "grad_norm_var": 0.24264322916666667, "learning_rate": 0.0003, "loss": 11.6765, "loss/aux_loss": 0.04809559304267168, "loss/crossentropy": 2.757488173246384, "loss/logits": 0.9004332274198532, "step": 20090 }, { "epoch": 0.201, "grad_norm": 11.6875, "grad_norm_var": 0.10154622395833333, "learning_rate": 0.0003, "loss": 11.942, "loss/aux_loss": 0.04809119962155819, "loss/crossentropy": 2.8438161253929137, "loss/logits": 0.9208786696195602, "step": 20100 }, { "epoch": 0.2011, "grad_norm": 10.8125, "grad_norm_var": 1.2364583333333334, "learning_rate": 0.0003, "loss": 11.8541, "loss/aux_loss": 0.048097537644207475, "loss/crossentropy": 2.763727468252182, "loss/logits": 0.9115354359149933, "step": 20110 }, { "epoch": 0.2012, "grad_norm": 11.6875, "grad_norm_var": 0.5036295572916667, "learning_rate": 0.0003, "loss": 11.8807, "loss/aux_loss": 0.04808099064975977, "loss/crossentropy": 2.8801899015903474, "loss/logits": 0.9038815647363663, "step": 20120 }, { "epoch": 0.2013, "grad_norm": 12.0625, "grad_norm_var": 0.41848958333333336, "learning_rate": 0.0003, "loss": 11.9244, "loss/aux_loss": 0.048102755844593045, "loss/crossentropy": 2.9221291661262514, "loss/logits": 0.9072444885969162, "step": 20130 }, { "epoch": 0.2014, "grad_norm": 12.3125, "grad_norm_var": 0.21223958333333334, "learning_rate": 0.0003, "loss": 11.7636, "loss/aux_loss": 0.04809268806129694, "loss/crossentropy": 2.69385387301445, "loss/logits": 0.9101363003253937, "step": 20140 }, { "epoch": 0.2015, "grad_norm": 11.75, "grad_norm_var": 0.7202962239583334, "learning_rate": 0.0003, "loss": 11.82, "loss/aux_loss": 0.04809230994433165, "loss/crossentropy": 2.800033277273178, "loss/logits": 0.9162040054798126, "step": 20150 }, { "epoch": 0.2016, "grad_norm": 11.9375, "grad_norm_var": 0.5473795572916667, "learning_rate": 0.0003, "loss": 11.9676, "loss/aux_loss": 0.04809824600815773, "loss/crossentropy": 2.828506714105606, "loss/logits": 0.8795387417078018, "step": 20160 }, { "epoch": 0.2017, "grad_norm": 11.75, "grad_norm_var": 0.40305989583333335, "learning_rate": 0.0003, "loss": 11.9633, "loss/aux_loss": 0.0480917414650321, "loss/crossentropy": 2.8800631880760195, "loss/logits": 0.875808122754097, "step": 20170 }, { "epoch": 0.2018, "grad_norm": 12.4375, "grad_norm_var": 0.460009765625, "learning_rate": 0.0003, "loss": 11.8485, "loss/aux_loss": 0.04808654151856899, "loss/crossentropy": 2.7784875988960267, "loss/logits": 0.9081717103719711, "step": 20180 }, { "epoch": 0.2019, "grad_norm": 12.4375, "grad_norm_var": 0.23917643229166666, "learning_rate": 0.0003, "loss": 11.8831, "loss/aux_loss": 0.04809090811759233, "loss/crossentropy": 2.8120036482810975, "loss/logits": 0.9035557597875595, "step": 20190 }, { "epoch": 0.202, "grad_norm": 11.625, "grad_norm_var": 0.2228515625, "learning_rate": 0.0003, "loss": 11.8836, "loss/aux_loss": 0.048092659749090674, "loss/crossentropy": 2.607446867227554, "loss/logits": 0.9118954926729202, "step": 20200 }, { "epoch": 0.2021, "grad_norm": 11.9375, "grad_norm_var": 0.41139322916666665, "learning_rate": 0.0003, "loss": 11.9498, "loss/aux_loss": 0.04809481520205736, "loss/crossentropy": 2.781216490268707, "loss/logits": 0.88041250705719, "step": 20210 }, { "epoch": 0.2022, "grad_norm": 12.5625, "grad_norm_var": 1.6801432291666667, "learning_rate": 0.0003, "loss": 12.0076, "loss/aux_loss": 0.04808514565229416, "loss/crossentropy": 2.691144472360611, "loss/logits": 0.9039320826530457, "step": 20220 }, { "epoch": 0.2023, "grad_norm": 12.875, "grad_norm_var": 1.501416015625, "learning_rate": 0.0003, "loss": 11.8567, "loss/aux_loss": 0.048104914277791976, "loss/crossentropy": 2.6379098296165466, "loss/logits": 0.8757845312356949, "step": 20230 }, { "epoch": 0.2024, "grad_norm": 11.625, "grad_norm_var": 165.79401041666668, "learning_rate": 0.0003, "loss": 11.967, "loss/aux_loss": 0.0480836084112525, "loss/crossentropy": 2.774720752239227, "loss/logits": 0.8843321442604065, "step": 20240 }, { "epoch": 0.2025, "grad_norm": 12.6875, "grad_norm_var": 0.454931640625, "learning_rate": 0.0003, "loss": 11.9247, "loss/aux_loss": 0.04810081459581852, "loss/crossentropy": 2.810983347892761, "loss/logits": 0.9296145677566529, "step": 20250 }, { "epoch": 0.2026, "grad_norm": 12.0625, "grad_norm_var": 0.36795247395833336, "learning_rate": 0.0003, "loss": 11.6745, "loss/aux_loss": 0.04808822646737099, "loss/crossentropy": 2.684907627105713, "loss/logits": 0.8728846788406373, "step": 20260 }, { "epoch": 0.2027, "grad_norm": 12.375, "grad_norm_var": 0.21339518229166668, "learning_rate": 0.0003, "loss": 11.9664, "loss/aux_loss": 0.048096506483852865, "loss/crossentropy": 2.8471142053604126, "loss/logits": 0.9133492529392242, "step": 20270 }, { "epoch": 0.2028, "grad_norm": 11.4375, "grad_norm_var": 0.7161295572916667, "learning_rate": 0.0003, "loss": 11.7893, "loss/aux_loss": 0.04809205681085586, "loss/crossentropy": 2.612596166133881, "loss/logits": 0.870291605591774, "step": 20280 }, { "epoch": 0.2029, "grad_norm": 12.8125, "grad_norm_var": 0.8388020833333333, "learning_rate": 0.0003, "loss": 11.9102, "loss/aux_loss": 0.04809516165405512, "loss/crossentropy": 3.0022507131099703, "loss/logits": 0.9417583554983139, "step": 20290 }, { "epoch": 0.203, "grad_norm": 11.8125, "grad_norm_var": 0.9346354166666667, "learning_rate": 0.0003, "loss": 11.919, "loss/aux_loss": 0.048092111200094226, "loss/crossentropy": 2.8318777084350586, "loss/logits": 0.9274811953306198, "step": 20300 }, { "epoch": 0.2031, "grad_norm": 11.4375, "grad_norm_var": 0.48639322916666666, "learning_rate": 0.0003, "loss": 11.8993, "loss/aux_loss": 0.04809704571962357, "loss/crossentropy": 2.653505039215088, "loss/logits": 0.8742677927017212, "step": 20310 }, { "epoch": 0.2032, "grad_norm": 11.875, "grad_norm_var": 0.3296875, "learning_rate": 0.0003, "loss": 11.8204, "loss/aux_loss": 0.04809397198259831, "loss/crossentropy": 2.8869922399520873, "loss/logits": 0.9349960386753082, "step": 20320 }, { "epoch": 0.2033, "grad_norm": 12.625, "grad_norm_var": 0.4853515625, "learning_rate": 0.0003, "loss": 11.7685, "loss/aux_loss": 0.04808988757431507, "loss/crossentropy": 2.713202327489853, "loss/logits": 0.9036098659038544, "step": 20330 }, { "epoch": 0.2034, "grad_norm": 11.25, "grad_norm_var": 0.31243489583333334, "learning_rate": 0.0003, "loss": 11.783, "loss/aux_loss": 0.04809346310794353, "loss/crossentropy": 2.7694801688194275, "loss/logits": 0.9182763814926147, "step": 20340 }, { "epoch": 0.2035, "grad_norm": 12.4375, "grad_norm_var": 0.44308268229166664, "learning_rate": 0.0003, "loss": 11.8069, "loss/aux_loss": 0.04808987472206354, "loss/crossentropy": 2.7956194162368773, "loss/logits": 0.8902797639369965, "step": 20350 }, { "epoch": 0.2036, "grad_norm": 11.6875, "grad_norm_var": 0.18567708333333333, "learning_rate": 0.0003, "loss": 11.8991, "loss/aux_loss": 0.048092426359653474, "loss/crossentropy": 2.7856826066970823, "loss/logits": 0.8947367310523987, "step": 20360 }, { "epoch": 0.2037, "grad_norm": 11.8125, "grad_norm_var": 7.349739583333333, "learning_rate": 0.0003, "loss": 11.8399, "loss/aux_loss": 0.04809611644595861, "loss/crossentropy": 2.7929181456565857, "loss/logits": 0.9144560486078263, "step": 20370 }, { "epoch": 0.2038, "grad_norm": 13.625, "grad_norm_var": 0.5590983072916667, "learning_rate": 0.0003, "loss": 11.8832, "loss/aux_loss": 0.04809699356555939, "loss/crossentropy": 2.8222780883312226, "loss/logits": 0.8883333146572113, "step": 20380 }, { "epoch": 0.2039, "grad_norm": 12.0625, "grad_norm_var": 7.416650390625, "learning_rate": 0.0003, "loss": 11.7412, "loss/aux_loss": 0.048093249835073945, "loss/crossentropy": 2.8504996478557585, "loss/logits": 0.9075500249862671, "step": 20390 }, { "epoch": 0.204, "grad_norm": 12.0625, "grad_norm_var": 0.23331705729166666, "learning_rate": 0.0003, "loss": 12.0127, "loss/aux_loss": 0.04809419121593237, "loss/crossentropy": 2.8577288150787354, "loss/logits": 0.902344498038292, "step": 20400 }, { "epoch": 0.2041, "grad_norm": 12.5, "grad_norm_var": 77.52389322916666, "learning_rate": 0.0003, "loss": 11.7763, "loss/aux_loss": 0.04809053186327219, "loss/crossentropy": 2.835852700471878, "loss/logits": 0.9226241886615754, "step": 20410 }, { "epoch": 0.2042, "grad_norm": 11.4375, "grad_norm_var": 0.4442057291666667, "learning_rate": 0.0003, "loss": 11.8972, "loss/aux_loss": 0.04809423293918371, "loss/crossentropy": 2.8161026298999787, "loss/logits": 0.926646676659584, "step": 20420 }, { "epoch": 0.2043, "grad_norm": 11.4375, "grad_norm_var": 0.691650390625, "learning_rate": 0.0003, "loss": 11.8844, "loss/aux_loss": 0.04809097535908222, "loss/crossentropy": 2.763902723789215, "loss/logits": 0.9052618652582168, "step": 20430 }, { "epoch": 0.2044, "grad_norm": 12.6875, "grad_norm_var": 0.6910807291666666, "learning_rate": 0.0003, "loss": 11.7705, "loss/aux_loss": 0.04810637105256319, "loss/crossentropy": 2.6885470867156984, "loss/logits": 0.8763923466205596, "step": 20440 }, { "epoch": 0.2045, "grad_norm": 13.8125, "grad_norm_var": 56.29894205729167, "learning_rate": 0.0003, "loss": 11.9226, "loss/aux_loss": 0.048100747354328635, "loss/crossentropy": 2.8076935350894927, "loss/logits": 0.894736310839653, "step": 20450 }, { "epoch": 0.2046, "grad_norm": 12.8125, "grad_norm_var": 54.62161458333333, "learning_rate": 0.0003, "loss": 11.8524, "loss/aux_loss": 0.0480956481769681, "loss/crossentropy": 2.8430003762245177, "loss/logits": 0.9245178937911988, "step": 20460 }, { "epoch": 0.2047, "grad_norm": 11.3125, "grad_norm_var": 0.391650390625, "learning_rate": 0.0003, "loss": 11.7571, "loss/aux_loss": 0.04809457026422024, "loss/crossentropy": 2.6352721631526945, "loss/logits": 0.8530379116535187, "step": 20470 }, { "epoch": 0.2048, "grad_norm": 13.4375, "grad_norm_var": 0.6627604166666666, "learning_rate": 0.0003, "loss": 11.9712, "loss/aux_loss": 0.04810174349695444, "loss/crossentropy": 2.8618993282318117, "loss/logits": 0.8944301903247833, "step": 20480 }, { "epoch": 0.2049, "grad_norm": 13.9375, "grad_norm_var": 0.7, "learning_rate": 0.0003, "loss": 11.951, "loss/aux_loss": 0.04809792432934046, "loss/crossentropy": 2.7998989462852477, "loss/logits": 0.8966112703084945, "step": 20490 }, { "epoch": 0.205, "grad_norm": 13.625, "grad_norm_var": 0.350244140625, "learning_rate": 0.0003, "loss": 12.0158, "loss/aux_loss": 0.04808657988905907, "loss/crossentropy": 2.919311285018921, "loss/logits": 0.9131373822689056, "step": 20500 }, { "epoch": 0.2051, "grad_norm": 11.625, "grad_norm_var": 0.4176432291666667, "learning_rate": 0.0003, "loss": 11.8259, "loss/aux_loss": 0.04808542001992464, "loss/crossentropy": 2.8769485354423523, "loss/logits": 0.9414841264486313, "step": 20510 }, { "epoch": 0.2052, "grad_norm": 11.9375, "grad_norm_var": 0.343994140625, "learning_rate": 0.0003, "loss": 11.9157, "loss/aux_loss": 0.04809294939041138, "loss/crossentropy": 2.6883323431015014, "loss/logits": 0.8925897628068924, "step": 20520 }, { "epoch": 0.2053, "grad_norm": 11.25, "grad_norm_var": 0.4405598958333333, "learning_rate": 0.0003, "loss": 11.9364, "loss/aux_loss": 0.04809175301343203, "loss/crossentropy": 2.723712849617004, "loss/logits": 0.8805695950984955, "step": 20530 }, { "epoch": 0.2054, "grad_norm": 12.25, "grad_norm_var": 0.5085774739583333, "learning_rate": 0.0003, "loss": 11.7972, "loss/aux_loss": 0.04808581694960594, "loss/crossentropy": 2.7969413816928865, "loss/logits": 0.8847203850746155, "step": 20540 }, { "epoch": 0.2055, "grad_norm": 11.6875, "grad_norm_var": 0.1775390625, "learning_rate": 0.0003, "loss": 11.8719, "loss/aux_loss": 0.04808731451630592, "loss/crossentropy": 2.7113927245140075, "loss/logits": 0.8873605281114578, "step": 20550 }, { "epoch": 0.2056, "grad_norm": 12.6875, "grad_norm_var": 0.39837239583333334, "learning_rate": 0.0003, "loss": 11.7171, "loss/aux_loss": 0.048092583753168584, "loss/crossentropy": 2.7325293242931368, "loss/logits": 0.8983477979898453, "step": 20560 }, { "epoch": 0.2057, "grad_norm": 12.25, "grad_norm_var": 0.3421223958333333, "learning_rate": 0.0003, "loss": 11.791, "loss/aux_loss": 0.048091687634587287, "loss/crossentropy": 2.744081234931946, "loss/logits": 0.9124333083629608, "step": 20570 }, { "epoch": 0.2058, "grad_norm": 12.0625, "grad_norm_var": 0.3817057291666667, "learning_rate": 0.0003, "loss": 11.8137, "loss/aux_loss": 0.04810182899236679, "loss/crossentropy": 2.82774156332016, "loss/logits": 0.9164555937051773, "step": 20580 }, { "epoch": 0.2059, "grad_norm": 12.9375, "grad_norm_var": 2.7134765625, "learning_rate": 0.0003, "loss": 11.7681, "loss/aux_loss": 0.04808175358921289, "loss/crossentropy": 2.7752291679382326, "loss/logits": 0.923569667339325, "step": 20590 }, { "epoch": 0.206, "grad_norm": 11.875, "grad_norm_var": 2.0637858072916666, "learning_rate": 0.0003, "loss": 11.8039, "loss/aux_loss": 0.04809446018189192, "loss/crossentropy": 2.7302875399589537, "loss/logits": 0.9092204391956329, "step": 20600 }, { "epoch": 0.2061, "grad_norm": 12.8125, "grad_norm_var": 0.3763020833333333, "learning_rate": 0.0003, "loss": 11.7904, "loss/aux_loss": 0.04808835070580244, "loss/crossentropy": 2.9346749424934386, "loss/logits": 0.8693037539720535, "step": 20610 }, { "epoch": 0.2062, "grad_norm": 11.625, "grad_norm_var": 0.2877604166666667, "learning_rate": 0.0003, "loss": 11.7153, "loss/aux_loss": 0.04808915480971336, "loss/crossentropy": 2.891802728176117, "loss/logits": 0.872070437669754, "step": 20620 }, { "epoch": 0.2063, "grad_norm": 12.25, "grad_norm_var": 43.424739583333334, "learning_rate": 0.0003, "loss": 11.8111, "loss/aux_loss": 0.0481000667437911, "loss/crossentropy": 2.752538466453552, "loss/logits": 0.910025691986084, "step": 20630 }, { "epoch": 0.2064, "grad_norm": 12.1875, "grad_norm_var": 0.3419270833333333, "learning_rate": 0.0003, "loss": 11.7758, "loss/aux_loss": 0.04808867033571005, "loss/crossentropy": 2.816843068599701, "loss/logits": 0.8870363384485245, "step": 20640 }, { "epoch": 0.2065, "grad_norm": 11.4375, "grad_norm_var": 0.27537434895833335, "learning_rate": 0.0003, "loss": 11.9281, "loss/aux_loss": 0.04809427950531244, "loss/crossentropy": 2.798200511932373, "loss/logits": 0.9375192672014236, "step": 20650 }, { "epoch": 0.2066, "grad_norm": 12.375, "grad_norm_var": 3.9953125, "learning_rate": 0.0003, "loss": 11.9466, "loss/aux_loss": 0.048091378435492514, "loss/crossentropy": 2.707651823759079, "loss/logits": 0.8876151233911515, "step": 20660 }, { "epoch": 0.2067, "grad_norm": 12.5, "grad_norm_var": 1.7358723958333333, "learning_rate": 0.0003, "loss": 11.8981, "loss/aux_loss": 0.048106171749532224, "loss/crossentropy": 2.697846329212189, "loss/logits": 0.9072348445653915, "step": 20670 }, { "epoch": 0.2068, "grad_norm": 13.0, "grad_norm_var": 1.8640625, "learning_rate": 0.0003, "loss": 11.7331, "loss/aux_loss": 0.04809394646435976, "loss/crossentropy": 2.8085982382297514, "loss/logits": 0.8863144606351853, "step": 20680 }, { "epoch": 0.2069, "grad_norm": 12.6875, "grad_norm_var": 0.5259765625, "learning_rate": 0.0003, "loss": 11.6847, "loss/aux_loss": 0.048090824671089646, "loss/crossentropy": 2.688712340593338, "loss/logits": 0.9194134473800659, "step": 20690 }, { "epoch": 0.207, "grad_norm": 12.25, "grad_norm_var": 0.43430989583333335, "learning_rate": 0.0003, "loss": 11.8249, "loss/aux_loss": 0.048093540407717225, "loss/crossentropy": 2.8308887600898744, "loss/logits": 0.9012588620185852, "step": 20700 }, { "epoch": 0.2071, "grad_norm": 12.625, "grad_norm_var": 0.38483072916666666, "learning_rate": 0.0003, "loss": 11.8041, "loss/aux_loss": 0.04809002298861742, "loss/crossentropy": 2.7663665294647215, "loss/logits": 0.8768691569566727, "step": 20710 }, { "epoch": 0.2072, "grad_norm": 12.0625, "grad_norm_var": 0.8957682291666667, "learning_rate": 0.0003, "loss": 11.7693, "loss/aux_loss": 0.0480813292786479, "loss/crossentropy": 2.7900067985057833, "loss/logits": 0.9126892119646073, "step": 20720 }, { "epoch": 0.2073, "grad_norm": 11.25, "grad_norm_var": 0.735400390625, "learning_rate": 0.0003, "loss": 11.8439, "loss/aux_loss": 0.04809948187321424, "loss/crossentropy": 2.7565189003944397, "loss/logits": 0.8766478002071381, "step": 20730 }, { "epoch": 0.2074, "grad_norm": 12.125, "grad_norm_var": 0.45558268229166665, "learning_rate": 0.0003, "loss": 11.8518, "loss/aux_loss": 0.048083253763616086, "loss/crossentropy": 2.7713675141334533, "loss/logits": 0.8875041484832764, "step": 20740 }, { "epoch": 0.2075, "grad_norm": 12.8125, "grad_norm_var": 0.9202962239583333, "learning_rate": 0.0003, "loss": 11.9413, "loss/aux_loss": 0.04809508752077818, "loss/crossentropy": 2.937278914451599, "loss/logits": 0.9425408929586411, "step": 20750 }, { "epoch": 0.2076, "grad_norm": 11.5, "grad_norm_var": 0.9744140625, "learning_rate": 0.0003, "loss": 11.9612, "loss/aux_loss": 0.04808584563434124, "loss/crossentropy": 2.837298500537872, "loss/logits": 0.9269993782043457, "step": 20760 }, { "epoch": 0.2077, "grad_norm": 12.0, "grad_norm_var": 0.24420572916666666, "learning_rate": 0.0003, "loss": 11.8762, "loss/aux_loss": 0.0480938971042633, "loss/crossentropy": 2.7334739685058596, "loss/logits": 0.8894063144922256, "step": 20770 }, { "epoch": 0.2078, "grad_norm": 11.4375, "grad_norm_var": 0.2652180989583333, "learning_rate": 0.0003, "loss": 11.8865, "loss/aux_loss": 0.04807817693799734, "loss/crossentropy": 2.7916306495666503, "loss/logits": 0.9009652465581894, "step": 20780 }, { "epoch": 0.2079, "grad_norm": 12.4375, "grad_norm_var": 0.24921875, "learning_rate": 0.0003, "loss": 11.8028, "loss/aux_loss": 0.0480882965028286, "loss/crossentropy": 2.7257829308509827, "loss/logits": 0.9154664635658264, "step": 20790 }, { "epoch": 0.208, "grad_norm": 12.6875, "grad_norm_var": 0.2400390625, "learning_rate": 0.0003, "loss": 12.0261, "loss/aux_loss": 0.048092817142605784, "loss/crossentropy": 2.7954021215438845, "loss/logits": 0.9113053381443024, "step": 20800 }, { "epoch": 0.2081, "grad_norm": 13.0, "grad_norm_var": 0.2259765625, "learning_rate": 0.0003, "loss": 11.7759, "loss/aux_loss": 0.04809394646435976, "loss/crossentropy": 2.8197373390197753, "loss/logits": 0.8813621670007705, "step": 20810 }, { "epoch": 0.2082, "grad_norm": 19.5, "grad_norm_var": 3.6874348958333334, "learning_rate": 0.0003, "loss": 11.6814, "loss/aux_loss": 0.0480814166367054, "loss/crossentropy": 2.817176288366318, "loss/logits": 0.9225684970617294, "step": 20820 }, { "epoch": 0.2083, "grad_norm": 13.0625, "grad_norm_var": 4.002718098958334, "learning_rate": 0.0003, "loss": 12.0019, "loss/aux_loss": 0.0480852359905839, "loss/crossentropy": 2.668290489912033, "loss/logits": 0.9053199380636215, "step": 20830 }, { "epoch": 0.2084, "grad_norm": 11.875, "grad_norm_var": 0.613134765625, "learning_rate": 0.0003, "loss": 11.8592, "loss/aux_loss": 0.04809372667223215, "loss/crossentropy": 2.7935187935829164, "loss/logits": 0.8976625889539719, "step": 20840 }, { "epoch": 0.2085, "grad_norm": 13.125, "grad_norm_var": 0.438134765625, "learning_rate": 0.0003, "loss": 11.6427, "loss/aux_loss": 0.04809672702103853, "loss/crossentropy": 2.868895101547241, "loss/logits": 0.8949025511741638, "step": 20850 }, { "epoch": 0.2086, "grad_norm": 12.375, "grad_norm_var": 0.442822265625, "learning_rate": 0.0003, "loss": 11.7802, "loss/aux_loss": 0.04807591456919909, "loss/crossentropy": 2.884802359342575, "loss/logits": 0.88098503947258, "step": 20860 }, { "epoch": 0.2087, "grad_norm": 11.875, "grad_norm_var": 0.331103515625, "learning_rate": 0.0003, "loss": 11.8061, "loss/aux_loss": 0.048096058703958985, "loss/crossentropy": 2.86653151512146, "loss/logits": 0.9177994340658188, "step": 20870 }, { "epoch": 0.2088, "grad_norm": 11.5625, "grad_norm_var": 0.1890625, "learning_rate": 0.0003, "loss": 11.6698, "loss/aux_loss": 0.04808076079934835, "loss/crossentropy": 2.669018977880478, "loss/logits": 0.8996531933546066, "step": 20880 }, { "epoch": 0.2089, "grad_norm": 11.375, "grad_norm_var": 0.10584309895833334, "learning_rate": 0.0003, "loss": 11.7736, "loss/aux_loss": 0.048097938485443595, "loss/crossentropy": 2.814938074350357, "loss/logits": 0.8803753167390823, "step": 20890 }, { "epoch": 0.209, "grad_norm": 12.6875, "grad_norm_var": 0.249462890625, "learning_rate": 0.0003, "loss": 11.8773, "loss/aux_loss": 0.048087738640606406, "loss/crossentropy": 2.8559614181518556, "loss/logits": 0.924946254491806, "step": 20900 }, { "epoch": 0.2091, "grad_norm": 11.8125, "grad_norm_var": 0.45677083333333335, "learning_rate": 0.0003, "loss": 11.7792, "loss/aux_loss": 0.04809671528637409, "loss/crossentropy": 2.833332586288452, "loss/logits": 0.9064707219600677, "step": 20910 }, { "epoch": 0.2092, "grad_norm": 12.875, "grad_norm_var": 0.45115559895833335, "learning_rate": 0.0003, "loss": 11.6766, "loss/aux_loss": 0.048085590824484825, "loss/crossentropy": 2.6808117508888243, "loss/logits": 0.8879122287034988, "step": 20920 }, { "epoch": 0.2093, "grad_norm": 13.0, "grad_norm_var": 0.3841145833333333, "learning_rate": 0.0003, "loss": 11.926, "loss/aux_loss": 0.048085760325193405, "loss/crossentropy": 2.8332987904548643, "loss/logits": 0.9374050021171569, "step": 20930 }, { "epoch": 0.2094, "grad_norm": 12.375, "grad_norm_var": 0.48125, "learning_rate": 0.0003, "loss": 11.7561, "loss/aux_loss": 0.0480853458866477, "loss/crossentropy": 2.8423936545848845, "loss/logits": 0.9064114809036254, "step": 20940 }, { "epoch": 0.2095, "grad_norm": 12.25, "grad_norm_var": 0.6079264322916667, "learning_rate": 0.0003, "loss": 11.7373, "loss/aux_loss": 0.048095690459012984, "loss/crossentropy": 2.6432439744472505, "loss/logits": 0.8873191922903061, "step": 20950 }, { "epoch": 0.2096, "grad_norm": 12.0625, "grad_norm_var": 0.54296875, "learning_rate": 0.0003, "loss": 11.7411, "loss/aux_loss": 0.04809337351471186, "loss/crossentropy": 2.7309992611408234, "loss/logits": 0.8804535895586014, "step": 20960 }, { "epoch": 0.2097, "grad_norm": 12.0, "grad_norm_var": 0.32734375, "learning_rate": 0.0003, "loss": 11.8208, "loss/aux_loss": 0.04809424672275782, "loss/crossentropy": 2.733998316526413, "loss/logits": 0.8988215506076813, "step": 20970 }, { "epoch": 0.2098, "grad_norm": 11.9375, "grad_norm_var": 0.21222330729166666, "learning_rate": 0.0003, "loss": 11.896, "loss/aux_loss": 0.04808952175080776, "loss/crossentropy": 2.5865807056427004, "loss/logits": 0.854627400636673, "step": 20980 }, { "epoch": 0.2099, "grad_norm": 12.375, "grad_norm_var": 0.38645833333333335, "learning_rate": 0.0003, "loss": 11.8622, "loss/aux_loss": 0.04808156508952379, "loss/crossentropy": 2.851080930233002, "loss/logits": 0.9109200239181519, "step": 20990 }, { "epoch": 0.21, "grad_norm": 11.6875, "grad_norm_var": 0.49661458333333336, "learning_rate": 0.0003, "loss": 11.7082, "loss/aux_loss": 0.04808969963341951, "loss/crossentropy": 2.84299578666687, "loss/logits": 0.8802772104740143, "step": 21000 }, { "epoch": 0.2101, "grad_norm": 12.375, "grad_norm_var": 0.5040201822916667, "learning_rate": 0.0003, "loss": 11.8945, "loss/aux_loss": 0.048087981343269345, "loss/crossentropy": 2.6353746175765993, "loss/logits": 0.8833418905735015, "step": 21010 }, { "epoch": 0.2102, "grad_norm": 11.75, "grad_norm_var": 0.4593098958333333, "learning_rate": 0.0003, "loss": 11.6485, "loss/aux_loss": 0.048100402019917964, "loss/crossentropy": 2.8276872038841248, "loss/logits": 0.9351934552192688, "step": 21020 }, { "epoch": 0.2103, "grad_norm": 12.6875, "grad_norm_var": 0.4083170572916667, "learning_rate": 0.0003, "loss": 11.6644, "loss/aux_loss": 0.048088280111551286, "loss/crossentropy": 2.8238101243972777, "loss/logits": 0.8919139176607132, "step": 21030 }, { "epoch": 0.2104, "grad_norm": 11.375, "grad_norm_var": 0.4735514322916667, "learning_rate": 0.0003, "loss": 11.6249, "loss/aux_loss": 0.04809662196785212, "loss/crossentropy": 2.7005931556224825, "loss/logits": 0.8710683017969132, "step": 21040 }, { "epoch": 0.2105, "grad_norm": 12.0, "grad_norm_var": 0.3633951822916667, "learning_rate": 0.0003, "loss": 11.895, "loss/aux_loss": 0.048086578585207464, "loss/crossentropy": 2.9533318161964415, "loss/logits": 0.9189774692058563, "step": 21050 }, { "epoch": 0.2106, "grad_norm": 13.0, "grad_norm_var": 0.34542643229166664, "learning_rate": 0.0003, "loss": 11.933, "loss/aux_loss": 0.04809347465634346, "loss/crossentropy": 2.7933754503726957, "loss/logits": 0.8989807814359665, "step": 21060 }, { "epoch": 0.2107, "grad_norm": 11.625, "grad_norm_var": 0.3900390625, "learning_rate": 0.0003, "loss": 11.7229, "loss/aux_loss": 0.048093566112220286, "loss/crossentropy": 2.909538185596466, "loss/logits": 0.9254848033189773, "step": 21070 }, { "epoch": 0.2108, "grad_norm": 12.3125, "grad_norm_var": 0.48483072916666664, "learning_rate": 0.0003, "loss": 11.8718, "loss/aux_loss": 0.048095266707241534, "loss/crossentropy": 2.77746034860611, "loss/logits": 0.8918098568916321, "step": 21080 }, { "epoch": 0.2109, "grad_norm": 12.3125, "grad_norm_var": 0.5109375, "learning_rate": 0.0003, "loss": 11.8693, "loss/aux_loss": 0.04808137100189924, "loss/crossentropy": 2.8595972299575805, "loss/logits": 0.9113215535879136, "step": 21090 }, { "epoch": 0.211, "grad_norm": 12.1875, "grad_norm_var": 0.226416015625, "learning_rate": 0.0003, "loss": 11.7439, "loss/aux_loss": 0.04809752646833658, "loss/crossentropy": 2.8955005407333374, "loss/logits": 0.9018853276968002, "step": 21100 }, { "epoch": 0.2111, "grad_norm": 13.3125, "grad_norm_var": 0.44021809895833336, "learning_rate": 0.0003, "loss": 11.8998, "loss/aux_loss": 0.04808686450123787, "loss/crossentropy": 2.8281142473220826, "loss/logits": 0.9361472398042678, "step": 21110 }, { "epoch": 0.2112, "grad_norm": 11.8125, "grad_norm_var": 0.7166015625, "learning_rate": 0.0003, "loss": 11.9078, "loss/aux_loss": 0.04808351919054985, "loss/crossentropy": 2.8331224858760833, "loss/logits": 0.925226366519928, "step": 21120 }, { "epoch": 0.2113, "grad_norm": 12.25, "grad_norm_var": 0.20045572916666668, "learning_rate": 0.0003, "loss": 11.8609, "loss/aux_loss": 0.048093733564019206, "loss/crossentropy": 2.7209898710250853, "loss/logits": 0.9094552010297775, "step": 21130 }, { "epoch": 0.2114, "grad_norm": 13.1875, "grad_norm_var": 0.3734212239583333, "learning_rate": 0.0003, "loss": 11.8317, "loss/aux_loss": 0.04808988496661186, "loss/crossentropy": 2.7308087766170503, "loss/logits": 0.875358846783638, "step": 21140 }, { "epoch": 0.2115, "grad_norm": 11.875, "grad_norm_var": 1.264306640625, "learning_rate": 0.0003, "loss": 11.7752, "loss/aux_loss": 0.04809015057981014, "loss/crossentropy": 2.744631814956665, "loss/logits": 0.890999186038971, "step": 21150 }, { "epoch": 0.2116, "grad_norm": 11.3125, "grad_norm_var": 1.3072916666666667, "learning_rate": 0.0003, "loss": 11.7284, "loss/aux_loss": 0.04809484537690878, "loss/crossentropy": 2.7210877299308778, "loss/logits": 0.8934634417295456, "step": 21160 }, { "epoch": 0.2117, "grad_norm": 13.75, "grad_norm_var": 0.718603515625, "learning_rate": 0.0003, "loss": 11.6448, "loss/aux_loss": 0.04808205626904964, "loss/crossentropy": 2.6400113105773926, "loss/logits": 0.8528676211833954, "step": 21170 }, { "epoch": 0.2118, "grad_norm": 11.625, "grad_norm_var": 0.5536295572916666, "learning_rate": 0.0003, "loss": 11.881, "loss/aux_loss": 0.0481026129797101, "loss/crossentropy": 2.7602346658706667, "loss/logits": 0.8984622836112977, "step": 21180 }, { "epoch": 0.2119, "grad_norm": 12.3125, "grad_norm_var": 0.2978515625, "learning_rate": 0.0003, "loss": 11.9188, "loss/aux_loss": 0.04807357918471098, "loss/crossentropy": 2.9864767670631407, "loss/logits": 0.9314229309558868, "step": 21190 }, { "epoch": 0.212, "grad_norm": 12.0, "grad_norm_var": 0.6145182291666667, "learning_rate": 0.0003, "loss": 12.0001, "loss/aux_loss": 0.048103776201605795, "loss/crossentropy": 2.816402053833008, "loss/logits": 0.904208105802536, "step": 21200 }, { "epoch": 0.2121, "grad_norm": 12.0625, "grad_norm_var": 0.7231608072916667, "learning_rate": 0.0003, "loss": 11.7257, "loss/aux_loss": 0.048085693083703515, "loss/crossentropy": 2.708358186483383, "loss/logits": 0.898724827170372, "step": 21210 }, { "epoch": 0.2122, "grad_norm": 12.0, "grad_norm_var": 15.506103515625, "learning_rate": 0.0003, "loss": 11.7972, "loss/aux_loss": 0.04809206072241068, "loss/crossentropy": 2.744321119785309, "loss/logits": 0.9206572264432907, "step": 21220 }, { "epoch": 0.2123, "grad_norm": 14.3125, "grad_norm_var": 14.617041015625, "learning_rate": 0.0003, "loss": 11.7268, "loss/aux_loss": 0.048096605204045774, "loss/crossentropy": 2.72475118637085, "loss/logits": 0.874206417798996, "step": 21230 }, { "epoch": 0.2124, "grad_norm": 12.9375, "grad_norm_var": 0.4019368489583333, "learning_rate": 0.0003, "loss": 11.8328, "loss/aux_loss": 0.04808426145464182, "loss/crossentropy": 2.815295088291168, "loss/logits": 0.9246113210916519, "step": 21240 }, { "epoch": 0.2125, "grad_norm": 11.8125, "grad_norm_var": 0.375634765625, "learning_rate": 0.0003, "loss": 11.8002, "loss/aux_loss": 0.048097210749983785, "loss/crossentropy": 2.5762141942977905, "loss/logits": 0.8894416421651841, "step": 21250 }, { "epoch": 0.2126, "grad_norm": 13.0, "grad_norm_var": 0.410400390625, "learning_rate": 0.0003, "loss": 11.8324, "loss/aux_loss": 0.048088593408465385, "loss/crossentropy": 2.783554768562317, "loss/logits": 0.8944692641496659, "step": 21260 }, { "epoch": 0.2127, "grad_norm": 12.375, "grad_norm_var": 0.23956705729166666, "learning_rate": 0.0003, "loss": 11.9322, "loss/aux_loss": 0.048084756731987, "loss/crossentropy": 2.8908560514450072, "loss/logits": 0.9285173654556275, "step": 21270 }, { "epoch": 0.2128, "grad_norm": 12.4375, "grad_norm_var": 0.3651041666666667, "learning_rate": 0.0003, "loss": 11.7035, "loss/aux_loss": 0.04809834379702806, "loss/crossentropy": 2.7666608333587646, "loss/logits": 0.8596565514802933, "step": 21280 }, { "epoch": 0.2129, "grad_norm": 13.125, "grad_norm_var": 0.397900390625, "learning_rate": 0.0003, "loss": 11.8816, "loss/aux_loss": 0.048090608604252336, "loss/crossentropy": 2.7386081337928774, "loss/logits": 0.8871663898229599, "step": 21290 }, { "epoch": 0.213, "grad_norm": 11.8125, "grad_norm_var": 0.5919108072916667, "learning_rate": 0.0003, "loss": 11.6848, "loss/aux_loss": 0.04808788150548935, "loss/crossentropy": 2.851125454902649, "loss/logits": 0.9087550818920136, "step": 21300 }, { "epoch": 0.2131, "grad_norm": 12.4375, "grad_norm_var": 0.3337890625, "learning_rate": 0.0003, "loss": 11.6448, "loss/aux_loss": 0.0480893436819315, "loss/crossentropy": 3.0054169058799745, "loss/logits": 0.9328852593898773, "step": 21310 }, { "epoch": 0.2132, "grad_norm": 11.75, "grad_norm_var": 0.529931640625, "learning_rate": 0.0003, "loss": 11.6956, "loss/aux_loss": 0.048091666772961617, "loss/crossentropy": 2.760949170589447, "loss/logits": 0.8893462926149368, "step": 21320 }, { "epoch": 0.2133, "grad_norm": 13.125, "grad_norm_var": 0.3516764322916667, "learning_rate": 0.0003, "loss": 11.8081, "loss/aux_loss": 0.0480879507958889, "loss/crossentropy": 2.766853415966034, "loss/logits": 0.8801511764526367, "step": 21330 }, { "epoch": 0.2134, "grad_norm": 12.125, "grad_norm_var": 0.45193684895833336, "learning_rate": 0.0003, "loss": 11.8681, "loss/aux_loss": 0.04809288065880537, "loss/crossentropy": 2.6554811358451844, "loss/logits": 0.911386126279831, "step": 21340 }, { "epoch": 0.2135, "grad_norm": 12.375, "grad_norm_var": 0.4337076822916667, "learning_rate": 0.0003, "loss": 12.0507, "loss/aux_loss": 0.04808918032795191, "loss/crossentropy": 2.887453854084015, "loss/logits": 0.9052879035472869, "step": 21350 }, { "epoch": 0.2136, "grad_norm": 12.0, "grad_norm_var": 0.5317708333333333, "learning_rate": 0.0003, "loss": 11.7237, "loss/aux_loss": 0.04809961635619402, "loss/crossentropy": 2.739542376995087, "loss/logits": 0.9075916647911072, "step": 21360 }, { "epoch": 0.2137, "grad_norm": 12.4375, "grad_norm_var": 0.486181640625, "learning_rate": 0.0003, "loss": 11.827, "loss/aux_loss": 0.048078613728284834, "loss/crossentropy": 2.7362434446811674, "loss/logits": 0.8810094386339188, "step": 21370 }, { "epoch": 0.2138, "grad_norm": 12.5625, "grad_norm_var": 1.4038899739583333, "learning_rate": 0.0003, "loss": 11.8722, "loss/aux_loss": 0.048095672950148584, "loss/crossentropy": 2.9124315857887266, "loss/logits": 0.9393705606460572, "step": 21380 }, { "epoch": 0.2139, "grad_norm": 13.5625, "grad_norm_var": 0.784228515625, "learning_rate": 0.0003, "loss": 11.753, "loss/aux_loss": 0.04809557497501373, "loss/crossentropy": 2.7900996267795564, "loss/logits": 0.9042523264884949, "step": 21390 }, { "epoch": 0.214, "grad_norm": 12.75, "grad_norm_var": 41.506184895833336, "learning_rate": 0.0003, "loss": 11.8063, "loss/aux_loss": 0.048089190199971196, "loss/crossentropy": 2.750228983163834, "loss/logits": 0.889886274933815, "step": 21400 }, { "epoch": 0.2141, "grad_norm": 11.875, "grad_norm_var": 42.12511393229167, "learning_rate": 0.0003, "loss": 11.889, "loss/aux_loss": 0.04809757433831692, "loss/crossentropy": 2.798718500137329, "loss/logits": 0.9059660851955413, "step": 21410 }, { "epoch": 0.2142, "grad_norm": 12.625, "grad_norm_var": 3.5251139322916667, "learning_rate": 0.0003, "loss": 11.7521, "loss/aux_loss": 0.048101062327623366, "loss/crossentropy": 2.8062780797481537, "loss/logits": 0.9037539154291153, "step": 21420 }, { "epoch": 0.2143, "grad_norm": 12.1875, "grad_norm_var": 0.24088541666666666, "learning_rate": 0.0003, "loss": 11.6689, "loss/aux_loss": 0.048091071844100955, "loss/crossentropy": 2.7986648082733154, "loss/logits": 0.8963402301073075, "step": 21430 }, { "epoch": 0.2144, "grad_norm": 12.0, "grad_norm_var": 0.26614583333333336, "learning_rate": 0.0003, "loss": 12.0498, "loss/aux_loss": 0.04808345343917608, "loss/crossentropy": 2.9033891916275025, "loss/logits": 0.9560538738965988, "step": 21440 }, { "epoch": 0.2145, "grad_norm": 12.6875, "grad_norm_var": 0.37862955729166664, "learning_rate": 0.0003, "loss": 11.8566, "loss/aux_loss": 0.04810182619839907, "loss/crossentropy": 2.799178421497345, "loss/logits": 0.9117594748735428, "step": 21450 }, { "epoch": 0.2146, "grad_norm": 12.4375, "grad_norm_var": 0.5555826822916666, "learning_rate": 0.0003, "loss": 11.9289, "loss/aux_loss": 0.04809532649815083, "loss/crossentropy": 2.7515031695365906, "loss/logits": 0.9373772829771042, "step": 21460 }, { "epoch": 0.2147, "grad_norm": 11.875, "grad_norm_var": 0.7036458333333333, "learning_rate": 0.0003, "loss": 11.8032, "loss/aux_loss": 0.048093835823237896, "loss/crossentropy": 2.7646782994270325, "loss/logits": 0.911266279220581, "step": 21470 }, { "epoch": 0.2148, "grad_norm": 11.75, "grad_norm_var": 0.17369791666666667, "learning_rate": 0.0003, "loss": 11.914, "loss/aux_loss": 0.04809645172208547, "loss/crossentropy": 2.7859348595142364, "loss/logits": 0.875312551856041, "step": 21480 }, { "epoch": 0.2149, "grad_norm": 12.5625, "grad_norm_var": 2.250260416666667, "learning_rate": 0.0003, "loss": 11.8084, "loss/aux_loss": 0.048092559166252616, "loss/crossentropy": 2.886737060546875, "loss/logits": 0.8639699459075928, "step": 21490 }, { "epoch": 0.215, "grad_norm": 11.75, "grad_norm_var": 2.191145833333333, "learning_rate": 0.0003, "loss": 11.7606, "loss/aux_loss": 0.04809298049658537, "loss/crossentropy": 2.9002821505069734, "loss/logits": 0.9444745779037476, "step": 21500 }, { "epoch": 0.2151, "grad_norm": 12.625, "grad_norm_var": 0.410400390625, "learning_rate": 0.0003, "loss": 11.8001, "loss/aux_loss": 0.048099953681230545, "loss/crossentropy": 2.702463275194168, "loss/logits": 0.9036620557308197, "step": 21510 }, { "epoch": 0.2152, "grad_norm": 11.875, "grad_norm_var": 0.33474934895833336, "learning_rate": 0.0003, "loss": 11.88, "loss/aux_loss": 0.048086378164589404, "loss/crossentropy": 2.7439981400966644, "loss/logits": 0.9525706797838212, "step": 21520 }, { "epoch": 0.2153, "grad_norm": 11.8125, "grad_norm_var": 0.21770833333333334, "learning_rate": 0.0003, "loss": 11.7527, "loss/aux_loss": 0.04810568634420633, "loss/crossentropy": 2.6959027111530305, "loss/logits": 0.8563485085964203, "step": 21530 }, { "epoch": 0.2154, "grad_norm": 12.3125, "grad_norm_var": 0.26925455729166664, "learning_rate": 0.0003, "loss": 11.8753, "loss/aux_loss": 0.04807868674397468, "loss/crossentropy": 2.769312971830368, "loss/logits": 0.9261300444602967, "step": 21540 }, { "epoch": 0.2155, "grad_norm": 12.5, "grad_norm_var": 23.747770182291667, "learning_rate": 0.0003, "loss": 11.9234, "loss/aux_loss": 0.04809074979275465, "loss/crossentropy": 2.858646285533905, "loss/logits": 0.9108448445796966, "step": 21550 }, { "epoch": 0.2156, "grad_norm": 12.875, "grad_norm_var": 23.503125, "learning_rate": 0.0003, "loss": 11.8421, "loss/aux_loss": 0.04809603709727526, "loss/crossentropy": 2.884668844938278, "loss/logits": 0.9331677317619324, "step": 21560 }, { "epoch": 0.2157, "grad_norm": 13.5, "grad_norm_var": 1.2812337239583333, "learning_rate": 0.0003, "loss": 11.7742, "loss/aux_loss": 0.04808317497372627, "loss/crossentropy": 2.7802948713302613, "loss/logits": 0.9503753989934921, "step": 21570 }, { "epoch": 0.2158, "grad_norm": 20.625, "grad_norm_var": 4.628059895833333, "learning_rate": 0.0003, "loss": 11.8258, "loss/aux_loss": 0.04808113072067499, "loss/crossentropy": 2.7815487384796143, "loss/logits": 0.9218122154474259, "step": 21580 }, { "epoch": 0.2159, "grad_norm": 14.9375, "grad_norm_var": 4.69609375, "learning_rate": 0.0003, "loss": 11.7204, "loss/aux_loss": 0.048096010275185105, "loss/crossentropy": 2.831838434934616, "loss/logits": 0.8932915806770325, "step": 21590 }, { "epoch": 0.216, "grad_norm": 11.8125, "grad_norm_var": 0.756103515625, "learning_rate": 0.0003, "loss": 11.7068, "loss/aux_loss": 0.048090577125549316, "loss/crossentropy": 2.818828046321869, "loss/logits": 0.9055852591991425, "step": 21600 }, { "epoch": 0.2161, "grad_norm": 12.75, "grad_norm_var": 0.34889322916666665, "learning_rate": 0.0003, "loss": 11.8699, "loss/aux_loss": 0.04809344317764044, "loss/crossentropy": 2.725664830207825, "loss/logits": 0.9231677383184433, "step": 21610 }, { "epoch": 0.2162, "grad_norm": 13.625, "grad_norm_var": 0.6227701822916667, "learning_rate": 0.0003, "loss": 11.8195, "loss/aux_loss": 0.04809017200022936, "loss/crossentropy": 2.7314105927944183, "loss/logits": 0.9009778618812561, "step": 21620 }, { "epoch": 0.2163, "grad_norm": 12.375, "grad_norm_var": 3.8499348958333335, "learning_rate": 0.0003, "loss": 11.7735, "loss/aux_loss": 0.04809233695268631, "loss/crossentropy": 2.7671496987342836, "loss/logits": 0.9043550729751587, "step": 21630 }, { "epoch": 0.2164, "grad_norm": 13.125, "grad_norm_var": 4.204166666666667, "learning_rate": 0.0003, "loss": 11.7381, "loss/aux_loss": 0.04809225425124168, "loss/crossentropy": 2.715823769569397, "loss/logits": 0.8934505701065063, "step": 21640 }, { "epoch": 0.2165, "grad_norm": 13.5625, "grad_norm_var": 1.0377604166666667, "learning_rate": 0.0003, "loss": 11.8044, "loss/aux_loss": 0.04808531980961561, "loss/crossentropy": 2.7514367580413817, "loss/logits": 0.9063582092523574, "step": 21650 }, { "epoch": 0.2166, "grad_norm": 12.5, "grad_norm_var": 0.4020182291666667, "learning_rate": 0.0003, "loss": 11.6973, "loss/aux_loss": 0.04809113219380379, "loss/crossentropy": 2.71242498755455, "loss/logits": 0.9035886704921723, "step": 21660 }, { "epoch": 0.2167, "grad_norm": 11.6875, "grad_norm_var": 0.5839680989583333, "learning_rate": 0.0003, "loss": 11.6988, "loss/aux_loss": 0.0480911111459136, "loss/crossentropy": 2.934883952140808, "loss/logits": 0.8832208603620529, "step": 21670 }, { "epoch": 0.2168, "grad_norm": 12.25, "grad_norm_var": 0.632666015625, "learning_rate": 0.0003, "loss": 11.7549, "loss/aux_loss": 0.048089130967855456, "loss/crossentropy": 2.677219772338867, "loss/logits": 0.8748012632131577, "step": 21680 }, { "epoch": 0.2169, "grad_norm": 11.75, "grad_norm_var": 0.5957682291666667, "learning_rate": 0.0003, "loss": 11.874, "loss/aux_loss": 0.0480881916359067, "loss/crossentropy": 2.8454249918460848, "loss/logits": 0.8973806709051132, "step": 21690 }, { "epoch": 0.217, "grad_norm": 11.625, "grad_norm_var": 0.42552083333333335, "learning_rate": 0.0003, "loss": 11.8774, "loss/aux_loss": 0.04809354934841394, "loss/crossentropy": 2.930919277667999, "loss/logits": 0.9322040349245071, "step": 21700 }, { "epoch": 0.2171, "grad_norm": 13.1875, "grad_norm_var": 0.38229166666666664, "learning_rate": 0.0003, "loss": 11.8088, "loss/aux_loss": 0.04809126667678356, "loss/crossentropy": 2.709583592414856, "loss/logits": 0.8984523087739944, "step": 21710 }, { "epoch": 0.2172, "grad_norm": 12.4375, "grad_norm_var": 0.3509765625, "learning_rate": 0.0003, "loss": 11.8755, "loss/aux_loss": 0.04808931071311236, "loss/crossentropy": 2.797287333011627, "loss/logits": 0.8914159804582595, "step": 21720 }, { "epoch": 0.2173, "grad_norm": 12.0, "grad_norm_var": 0.3853515625, "learning_rate": 0.0003, "loss": 11.7927, "loss/aux_loss": 0.0480969849973917, "loss/crossentropy": 2.6972643613815306, "loss/logits": 0.8642873585224151, "step": 21730 }, { "epoch": 0.2174, "grad_norm": 12.375, "grad_norm_var": 0.453369140625, "learning_rate": 0.0003, "loss": 11.7777, "loss/aux_loss": 0.04809529315680265, "loss/crossentropy": 2.7677155137062073, "loss/logits": 0.8832725346088409, "step": 21740 }, { "epoch": 0.2175, "grad_norm": 11.5, "grad_norm_var": 0.34739583333333335, "learning_rate": 0.0003, "loss": 11.9139, "loss/aux_loss": 0.048091381415724756, "loss/crossentropy": 2.799149090051651, "loss/logits": 0.887069022655487, "step": 21750 }, { "epoch": 0.2176, "grad_norm": 12.4375, "grad_norm_var": 0.17076822916666667, "learning_rate": 0.0003, "loss": 11.8353, "loss/aux_loss": 0.04808845948427916, "loss/crossentropy": 3.0753382325172423, "loss/logits": 0.945702788233757, "step": 21760 }, { "epoch": 0.2177, "grad_norm": 12.1875, "grad_norm_var": 0.2087890625, "learning_rate": 0.0003, "loss": 11.6985, "loss/aux_loss": 0.048092255368828773, "loss/crossentropy": 2.6490719497203825, "loss/logits": 0.8540039539337159, "step": 21770 }, { "epoch": 0.2178, "grad_norm": 12.625, "grad_norm_var": 0.19583333333333333, "learning_rate": 0.0003, "loss": 11.8542, "loss/aux_loss": 0.048097673989832404, "loss/crossentropy": 2.8406033515930176, "loss/logits": 0.8934641659259797, "step": 21780 }, { "epoch": 0.2179, "grad_norm": 14.25, "grad_norm_var": 0.6061848958333333, "learning_rate": 0.0003, "loss": 11.8071, "loss/aux_loss": 0.048086115159094334, "loss/crossentropy": 2.8944154620170592, "loss/logits": 0.9081297039985656, "step": 21790 }, { "epoch": 0.218, "grad_norm": 13.0, "grad_norm_var": 14.662955729166667, "learning_rate": 0.0003, "loss": 11.9785, "loss/aux_loss": 0.04810140430927277, "loss/crossentropy": 2.7707842707633974, "loss/logits": 0.8933016210794449, "step": 21800 }, { "epoch": 0.2181, "grad_norm": 12.25, "grad_norm_var": 14.276676432291667, "learning_rate": 0.0003, "loss": 12.1024, "loss/aux_loss": 0.04808119479566812, "loss/crossentropy": 2.8384734869003294, "loss/logits": 0.9208894163370133, "step": 21810 }, { "epoch": 0.2182, "grad_norm": 12.625, "grad_norm_var": 0.46243489583333336, "learning_rate": 0.0003, "loss": 11.9, "loss/aux_loss": 0.048093420639634135, "loss/crossentropy": 2.710639762878418, "loss/logits": 0.894580963253975, "step": 21820 }, { "epoch": 0.2183, "grad_norm": 12.5, "grad_norm_var": 1.1384765625, "learning_rate": 0.0003, "loss": 11.6904, "loss/aux_loss": 0.048085536994040015, "loss/crossentropy": 2.621820467710495, "loss/logits": 0.8434902101755142, "step": 21830 }, { "epoch": 0.2184, "grad_norm": 14.875, "grad_norm_var": 24.766910807291666, "learning_rate": 0.0003, "loss": 11.8784, "loss/aux_loss": 0.04809453897178173, "loss/crossentropy": 2.8327670872211455, "loss/logits": 0.9073660403490067, "step": 21840 }, { "epoch": 0.2185, "grad_norm": 13.875, "grad_norm_var": 24.213134765625, "learning_rate": 0.0003, "loss": 11.6735, "loss/aux_loss": 0.04809699393808842, "loss/crossentropy": 2.730017304420471, "loss/logits": 0.9322267979383468, "step": 21850 }, { "epoch": 0.2186, "grad_norm": 12.4375, "grad_norm_var": 1.146875, "learning_rate": 0.0003, "loss": 11.6376, "loss/aux_loss": 0.04807912241667509, "loss/crossentropy": 2.786021035909653, "loss/logits": 0.8863259345293045, "step": 21860 }, { "epoch": 0.2187, "grad_norm": 11.6875, "grad_norm_var": 0.44114583333333335, "learning_rate": 0.0003, "loss": 11.588, "loss/aux_loss": 0.04809343423694372, "loss/crossentropy": 2.709645652770996, "loss/logits": 0.8401134133338928, "step": 21870 }, { "epoch": 0.2188, "grad_norm": 12.5, "grad_norm_var": 0.619775390625, "learning_rate": 0.0003, "loss": 11.6378, "loss/aux_loss": 0.048089167289435866, "loss/crossentropy": 2.9162204384803774, "loss/logits": 0.8603705197572709, "step": 21880 }, { "epoch": 0.2189, "grad_norm": 12.125, "grad_norm_var": 0.441259765625, "learning_rate": 0.0003, "loss": 11.8368, "loss/aux_loss": 0.048093576729297635, "loss/crossentropy": 2.6666926383972167, "loss/logits": 0.909285506606102, "step": 21890 }, { "epoch": 0.219, "grad_norm": 12.5625, "grad_norm_var": 0.28880208333333335, "learning_rate": 0.0003, "loss": 11.8581, "loss/aux_loss": 0.04808791261166334, "loss/crossentropy": 2.8666730880737306, "loss/logits": 0.9074492365121841, "step": 21900 }, { "epoch": 0.2191, "grad_norm": 13.375, "grad_norm_var": 0.5520670572916667, "learning_rate": 0.0003, "loss": 11.7345, "loss/aux_loss": 0.04809127487242222, "loss/crossentropy": 2.749277150630951, "loss/logits": 0.9035682111978531, "step": 21910 }, { "epoch": 0.2192, "grad_norm": 12.75, "grad_norm_var": 0.36399739583333335, "learning_rate": 0.0003, "loss": 11.7659, "loss/aux_loss": 0.048089764453470706, "loss/crossentropy": 2.681344139575958, "loss/logits": 0.900179210305214, "step": 21920 }, { "epoch": 0.2193, "grad_norm": 11.8125, "grad_norm_var": 1.0594889322916667, "learning_rate": 0.0003, "loss": 11.7378, "loss/aux_loss": 0.0481033293530345, "loss/crossentropy": 2.6201368153095244, "loss/logits": 0.8585344612598419, "step": 21930 }, { "epoch": 0.2194, "grad_norm": 13.375, "grad_norm_var": 1.4016764322916666, "learning_rate": 0.0003, "loss": 11.9075, "loss/aux_loss": 0.04807730689644814, "loss/crossentropy": 2.8310318291187286, "loss/logits": 0.9102666884660721, "step": 21940 }, { "epoch": 0.2195, "grad_norm": 12.75, "grad_norm_var": 0.6691243489583333, "learning_rate": 0.0003, "loss": 11.6649, "loss/aux_loss": 0.048088141903281215, "loss/crossentropy": 2.9216031610965727, "loss/logits": 0.9007417112588882, "step": 21950 }, { "epoch": 0.2196, "grad_norm": 12.0, "grad_norm_var": 0.5438639322916666, "learning_rate": 0.0003, "loss": 11.7617, "loss/aux_loss": 0.04808661881834268, "loss/crossentropy": 2.839560979604721, "loss/logits": 0.8860168516635895, "step": 21960 }, { "epoch": 0.2197, "grad_norm": 11.8125, "grad_norm_var": 3.3452473958333333, "learning_rate": 0.0003, "loss": 11.7196, "loss/aux_loss": 0.048084485530853274, "loss/crossentropy": 2.7600815176963804, "loss/logits": 0.880821418762207, "step": 21970 }, { "epoch": 0.2198, "grad_norm": 11.5, "grad_norm_var": 0.5484375, "learning_rate": 0.0003, "loss": 11.8505, "loss/aux_loss": 0.04809036664664745, "loss/crossentropy": 2.9518965005874636, "loss/logits": 0.8992276877164841, "step": 21980 }, { "epoch": 0.2199, "grad_norm": 14.5625, "grad_norm_var": 0.8442057291666667, "learning_rate": 0.0003, "loss": 11.7163, "loss/aux_loss": 0.04808600507676601, "loss/crossentropy": 2.7657691895961762, "loss/logits": 0.9018583208322525, "step": 21990 }, { "epoch": 0.22, "grad_norm": 12.1875, "grad_norm_var": 0.9114583333333334, "learning_rate": 0.0003, "loss": 11.6793, "loss/aux_loss": 0.048087524622678755, "loss/crossentropy": 2.6612784922122956, "loss/logits": 0.8721506536006928, "step": 22000 }, { "epoch": 0.2201, "grad_norm": 12.375, "grad_norm_var": 0.41848958333333336, "learning_rate": 0.0003, "loss": 11.9123, "loss/aux_loss": 0.04809351172298193, "loss/crossentropy": 2.81771005988121, "loss/logits": 0.9572886168956757, "step": 22010 }, { "epoch": 0.2202, "grad_norm": 12.6875, "grad_norm_var": 6.704622395833334, "learning_rate": 0.0003, "loss": 11.8047, "loss/aux_loss": 0.04809220097959042, "loss/crossentropy": 2.817577600479126, "loss/logits": 0.8661315441131592, "step": 22020 }, { "epoch": 0.2203, "grad_norm": 13.125, "grad_norm_var": 5.740364583333333, "learning_rate": 0.0003, "loss": 11.9214, "loss/aux_loss": 0.04808165710419417, "loss/crossentropy": 2.768628853559494, "loss/logits": 0.8950851440429688, "step": 22030 }, { "epoch": 0.2204, "grad_norm": 12.3125, "grad_norm_var": 22.538785807291667, "learning_rate": 0.0003, "loss": 11.9163, "loss/aux_loss": 0.04808369372040033, "loss/crossentropy": 2.7819134533405303, "loss/logits": 0.8844695091247559, "step": 22040 }, { "epoch": 0.2205, "grad_norm": 13.4375, "grad_norm_var": 0.431884765625, "learning_rate": 0.0003, "loss": 11.7862, "loss/aux_loss": 0.048092160001397134, "loss/crossentropy": 2.9033903241157533, "loss/logits": 0.9348111391067505, "step": 22050 }, { "epoch": 0.2206, "grad_norm": 11.875, "grad_norm_var": 0.6830729166666667, "learning_rate": 0.0003, "loss": 11.7504, "loss/aux_loss": 0.04808596298098564, "loss/crossentropy": 2.782503831386566, "loss/logits": 0.8899946212768555, "step": 22060 }, { "epoch": 0.2207, "grad_norm": 12.5625, "grad_norm_var": 0.4663899739583333, "learning_rate": 0.0003, "loss": 11.7566, "loss/aux_loss": 0.048095478489995, "loss/crossentropy": 2.7743342220783234, "loss/logits": 0.8939844936132431, "step": 22070 }, { "epoch": 0.2208, "grad_norm": 12.375, "grad_norm_var": 0.262744140625, "learning_rate": 0.0003, "loss": 11.8283, "loss/aux_loss": 0.04808841645717621, "loss/crossentropy": 2.554595720767975, "loss/logits": 0.8608134061098098, "step": 22080 }, { "epoch": 0.2209, "grad_norm": 12.375, "grad_norm_var": 0.2847493489583333, "learning_rate": 0.0003, "loss": 11.8984, "loss/aux_loss": 0.048084456473588943, "loss/crossentropy": 2.8617907404899596, "loss/logits": 0.9006909459829331, "step": 22090 }, { "epoch": 0.221, "grad_norm": 11.9375, "grad_norm_var": 0.272509765625, "learning_rate": 0.0003, "loss": 11.6435, "loss/aux_loss": 0.04809146206825972, "loss/crossentropy": 2.5811066746711733, "loss/logits": 0.8601150065660477, "step": 22100 }, { "epoch": 0.2211, "grad_norm": 12.4375, "grad_norm_var": 0.3798014322916667, "learning_rate": 0.0003, "loss": 11.8274, "loss/aux_loss": 0.04808408990502357, "loss/crossentropy": 2.7892317831516267, "loss/logits": 0.905488446354866, "step": 22110 }, { "epoch": 0.2212, "grad_norm": 12.5, "grad_norm_var": 0.2619140625, "learning_rate": 0.0003, "loss": 11.6912, "loss/aux_loss": 0.048095279932022096, "loss/crossentropy": 2.7180880904197693, "loss/logits": 0.8672394514083862, "step": 22120 }, { "epoch": 0.2213, "grad_norm": 12.375, "grad_norm_var": 0.29607747395833334, "learning_rate": 0.0003, "loss": 11.8284, "loss/aux_loss": 0.048083136044442656, "loss/crossentropy": 2.751019012928009, "loss/logits": 0.8794708341360092, "step": 22130 }, { "epoch": 0.2214, "grad_norm": 12.4375, "grad_norm_var": 0.1525390625, "learning_rate": 0.0003, "loss": 11.8437, "loss/aux_loss": 0.04808746688067913, "loss/crossentropy": 2.961369812488556, "loss/logits": 0.8975703865289688, "step": 22140 }, { "epoch": 0.2215, "grad_norm": 12.5, "grad_norm_var": 0.28045247395833334, "learning_rate": 0.0003, "loss": 11.7406, "loss/aux_loss": 0.04809634368866682, "loss/crossentropy": 2.7276877880096437, "loss/logits": 0.8959174305200577, "step": 22150 }, { "epoch": 0.2216, "grad_norm": 12.8125, "grad_norm_var": 0.389697265625, "learning_rate": 0.0003, "loss": 11.7637, "loss/aux_loss": 0.04808992594480514, "loss/crossentropy": 2.621533715724945, "loss/logits": 0.8763752758502961, "step": 22160 }, { "epoch": 0.2217, "grad_norm": 13.5, "grad_norm_var": 0.34347330729166664, "learning_rate": 0.0003, "loss": 11.6706, "loss/aux_loss": 0.04809186160564423, "loss/crossentropy": 2.7217795610427857, "loss/logits": 0.8898161560297012, "step": 22170 }, { "epoch": 0.2218, "grad_norm": 12.3125, "grad_norm_var": 0.601025390625, "learning_rate": 0.0003, "loss": 11.6356, "loss/aux_loss": 0.04809278640896082, "loss/crossentropy": 2.7290789067745207, "loss/logits": 0.8597593367099762, "step": 22180 }, { "epoch": 0.2219, "grad_norm": 12.375, "grad_norm_var": 0.43359375, "learning_rate": 0.0003, "loss": 11.7534, "loss/aux_loss": 0.04809238947927952, "loss/crossentropy": 2.707075160741806, "loss/logits": 0.8652425140142441, "step": 22190 }, { "epoch": 0.222, "grad_norm": 11.9375, "grad_norm_var": 0.35885416666666664, "learning_rate": 0.0003, "loss": 11.6756, "loss/aux_loss": 0.04808915685862303, "loss/crossentropy": 2.8292889297008514, "loss/logits": 0.8936943262815475, "step": 22200 }, { "epoch": 0.2221, "grad_norm": 12.3125, "grad_norm_var": 0.37180989583333335, "learning_rate": 0.0003, "loss": 11.825, "loss/aux_loss": 0.048080139234662055, "loss/crossentropy": 2.70860413312912, "loss/logits": 0.8934529781341553, "step": 22210 }, { "epoch": 0.2222, "grad_norm": 12.1875, "grad_norm_var": 0.28566080729166665, "learning_rate": 0.0003, "loss": 11.7766, "loss/aux_loss": 0.04809638597071171, "loss/crossentropy": 2.704670661687851, "loss/logits": 0.8709723800420761, "step": 22220 }, { "epoch": 0.2223, "grad_norm": 11.0625, "grad_norm_var": 0.307666015625, "learning_rate": 0.0003, "loss": 11.6679, "loss/aux_loss": 0.04809023775160313, "loss/crossentropy": 2.6749909996986387, "loss/logits": 0.9050649791955948, "step": 22230 }, { "epoch": 0.2224, "grad_norm": 12.625, "grad_norm_var": 0.3921875, "learning_rate": 0.0003, "loss": 11.7514, "loss/aux_loss": 0.04809599500149488, "loss/crossentropy": 2.760683298110962, "loss/logits": 0.8826134830713273, "step": 22240 }, { "epoch": 0.2225, "grad_norm": 13.3125, "grad_norm_var": 1.2234375, "learning_rate": 0.0003, "loss": 11.9613, "loss/aux_loss": 0.048089191876351835, "loss/crossentropy": 2.767944025993347, "loss/logits": 0.8672012895345688, "step": 22250 }, { "epoch": 0.2226, "grad_norm": 13.4375, "grad_norm_var": 0.4613118489583333, "learning_rate": 0.0003, "loss": 11.7787, "loss/aux_loss": 0.0480911660939455, "loss/crossentropy": 2.735838997364044, "loss/logits": 0.8864156484603882, "step": 22260 }, { "epoch": 0.2227, "grad_norm": 11.9375, "grad_norm_var": 0.26223958333333336, "learning_rate": 0.0003, "loss": 11.7865, "loss/aux_loss": 0.04808713924139738, "loss/crossentropy": 2.83599910736084, "loss/logits": 0.9017521053552627, "step": 22270 }, { "epoch": 0.2228, "grad_norm": 10.9375, "grad_norm_var": 0.37473958333333335, "learning_rate": 0.0003, "loss": 11.7516, "loss/aux_loss": 0.0480857228860259, "loss/crossentropy": 2.830791783332825, "loss/logits": 0.9194071799516678, "step": 22280 }, { "epoch": 0.2229, "grad_norm": 13.5, "grad_norm_var": 0.6660807291666667, "learning_rate": 0.0003, "loss": 11.8099, "loss/aux_loss": 0.04808124005794525, "loss/crossentropy": 2.8348045706748963, "loss/logits": 0.903641340136528, "step": 22290 }, { "epoch": 0.223, "grad_norm": 13.1875, "grad_norm_var": 1.6406087239583333, "learning_rate": 0.0003, "loss": 11.8544, "loss/aux_loss": 0.04809259995818138, "loss/crossentropy": 2.897055411338806, "loss/logits": 0.9287648230791092, "step": 22300 }, { "epoch": 0.2231, "grad_norm": 12.9375, "grad_norm_var": 0.3203125, "learning_rate": 0.0003, "loss": 11.7389, "loss/aux_loss": 0.048087797872722146, "loss/crossentropy": 2.8019288659095762, "loss/logits": 0.8725453674793243, "step": 22310 }, { "epoch": 0.2232, "grad_norm": 12.1875, "grad_norm_var": 0.9932291666666667, "learning_rate": 0.0003, "loss": 11.8846, "loss/aux_loss": 0.048088356666266915, "loss/crossentropy": 2.6834902286529543, "loss/logits": 0.8782364130020142, "step": 22320 }, { "epoch": 0.2233, "grad_norm": 12.4375, "grad_norm_var": 1.0511555989583334, "learning_rate": 0.0003, "loss": 11.7531, "loss/aux_loss": 0.048094474151730536, "loss/crossentropy": 2.6937114894390106, "loss/logits": 0.8777317255735397, "step": 22330 }, { "epoch": 0.2234, "grad_norm": 13.0, "grad_norm_var": 0.5492024739583333, "learning_rate": 0.0003, "loss": 11.6566, "loss/aux_loss": 0.048084880039095876, "loss/crossentropy": 2.72471564412117, "loss/logits": 0.8733905553817749, "step": 22340 }, { "epoch": 0.2235, "grad_norm": 11.5625, "grad_norm_var": 0.867431640625, "learning_rate": 0.0003, "loss": 11.6595, "loss/aux_loss": 0.048089952766895296, "loss/crossentropy": 2.840152883529663, "loss/logits": 0.9174001008272171, "step": 22350 }, { "epoch": 0.2236, "grad_norm": 12.1875, "grad_norm_var": 0.560400390625, "learning_rate": 0.0003, "loss": 11.7196, "loss/aux_loss": 0.0480857165530324, "loss/crossentropy": 2.8605542302131655, "loss/logits": 0.9183370441198349, "step": 22360 }, { "epoch": 0.2237, "grad_norm": 11.5625, "grad_norm_var": 0.40740559895833334, "learning_rate": 0.0003, "loss": 11.7638, "loss/aux_loss": 0.048090421594679356, "loss/crossentropy": 2.639111566543579, "loss/logits": 0.8860017955303192, "step": 22370 }, { "epoch": 0.2238, "grad_norm": 11.6875, "grad_norm_var": 0.20128580729166667, "learning_rate": 0.0003, "loss": 11.9126, "loss/aux_loss": 0.04809177350252867, "loss/crossentropy": 2.785674238204956, "loss/logits": 0.8810646086931229, "step": 22380 }, { "epoch": 0.2239, "grad_norm": 12.5, "grad_norm_var": 0.218603515625, "learning_rate": 0.0003, "loss": 11.6681, "loss/aux_loss": 0.048096783272922036, "loss/crossentropy": 2.7202962040901184, "loss/logits": 0.8531753093004226, "step": 22390 }, { "epoch": 0.224, "grad_norm": 13.4375, "grad_norm_var": 2.6541015625, "learning_rate": 0.0003, "loss": 11.9144, "loss/aux_loss": 0.0480857115238905, "loss/crossentropy": 2.794497346878052, "loss/logits": 0.8948001682758331, "step": 22400 }, { "epoch": 0.2241, "grad_norm": 16.75, "grad_norm_var": 3.6030598958333333, "learning_rate": 0.0003, "loss": 11.7171, "loss/aux_loss": 0.04808383211493492, "loss/crossentropy": 2.809972804784775, "loss/logits": 0.9038825124502182, "step": 22410 }, { "epoch": 0.2242, "grad_norm": 13.25, "grad_norm_var": 1.80625, "learning_rate": 0.0003, "loss": 11.4694, "loss/aux_loss": 0.04809340089559555, "loss/crossentropy": 2.78351212143898, "loss/logits": 0.8861901849508286, "step": 22420 }, { "epoch": 0.2243, "grad_norm": 12.125, "grad_norm_var": 0.8671223958333333, "learning_rate": 0.0003, "loss": 11.6393, "loss/aux_loss": 0.04809296205639839, "loss/crossentropy": 2.749815273284912, "loss/logits": 0.9010616183280945, "step": 22430 }, { "epoch": 0.2244, "grad_norm": 12.0, "grad_norm_var": 9.208837890625, "learning_rate": 0.0003, "loss": 11.7555, "loss/aux_loss": 0.04808775205165148, "loss/crossentropy": 2.968637430667877, "loss/logits": 0.9231620490550995, "step": 22440 }, { "epoch": 0.2245, "grad_norm": 12.1875, "grad_norm_var": 0.4400390625, "learning_rate": 0.0003, "loss": 11.6987, "loss/aux_loss": 0.0480935113504529, "loss/crossentropy": 2.851404082775116, "loss/logits": 0.897919625043869, "step": 22450 }, { "epoch": 0.2246, "grad_norm": 11.875, "grad_norm_var": 0.2806640625, "learning_rate": 0.0003, "loss": 11.7398, "loss/aux_loss": 0.04809074774384499, "loss/crossentropy": 2.65076659321785, "loss/logits": 0.8881769001483917, "step": 22460 }, { "epoch": 0.2247, "grad_norm": 12.0625, "grad_norm_var": 1.1931640625, "learning_rate": 0.0003, "loss": 11.6777, "loss/aux_loss": 0.04809971358627081, "loss/crossentropy": 2.645810514688492, "loss/logits": 0.8362853050231933, "step": 22470 }, { "epoch": 0.2248, "grad_norm": 14.8125, "grad_norm_var": 103.21354166666667, "learning_rate": 0.0003, "loss": 11.8468, "loss/aux_loss": 0.048092149384319785, "loss/crossentropy": 2.8907059490680695, "loss/logits": 0.8786774843931198, "step": 22480 }, { "epoch": 0.2249, "grad_norm": 13.0625, "grad_norm_var": 100.78019205729167, "learning_rate": 0.0003, "loss": 11.7205, "loss/aux_loss": 0.04809787534177303, "loss/crossentropy": 2.716031605005264, "loss/logits": 0.8944111734628677, "step": 22490 }, { "epoch": 0.225, "grad_norm": 13.75, "grad_norm_var": 1.50625, "learning_rate": 0.0003, "loss": 11.6669, "loss/aux_loss": 0.048082894459366796, "loss/crossentropy": 2.7395161747932435, "loss/logits": 0.8643982857465744, "step": 22500 }, { "epoch": 0.2251, "grad_norm": 12.375, "grad_norm_var": 1.5290201822916667, "learning_rate": 0.0003, "loss": 11.5853, "loss/aux_loss": 0.04808229207992554, "loss/crossentropy": 2.7536255359649657, "loss/logits": 0.881432518362999, "step": 22510 }, { "epoch": 0.2252, "grad_norm": 11.625, "grad_norm_var": 0.448681640625, "learning_rate": 0.0003, "loss": 11.7597, "loss/aux_loss": 0.04810373391956091, "loss/crossentropy": 2.747165524959564, "loss/logits": 0.8833198219537735, "step": 22520 }, { "epoch": 0.2253, "grad_norm": 11.8125, "grad_norm_var": 0.46432291666666664, "learning_rate": 0.0003, "loss": 11.8328, "loss/aux_loss": 0.04809314012527466, "loss/crossentropy": 2.67395840883255, "loss/logits": 0.9117985635995864, "step": 22530 }, { "epoch": 0.2254, "grad_norm": 12.6875, "grad_norm_var": 0.15987955729166667, "learning_rate": 0.0003, "loss": 11.6949, "loss/aux_loss": 0.04808804150670767, "loss/crossentropy": 2.7599778056144713, "loss/logits": 0.8923967123031616, "step": 22540 }, { "epoch": 0.2255, "grad_norm": 12.4375, "grad_norm_var": 0.5255045572916667, "learning_rate": 0.0003, "loss": 11.6803, "loss/aux_loss": 0.0480937123298645, "loss/crossentropy": 2.731845957040787, "loss/logits": 0.9252343803644181, "step": 22550 }, { "epoch": 0.2256, "grad_norm": 13.125, "grad_norm_var": 0.7700520833333333, "learning_rate": 0.0003, "loss": 11.8578, "loss/aux_loss": 0.04808880146592855, "loss/crossentropy": 2.9239902973175047, "loss/logits": 0.8844455033540726, "step": 22560 }, { "epoch": 0.2257, "grad_norm": 11.625, "grad_norm_var": 1.0207682291666667, "learning_rate": 0.0003, "loss": 11.7067, "loss/aux_loss": 0.04809002634137869, "loss/crossentropy": 2.89136780500412, "loss/logits": 0.9194484144449234, "step": 22570 }, { "epoch": 0.2258, "grad_norm": 12.0625, "grad_norm_var": 1.0624348958333334, "learning_rate": 0.0003, "loss": 11.5666, "loss/aux_loss": 0.048089371994137764, "loss/crossentropy": 2.775197160243988, "loss/logits": 0.8997502565383911, "step": 22580 }, { "epoch": 0.2259, "grad_norm": 11.9375, "grad_norm_var": 2.983056640625, "learning_rate": 0.0003, "loss": 11.7513, "loss/aux_loss": 0.04809422306716442, "loss/crossentropy": 2.7966256499290467, "loss/logits": 0.8797528147697449, "step": 22590 }, { "epoch": 0.226, "grad_norm": 11.6875, "grad_norm_var": 1.4276041666666666, "learning_rate": 0.0003, "loss": 11.6478, "loss/aux_loss": 0.048089210875332355, "loss/crossentropy": 2.7377444982528685, "loss/logits": 0.9153787553310394, "step": 22600 }, { "epoch": 0.2261, "grad_norm": 14.75, "grad_norm_var": 0.5952962239583334, "learning_rate": 0.0003, "loss": 11.7362, "loss/aux_loss": 0.0480865390971303, "loss/crossentropy": 2.7379279255867006, "loss/logits": 0.8918594628572464, "step": 22610 }, { "epoch": 0.2262, "grad_norm": 14.25, "grad_norm_var": 2.0085774739583333, "learning_rate": 0.0003, "loss": 11.7772, "loss/aux_loss": 0.048102805577218535, "loss/crossentropy": 2.7676973700523377, "loss/logits": 0.9144205540418625, "step": 22620 }, { "epoch": 0.2263, "grad_norm": 11.625, "grad_norm_var": 0.8378743489583333, "learning_rate": 0.0003, "loss": 11.7351, "loss/aux_loss": 0.04808990322053432, "loss/crossentropy": 2.763928699493408, "loss/logits": 0.9580153465270996, "step": 22630 }, { "epoch": 0.2264, "grad_norm": 11.875, "grad_norm_var": 0.2749837239583333, "learning_rate": 0.0003, "loss": 11.7633, "loss/aux_loss": 0.04808971676975489, "loss/crossentropy": 2.798508107662201, "loss/logits": 0.9377011686563492, "step": 22640 }, { "epoch": 0.2265, "grad_norm": 11.8125, "grad_norm_var": 0.5379557291666667, "learning_rate": 0.0003, "loss": 11.6478, "loss/aux_loss": 0.04807438086718321, "loss/crossentropy": 2.628761428594589, "loss/logits": 0.901868748664856, "step": 22650 }, { "epoch": 0.2266, "grad_norm": 11.625, "grad_norm_var": 0.8390462239583333, "learning_rate": 0.0003, "loss": 11.7954, "loss/aux_loss": 0.04809013176709413, "loss/crossentropy": 2.828604358434677, "loss/logits": 0.9256632804870606, "step": 22660 }, { "epoch": 0.2267, "grad_norm": 12.625, "grad_norm_var": 0.5305826822916667, "learning_rate": 0.0003, "loss": 11.6155, "loss/aux_loss": 0.048091720603406427, "loss/crossentropy": 2.6923003435134887, "loss/logits": 0.9034017592668533, "step": 22670 }, { "epoch": 0.2268, "grad_norm": 11.8125, "grad_norm_var": 0.2908854166666667, "learning_rate": 0.0003, "loss": 11.7298, "loss/aux_loss": 0.04808386079967022, "loss/crossentropy": 2.871219742298126, "loss/logits": 0.9219763696193695, "step": 22680 }, { "epoch": 0.2269, "grad_norm": 12.375, "grad_norm_var": 0.4014973958333333, "learning_rate": 0.0003, "loss": 11.6765, "loss/aux_loss": 0.048084440641105175, "loss/crossentropy": 2.6931581676006315, "loss/logits": 0.8805693238973618, "step": 22690 }, { "epoch": 0.227, "grad_norm": 13.6875, "grad_norm_var": 1.3969889322916667, "learning_rate": 0.0003, "loss": 11.7598, "loss/aux_loss": 0.04809512868523598, "loss/crossentropy": 2.6413376092910767, "loss/logits": 0.8963235735893249, "step": 22700 }, { "epoch": 0.2271, "grad_norm": 12.6875, "grad_norm_var": 0.39296875, "learning_rate": 0.0003, "loss": 11.8279, "loss/aux_loss": 0.04809400998055935, "loss/crossentropy": 2.791292655467987, "loss/logits": 0.8894282549619674, "step": 22710 }, { "epoch": 0.2272, "grad_norm": 12.4375, "grad_norm_var": 0.3333333333333333, "learning_rate": 0.0003, "loss": 11.8071, "loss/aux_loss": 0.048091070353984834, "loss/crossentropy": 2.6207732558250427, "loss/logits": 0.8468140810728073, "step": 22720 }, { "epoch": 0.2273, "grad_norm": 11.3125, "grad_norm_var": 0.35128580729166664, "learning_rate": 0.0003, "loss": 11.6102, "loss/aux_loss": 0.04808534793555737, "loss/crossentropy": 2.6754838645458223, "loss/logits": 0.9055883139371872, "step": 22730 }, { "epoch": 0.2274, "grad_norm": 11.6875, "grad_norm_var": 0.44217122395833336, "learning_rate": 0.0003, "loss": 11.5479, "loss/aux_loss": 0.04807910211384296, "loss/crossentropy": 2.8112324655056, "loss/logits": 0.8994197815656662, "step": 22740 }, { "epoch": 0.2275, "grad_norm": 13.75, "grad_norm_var": 192.89264322916668, "learning_rate": 0.0003, "loss": 11.6927, "loss/aux_loss": 0.04809170123189688, "loss/crossentropy": 2.8299485445022583, "loss/logits": 0.8875698268413543, "step": 22750 }, { "epoch": 0.2276, "grad_norm": 13.125, "grad_norm_var": 1.995947265625, "learning_rate": 0.0003, "loss": 11.9368, "loss/aux_loss": 0.04808708317577839, "loss/crossentropy": 2.8931700348854066, "loss/logits": 0.9314892888069153, "step": 22760 }, { "epoch": 0.2277, "grad_norm": 13.5, "grad_norm_var": 0.3824055989583333, "learning_rate": 0.0003, "loss": 11.7043, "loss/aux_loss": 0.048081159219145776, "loss/crossentropy": 2.8375320076942443, "loss/logits": 0.8928394854068756, "step": 22770 }, { "epoch": 0.2278, "grad_norm": 13.5, "grad_norm_var": 0.5559895833333334, "learning_rate": 0.0003, "loss": 11.6841, "loss/aux_loss": 0.048088202998042104, "loss/crossentropy": 2.8470484018325806, "loss/logits": 0.8978864282369614, "step": 22780 }, { "epoch": 0.2279, "grad_norm": 12.5625, "grad_norm_var": 0.7514973958333333, "learning_rate": 0.0003, "loss": 11.7581, "loss/aux_loss": 0.048088363744318484, "loss/crossentropy": 2.70345995426178, "loss/logits": 0.8970031559467315, "step": 22790 }, { "epoch": 0.228, "grad_norm": 13.0625, "grad_norm_var": 0.5981770833333333, "learning_rate": 0.0003, "loss": 11.7236, "loss/aux_loss": 0.048087395168840884, "loss/crossentropy": 2.7649930655956267, "loss/logits": 0.877352437376976, "step": 22800 }, { "epoch": 0.2281, "grad_norm": 12.5, "grad_norm_var": 0.2879557291666667, "learning_rate": 0.0003, "loss": 11.7192, "loss/aux_loss": 0.048083343915641306, "loss/crossentropy": 2.756735974550247, "loss/logits": 0.8518844783306122, "step": 22810 }, { "epoch": 0.2282, "grad_norm": 12.4375, "grad_norm_var": 0.23995768229166667, "learning_rate": 0.0003, "loss": 11.7118, "loss/aux_loss": 0.04808799996972084, "loss/crossentropy": 2.756363260746002, "loss/logits": 0.907541635632515, "step": 22820 }, { "epoch": 0.2283, "grad_norm": 13.25, "grad_norm_var": 33.850244140625, "learning_rate": 0.0003, "loss": 11.7879, "loss/aux_loss": 0.04809822123497724, "loss/crossentropy": 2.733557677268982, "loss/logits": 0.896966302394867, "step": 22830 }, { "epoch": 0.2284, "grad_norm": 12.25, "grad_norm_var": 32.6337890625, "learning_rate": 0.0003, "loss": 11.7543, "loss/aux_loss": 0.0481018140912056, "loss/crossentropy": 2.759879392385483, "loss/logits": 0.8836588621139526, "step": 22840 }, { "epoch": 0.2285, "grad_norm": 12.4375, "grad_norm_var": 0.12433268229166666, "learning_rate": 0.0003, "loss": 11.7755, "loss/aux_loss": 0.04808047190308571, "loss/crossentropy": 2.8531015515327454, "loss/logits": 0.9062738597393036, "step": 22850 }, { "epoch": 0.2286, "grad_norm": 12.9375, "grad_norm_var": 0.10519205729166667, "learning_rate": 0.0003, "loss": 11.682, "loss/aux_loss": 0.04810410905629396, "loss/crossentropy": 2.9115795135498046, "loss/logits": 0.9097014546394349, "step": 22860 }, { "epoch": 0.2287, "grad_norm": 13.1875, "grad_norm_var": 2.2611979166666667, "learning_rate": 0.0003, "loss": 11.6901, "loss/aux_loss": 0.048095389269292355, "loss/crossentropy": 2.8267542123794556, "loss/logits": 0.9044133692979812, "step": 22870 }, { "epoch": 0.2288, "grad_norm": 12.8125, "grad_norm_var": 0.6259765625, "learning_rate": 0.0003, "loss": 11.541, "loss/aux_loss": 0.04808672312647104, "loss/crossentropy": 2.6773048043251038, "loss/logits": 0.876294469833374, "step": 22880 }, { "epoch": 0.2289, "grad_norm": 12.5, "grad_norm_var": 0.6655598958333333, "learning_rate": 0.0003, "loss": 11.7934, "loss/aux_loss": 0.04808298200368881, "loss/crossentropy": 2.837724781036377, "loss/logits": 0.9370515316724777, "step": 22890 }, { "epoch": 0.229, "grad_norm": 11.875, "grad_norm_var": 0.620556640625, "learning_rate": 0.0003, "loss": 11.8536, "loss/aux_loss": 0.04808994997292757, "loss/crossentropy": 2.795435976982117, "loss/logits": 0.8800392180681229, "step": 22900 }, { "epoch": 0.2291, "grad_norm": 13.0625, "grad_norm_var": 0.425634765625, "learning_rate": 0.0003, "loss": 11.5746, "loss/aux_loss": 0.04808636344969273, "loss/crossentropy": 2.9793556809425352, "loss/logits": 0.86444131731987, "step": 22910 }, { "epoch": 0.2292, "grad_norm": 11.9375, "grad_norm_var": 0.29375, "learning_rate": 0.0003, "loss": 11.7719, "loss/aux_loss": 0.048105028085410596, "loss/crossentropy": 2.7319081902503966, "loss/logits": 0.8803468406200409, "step": 22920 }, { "epoch": 0.2293, "grad_norm": 11.9375, "grad_norm_var": 0.30441080729166664, "learning_rate": 0.0003, "loss": 11.8234, "loss/aux_loss": 0.04808462020009756, "loss/crossentropy": 2.821427547931671, "loss/logits": 0.8934536874294281, "step": 22930 }, { "epoch": 0.2294, "grad_norm": 13.4375, "grad_norm_var": 0.47537434895833336, "learning_rate": 0.0003, "loss": 11.7563, "loss/aux_loss": 0.048085405677556994, "loss/crossentropy": 2.751129651069641, "loss/logits": 0.9135142832994461, "step": 22940 }, { "epoch": 0.2295, "grad_norm": 11.6875, "grad_norm_var": 0.386181640625, "learning_rate": 0.0003, "loss": 11.6328, "loss/aux_loss": 0.04809577204287052, "loss/crossentropy": 2.6600105464458466, "loss/logits": 0.8954817146062851, "step": 22950 }, { "epoch": 0.2296, "grad_norm": 12.25, "grad_norm_var": 0.460400390625, "learning_rate": 0.0003, "loss": 11.9641, "loss/aux_loss": 0.04808154255151749, "loss/crossentropy": 2.909790873527527, "loss/logits": 0.9102999448776246, "step": 22960 }, { "epoch": 0.2297, "grad_norm": 12.4375, "grad_norm_var": 0.38483072916666666, "learning_rate": 0.0003, "loss": 11.5812, "loss/aux_loss": 0.04809312988072634, "loss/crossentropy": 2.7076492428779604, "loss/logits": 0.8611445337533951, "step": 22970 }, { "epoch": 0.2298, "grad_norm": 12.625, "grad_norm_var": 1.0254557291666666, "learning_rate": 0.0003, "loss": 11.8651, "loss/aux_loss": 0.0480959540233016, "loss/crossentropy": 2.900135505199432, "loss/logits": 0.8992374151945114, "step": 22980 }, { "epoch": 0.2299, "grad_norm": 11.5, "grad_norm_var": 1.0036458333333333, "learning_rate": 0.0003, "loss": 11.787, "loss/aux_loss": 0.0480839628726244, "loss/crossentropy": 2.7226045966148376, "loss/logits": 0.9184604525566101, "step": 22990 }, { "epoch": 0.23, "grad_norm": 12.625, "grad_norm_var": 0.5655598958333333, "learning_rate": 0.0003, "loss": 11.6746, "loss/aux_loss": 0.0480868011713028, "loss/crossentropy": 2.7485710740089417, "loss/logits": 0.8804697394371033, "step": 23000 }, { "epoch": 0.2301, "grad_norm": 12.75, "grad_norm_var": 0.434228515625, "learning_rate": 0.0003, "loss": 11.6392, "loss/aux_loss": 0.04808677285909653, "loss/crossentropy": 2.898291528224945, "loss/logits": 0.8929012924432754, "step": 23010 }, { "epoch": 0.2302, "grad_norm": 11.9375, "grad_norm_var": 0.2840983072916667, "learning_rate": 0.0003, "loss": 11.7874, "loss/aux_loss": 0.04808888360857964, "loss/crossentropy": 2.7599457263946534, "loss/logits": 0.9127897024154663, "step": 23020 }, { "epoch": 0.2303, "grad_norm": 12.625, "grad_norm_var": 0.23567708333333334, "learning_rate": 0.0003, "loss": 11.8279, "loss/aux_loss": 0.04809257406741381, "loss/crossentropy": 2.8466604590415954, "loss/logits": 0.9271048754453659, "step": 23030 }, { "epoch": 0.2304, "grad_norm": 12.375, "grad_norm_var": 0.42942708333333335, "learning_rate": 0.0003, "loss": 11.5945, "loss/aux_loss": 0.04807973112910986, "loss/crossentropy": 2.6964468479156496, "loss/logits": 0.8535116940736771, "step": 23040 }, { "epoch": 0.2305, "grad_norm": 12.5, "grad_norm_var": 0.4483723958333333, "learning_rate": 0.0003, "loss": 11.7818, "loss/aux_loss": 0.04808500371873379, "loss/crossentropy": 2.773308277130127, "loss/logits": 0.8800354272127151, "step": 23050 }, { "epoch": 0.2306, "grad_norm": 13.0, "grad_norm_var": 0.8705729166666667, "learning_rate": 0.0003, "loss": 11.7319, "loss/aux_loss": 0.04808144625276327, "loss/crossentropy": 2.7295325756073, "loss/logits": 0.9066801935434341, "step": 23060 }, { "epoch": 0.2307, "grad_norm": 12.375, "grad_norm_var": 0.8344889322916667, "learning_rate": 0.0003, "loss": 11.7069, "loss/aux_loss": 0.048089582659304145, "loss/crossentropy": 2.673926168680191, "loss/logits": 0.8846386224031448, "step": 23070 }, { "epoch": 0.2308, "grad_norm": 12.75, "grad_norm_var": 0.4093098958333333, "learning_rate": 0.0003, "loss": 11.6815, "loss/aux_loss": 0.048079993948340415, "loss/crossentropy": 2.838780736923218, "loss/logits": 0.8946847975254059, "step": 23080 }, { "epoch": 0.2309, "grad_norm": 11.8125, "grad_norm_var": 0.2999348958333333, "learning_rate": 0.0003, "loss": 11.76, "loss/aux_loss": 0.04808596204966307, "loss/crossentropy": 2.7160808563232424, "loss/logits": 0.9145932257175445, "step": 23090 }, { "epoch": 0.231, "grad_norm": 11.75, "grad_norm_var": 0.32493489583333335, "learning_rate": 0.0003, "loss": 11.6019, "loss/aux_loss": 0.04809281267225742, "loss/crossentropy": 2.861774879693985, "loss/logits": 0.8780633181333541, "step": 23100 }, { "epoch": 0.2311, "grad_norm": 11.875, "grad_norm_var": 0.43802083333333336, "learning_rate": 0.0003, "loss": 11.8316, "loss/aux_loss": 0.04808343816548586, "loss/crossentropy": 2.7952277660369873, "loss/logits": 0.9055339187383652, "step": 23110 }, { "epoch": 0.2312, "grad_norm": 12.3125, "grad_norm_var": 0.670947265625, "learning_rate": 0.0003, "loss": 11.8885, "loss/aux_loss": 0.048087695986032485, "loss/crossentropy": 2.7752737283706663, "loss/logits": 0.9146613448858261, "step": 23120 }, { "epoch": 0.2313, "grad_norm": 12.9375, "grad_norm_var": 0.6710774739583333, "learning_rate": 0.0003, "loss": 11.7875, "loss/aux_loss": 0.04809215571731329, "loss/crossentropy": 2.8519309163093567, "loss/logits": 0.9128359079360961, "step": 23130 }, { "epoch": 0.2314, "grad_norm": 13.25, "grad_norm_var": 0.5870930989583333, "learning_rate": 0.0003, "loss": 11.5688, "loss/aux_loss": 0.04807926807552576, "loss/crossentropy": 2.764670741558075, "loss/logits": 0.9091036021709442, "step": 23140 }, { "epoch": 0.2315, "grad_norm": 12.3125, "grad_norm_var": 0.349072265625, "learning_rate": 0.0003, "loss": 11.7851, "loss/aux_loss": 0.04807650428265333, "loss/crossentropy": 2.707452893257141, "loss/logits": 0.894105252623558, "step": 23150 }, { "epoch": 0.2316, "grad_norm": 13.0625, "grad_norm_var": 133.3056640625, "learning_rate": 0.0003, "loss": 11.6686, "loss/aux_loss": 0.04811877477914095, "loss/crossentropy": 2.869442331790924, "loss/logits": 0.8912162572145462, "step": 23160 }, { "epoch": 0.2317, "grad_norm": 12.6875, "grad_norm_var": 131.73795572916666, "learning_rate": 0.0003, "loss": 11.8418, "loss/aux_loss": 0.0480883814394474, "loss/crossentropy": 2.8353028416633608, "loss/logits": 0.9513318210840225, "step": 23170 }, { "epoch": 0.2318, "grad_norm": 12.75, "grad_norm_var": 2.652718098958333, "learning_rate": 0.0003, "loss": 11.6498, "loss/aux_loss": 0.0480922332033515, "loss/crossentropy": 2.7743508577346803, "loss/logits": 0.9048791795969009, "step": 23180 }, { "epoch": 0.2319, "grad_norm": 12.125, "grad_norm_var": 2.579931640625, "learning_rate": 0.0003, "loss": 11.5636, "loss/aux_loss": 0.04809550289064646, "loss/crossentropy": 2.796035075187683, "loss/logits": 0.8494812101125717, "step": 23190 }, { "epoch": 0.232, "grad_norm": 13.5, "grad_norm_var": 0.26764322916666666, "learning_rate": 0.0003, "loss": 11.8704, "loss/aux_loss": 0.048094166442751884, "loss/crossentropy": 2.6794604539871214, "loss/logits": 0.9145378708839417, "step": 23200 }, { "epoch": 0.2321, "grad_norm": 13.125, "grad_norm_var": 0.4744791666666667, "learning_rate": 0.0003, "loss": 11.7678, "loss/aux_loss": 0.04808245878666639, "loss/crossentropy": 2.75232680439949, "loss/logits": 0.8650152295827865, "step": 23210 }, { "epoch": 0.2322, "grad_norm": 11.625, "grad_norm_var": 0.46243489583333336, "learning_rate": 0.0003, "loss": 11.6645, "loss/aux_loss": 0.04808831550180912, "loss/crossentropy": 2.7722482800483705, "loss/logits": 0.8622186064720154, "step": 23220 }, { "epoch": 0.2323, "grad_norm": 13.6875, "grad_norm_var": 0.834619140625, "learning_rate": 0.0003, "loss": 11.7358, "loss/aux_loss": 0.04809358511120081, "loss/crossentropy": 2.8247627317905426, "loss/logits": 0.8922833681106568, "step": 23230 }, { "epoch": 0.2324, "grad_norm": 12.1875, "grad_norm_var": 0.651806640625, "learning_rate": 0.0003, "loss": 11.6687, "loss/aux_loss": 0.048091284930706024, "loss/crossentropy": 2.7717152774333953, "loss/logits": 0.8566128462553024, "step": 23240 }, { "epoch": 0.2325, "grad_norm": 12.25, "grad_norm_var": 0.3120930989583333, "learning_rate": 0.0003, "loss": 11.6438, "loss/aux_loss": 0.04808084759861231, "loss/crossentropy": 2.743647050857544, "loss/logits": 0.8860394328832626, "step": 23250 }, { "epoch": 0.2326, "grad_norm": 12.4375, "grad_norm_var": 0.2906087239583333, "learning_rate": 0.0003, "loss": 11.7391, "loss/aux_loss": 0.04808401893824339, "loss/crossentropy": 2.8058079719543456, "loss/logits": 0.9324862480163574, "step": 23260 }, { "epoch": 0.2327, "grad_norm": 11.8125, "grad_norm_var": 0.28878580729166664, "learning_rate": 0.0003, "loss": 11.6006, "loss/aux_loss": 0.04809973333030939, "loss/crossentropy": 2.797436898946762, "loss/logits": 0.8818973273038864, "step": 23270 }, { "epoch": 0.2328, "grad_norm": 12.375, "grad_norm_var": 0.17237955729166668, "learning_rate": 0.0003, "loss": 11.6795, "loss/aux_loss": 0.04809030685573816, "loss/crossentropy": 2.8836780309677126, "loss/logits": 0.9173092126846314, "step": 23280 }, { "epoch": 0.2329, "grad_norm": 12.125, "grad_norm_var": 0.27786458333333336, "learning_rate": 0.0003, "loss": 11.6142, "loss/aux_loss": 0.04809027072042227, "loss/crossentropy": 2.829053020477295, "loss/logits": 0.8915682911872864, "step": 23290 }, { "epoch": 0.233, "grad_norm": 12.3125, "grad_norm_var": 0.172509765625, "learning_rate": 0.0003, "loss": 11.7196, "loss/aux_loss": 0.04808690585196018, "loss/crossentropy": 2.793965721130371, "loss/logits": 0.8819243282079696, "step": 23300 }, { "epoch": 0.2331, "grad_norm": 11.625, "grad_norm_var": 0.4320149739583333, "learning_rate": 0.0003, "loss": 11.7091, "loss/aux_loss": 0.04808926545083523, "loss/crossentropy": 2.5778140842914583, "loss/logits": 0.8577252298593521, "step": 23310 }, { "epoch": 0.2332, "grad_norm": 13.375, "grad_norm_var": 0.718212890625, "learning_rate": 0.0003, "loss": 11.8099, "loss/aux_loss": 0.04810008257627487, "loss/crossentropy": 2.9423258543014525, "loss/logits": 0.9043860971927643, "step": 23320 }, { "epoch": 0.2333, "grad_norm": 13.0, "grad_norm_var": 0.8684895833333334, "learning_rate": 0.0003, "loss": 11.5931, "loss/aux_loss": 0.04809061642736197, "loss/crossentropy": 2.75088050365448, "loss/logits": 0.8834013044834137, "step": 23330 }, { "epoch": 0.2334, "grad_norm": 12.375, "grad_norm_var": 0.7817057291666667, "learning_rate": 0.0003, "loss": 11.6771, "loss/aux_loss": 0.0480886397883296, "loss/crossentropy": 2.788175332546234, "loss/logits": 0.9197595477104187, "step": 23340 }, { "epoch": 0.2335, "grad_norm": 12.4375, "grad_norm_var": 0.45305989583333334, "learning_rate": 0.0003, "loss": 11.807, "loss/aux_loss": 0.04809667635709047, "loss/crossentropy": 2.8132767200469972, "loss/logits": 0.8944458961486816, "step": 23350 }, { "epoch": 0.2336, "grad_norm": 11.625, "grad_norm_var": 0.40729166666666666, "learning_rate": 0.0003, "loss": 11.7539, "loss/aux_loss": 0.0480756500735879, "loss/crossentropy": 2.86536762714386, "loss/logits": 0.876064345240593, "step": 23360 }, { "epoch": 0.2337, "grad_norm": 12.4375, "grad_norm_var": 0.5337076822916667, "learning_rate": 0.0003, "loss": 11.6954, "loss/aux_loss": 0.048088740557432175, "loss/crossentropy": 2.861802363395691, "loss/logits": 0.8744904607534408, "step": 23370 }, { "epoch": 0.2338, "grad_norm": 12.1875, "grad_norm_var": 0.4778645833333333, "learning_rate": 0.0003, "loss": 11.7553, "loss/aux_loss": 0.0480877548456192, "loss/crossentropy": 2.8678762316703796, "loss/logits": 0.9301058530807496, "step": 23380 }, { "epoch": 0.2339, "grad_norm": 12.625, "grad_norm_var": 0.28899739583333334, "learning_rate": 0.0003, "loss": 11.7144, "loss/aux_loss": 0.04808529950678349, "loss/crossentropy": 2.858980119228363, "loss/logits": 0.877686470746994, "step": 23390 }, { "epoch": 0.234, "grad_norm": 12.0625, "grad_norm_var": 0.184619140625, "learning_rate": 0.0003, "loss": 11.6092, "loss/aux_loss": 0.048086689226329325, "loss/crossentropy": 2.603729021549225, "loss/logits": 0.850713437795639, "step": 23400 }, { "epoch": 0.2341, "grad_norm": 12.25, "grad_norm_var": 1.2035807291666667, "learning_rate": 0.0003, "loss": 11.6522, "loss/aux_loss": 0.048092365451157096, "loss/crossentropy": 2.9920172095298767, "loss/logits": 0.8820174932479858, "step": 23410 }, { "epoch": 0.2342, "grad_norm": 12.5, "grad_norm_var": 0.3931640625, "learning_rate": 0.0003, "loss": 11.7154, "loss/aux_loss": 0.048093152418732646, "loss/crossentropy": 2.6669042885303496, "loss/logits": 0.8748789399862289, "step": 23420 }, { "epoch": 0.2343, "grad_norm": 12.3125, "grad_norm_var": 0.329150390625, "learning_rate": 0.0003, "loss": 11.6354, "loss/aux_loss": 0.04808267038315535, "loss/crossentropy": 2.6751941323280333, "loss/logits": 0.8559034675359726, "step": 23430 }, { "epoch": 0.2344, "grad_norm": 11.625, "grad_norm_var": 0.696728515625, "learning_rate": 0.0003, "loss": 11.6513, "loss/aux_loss": 0.048090195283293724, "loss/crossentropy": 2.750403940677643, "loss/logits": 0.8850887566804886, "step": 23440 }, { "epoch": 0.2345, "grad_norm": 12.375, "grad_norm_var": 0.2567545572916667, "learning_rate": 0.0003, "loss": 11.8244, "loss/aux_loss": 0.048084620386362076, "loss/crossentropy": 2.9328520774841307, "loss/logits": 0.9011586248874665, "step": 23450 }, { "epoch": 0.2346, "grad_norm": 12.75, "grad_norm_var": 0.09739583333333333, "learning_rate": 0.0003, "loss": 11.6918, "loss/aux_loss": 0.04808659795671701, "loss/crossentropy": 2.7696239829063414, "loss/logits": 0.8988734126091004, "step": 23460 }, { "epoch": 0.2347, "grad_norm": 13.125, "grad_norm_var": 0.332666015625, "learning_rate": 0.0003, "loss": 11.8292, "loss/aux_loss": 0.0480822155252099, "loss/crossentropy": 2.910421371459961, "loss/logits": 0.900927659869194, "step": 23470 }, { "epoch": 0.2348, "grad_norm": 12.125, "grad_norm_var": 0.54921875, "learning_rate": 0.0003, "loss": 11.6806, "loss/aux_loss": 0.04809327684342861, "loss/crossentropy": 2.8524921536445618, "loss/logits": 0.9137292951345444, "step": 23480 }, { "epoch": 0.2349, "grad_norm": 13.875, "grad_norm_var": 2.0796223958333333, "learning_rate": 0.0003, "loss": 11.7016, "loss/aux_loss": 0.048085125908255574, "loss/crossentropy": 2.680436742305756, "loss/logits": 0.8520400941371917, "step": 23490 }, { "epoch": 0.235, "grad_norm": 12.8125, "grad_norm_var": 2.0036295572916667, "learning_rate": 0.0003, "loss": 11.7285, "loss/aux_loss": 0.04808577839285135, "loss/crossentropy": 2.6893193125724792, "loss/logits": 0.8656334489583969, "step": 23500 }, { "epoch": 0.2351, "grad_norm": 12.25, "grad_norm_var": 0.3738932291666667, "learning_rate": 0.0003, "loss": 11.7499, "loss/aux_loss": 0.048086671903729436, "loss/crossentropy": 2.8467262983322144, "loss/logits": 0.8982198029756546, "step": 23510 }, { "epoch": 0.2352, "grad_norm": 12.8125, "grad_norm_var": 0.202197265625, "learning_rate": 0.0003, "loss": 11.8075, "loss/aux_loss": 0.04808525312691927, "loss/crossentropy": 2.8524319410324095, "loss/logits": 0.8897974759340286, "step": 23520 }, { "epoch": 0.2353, "grad_norm": 14.0625, "grad_norm_var": 0.5054524739583334, "learning_rate": 0.0003, "loss": 11.951, "loss/aux_loss": 0.048080182448029515, "loss/crossentropy": 2.7926797032356263, "loss/logits": 0.8885494351387024, "step": 23530 }, { "epoch": 0.2354, "grad_norm": 11.8125, "grad_norm_var": 0.5848307291666667, "learning_rate": 0.0003, "loss": 11.5039, "loss/aux_loss": 0.048084812425076964, "loss/crossentropy": 2.79399893283844, "loss/logits": 0.8674466758966446, "step": 23540 }, { "epoch": 0.2355, "grad_norm": 13.1875, "grad_norm_var": 0.29816080729166666, "learning_rate": 0.0003, "loss": 11.7157, "loss/aux_loss": 0.048091997392475605, "loss/crossentropy": 2.812830251455307, "loss/logits": 0.8933589518070221, "step": 23550 }, { "epoch": 0.2356, "grad_norm": 13.0625, "grad_norm_var": 0.20045572916666668, "learning_rate": 0.0003, "loss": 11.6117, "loss/aux_loss": 0.048093314096331594, "loss/crossentropy": 2.773878538608551, "loss/logits": 0.8754092365503311, "step": 23560 }, { "epoch": 0.2357, "grad_norm": 13.75, "grad_norm_var": 1.3430826822916666, "learning_rate": 0.0003, "loss": 11.7499, "loss/aux_loss": 0.04808519445359707, "loss/crossentropy": 2.888442850112915, "loss/logits": 0.8818228989839554, "step": 23570 }, { "epoch": 0.2358, "grad_norm": 12.5625, "grad_norm_var": 1.249072265625, "learning_rate": 0.0003, "loss": 11.8386, "loss/aux_loss": 0.04807713199406862, "loss/crossentropy": 2.6926519870758057, "loss/logits": 0.8780468791723252, "step": 23580 }, { "epoch": 0.2359, "grad_norm": 12.0, "grad_norm_var": 0.44166666666666665, "learning_rate": 0.0003, "loss": 11.6326, "loss/aux_loss": 0.048089764825999734, "loss/crossentropy": 2.6616145730018617, "loss/logits": 0.8930239170789719, "step": 23590 }, { "epoch": 0.236, "grad_norm": 12.625, "grad_norm_var": 0.453125, "learning_rate": 0.0003, "loss": 11.7866, "loss/aux_loss": 0.048086699284613135, "loss/crossentropy": 2.7168304443359377, "loss/logits": 0.8998159736394882, "step": 23600 }, { "epoch": 0.2361, "grad_norm": 13.5, "grad_norm_var": 0.5286295572916667, "learning_rate": 0.0003, "loss": 11.906, "loss/aux_loss": 0.048088240809738635, "loss/crossentropy": 2.7172608613967895, "loss/logits": 0.8923117220401764, "step": 23610 }, { "epoch": 0.2362, "grad_norm": 12.8125, "grad_norm_var": 0.30104166666666665, "learning_rate": 0.0003, "loss": 11.5223, "loss/aux_loss": 0.04808646198362112, "loss/crossentropy": 2.706549334526062, "loss/logits": 0.9012427359819413, "step": 23620 }, { "epoch": 0.2363, "grad_norm": 12.625, "grad_norm_var": 0.30572916666666666, "learning_rate": 0.0003, "loss": 11.6055, "loss/aux_loss": 0.04809609428048134, "loss/crossentropy": 2.7896106839179993, "loss/logits": 0.8902231156826019, "step": 23630 }, { "epoch": 0.2364, "grad_norm": 12.6875, "grad_norm_var": 0.37233072916666665, "learning_rate": 0.0003, "loss": 11.962, "loss/aux_loss": 0.048088495060801505, "loss/crossentropy": 3.007174789905548, "loss/logits": 0.9304111152887344, "step": 23640 }, { "epoch": 0.2365, "grad_norm": 12.3125, "grad_norm_var": 0.286181640625, "learning_rate": 0.0003, "loss": 11.5338, "loss/aux_loss": 0.04809539187699556, "loss/crossentropy": 2.633792459964752, "loss/logits": 0.8579708755016326, "step": 23650 }, { "epoch": 0.2366, "grad_norm": 11.8125, "grad_norm_var": 0.7730305989583334, "learning_rate": 0.0003, "loss": 11.6288, "loss/aux_loss": 0.048088702373206615, "loss/crossentropy": 2.738995945453644, "loss/logits": 0.8920851528644562, "step": 23660 }, { "epoch": 0.2367, "grad_norm": 11.4375, "grad_norm_var": 0.74921875, "learning_rate": 0.0003, "loss": 11.7406, "loss/aux_loss": 0.04808499738574028, "loss/crossentropy": 2.750992178916931, "loss/logits": 0.8741905808448791, "step": 23670 }, { "epoch": 0.2368, "grad_norm": 12.5, "grad_norm_var": 0.7702473958333333, "learning_rate": 0.0003, "loss": 11.7279, "loss/aux_loss": 0.04809374678879976, "loss/crossentropy": 3.0117597341537476, "loss/logits": 0.9128781437873841, "step": 23680 }, { "epoch": 0.2369, "grad_norm": 12.375, "grad_norm_var": 1.1684733072916667, "learning_rate": 0.0003, "loss": 11.6513, "loss/aux_loss": 0.04808376375585795, "loss/crossentropy": 2.730088675022125, "loss/logits": 0.8473275810480118, "step": 23690 }, { "epoch": 0.237, "grad_norm": 11.5625, "grad_norm_var": 1.056494140625, "learning_rate": 0.0003, "loss": 11.8007, "loss/aux_loss": 0.048095212876796724, "loss/crossentropy": 2.7834979057312013, "loss/logits": 0.8746491730213165, "step": 23700 }, { "epoch": 0.2371, "grad_norm": 13.875, "grad_norm_var": 1.337353515625, "learning_rate": 0.0003, "loss": 11.5859, "loss/aux_loss": 0.048090392164885996, "loss/crossentropy": 2.772467577457428, "loss/logits": 0.9062090307474137, "step": 23710 }, { "epoch": 0.2372, "grad_norm": 12.5625, "grad_norm_var": 0.7644368489583333, "learning_rate": 0.0003, "loss": 11.674, "loss/aux_loss": 0.048084620386362076, "loss/crossentropy": 2.814483368396759, "loss/logits": 0.9223452210426331, "step": 23720 }, { "epoch": 0.2373, "grad_norm": 12.1875, "grad_norm_var": 0.1921875, "learning_rate": 0.0003, "loss": 11.652, "loss/aux_loss": 0.048086386919021604, "loss/crossentropy": 2.937320578098297, "loss/logits": 0.867486959695816, "step": 23730 }, { "epoch": 0.2374, "grad_norm": 13.375, "grad_norm_var": 0.28098958333333335, "learning_rate": 0.0003, "loss": 11.6409, "loss/aux_loss": 0.048084064945578577, "loss/crossentropy": 2.8077134013175966, "loss/logits": 0.8964003264904022, "step": 23740 }, { "epoch": 0.2375, "grad_norm": 12.6875, "grad_norm_var": 1.6445149739583333, "learning_rate": 0.0003, "loss": 11.8131, "loss/aux_loss": 0.048087784089148045, "loss/crossentropy": 2.9468029141426086, "loss/logits": 0.9184634208679199, "step": 23750 }, { "epoch": 0.2376, "grad_norm": 11.625, "grad_norm_var": 1.9526041666666667, "learning_rate": 0.0003, "loss": 11.7089, "loss/aux_loss": 0.04808371346443892, "loss/crossentropy": 2.8894827246665953, "loss/logits": 0.8926361262798309, "step": 23760 }, { "epoch": 0.2377, "grad_norm": 11.375, "grad_norm_var": 0.48899739583333335, "learning_rate": 0.0003, "loss": 11.7193, "loss/aux_loss": 0.04808234348893166, "loss/crossentropy": 2.8240739822387697, "loss/logits": 0.8710032075643539, "step": 23770 }, { "epoch": 0.2378, "grad_norm": 13.5, "grad_norm_var": 2.7316243489583334, "learning_rate": 0.0003, "loss": 11.5324, "loss/aux_loss": 0.0480916004627943, "loss/crossentropy": 2.7174317240715027, "loss/logits": 0.8585263520479203, "step": 23780 }, { "epoch": 0.2379, "grad_norm": 12.0, "grad_norm_var": 2.7860514322916665, "learning_rate": 0.0003, "loss": 11.8451, "loss/aux_loss": 0.048090549744665624, "loss/crossentropy": 2.843950593471527, "loss/logits": 0.9064037382602692, "step": 23790 }, { "epoch": 0.238, "grad_norm": 12.125, "grad_norm_var": 0.31365559895833334, "learning_rate": 0.0003, "loss": 11.7494, "loss/aux_loss": 0.04807556625455618, "loss/crossentropy": 2.704611933231354, "loss/logits": 0.8805167257785798, "step": 23800 }, { "epoch": 0.2381, "grad_norm": 13.0, "grad_norm_var": 0.40826822916666666, "learning_rate": 0.0003, "loss": 11.5183, "loss/aux_loss": 0.04808481372892857, "loss/crossentropy": 2.536171966791153, "loss/logits": 0.8397417157888413, "step": 23810 }, { "epoch": 0.2382, "grad_norm": 12.5, "grad_norm_var": 0.27545572916666666, "learning_rate": 0.0003, "loss": 11.6364, "loss/aux_loss": 0.048093268647789955, "loss/crossentropy": 2.7942283511161805, "loss/logits": 0.8568087071180344, "step": 23820 }, { "epoch": 0.2383, "grad_norm": 12.25, "grad_norm_var": 0.4332682291666667, "learning_rate": 0.0003, "loss": 11.6525, "loss/aux_loss": 0.048078492656350134, "loss/crossentropy": 2.7623124718666077, "loss/logits": 0.891291829943657, "step": 23830 }, { "epoch": 0.2384, "grad_norm": 12.5625, "grad_norm_var": 0.34152018229166664, "learning_rate": 0.0003, "loss": 11.7087, "loss/aux_loss": 0.048094586841762064, "loss/crossentropy": 2.678860205411911, "loss/logits": 0.8403998255729676, "step": 23840 }, { "epoch": 0.2385, "grad_norm": 13.375, "grad_norm_var": 0.4306640625, "learning_rate": 0.0003, "loss": 11.8878, "loss/aux_loss": 0.04808190613985062, "loss/crossentropy": 2.858857882022858, "loss/logits": 0.8940188169479371, "step": 23850 }, { "epoch": 0.2386, "grad_norm": 12.375, "grad_norm_var": 14.811832682291667, "learning_rate": 0.0003, "loss": 11.7931, "loss/aux_loss": 0.048093811981379984, "loss/crossentropy": 2.911667358875275, "loss/logits": 0.911825567483902, "step": 23860 }, { "epoch": 0.2387, "grad_norm": 12.9375, "grad_norm_var": 0.6860514322916667, "learning_rate": 0.0003, "loss": 11.6265, "loss/aux_loss": 0.04809156283736229, "loss/crossentropy": 2.897962212562561, "loss/logits": 0.9157760441303253, "step": 23870 }, { "epoch": 0.2388, "grad_norm": 13.5625, "grad_norm_var": 0.5841145833333333, "learning_rate": 0.0003, "loss": 11.6992, "loss/aux_loss": 0.048074356466531756, "loss/crossentropy": 2.6709546744823456, "loss/logits": 0.892021319270134, "step": 23880 }, { "epoch": 0.2389, "grad_norm": 12.5625, "grad_norm_var": 0.5063639322916667, "learning_rate": 0.0003, "loss": 11.9177, "loss/aux_loss": 0.04809843823313713, "loss/crossentropy": 2.818812572956085, "loss/logits": 0.9256382822990418, "step": 23890 }, { "epoch": 0.239, "grad_norm": 13.3125, "grad_norm_var": 0.2604166666666667, "learning_rate": 0.0003, "loss": 11.6635, "loss/aux_loss": 0.048081925325095656, "loss/crossentropy": 2.768986976146698, "loss/logits": 0.8817100405693055, "step": 23900 }, { "epoch": 0.2391, "grad_norm": 13.0, "grad_norm_var": 1.490869140625, "learning_rate": 0.0003, "loss": 11.6234, "loss/aux_loss": 0.048085336573421955, "loss/crossentropy": 2.748648244142532, "loss/logits": 0.9171183824539184, "step": 23910 }, { "epoch": 0.2392, "grad_norm": 12.25, "grad_norm_var": 1.4512858072916666, "learning_rate": 0.0003, "loss": 11.7612, "loss/aux_loss": 0.04809110928326845, "loss/crossentropy": 2.7163472533226014, "loss/logits": 0.8676630944013596, "step": 23920 }, { "epoch": 0.2393, "grad_norm": 13.1875, "grad_norm_var": 0.746728515625, "learning_rate": 0.0003, "loss": 11.7462, "loss/aux_loss": 0.048089482076466086, "loss/crossentropy": 2.930872416496277, "loss/logits": 0.9010693699121475, "step": 23930 }, { "epoch": 0.2394, "grad_norm": 12.6875, "grad_norm_var": 0.5486979166666667, "learning_rate": 0.0003, "loss": 11.896, "loss/aux_loss": 0.048085294850170615, "loss/crossentropy": 2.798089528083801, "loss/logits": 0.8853724330663681, "step": 23940 }, { "epoch": 0.2395, "grad_norm": 12.5625, "grad_norm_var": 0.3851399739583333, "learning_rate": 0.0003, "loss": 11.6158, "loss/aux_loss": 0.04808676280081272, "loss/crossentropy": 2.726479697227478, "loss/logits": 0.8973057448863984, "step": 23950 }, { "epoch": 0.2396, "grad_norm": 14.25, "grad_norm_var": 0.3282389322916667, "learning_rate": 0.0003, "loss": 11.7098, "loss/aux_loss": 0.048084568418562415, "loss/crossentropy": 2.6635270595550535, "loss/logits": 0.8485010534524917, "step": 23960 }, { "epoch": 0.2397, "grad_norm": 11.5, "grad_norm_var": 0.7620930989583333, "learning_rate": 0.0003, "loss": 11.5762, "loss/aux_loss": 0.048088745586574076, "loss/crossentropy": 2.744753432273865, "loss/logits": 0.8812153309583663, "step": 23970 }, { "epoch": 0.2398, "grad_norm": 12.8125, "grad_norm_var": 0.4384765625, "learning_rate": 0.0003, "loss": 11.7152, "loss/aux_loss": 0.04808169640600681, "loss/crossentropy": 2.7333896338939665, "loss/logits": 0.9171457648277282, "step": 23980 }, { "epoch": 0.2399, "grad_norm": 29.5, "grad_norm_var": 19.277067057291667, "learning_rate": 0.0003, "loss": 11.5902, "loss/aux_loss": 0.04808990899473429, "loss/crossentropy": 2.865362215042114, "loss/logits": 0.8600284993648529, "step": 23990 }, { "epoch": 0.24, "grad_norm": 12.6875, "grad_norm_var": 18.717171223958335, "learning_rate": 0.0003, "loss": 11.785, "loss/aux_loss": 0.04809808786958456, "loss/crossentropy": 2.812199038267136, "loss/logits": 0.8720352232456208, "step": 24000 }, { "epoch": 0.2401, "grad_norm": 11.875, "grad_norm_var": 0.16744791666666667, "learning_rate": 0.0003, "loss": 11.6551, "loss/aux_loss": 0.04809410627931356, "loss/crossentropy": 2.7996535181999205, "loss/logits": 0.9151755809783936, "step": 24010 }, { "epoch": 0.2402, "grad_norm": 13.125, "grad_norm_var": 0.5535807291666667, "learning_rate": 0.0003, "loss": 11.7477, "loss/aux_loss": 0.04809091780334711, "loss/crossentropy": 2.7805619120597838, "loss/logits": 0.9082524001598358, "step": 24020 }, { "epoch": 0.2403, "grad_norm": 13.25, "grad_norm_var": 0.4984375, "learning_rate": 0.0003, "loss": 11.6539, "loss/aux_loss": 0.04808855000883341, "loss/crossentropy": 2.7244292974472044, "loss/logits": 0.8969215124845504, "step": 24030 }, { "epoch": 0.2404, "grad_norm": 11.4375, "grad_norm_var": 0.6372395833333333, "learning_rate": 0.0003, "loss": 11.8925, "loss/aux_loss": 0.04808660857379436, "loss/crossentropy": 2.8658366203308105, "loss/logits": 0.9359738051891326, "step": 24040 }, { "epoch": 0.2405, "grad_norm": 12.3125, "grad_norm_var": 0.3150390625, "learning_rate": 0.0003, "loss": 11.6215, "loss/aux_loss": 0.0480943713337183, "loss/crossentropy": 2.7264646887779236, "loss/logits": 0.8683656752109528, "step": 24050 }, { "epoch": 0.2406, "grad_norm": 13.625, "grad_norm_var": 0.8895182291666667, "learning_rate": 0.0003, "loss": 11.6396, "loss/aux_loss": 0.04809534400701523, "loss/crossentropy": 2.7673123121261596, "loss/logits": 0.8471581250429153, "step": 24060 }, { "epoch": 0.2407, "grad_norm": 11.8125, "grad_norm_var": 0.6770182291666667, "learning_rate": 0.0003, "loss": 11.6868, "loss/aux_loss": 0.04808705560863018, "loss/crossentropy": 2.7488048553466795, "loss/logits": 0.918352234363556, "step": 24070 }, { "epoch": 0.2408, "grad_norm": 11.75, "grad_norm_var": 0.4117024739583333, "learning_rate": 0.0003, "loss": 11.6628, "loss/aux_loss": 0.048093785718083384, "loss/crossentropy": 2.8444491744041445, "loss/logits": 0.8840048730373382, "step": 24080 }, { "epoch": 0.2409, "grad_norm": 12.0, "grad_norm_var": 0.461962890625, "learning_rate": 0.0003, "loss": 11.5886, "loss/aux_loss": 0.04807962272316217, "loss/crossentropy": 2.580649846792221, "loss/logits": 0.8638424456119538, "step": 24090 }, { "epoch": 0.241, "grad_norm": 11.8125, "grad_norm_var": 0.3580729166666667, "learning_rate": 0.0003, "loss": 11.7413, "loss/aux_loss": 0.04808608740568161, "loss/crossentropy": 2.8323341250419616, "loss/logits": 0.8996293157339096, "step": 24100 }, { "epoch": 0.2411, "grad_norm": 12.3125, "grad_norm_var": 1.4559895833333334, "learning_rate": 0.0003, "loss": 11.6987, "loss/aux_loss": 0.048095569014549255, "loss/crossentropy": 2.7775806427001952, "loss/logits": 0.8833230465650559, "step": 24110 }, { "epoch": 0.2412, "grad_norm": 12.75, "grad_norm_var": 0.4014973958333333, "learning_rate": 0.0003, "loss": 11.5617, "loss/aux_loss": 0.04807928055524826, "loss/crossentropy": 2.7209652066230774, "loss/logits": 0.8995220333337783, "step": 24120 }, { "epoch": 0.2413, "grad_norm": 13.6875, "grad_norm_var": 0.43951822916666666, "learning_rate": 0.0003, "loss": 11.6276, "loss/aux_loss": 0.04808956161141396, "loss/crossentropy": 2.757280480861664, "loss/logits": 0.8826348453760147, "step": 24130 }, { "epoch": 0.2414, "grad_norm": 12.75, "grad_norm_var": 0.33177083333333335, "learning_rate": 0.0003, "loss": 11.5764, "loss/aux_loss": 0.048085734620690344, "loss/crossentropy": 2.783533537387848, "loss/logits": 0.8649254590272903, "step": 24140 }, { "epoch": 0.2415, "grad_norm": 12.75, "grad_norm_var": 3.777327473958333, "learning_rate": 0.0003, "loss": 11.6753, "loss/aux_loss": 0.04808969590812921, "loss/crossentropy": 2.678688037395477, "loss/logits": 0.8812060475349426, "step": 24150 }, { "epoch": 0.2416, "grad_norm": 13.5625, "grad_norm_var": 3.59375, "learning_rate": 0.0003, "loss": 11.5482, "loss/aux_loss": 0.0480934102088213, "loss/crossentropy": 2.7929326593875885, "loss/logits": 0.8826883345842361, "step": 24160 }, { "epoch": 0.2417, "grad_norm": 12.125, "grad_norm_var": 0.3504557291666667, "learning_rate": 0.0003, "loss": 11.3805, "loss/aux_loss": 0.04809669218957424, "loss/crossentropy": 2.5945322930812837, "loss/logits": 0.8391418486833573, "step": 24170 }, { "epoch": 0.2418, "grad_norm": 13.3125, "grad_norm_var": 1.5546712239583333, "learning_rate": 0.0003, "loss": 11.5907, "loss/aux_loss": 0.04808249343186617, "loss/crossentropy": 2.734819310903549, "loss/logits": 0.8693808823823929, "step": 24180 }, { "epoch": 0.2419, "grad_norm": 12.0, "grad_norm_var": 0.388525390625, "learning_rate": 0.0003, "loss": 11.6518, "loss/aux_loss": 0.04808155260980129, "loss/crossentropy": 2.7737566351890566, "loss/logits": 0.8870420664548874, "step": 24190 }, { "epoch": 0.242, "grad_norm": 12.5, "grad_norm_var": 0.5936848958333333, "learning_rate": 0.0003, "loss": 11.6452, "loss/aux_loss": 0.04808844365179539, "loss/crossentropy": 2.78323655128479, "loss/logits": 0.8678434014320373, "step": 24200 }, { "epoch": 0.2421, "grad_norm": 12.1875, "grad_norm_var": 0.5105305989583333, "learning_rate": 0.0003, "loss": 11.4859, "loss/aux_loss": 0.0480864379554987, "loss/crossentropy": 2.769151270389557, "loss/logits": 0.8635666728019714, "step": 24210 }, { "epoch": 0.2422, "grad_norm": 11.8125, "grad_norm_var": 0.23917643229166666, "learning_rate": 0.0003, "loss": 11.772, "loss/aux_loss": 0.048094228468835355, "loss/crossentropy": 2.812956178188324, "loss/logits": 0.9112226068973541, "step": 24220 }, { "epoch": 0.2423, "grad_norm": 11.75, "grad_norm_var": 0.328759765625, "learning_rate": 0.0003, "loss": 11.669, "loss/aux_loss": 0.04808081742376089, "loss/crossentropy": 2.7633156895637514, "loss/logits": 0.8465212196111679, "step": 24230 }, { "epoch": 0.2424, "grad_norm": 13.0625, "grad_norm_var": 0.5065104166666666, "learning_rate": 0.0003, "loss": 11.6417, "loss/aux_loss": 0.04809690322726965, "loss/crossentropy": 2.744890737533569, "loss/logits": 0.9320186167955399, "step": 24240 }, { "epoch": 0.2425, "grad_norm": 12.125, "grad_norm_var": 4.525374348958334, "learning_rate": 0.0003, "loss": 11.8212, "loss/aux_loss": 0.04809857420623302, "loss/crossentropy": 2.7761366605758666, "loss/logits": 0.8958946943283081, "step": 24250 }, { "epoch": 0.2426, "grad_norm": 13.0, "grad_norm_var": 0.2875, "learning_rate": 0.0003, "loss": 11.631, "loss/aux_loss": 0.04808212518692016, "loss/crossentropy": 2.8316932320594788, "loss/logits": 0.8923729687929154, "step": 24260 }, { "epoch": 0.2427, "grad_norm": 12.6875, "grad_norm_var": 0.431884765625, "learning_rate": 0.0003, "loss": 11.6632, "loss/aux_loss": 0.048090006597340106, "loss/crossentropy": 2.7280545473098754, "loss/logits": 0.8711204528808594, "step": 24270 }, { "epoch": 0.2428, "grad_norm": 13.0, "grad_norm_var": 0.3106770833333333, "learning_rate": 0.0003, "loss": 11.6513, "loss/aux_loss": 0.048082527332007886, "loss/crossentropy": 2.7286766350269316, "loss/logits": 0.8891306400299073, "step": 24280 }, { "epoch": 0.2429, "grad_norm": 13.6875, "grad_norm_var": 0.24837239583333334, "learning_rate": 0.0003, "loss": 11.6466, "loss/aux_loss": 0.048082358203828335, "loss/crossentropy": 2.9011032223701476, "loss/logits": 0.8931461691856384, "step": 24290 }, { "epoch": 0.243, "grad_norm": 12.375, "grad_norm_var": 0.2994140625, "learning_rate": 0.0003, "loss": 11.5106, "loss/aux_loss": 0.048092238046228884, "loss/crossentropy": 2.682009291648865, "loss/logits": 0.8750419646501542, "step": 24300 }, { "epoch": 0.2431, "grad_norm": 13.0625, "grad_norm_var": 0.26443684895833336, "learning_rate": 0.0003, "loss": 11.7725, "loss/aux_loss": 0.04807449225336313, "loss/crossentropy": 2.79811235666275, "loss/logits": 0.8686894834041595, "step": 24310 }, { "epoch": 0.2432, "grad_norm": 12.125, "grad_norm_var": 0.4176432291666667, "learning_rate": 0.0003, "loss": 11.6726, "loss/aux_loss": 0.04808956328779459, "loss/crossentropy": 2.6413731455802916, "loss/logits": 0.8763428032398224, "step": 24320 }, { "epoch": 0.2433, "grad_norm": 12.6875, "grad_norm_var": 0.5105305989583333, "learning_rate": 0.0003, "loss": 11.6373, "loss/aux_loss": 0.048085764050483704, "loss/crossentropy": 2.6801956832408904, "loss/logits": 0.8385010361671448, "step": 24330 }, { "epoch": 0.2434, "grad_norm": 12.375, "grad_norm_var": 1.7222493489583333, "learning_rate": 0.0003, "loss": 11.6253, "loss/aux_loss": 0.04808337558060884, "loss/crossentropy": 2.7743629932403566, "loss/logits": 0.9043363749980926, "step": 24340 }, { "epoch": 0.2435, "grad_norm": 12.1875, "grad_norm_var": 0.13566080729166666, "learning_rate": 0.0003, "loss": 11.6274, "loss/aux_loss": 0.048089998215436934, "loss/crossentropy": 2.8051861047744753, "loss/logits": 0.8626913219690323, "step": 24350 }, { "epoch": 0.2436, "grad_norm": 13.0, "grad_norm_var": 0.408447265625, "learning_rate": 0.0003, "loss": 11.6349, "loss/aux_loss": 0.048083267733454706, "loss/crossentropy": 2.909751272201538, "loss/logits": 0.9241881400346756, "step": 24360 }, { "epoch": 0.2437, "grad_norm": 12.75, "grad_norm_var": 0.28854166666666664, "learning_rate": 0.0003, "loss": 11.83, "loss/aux_loss": 0.048077669180929664, "loss/crossentropy": 2.8290556192398073, "loss/logits": 0.8948422998189927, "step": 24370 }, { "epoch": 0.2438, "grad_norm": 12.3125, "grad_norm_var": 0.49420572916666666, "learning_rate": 0.0003, "loss": 11.7276, "loss/aux_loss": 0.048091036081314084, "loss/crossentropy": 2.913513660430908, "loss/logits": 0.9211061328649521, "step": 24380 }, { "epoch": 0.2439, "grad_norm": 12.5, "grad_norm_var": 0.4315104166666667, "learning_rate": 0.0003, "loss": 11.6142, "loss/aux_loss": 0.04808881543576717, "loss/crossentropy": 2.8371637940406798, "loss/logits": 0.8743333727121353, "step": 24390 }, { "epoch": 0.244, "grad_norm": 11.375, "grad_norm_var": 0.3895182291666667, "learning_rate": 0.0003, "loss": 11.6272, "loss/aux_loss": 0.04808051008731127, "loss/crossentropy": 2.772133195400238, "loss/logits": 0.8708338439464569, "step": 24400 }, { "epoch": 0.2441, "grad_norm": 15.25, "grad_norm_var": 0.8311848958333333, "learning_rate": 0.0003, "loss": 11.7284, "loss/aux_loss": 0.048093300126492974, "loss/crossentropy": 2.814771521091461, "loss/logits": 0.8521833211183548, "step": 24410 }, { "epoch": 0.2442, "grad_norm": 12.125, "grad_norm_var": 0.7298014322916667, "learning_rate": 0.0003, "loss": 11.5798, "loss/aux_loss": 0.0480850936844945, "loss/crossentropy": 2.5935686111450194, "loss/logits": 0.8524984180927276, "step": 24420 }, { "epoch": 0.2443, "grad_norm": 13.0, "grad_norm_var": 0.20519205729166667, "learning_rate": 0.0003, "loss": 11.5651, "loss/aux_loss": 0.04809098821133375, "loss/crossentropy": 2.702817916870117, "loss/logits": 0.8753552913665772, "step": 24430 }, { "epoch": 0.2444, "grad_norm": 13.125, "grad_norm_var": 0.23098958333333333, "learning_rate": 0.0003, "loss": 11.7146, "loss/aux_loss": 0.048081192560493945, "loss/crossentropy": 2.789567303657532, "loss/logits": 0.8468646883964539, "step": 24440 }, { "epoch": 0.2445, "grad_norm": 16.5, "grad_norm_var": 1.030712890625, "learning_rate": 0.0003, "loss": 11.7024, "loss/aux_loss": 0.04808731079101562, "loss/crossentropy": 2.7426182508468626, "loss/logits": 0.8534230351448059, "step": 24450 }, { "epoch": 0.2446, "grad_norm": 12.6875, "grad_norm_var": 1.5005208333333333, "learning_rate": 0.0003, "loss": 11.6524, "loss/aux_loss": 0.04808608312159777, "loss/crossentropy": 2.8565701603889466, "loss/logits": 0.8676398396492004, "step": 24460 }, { "epoch": 0.2447, "grad_norm": 14.625, "grad_norm_var": 1.1264973958333333, "learning_rate": 0.0003, "loss": 11.4879, "loss/aux_loss": 0.048085580207407476, "loss/crossentropy": 2.898229694366455, "loss/logits": 0.8859318733215332, "step": 24470 }, { "epoch": 0.2448, "grad_norm": 13.1875, "grad_norm_var": 0.4410807291666667, "learning_rate": 0.0003, "loss": 11.6515, "loss/aux_loss": 0.04807874243706465, "loss/crossentropy": 2.850202000141144, "loss/logits": 0.88456309735775, "step": 24480 }, { "epoch": 0.2449, "grad_norm": 11.375, "grad_norm_var": 0.3223958333333333, "learning_rate": 0.0003, "loss": 11.7189, "loss/aux_loss": 0.048079111985862254, "loss/crossentropy": 2.808897280693054, "loss/logits": 0.8941350758075715, "step": 24490 }, { "epoch": 0.245, "grad_norm": 13.0, "grad_norm_var": 1.0204264322916667, "learning_rate": 0.0003, "loss": 11.5987, "loss/aux_loss": 0.04809224735945463, "loss/crossentropy": 2.7774929463863374, "loss/logits": 0.8461995214223862, "step": 24500 }, { "epoch": 0.2451, "grad_norm": 13.25, "grad_norm_var": 1.1275390625, "learning_rate": 0.0003, "loss": 11.5477, "loss/aux_loss": 0.04809009712189436, "loss/crossentropy": 2.791159617900848, "loss/logits": 0.8854242950677872, "step": 24510 }, { "epoch": 0.2452, "grad_norm": 12.5625, "grad_norm_var": 0.5010416666666667, "learning_rate": 0.0003, "loss": 11.7486, "loss/aux_loss": 0.048082375340163706, "loss/crossentropy": 2.664378434419632, "loss/logits": 0.8719703197479248, "step": 24520 }, { "epoch": 0.2453, "grad_norm": 45.25, "grad_norm_var": 66.98527018229167, "learning_rate": 0.0003, "loss": 11.9223, "loss/aux_loss": 0.048085335083305834, "loss/crossentropy": 2.715837526321411, "loss/logits": 0.9175373882055282, "step": 24530 }, { "epoch": 0.2454, "grad_norm": 13.1875, "grad_norm_var": 64.931103515625, "learning_rate": 0.0003, "loss": 11.5829, "loss/aux_loss": 0.04809392262250185, "loss/crossentropy": 2.799639356136322, "loss/logits": 0.8938455194234848, "step": 24540 }, { "epoch": 0.2455, "grad_norm": 13.0625, "grad_norm_var": 0.3728515625, "learning_rate": 0.0003, "loss": 11.7221, "loss/aux_loss": 0.04808378964662552, "loss/crossentropy": 2.777541899681091, "loss/logits": 0.8686657905578613, "step": 24550 }, { "epoch": 0.2456, "grad_norm": 12.875, "grad_norm_var": 0.32355143229166666, "learning_rate": 0.0003, "loss": 11.8611, "loss/aux_loss": 0.048089235462248324, "loss/crossentropy": 2.872539556026459, "loss/logits": 0.9508152902126312, "step": 24560 }, { "epoch": 0.2457, "grad_norm": 12.9375, "grad_norm_var": 0.30987955729166666, "learning_rate": 0.0003, "loss": 11.5595, "loss/aux_loss": 0.04808945395052433, "loss/crossentropy": 2.9004230976104735, "loss/logits": 0.9007263153791427, "step": 24570 }, { "epoch": 0.2458, "grad_norm": 11.6875, "grad_norm_var": 0.2916015625, "learning_rate": 0.0003, "loss": 11.7691, "loss/aux_loss": 0.048084504902362823, "loss/crossentropy": 2.81181880235672, "loss/logits": 0.8950879544019699, "step": 24580 }, { "epoch": 0.2459, "grad_norm": 12.6875, "grad_norm_var": 2.4169270833333334, "learning_rate": 0.0003, "loss": 11.7429, "loss/aux_loss": 0.04808555655181408, "loss/crossentropy": 2.890042209625244, "loss/logits": 0.8888252973556519, "step": 24590 }, { "epoch": 0.246, "grad_norm": 13.125, "grad_norm_var": 1.9864420572916666, "learning_rate": 0.0003, "loss": 11.7697, "loss/aux_loss": 0.04808424487709999, "loss/crossentropy": 2.8062154173851015, "loss/logits": 0.8845973283052444, "step": 24600 }, { "epoch": 0.2461, "grad_norm": 12.3125, "grad_norm_var": 0.47342122395833336, "learning_rate": 0.0003, "loss": 11.6025, "loss/aux_loss": 0.04808759950101375, "loss/crossentropy": 2.638791638612747, "loss/logits": 0.8616722971200943, "step": 24610 }, { "epoch": 0.2462, "grad_norm": 12.0, "grad_norm_var": 2.019514973958333, "learning_rate": 0.0003, "loss": 11.3808, "loss/aux_loss": 0.0480861397460103, "loss/crossentropy": 2.6739172518253325, "loss/logits": 0.857967483997345, "step": 24620 }, { "epoch": 0.2463, "grad_norm": 14.125, "grad_norm_var": 11.3953125, "learning_rate": 0.0003, "loss": 11.7237, "loss/aux_loss": 0.048097777739167215, "loss/crossentropy": 2.6798054337501527, "loss/logits": 0.9119128674268723, "step": 24630 }, { "epoch": 0.2464, "grad_norm": 12.75, "grad_norm_var": 3.7905598958333333, "learning_rate": 0.0003, "loss": 11.5751, "loss/aux_loss": 0.04808594770729542, "loss/crossentropy": 2.6202758669853212, "loss/logits": 0.8779011040925979, "step": 24640 }, { "epoch": 0.2465, "grad_norm": 13.75, "grad_norm_var": 4.788004557291667, "learning_rate": 0.0003, "loss": 11.5291, "loss/aux_loss": 0.0480955732986331, "loss/crossentropy": 2.6888505935668947, "loss/logits": 0.8404242038726807, "step": 24650 }, { "epoch": 0.2466, "grad_norm": 11.875, "grad_norm_var": 0.5864583333333333, "learning_rate": 0.0003, "loss": 11.644, "loss/aux_loss": 0.0480740413069725, "loss/crossentropy": 2.9277888417243956, "loss/logits": 0.8985714882612228, "step": 24660 }, { "epoch": 0.2467, "grad_norm": 11.5, "grad_norm_var": 0.4141764322916667, "learning_rate": 0.0003, "loss": 11.7472, "loss/aux_loss": 0.048084502667188646, "loss/crossentropy": 2.661174988746643, "loss/logits": 0.8858565300703048, "step": 24670 }, { "epoch": 0.2468, "grad_norm": 11.9375, "grad_norm_var": 0.5109212239583333, "learning_rate": 0.0003, "loss": 11.7638, "loss/aux_loss": 0.048086884804069994, "loss/crossentropy": 2.990370142459869, "loss/logits": 0.9090431898832321, "step": 24680 }, { "epoch": 0.2469, "grad_norm": 12.9375, "grad_norm_var": 0.48333333333333334, "learning_rate": 0.0003, "loss": 11.7163, "loss/aux_loss": 0.04808713924139738, "loss/crossentropy": 2.6298464059829714, "loss/logits": 0.8759280443191528, "step": 24690 }, { "epoch": 0.247, "grad_norm": 13.875, "grad_norm_var": 11.1150390625, "learning_rate": 0.0003, "loss": 11.6638, "loss/aux_loss": 0.04808840285986662, "loss/crossentropy": 2.8006470084190367, "loss/logits": 0.8584316343069076, "step": 24700 }, { "epoch": 0.2471, "grad_norm": 14.375, "grad_norm_var": 10.780208333333333, "learning_rate": 0.0003, "loss": 11.7515, "loss/aux_loss": 0.04808585830032826, "loss/crossentropy": 2.7873790740966795, "loss/logits": 0.8926585078239441, "step": 24710 }, { "epoch": 0.2472, "grad_norm": 12.4375, "grad_norm_var": 0.2879557291666667, "learning_rate": 0.0003, "loss": 11.6114, "loss/aux_loss": 0.048089620657265186, "loss/crossentropy": 2.5645355701446535, "loss/logits": 0.8463786870241166, "step": 24720 }, { "epoch": 0.2473, "grad_norm": 12.25, "grad_norm_var": 0.454541015625, "learning_rate": 0.0003, "loss": 11.6297, "loss/aux_loss": 0.04808438029140234, "loss/crossentropy": 2.8039010167121887, "loss/logits": 0.9113354772329331, "step": 24730 }, { "epoch": 0.2474, "grad_norm": 13.5, "grad_norm_var": 0.4088541666666667, "learning_rate": 0.0003, "loss": 11.5603, "loss/aux_loss": 0.04807938225567341, "loss/crossentropy": 2.6084700644016268, "loss/logits": 0.869733153283596, "step": 24740 }, { "epoch": 0.2475, "grad_norm": 12.6875, "grad_norm_var": 0.48826497395833335, "learning_rate": 0.0003, "loss": 11.5865, "loss/aux_loss": 0.04809128176420927, "loss/crossentropy": 2.571286141872406, "loss/logits": 0.8443722426891327, "step": 24750 }, { "epoch": 0.2476, "grad_norm": 12.875, "grad_norm_var": 1.0113932291666667, "learning_rate": 0.0003, "loss": 11.5163, "loss/aux_loss": 0.04808931332081556, "loss/crossentropy": 2.765368914604187, "loss/logits": 0.8564152508974076, "step": 24760 }, { "epoch": 0.2477, "grad_norm": 12.5625, "grad_norm_var": 0.372119140625, "learning_rate": 0.0003, "loss": 11.5343, "loss/aux_loss": 0.04808468669652939, "loss/crossentropy": 2.847411096096039, "loss/logits": 0.9346977740526199, "step": 24770 }, { "epoch": 0.2478, "grad_norm": 11.875, "grad_norm_var": 0.7286458333333333, "learning_rate": 0.0003, "loss": 11.6268, "loss/aux_loss": 0.0480878546833992, "loss/crossentropy": 2.6432011306285856, "loss/logits": 0.8749868780374527, "step": 24780 }, { "epoch": 0.2479, "grad_norm": 12.875, "grad_norm_var": 0.5903483072916667, "learning_rate": 0.0003, "loss": 11.6199, "loss/aux_loss": 0.04810009114444256, "loss/crossentropy": 2.712999904155731, "loss/logits": 0.8862002283334732, "step": 24790 }, { "epoch": 0.248, "grad_norm": 12.125, "grad_norm_var": 0.41730143229166666, "learning_rate": 0.0003, "loss": 11.6145, "loss/aux_loss": 0.048082103952765465, "loss/crossentropy": 2.768636167049408, "loss/logits": 0.862320426106453, "step": 24800 }, { "epoch": 0.2481, "grad_norm": 12.875, "grad_norm_var": 0.48370768229166666, "learning_rate": 0.0003, "loss": 11.7063, "loss/aux_loss": 0.048084338754415513, "loss/crossentropy": 2.689415818452835, "loss/logits": 0.8696422547101974, "step": 24810 }, { "epoch": 0.2482, "grad_norm": 13.375, "grad_norm_var": 0.5311848958333333, "learning_rate": 0.0003, "loss": 11.7926, "loss/aux_loss": 0.04808789137750864, "loss/crossentropy": 2.7690405011177064, "loss/logits": 0.8735806256532669, "step": 24820 }, { "epoch": 0.2483, "grad_norm": 12.875, "grad_norm_var": 0.32745768229166666, "learning_rate": 0.0003, "loss": 11.5478, "loss/aux_loss": 0.04808993488550186, "loss/crossentropy": 2.7007332861423494, "loss/logits": 0.9016262739896774, "step": 24830 }, { "epoch": 0.2484, "grad_norm": 13.25, "grad_norm_var": 693.0523274739584, "learning_rate": 0.0003, "loss": 11.5585, "loss/aux_loss": 0.04810011051595211, "loss/crossentropy": 2.5810685038566588, "loss/logits": 0.864134407043457, "step": 24840 }, { "epoch": 0.2485, "grad_norm": 12.8125, "grad_norm_var": 0.8244140625, "learning_rate": 0.0003, "loss": 11.6735, "loss/aux_loss": 0.048092559725046155, "loss/crossentropy": 2.752824580669403, "loss/logits": 0.8686655551195145, "step": 24850 }, { "epoch": 0.2486, "grad_norm": 12.9375, "grad_norm_var": 0.671337890625, "learning_rate": 0.0003, "loss": 11.8462, "loss/aux_loss": 0.04807847458869219, "loss/crossentropy": 2.9063711762428284, "loss/logits": 0.9188533276319504, "step": 24860 }, { "epoch": 0.2487, "grad_norm": 12.4375, "grad_norm_var": 0.27459309895833334, "learning_rate": 0.0003, "loss": 11.644, "loss/aux_loss": 0.04809844493865967, "loss/crossentropy": 2.7648052334785462, "loss/logits": 0.867266783118248, "step": 24870 }, { "epoch": 0.2488, "grad_norm": 13.6875, "grad_norm_var": 3.9888020833333333, "learning_rate": 0.0003, "loss": 11.5867, "loss/aux_loss": 0.04808073379099369, "loss/crossentropy": 2.7701613664627076, "loss/logits": 0.9056573390960694, "step": 24880 }, { "epoch": 0.2489, "grad_norm": 11.9375, "grad_norm_var": 2.978385416666667, "learning_rate": 0.0003, "loss": 11.5873, "loss/aux_loss": 0.04808578360825777, "loss/crossentropy": 2.8290310978889464, "loss/logits": 0.8755116432905197, "step": 24890 }, { "epoch": 0.249, "grad_norm": 13.625, "grad_norm_var": 0.8264973958333334, "learning_rate": 0.0003, "loss": 11.6796, "loss/aux_loss": 0.04808125514537096, "loss/crossentropy": 2.816509687900543, "loss/logits": 0.9027190536260605, "step": 24900 }, { "epoch": 0.2491, "grad_norm": 12.0625, "grad_norm_var": 0.8419270833333333, "learning_rate": 0.0003, "loss": 11.5386, "loss/aux_loss": 0.0480922881513834, "loss/crossentropy": 2.76520716547966, "loss/logits": 0.892632269859314, "step": 24910 }, { "epoch": 0.2492, "grad_norm": 11.75, "grad_norm_var": 0.30193684895833334, "learning_rate": 0.0003, "loss": 11.693, "loss/aux_loss": 0.0480814853683114, "loss/crossentropy": 2.720960557460785, "loss/logits": 0.8856659799814224, "step": 24920 }, { "epoch": 0.2493, "grad_norm": 14.25, "grad_norm_var": 0.8380208333333333, "learning_rate": 0.0003, "loss": 11.5816, "loss/aux_loss": 0.04809188954532147, "loss/crossentropy": 2.646233397722244, "loss/logits": 0.8720506697893142, "step": 24930 }, { "epoch": 0.2494, "grad_norm": 12.0625, "grad_norm_var": 1.5056640625, "learning_rate": 0.0003, "loss": 11.7634, "loss/aux_loss": 0.04808452129364014, "loss/crossentropy": 2.83508540391922, "loss/logits": 0.8792938023805619, "step": 24940 }, { "epoch": 0.2495, "grad_norm": 12.875, "grad_norm_var": 0.49733072916666665, "learning_rate": 0.0003, "loss": 11.4464, "loss/aux_loss": 0.048088039830327035, "loss/crossentropy": 2.7586292266845702, "loss/logits": 0.8813098579645157, "step": 24950 }, { "epoch": 0.2496, "grad_norm": 13.4375, "grad_norm_var": 1.2320149739583333, "learning_rate": 0.0003, "loss": 11.6449, "loss/aux_loss": 0.04809652119874954, "loss/crossentropy": 2.7922417759895324, "loss/logits": 0.9288152068853378, "step": 24960 }, { "epoch": 0.2497, "grad_norm": 13.0625, "grad_norm_var": 0.31027018229166664, "learning_rate": 0.0003, "loss": 11.5653, "loss/aux_loss": 0.048079832829535006, "loss/crossentropy": 2.713176792860031, "loss/logits": 0.8755482017993927, "step": 24970 }, { "epoch": 0.2498, "grad_norm": 12.625, "grad_norm_var": 0.5158854166666667, "learning_rate": 0.0003, "loss": 11.6771, "loss/aux_loss": 0.04808032140135765, "loss/crossentropy": 2.844007110595703, "loss/logits": 0.8661886304616928, "step": 24980 }, { "epoch": 0.2499, "grad_norm": 11.625, "grad_norm_var": 0.5384765625, "learning_rate": 0.0003, "loss": 11.4978, "loss/aux_loss": 0.048091298528015615, "loss/crossentropy": 2.7953576683998107, "loss/logits": 0.8499617218971253, "step": 24990 }, { "epoch": 0.25, "grad_norm": 12.875, "grad_norm_var": 0.9852701822916666, "learning_rate": 0.0003, "loss": 11.5883, "loss/aux_loss": 0.04808867685496807, "loss/crossentropy": 2.528530162572861, "loss/logits": 0.854027372598648, "step": 25000 }, { "epoch": 0.2501, "grad_norm": 11.25, "grad_norm_var": 0.9353515625, "learning_rate": 0.0003, "loss": 11.6363, "loss/aux_loss": 0.04808705858886242, "loss/crossentropy": 2.843946361541748, "loss/logits": 0.8924509882926941, "step": 25010 }, { "epoch": 0.2502, "grad_norm": 12.3125, "grad_norm_var": 0.35349934895833335, "learning_rate": 0.0003, "loss": 11.8235, "loss/aux_loss": 0.04808267503976822, "loss/crossentropy": 2.891792821884155, "loss/logits": 0.9047430366277694, "step": 25020 }, { "epoch": 0.2503, "grad_norm": 12.75, "grad_norm_var": 0.2886555989583333, "learning_rate": 0.0003, "loss": 11.6444, "loss/aux_loss": 0.048091215640306474, "loss/crossentropy": 2.904352879524231, "loss/logits": 0.877800577878952, "step": 25030 }, { "epoch": 0.2504, "grad_norm": 12.1875, "grad_norm_var": 0.33318684895833334, "learning_rate": 0.0003, "loss": 11.6522, "loss/aux_loss": 0.04808731395751238, "loss/crossentropy": 2.7148699164390564, "loss/logits": 0.8997314661741257, "step": 25040 }, { "epoch": 0.2505, "grad_norm": 13.8125, "grad_norm_var": 0.37303059895833335, "learning_rate": 0.0003, "loss": 11.8294, "loss/aux_loss": 0.04808868896216154, "loss/crossentropy": 2.790895849466324, "loss/logits": 0.9084702879190445, "step": 25050 }, { "epoch": 0.2506, "grad_norm": 12.3125, "grad_norm_var": 0.36920572916666666, "learning_rate": 0.0003, "loss": 11.7443, "loss/aux_loss": 0.04809939563274383, "loss/crossentropy": 2.911231255531311, "loss/logits": 0.8896218776702881, "step": 25060 }, { "epoch": 0.2507, "grad_norm": 13.0, "grad_norm_var": 0.3450520833333333, "learning_rate": 0.0003, "loss": 11.5574, "loss/aux_loss": 0.048084039054811, "loss/crossentropy": 2.8057745695114136, "loss/logits": 0.8932328909635544, "step": 25070 }, { "epoch": 0.2508, "grad_norm": 12.6875, "grad_norm_var": 0.4593587239583333, "learning_rate": 0.0003, "loss": 11.8264, "loss/aux_loss": 0.04808835387229919, "loss/crossentropy": 2.9282448649406434, "loss/logits": 0.9194880992174148, "step": 25080 }, { "epoch": 0.2509, "grad_norm": 11.8125, "grad_norm_var": 0.40167643229166666, "learning_rate": 0.0003, "loss": 11.5529, "loss/aux_loss": 0.04807906914502382, "loss/crossentropy": 2.532350409030914, "loss/logits": 0.8764607399702072, "step": 25090 }, { "epoch": 0.251, "grad_norm": 12.25, "grad_norm_var": 0.2752604166666667, "learning_rate": 0.0003, "loss": 11.5749, "loss/aux_loss": 0.048076278157532217, "loss/crossentropy": 2.7150728702545166, "loss/logits": 0.8551843196153641, "step": 25100 }, { "epoch": 0.2511, "grad_norm": 12.375, "grad_norm_var": 0.14368489583333333, "learning_rate": 0.0003, "loss": 11.6707, "loss/aux_loss": 0.048083152808249, "loss/crossentropy": 2.6560796737670898, "loss/logits": 0.8625768065452576, "step": 25110 }, { "epoch": 0.2512, "grad_norm": 12.375, "grad_norm_var": 0.238525390625, "learning_rate": 0.0003, "loss": 11.31, "loss/aux_loss": 0.04808560535311699, "loss/crossentropy": 2.692982625961304, "loss/logits": 0.8346902966499329, "step": 25120 }, { "epoch": 0.2513, "grad_norm": 12.9375, "grad_norm_var": 0.13326822916666667, "learning_rate": 0.0003, "loss": 11.6084, "loss/aux_loss": 0.04809413086622953, "loss/crossentropy": 2.8028789699077605, "loss/logits": 0.8861746788024902, "step": 25130 }, { "epoch": 0.2514, "grad_norm": 13.5, "grad_norm_var": 0.17068684895833333, "learning_rate": 0.0003, "loss": 11.7255, "loss/aux_loss": 0.04807902295142412, "loss/crossentropy": 2.77418338060379, "loss/logits": 0.8591089010238647, "step": 25140 }, { "epoch": 0.2515, "grad_norm": 12.0, "grad_norm_var": 14.376497395833333, "learning_rate": 0.0003, "loss": 11.5784, "loss/aux_loss": 0.04810504075139761, "loss/crossentropy": 2.6118695974349975, "loss/logits": 0.8479388684034348, "step": 25150 }, { "epoch": 0.2516, "grad_norm": 12.0625, "grad_norm_var": 14.047119140625, "learning_rate": 0.0003, "loss": 11.6321, "loss/aux_loss": 0.04807586278766394, "loss/crossentropy": 2.8019371032714844, "loss/logits": 0.9008934259414673, "step": 25160 }, { "epoch": 0.2517, "grad_norm": 12.625, "grad_norm_var": 5.012955729166666, "learning_rate": 0.0003, "loss": 11.6511, "loss/aux_loss": 0.048094934225082396, "loss/crossentropy": 2.6834351480007173, "loss/logits": 0.8760968536138535, "step": 25170 }, { "epoch": 0.2518, "grad_norm": 13.3125, "grad_norm_var": 2.4541015625, "learning_rate": 0.0003, "loss": 11.7221, "loss/aux_loss": 0.04808460958302021, "loss/crossentropy": 2.778984820842743, "loss/logits": 0.8968686580657959, "step": 25180 }, { "epoch": 0.2519, "grad_norm": 12.5625, "grad_norm_var": 0.365625, "learning_rate": 0.0003, "loss": 11.6415, "loss/aux_loss": 0.04808416347950697, "loss/crossentropy": 2.6761425912380217, "loss/logits": 0.8864099949598312, "step": 25190 }, { "epoch": 0.252, "grad_norm": 12.6875, "grad_norm_var": 0.19620768229166666, "learning_rate": 0.0003, "loss": 11.7456, "loss/aux_loss": 0.04808526486158371, "loss/crossentropy": 2.809593695402145, "loss/logits": 0.9087226182222367, "step": 25200 }, { "epoch": 0.2521, "grad_norm": 12.875, "grad_norm_var": 0.4205729166666667, "learning_rate": 0.0003, "loss": 11.649, "loss/aux_loss": 0.04808737710118294, "loss/crossentropy": 2.8328150868415833, "loss/logits": 0.8956249594688416, "step": 25210 }, { "epoch": 0.2522, "grad_norm": 11.8125, "grad_norm_var": 0.3546223958333333, "learning_rate": 0.0003, "loss": 11.6388, "loss/aux_loss": 0.048089083097875115, "loss/crossentropy": 2.736672604084015, "loss/logits": 0.8746443182229996, "step": 25220 }, { "epoch": 0.2523, "grad_norm": 12.8125, "grad_norm_var": 0.3624348958333333, "learning_rate": 0.0003, "loss": 11.6174, "loss/aux_loss": 0.04808070510625839, "loss/crossentropy": 3.001026463508606, "loss/logits": 0.8746398031711579, "step": 25230 }, { "epoch": 0.2524, "grad_norm": 12.4375, "grad_norm_var": 3.6433430989583333, "learning_rate": 0.0003, "loss": 11.7027, "loss/aux_loss": 0.048089549690485, "loss/crossentropy": 2.7485376834869384, "loss/logits": 0.8798061728477478, "step": 25240 }, { "epoch": 0.2525, "grad_norm": 15.25, "grad_norm_var": 1.2581868489583334, "learning_rate": 0.0003, "loss": 11.8481, "loss/aux_loss": 0.048088116385042665, "loss/crossentropy": 2.907766008377075, "loss/logits": 0.927979850769043, "step": 25250 }, { "epoch": 0.2526, "grad_norm": 12.75, "grad_norm_var": 0.674853515625, "learning_rate": 0.0003, "loss": 11.5283, "loss/aux_loss": 0.04809806887060404, "loss/crossentropy": 2.813064420223236, "loss/logits": 0.8849809437990188, "step": 25260 }, { "epoch": 0.2527, "grad_norm": 13.5625, "grad_norm_var": 0.222119140625, "learning_rate": 0.0003, "loss": 11.4406, "loss/aux_loss": 0.048092986829578875, "loss/crossentropy": 2.680801993608475, "loss/logits": 0.8623011440038681, "step": 25270 }, { "epoch": 0.2528, "grad_norm": 12.5625, "grad_norm_var": 0.28541666666666665, "learning_rate": 0.0003, "loss": 11.6218, "loss/aux_loss": 0.04807704258710146, "loss/crossentropy": 2.6295208811759947, "loss/logits": 0.8690453052520752, "step": 25280 }, { "epoch": 0.2529, "grad_norm": 13.0, "grad_norm_var": 0.35154622395833335, "learning_rate": 0.0003, "loss": 11.6239, "loss/aux_loss": 0.048089309222996236, "loss/crossentropy": 2.666023552417755, "loss/logits": 0.8957912296056747, "step": 25290 }, { "epoch": 0.253, "grad_norm": 12.875, "grad_norm_var": 0.26666666666666666, "learning_rate": 0.0003, "loss": 11.4241, "loss/aux_loss": 0.048083115555346015, "loss/crossentropy": 2.83921422958374, "loss/logits": 0.883840236067772, "step": 25300 }, { "epoch": 0.2531, "grad_norm": 12.625, "grad_norm_var": 0.5619791666666667, "learning_rate": 0.0003, "loss": 11.6852, "loss/aux_loss": 0.048087958991527555, "loss/crossentropy": 2.8075567483901978, "loss/logits": 0.8912134855985642, "step": 25310 }, { "epoch": 0.2532, "grad_norm": 12.0625, "grad_norm_var": 0.5716145833333334, "learning_rate": 0.0003, "loss": 11.6948, "loss/aux_loss": 0.048075183667242526, "loss/crossentropy": 2.834882414340973, "loss/logits": 0.9086032390594483, "step": 25320 }, { "epoch": 0.2533, "grad_norm": 12.0, "grad_norm_var": 0.39993489583333336, "learning_rate": 0.0003, "loss": 11.5648, "loss/aux_loss": 0.04808804150670767, "loss/crossentropy": 2.7690355598926546, "loss/logits": 0.8761381387710572, "step": 25330 }, { "epoch": 0.2534, "grad_norm": 12.8125, "grad_norm_var": 208.065625, "learning_rate": 0.0003, "loss": 11.8033, "loss/aux_loss": 0.048099280893802644, "loss/crossentropy": 2.8881313681602476, "loss/logits": 0.9373392134904861, "step": 25340 }, { "epoch": 0.2535, "grad_norm": 12.4375, "grad_norm_var": 0.7734212239583333, "learning_rate": 0.0003, "loss": 11.6623, "loss/aux_loss": 0.04808682128787041, "loss/crossentropy": 2.7554580926895142, "loss/logits": 0.8729925930500031, "step": 25350 }, { "epoch": 0.2536, "grad_norm": 12.125, "grad_norm_var": 0.687744140625, "learning_rate": 0.0003, "loss": 11.6251, "loss/aux_loss": 0.048085390403866765, "loss/crossentropy": 2.6890475332736967, "loss/logits": 0.8627552896738052, "step": 25360 }, { "epoch": 0.2537, "grad_norm": 12.9375, "grad_norm_var": 0.4130045572916667, "learning_rate": 0.0003, "loss": 11.6627, "loss/aux_loss": 0.04808287601917982, "loss/crossentropy": 2.9235028862953185, "loss/logits": 0.9073904246091843, "step": 25370 }, { "epoch": 0.2538, "grad_norm": 12.875, "grad_norm_var": 0.6108723958333333, "learning_rate": 0.0003, "loss": 11.6099, "loss/aux_loss": 0.0480959203094244, "loss/crossentropy": 2.6273869574069977, "loss/logits": 0.8745059370994568, "step": 25380 }, { "epoch": 0.2539, "grad_norm": 12.0625, "grad_norm_var": 0.6755208333333333, "learning_rate": 0.0003, "loss": 11.6251, "loss/aux_loss": 0.04807964153587818, "loss/crossentropy": 2.7656370401382446, "loss/logits": 0.8815957188606263, "step": 25390 }, { "epoch": 0.254, "grad_norm": 12.8125, "grad_norm_var": 4.477018229166666, "learning_rate": 0.0003, "loss": 11.8206, "loss/aux_loss": 0.048090039566159246, "loss/crossentropy": 2.8786653161048887, "loss/logits": 0.8921761155128479, "step": 25400 }, { "epoch": 0.2541, "grad_norm": 13.0625, "grad_norm_var": 0.45514322916666666, "learning_rate": 0.0003, "loss": 11.6351, "loss/aux_loss": 0.048083982057869436, "loss/crossentropy": 2.8235522508621216, "loss/logits": 0.914072972536087, "step": 25410 }, { "epoch": 0.2542, "grad_norm": 12.5, "grad_norm_var": 0.38671875, "learning_rate": 0.0003, "loss": 11.5798, "loss/aux_loss": 0.04809177853167057, "loss/crossentropy": 2.6220124840736387, "loss/logits": 0.8807148039340973, "step": 25420 }, { "epoch": 0.2543, "grad_norm": 12.875, "grad_norm_var": 0.16027018229166667, "learning_rate": 0.0003, "loss": 11.6279, "loss/aux_loss": 0.04809038415551185, "loss/crossentropy": 2.70097331404686, "loss/logits": 0.8611227154731751, "step": 25430 }, { "epoch": 0.2544, "grad_norm": 15.0625, "grad_norm_var": 1.840869140625, "learning_rate": 0.0003, "loss": 11.5793, "loss/aux_loss": 0.04807671457529068, "loss/crossentropy": 2.6667848229408264, "loss/logits": 0.8680727303028106, "step": 25440 }, { "epoch": 0.2545, "grad_norm": 12.3125, "grad_norm_var": 2.2770670572916667, "learning_rate": 0.0003, "loss": 11.5694, "loss/aux_loss": 0.04809817839413881, "loss/crossentropy": 2.905455070734024, "loss/logits": 0.9042976140975952, "step": 25450 }, { "epoch": 0.2546, "grad_norm": 12.125, "grad_norm_var": 1.07265625, "learning_rate": 0.0003, "loss": 11.7353, "loss/aux_loss": 0.04808375872671604, "loss/crossentropy": 2.9227493464946748, "loss/logits": 0.9188271731138229, "step": 25460 }, { "epoch": 0.2547, "grad_norm": 13.75, "grad_norm_var": 1.6091145833333333, "learning_rate": 0.0003, "loss": 11.8679, "loss/aux_loss": 0.048084712401032445, "loss/crossentropy": 2.888567340373993, "loss/logits": 0.8932348757982254, "step": 25470 }, { "epoch": 0.2548, "grad_norm": 12.125, "grad_norm_var": 2.026416015625, "learning_rate": 0.0003, "loss": 11.6388, "loss/aux_loss": 0.04808471277356148, "loss/crossentropy": 2.767711889743805, "loss/logits": 0.8723369985818863, "step": 25480 }, { "epoch": 0.2549, "grad_norm": 13.0, "grad_norm_var": 0.26458333333333334, "learning_rate": 0.0003, "loss": 11.6305, "loss/aux_loss": 0.048084079287946224, "loss/crossentropy": 2.7936369240283967, "loss/logits": 0.9237923324108124, "step": 25490 }, { "epoch": 0.255, "grad_norm": 12.8125, "grad_norm_var": 0.2837890625, "learning_rate": 0.0003, "loss": 11.5733, "loss/aux_loss": 0.04808622878044844, "loss/crossentropy": 2.749346935749054, "loss/logits": 0.9049636125564575, "step": 25500 }, { "epoch": 0.2551, "grad_norm": 11.3125, "grad_norm_var": 0.6278483072916666, "learning_rate": 0.0003, "loss": 11.6192, "loss/aux_loss": 0.048095759376883505, "loss/crossentropy": 2.862506020069122, "loss/logits": 0.9004225820302963, "step": 25510 }, { "epoch": 0.2552, "grad_norm": 13.4375, "grad_norm_var": 0.5340983072916666, "learning_rate": 0.0003, "loss": 11.6151, "loss/aux_loss": 0.048074688762426376, "loss/crossentropy": 2.6472591876983644, "loss/logits": 0.8818151533603669, "step": 25520 }, { "epoch": 0.2553, "grad_norm": 12.5625, "grad_norm_var": 0.26764322916666666, "learning_rate": 0.0003, "loss": 11.7054, "loss/aux_loss": 0.048086194694042204, "loss/crossentropy": 2.7406247556209564, "loss/logits": 0.868209832906723, "step": 25530 }, { "epoch": 0.2554, "grad_norm": 12.375, "grad_norm_var": 0.19036458333333334, "learning_rate": 0.0003, "loss": 11.6693, "loss/aux_loss": 0.04809033088386059, "loss/crossentropy": 2.64967337846756, "loss/logits": 0.8649856716394424, "step": 25540 }, { "epoch": 0.2555, "grad_norm": 14.0625, "grad_norm_var": 0.5400390625, "learning_rate": 0.0003, "loss": 11.6214, "loss/aux_loss": 0.04808993134647608, "loss/crossentropy": 2.640416944026947, "loss/logits": 0.85947944521904, "step": 25550 }, { "epoch": 0.2556, "grad_norm": 15.0625, "grad_norm_var": 0.6822265625, "learning_rate": 0.0003, "loss": 11.5651, "loss/aux_loss": 0.04808539263904095, "loss/crossentropy": 2.7396462678909304, "loss/logits": 0.8959068596363068, "step": 25560 }, { "epoch": 0.2557, "grad_norm": 11.5625, "grad_norm_var": 1.1291666666666667, "learning_rate": 0.0003, "loss": 11.3876, "loss/aux_loss": 0.048090565018355846, "loss/crossentropy": 2.7080013751983643, "loss/logits": 0.8827524572610855, "step": 25570 }, { "epoch": 0.2558, "grad_norm": 12.375, "grad_norm_var": 1.1947916666666667, "learning_rate": 0.0003, "loss": 11.7393, "loss/aux_loss": 0.0480854069814086, "loss/crossentropy": 2.8358673214912415, "loss/logits": 0.9099754065275192, "step": 25580 }, { "epoch": 0.2559, "grad_norm": 12.625, "grad_norm_var": 0.8205729166666667, "learning_rate": 0.0003, "loss": 11.5309, "loss/aux_loss": 0.048086575232446194, "loss/crossentropy": 2.6382982313632963, "loss/logits": 0.8928437829017639, "step": 25590 }, { "epoch": 0.256, "grad_norm": 12.4375, "grad_norm_var": 0.19763997395833333, "learning_rate": 0.0003, "loss": 11.5599, "loss/aux_loss": 0.0480822779238224, "loss/crossentropy": 2.821625292301178, "loss/logits": 0.8573364794254303, "step": 25600 }, { "epoch": 0.2561, "grad_norm": 12.625, "grad_norm_var": 0.8898274739583333, "learning_rate": 0.0003, "loss": 11.5878, "loss/aux_loss": 0.04809480402618647, "loss/crossentropy": 2.7832887768745422, "loss/logits": 0.8507031291723252, "step": 25610 }, { "epoch": 0.2562, "grad_norm": 13.125, "grad_norm_var": 0.9608723958333333, "learning_rate": 0.0003, "loss": 11.5664, "loss/aux_loss": 0.04807593021541834, "loss/crossentropy": 2.731532007455826, "loss/logits": 0.8629953473806381, "step": 25620 }, { "epoch": 0.2563, "grad_norm": 11.8125, "grad_norm_var": 0.45558268229166665, "learning_rate": 0.0003, "loss": 11.759, "loss/aux_loss": 0.048087543621659276, "loss/crossentropy": 2.854618912935257, "loss/logits": 0.9120256692171097, "step": 25630 }, { "epoch": 0.2564, "grad_norm": 11.5625, "grad_norm_var": 0.35390625, "learning_rate": 0.0003, "loss": 11.6293, "loss/aux_loss": 0.04808175507932901, "loss/crossentropy": 2.798936349153519, "loss/logits": 0.8882687538862228, "step": 25640 }, { "epoch": 0.2565, "grad_norm": 12.3125, "grad_norm_var": 1.4824055989583333, "learning_rate": 0.0003, "loss": 11.5947, "loss/aux_loss": 0.04808369390666485, "loss/crossentropy": 2.745366007089615, "loss/logits": 0.8873745143413544, "step": 25650 }, { "epoch": 0.2566, "grad_norm": 13.75, "grad_norm_var": 1.8715983072916667, "learning_rate": 0.0003, "loss": 11.6324, "loss/aux_loss": 0.04809246361255646, "loss/crossentropy": 2.9094391226768495, "loss/logits": 0.890197029709816, "step": 25660 }, { "epoch": 0.2567, "grad_norm": 13.5, "grad_norm_var": 0.9082682291666667, "learning_rate": 0.0003, "loss": 11.5822, "loss/aux_loss": 0.04809121619910002, "loss/crossentropy": 2.8318686723709106, "loss/logits": 0.8528442829847336, "step": 25670 }, { "epoch": 0.2568, "grad_norm": 13.0, "grad_norm_var": 0.335400390625, "learning_rate": 0.0003, "loss": 11.6176, "loss/aux_loss": 0.04808668699115515, "loss/crossentropy": 2.590553969144821, "loss/logits": 0.8228752464056015, "step": 25680 }, { "epoch": 0.2569, "grad_norm": 12.9375, "grad_norm_var": 0.34427083333333336, "learning_rate": 0.0003, "loss": 11.8269, "loss/aux_loss": 0.04809065740555525, "loss/crossentropy": 2.8544575750827788, "loss/logits": 0.8806323766708374, "step": 25690 }, { "epoch": 0.257, "grad_norm": 13.3125, "grad_norm_var": 14.973177083333333, "learning_rate": 0.0003, "loss": 11.6281, "loss/aux_loss": 0.048080390132963655, "loss/crossentropy": 2.8158641815185548, "loss/logits": 0.9019149035215378, "step": 25700 }, { "epoch": 0.2571, "grad_norm": 14.5625, "grad_norm_var": 14.9875, "learning_rate": 0.0003, "loss": 11.7727, "loss/aux_loss": 0.04808665197342634, "loss/crossentropy": 2.745079851150513, "loss/logits": 0.9046284675598144, "step": 25710 }, { "epoch": 0.2572, "grad_norm": 13.4375, "grad_norm_var": 1.815869140625, "learning_rate": 0.0003, "loss": 11.6452, "loss/aux_loss": 0.048082825168967246, "loss/crossentropy": 2.678209352493286, "loss/logits": 0.8738999456167221, "step": 25720 }, { "epoch": 0.2573, "grad_norm": 12.25, "grad_norm_var": 0.8992024739583333, "learning_rate": 0.0003, "loss": 11.5744, "loss/aux_loss": 0.04809469617903232, "loss/crossentropy": 2.65126051902771, "loss/logits": 0.8762364238500595, "step": 25730 }, { "epoch": 0.2574, "grad_norm": 13.25, "grad_norm_var": 0.5166015625, "learning_rate": 0.0003, "loss": 11.5408, "loss/aux_loss": 0.04808213766664267, "loss/crossentropy": 2.7597130656242372, "loss/logits": 0.8789749413728714, "step": 25740 }, { "epoch": 0.2575, "grad_norm": 12.875, "grad_norm_var": 0.3753743489583333, "learning_rate": 0.0003, "loss": 11.5427, "loss/aux_loss": 0.0480932829901576, "loss/crossentropy": 2.783565378189087, "loss/logits": 0.8482154309749603, "step": 25750 }, { "epoch": 0.2576, "grad_norm": 13.5, "grad_norm_var": 0.131103515625, "learning_rate": 0.0003, "loss": 11.6673, "loss/aux_loss": 0.04809057265520096, "loss/crossentropy": 2.7950429677963258, "loss/logits": 0.8746149778366089, "step": 25760 }, { "epoch": 0.2577, "grad_norm": 13.375, "grad_norm_var": 0.22864583333333333, "learning_rate": 0.0003, "loss": 11.7949, "loss/aux_loss": 0.04808306787163019, "loss/crossentropy": 2.7718122243881225, "loss/logits": 0.9080736309289932, "step": 25770 }, { "epoch": 0.2578, "grad_norm": 13.4375, "grad_norm_var": 0.50625, "learning_rate": 0.0003, "loss": 11.6395, "loss/aux_loss": 0.04808392096310854, "loss/crossentropy": 2.849545049667358, "loss/logits": 0.9059659481048584, "step": 25780 }, { "epoch": 0.2579, "grad_norm": 13.375, "grad_norm_var": 0.4791666666666667, "learning_rate": 0.0003, "loss": 11.6708, "loss/aux_loss": 0.048084107041358945, "loss/crossentropy": 2.835736083984375, "loss/logits": 0.8999375015497207, "step": 25790 }, { "epoch": 0.258, "grad_norm": 13.625, "grad_norm_var": 0.36638997395833334, "learning_rate": 0.0003, "loss": 11.6178, "loss/aux_loss": 0.048090285435318944, "loss/crossentropy": 2.8020094275474547, "loss/logits": 0.8916181594133377, "step": 25800 }, { "epoch": 0.2581, "grad_norm": 12.375, "grad_norm_var": 0.3465983072916667, "learning_rate": 0.0003, "loss": 11.4354, "loss/aux_loss": 0.04808872099965811, "loss/crossentropy": 2.652283215522766, "loss/logits": 0.8868981301784515, "step": 25810 }, { "epoch": 0.2582, "grad_norm": 12.75, "grad_norm_var": 0.18787434895833333, "learning_rate": 0.0003, "loss": 11.6028, "loss/aux_loss": 0.04808285553008318, "loss/crossentropy": 2.6281425058841705, "loss/logits": 0.8745265692472458, "step": 25820 }, { "epoch": 0.2583, "grad_norm": 13.3125, "grad_norm_var": 0.23671875, "learning_rate": 0.0003, "loss": 11.7574, "loss/aux_loss": 0.048094558902084826, "loss/crossentropy": 2.900680327415466, "loss/logits": 0.8990511387586594, "step": 25830 }, { "epoch": 0.2584, "grad_norm": 13.25, "grad_norm_var": 12.904166666666667, "learning_rate": 0.0003, "loss": 11.5865, "loss/aux_loss": 0.0480920797213912, "loss/crossentropy": 2.7239894449710844, "loss/logits": 0.8879533141851426, "step": 25840 }, { "epoch": 0.2585, "grad_norm": 14.3125, "grad_norm_var": 12.797379557291666, "learning_rate": 0.0003, "loss": 11.4818, "loss/aux_loss": 0.0480831490829587, "loss/crossentropy": 2.729900598526001, "loss/logits": 0.8634632736444473, "step": 25850 }, { "epoch": 0.2586, "grad_norm": 13.5625, "grad_norm_var": 0.5343098958333333, "learning_rate": 0.0003, "loss": 11.6186, "loss/aux_loss": 0.048087738640606406, "loss/crossentropy": 2.9004143357276915, "loss/logits": 0.9239776521921158, "step": 25860 }, { "epoch": 0.2587, "grad_norm": 11.9375, "grad_norm_var": 0.45286458333333335, "learning_rate": 0.0003, "loss": 11.5712, "loss/aux_loss": 0.04809346161782742, "loss/crossentropy": 2.6097486078739167, "loss/logits": 0.8526429146528244, "step": 25870 }, { "epoch": 0.2588, "grad_norm": 14.25, "grad_norm_var": 0.6369140625, "learning_rate": 0.0003, "loss": 11.6677, "loss/aux_loss": 0.04809065666049719, "loss/crossentropy": 2.687888467311859, "loss/logits": 0.857833543419838, "step": 25880 }, { "epoch": 0.2589, "grad_norm": 13.5, "grad_norm_var": 0.5983723958333333, "learning_rate": 0.0003, "loss": 11.7092, "loss/aux_loss": 0.04807872045785189, "loss/crossentropy": 2.8037472486495973, "loss/logits": 0.8581808120012283, "step": 25890 }, { "epoch": 0.259, "grad_norm": 12.5625, "grad_norm_var": 0.6403483072916667, "learning_rate": 0.0003, "loss": 11.6719, "loss/aux_loss": 0.04809915721416473, "loss/crossentropy": 2.8186557054519654, "loss/logits": 0.8756254881620407, "step": 25900 }, { "epoch": 0.2591, "grad_norm": 12.5625, "grad_norm_var": 0.6071451822916667, "learning_rate": 0.0003, "loss": 11.6912, "loss/aux_loss": 0.04807936530560255, "loss/crossentropy": 2.649220663309097, "loss/logits": 0.8669601738452911, "step": 25910 }, { "epoch": 0.2592, "grad_norm": 13.1875, "grad_norm_var": 0.38136393229166665, "learning_rate": 0.0003, "loss": 11.5527, "loss/aux_loss": 0.04808104652911425, "loss/crossentropy": 2.690031635761261, "loss/logits": 0.8827971279621124, "step": 25920 }, { "epoch": 0.2593, "grad_norm": 12.75, "grad_norm_var": 0.2999348958333333, "learning_rate": 0.0003, "loss": 11.5134, "loss/aux_loss": 0.04808360021561384, "loss/crossentropy": 2.661074286699295, "loss/logits": 0.8508890032768249, "step": 25930 }, { "epoch": 0.2594, "grad_norm": 11.9375, "grad_norm_var": 0.33541666666666664, "learning_rate": 0.0003, "loss": 11.6292, "loss/aux_loss": 0.04808128103613853, "loss/crossentropy": 2.8447935700416567, "loss/logits": 0.9187151223421097, "step": 25940 }, { "epoch": 0.2595, "grad_norm": 13.1875, "grad_norm_var": 67.79713541666666, "learning_rate": 0.0003, "loss": 11.6673, "loss/aux_loss": 0.04809282161295414, "loss/crossentropy": 2.7433357715606688, "loss/logits": 0.8858923971652984, "step": 25950 }, { "epoch": 0.2596, "grad_norm": 12.5625, "grad_norm_var": 67.778369140625, "learning_rate": 0.0003, "loss": 11.5163, "loss/aux_loss": 0.04808433465659619, "loss/crossentropy": 2.951106083393097, "loss/logits": 0.8900610029697418, "step": 25960 }, { "epoch": 0.2597, "grad_norm": 12.875, "grad_norm_var": 0.17355143229166667, "learning_rate": 0.0003, "loss": 11.6586, "loss/aux_loss": 0.048085383325815204, "loss/crossentropy": 2.7705465078353884, "loss/logits": 0.873355257511139, "step": 25970 }, { "epoch": 0.2598, "grad_norm": 12.375, "grad_norm_var": 0.315478515625, "learning_rate": 0.0003, "loss": 11.5712, "loss/aux_loss": 0.04809144306927919, "loss/crossentropy": 2.6246292769908903, "loss/logits": 0.8274095565080642, "step": 25980 }, { "epoch": 0.2599, "grad_norm": 12.5625, "grad_norm_var": 0.2718098958333333, "learning_rate": 0.0003, "loss": 11.6178, "loss/aux_loss": 0.04809448383748531, "loss/crossentropy": 2.7689568281173704, "loss/logits": 0.8811231583356858, "step": 25990 }, { "epoch": 0.26, "grad_norm": 12.6875, "grad_norm_var": 0.435400390625, "learning_rate": 0.0003, "loss": 11.5572, "loss/aux_loss": 0.04808851294219494, "loss/crossentropy": 2.837930643558502, "loss/logits": 0.8556112885475159, "step": 26000 }, { "epoch": 0.2601, "grad_norm": 12.75, "grad_norm_var": 0.48951822916666665, "learning_rate": 0.0003, "loss": 11.4797, "loss/aux_loss": 0.04808756932616234, "loss/crossentropy": 2.7235122978687287, "loss/logits": 0.846416375041008, "step": 26010 }, { "epoch": 0.2602, "grad_norm": 12.75, "grad_norm_var": 0.22369791666666666, "learning_rate": 0.0003, "loss": 11.4731, "loss/aux_loss": 0.04808957688510418, "loss/crossentropy": 2.7550492763519285, "loss/logits": 0.8788383483886719, "step": 26020 }, { "epoch": 0.2603, "grad_norm": 11.6875, "grad_norm_var": 0.282275390625, "learning_rate": 0.0003, "loss": 11.447, "loss/aux_loss": 0.04809210356324911, "loss/crossentropy": 2.713481593132019, "loss/logits": 0.8958616107702255, "step": 26030 }, { "epoch": 0.2604, "grad_norm": 14.5625, "grad_norm_var": 0.9640625, "learning_rate": 0.0003, "loss": 11.7818, "loss/aux_loss": 0.04807912241667509, "loss/crossentropy": 2.9397593259811403, "loss/logits": 0.8796298056840897, "step": 26040 }, { "epoch": 0.2605, "grad_norm": 13.375, "grad_norm_var": 0.8477701822916667, "learning_rate": 0.0003, "loss": 11.5145, "loss/aux_loss": 0.048083426989614965, "loss/crossentropy": 2.6227431118488314, "loss/logits": 0.9062366098165512, "step": 26050 }, { "epoch": 0.2606, "grad_norm": 11.0625, "grad_norm_var": 0.5407389322916667, "learning_rate": 0.0003, "loss": 11.7285, "loss/aux_loss": 0.048087958618998526, "loss/crossentropy": 2.8171847641468046, "loss/logits": 0.8737149238586426, "step": 26060 }, { "epoch": 0.2607, "grad_norm": 13.125, "grad_norm_var": 0.6838541666666667, "learning_rate": 0.0003, "loss": 11.591, "loss/aux_loss": 0.04808755628764629, "loss/crossentropy": 2.6662391245365145, "loss/logits": 0.8623921722173691, "step": 26070 }, { "epoch": 0.2608, "grad_norm": 12.875, "grad_norm_var": 0.23904622395833333, "learning_rate": 0.0003, "loss": 11.3801, "loss/aux_loss": 0.04808433074504137, "loss/crossentropy": 2.7102749705314637, "loss/logits": 0.8698142647743226, "step": 26080 }, { "epoch": 0.2609, "grad_norm": 12.625, "grad_norm_var": 0.2552083333333333, "learning_rate": 0.0003, "loss": 11.6073, "loss/aux_loss": 0.048082430846989155, "loss/crossentropy": 2.717993235588074, "loss/logits": 0.9071961134672165, "step": 26090 }, { "epoch": 0.261, "grad_norm": 12.8125, "grad_norm_var": 0.5120930989583333, "learning_rate": 0.0003, "loss": 11.5483, "loss/aux_loss": 0.048091440461575985, "loss/crossentropy": 2.642581123113632, "loss/logits": 0.8533870339393616, "step": 26100 }, { "epoch": 0.2611, "grad_norm": 13.0625, "grad_norm_var": 0.5610514322916667, "learning_rate": 0.0003, "loss": 11.5351, "loss/aux_loss": 0.04807432852685452, "loss/crossentropy": 2.754035633802414, "loss/logits": 0.8514756739139557, "step": 26110 }, { "epoch": 0.2612, "grad_norm": 12.75, "grad_norm_var": 0.4058430989583333, "learning_rate": 0.0003, "loss": 11.5255, "loss/aux_loss": 0.048089875094592574, "loss/crossentropy": 2.742984265089035, "loss/logits": 0.8871752589941024, "step": 26120 }, { "epoch": 0.2613, "grad_norm": 11.0, "grad_norm_var": 0.46640625, "learning_rate": 0.0003, "loss": 11.619, "loss/aux_loss": 0.04808270148932934, "loss/crossentropy": 2.87599663734436, "loss/logits": 0.887411966919899, "step": 26130 }, { "epoch": 0.2614, "grad_norm": 13.875, "grad_norm_var": 0.481494140625, "learning_rate": 0.0003, "loss": 11.7214, "loss/aux_loss": 0.04808396827429533, "loss/crossentropy": 2.8104442954063416, "loss/logits": 0.9045211106538773, "step": 26140 }, { "epoch": 0.2615, "grad_norm": 12.5, "grad_norm_var": 0.326025390625, "learning_rate": 0.0003, "loss": 11.5677, "loss/aux_loss": 0.048081147111952306, "loss/crossentropy": 2.7364437103271486, "loss/logits": 0.8579443216323852, "step": 26150 }, { "epoch": 0.2616, "grad_norm": 12.5, "grad_norm_var": 0.48587239583333336, "learning_rate": 0.0003, "loss": 11.7239, "loss/aux_loss": 0.04808290041983128, "loss/crossentropy": 2.8139244556427, "loss/logits": 0.9172647058963775, "step": 26160 }, { "epoch": 0.2617, "grad_norm": 12.375, "grad_norm_var": 0.2626139322916667, "learning_rate": 0.0003, "loss": 11.6432, "loss/aux_loss": 0.04808450732380152, "loss/crossentropy": 2.843486213684082, "loss/logits": 0.8691521465778351, "step": 26170 }, { "epoch": 0.2618, "grad_norm": 13.1875, "grad_norm_var": 0.27615559895833336, "learning_rate": 0.0003, "loss": 11.6097, "loss/aux_loss": 0.04808285385370255, "loss/crossentropy": 2.858822929859161, "loss/logits": 0.9088696330785752, "step": 26180 }, { "epoch": 0.2619, "grad_norm": 13.25, "grad_norm_var": 5.26015625, "learning_rate": 0.0003, "loss": 11.6214, "loss/aux_loss": 0.048090561851859094, "loss/crossentropy": 2.7624371886253356, "loss/logits": 0.8909125179052353, "step": 26190 }, { "epoch": 0.262, "grad_norm": 12.6875, "grad_norm_var": 0.24112955729166666, "learning_rate": 0.0003, "loss": 11.5151, "loss/aux_loss": 0.04809354785829782, "loss/crossentropy": 2.7231006741523744, "loss/logits": 0.8609473258256912, "step": 26200 }, { "epoch": 0.2621, "grad_norm": 12.625, "grad_norm_var": 0.21354166666666666, "learning_rate": 0.0003, "loss": 11.5397, "loss/aux_loss": 0.048093443363904954, "loss/crossentropy": 2.785054862499237, "loss/logits": 0.8880521357059479, "step": 26210 }, { "epoch": 0.2622, "grad_norm": 14.5625, "grad_norm_var": 0.42849934895833336, "learning_rate": 0.0003, "loss": 11.6959, "loss/aux_loss": 0.04808842465281486, "loss/crossentropy": 2.5711670517921448, "loss/logits": 0.8366287380456925, "step": 26220 }, { "epoch": 0.2623, "grad_norm": 13.125, "grad_norm_var": 0.404931640625, "learning_rate": 0.0003, "loss": 11.5461, "loss/aux_loss": 0.04808579571545124, "loss/crossentropy": 2.6578098058700563, "loss/logits": 0.826743358373642, "step": 26230 }, { "epoch": 0.2624, "grad_norm": 12.5, "grad_norm_var": 0.44212239583333335, "learning_rate": 0.0003, "loss": 11.7337, "loss/aux_loss": 0.04809788726270199, "loss/crossentropy": 2.923408627510071, "loss/logits": 0.9030967593193054, "step": 26240 }, { "epoch": 0.2625, "grad_norm": 12.375, "grad_norm_var": 0.3395182291666667, "learning_rate": 0.0003, "loss": 11.5184, "loss/aux_loss": 0.04807977341115475, "loss/crossentropy": 2.7566078901290894, "loss/logits": 0.8837383359670639, "step": 26250 }, { "epoch": 0.2626, "grad_norm": 12.3125, "grad_norm_var": 0.50546875, "learning_rate": 0.0003, "loss": 11.4649, "loss/aux_loss": 0.04808521345257759, "loss/crossentropy": 2.762271225452423, "loss/logits": 0.849093359708786, "step": 26260 }, { "epoch": 0.2627, "grad_norm": 12.9375, "grad_norm_var": 0.6986979166666667, "learning_rate": 0.0003, "loss": 11.5331, "loss/aux_loss": 0.04808817598968744, "loss/crossentropy": 2.7467776775360107, "loss/logits": 0.875149542093277, "step": 26270 }, { "epoch": 0.2628, "grad_norm": 13.8125, "grad_norm_var": 0.4869140625, "learning_rate": 0.0003, "loss": 11.5921, "loss/aux_loss": 0.04808767884969711, "loss/crossentropy": 2.7188161492347716, "loss/logits": 0.8897079229354858, "step": 26280 }, { "epoch": 0.2629, "grad_norm": 14.5625, "grad_norm_var": 5.152018229166667, "learning_rate": 0.0003, "loss": 11.5294, "loss/aux_loss": 0.048097186163067816, "loss/crossentropy": 2.8574177980422975, "loss/logits": 0.8552042752504349, "step": 26290 }, { "epoch": 0.263, "grad_norm": 12.625, "grad_norm_var": 5.252978515625, "learning_rate": 0.0003, "loss": 11.7339, "loss/aux_loss": 0.04809401351958513, "loss/crossentropy": 2.900586748123169, "loss/logits": 0.9175508260726929, "step": 26300 }, { "epoch": 0.2631, "grad_norm": 12.4375, "grad_norm_var": 0.36300455729166664, "learning_rate": 0.0003, "loss": 11.8741, "loss/aux_loss": 0.04807520993053913, "loss/crossentropy": 2.955962133407593, "loss/logits": 0.8903858751058579, "step": 26310 }, { "epoch": 0.2632, "grad_norm": 13.375, "grad_norm_var": 0.33006184895833335, "learning_rate": 0.0003, "loss": 11.6969, "loss/aux_loss": 0.04809126146137714, "loss/crossentropy": 2.8080713510513307, "loss/logits": 0.8494821518659592, "step": 26320 }, { "epoch": 0.2633, "grad_norm": 12.1875, "grad_norm_var": 0.5207682291666667, "learning_rate": 0.0003, "loss": 11.6019, "loss/aux_loss": 0.04808663856238127, "loss/crossentropy": 2.8642470121383665, "loss/logits": 0.869270795583725, "step": 26330 }, { "epoch": 0.2634, "grad_norm": 12.5625, "grad_norm_var": 0.13592122395833334, "learning_rate": 0.0003, "loss": 11.6974, "loss/aux_loss": 0.04808529764413834, "loss/crossentropy": 2.7711823523044585, "loss/logits": 0.89097281396389, "step": 26340 }, { "epoch": 0.2635, "grad_norm": 11.9375, "grad_norm_var": 0.22342122395833333, "learning_rate": 0.0003, "loss": 11.6506, "loss/aux_loss": 0.04808674547821283, "loss/crossentropy": 2.8846903085708617, "loss/logits": 0.8763628363609314, "step": 26350 }, { "epoch": 0.2636, "grad_norm": 12.3125, "grad_norm_var": 0.3492024739583333, "learning_rate": 0.0003, "loss": 11.6322, "loss/aux_loss": 0.048084713704884055, "loss/crossentropy": 2.593327397108078, "loss/logits": 0.8438139349222183, "step": 26360 }, { "epoch": 0.2637, "grad_norm": 11.875, "grad_norm_var": 0.308056640625, "learning_rate": 0.0003, "loss": 11.7669, "loss/aux_loss": 0.048082432709634304, "loss/crossentropy": 2.8110472738742827, "loss/logits": 0.9003132045269012, "step": 26370 }, { "epoch": 0.2638, "grad_norm": 12.0625, "grad_norm_var": 0.28177083333333336, "learning_rate": 0.0003, "loss": 11.6746, "loss/aux_loss": 0.04809358026832342, "loss/crossentropy": 2.8158118963241576, "loss/logits": 0.8962694942951203, "step": 26380 }, { "epoch": 0.2639, "grad_norm": 14.3125, "grad_norm_var": 0.706103515625, "learning_rate": 0.0003, "loss": 11.6306, "loss/aux_loss": 0.048089638352394104, "loss/crossentropy": 2.935623896121979, "loss/logits": 0.8567668348550797, "step": 26390 }, { "epoch": 0.264, "grad_norm": 12.4375, "grad_norm_var": 0.8358723958333333, "learning_rate": 0.0003, "loss": 11.6547, "loss/aux_loss": 0.04807798489928246, "loss/crossentropy": 2.7165545761585235, "loss/logits": 0.8737062573432922, "step": 26400 }, { "epoch": 0.2641, "grad_norm": 12.1875, "grad_norm_var": 0.24270833333333333, "learning_rate": 0.0003, "loss": 11.5237, "loss/aux_loss": 0.048086860589683054, "loss/crossentropy": 2.733562481403351, "loss/logits": 0.8268854200839997, "step": 26410 }, { "epoch": 0.2642, "grad_norm": 14.125, "grad_norm_var": 0.341650390625, "learning_rate": 0.0003, "loss": 11.6531, "loss/aux_loss": 0.048096845485270025, "loss/crossentropy": 2.649859589338303, "loss/logits": 0.8604597598314285, "step": 26420 }, { "epoch": 0.2643, "grad_norm": 13.5, "grad_norm_var": 0.2969889322916667, "learning_rate": 0.0003, "loss": 11.4639, "loss/aux_loss": 0.0480915080755949, "loss/crossentropy": 2.7871899247169494, "loss/logits": 0.8752772063016891, "step": 26430 }, { "epoch": 0.2644, "grad_norm": 13.4375, "grad_norm_var": 0.21451822916666666, "learning_rate": 0.0003, "loss": 11.6634, "loss/aux_loss": 0.04808179382234812, "loss/crossentropy": 2.647487211227417, "loss/logits": 0.8647993594408036, "step": 26440 }, { "epoch": 0.2645, "grad_norm": 13.0625, "grad_norm_var": 0.248681640625, "learning_rate": 0.0003, "loss": 11.5747, "loss/aux_loss": 0.0480790950357914, "loss/crossentropy": 2.751650595664978, "loss/logits": 0.9247609049081802, "step": 26450 }, { "epoch": 0.2646, "grad_norm": 12.375, "grad_norm_var": 0.15052083333333333, "learning_rate": 0.0003, "loss": 11.3959, "loss/aux_loss": 0.04808844346553087, "loss/crossentropy": 2.733932113647461, "loss/logits": 0.854560700058937, "step": 26460 }, { "epoch": 0.2647, "grad_norm": 13.0625, "grad_norm_var": 1.4692057291666667, "learning_rate": 0.0003, "loss": 11.6134, "loss/aux_loss": 0.04808341935276985, "loss/crossentropy": 2.7537252068519593, "loss/logits": 0.8283010810613632, "step": 26470 }, { "epoch": 0.2648, "grad_norm": 12.625, "grad_norm_var": 0.579541015625, "learning_rate": 0.0003, "loss": 11.6445, "loss/aux_loss": 0.04808064494282007, "loss/crossentropy": 2.715360426902771, "loss/logits": 0.8789581745862961, "step": 26480 }, { "epoch": 0.2649, "grad_norm": 13.5625, "grad_norm_var": 0.43683268229166666, "learning_rate": 0.0003, "loss": 11.5003, "loss/aux_loss": 0.04808939713984728, "loss/crossentropy": 2.752163290977478, "loss/logits": 0.8825481355190277, "step": 26490 }, { "epoch": 0.265, "grad_norm": 12.1875, "grad_norm_var": 0.44138997395833335, "learning_rate": 0.0003, "loss": 11.4298, "loss/aux_loss": 0.04808378238230944, "loss/crossentropy": 2.66518457531929, "loss/logits": 0.8795353204011918, "step": 26500 }, { "epoch": 0.2651, "grad_norm": 13.375, "grad_norm_var": 1.1955729166666667, "learning_rate": 0.0003, "loss": 11.5447, "loss/aux_loss": 0.048084886930882934, "loss/crossentropy": 2.7104385554790498, "loss/logits": 0.8602730393409729, "step": 26510 }, { "epoch": 0.2652, "grad_norm": 12.1875, "grad_norm_var": 1.5015462239583333, "learning_rate": 0.0003, "loss": 11.5465, "loss/aux_loss": 0.04808354377746582, "loss/crossentropy": 2.6698466658592226, "loss/logits": 0.8915825933218002, "step": 26520 }, { "epoch": 0.2653, "grad_norm": 13.25, "grad_norm_var": 0.8618326822916667, "learning_rate": 0.0003, "loss": 11.3179, "loss/aux_loss": 0.04808184951543808, "loss/crossentropy": 2.778967833518982, "loss/logits": 0.8605857610702514, "step": 26530 }, { "epoch": 0.2654, "grad_norm": 12.4375, "grad_norm_var": 0.7886555989583334, "learning_rate": 0.0003, "loss": 11.6822, "loss/aux_loss": 0.04807785861194134, "loss/crossentropy": 2.9185683012008665, "loss/logits": 0.876692533493042, "step": 26540 }, { "epoch": 0.2655, "grad_norm": 12.9375, "grad_norm_var": 0.366650390625, "learning_rate": 0.0003, "loss": 11.6013, "loss/aux_loss": 0.04808231629431248, "loss/crossentropy": 2.7370326638221742, "loss/logits": 0.867209044098854, "step": 26550 }, { "epoch": 0.2656, "grad_norm": 14.125, "grad_norm_var": 0.47784830729166666, "learning_rate": 0.0003, "loss": 11.5959, "loss/aux_loss": 0.0480883315205574, "loss/crossentropy": 2.799232506752014, "loss/logits": 0.8930830955505371, "step": 26560 }, { "epoch": 0.2657, "grad_norm": 12.125, "grad_norm_var": 0.6108723958333333, "learning_rate": 0.0003, "loss": 11.6437, "loss/aux_loss": 0.04807663895189762, "loss/crossentropy": 2.7677125334739685, "loss/logits": 0.9062906086444855, "step": 26570 }, { "epoch": 0.2658, "grad_norm": 12.375, "grad_norm_var": 0.232275390625, "learning_rate": 0.0003, "loss": 11.4253, "loss/aux_loss": 0.04809227138757706, "loss/crossentropy": 2.684408128261566, "loss/logits": 0.878808343410492, "step": 26580 }, { "epoch": 0.2659, "grad_norm": 13.75, "grad_norm_var": 0.4923014322916667, "learning_rate": 0.0003, "loss": 11.5427, "loss/aux_loss": 0.04807818587869406, "loss/crossentropy": 2.9242777824401855, "loss/logits": 0.8927222698926925, "step": 26590 }, { "epoch": 0.266, "grad_norm": 13.625, "grad_norm_var": 0.483056640625, "learning_rate": 0.0003, "loss": 11.5401, "loss/aux_loss": 0.04808124490082264, "loss/crossentropy": 2.8354394733905792, "loss/logits": 0.8955871939659119, "step": 26600 }, { "epoch": 0.2661, "grad_norm": 13.6875, "grad_norm_var": 0.9447916666666667, "learning_rate": 0.0003, "loss": 11.5013, "loss/aux_loss": 0.04808798339217901, "loss/crossentropy": 2.9905380249023437, "loss/logits": 0.9007535576820374, "step": 26610 }, { "epoch": 0.2662, "grad_norm": 11.875, "grad_norm_var": 0.6120930989583333, "learning_rate": 0.0003, "loss": 11.4842, "loss/aux_loss": 0.04808419458568096, "loss/crossentropy": 2.6949995160102844, "loss/logits": 0.8447135239839554, "step": 26620 }, { "epoch": 0.2663, "grad_norm": 11.5, "grad_norm_var": 0.5355305989583333, "learning_rate": 0.0003, "loss": 11.5274, "loss/aux_loss": 0.048078814335167405, "loss/crossentropy": 2.6906064808368684, "loss/logits": 0.9114213407039642, "step": 26630 }, { "epoch": 0.2664, "grad_norm": 15.625, "grad_norm_var": 0.9187337239583333, "learning_rate": 0.0003, "loss": 11.6276, "loss/aux_loss": 0.048075567744672296, "loss/crossentropy": 2.8382048666477204, "loss/logits": 0.8896595865488053, "step": 26640 }, { "epoch": 0.2665, "grad_norm": 12.9375, "grad_norm_var": 0.5497395833333333, "learning_rate": 0.0003, "loss": 11.7462, "loss/aux_loss": 0.048085755482316014, "loss/crossentropy": 2.9384037852287292, "loss/logits": 0.8757476270198822, "step": 26650 }, { "epoch": 0.2666, "grad_norm": 14.375, "grad_norm_var": 0.6030598958333333, "learning_rate": 0.0003, "loss": 11.578, "loss/aux_loss": 0.04808356873691082, "loss/crossentropy": 2.7069952189922333, "loss/logits": 0.8372669726610183, "step": 26660 }, { "epoch": 0.2667, "grad_norm": 13.6875, "grad_norm_var": 0.6320149739583333, "learning_rate": 0.0003, "loss": 11.6305, "loss/aux_loss": 0.04808407332748175, "loss/crossentropy": 2.866211920976639, "loss/logits": 0.926874178647995, "step": 26670 }, { "epoch": 0.2668, "grad_norm": 12.4375, "grad_norm_var": 0.42649739583333335, "learning_rate": 0.0003, "loss": 11.512, "loss/aux_loss": 0.048087192699313164, "loss/crossentropy": 2.779119974374771, "loss/logits": 0.8863234221935272, "step": 26680 }, { "epoch": 0.2669, "grad_norm": 12.25, "grad_norm_var": 5.164957682291667, "learning_rate": 0.0003, "loss": 11.529, "loss/aux_loss": 0.048087614588439465, "loss/crossentropy": 2.786206305027008, "loss/logits": 0.8752316683530807, "step": 26690 }, { "epoch": 0.267, "grad_norm": 14.1875, "grad_norm_var": 0.3640625, "learning_rate": 0.0003, "loss": 11.6246, "loss/aux_loss": 0.048085883259773254, "loss/crossentropy": 3.017382597923279, "loss/logits": 0.9026335388422012, "step": 26700 }, { "epoch": 0.2671, "grad_norm": 12.75, "grad_norm_var": 0.36847330729166666, "learning_rate": 0.0003, "loss": 11.7263, "loss/aux_loss": 0.04808237832039595, "loss/crossentropy": 2.7911539018154143, "loss/logits": 0.8946028083562851, "step": 26710 }, { "epoch": 0.2672, "grad_norm": 12.5625, "grad_norm_var": 0.3453125, "learning_rate": 0.0003, "loss": 11.5712, "loss/aux_loss": 0.04808432050049305, "loss/crossentropy": 2.7174662709236146, "loss/logits": 0.862998154759407, "step": 26720 }, { "epoch": 0.2673, "grad_norm": 14.4375, "grad_norm_var": 0.49609375, "learning_rate": 0.0003, "loss": 11.697, "loss/aux_loss": 0.04808508008718491, "loss/crossentropy": 2.8050862312316895, "loss/logits": 0.8875089168548584, "step": 26730 }, { "epoch": 0.2674, "grad_norm": 13.5, "grad_norm_var": 0.591650390625, "learning_rate": 0.0003, "loss": 11.6684, "loss/aux_loss": 0.04808866996318102, "loss/crossentropy": 2.8705193996429443, "loss/logits": 0.8535552382469177, "step": 26740 }, { "epoch": 0.2675, "grad_norm": 12.5625, "grad_norm_var": 0.2669270833333333, "learning_rate": 0.0003, "loss": 11.584, "loss/aux_loss": 0.04808246474713087, "loss/crossentropy": 2.6758610129356386, "loss/logits": 0.8840056896209717, "step": 26750 }, { "epoch": 0.2676, "grad_norm": 12.25, "grad_norm_var": 0.265625, "learning_rate": 0.0003, "loss": 11.5109, "loss/aux_loss": 0.04808396678417921, "loss/crossentropy": 2.7247723996639253, "loss/logits": 0.8758624956011772, "step": 26760 }, { "epoch": 0.2677, "grad_norm": 13.3125, "grad_norm_var": 0.5591145833333333, "learning_rate": 0.0003, "loss": 11.5339, "loss/aux_loss": 0.04808528199791908, "loss/crossentropy": 2.6667077600955964, "loss/logits": 0.8680305898189544, "step": 26770 }, { "epoch": 0.2678, "grad_norm": 12.125, "grad_norm_var": 0.2708333333333333, "learning_rate": 0.0003, "loss": 11.5729, "loss/aux_loss": 0.048083195276558396, "loss/crossentropy": 2.646883499622345, "loss/logits": 0.8598526418209076, "step": 26780 }, { "epoch": 0.2679, "grad_norm": 12.5, "grad_norm_var": 0.31573893229166666, "learning_rate": 0.0003, "loss": 11.4182, "loss/aux_loss": 0.04808151088654995, "loss/crossentropy": 2.8521530270576476, "loss/logits": 0.8823621451854706, "step": 26790 }, { "epoch": 0.268, "grad_norm": 12.6875, "grad_norm_var": 0.250634765625, "learning_rate": 0.0003, "loss": 11.442, "loss/aux_loss": 0.0480816463008523, "loss/crossentropy": 2.7866445600986482, "loss/logits": 0.8612229824066162, "step": 26800 }, { "epoch": 0.2681, "grad_norm": 12.75, "grad_norm_var": 0.379931640625, "learning_rate": 0.0003, "loss": 11.5301, "loss/aux_loss": 0.048094099201261994, "loss/crossentropy": 2.8141289949417114, "loss/logits": 0.8547409534454345, "step": 26810 }, { "epoch": 0.2682, "grad_norm": 13.6875, "grad_norm_var": 0.29933268229166665, "learning_rate": 0.0003, "loss": 11.7054, "loss/aux_loss": 0.048088458552956584, "loss/crossentropy": 2.8167191982269286, "loss/logits": 0.9029836922883987, "step": 26820 }, { "epoch": 0.2683, "grad_norm": 12.75, "grad_norm_var": 0.539697265625, "learning_rate": 0.0003, "loss": 11.5224, "loss/aux_loss": 0.0480873117223382, "loss/crossentropy": 2.6812549769878387, "loss/logits": 0.8772442221641541, "step": 26830 }, { "epoch": 0.2684, "grad_norm": 13.1875, "grad_norm_var": 0.6906087239583333, "learning_rate": 0.0003, "loss": 11.565, "loss/aux_loss": 0.04808258228003979, "loss/crossentropy": 2.7515992164611816, "loss/logits": 0.879136735200882, "step": 26840 }, { "epoch": 0.2685, "grad_norm": 12.75, "grad_norm_var": 1.274462890625, "learning_rate": 0.0003, "loss": 11.6051, "loss/aux_loss": 0.048084885254502295, "loss/crossentropy": 2.586995255947113, "loss/logits": 0.8740639716386795, "step": 26850 }, { "epoch": 0.2686, "grad_norm": 13.5, "grad_norm_var": 0.2515625, "learning_rate": 0.0003, "loss": 11.389, "loss/aux_loss": 0.0480819696560502, "loss/crossentropy": 2.7247099459171293, "loss/logits": 0.8685719013214112, "step": 26860 }, { "epoch": 0.2687, "grad_norm": 12.375, "grad_norm_var": 72.13385416666667, "learning_rate": 0.0003, "loss": 11.6388, "loss/aux_loss": 0.04809704162180424, "loss/crossentropy": 2.6560521006584166, "loss/logits": 0.8820772796869278, "step": 26870 }, { "epoch": 0.2688, "grad_norm": 12.9375, "grad_norm_var": 0.1853515625, "learning_rate": 0.0003, "loss": 11.6079, "loss/aux_loss": 0.04808767940849066, "loss/crossentropy": 2.5908863723278044, "loss/logits": 0.8665450185537338, "step": 26880 }, { "epoch": 0.2689, "grad_norm": 13.125, "grad_norm_var": 0.6077473958333334, "learning_rate": 0.0003, "loss": 11.6409, "loss/aux_loss": 0.048092026449739936, "loss/crossentropy": 2.7218611598014832, "loss/logits": 0.8763234496116639, "step": 26890 }, { "epoch": 0.269, "grad_norm": 13.0625, "grad_norm_var": 0.35323893229166664, "learning_rate": 0.0003, "loss": 11.674, "loss/aux_loss": 0.04808721747249365, "loss/crossentropy": 2.7273535072803496, "loss/logits": 0.8680451363325119, "step": 26900 }, { "epoch": 0.2691, "grad_norm": 11.75, "grad_norm_var": 2.484635416666667, "learning_rate": 0.0003, "loss": 11.6283, "loss/aux_loss": 0.04808290395885706, "loss/crossentropy": 2.7846306562423706, "loss/logits": 0.909109690785408, "step": 26910 }, { "epoch": 0.2692, "grad_norm": 12.25, "grad_norm_var": 0.7494140625, "learning_rate": 0.0003, "loss": 11.3449, "loss/aux_loss": 0.04808285906910896, "loss/crossentropy": 2.779471981525421, "loss/logits": 0.8647771954536438, "step": 26920 }, { "epoch": 0.2693, "grad_norm": 13.375, "grad_norm_var": 40.298160807291666, "learning_rate": 0.0003, "loss": 11.5546, "loss/aux_loss": 0.0480886047706008, "loss/crossentropy": 2.7506704151630403, "loss/logits": 0.8976017504930496, "step": 26930 }, { "epoch": 0.2694, "grad_norm": 13.4375, "grad_norm_var": 0.459619140625, "learning_rate": 0.0003, "loss": 11.59, "loss/aux_loss": 0.04809030499309301, "loss/crossentropy": 2.809163624048233, "loss/logits": 0.9011189788579941, "step": 26940 }, { "epoch": 0.2695, "grad_norm": 13.3125, "grad_norm_var": 0.374462890625, "learning_rate": 0.0003, "loss": 11.6427, "loss/aux_loss": 0.04809495285153389, "loss/crossentropy": 2.747515672445297, "loss/logits": 0.8646955370903016, "step": 26950 }, { "epoch": 0.2696, "grad_norm": 13.0625, "grad_norm_var": 0.414306640625, "learning_rate": 0.0003, "loss": 11.4118, "loss/aux_loss": 0.048085729405283925, "loss/crossentropy": 2.776648241281509, "loss/logits": 0.8744494527578354, "step": 26960 }, { "epoch": 0.2697, "grad_norm": 12.625, "grad_norm_var": 0.496337890625, "learning_rate": 0.0003, "loss": 11.6107, "loss/aux_loss": 0.0480810709297657, "loss/crossentropy": 2.7663753151893617, "loss/logits": 0.8697874486446381, "step": 26970 }, { "epoch": 0.2698, "grad_norm": 12.6875, "grad_norm_var": 0.36013997395833336, "learning_rate": 0.0003, "loss": 11.4819, "loss/aux_loss": 0.04809112492948771, "loss/crossentropy": 2.5756009936332704, "loss/logits": 0.832565313577652, "step": 26980 }, { "epoch": 0.2699, "grad_norm": 13.4375, "grad_norm_var": 0.3505045572916667, "learning_rate": 0.0003, "loss": 11.5591, "loss/aux_loss": 0.04808456730097532, "loss/crossentropy": 2.715240556001663, "loss/logits": 0.9006956547498703, "step": 26990 }, { "epoch": 0.27, "grad_norm": 12.6875, "grad_norm_var": 8.5306640625, "learning_rate": 0.0003, "loss": 11.3831, "loss/aux_loss": 0.04807609599083662, "loss/crossentropy": 2.7678284883499145, "loss/logits": 0.8785594999790192, "step": 27000 }, { "epoch": 0.2701, "grad_norm": 13.6875, "grad_norm_var": 0.16300455729166666, "learning_rate": 0.0003, "loss": 11.6529, "loss/aux_loss": 0.0480866638943553, "loss/crossentropy": 2.6905364990234375, "loss/logits": 0.8590237915515899, "step": 27010 }, { "epoch": 0.2702, "grad_norm": 12.4375, "grad_norm_var": 0.3159993489583333, "learning_rate": 0.0003, "loss": 11.6173, "loss/aux_loss": 0.048088221624493596, "loss/crossentropy": 2.8152272939682006, "loss/logits": 0.8619503259658814, "step": 27020 }, { "epoch": 0.2703, "grad_norm": 13.375, "grad_norm_var": 0.1962890625, "learning_rate": 0.0003, "loss": 11.6658, "loss/aux_loss": 0.04808411095291376, "loss/crossentropy": 2.842225217819214, "loss/logits": 0.8590496510267258, "step": 27030 }, { "epoch": 0.2704, "grad_norm": 13.125, "grad_norm_var": 0.583447265625, "learning_rate": 0.0003, "loss": 11.4641, "loss/aux_loss": 0.048092346824705604, "loss/crossentropy": 2.7975880026817324, "loss/logits": 0.8442713886499404, "step": 27040 }, { "epoch": 0.2705, "grad_norm": 12.625, "grad_norm_var": 0.9462076822916666, "learning_rate": 0.0003, "loss": 11.6847, "loss/aux_loss": 0.04807819910347462, "loss/crossentropy": 2.8695399880409242, "loss/logits": 0.9112765967845917, "step": 27050 }, { "epoch": 0.2706, "grad_norm": 12.6875, "grad_norm_var": 0.490869140625, "learning_rate": 0.0003, "loss": 11.6033, "loss/aux_loss": 0.048083419911563396, "loss/crossentropy": 2.5501754522323608, "loss/logits": 0.8562157094478607, "step": 27060 }, { "epoch": 0.2707, "grad_norm": 13.125, "grad_norm_var": 0.5098958333333333, "learning_rate": 0.0003, "loss": 11.768, "loss/aux_loss": 0.04808535445481539, "loss/crossentropy": 2.795117211341858, "loss/logits": 0.8820730477571488, "step": 27070 }, { "epoch": 0.2708, "grad_norm": 12.9375, "grad_norm_var": 0.5040201822916667, "learning_rate": 0.0003, "loss": 11.5046, "loss/aux_loss": 0.04808331392705441, "loss/crossentropy": 2.6431259870529176, "loss/logits": 0.8500302553176879, "step": 27080 }, { "epoch": 0.2709, "grad_norm": 12.9375, "grad_norm_var": 0.49724934895833334, "learning_rate": 0.0003, "loss": 11.5656, "loss/aux_loss": 0.04808337949216366, "loss/crossentropy": 2.9284351587295534, "loss/logits": 0.9117402613162995, "step": 27090 }, { "epoch": 0.271, "grad_norm": 14.0, "grad_norm_var": 0.35857747395833334, "learning_rate": 0.0003, "loss": 11.4482, "loss/aux_loss": 0.048084777966141704, "loss/crossentropy": 2.5703269481658935, "loss/logits": 0.8528429746627808, "step": 27100 }, { "epoch": 0.2711, "grad_norm": 12.5625, "grad_norm_var": 0.37916666666666665, "learning_rate": 0.0003, "loss": 11.4456, "loss/aux_loss": 0.04808557964861393, "loss/crossentropy": 2.910882604122162, "loss/logits": 0.9175168991088867, "step": 27110 }, { "epoch": 0.2712, "grad_norm": 14.0, "grad_norm_var": 0.4205729166666667, "learning_rate": 0.0003, "loss": 11.5441, "loss/aux_loss": 0.048091310635209085, "loss/crossentropy": 2.818699848651886, "loss/logits": 0.8743287414312363, "step": 27120 }, { "epoch": 0.2713, "grad_norm": 12.375, "grad_norm_var": 0.42389322916666666, "learning_rate": 0.0003, "loss": 11.433, "loss/aux_loss": 0.04808477144688368, "loss/crossentropy": 2.7713629007339478, "loss/logits": 0.8480364233255386, "step": 27130 }, { "epoch": 0.2714, "grad_norm": 12.3125, "grad_norm_var": 0.49270833333333336, "learning_rate": 0.0003, "loss": 11.4776, "loss/aux_loss": 0.048079821094870565, "loss/crossentropy": 2.791437101364136, "loss/logits": 0.8964347183704376, "step": 27140 }, { "epoch": 0.2715, "grad_norm": 12.75, "grad_norm_var": 0.180712890625, "learning_rate": 0.0003, "loss": 11.6727, "loss/aux_loss": 0.04808723647147417, "loss/crossentropy": 2.799302804470062, "loss/logits": 0.9013757139444352, "step": 27150 }, { "epoch": 0.2716, "grad_norm": 11.5, "grad_norm_var": 0.32732747395833334, "learning_rate": 0.0003, "loss": 11.5774, "loss/aux_loss": 0.04808212071657181, "loss/crossentropy": 2.7711844205856324, "loss/logits": 0.8806311905384063, "step": 27160 }, { "epoch": 0.2717, "grad_norm": 12.125, "grad_norm_var": 0.3702473958333333, "learning_rate": 0.0003, "loss": 11.6159, "loss/aux_loss": 0.04808286111801863, "loss/crossentropy": 2.7594713032245637, "loss/logits": 0.8914604634046555, "step": 27170 }, { "epoch": 0.2718, "grad_norm": 12.75, "grad_norm_var": 0.3030598958333333, "learning_rate": 0.0003, "loss": 11.6285, "loss/aux_loss": 0.04807987064123154, "loss/crossentropy": 2.8910335302352905, "loss/logits": 0.8775747418403625, "step": 27180 }, { "epoch": 0.2719, "grad_norm": 13.0, "grad_norm_var": 0.7816243489583333, "learning_rate": 0.0003, "loss": 11.5454, "loss/aux_loss": 0.048089956678450105, "loss/crossentropy": 2.863065016269684, "loss/logits": 0.8842020243406296, "step": 27190 }, { "epoch": 0.272, "grad_norm": 12.5, "grad_norm_var": 0.403759765625, "learning_rate": 0.0003, "loss": 11.6229, "loss/aux_loss": 0.048087510466575625, "loss/crossentropy": 2.7746796369552613, "loss/logits": 0.8964880555868149, "step": 27200 }, { "epoch": 0.2721, "grad_norm": 11.8125, "grad_norm_var": 0.4014973958333333, "learning_rate": 0.0003, "loss": 11.7884, "loss/aux_loss": 0.04808600451797247, "loss/crossentropy": 2.9064237117767333, "loss/logits": 0.9102618426084519, "step": 27210 }, { "epoch": 0.2722, "grad_norm": 14.1875, "grad_norm_var": 0.48899739583333335, "learning_rate": 0.0003, "loss": 11.5522, "loss/aux_loss": 0.04808280412107706, "loss/crossentropy": 2.8749096155166627, "loss/logits": 0.8867197275161743, "step": 27220 }, { "epoch": 0.2723, "grad_norm": 13.75, "grad_norm_var": 0.46868489583333334, "learning_rate": 0.0003, "loss": 11.6081, "loss/aux_loss": 0.04809102062135935, "loss/crossentropy": 2.806208127737045, "loss/logits": 0.9116105139255524, "step": 27230 }, { "epoch": 0.2724, "grad_norm": 14.25, "grad_norm_var": 1.3843098958333333, "learning_rate": 0.0003, "loss": 11.5575, "loss/aux_loss": 0.04807698503136635, "loss/crossentropy": 2.9286911368370054, "loss/logits": 0.8646731346845626, "step": 27240 }, { "epoch": 0.2725, "grad_norm": 13.125, "grad_norm_var": 1.3192057291666666, "learning_rate": 0.0003, "loss": 11.5521, "loss/aux_loss": 0.04809784200042486, "loss/crossentropy": 2.829662698507309, "loss/logits": 0.9164007723331451, "step": 27250 }, { "epoch": 0.2726, "grad_norm": 14.5, "grad_norm_var": 0.9409993489583334, "learning_rate": 0.0003, "loss": 11.4188, "loss/aux_loss": 0.04808580614626408, "loss/crossentropy": 2.5694850265979765, "loss/logits": 0.8638879209756851, "step": 27260 }, { "epoch": 0.2727, "grad_norm": 12.75, "grad_norm_var": 0.5489420572916667, "learning_rate": 0.0003, "loss": 11.5619, "loss/aux_loss": 0.04808543249964714, "loss/crossentropy": 2.8475801050662994, "loss/logits": 0.8850974351167679, "step": 27270 }, { "epoch": 0.2728, "grad_norm": 15.0, "grad_norm_var": 2.805712890625, "learning_rate": 0.0003, "loss": 11.5083, "loss/aux_loss": 0.04808332417160273, "loss/crossentropy": 2.787931036949158, "loss/logits": 0.9341086566448211, "step": 27280 }, { "epoch": 0.2729, "grad_norm": 13.125, "grad_norm_var": 2.713134765625, "learning_rate": 0.0003, "loss": 11.6033, "loss/aux_loss": 0.048095279932022096, "loss/crossentropy": 2.7088790059089662, "loss/logits": 0.8688720375299454, "step": 27290 }, { "epoch": 0.273, "grad_norm": 13.3125, "grad_norm_var": 1.1643229166666667, "learning_rate": 0.0003, "loss": 11.469, "loss/aux_loss": 0.04808818940073252, "loss/crossentropy": 2.6304342091083526, "loss/logits": 0.8546393603086472, "step": 27300 }, { "epoch": 0.2731, "grad_norm": 12.875, "grad_norm_var": 0.48170572916666665, "learning_rate": 0.0003, "loss": 11.4469, "loss/aux_loss": 0.048087147809565065, "loss/crossentropy": 2.6426671385765075, "loss/logits": 0.8355433255434036, "step": 27310 }, { "epoch": 0.2732, "grad_norm": 12.8125, "grad_norm_var": 0.2900390625, "learning_rate": 0.0003, "loss": 11.5179, "loss/aux_loss": 0.04808585401624441, "loss/crossentropy": 2.7217608451843263, "loss/logits": 0.8875219106674195, "step": 27320 }, { "epoch": 0.2733, "grad_norm": 12.8125, "grad_norm_var": 0.5149576822916667, "learning_rate": 0.0003, "loss": 11.5724, "loss/aux_loss": 0.04808675888925791, "loss/crossentropy": 2.8080302834510804, "loss/logits": 0.8594749808311463, "step": 27330 }, { "epoch": 0.2734, "grad_norm": 11.875, "grad_norm_var": 0.49947916666666664, "learning_rate": 0.0003, "loss": 11.4152, "loss/aux_loss": 0.048075826838612556, "loss/crossentropy": 2.8058014869689942, "loss/logits": 0.8780788242816925, "step": 27340 }, { "epoch": 0.2735, "grad_norm": 14.75, "grad_norm_var": 0.5893229166666667, "learning_rate": 0.0003, "loss": 11.516, "loss/aux_loss": 0.04808777756989002, "loss/crossentropy": 2.840649002790451, "loss/logits": 0.8928281188011169, "step": 27350 }, { "epoch": 0.2736, "grad_norm": 13.25, "grad_norm_var": 1.003759765625, "learning_rate": 0.0003, "loss": 11.5198, "loss/aux_loss": 0.048086445592343804, "loss/crossentropy": 2.6759494841098785, "loss/logits": 0.866712149977684, "step": 27360 }, { "epoch": 0.2737, "grad_norm": 12.0625, "grad_norm_var": 1.5731770833333334, "learning_rate": 0.0003, "loss": 11.5458, "loss/aux_loss": 0.048074633441865446, "loss/crossentropy": 2.805925118923187, "loss/logits": 0.8581642717123031, "step": 27370 }, { "epoch": 0.2738, "grad_norm": 13.4375, "grad_norm_var": 1.6359375, "learning_rate": 0.0003, "loss": 11.5177, "loss/aux_loss": 0.04809354469180107, "loss/crossentropy": 2.7967172265052795, "loss/logits": 0.8863823890686036, "step": 27380 }, { "epoch": 0.2739, "grad_norm": 12.0, "grad_norm_var": 0.6683430989583333, "learning_rate": 0.0003, "loss": 11.4643, "loss/aux_loss": 0.04809064380824566, "loss/crossentropy": 2.7077986001968384, "loss/logits": 0.8588582128286362, "step": 27390 }, { "epoch": 0.274, "grad_norm": 13.6875, "grad_norm_var": 0.38671875, "learning_rate": 0.0003, "loss": 11.5795, "loss/aux_loss": 0.048084030672907827, "loss/crossentropy": 2.750448912382126, "loss/logits": 0.8991485238075256, "step": 27400 }, { "epoch": 0.2741, "grad_norm": 13.75, "grad_norm_var": 0.5129557291666667, "learning_rate": 0.0003, "loss": 11.5598, "loss/aux_loss": 0.048072746582329275, "loss/crossentropy": 2.761543083190918, "loss/logits": 0.8485326498746872, "step": 27410 }, { "epoch": 0.2742, "grad_norm": 12.8125, "grad_norm_var": 0.4556640625, "learning_rate": 0.0003, "loss": 11.3436, "loss/aux_loss": 0.048084151931107044, "loss/crossentropy": 2.604575699567795, "loss/logits": 0.8465295255184173, "step": 27420 }, { "epoch": 0.2743, "grad_norm": 13.5625, "grad_norm_var": 0.5302083333333333, "learning_rate": 0.0003, "loss": 11.4867, "loss/aux_loss": 0.04809482246637344, "loss/crossentropy": 2.54980583190918, "loss/logits": 0.8440865933895111, "step": 27430 }, { "epoch": 0.2744, "grad_norm": 13.625, "grad_norm_var": 4.076285807291667, "learning_rate": 0.0003, "loss": 11.5182, "loss/aux_loss": 0.04807782378047705, "loss/crossentropy": 2.7580845236778258, "loss/logits": 0.882664081454277, "step": 27440 }, { "epoch": 0.2745, "grad_norm": 12.9375, "grad_norm_var": 0.379931640625, "learning_rate": 0.0003, "loss": 11.4551, "loss/aux_loss": 0.0480885649099946, "loss/crossentropy": 2.777578568458557, "loss/logits": 0.87208411693573, "step": 27450 }, { "epoch": 0.2746, "grad_norm": 12.6875, "grad_norm_var": 0.34427083333333336, "learning_rate": 0.0003, "loss": 11.4538, "loss/aux_loss": 0.04808246586471796, "loss/crossentropy": 2.8558852434158326, "loss/logits": 0.8860389828681946, "step": 27460 }, { "epoch": 0.2747, "grad_norm": 12.6875, "grad_norm_var": 0.2786458333333333, "learning_rate": 0.0003, "loss": 11.5559, "loss/aux_loss": 0.048081047087907794, "loss/crossentropy": 2.5927527368068697, "loss/logits": 0.8575126707553864, "step": 27470 }, { "epoch": 0.2748, "grad_norm": 13.25, "grad_norm_var": 0.42185872395833335, "learning_rate": 0.0003, "loss": 11.4521, "loss/aux_loss": 0.048087083548307416, "loss/crossentropy": 2.752536880970001, "loss/logits": 0.8572630852460861, "step": 27480 }, { "epoch": 0.2749, "grad_norm": 13.5625, "grad_norm_var": 0.46599934895833334, "learning_rate": 0.0003, "loss": 11.6157, "loss/aux_loss": 0.04808045290410519, "loss/crossentropy": 2.865806245803833, "loss/logits": 0.9061722487211228, "step": 27490 }, { "epoch": 0.275, "grad_norm": 12.5, "grad_norm_var": 0.45753580729166665, "learning_rate": 0.0003, "loss": 11.6082, "loss/aux_loss": 0.04807562381029129, "loss/crossentropy": 2.7103756070137024, "loss/logits": 0.9027006924152374, "step": 27500 }, { "epoch": 0.2751, "grad_norm": 12.5625, "grad_norm_var": 0.322119140625, "learning_rate": 0.0003, "loss": 11.5937, "loss/aux_loss": 0.04807604216039181, "loss/crossentropy": 2.841430151462555, "loss/logits": 0.8734793215990067, "step": 27510 }, { "epoch": 0.2752, "grad_norm": 13.625, "grad_norm_var": 1.0101399739583334, "learning_rate": 0.0003, "loss": 11.642, "loss/aux_loss": 0.048089314438402656, "loss/crossentropy": 2.8229639172554015, "loss/logits": 0.8981556743383408, "step": 27520 }, { "epoch": 0.2753, "grad_norm": 13.9375, "grad_norm_var": 2.7639973958333335, "learning_rate": 0.0003, "loss": 11.4326, "loss/aux_loss": 0.04808274004608393, "loss/crossentropy": 2.5911940157413484, "loss/logits": 0.8717542558908462, "step": 27530 }, { "epoch": 0.2754, "grad_norm": 13.25, "grad_norm_var": 3.187760416666667, "learning_rate": 0.0003, "loss": 11.459, "loss/aux_loss": 0.048087103292346, "loss/crossentropy": 2.75337210893631, "loss/logits": 0.8438648998737335, "step": 27540 }, { "epoch": 0.2755, "grad_norm": 12.625, "grad_norm_var": 0.6598795572916667, "learning_rate": 0.0003, "loss": 11.4768, "loss/aux_loss": 0.04808942452073097, "loss/crossentropy": 2.6577411115169527, "loss/logits": 0.8484239518642426, "step": 27550 }, { "epoch": 0.2756, "grad_norm": 13.5625, "grad_norm_var": 0.30625, "learning_rate": 0.0003, "loss": 11.5486, "loss/aux_loss": 0.048091153427958486, "loss/crossentropy": 2.8733396172523498, "loss/logits": 0.8842191725969315, "step": 27560 }, { "epoch": 0.2757, "grad_norm": 12.1875, "grad_norm_var": 0.292431640625, "learning_rate": 0.0003, "loss": 11.3524, "loss/aux_loss": 0.048078698106110096, "loss/crossentropy": 2.8211456060409548, "loss/logits": 0.8710596203804016, "step": 27570 }, { "epoch": 0.2758, "grad_norm": 12.625, "grad_norm_var": 0.356494140625, "learning_rate": 0.0003, "loss": 11.3923, "loss/aux_loss": 0.04809141457080841, "loss/crossentropy": 2.810522723197937, "loss/logits": 0.9148801237344741, "step": 27580 }, { "epoch": 0.2759, "grad_norm": 13.125, "grad_norm_var": 0.688525390625, "learning_rate": 0.0003, "loss": 11.46, "loss/aux_loss": 0.04809119552373886, "loss/crossentropy": 2.571478694677353, "loss/logits": 0.8658771872520447, "step": 27590 }, { "epoch": 0.276, "grad_norm": 11.875, "grad_norm_var": 0.5340983072916666, "learning_rate": 0.0003, "loss": 11.4551, "loss/aux_loss": 0.04807944148778916, "loss/crossentropy": 2.8209518790245056, "loss/logits": 0.8891745388507843, "step": 27600 }, { "epoch": 0.2761, "grad_norm": 12.0, "grad_norm_var": 0.3296875, "learning_rate": 0.0003, "loss": 11.5451, "loss/aux_loss": 0.04807922653853893, "loss/crossentropy": 2.714892899990082, "loss/logits": 0.9096907198429107, "step": 27610 }, { "epoch": 0.2762, "grad_norm": 13.75, "grad_norm_var": 0.6673014322916667, "learning_rate": 0.0003, "loss": 11.5883, "loss/aux_loss": 0.04808503799140453, "loss/crossentropy": 2.7832123041152954, "loss/logits": 0.894438949227333, "step": 27620 }, { "epoch": 0.2763, "grad_norm": 12.3125, "grad_norm_var": 0.39869791666666665, "learning_rate": 0.0003, "loss": 11.591, "loss/aux_loss": 0.048086524568498136, "loss/crossentropy": 2.882674145698547, "loss/logits": 0.8659522473812103, "step": 27630 }, { "epoch": 0.2764, "grad_norm": 12.9375, "grad_norm_var": 0.4231770833333333, "learning_rate": 0.0003, "loss": 11.5889, "loss/aux_loss": 0.048090490885078906, "loss/crossentropy": 2.76504762172699, "loss/logits": 0.8722820281982422, "step": 27640 }, { "epoch": 0.2765, "grad_norm": 13.0, "grad_norm_var": 0.4369140625, "learning_rate": 0.0003, "loss": 11.7117, "loss/aux_loss": 0.04808298014104366, "loss/crossentropy": 2.773081195354462, "loss/logits": 0.8426672071218491, "step": 27650 }, { "epoch": 0.2766, "grad_norm": 13.5625, "grad_norm_var": 141.33151041666667, "learning_rate": 0.0003, "loss": 11.678, "loss/aux_loss": 0.04809086322784424, "loss/crossentropy": 2.782191741466522, "loss/logits": 0.882819551229477, "step": 27660 }, { "epoch": 0.2767, "grad_norm": 17.5, "grad_norm_var": 1.9231608072916666, "learning_rate": 0.0003, "loss": 11.4237, "loss/aux_loss": 0.04807958249002695, "loss/crossentropy": 2.6636355757713317, "loss/logits": 0.8572327792644501, "step": 27670 }, { "epoch": 0.2768, "grad_norm": 13.0, "grad_norm_var": 1.603125, "learning_rate": 0.0003, "loss": 11.4229, "loss/aux_loss": 0.0480815913528204, "loss/crossentropy": 2.890985882282257, "loss/logits": 0.8461414545774459, "step": 27680 }, { "epoch": 0.2769, "grad_norm": 13.8125, "grad_norm_var": 1.2593098958333333, "learning_rate": 0.0003, "loss": 11.6608, "loss/aux_loss": 0.048087593354284766, "loss/crossentropy": 2.839232790470123, "loss/logits": 0.8831172704696655, "step": 27690 }, { "epoch": 0.277, "grad_norm": 13.1875, "grad_norm_var": 1.5706868489583334, "learning_rate": 0.0003, "loss": 11.5061, "loss/aux_loss": 0.04807878881692886, "loss/crossentropy": 2.685603749752045, "loss/logits": 0.877737945318222, "step": 27700 }, { "epoch": 0.2771, "grad_norm": 13.0625, "grad_norm_var": 0.9463541666666667, "learning_rate": 0.0003, "loss": 11.5132, "loss/aux_loss": 0.04808571934700012, "loss/crossentropy": 2.794572043418884, "loss/logits": 0.8642860800027847, "step": 27710 }, { "epoch": 0.2772, "grad_norm": 13.625, "grad_norm_var": 0.5702473958333333, "learning_rate": 0.0003, "loss": 11.5303, "loss/aux_loss": 0.04808823838829994, "loss/crossentropy": 2.8599129617214203, "loss/logits": 0.8836135894060135, "step": 27720 }, { "epoch": 0.2773, "grad_norm": 13.0625, "grad_norm_var": 0.462353515625, "learning_rate": 0.0003, "loss": 11.5011, "loss/aux_loss": 0.048078597895801066, "loss/crossentropy": 2.775701862573624, "loss/logits": 0.8424900531768799, "step": 27730 }, { "epoch": 0.2774, "grad_norm": 12.3125, "grad_norm_var": 0.5702962239583333, "learning_rate": 0.0003, "loss": 11.6663, "loss/aux_loss": 0.048077501729130744, "loss/crossentropy": 2.7949552178382873, "loss/logits": 0.8686194092035293, "step": 27740 }, { "epoch": 0.2775, "grad_norm": 13.3125, "grad_norm_var": 9.37421875, "learning_rate": 0.0003, "loss": 11.4591, "loss/aux_loss": 0.048088216595351695, "loss/crossentropy": 2.655610829591751, "loss/logits": 0.853823122382164, "step": 27750 }, { "epoch": 0.2776, "grad_norm": 13.5625, "grad_norm_var": 1.8868326822916666, "learning_rate": 0.0003, "loss": 11.6049, "loss/aux_loss": 0.04809031039476395, "loss/crossentropy": 2.91897075176239, "loss/logits": 0.8576211661100388, "step": 27760 }, { "epoch": 0.2777, "grad_norm": 12.875, "grad_norm_var": 1.4387858072916666, "learning_rate": 0.0003, "loss": 11.3556, "loss/aux_loss": 0.04808913040906191, "loss/crossentropy": 2.6004298627376556, "loss/logits": 0.8527587816119194, "step": 27770 }, { "epoch": 0.2778, "grad_norm": 12.8125, "grad_norm_var": 0.44972330729166665, "learning_rate": 0.0003, "loss": 11.5785, "loss/aux_loss": 0.04808642938733101, "loss/crossentropy": 2.721281111240387, "loss/logits": 0.8621364802122116, "step": 27780 }, { "epoch": 0.2779, "grad_norm": 11.9375, "grad_norm_var": 0.27024739583333335, "learning_rate": 0.0003, "loss": 11.4411, "loss/aux_loss": 0.04807955492287874, "loss/crossentropy": 2.6037085890769958, "loss/logits": 0.8568143039941788, "step": 27790 }, { "epoch": 0.278, "grad_norm": 13.8125, "grad_norm_var": 0.540625, "learning_rate": 0.0003, "loss": 11.5396, "loss/aux_loss": 0.048087948746979234, "loss/crossentropy": 2.8853622317314147, "loss/logits": 0.8670936018228531, "step": 27800 }, { "epoch": 0.2781, "grad_norm": 12.0625, "grad_norm_var": 0.20974934895833333, "learning_rate": 0.0003, "loss": 11.6926, "loss/aux_loss": 0.048087144270539284, "loss/crossentropy": 2.8916147112846375, "loss/logits": 0.9140586674213409, "step": 27810 }, { "epoch": 0.2782, "grad_norm": 12.75, "grad_norm_var": 0.9538899739583333, "learning_rate": 0.0003, "loss": 11.4982, "loss/aux_loss": 0.048076816648244855, "loss/crossentropy": 2.742960512638092, "loss/logits": 0.8774885207414627, "step": 27820 }, { "epoch": 0.2783, "grad_norm": 13.3125, "grad_norm_var": 2.1745930989583333, "learning_rate": 0.0003, "loss": 11.5583, "loss/aux_loss": 0.048096643574535844, "loss/crossentropy": 2.8718445897102356, "loss/logits": 0.8958581119775773, "step": 27830 }, { "epoch": 0.2784, "grad_norm": 14.125, "grad_norm_var": 2.208268229166667, "learning_rate": 0.0003, "loss": 11.7273, "loss/aux_loss": 0.04808414224535227, "loss/crossentropy": 2.936895763874054, "loss/logits": 0.8936193466186524, "step": 27840 }, { "epoch": 0.2785, "grad_norm": 12.0625, "grad_norm_var": 1.5369140625, "learning_rate": 0.0003, "loss": 11.42, "loss/aux_loss": 0.04808822255581617, "loss/crossentropy": 2.812196373939514, "loss/logits": 0.8949258029460907, "step": 27850 }, { "epoch": 0.2786, "grad_norm": 13.0625, "grad_norm_var": 0.7958170572916666, "learning_rate": 0.0003, "loss": 11.3989, "loss/aux_loss": 0.04807552136480808, "loss/crossentropy": 2.788651943206787, "loss/logits": 0.8348335802555085, "step": 27860 }, { "epoch": 0.2787, "grad_norm": 13.6875, "grad_norm_var": 0.7613932291666666, "learning_rate": 0.0003, "loss": 11.5888, "loss/aux_loss": 0.048087479919195174, "loss/crossentropy": 2.7045384287834167, "loss/logits": 0.9338543623685837, "step": 27870 }, { "epoch": 0.2788, "grad_norm": 13.25, "grad_norm_var": 0.6075520833333333, "learning_rate": 0.0003, "loss": 11.5237, "loss/aux_loss": 0.048085146211087705, "loss/crossentropy": 2.7415911316871644, "loss/logits": 0.856224250793457, "step": 27880 }, { "epoch": 0.2789, "grad_norm": 12.3125, "grad_norm_var": 0.38058268229166664, "learning_rate": 0.0003, "loss": 11.5602, "loss/aux_loss": 0.04807989951223135, "loss/crossentropy": 2.7379807472229003, "loss/logits": 0.8650393694639206, "step": 27890 }, { "epoch": 0.279, "grad_norm": 13.0, "grad_norm_var": 0.1525390625, "learning_rate": 0.0003, "loss": 11.608, "loss/aux_loss": 0.048083983920514585, "loss/crossentropy": 2.8002022445201873, "loss/logits": 0.8693450152873993, "step": 27900 }, { "epoch": 0.2791, "grad_norm": 13.0625, "grad_norm_var": 0.425244140625, "learning_rate": 0.0003, "loss": 11.5797, "loss/aux_loss": 0.04808452669531107, "loss/crossentropy": 2.7015219628810883, "loss/logits": 0.874952495098114, "step": 27910 }, { "epoch": 0.2792, "grad_norm": 12.5625, "grad_norm_var": 0.667041015625, "learning_rate": 0.0003, "loss": 11.5391, "loss/aux_loss": 0.04808107353746891, "loss/crossentropy": 2.6511988162994387, "loss/logits": 0.844877976179123, "step": 27920 }, { "epoch": 0.2793, "grad_norm": 16.0, "grad_norm_var": 3.456103515625, "learning_rate": 0.0003, "loss": 11.4723, "loss/aux_loss": 0.0480935912579298, "loss/crossentropy": 2.69124321937561, "loss/logits": 0.8498719304800033, "step": 27930 }, { "epoch": 0.2794, "grad_norm": 16.0, "grad_norm_var": 2.3822265625, "learning_rate": 0.0003, "loss": 11.5932, "loss/aux_loss": 0.0480877697467804, "loss/crossentropy": 2.886867892742157, "loss/logits": 0.8901501029729844, "step": 27940 }, { "epoch": 0.2795, "grad_norm": 11.75, "grad_norm_var": 0.9264973958333333, "learning_rate": 0.0003, "loss": 11.3919, "loss/aux_loss": 0.0480851836502552, "loss/crossentropy": 2.5583014130592345, "loss/logits": 0.8257781475782394, "step": 27950 }, { "epoch": 0.2796, "grad_norm": 12.6875, "grad_norm_var": 0.704150390625, "learning_rate": 0.0003, "loss": 11.349, "loss/aux_loss": 0.048085610195994374, "loss/crossentropy": 2.756976544857025, "loss/logits": 0.8353757977485656, "step": 27960 }, { "epoch": 0.2797, "grad_norm": 14.125, "grad_norm_var": 0.378369140625, "learning_rate": 0.0003, "loss": 11.5423, "loss/aux_loss": 0.048080562800168994, "loss/crossentropy": 2.733251041173935, "loss/logits": 0.8563009589910507, "step": 27970 }, { "epoch": 0.2798, "grad_norm": 13.375, "grad_norm_var": 0.47369791666666666, "learning_rate": 0.0003, "loss": 11.4917, "loss/aux_loss": 0.048089844174683094, "loss/crossentropy": 2.684944635629654, "loss/logits": 0.8707854568958282, "step": 27980 }, { "epoch": 0.2799, "grad_norm": 12.0, "grad_norm_var": 0.1666015625, "learning_rate": 0.0003, "loss": 11.6145, "loss/aux_loss": 0.04807740524411201, "loss/crossentropy": 2.8147059202194216, "loss/logits": 0.8616000026464462, "step": 27990 }, { "epoch": 0.28, "grad_norm": 13.8125, "grad_norm_var": 0.35989583333333336, "learning_rate": 0.0003, "loss": 11.605, "loss/aux_loss": 0.048080855049192905, "loss/crossentropy": 2.7819701194763184, "loss/logits": 0.8910420656204223, "step": 28000 }, { "epoch": 0.2801, "grad_norm": 12.25, "grad_norm_var": 1.1546223958333333, "learning_rate": 0.0003, "loss": 11.4798, "loss/aux_loss": 0.048088941164314745, "loss/crossentropy": 2.7400415241718292, "loss/logits": 0.8723890751600265, "step": 28010 }, { "epoch": 0.2802, "grad_norm": 13.5, "grad_norm_var": 0.7181640625, "learning_rate": 0.0003, "loss": 11.5532, "loss/aux_loss": 0.04808063674718142, "loss/crossentropy": 2.8234307289123537, "loss/logits": 0.8648925483226776, "step": 28020 }, { "epoch": 0.2803, "grad_norm": 14.1875, "grad_norm_var": 0.7884765625, "learning_rate": 0.0003, "loss": 11.5742, "loss/aux_loss": 0.048078789934515954, "loss/crossentropy": 2.8293231964111327, "loss/logits": 0.9036984205245971, "step": 28030 }, { "epoch": 0.2804, "grad_norm": 12.8125, "grad_norm_var": 0.5999348958333334, "learning_rate": 0.0003, "loss": 11.5579, "loss/aux_loss": 0.04807864893227816, "loss/crossentropy": 2.8947394490242004, "loss/logits": 0.8946270734071732, "step": 28040 }, { "epoch": 0.2805, "grad_norm": 12.5625, "grad_norm_var": 0.49635416666666665, "learning_rate": 0.0003, "loss": 11.5083, "loss/aux_loss": 0.04808484595268965, "loss/crossentropy": 2.6746840596199037, "loss/logits": 0.8644850313663482, "step": 28050 }, { "epoch": 0.2806, "grad_norm": 12.875, "grad_norm_var": 1402.0190104166666, "learning_rate": 0.0003, "loss": 11.557, "loss/aux_loss": 0.048079652898013595, "loss/crossentropy": 2.6483142554759977, "loss/logits": 0.8723117738962174, "step": 28060 }, { "epoch": 0.2807, "grad_norm": 13.1875, "grad_norm_var": 1397.2218587239583, "learning_rate": 0.0003, "loss": 11.4769, "loss/aux_loss": 0.04809585195034742, "loss/crossentropy": 2.778903841972351, "loss/logits": 0.8626104056835174, "step": 28070 }, { "epoch": 0.2808, "grad_norm": 12.6875, "grad_norm_var": 0.4710774739583333, "learning_rate": 0.0003, "loss": 11.6971, "loss/aux_loss": 0.048076757788658143, "loss/crossentropy": 2.6813664495944978, "loss/logits": 0.8790749669075012, "step": 28080 }, { "epoch": 0.2809, "grad_norm": 13.5, "grad_norm_var": 0.48170572916666665, "learning_rate": 0.0003, "loss": 11.5118, "loss/aux_loss": 0.048085582070052625, "loss/crossentropy": 2.8408753156661986, "loss/logits": 0.8780440986156464, "step": 28090 }, { "epoch": 0.281, "grad_norm": 13.8125, "grad_norm_var": 0.3015625, "learning_rate": 0.0003, "loss": 11.3568, "loss/aux_loss": 0.04808991327881813, "loss/crossentropy": 2.5951784670352938, "loss/logits": 0.8443504124879837, "step": 28100 }, { "epoch": 0.2811, "grad_norm": 15.0, "grad_norm_var": 0.451806640625, "learning_rate": 0.0003, "loss": 11.5474, "loss/aux_loss": 0.048089342564344405, "loss/crossentropy": 2.6413574039936067, "loss/logits": 0.8657073110342026, "step": 28110 }, { "epoch": 0.2812, "grad_norm": 12.375, "grad_norm_var": 0.6270182291666667, "learning_rate": 0.0003, "loss": 11.3937, "loss/aux_loss": 0.04809096623212099, "loss/crossentropy": 2.7579336047172545, "loss/logits": 0.8672685265541077, "step": 28120 }, { "epoch": 0.2813, "grad_norm": 12.875, "grad_norm_var": 0.3942057291666667, "learning_rate": 0.0003, "loss": 11.5618, "loss/aux_loss": 0.04808580968528986, "loss/crossentropy": 2.7464880406856538, "loss/logits": 0.8809055328369141, "step": 28130 }, { "epoch": 0.2814, "grad_norm": 12.1875, "grad_norm_var": 0.3719889322916667, "learning_rate": 0.0003, "loss": 11.4683, "loss/aux_loss": 0.04808620549738407, "loss/crossentropy": 2.729203450679779, "loss/logits": 0.8312252789735795, "step": 28140 }, { "epoch": 0.2815, "grad_norm": 11.9375, "grad_norm_var": 4.386832682291667, "learning_rate": 0.0003, "loss": 11.592, "loss/aux_loss": 0.04809138756245375, "loss/crossentropy": 2.76963392496109, "loss/logits": 0.8744438081979752, "step": 28150 }, { "epoch": 0.2816, "grad_norm": 13.0, "grad_norm_var": 4.50078125, "learning_rate": 0.0003, "loss": 11.5057, "loss/aux_loss": 0.04808358568698168, "loss/crossentropy": 2.7290889263153075, "loss/logits": 0.8577259719371796, "step": 28160 }, { "epoch": 0.2817, "grad_norm": 12.875, "grad_norm_var": 1.4513020833333334, "learning_rate": 0.0003, "loss": 11.7034, "loss/aux_loss": 0.04809059873223305, "loss/crossentropy": 2.6322677731513977, "loss/logits": 0.8687084138393402, "step": 28170 }, { "epoch": 0.2818, "grad_norm": 12.8125, "grad_norm_var": 0.6677083333333333, "learning_rate": 0.0003, "loss": 11.4394, "loss/aux_loss": 0.04808175694197416, "loss/crossentropy": 2.655596649646759, "loss/logits": 0.8649382144212723, "step": 28180 }, { "epoch": 0.2819, "grad_norm": 12.375, "grad_norm_var": 0.7869140625, "learning_rate": 0.0003, "loss": 11.5506, "loss/aux_loss": 0.048091573640704155, "loss/crossentropy": 2.83742733001709, "loss/logits": 0.8902147322893142, "step": 28190 }, { "epoch": 0.282, "grad_norm": 12.5, "grad_norm_var": 0.4087890625, "learning_rate": 0.0003, "loss": 11.5759, "loss/aux_loss": 0.048080408945679665, "loss/crossentropy": 2.654254060983658, "loss/logits": 0.8351120471954345, "step": 28200 }, { "epoch": 0.2821, "grad_norm": 13.4375, "grad_norm_var": 0.566650390625, "learning_rate": 0.0003, "loss": 11.6061, "loss/aux_loss": 0.04808955620974302, "loss/crossentropy": 2.756123435497284, "loss/logits": 0.8621039360761642, "step": 28210 }, { "epoch": 0.2822, "grad_norm": 16.125, "grad_norm_var": 0.8916666666666667, "learning_rate": 0.0003, "loss": 11.7105, "loss/aux_loss": 0.04808141849935055, "loss/crossentropy": 2.6973861932754515, "loss/logits": 0.9015775710344315, "step": 28220 }, { "epoch": 0.2823, "grad_norm": 13.4375, "grad_norm_var": 1.2152180989583334, "learning_rate": 0.0003, "loss": 11.5418, "loss/aux_loss": 0.04808177202939987, "loss/crossentropy": 2.7852281630039215, "loss/logits": 0.8739712238311768, "step": 28230 }, { "epoch": 0.2824, "grad_norm": 12.375, "grad_norm_var": 0.35714518229166664, "learning_rate": 0.0003, "loss": 11.4382, "loss/aux_loss": 0.0480819521471858, "loss/crossentropy": 2.8439980030059813, "loss/logits": 0.8818934857845306, "step": 28240 }, { "epoch": 0.2825, "grad_norm": 12.5, "grad_norm_var": 0.40310872395833336, "learning_rate": 0.0003, "loss": 11.3937, "loss/aux_loss": 0.04809800013899803, "loss/crossentropy": 2.752180802822113, "loss/logits": 0.8844646722078323, "step": 28250 }, { "epoch": 0.2826, "grad_norm": 12.3125, "grad_norm_var": 0.5004557291666667, "learning_rate": 0.0003, "loss": 11.5381, "loss/aux_loss": 0.04807610791176557, "loss/crossentropy": 2.587927532196045, "loss/logits": 0.8262291848659515, "step": 28260 }, { "epoch": 0.2827, "grad_norm": 14.625, "grad_norm_var": 0.5624348958333333, "learning_rate": 0.0003, "loss": 11.3631, "loss/aux_loss": 0.04808882139623165, "loss/crossentropy": 2.8169186234474184, "loss/logits": 0.8660006016492844, "step": 28270 }, { "epoch": 0.2828, "grad_norm": 13.4375, "grad_norm_var": 0.6494791666666667, "learning_rate": 0.0003, "loss": 11.7342, "loss/aux_loss": 0.04807655774056911, "loss/crossentropy": 2.7526882588863373, "loss/logits": 0.8869515836238862, "step": 28280 }, { "epoch": 0.2829, "grad_norm": 12.5, "grad_norm_var": 0.586181640625, "learning_rate": 0.0003, "loss": 11.759, "loss/aux_loss": 0.048088379204273224, "loss/crossentropy": 2.7451719284057616, "loss/logits": 0.9054650783538818, "step": 28290 }, { "epoch": 0.283, "grad_norm": 12.0, "grad_norm_var": 0.5676920572916667, "learning_rate": 0.0003, "loss": 11.4817, "loss/aux_loss": 0.04807801488786936, "loss/crossentropy": 2.8209929168224335, "loss/logits": 0.8833929538726807, "step": 28300 }, { "epoch": 0.2831, "grad_norm": 14.375, "grad_norm_var": 0.9614420572916667, "learning_rate": 0.0003, "loss": 11.4367, "loss/aux_loss": 0.048084160685539244, "loss/crossentropy": 2.6196862697601317, "loss/logits": 0.8329427570104599, "step": 28310 }, { "epoch": 0.2832, "grad_norm": 13.5, "grad_norm_var": 0.5783854166666667, "learning_rate": 0.0003, "loss": 11.5229, "loss/aux_loss": 0.0480828158557415, "loss/crossentropy": 2.847772258520126, "loss/logits": 0.8690467923879623, "step": 28320 }, { "epoch": 0.2833, "grad_norm": 12.125, "grad_norm_var": 0.8005045572916667, "learning_rate": 0.0003, "loss": 11.4262, "loss/aux_loss": 0.048074822314083575, "loss/crossentropy": 2.687458795309067, "loss/logits": 0.8230527967214585, "step": 28330 }, { "epoch": 0.2834, "grad_norm": 14.6875, "grad_norm_var": 0.592041015625, "learning_rate": 0.0003, "loss": 11.6375, "loss/aux_loss": 0.048090120404958726, "loss/crossentropy": 2.898961102962494, "loss/logits": 0.8803368806838989, "step": 28340 }, { "epoch": 0.2835, "grad_norm": 13.375, "grad_norm_var": 0.2587890625, "learning_rate": 0.0003, "loss": 11.6828, "loss/aux_loss": 0.048081318661570546, "loss/crossentropy": 2.6955320119857786, "loss/logits": 0.8958701252937317, "step": 28350 }, { "epoch": 0.2836, "grad_norm": 12.1875, "grad_norm_var": 0.29420572916666665, "learning_rate": 0.0003, "loss": 11.6121, "loss/aux_loss": 0.048087548650801185, "loss/crossentropy": 2.8415299594402312, "loss/logits": 0.8907667517662048, "step": 28360 }, { "epoch": 0.2837, "grad_norm": 13.5625, "grad_norm_var": 0.29659830729166664, "learning_rate": 0.0003, "loss": 11.6166, "loss/aux_loss": 0.048079358972609045, "loss/crossentropy": 2.7396019995212555, "loss/logits": 0.8907454043626786, "step": 28370 }, { "epoch": 0.2838, "grad_norm": 13.375, "grad_norm_var": 0.152978515625, "learning_rate": 0.0003, "loss": 11.4829, "loss/aux_loss": 0.04807760044932365, "loss/crossentropy": 2.768324136734009, "loss/logits": 0.8386527955532074, "step": 28380 }, { "epoch": 0.2839, "grad_norm": 13.5625, "grad_norm_var": 0.23865559895833333, "learning_rate": 0.0003, "loss": 11.6151, "loss/aux_loss": 0.048082569241523744, "loss/crossentropy": 2.878603792190552, "loss/logits": 0.8644590139389038, "step": 28390 }, { "epoch": 0.284, "grad_norm": 14.8125, "grad_norm_var": 0.49680989583333335, "learning_rate": 0.0003, "loss": 11.6059, "loss/aux_loss": 0.04808495007455349, "loss/crossentropy": 2.6303452491760253, "loss/logits": 0.8935995787382126, "step": 28400 }, { "epoch": 0.2841, "grad_norm": 12.125, "grad_norm_var": 0.733056640625, "learning_rate": 0.0003, "loss": 11.4731, "loss/aux_loss": 0.048085509426891805, "loss/crossentropy": 2.828030973672867, "loss/logits": 0.8445266515016556, "step": 28410 }, { "epoch": 0.2842, "grad_norm": 13.6875, "grad_norm_var": 0.44503580729166664, "learning_rate": 0.0003, "loss": 11.4002, "loss/aux_loss": 0.04808097891509533, "loss/crossentropy": 2.6388884663581846, "loss/logits": 0.9015711516141891, "step": 28420 }, { "epoch": 0.2843, "grad_norm": 13.75, "grad_norm_var": 0.4434733072916667, "learning_rate": 0.0003, "loss": 11.514, "loss/aux_loss": 0.048092016205191615, "loss/crossentropy": 2.815828490257263, "loss/logits": 0.9084261149168015, "step": 28430 }, { "epoch": 0.2844, "grad_norm": 12.625, "grad_norm_var": 0.54453125, "learning_rate": 0.0003, "loss": 11.5181, "loss/aux_loss": 0.04807435479015112, "loss/crossentropy": 2.7728405237197875, "loss/logits": 0.887555119395256, "step": 28440 }, { "epoch": 0.2845, "grad_norm": 12.5625, "grad_norm_var": 0.42916666666666664, "learning_rate": 0.0003, "loss": 11.5056, "loss/aux_loss": 0.04808815475553274, "loss/crossentropy": 2.8544474244117737, "loss/logits": 0.8347731292247772, "step": 28450 }, { "epoch": 0.2846, "grad_norm": 14.0625, "grad_norm_var": 0.5046223958333333, "learning_rate": 0.0003, "loss": 11.5202, "loss/aux_loss": 0.048085703514516354, "loss/crossentropy": 2.570024532079697, "loss/logits": 0.8466786921024323, "step": 28460 }, { "epoch": 0.2847, "grad_norm": 14.375, "grad_norm_var": 0.6171223958333333, "learning_rate": 0.0003, "loss": 11.319, "loss/aux_loss": 0.04809032492339611, "loss/crossentropy": 2.7953803539276123, "loss/logits": 0.8648561179637909, "step": 28470 }, { "epoch": 0.2848, "grad_norm": 13.625, "grad_norm_var": 0.5113932291666666, "learning_rate": 0.0003, "loss": 11.452, "loss/aux_loss": 0.04808381143957376, "loss/crossentropy": 2.7199991762638094, "loss/logits": 0.8637189954519272, "step": 28480 }, { "epoch": 0.2849, "grad_norm": 12.625, "grad_norm_var": 0.410400390625, "learning_rate": 0.0003, "loss": 11.4654, "loss/aux_loss": 0.04808434545993805, "loss/crossentropy": 2.8280713319778443, "loss/logits": 0.9059916436672211, "step": 28490 }, { "epoch": 0.285, "grad_norm": 51.75, "grad_norm_var": 95.12967122395834, "learning_rate": 0.0003, "loss": 11.5128, "loss/aux_loss": 0.0480845658108592, "loss/crossentropy": 2.858335256576538, "loss/logits": 0.9236481755971908, "step": 28500 }, { "epoch": 0.2851, "grad_norm": 14.1875, "grad_norm_var": 92.11041666666667, "learning_rate": 0.0003, "loss": 11.5787, "loss/aux_loss": 0.04808925464749336, "loss/crossentropy": 2.846384787559509, "loss/logits": 0.8865832269191742, "step": 28510 }, { "epoch": 0.2852, "grad_norm": 13.625, "grad_norm_var": 0.3070149739583333, "learning_rate": 0.0003, "loss": 11.6155, "loss/aux_loss": 0.048087059520184994, "loss/crossentropy": 2.6891987919807434, "loss/logits": 0.8724437922239303, "step": 28520 }, { "epoch": 0.2853, "grad_norm": 13.1875, "grad_norm_var": 0.28487955729166664, "learning_rate": 0.0003, "loss": 11.647, "loss/aux_loss": 0.04809146039187908, "loss/crossentropy": 2.803027904033661, "loss/logits": 0.8911570340394974, "step": 28530 }, { "epoch": 0.2854, "grad_norm": 13.0, "grad_norm_var": 0.46796875, "learning_rate": 0.0003, "loss": 11.6234, "loss/aux_loss": 0.04807390477508307, "loss/crossentropy": 2.7482841432094576, "loss/logits": 0.8875775545835495, "step": 28540 }, { "epoch": 0.2855, "grad_norm": 12.75, "grad_norm_var": 55.06139322916667, "learning_rate": 0.0003, "loss": 11.552, "loss/aux_loss": 0.04810281321406364, "loss/crossentropy": 2.6797729313373564, "loss/logits": 0.8286285102367401, "step": 28550 }, { "epoch": 0.2856, "grad_norm": 13.4375, "grad_norm_var": 1.0009765625, "learning_rate": 0.0003, "loss": 11.489, "loss/aux_loss": 0.048082736879587175, "loss/crossentropy": 2.6085013091564178, "loss/logits": 0.8426523476839065, "step": 28560 }, { "epoch": 0.2857, "grad_norm": 15.625, "grad_norm_var": 0.8051432291666667, "learning_rate": 0.0003, "loss": 11.5542, "loss/aux_loss": 0.04808100238442421, "loss/crossentropy": 2.788421058654785, "loss/logits": 0.865365993976593, "step": 28570 }, { "epoch": 0.2858, "grad_norm": 13.3125, "grad_norm_var": 0.9296712239583333, "learning_rate": 0.0003, "loss": 11.5754, "loss/aux_loss": 0.04808872230350971, "loss/crossentropy": 2.6733221411705017, "loss/logits": 0.8689684510231018, "step": 28580 }, { "epoch": 0.2859, "grad_norm": 13.5, "grad_norm_var": 0.5859375, "learning_rate": 0.0003, "loss": 11.3833, "loss/aux_loss": 0.048073044046759605, "loss/crossentropy": 2.7072140097618105, "loss/logits": 0.8347415089607239, "step": 28590 }, { "epoch": 0.286, "grad_norm": 13.0625, "grad_norm_var": 0.8980305989583334, "learning_rate": 0.0003, "loss": 11.5353, "loss/aux_loss": 0.04807990416884422, "loss/crossentropy": 2.6441255509853363, "loss/logits": 0.8408429473638535, "step": 28600 }, { "epoch": 0.2861, "grad_norm": 13.1875, "grad_norm_var": 0.32667643229166665, "learning_rate": 0.0003, "loss": 11.3859, "loss/aux_loss": 0.04808638412505388, "loss/crossentropy": 2.8064417958259584, "loss/logits": 0.8815937727689743, "step": 28610 }, { "epoch": 0.2862, "grad_norm": 13.6875, "grad_norm_var": 0.1900390625, "learning_rate": 0.0003, "loss": 11.3424, "loss/aux_loss": 0.04808064606040716, "loss/crossentropy": 2.692779916524887, "loss/logits": 0.8429404377937317, "step": 28620 }, { "epoch": 0.2863, "grad_norm": 13.5625, "grad_norm_var": 0.3089680989583333, "learning_rate": 0.0003, "loss": 11.656, "loss/aux_loss": 0.04809319917112589, "loss/crossentropy": 2.8748478055000306, "loss/logits": 0.9063412040472031, "step": 28630 }, { "epoch": 0.2864, "grad_norm": 16.875, "grad_norm_var": 13.566650390625, "learning_rate": 0.0003, "loss": 11.4876, "loss/aux_loss": 0.04807700905948877, "loss/crossentropy": 2.779051947593689, "loss/logits": 0.8662481039762497, "step": 28640 }, { "epoch": 0.2865, "grad_norm": 15.0625, "grad_norm_var": 12.745426432291667, "learning_rate": 0.0003, "loss": 11.4317, "loss/aux_loss": 0.04808287639170885, "loss/crossentropy": 2.825643515586853, "loss/logits": 0.8664470076560974, "step": 28650 }, { "epoch": 0.2866, "grad_norm": 13.1875, "grad_norm_var": 0.5754557291666667, "learning_rate": 0.0003, "loss": 11.2821, "loss/aux_loss": 0.04808809049427509, "loss/crossentropy": 2.735125958919525, "loss/logits": 0.8850825309753418, "step": 28660 }, { "epoch": 0.2867, "grad_norm": 15.0, "grad_norm_var": 1.239306640625, "learning_rate": 0.0003, "loss": 11.6871, "loss/aux_loss": 0.04809844307601452, "loss/crossentropy": 2.6700200915336607, "loss/logits": 0.8919987231492996, "step": 28670 }, { "epoch": 0.2868, "grad_norm": 14.75, "grad_norm_var": 1.1426432291666666, "learning_rate": 0.0003, "loss": 11.4459, "loss/aux_loss": 0.04808284323662519, "loss/crossentropy": 2.639972817897797, "loss/logits": 0.8575376510620117, "step": 28680 }, { "epoch": 0.2869, "grad_norm": 13.1875, "grad_norm_var": 0.633447265625, "learning_rate": 0.0003, "loss": 11.448, "loss/aux_loss": 0.04808585159480572, "loss/crossentropy": 2.684755891561508, "loss/logits": 0.858421990275383, "step": 28690 }, { "epoch": 0.287, "grad_norm": 13.3125, "grad_norm_var": 1.0254557291666666, "learning_rate": 0.0003, "loss": 11.4013, "loss/aux_loss": 0.04808926824480295, "loss/crossentropy": 2.733264869451523, "loss/logits": 0.8662783950567245, "step": 28700 }, { "epoch": 0.2871, "grad_norm": 13.125, "grad_norm_var": 0.4328125, "learning_rate": 0.0003, "loss": 11.6559, "loss/aux_loss": 0.04808164779096842, "loss/crossentropy": 2.625485306978226, "loss/logits": 0.8678814113140106, "step": 28710 }, { "epoch": 0.2872, "grad_norm": 12.75, "grad_norm_var": 0.3489583333333333, "learning_rate": 0.0003, "loss": 11.5625, "loss/aux_loss": 0.048087488114833835, "loss/crossentropy": 2.7748125314712526, "loss/logits": 0.8737635612487793, "step": 28720 }, { "epoch": 0.2873, "grad_norm": 13.75, "grad_norm_var": 0.5188639322916667, "learning_rate": 0.0003, "loss": 11.3701, "loss/aux_loss": 0.04807881489396095, "loss/crossentropy": 2.825051474571228, "loss/logits": 0.869257315993309, "step": 28730 }, { "epoch": 0.2874, "grad_norm": 12.75, "grad_norm_var": 0.3101399739583333, "learning_rate": 0.0003, "loss": 11.5001, "loss/aux_loss": 0.04808692578226328, "loss/crossentropy": 2.682652533054352, "loss/logits": 0.8922774195671082, "step": 28740 }, { "epoch": 0.2875, "grad_norm": 12.875, "grad_norm_var": 0.2384765625, "learning_rate": 0.0003, "loss": 11.5308, "loss/aux_loss": 0.048078923113644124, "loss/crossentropy": 2.748230826854706, "loss/logits": 0.9064568638801574, "step": 28750 }, { "epoch": 0.2876, "grad_norm": 12.375, "grad_norm_var": 0.2869140625, "learning_rate": 0.0003, "loss": 11.4102, "loss/aux_loss": 0.0480932604521513, "loss/crossentropy": 2.6067364394664763, "loss/logits": 0.8314082384109497, "step": 28760 }, { "epoch": 0.2877, "grad_norm": 12.5625, "grad_norm_var": 0.4200358072916667, "learning_rate": 0.0003, "loss": 11.4282, "loss/aux_loss": 0.04807720612734556, "loss/crossentropy": 2.7676464080810548, "loss/logits": 0.8628965139389038, "step": 28770 }, { "epoch": 0.2878, "grad_norm": 13.5, "grad_norm_var": 0.6832682291666666, "learning_rate": 0.0003, "loss": 11.5929, "loss/aux_loss": 0.048078482411801814, "loss/crossentropy": 2.7174685835838317, "loss/logits": 0.8926462024450302, "step": 28780 }, { "epoch": 0.2879, "grad_norm": 13.875, "grad_norm_var": 0.73671875, "learning_rate": 0.0003, "loss": 11.4883, "loss/aux_loss": 0.04808929469436407, "loss/crossentropy": 2.8711092829704286, "loss/logits": 0.9005297362804413, "step": 28790 }, { "epoch": 0.288, "grad_norm": 58.75, "grad_norm_var": 130.48567708333334, "learning_rate": 0.0003, "loss": 11.499, "loss/aux_loss": 0.04807561915367842, "loss/crossentropy": 2.684372991323471, "loss/logits": 0.8370014727115631, "step": 28800 }, { "epoch": 0.2881, "grad_norm": 12.75, "grad_norm_var": 130.08795572916668, "learning_rate": 0.0003, "loss": 11.4676, "loss/aux_loss": 0.04809402357786894, "loss/crossentropy": 2.8078381299972532, "loss/logits": 0.8840030491352081, "step": 28810 }, { "epoch": 0.2882, "grad_norm": 13.25, "grad_norm_var": 0.5019368489583333, "learning_rate": 0.0003, "loss": 11.4451, "loss/aux_loss": 0.04808465614914894, "loss/crossentropy": 2.681842344999313, "loss/logits": 0.882282269001007, "step": 28820 }, { "epoch": 0.2883, "grad_norm": 12.8125, "grad_norm_var": 0.2556640625, "learning_rate": 0.0003, "loss": 11.4593, "loss/aux_loss": 0.04808338657021523, "loss/crossentropy": 2.718055808544159, "loss/logits": 0.8846965968608856, "step": 28830 }, { "epoch": 0.2884, "grad_norm": 13.625, "grad_norm_var": 3.116259765625, "learning_rate": 0.0003, "loss": 11.4866, "loss/aux_loss": 0.048081991448998454, "loss/crossentropy": 2.824300652742386, "loss/logits": 0.8746830075979233, "step": 28840 }, { "epoch": 0.2885, "grad_norm": 13.625, "grad_norm_var": 0.311181640625, "learning_rate": 0.0003, "loss": 11.2453, "loss/aux_loss": 0.04808319676667452, "loss/crossentropy": 2.759380376338959, "loss/logits": 0.8470451653003692, "step": 28850 }, { "epoch": 0.2886, "grad_norm": 12.5625, "grad_norm_var": 1.086181640625, "learning_rate": 0.0003, "loss": 11.5076, "loss/aux_loss": 0.04808532185852528, "loss/crossentropy": 2.7223631918430327, "loss/logits": 0.8595778405666351, "step": 28860 }, { "epoch": 0.2887, "grad_norm": 13.3125, "grad_norm_var": 0.7536458333333333, "learning_rate": 0.0003, "loss": 11.5933, "loss/aux_loss": 0.04808366596698761, "loss/crossentropy": 2.6659576177597044, "loss/logits": 0.8435241490602493, "step": 28870 }, { "epoch": 0.2888, "grad_norm": 12.375, "grad_norm_var": 1.361572265625, "learning_rate": 0.0003, "loss": 11.6656, "loss/aux_loss": 0.04809127729386091, "loss/crossentropy": 2.731011927127838, "loss/logits": 0.8725592494010925, "step": 28880 }, { "epoch": 0.2889, "grad_norm": 12.8125, "grad_norm_var": 1.0494140625, "learning_rate": 0.0003, "loss": 11.3996, "loss/aux_loss": 0.04808544144034386, "loss/crossentropy": 2.852566087245941, "loss/logits": 0.9123659133911133, "step": 28890 }, { "epoch": 0.289, "grad_norm": 12.625, "grad_norm_var": 0.5302083333333333, "learning_rate": 0.0003, "loss": 11.5145, "loss/aux_loss": 0.04808180872350931, "loss/crossentropy": 2.8115632593631745, "loss/logits": 0.8549628496170044, "step": 28900 }, { "epoch": 0.2891, "grad_norm": 12.625, "grad_norm_var": 0.4416015625, "learning_rate": 0.0003, "loss": 11.4916, "loss/aux_loss": 0.048086360283195975, "loss/crossentropy": 2.8208987712860107, "loss/logits": 0.8684911131858826, "step": 28910 }, { "epoch": 0.2892, "grad_norm": 12.875, "grad_norm_var": 0.306494140625, "learning_rate": 0.0003, "loss": 11.6515, "loss/aux_loss": 0.048084983974695204, "loss/crossentropy": 2.82762331366539, "loss/logits": 0.8740989983081817, "step": 28920 }, { "epoch": 0.2893, "grad_norm": 12.375, "grad_norm_var": 0.20428059895833334, "learning_rate": 0.0003, "loss": 11.3879, "loss/aux_loss": 0.048082150518894196, "loss/crossentropy": 2.7328701674938203, "loss/logits": 0.8231735050678253, "step": 28930 }, { "epoch": 0.2894, "grad_norm": 13.375, "grad_norm_var": 0.40572916666666664, "learning_rate": 0.0003, "loss": 11.4964, "loss/aux_loss": 0.04808959234505892, "loss/crossentropy": 2.681563550233841, "loss/logits": 0.8461143642663955, "step": 28940 }, { "epoch": 0.2895, "grad_norm": 12.5, "grad_norm_var": 0.245947265625, "learning_rate": 0.0003, "loss": 11.3121, "loss/aux_loss": 0.04808651022613049, "loss/crossentropy": 2.6562454462051392, "loss/logits": 0.8221059828996659, "step": 28950 }, { "epoch": 0.2896, "grad_norm": 13.0625, "grad_norm_var": 0.122509765625, "learning_rate": 0.0003, "loss": 11.4933, "loss/aux_loss": 0.04808787871152163, "loss/crossentropy": 2.725093901157379, "loss/logits": 0.8643009692430497, "step": 28960 }, { "epoch": 0.2897, "grad_norm": 12.8125, "grad_norm_var": 0.23203125, "learning_rate": 0.0003, "loss": 11.4234, "loss/aux_loss": 0.04808852039277554, "loss/crossentropy": 2.7337252140045165, "loss/logits": 0.8330873519182205, "step": 28970 }, { "epoch": 0.2898, "grad_norm": 13.1875, "grad_norm_var": 0.406494140625, "learning_rate": 0.0003, "loss": 11.53, "loss/aux_loss": 0.048081529699265955, "loss/crossentropy": 2.6677880942821504, "loss/logits": 0.8496348142623902, "step": 28980 }, { "epoch": 0.2899, "grad_norm": 12.4375, "grad_norm_var": 0.3011555989583333, "learning_rate": 0.0003, "loss": 11.3731, "loss/aux_loss": 0.04807930588722229, "loss/crossentropy": 2.6456236064434053, "loss/logits": 0.8759482502937317, "step": 28990 }, { "epoch": 0.29, "grad_norm": 12.3125, "grad_norm_var": 0.14021809895833334, "learning_rate": 0.0003, "loss": 11.5333, "loss/aux_loss": 0.048075495101511476, "loss/crossentropy": 2.8131125450134276, "loss/logits": 0.8634077340364457, "step": 29000 }, { "epoch": 0.2901, "grad_norm": 12.8125, "grad_norm_var": 0.12342122395833334, "learning_rate": 0.0003, "loss": 11.5456, "loss/aux_loss": 0.04809298645704985, "loss/crossentropy": 2.8442670702934265, "loss/logits": 0.8695224732160568, "step": 29010 }, { "epoch": 0.2902, "grad_norm": 12.25, "grad_norm_var": 0.21458333333333332, "learning_rate": 0.0003, "loss": 11.5506, "loss/aux_loss": 0.04808041173964739, "loss/crossentropy": 2.5905315399169924, "loss/logits": 0.8509000718593598, "step": 29020 }, { "epoch": 0.2903, "grad_norm": 12.25, "grad_norm_var": 0.5184733072916666, "learning_rate": 0.0003, "loss": 11.5184, "loss/aux_loss": 0.0480886347591877, "loss/crossentropy": 2.892964768409729, "loss/logits": 0.887509498000145, "step": 29030 }, { "epoch": 0.2904, "grad_norm": 14.1875, "grad_norm_var": 0.609375, "learning_rate": 0.0003, "loss": 11.6753, "loss/aux_loss": 0.048083288595080376, "loss/crossentropy": 2.7917010486125946, "loss/logits": 0.9125192284584045, "step": 29040 }, { "epoch": 0.2905, "grad_norm": 12.875, "grad_norm_var": 0.8778483072916666, "learning_rate": 0.0003, "loss": 11.4772, "loss/aux_loss": 0.04808119647204876, "loss/crossentropy": 2.6203967094421388, "loss/logits": 0.8430052489042282, "step": 29050 }, { "epoch": 0.2906, "grad_norm": 13.125, "grad_norm_var": 2.5011555989583334, "learning_rate": 0.0003, "loss": 11.3984, "loss/aux_loss": 0.048092004284262656, "loss/crossentropy": 2.7506280064582826, "loss/logits": 0.8497364521026611, "step": 29060 }, { "epoch": 0.2907, "grad_norm": 13.1875, "grad_norm_var": 2.12578125, "learning_rate": 0.0003, "loss": 11.4993, "loss/aux_loss": 0.04808201938867569, "loss/crossentropy": 2.765266942977905, "loss/logits": 0.8675076127052307, "step": 29070 }, { "epoch": 0.2908, "grad_norm": 12.9375, "grad_norm_var": 0.17838541666666666, "learning_rate": 0.0003, "loss": 11.4641, "loss/aux_loss": 0.048099024966359136, "loss/crossentropy": 2.6994349300861358, "loss/logits": 0.8749856293201447, "step": 29080 }, { "epoch": 0.2909, "grad_norm": 12.75, "grad_norm_var": 0.386962890625, "learning_rate": 0.0003, "loss": 11.6602, "loss/aux_loss": 0.048081963881850244, "loss/crossentropy": 2.8470765471458437, "loss/logits": 0.9124794363975525, "step": 29090 }, { "epoch": 0.291, "grad_norm": 12.75, "grad_norm_var": 0.6010416666666667, "learning_rate": 0.0003, "loss": 11.7205, "loss/aux_loss": 0.0480800811201334, "loss/crossentropy": 2.632298457622528, "loss/logits": 0.8833822071552276, "step": 29100 }, { "epoch": 0.2911, "grad_norm": 13.375, "grad_norm_var": 0.9180826822916667, "learning_rate": 0.0003, "loss": 11.5494, "loss/aux_loss": 0.0480863980948925, "loss/crossentropy": 2.8006245315074922, "loss/logits": 0.8473658740520478, "step": 29110 }, { "epoch": 0.2912, "grad_norm": 15.375, "grad_norm_var": 20.746614583333333, "learning_rate": 0.0003, "loss": 11.4375, "loss/aux_loss": 0.0480819221585989, "loss/crossentropy": 2.8713009297847747, "loss/logits": 0.8780528694391251, "step": 29120 }, { "epoch": 0.2913, "grad_norm": 13.6875, "grad_norm_var": 20.1234375, "learning_rate": 0.0003, "loss": 11.3549, "loss/aux_loss": 0.048090817779302596, "loss/crossentropy": 2.890332305431366, "loss/logits": 0.9046699106693268, "step": 29130 }, { "epoch": 0.2914, "grad_norm": 13.25, "grad_norm_var": 0.9503743489583333, "learning_rate": 0.0003, "loss": 11.377, "loss/aux_loss": 0.048084873519837854, "loss/crossentropy": 2.6816116988658907, "loss/logits": 0.8458852350711823, "step": 29140 }, { "epoch": 0.2915, "grad_norm": 12.75, "grad_norm_var": 0.383447265625, "learning_rate": 0.0003, "loss": 11.5313, "loss/aux_loss": 0.04807651937007904, "loss/crossentropy": 2.791849434375763, "loss/logits": 0.8508994936943054, "step": 29150 }, { "epoch": 0.2916, "grad_norm": 13.75, "grad_norm_var": 0.5989583333333334, "learning_rate": 0.0003, "loss": 11.5582, "loss/aux_loss": 0.04808504190295935, "loss/crossentropy": 2.727277672290802, "loss/logits": 0.8817504495382309, "step": 29160 }, { "epoch": 0.2917, "grad_norm": 13.3125, "grad_norm_var": 1.3075358072916667, "learning_rate": 0.0003, "loss": 11.5291, "loss/aux_loss": 0.04808583091944456, "loss/crossentropy": 2.900036704540253, "loss/logits": 0.8793569028377533, "step": 29170 }, { "epoch": 0.2918, "grad_norm": 12.1875, "grad_norm_var": 1.3009765625, "learning_rate": 0.0003, "loss": 11.5269, "loss/aux_loss": 0.04807761292904615, "loss/crossentropy": 2.790689837932587, "loss/logits": 0.8850260347127914, "step": 29180 }, { "epoch": 0.2919, "grad_norm": 12.4375, "grad_norm_var": 0.7283854166666667, "learning_rate": 0.0003, "loss": 11.454, "loss/aux_loss": 0.04808845371007919, "loss/crossentropy": 2.3579376369714735, "loss/logits": 0.795929902791977, "step": 29190 }, { "epoch": 0.292, "grad_norm": 14.0, "grad_norm_var": 0.2769368489583333, "learning_rate": 0.0003, "loss": 11.4508, "loss/aux_loss": 0.04808519259095192, "loss/crossentropy": 2.759959888458252, "loss/logits": 0.8524115920066834, "step": 29200 }, { "epoch": 0.2921, "grad_norm": 12.3125, "grad_norm_var": 0.40349934895833334, "learning_rate": 0.0003, "loss": 11.5466, "loss/aux_loss": 0.048085874505341054, "loss/crossentropy": 2.7660795211791993, "loss/logits": 0.8959324955940247, "step": 29210 }, { "epoch": 0.2922, "grad_norm": 12.6875, "grad_norm_var": 0.9738932291666667, "learning_rate": 0.0003, "loss": 11.5221, "loss/aux_loss": 0.04807844292372465, "loss/crossentropy": 2.739389771223068, "loss/logits": 0.9002001017332077, "step": 29220 }, { "epoch": 0.2923, "grad_norm": 12.6875, "grad_norm_var": 0.8861979166666667, "learning_rate": 0.0003, "loss": 11.604, "loss/aux_loss": 0.04809589311480522, "loss/crossentropy": 2.8620386838912966, "loss/logits": 0.8738722622394561, "step": 29230 }, { "epoch": 0.2924, "grad_norm": 13.0, "grad_norm_var": 0.8202473958333333, "learning_rate": 0.0003, "loss": 11.5345, "loss/aux_loss": 0.048076955042779444, "loss/crossentropy": 2.765863335132599, "loss/logits": 0.8648845195770264, "step": 29240 }, { "epoch": 0.2925, "grad_norm": 14.125, "grad_norm_var": 0.6406087239583333, "learning_rate": 0.0003, "loss": 11.4273, "loss/aux_loss": 0.04808403495699167, "loss/crossentropy": 2.6642160415649414, "loss/logits": 0.8542217493057251, "step": 29250 }, { "epoch": 0.2926, "grad_norm": 14.125, "grad_norm_var": 0.6924479166666667, "learning_rate": 0.0003, "loss": 11.4789, "loss/aux_loss": 0.048092410899698734, "loss/crossentropy": 2.6984796285629273, "loss/logits": 0.865311412513256, "step": 29260 }, { "epoch": 0.2927, "grad_norm": 14.1875, "grad_norm_var": 0.48125, "learning_rate": 0.0003, "loss": 11.6032, "loss/aux_loss": 0.04807758815586567, "loss/crossentropy": 2.764070636034012, "loss/logits": 0.9029274463653565, "step": 29270 }, { "epoch": 0.2928, "grad_norm": 12.6875, "grad_norm_var": 0.37005208333333334, "learning_rate": 0.0003, "loss": 11.4, "loss/aux_loss": 0.04808840956538916, "loss/crossentropy": 2.5958930790424346, "loss/logits": 0.8585714161396026, "step": 29280 }, { "epoch": 0.2929, "grad_norm": 13.3125, "grad_norm_var": 0.3031087239583333, "learning_rate": 0.0003, "loss": 11.3794, "loss/aux_loss": 0.048076996393501756, "loss/crossentropy": 2.841508948802948, "loss/logits": 0.8784020185470581, "step": 29290 }, { "epoch": 0.293, "grad_norm": 13.375, "grad_norm_var": 1.5706868489583334, "learning_rate": 0.0003, "loss": 11.445, "loss/aux_loss": 0.04809955209493637, "loss/crossentropy": 2.80951851606369, "loss/logits": 0.8316282510757447, "step": 29300 }, { "epoch": 0.2931, "grad_norm": 13.1875, "grad_norm_var": 0.7716145833333333, "learning_rate": 0.0003, "loss": 11.5951, "loss/aux_loss": 0.04807936865836382, "loss/crossentropy": 2.7189249217510225, "loss/logits": 0.8644407778978348, "step": 29310 }, { "epoch": 0.2932, "grad_norm": 12.6875, "grad_norm_var": 0.5526041666666667, "learning_rate": 0.0003, "loss": 11.5453, "loss/aux_loss": 0.048080489970743656, "loss/crossentropy": 2.8331545174121855, "loss/logits": 0.8822837799787522, "step": 29320 }, { "epoch": 0.2933, "grad_norm": 13.1875, "grad_norm_var": 0.23229166666666667, "learning_rate": 0.0003, "loss": 11.7391, "loss/aux_loss": 0.0480887183919549, "loss/crossentropy": 2.7327269673347474, "loss/logits": 0.8839978009462357, "step": 29330 }, { "epoch": 0.2934, "grad_norm": 13.625, "grad_norm_var": 0.22654622395833332, "learning_rate": 0.0003, "loss": 11.4887, "loss/aux_loss": 0.04807803481817245, "loss/crossentropy": 2.721933346986771, "loss/logits": 0.8577464699745179, "step": 29340 }, { "epoch": 0.2935, "grad_norm": 13.25, "grad_norm_var": 0.18448893229166666, "learning_rate": 0.0003, "loss": 11.4648, "loss/aux_loss": 0.04808668624609709, "loss/crossentropy": 2.8373769760131835, "loss/logits": 0.8713858962059021, "step": 29350 }, { "epoch": 0.2936, "grad_norm": 13.1875, "grad_norm_var": 1.0161458333333333, "learning_rate": 0.0003, "loss": 11.4781, "loss/aux_loss": 0.04806650560349226, "loss/crossentropy": 2.651980197429657, "loss/logits": 0.8469345271587372, "step": 29360 }, { "epoch": 0.2937, "grad_norm": 14.1875, "grad_norm_var": 0.3165201822916667, "learning_rate": 0.0003, "loss": 11.4494, "loss/aux_loss": 0.048085213825106624, "loss/crossentropy": 2.8691640198230743, "loss/logits": 0.8683050394058227, "step": 29370 }, { "epoch": 0.2938, "grad_norm": 13.75, "grad_norm_var": 0.42630208333333336, "learning_rate": 0.0003, "loss": 11.5254, "loss/aux_loss": 0.04808463230729103, "loss/crossentropy": 2.6619069993495943, "loss/logits": 0.8452241331338882, "step": 29380 }, { "epoch": 0.2939, "grad_norm": 12.625, "grad_norm_var": 0.3916015625, "learning_rate": 0.0003, "loss": 11.4964, "loss/aux_loss": 0.04807299487292767, "loss/crossentropy": 2.63063805103302, "loss/logits": 0.8311535373330117, "step": 29390 }, { "epoch": 0.294, "grad_norm": 13.3125, "grad_norm_var": 0.3087890625, "learning_rate": 0.0003, "loss": 11.437, "loss/aux_loss": 0.048074229061603545, "loss/crossentropy": 2.745287525653839, "loss/logits": 0.8874899983406067, "step": 29400 }, { "epoch": 0.2941, "grad_norm": 13.125, "grad_norm_var": 0.2837890625, "learning_rate": 0.0003, "loss": 11.5053, "loss/aux_loss": 0.04808676596730947, "loss/crossentropy": 2.740684485435486, "loss/logits": 0.8409494936466217, "step": 29410 }, { "epoch": 0.2942, "grad_norm": 12.1875, "grad_norm_var": 0.21920572916666667, "learning_rate": 0.0003, "loss": 11.5673, "loss/aux_loss": 0.04807845540344715, "loss/crossentropy": 2.8806114912033083, "loss/logits": 0.8758183747529984, "step": 29420 }, { "epoch": 0.2943, "grad_norm": 12.8125, "grad_norm_var": 1.0633951822916667, "learning_rate": 0.0003, "loss": 11.4363, "loss/aux_loss": 0.04808698520064354, "loss/crossentropy": 2.717182183265686, "loss/logits": 0.8745023101568222, "step": 29430 }, { "epoch": 0.2944, "grad_norm": 13.25, "grad_norm_var": 0.4212890625, "learning_rate": 0.0003, "loss": 11.5294, "loss/aux_loss": 0.048078556172549726, "loss/crossentropy": 2.6900423645973204, "loss/logits": 0.8533391326665878, "step": 29440 }, { "epoch": 0.2945, "grad_norm": 13.8125, "grad_norm_var": 0.2962890625, "learning_rate": 0.0003, "loss": 11.4001, "loss/aux_loss": 0.0480863269418478, "loss/crossentropy": 2.7647065460681914, "loss/logits": 0.8730087608098984, "step": 29450 }, { "epoch": 0.2946, "grad_norm": 13.625, "grad_norm_var": 1.1960774739583333, "learning_rate": 0.0003, "loss": 11.5183, "loss/aux_loss": 0.0480956656858325, "loss/crossentropy": 2.675736755132675, "loss/logits": 0.8797785133123398, "step": 29460 }, { "epoch": 0.2947, "grad_norm": 12.8125, "grad_norm_var": 1.1954264322916666, "learning_rate": 0.0003, "loss": 11.3158, "loss/aux_loss": 0.0480833875015378, "loss/crossentropy": 2.6308836817741392, "loss/logits": 0.8395634293556213, "step": 29470 }, { "epoch": 0.2948, "grad_norm": 12.0625, "grad_norm_var": 0.5860514322916667, "learning_rate": 0.0003, "loss": 11.2813, "loss/aux_loss": 0.048089759424328804, "loss/crossentropy": 2.599923449754715, "loss/logits": 0.8390878111124038, "step": 29480 }, { "epoch": 0.2949, "grad_norm": 13.6875, "grad_norm_var": 0.5067545572916666, "learning_rate": 0.0003, "loss": 11.4816, "loss/aux_loss": 0.0480830617249012, "loss/crossentropy": 2.597203868627548, "loss/logits": 0.8285312354564667, "step": 29490 }, { "epoch": 0.295, "grad_norm": 12.5625, "grad_norm_var": 0.3780598958333333, "learning_rate": 0.0003, "loss": 11.416, "loss/aux_loss": 0.048088861629366875, "loss/crossentropy": 2.8947019577026367, "loss/logits": 0.8721114903688431, "step": 29500 }, { "epoch": 0.2951, "grad_norm": 12.9375, "grad_norm_var": 0.34698893229166666, "learning_rate": 0.0003, "loss": 11.4425, "loss/aux_loss": 0.048086998984217644, "loss/crossentropy": 2.6360188245773317, "loss/logits": 0.848075520992279, "step": 29510 }, { "epoch": 0.2952, "grad_norm": 12.125, "grad_norm_var": 0.190869140625, "learning_rate": 0.0003, "loss": 11.3914, "loss/aux_loss": 0.04809488840401173, "loss/crossentropy": 2.63630353808403, "loss/logits": 0.8704184353351593, "step": 29520 }, { "epoch": 0.2953, "grad_norm": 12.6875, "grad_norm_var": 0.31834309895833335, "learning_rate": 0.0003, "loss": 11.5943, "loss/aux_loss": 0.04807685688138008, "loss/crossentropy": 2.7015933096408844, "loss/logits": 0.8864135921001435, "step": 29530 }, { "epoch": 0.2954, "grad_norm": 12.6875, "grad_norm_var": 0.263525390625, "learning_rate": 0.0003, "loss": 11.5153, "loss/aux_loss": 0.04809237774461508, "loss/crossentropy": 2.8147780299186707, "loss/logits": 0.8690049260854721, "step": 29540 }, { "epoch": 0.2955, "grad_norm": 15.375, "grad_norm_var": 4.4134765625, "learning_rate": 0.0003, "loss": 11.5648, "loss/aux_loss": 0.048073606193065645, "loss/crossentropy": 2.8568318367004393, "loss/logits": 0.9089872241020203, "step": 29550 }, { "epoch": 0.2956, "grad_norm": 12.125, "grad_norm_var": 4.423372395833334, "learning_rate": 0.0003, "loss": 11.4575, "loss/aux_loss": 0.04808273408561945, "loss/crossentropy": 2.7235109508037567, "loss/logits": 0.8837243676185608, "step": 29560 }, { "epoch": 0.2957, "grad_norm": 13.75, "grad_norm_var": 0.4041015625, "learning_rate": 0.0003, "loss": 11.4504, "loss/aux_loss": 0.04809108339250088, "loss/crossentropy": 2.7183880388736723, "loss/logits": 0.8825681626796722, "step": 29570 }, { "epoch": 0.2958, "grad_norm": 12.875, "grad_norm_var": 0.15149739583333333, "learning_rate": 0.0003, "loss": 11.4876, "loss/aux_loss": 0.04807641636580229, "loss/crossentropy": 2.8469328343868257, "loss/logits": 0.8711580604314804, "step": 29580 }, { "epoch": 0.2959, "grad_norm": 12.4375, "grad_norm_var": 0.16300455729166666, "learning_rate": 0.0003, "loss": 11.5655, "loss/aux_loss": 0.04808410815894604, "loss/crossentropy": 2.7582703590393067, "loss/logits": 0.8958453744649887, "step": 29590 }, { "epoch": 0.296, "grad_norm": 14.3125, "grad_norm_var": 0.378125, "learning_rate": 0.0003, "loss": 11.587, "loss/aux_loss": 0.048079511523246764, "loss/crossentropy": 2.830457305908203, "loss/logits": 0.916206705570221, "step": 29600 }, { "epoch": 0.2961, "grad_norm": 12.625, "grad_norm_var": 0.6526041666666667, "learning_rate": 0.0003, "loss": 11.3545, "loss/aux_loss": 0.048083196952939035, "loss/crossentropy": 2.6322430610656737, "loss/logits": 0.8604376584291458, "step": 29610 }, { "epoch": 0.2962, "grad_norm": 13.625, "grad_norm_var": 0.5140625, "learning_rate": 0.0003, "loss": 11.3875, "loss/aux_loss": 0.048088392801582815, "loss/crossentropy": 2.7959298372268675, "loss/logits": 0.8871166080236434, "step": 29620 }, { "epoch": 0.2963, "grad_norm": 18.875, "grad_norm_var": 3.191389973958333, "learning_rate": 0.0003, "loss": 11.3276, "loss/aux_loss": 0.04808803517371416, "loss/crossentropy": 2.799188733100891, "loss/logits": 0.8698725253343582, "step": 29630 }, { "epoch": 0.2964, "grad_norm": 14.5625, "grad_norm_var": 2.5827962239583333, "learning_rate": 0.0003, "loss": 11.51, "loss/aux_loss": 0.04808425158262253, "loss/crossentropy": 2.8158665776252745, "loss/logits": 0.8623910456895828, "step": 29640 }, { "epoch": 0.2965, "grad_norm": 15.3125, "grad_norm_var": 0.7145833333333333, "learning_rate": 0.0003, "loss": 11.4356, "loss/aux_loss": 0.0480889655649662, "loss/crossentropy": 2.744274616241455, "loss/logits": 0.8680987030267715, "step": 29650 }, { "epoch": 0.2966, "grad_norm": 13.8125, "grad_norm_var": 0.6032389322916667, "learning_rate": 0.0003, "loss": 11.3509, "loss/aux_loss": 0.04808337744325399, "loss/crossentropy": 2.6652339160442353, "loss/logits": 0.8523492991924286, "step": 29660 }, { "epoch": 0.2967, "grad_norm": 13.3125, "grad_norm_var": 0.3489583333333333, "learning_rate": 0.0003, "loss": 11.4915, "loss/aux_loss": 0.04808827750384807, "loss/crossentropy": 2.8008382678031922, "loss/logits": 0.894950145483017, "step": 29670 }, { "epoch": 0.2968, "grad_norm": 15.375, "grad_norm_var": 0.7530598958333333, "learning_rate": 0.0003, "loss": 11.5014, "loss/aux_loss": 0.04807999767363071, "loss/crossentropy": 2.7002371549606323, "loss/logits": 0.8693482935428619, "step": 29680 }, { "epoch": 0.2969, "grad_norm": 13.25, "grad_norm_var": 0.5391764322916667, "learning_rate": 0.0003, "loss": 11.474, "loss/aux_loss": 0.048070631176233294, "loss/crossentropy": 2.921978032588959, "loss/logits": 0.899678111076355, "step": 29690 }, { "epoch": 0.297, "grad_norm": 14.0625, "grad_norm_var": 0.4332682291666667, "learning_rate": 0.0003, "loss": 11.5034, "loss/aux_loss": 0.048087138868868354, "loss/crossentropy": 2.7084551751613617, "loss/logits": 0.8722521513700485, "step": 29700 }, { "epoch": 0.2971, "grad_norm": 12.9375, "grad_norm_var": 0.5426432291666666, "learning_rate": 0.0003, "loss": 11.6086, "loss/aux_loss": 0.04807428289204836, "loss/crossentropy": 2.5567859768867494, "loss/logits": 0.8739277720451355, "step": 29710 }, { "epoch": 0.2972, "grad_norm": 12.5, "grad_norm_var": 0.34427083333333336, "learning_rate": 0.0003, "loss": 11.3334, "loss/aux_loss": 0.048082271590828896, "loss/crossentropy": 2.7359997153282167, "loss/logits": 0.8679700314998626, "step": 29720 }, { "epoch": 0.2973, "grad_norm": 13.1875, "grad_norm_var": 1.246337890625, "learning_rate": 0.0003, "loss": 11.3988, "loss/aux_loss": 0.04808435477316379, "loss/crossentropy": 2.658203488588333, "loss/logits": 0.8306466698646545, "step": 29730 }, { "epoch": 0.2974, "grad_norm": 13.5, "grad_norm_var": 0.9030598958333333, "learning_rate": 0.0003, "loss": 11.5435, "loss/aux_loss": 0.048076405003666875, "loss/crossentropy": 2.683915287256241, "loss/logits": 0.8744410634040832, "step": 29740 }, { "epoch": 0.2975, "grad_norm": 16.125, "grad_norm_var": 92.749462890625, "learning_rate": 0.0003, "loss": 11.5341, "loss/aux_loss": 0.04808039367198944, "loss/crossentropy": 2.7447816848754885, "loss/logits": 0.8636388152837753, "step": 29750 }, { "epoch": 0.2976, "grad_norm": 12.75, "grad_norm_var": 92.86678059895833, "learning_rate": 0.0003, "loss": 11.381, "loss/aux_loss": 0.048084843531250955, "loss/crossentropy": 2.6227781534194947, "loss/logits": 0.8834515571594238, "step": 29760 }, { "epoch": 0.2977, "grad_norm": 12.625, "grad_norm_var": 0.331884765625, "learning_rate": 0.0003, "loss": 11.4901, "loss/aux_loss": 0.04807569459080696, "loss/crossentropy": 2.6902407228946688, "loss/logits": 0.8586381793022155, "step": 29770 }, { "epoch": 0.2978, "grad_norm": 13.25, "grad_norm_var": 0.506884765625, "learning_rate": 0.0003, "loss": 11.4579, "loss/aux_loss": 0.04808189757168293, "loss/crossentropy": 2.9000410437583923, "loss/logits": 0.8773433297872544, "step": 29780 }, { "epoch": 0.2979, "grad_norm": 12.875, "grad_norm_var": 0.5541015625, "learning_rate": 0.0003, "loss": 11.4549, "loss/aux_loss": 0.048081548884510994, "loss/crossentropy": 2.702808624505997, "loss/logits": 0.8589540451765061, "step": 29790 }, { "epoch": 0.298, "grad_norm": 13.125, "grad_norm_var": 0.4332682291666667, "learning_rate": 0.0003, "loss": 11.5048, "loss/aux_loss": 0.04808537419885397, "loss/crossentropy": 2.7269632279872895, "loss/logits": 0.8750491231679917, "step": 29800 }, { "epoch": 0.2981, "grad_norm": 13.0625, "grad_norm_var": 0.23123372395833333, "learning_rate": 0.0003, "loss": 11.4269, "loss/aux_loss": 0.04808894339948892, "loss/crossentropy": 2.5503712058067323, "loss/logits": 0.8601103842258453, "step": 29810 }, { "epoch": 0.2982, "grad_norm": 13.5625, "grad_norm_var": 0.6322265625, "learning_rate": 0.0003, "loss": 11.4354, "loss/aux_loss": 0.048082937858998774, "loss/crossentropy": 2.692414093017578, "loss/logits": 0.8803679436445236, "step": 29820 }, { "epoch": 0.2983, "grad_norm": 13.1875, "grad_norm_var": 0.363525390625, "learning_rate": 0.0003, "loss": 11.4705, "loss/aux_loss": 0.048084008321166036, "loss/crossentropy": 2.779222333431244, "loss/logits": 0.8870409220457077, "step": 29830 }, { "epoch": 0.2984, "grad_norm": 12.5625, "grad_norm_var": 0.25636393229166665, "learning_rate": 0.0003, "loss": 11.3471, "loss/aux_loss": 0.04808189794421196, "loss/crossentropy": 2.8359466314315798, "loss/logits": 0.895777115225792, "step": 29840 }, { "epoch": 0.2985, "grad_norm": 14.625, "grad_norm_var": 0.4911295572916667, "learning_rate": 0.0003, "loss": 11.5757, "loss/aux_loss": 0.048088085278868674, "loss/crossentropy": 2.681327813863754, "loss/logits": 0.8474095374345779, "step": 29850 }, { "epoch": 0.2986, "grad_norm": 14.1875, "grad_norm_var": 1.6374837239583333, "learning_rate": 0.0003, "loss": 11.4771, "loss/aux_loss": 0.048079535365104675, "loss/crossentropy": 2.74518221616745, "loss/logits": 0.8559576362371445, "step": 29860 }, { "epoch": 0.2987, "grad_norm": 12.6875, "grad_norm_var": 0.528759765625, "learning_rate": 0.0003, "loss": 11.6162, "loss/aux_loss": 0.04807321559637785, "loss/crossentropy": 2.8557145297527313, "loss/logits": 0.9057471811771393, "step": 29870 }, { "epoch": 0.2988, "grad_norm": 12.875, "grad_norm_var": 0.8013020833333333, "learning_rate": 0.0003, "loss": 11.6681, "loss/aux_loss": 0.048088300973176956, "loss/crossentropy": 2.751220625638962, "loss/logits": 0.8958558738231659, "step": 29880 }, { "epoch": 0.2989, "grad_norm": 11.75, "grad_norm_var": 0.9364583333333333, "learning_rate": 0.0003, "loss": 11.4696, "loss/aux_loss": 0.04808711316436529, "loss/crossentropy": 2.7763246476650236, "loss/logits": 0.8446446388959885, "step": 29890 }, { "epoch": 0.299, "grad_norm": 13.25, "grad_norm_var": 0.60078125, "learning_rate": 0.0003, "loss": 11.3327, "loss/aux_loss": 0.048075996339321136, "loss/crossentropy": 2.47065726518631, "loss/logits": 0.8439252525568008, "step": 29900 }, { "epoch": 0.2991, "grad_norm": 13.125, "grad_norm_var": 0.4515462239583333, "learning_rate": 0.0003, "loss": 11.3397, "loss/aux_loss": 0.04808225966989994, "loss/crossentropy": 2.684304392337799, "loss/logits": 0.8657424867153167, "step": 29910 }, { "epoch": 0.2992, "grad_norm": 12.8125, "grad_norm_var": 0.081884765625, "learning_rate": 0.0003, "loss": 11.4104, "loss/aux_loss": 0.04808203261345625, "loss/crossentropy": 2.857515978813171, "loss/logits": 0.8850834548473359, "step": 29920 }, { "epoch": 0.2993, "grad_norm": 13.625, "grad_norm_var": 310.426806640625, "learning_rate": 0.0003, "loss": 11.3021, "loss/aux_loss": 0.04808754250407219, "loss/crossentropy": 2.865814244747162, "loss/logits": 0.9045403331518174, "step": 29930 }, { "epoch": 0.2994, "grad_norm": 13.0625, "grad_norm_var": 1.0432291666666667, "learning_rate": 0.0003, "loss": 11.5943, "loss/aux_loss": 0.04808537941426039, "loss/crossentropy": 2.7755528509616854, "loss/logits": 0.888872966170311, "step": 29940 }, { "epoch": 0.2995, "grad_norm": 12.4375, "grad_norm_var": 0.2540201822916667, "learning_rate": 0.0003, "loss": 11.3798, "loss/aux_loss": 0.048091139644384384, "loss/crossentropy": 2.5855644285678863, "loss/logits": 0.8455839395523072, "step": 29950 }, { "epoch": 0.2996, "grad_norm": 14.0, "grad_norm_var": 1.2926432291666667, "learning_rate": 0.0003, "loss": 11.4159, "loss/aux_loss": 0.04809227306395769, "loss/crossentropy": 2.719691050052643, "loss/logits": 0.8842676371335983, "step": 29960 }, { "epoch": 0.2997, "grad_norm": 13.125, "grad_norm_var": 1.37109375, "learning_rate": 0.0003, "loss": 11.4488, "loss/aux_loss": 0.0480900889262557, "loss/crossentropy": 2.6705852150917053, "loss/logits": 0.8526766896247864, "step": 29970 }, { "epoch": 0.2998, "grad_norm": 13.0, "grad_norm_var": 0.16365559895833334, "learning_rate": 0.0003, "loss": 11.4354, "loss/aux_loss": 0.04807566087692976, "loss/crossentropy": 2.8326764822006227, "loss/logits": 0.8908536106348037, "step": 29980 }, { "epoch": 0.2999, "grad_norm": 14.25, "grad_norm_var": 0.218603515625, "learning_rate": 0.0003, "loss": 11.4361, "loss/aux_loss": 0.04809534475207329, "loss/crossentropy": 2.653664433956146, "loss/logits": 0.8664328694343567, "step": 29990 }, { "epoch": 0.3, "grad_norm": 13.875, "grad_norm_var": 16.411393229166666, "learning_rate": 0.0003, "loss": 11.5995, "loss/aux_loss": 0.04808858595788479, "loss/crossentropy": 2.733062154054642, "loss/logits": 0.8457317858934402, "step": 30000 }, { "epoch": 0.3001, "grad_norm": 21.875, "grad_norm_var": 2748.9501139322915, "learning_rate": 0.0003, "loss": 11.4879, "loss/aux_loss": 0.0480996148660779, "loss/crossentropy": 2.7736578941345216, "loss/logits": 0.8403934806585311, "step": 30010 }, { "epoch": 0.3002, "grad_norm": 13.125, "grad_norm_var": 19.3478515625, "learning_rate": 0.0003, "loss": 11.4726, "loss/aux_loss": 0.048092894814908506, "loss/crossentropy": 2.705011248588562, "loss/logits": 0.8455720961093902, "step": 30020 }, { "epoch": 0.3003, "grad_norm": 12.125, "grad_norm_var": 37.18553059895833, "learning_rate": 0.0003, "loss": 11.4553, "loss/aux_loss": 0.04807716105133295, "loss/crossentropy": 2.9149852752685548, "loss/logits": 0.8917495250701905, "step": 30030 }, { "epoch": 0.3004, "grad_norm": 12.1875, "grad_norm_var": 37.059488932291664, "learning_rate": 0.0003, "loss": 11.3983, "loss/aux_loss": 0.04808226209133863, "loss/crossentropy": 2.841688472032547, "loss/logits": 0.8982161253690719, "step": 30040 }, { "epoch": 0.3005, "grad_norm": 12.8125, "grad_norm_var": 0.34451497395833336, "learning_rate": 0.0003, "loss": 11.6804, "loss/aux_loss": 0.04808265995234251, "loss/crossentropy": 2.737361544370651, "loss/logits": 0.8148900896310807, "step": 30050 }, { "epoch": 0.3006, "grad_norm": 14.3125, "grad_norm_var": 0.5416015625, "learning_rate": 0.0003, "loss": 11.6761, "loss/aux_loss": 0.04808789361268282, "loss/crossentropy": 2.577936816215515, "loss/logits": 0.8507110446691513, "step": 30060 }, { "epoch": 0.3007, "grad_norm": 15.125, "grad_norm_var": 4.1171875, "learning_rate": 0.0003, "loss": 11.6774, "loss/aux_loss": 0.04809730667620897, "loss/crossentropy": 2.743165111541748, "loss/logits": 0.8715814143419266, "step": 30070 }, { "epoch": 0.3008, "grad_norm": 12.625, "grad_norm_var": 1.2132649739583334, "learning_rate": 0.0003, "loss": 11.6082, "loss/aux_loss": 0.04807158131152391, "loss/crossentropy": 2.795542907714844, "loss/logits": 0.8549720883369446, "step": 30080 }, { "epoch": 0.3009, "grad_norm": 12.375, "grad_norm_var": 0.562744140625, "learning_rate": 0.0003, "loss": 11.5119, "loss/aux_loss": 0.04808140005916357, "loss/crossentropy": 2.8126461267471314, "loss/logits": 0.8966263324022293, "step": 30090 }, { "epoch": 0.301, "grad_norm": 12.75, "grad_norm_var": 0.65390625, "learning_rate": 0.0003, "loss": 11.4426, "loss/aux_loss": 0.048090949095785616, "loss/crossentropy": 2.7090745508670806, "loss/logits": 0.8385947048664093, "step": 30100 }, { "epoch": 0.3011, "grad_norm": 12.6875, "grad_norm_var": 0.6181640625, "learning_rate": 0.0003, "loss": 11.5944, "loss/aux_loss": 0.04808566849678755, "loss/crossentropy": 2.751387929916382, "loss/logits": 0.8912309646606446, "step": 30110 }, { "epoch": 0.3012, "grad_norm": 13.25, "grad_norm_var": 0.54609375, "learning_rate": 0.0003, "loss": 11.3972, "loss/aux_loss": 0.04808124527335167, "loss/crossentropy": 2.841317903995514, "loss/logits": 0.8788245469331741, "step": 30120 }, { "epoch": 0.3013, "grad_norm": 13.375, "grad_norm_var": 0.324462890625, "learning_rate": 0.0003, "loss": 11.5188, "loss/aux_loss": 0.04808822274208069, "loss/crossentropy": 2.6749501705169676, "loss/logits": 0.8571932524442673, "step": 30130 }, { "epoch": 0.3014, "grad_norm": 13.0, "grad_norm_var": 0.18631184895833333, "learning_rate": 0.0003, "loss": 11.5919, "loss/aux_loss": 0.04807674512267113, "loss/crossentropy": 2.8263603806495667, "loss/logits": 0.9167416036128998, "step": 30140 }, { "epoch": 0.3015, "grad_norm": 13.25, "grad_norm_var": 0.5544108072916667, "learning_rate": 0.0003, "loss": 11.3587, "loss/aux_loss": 0.04808600451797247, "loss/crossentropy": 2.689769744873047, "loss/logits": 0.8513695240020752, "step": 30150 }, { "epoch": 0.3016, "grad_norm": 13.5, "grad_norm_var": 0.20271809895833334, "learning_rate": 0.0003, "loss": 11.4439, "loss/aux_loss": 0.04808979425579309, "loss/crossentropy": 2.705520159006119, "loss/logits": 0.8287128508090973, "step": 30160 }, { "epoch": 0.3017, "grad_norm": 13.375, "grad_norm_var": 0.34373372395833335, "learning_rate": 0.0003, "loss": 11.3879, "loss/aux_loss": 0.04808113612234592, "loss/crossentropy": 2.59697830080986, "loss/logits": 0.8457653447985649, "step": 30170 }, { "epoch": 0.3018, "grad_norm": 13.3125, "grad_norm_var": 0.40572916666666664, "learning_rate": 0.0003, "loss": 11.6186, "loss/aux_loss": 0.04808783624321222, "loss/crossentropy": 2.7700432360172274, "loss/logits": 0.8764997065067291, "step": 30180 }, { "epoch": 0.3019, "grad_norm": 13.375, "grad_norm_var": 0.5805826822916667, "learning_rate": 0.0003, "loss": 11.4003, "loss/aux_loss": 0.04809058122336864, "loss/crossentropy": 2.7070785045623778, "loss/logits": 0.8807005375623703, "step": 30190 }, { "epoch": 0.302, "grad_norm": 12.0, "grad_norm_var": 0.5976399739583333, "learning_rate": 0.0003, "loss": 11.2816, "loss/aux_loss": 0.048085220903158185, "loss/crossentropy": 2.625184786319733, "loss/logits": 0.8463748693466187, "step": 30200 }, { "epoch": 0.3021, "grad_norm": 13.875, "grad_norm_var": 0.5113932291666666, "learning_rate": 0.0003, "loss": 11.4653, "loss/aux_loss": 0.04808960650116205, "loss/crossentropy": 2.690778136253357, "loss/logits": 0.8736082255840302, "step": 30210 }, { "epoch": 0.3022, "grad_norm": 13.375, "grad_norm_var": 0.38605143229166666, "learning_rate": 0.0003, "loss": 11.5442, "loss/aux_loss": 0.04808677174150944, "loss/crossentropy": 2.769177794456482, "loss/logits": 0.8669810116291046, "step": 30220 }, { "epoch": 0.3023, "grad_norm": 13.875, "grad_norm_var": 0.43279622395833334, "learning_rate": 0.0003, "loss": 11.3678, "loss/aux_loss": 0.04807732906192541, "loss/crossentropy": 2.66594517827034, "loss/logits": 0.8435944467782974, "step": 30230 }, { "epoch": 0.3024, "grad_norm": 13.625, "grad_norm_var": 0.349462890625, "learning_rate": 0.0003, "loss": 11.5848, "loss/aux_loss": 0.048086778819561006, "loss/crossentropy": 2.83836350440979, "loss/logits": 0.8659113794565201, "step": 30240 }, { "epoch": 0.3025, "grad_norm": 13.5, "grad_norm_var": 0.40305989583333335, "learning_rate": 0.0003, "loss": 11.3988, "loss/aux_loss": 0.04808458890765906, "loss/crossentropy": 2.716430550813675, "loss/logits": 0.8613810330629349, "step": 30250 }, { "epoch": 0.3026, "grad_norm": 65.0, "grad_norm_var": 168.16276041666666, "learning_rate": 0.0003, "loss": 11.4989, "loss/aux_loss": 0.04808981157839298, "loss/crossentropy": 2.8385813772678374, "loss/logits": 0.8725706160068512, "step": 30260 }, { "epoch": 0.3027, "grad_norm": 13.25, "grad_norm_var": 168.298681640625, "learning_rate": 0.0003, "loss": 11.4201, "loss/aux_loss": 0.04808289185166359, "loss/crossentropy": 2.7978179454803467, "loss/logits": 0.8753843992948532, "step": 30270 }, { "epoch": 0.3028, "grad_norm": 12.5625, "grad_norm_var": 0.8785807291666666, "learning_rate": 0.0003, "loss": 11.5324, "loss/aux_loss": 0.048077399097383025, "loss/crossentropy": 2.7661708891391754, "loss/logits": 0.8352512449026108, "step": 30280 }, { "epoch": 0.3029, "grad_norm": 14.0, "grad_norm_var": 0.48072916666666665, "learning_rate": 0.0003, "loss": 11.5311, "loss/aux_loss": 0.04808757621794939, "loss/crossentropy": 2.787020003795624, "loss/logits": 0.8474883437156677, "step": 30290 }, { "epoch": 0.303, "grad_norm": 13.875, "grad_norm_var": 0.32146809895833334, "learning_rate": 0.0003, "loss": 11.5551, "loss/aux_loss": 0.048091338202357295, "loss/crossentropy": 2.6601632058620455, "loss/logits": 0.8541019320487976, "step": 30300 }, { "epoch": 0.3031, "grad_norm": 14.1875, "grad_norm_var": 0.6218098958333333, "learning_rate": 0.0003, "loss": 11.4167, "loss/aux_loss": 0.04807760640978813, "loss/crossentropy": 2.621816486120224, "loss/logits": 0.8466038852930069, "step": 30310 }, { "epoch": 0.3032, "grad_norm": 13.5, "grad_norm_var": 0.8030598958333334, "learning_rate": 0.0003, "loss": 11.6232, "loss/aux_loss": 0.048088185116648675, "loss/crossentropy": 2.719381844997406, "loss/logits": 0.853942820429802, "step": 30320 }, { "epoch": 0.3033, "grad_norm": 12.0625, "grad_norm_var": 0.863134765625, "learning_rate": 0.0003, "loss": 11.4434, "loss/aux_loss": 0.04807737711817026, "loss/crossentropy": 2.677357393503189, "loss/logits": 0.8405762702226639, "step": 30330 }, { "epoch": 0.3034, "grad_norm": 12.875, "grad_norm_var": 0.522119140625, "learning_rate": 0.0003, "loss": 11.3732, "loss/aux_loss": 0.04808907844126224, "loss/crossentropy": 2.676799476146698, "loss/logits": 0.8608356237411499, "step": 30340 }, { "epoch": 0.3035, "grad_norm": 13.5625, "grad_norm_var": 0.465478515625, "learning_rate": 0.0003, "loss": 11.5126, "loss/aux_loss": 0.048095157742500304, "loss/crossentropy": 2.7750605642795563, "loss/logits": 0.8369805574417114, "step": 30350 }, { "epoch": 0.3036, "grad_norm": 13.125, "grad_norm_var": 0.25, "learning_rate": 0.0003, "loss": 11.4669, "loss/aux_loss": 0.04807246904820204, "loss/crossentropy": 2.611363673210144, "loss/logits": 0.8665720134973526, "step": 30360 }, { "epoch": 0.3037, "grad_norm": 12.875, "grad_norm_var": 0.29140625, "learning_rate": 0.0003, "loss": 11.637, "loss/aux_loss": 0.048090710304677486, "loss/crossentropy": 2.826841878890991, "loss/logits": 0.8832567691802978, "step": 30370 }, { "epoch": 0.3038, "grad_norm": 14.6875, "grad_norm_var": 3.5155598958333334, "learning_rate": 0.0003, "loss": 11.542, "loss/aux_loss": 0.04807953424751758, "loss/crossentropy": 2.7730814576148988, "loss/logits": 0.8635116755962372, "step": 30380 }, { "epoch": 0.3039, "grad_norm": 16.375, "grad_norm_var": 3.6962890625, "learning_rate": 0.0003, "loss": 11.4562, "loss/aux_loss": 0.048072515055537224, "loss/crossentropy": 2.73685462474823, "loss/logits": 0.8686755329370499, "step": 30390 }, { "epoch": 0.304, "grad_norm": 14.625, "grad_norm_var": 0.9533854166666667, "learning_rate": 0.0003, "loss": 11.5056, "loss/aux_loss": 0.048081375658512115, "loss/crossentropy": 2.789488208293915, "loss/logits": 0.8672916740179062, "step": 30400 }, { "epoch": 0.3041, "grad_norm": 13.375, "grad_norm_var": 0.4032389322916667, "learning_rate": 0.0003, "loss": 11.4649, "loss/aux_loss": 0.048084143362939355, "loss/crossentropy": 2.8491791486740112, "loss/logits": 0.8795545041561127, "step": 30410 }, { "epoch": 0.3042, "grad_norm": 12.3125, "grad_norm_var": 0.6163899739583333, "learning_rate": 0.0003, "loss": 11.4793, "loss/aux_loss": 0.04807379003614187, "loss/crossentropy": 2.7467067003250123, "loss/logits": 0.8532693386077881, "step": 30420 }, { "epoch": 0.3043, "grad_norm": 12.75, "grad_norm_var": 1.0354166666666667, "learning_rate": 0.0003, "loss": 11.3694, "loss/aux_loss": 0.04808585681021214, "loss/crossentropy": 2.574428778886795, "loss/logits": 0.837331035733223, "step": 30430 }, { "epoch": 0.3044, "grad_norm": 13.8125, "grad_norm_var": 0.3973795572916667, "learning_rate": 0.0003, "loss": 11.5313, "loss/aux_loss": 0.04808316696435213, "loss/crossentropy": 2.8282381296157837, "loss/logits": 0.8839503526687622, "step": 30440 }, { "epoch": 0.3045, "grad_norm": 14.375, "grad_norm_var": 0.5514973958333333, "learning_rate": 0.0003, "loss": 11.6023, "loss/aux_loss": 0.048081176541745665, "loss/crossentropy": 2.7252886414527895, "loss/logits": 0.8965796858072281, "step": 30450 }, { "epoch": 0.3046, "grad_norm": 13.5, "grad_norm_var": 0.6479166666666667, "learning_rate": 0.0003, "loss": 11.549, "loss/aux_loss": 0.048083511739969255, "loss/crossentropy": 2.711775553226471, "loss/logits": 0.8697979748249054, "step": 30460 }, { "epoch": 0.3047, "grad_norm": 12.6875, "grad_norm_var": 0.903125, "learning_rate": 0.0003, "loss": 11.5003, "loss/aux_loss": 0.048078321292996405, "loss/crossentropy": 2.7702556967735292, "loss/logits": 0.8688585251569748, "step": 30470 }, { "epoch": 0.3048, "grad_norm": 14.0625, "grad_norm_var": 0.4832682291666667, "learning_rate": 0.0003, "loss": 11.461, "loss/aux_loss": 0.04808118157088757, "loss/crossentropy": 2.677464705705643, "loss/logits": 0.8462830722332001, "step": 30480 }, { "epoch": 0.3049, "grad_norm": 12.75, "grad_norm_var": 0.42213541666666665, "learning_rate": 0.0003, "loss": 11.3989, "loss/aux_loss": 0.04808572474867105, "loss/crossentropy": 2.7207378327846525, "loss/logits": 0.9032052427530288, "step": 30490 }, { "epoch": 0.305, "grad_norm": 13.375, "grad_norm_var": 0.32265625, "learning_rate": 0.0003, "loss": 11.4989, "loss/aux_loss": 0.04807707965373993, "loss/crossentropy": 2.855698162317276, "loss/logits": 0.8542087256908417, "step": 30500 }, { "epoch": 0.3051, "grad_norm": 13.125, "grad_norm_var": 0.43411458333333336, "learning_rate": 0.0003, "loss": 11.5808, "loss/aux_loss": 0.04808335490524769, "loss/crossentropy": 2.7730906128883364, "loss/logits": 0.8658265113830567, "step": 30510 }, { "epoch": 0.3052, "grad_norm": 13.25, "grad_norm_var": 0.4462890625, "learning_rate": 0.0003, "loss": 11.3824, "loss/aux_loss": 0.04808741491287947, "loss/crossentropy": 2.6597979426383973, "loss/logits": 0.8335691154003143, "step": 30520 }, { "epoch": 0.3053, "grad_norm": 14.0625, "grad_norm_var": 0.2416015625, "learning_rate": 0.0003, "loss": 11.5599, "loss/aux_loss": 0.0480789877474308, "loss/crossentropy": 2.7108544588088987, "loss/logits": 0.8437099695205689, "step": 30530 }, { "epoch": 0.3054, "grad_norm": 14.25, "grad_norm_var": 0.5458333333333333, "learning_rate": 0.0003, "loss": 11.5408, "loss/aux_loss": 0.048078577406704424, "loss/crossentropy": 2.8623337388038634, "loss/logits": 0.8665123015642167, "step": 30540 }, { "epoch": 0.3055, "grad_norm": 13.125, "grad_norm_var": 0.7640462239583333, "learning_rate": 0.0003, "loss": 11.4355, "loss/aux_loss": 0.048077302612364294, "loss/crossentropy": 2.7096797108650206, "loss/logits": 0.8873968094587326, "step": 30550 }, { "epoch": 0.3056, "grad_norm": 13.125, "grad_norm_var": 0.31105143229166665, "learning_rate": 0.0003, "loss": 11.4985, "loss/aux_loss": 0.048081740364432336, "loss/crossentropy": 2.7807741165161133, "loss/logits": 0.8777556359767914, "step": 30560 }, { "epoch": 0.3057, "grad_norm": 12.75, "grad_norm_var": 0.27810872395833336, "learning_rate": 0.0003, "loss": 11.4167, "loss/aux_loss": 0.0480744456872344, "loss/crossentropy": 2.8844709157943726, "loss/logits": 0.9081658095121383, "step": 30570 }, { "epoch": 0.3058, "grad_norm": 12.8125, "grad_norm_var": 0.5905598958333333, "learning_rate": 0.0003, "loss": 11.4081, "loss/aux_loss": 0.04809431917965412, "loss/crossentropy": 2.6604327261447906, "loss/logits": 0.8287265658378601, "step": 30580 }, { "epoch": 0.3059, "grad_norm": 13.875, "grad_norm_var": 0.40545247395833334, "learning_rate": 0.0003, "loss": 11.4122, "loss/aux_loss": 0.048075050488114356, "loss/crossentropy": 2.70709490776062, "loss/logits": 0.8873141348361969, "step": 30590 }, { "epoch": 0.306, "grad_norm": 13.0, "grad_norm_var": 0.313134765625, "learning_rate": 0.0003, "loss": 11.4442, "loss/aux_loss": 0.04807507153600454, "loss/crossentropy": 2.7605147421360017, "loss/logits": 0.8720807194709778, "step": 30600 }, { "epoch": 0.3061, "grad_norm": 13.125, "grad_norm_var": 0.3611979166666667, "learning_rate": 0.0003, "loss": 11.4805, "loss/aux_loss": 0.04808742217719555, "loss/crossentropy": 2.649176824092865, "loss/logits": 0.8440777510404587, "step": 30610 }, { "epoch": 0.3062, "grad_norm": 13.875, "grad_norm_var": 0.20930989583333334, "learning_rate": 0.0003, "loss": 11.4971, "loss/aux_loss": 0.048072243295609954, "loss/crossentropy": 2.8579718112945556, "loss/logits": 0.8863743543624878, "step": 30620 }, { "epoch": 0.3063, "grad_norm": 12.875, "grad_norm_var": 0.15870768229166668, "learning_rate": 0.0003, "loss": 11.4342, "loss/aux_loss": 0.04809508193284273, "loss/crossentropy": 2.7085582673549653, "loss/logits": 0.855813917517662, "step": 30630 }, { "epoch": 0.3064, "grad_norm": 14.625, "grad_norm_var": 0.4129557291666667, "learning_rate": 0.0003, "loss": 11.5658, "loss/aux_loss": 0.048079350404441355, "loss/crossentropy": 2.8295456767082214, "loss/logits": 0.8438192725181579, "step": 30640 }, { "epoch": 0.3065, "grad_norm": 13.5625, "grad_norm_var": 0.9072265625, "learning_rate": 0.0003, "loss": 11.2753, "loss/aux_loss": 0.048086012713611126, "loss/crossentropy": 2.637072730064392, "loss/logits": 0.846964082121849, "step": 30650 }, { "epoch": 0.3066, "grad_norm": 13.0625, "grad_norm_var": 1.236962890625, "learning_rate": 0.0003, "loss": 11.4979, "loss/aux_loss": 0.04808483086526394, "loss/crossentropy": 2.7757628083229067, "loss/logits": 0.8778376072645188, "step": 30660 }, { "epoch": 0.3067, "grad_norm": 12.5625, "grad_norm_var": 0.5494140625, "learning_rate": 0.0003, "loss": 11.4651, "loss/aux_loss": 0.048084459826350213, "loss/crossentropy": 2.7548458218574523, "loss/logits": 0.9005285263061523, "step": 30670 }, { "epoch": 0.3068, "grad_norm": 14.5625, "grad_norm_var": 0.6807291666666667, "learning_rate": 0.0003, "loss": 11.6068, "loss/aux_loss": 0.048079180717468264, "loss/crossentropy": 2.737465226650238, "loss/logits": 0.8828152716159821, "step": 30680 }, { "epoch": 0.3069, "grad_norm": 13.375, "grad_norm_var": 0.46484375, "learning_rate": 0.0003, "loss": 11.4114, "loss/aux_loss": 0.04807949494570494, "loss/crossentropy": 2.727854001522064, "loss/logits": 0.8459988683462143, "step": 30690 }, { "epoch": 0.307, "grad_norm": 12.0625, "grad_norm_var": 0.50078125, "learning_rate": 0.0003, "loss": 11.3173, "loss/aux_loss": 0.048087647184729576, "loss/crossentropy": 2.7050256431102753, "loss/logits": 0.8718531727790833, "step": 30700 }, { "epoch": 0.3071, "grad_norm": 12.9375, "grad_norm_var": 3.343603515625, "learning_rate": 0.0003, "loss": 11.4537, "loss/aux_loss": 0.048082553595304486, "loss/crossentropy": 2.7552569687366484, "loss/logits": 0.8774021625518799, "step": 30710 }, { "epoch": 0.3072, "grad_norm": 12.4375, "grad_norm_var": 1.0231770833333333, "learning_rate": 0.0003, "loss": 11.3565, "loss/aux_loss": 0.04808600023388863, "loss/crossentropy": 2.6642140567302706, "loss/logits": 0.8275765240192413, "step": 30720 }, { "epoch": 0.3073, "grad_norm": 13.5625, "grad_norm_var": 0.5879557291666667, "learning_rate": 0.0003, "loss": 11.5265, "loss/aux_loss": 0.04808558952063322, "loss/crossentropy": 2.7769883573055267, "loss/logits": 0.8872695177793503, "step": 30730 }, { "epoch": 0.3074, "grad_norm": 12.5625, "grad_norm_var": 0.5175618489583333, "learning_rate": 0.0003, "loss": 11.4342, "loss/aux_loss": 0.048081249184906485, "loss/crossentropy": 2.8268501818180085, "loss/logits": 0.8817792952060699, "step": 30740 }, { "epoch": 0.3075, "grad_norm": 12.8125, "grad_norm_var": 0.558447265625, "learning_rate": 0.0003, "loss": 11.626, "loss/aux_loss": 0.048086341470479965, "loss/crossentropy": 2.7402640700340273, "loss/logits": 0.8773202210664749, "step": 30750 }, { "epoch": 0.3076, "grad_norm": 13.875, "grad_norm_var": 0.16588541666666667, "learning_rate": 0.0003, "loss": 11.6524, "loss/aux_loss": 0.048075375705957414, "loss/crossentropy": 2.6162120938301086, "loss/logits": 0.842825299501419, "step": 30760 }, { "epoch": 0.3077, "grad_norm": 12.6875, "grad_norm_var": 0.40911458333333334, "learning_rate": 0.0003, "loss": 11.5109, "loss/aux_loss": 0.04809289965778589, "loss/crossentropy": 2.719909155368805, "loss/logits": 0.8441285580396652, "step": 30770 }, { "epoch": 0.3078, "grad_norm": 12.875, "grad_norm_var": 0.6238932291666667, "learning_rate": 0.0003, "loss": 11.389, "loss/aux_loss": 0.048084799014031884, "loss/crossentropy": 2.7028889536857603, "loss/logits": 0.8539338052272797, "step": 30780 }, { "epoch": 0.3079, "grad_norm": 13.3125, "grad_norm_var": 0.3238932291666667, "learning_rate": 0.0003, "loss": 11.3617, "loss/aux_loss": 0.04808486420661211, "loss/crossentropy": 2.5718206644058226, "loss/logits": 0.8437188386917114, "step": 30790 }, { "epoch": 0.308, "grad_norm": 13.5, "grad_norm_var": 0.26608072916666664, "learning_rate": 0.0003, "loss": 11.4156, "loss/aux_loss": 0.04808387756347656, "loss/crossentropy": 2.6824039578437806, "loss/logits": 0.8235249221324921, "step": 30800 }, { "epoch": 0.3081, "grad_norm": 13.125, "grad_norm_var": 0.27005208333333336, "learning_rate": 0.0003, "loss": 11.4488, "loss/aux_loss": 0.048087064921855924, "loss/crossentropy": 2.7326850056648255, "loss/logits": 0.8537106692790986, "step": 30810 }, { "epoch": 0.3082, "grad_norm": 13.5625, "grad_norm_var": 0.23951822916666668, "learning_rate": 0.0003, "loss": 11.4492, "loss/aux_loss": 0.048091364093124866, "loss/crossentropy": 2.7636500120162966, "loss/logits": 0.8671251088380814, "step": 30820 }, { "epoch": 0.3083, "grad_norm": 13.4375, "grad_norm_var": 1.5900390625, "learning_rate": 0.0003, "loss": 11.502, "loss/aux_loss": 0.04808245934545994, "loss/crossentropy": 2.856794422864914, "loss/logits": 0.8604202717542648, "step": 30830 }, { "epoch": 0.3084, "grad_norm": 13.5, "grad_norm_var": 1.5712076822916667, "learning_rate": 0.0003, "loss": 11.37, "loss/aux_loss": 0.04808021280914545, "loss/crossentropy": 2.831040990352631, "loss/logits": 0.8481287360191345, "step": 30840 }, { "epoch": 0.3085, "grad_norm": 13.75, "grad_norm_var": 0.31326497395833336, "learning_rate": 0.0003, "loss": 11.3802, "loss/aux_loss": 0.04808556064963341, "loss/crossentropy": 2.7837505459785463, "loss/logits": 0.8858030140399933, "step": 30850 }, { "epoch": 0.3086, "grad_norm": 13.3125, "grad_norm_var": 0.6613118489583333, "learning_rate": 0.0003, "loss": 11.4733, "loss/aux_loss": 0.04808393772691488, "loss/crossentropy": 2.646303951740265, "loss/logits": 0.8927146643400192, "step": 30860 }, { "epoch": 0.3087, "grad_norm": 13.3125, "grad_norm_var": 1.0082682291666667, "learning_rate": 0.0003, "loss": 11.5069, "loss/aux_loss": 0.04806661605834961, "loss/crossentropy": 2.6964763700962067, "loss/logits": 0.8716022908687592, "step": 30870 }, { "epoch": 0.3088, "grad_norm": 13.625, "grad_norm_var": 1.622509765625, "learning_rate": 0.0003, "loss": 11.4855, "loss/aux_loss": 0.0480882540345192, "loss/crossentropy": 2.744866168498993, "loss/logits": 0.8746917128562928, "step": 30880 }, { "epoch": 0.3089, "grad_norm": 12.375, "grad_norm_var": 0.5468098958333333, "learning_rate": 0.0003, "loss": 11.4911, "loss/aux_loss": 0.04807100631296635, "loss/crossentropy": 2.820644873380661, "loss/logits": 0.8648158997297287, "step": 30890 }, { "epoch": 0.309, "grad_norm": 12.875, "grad_norm_var": 0.3555826822916667, "learning_rate": 0.0003, "loss": 11.3729, "loss/aux_loss": 0.048082031309604645, "loss/crossentropy": 2.7035711348056792, "loss/logits": 0.8440298497676849, "step": 30900 }, { "epoch": 0.3091, "grad_norm": 13.375, "grad_norm_var": 0.3395182291666667, "learning_rate": 0.0003, "loss": 11.5239, "loss/aux_loss": 0.048081082105636594, "loss/crossentropy": 2.6476317226886747, "loss/logits": 0.8694226413965225, "step": 30910 }, { "epoch": 0.3092, "grad_norm": 14.0625, "grad_norm_var": 0.27805989583333335, "learning_rate": 0.0003, "loss": 11.557, "loss/aux_loss": 0.04807517770677805, "loss/crossentropy": 2.789444291591644, "loss/logits": 0.8987109959125519, "step": 30920 }, { "epoch": 0.3093, "grad_norm": 13.0, "grad_norm_var": 0.448291015625, "learning_rate": 0.0003, "loss": 11.4581, "loss/aux_loss": 0.048085184581577775, "loss/crossentropy": 2.8388954520225527, "loss/logits": 0.8571620523929596, "step": 30930 }, { "epoch": 0.3094, "grad_norm": 13.5625, "grad_norm_var": 1.1176432291666667, "learning_rate": 0.0003, "loss": 11.4472, "loss/aux_loss": 0.04808578956872225, "loss/crossentropy": 2.7803068816661836, "loss/logits": 0.8517871230840683, "step": 30940 }, { "epoch": 0.3095, "grad_norm": 13.6875, "grad_norm_var": 0.879931640625, "learning_rate": 0.0003, "loss": 11.2953, "loss/aux_loss": 0.048075218498706815, "loss/crossentropy": 2.6304258346557616, "loss/logits": 0.8701532393693924, "step": 30950 }, { "epoch": 0.3096, "grad_norm": 14.1875, "grad_norm_var": 0.5738932291666666, "learning_rate": 0.0003, "loss": 11.4005, "loss/aux_loss": 0.04808662962168455, "loss/crossentropy": 2.7041739583015443, "loss/logits": 0.8431258827447892, "step": 30960 }, { "epoch": 0.3097, "grad_norm": 14.625, "grad_norm_var": 0.693212890625, "learning_rate": 0.0003, "loss": 11.1962, "loss/aux_loss": 0.04808198884129524, "loss/crossentropy": 2.5106098532676695, "loss/logits": 0.8334185928106308, "step": 30970 }, { "epoch": 0.3098, "grad_norm": 13.5625, "grad_norm_var": 0.5868326822916666, "learning_rate": 0.0003, "loss": 11.3845, "loss/aux_loss": 0.04808704257011413, "loss/crossentropy": 2.824685072898865, "loss/logits": 0.8856196343898773, "step": 30980 }, { "epoch": 0.3099, "grad_norm": 13.8125, "grad_norm_var": 0.5233723958333333, "learning_rate": 0.0003, "loss": 11.2844, "loss/aux_loss": 0.04808703400194645, "loss/crossentropy": 2.70513573884964, "loss/logits": 0.8594135075807572, "step": 30990 }, { "epoch": 0.31, "grad_norm": 12.8125, "grad_norm_var": 0.32537434895833334, "learning_rate": 0.0003, "loss": 11.431, "loss/aux_loss": 0.048076413199305536, "loss/crossentropy": 2.790517818927765, "loss/logits": 0.8762030184268952, "step": 31000 }, { "epoch": 0.3101, "grad_norm": 12.3125, "grad_norm_var": 0.309375, "learning_rate": 0.0003, "loss": 11.3837, "loss/aux_loss": 0.048087396286427976, "loss/crossentropy": 2.823656415939331, "loss/logits": 0.8393119305372239, "step": 31010 }, { "epoch": 0.3102, "grad_norm": 13.3125, "grad_norm_var": 38.86300455729167, "learning_rate": 0.0003, "loss": 11.455, "loss/aux_loss": 0.04807733986526728, "loss/crossentropy": 2.8026981115341187, "loss/logits": 0.8650432884693146, "step": 31020 }, { "epoch": 0.3103, "grad_norm": 17.125, "grad_norm_var": 1.2098958333333334, "learning_rate": 0.0003, "loss": 11.4485, "loss/aux_loss": 0.048109233193099496, "loss/crossentropy": 2.7331307351589205, "loss/logits": 0.8930859625339508, "step": 31030 }, { "epoch": 0.3104, "grad_norm": 14.125, "grad_norm_var": 1.227587890625, "learning_rate": 0.0003, "loss": 11.5582, "loss/aux_loss": 0.04805864728987217, "loss/crossentropy": 2.7618947505950926, "loss/logits": 0.858463802933693, "step": 31040 }, { "epoch": 0.3105, "grad_norm": 12.5625, "grad_norm_var": 0.5054524739583334, "learning_rate": 0.0003, "loss": 11.2948, "loss/aux_loss": 0.048091983795166014, "loss/crossentropy": 2.8342044055461884, "loss/logits": 0.8382025718688965, "step": 31050 }, { "epoch": 0.3106, "grad_norm": 13.3125, "grad_norm_var": 0.2530598958333333, "learning_rate": 0.0003, "loss": 11.3819, "loss/aux_loss": 0.04808224979788065, "loss/crossentropy": 2.667121487855911, "loss/logits": 0.8481329113245011, "step": 31060 }, { "epoch": 0.3107, "grad_norm": 13.5, "grad_norm_var": 0.192431640625, "learning_rate": 0.0003, "loss": 11.4812, "loss/aux_loss": 0.04808955937623978, "loss/crossentropy": 2.6511776447296143, "loss/logits": 0.8379587948322296, "step": 31070 }, { "epoch": 0.3108, "grad_norm": 12.375, "grad_norm_var": 0.3859375, "learning_rate": 0.0003, "loss": 11.4201, "loss/aux_loss": 0.048076062090694906, "loss/crossentropy": 2.909103608131409, "loss/logits": 0.9112180799245835, "step": 31080 }, { "epoch": 0.3109, "grad_norm": 12.3125, "grad_norm_var": 0.23605143229166667, "learning_rate": 0.0003, "loss": 11.5163, "loss/aux_loss": 0.04807505179196596, "loss/crossentropy": 2.720409429073334, "loss/logits": 0.8697555780410766, "step": 31090 }, { "epoch": 0.311, "grad_norm": 12.625, "grad_norm_var": 0.35149739583333334, "learning_rate": 0.0003, "loss": 11.3274, "loss/aux_loss": 0.04808955602347851, "loss/crossentropy": 2.6798146247863768, "loss/logits": 0.8508224755525589, "step": 31100 }, { "epoch": 0.3111, "grad_norm": 12.9375, "grad_norm_var": 0.9098307291666666, "learning_rate": 0.0003, "loss": 11.2321, "loss/aux_loss": 0.048081112653017045, "loss/crossentropy": 2.8907525897026063, "loss/logits": 0.8622247904539109, "step": 31110 }, { "epoch": 0.3112, "grad_norm": 14.0, "grad_norm_var": 0.6708333333333333, "learning_rate": 0.0003, "loss": 11.3221, "loss/aux_loss": 0.048092206753790376, "loss/crossentropy": 2.5958180367946624, "loss/logits": 0.8239250183105469, "step": 31120 }, { "epoch": 0.3113, "grad_norm": 13.625, "grad_norm_var": 0.5369791666666667, "learning_rate": 0.0003, "loss": 11.3484, "loss/aux_loss": 0.04808017313480377, "loss/crossentropy": 2.730751097202301, "loss/logits": 0.8445838242769241, "step": 31130 }, { "epoch": 0.3114, "grad_norm": 14.0625, "grad_norm_var": 1.043212890625, "learning_rate": 0.0003, "loss": 11.2755, "loss/aux_loss": 0.0480805704370141, "loss/crossentropy": 2.760500192642212, "loss/logits": 0.8478647708892822, "step": 31140 }, { "epoch": 0.3115, "grad_norm": 13.625, "grad_norm_var": 0.799462890625, "learning_rate": 0.0003, "loss": 11.5213, "loss/aux_loss": 0.048074069805443286, "loss/crossentropy": 2.8183942079544066, "loss/logits": 0.8743588626384735, "step": 31150 }, { "epoch": 0.3116, "grad_norm": 13.6875, "grad_norm_var": 0.36053059895833334, "learning_rate": 0.0003, "loss": 11.3418, "loss/aux_loss": 0.048081109300255775, "loss/crossentropy": 2.8368311285972596, "loss/logits": 0.8868528872728347, "step": 31160 }, { "epoch": 0.3117, "grad_norm": 13.75, "grad_norm_var": 0.474072265625, "learning_rate": 0.0003, "loss": 11.3342, "loss/aux_loss": 0.048071020655333994, "loss/crossentropy": 2.630265325307846, "loss/logits": 0.8686846703290939, "step": 31170 }, { "epoch": 0.3118, "grad_norm": 13.625, "grad_norm_var": 0.22420247395833334, "learning_rate": 0.0003, "loss": 11.6492, "loss/aux_loss": 0.04808335732668638, "loss/crossentropy": 2.9368000745773317, "loss/logits": 0.8898035645484924, "step": 31180 }, { "epoch": 0.3119, "grad_norm": 13.1875, "grad_norm_var": 0.237353515625, "learning_rate": 0.0003, "loss": 11.5728, "loss/aux_loss": 0.04807795882225037, "loss/crossentropy": 2.8425419092178346, "loss/logits": 0.8769685357809067, "step": 31190 }, { "epoch": 0.312, "grad_norm": 13.125, "grad_norm_var": 0.21145833333333333, "learning_rate": 0.0003, "loss": 11.5003, "loss/aux_loss": 0.04807908125221729, "loss/crossentropy": 2.8865275263786314, "loss/logits": 0.89976706802845, "step": 31200 }, { "epoch": 0.3121, "grad_norm": 13.0625, "grad_norm_var": 0.4032389322916667, "learning_rate": 0.0003, "loss": 11.448, "loss/aux_loss": 0.048086687363684176, "loss/crossentropy": 2.8929324388504027, "loss/logits": 0.8646660923957825, "step": 31210 }, { "epoch": 0.3122, "grad_norm": 13.625, "grad_norm_var": 3.517041015625, "learning_rate": 0.0003, "loss": 11.3664, "loss/aux_loss": 0.04807643294334411, "loss/crossentropy": 2.6457720398902893, "loss/logits": 0.8991645514965058, "step": 31220 }, { "epoch": 0.3123, "grad_norm": 13.5, "grad_norm_var": 3.8999348958333333, "learning_rate": 0.0003, "loss": 11.4684, "loss/aux_loss": 0.04808459933847189, "loss/crossentropy": 2.634989720582962, "loss/logits": 0.8654608964920044, "step": 31230 }, { "epoch": 0.3124, "grad_norm": 14.3125, "grad_norm_var": 0.4212890625, "learning_rate": 0.0003, "loss": 11.719, "loss/aux_loss": 0.04808775335550308, "loss/crossentropy": 2.8814554154872893, "loss/logits": 0.8489186823368072, "step": 31240 }, { "epoch": 0.3125, "grad_norm": 12.8125, "grad_norm_var": 0.6421223958333333, "learning_rate": 0.0003, "loss": 11.5017, "loss/aux_loss": 0.04807210192084312, "loss/crossentropy": 2.8812507152557374, "loss/logits": 0.8861448734998703, "step": 31250 }, { "epoch": 0.3126, "grad_norm": 13.8125, "grad_norm_var": 1.1890625, "learning_rate": 0.0003, "loss": 11.3459, "loss/aux_loss": 0.048083325289189814, "loss/crossentropy": 2.6456491708755494, "loss/logits": 0.8774872869253159, "step": 31260 }, { "epoch": 0.3127, "grad_norm": 14.5625, "grad_norm_var": 0.7696451822916667, "learning_rate": 0.0003, "loss": 11.4124, "loss/aux_loss": 0.048082329146564004, "loss/crossentropy": 2.855871230363846, "loss/logits": 0.8438941597938537, "step": 31270 }, { "epoch": 0.3128, "grad_norm": 12.6875, "grad_norm_var": 0.7869140625, "learning_rate": 0.0003, "loss": 11.2841, "loss/aux_loss": 0.048089153692126275, "loss/crossentropy": 2.7080669164657594, "loss/logits": 0.8581030815839767, "step": 31280 }, { "epoch": 0.3129, "grad_norm": 13.375, "grad_norm_var": 0.5102701822916667, "learning_rate": 0.0003, "loss": 11.5182, "loss/aux_loss": 0.048074636980891226, "loss/crossentropy": 2.7201138913631437, "loss/logits": 0.8733494013547898, "step": 31290 }, { "epoch": 0.313, "grad_norm": 12.4375, "grad_norm_var": 0.6463541666666667, "learning_rate": 0.0003, "loss": 11.5523, "loss/aux_loss": 0.04808723703026772, "loss/crossentropy": 2.778682363033295, "loss/logits": 0.8577165812253952, "step": 31300 }, { "epoch": 0.3131, "grad_norm": 13.5, "grad_norm_var": 0.2984375, "learning_rate": 0.0003, "loss": 11.4788, "loss/aux_loss": 0.048080189153552055, "loss/crossentropy": 2.9088799715042115, "loss/logits": 0.8727356672286988, "step": 31310 }, { "epoch": 0.3132, "grad_norm": 12.9375, "grad_norm_var": 0.17076822916666667, "learning_rate": 0.0003, "loss": 11.3828, "loss/aux_loss": 0.04808860644698143, "loss/crossentropy": 2.718260443210602, "loss/logits": 0.8305355608463287, "step": 31320 }, { "epoch": 0.3133, "grad_norm": 14.0625, "grad_norm_var": 0.379541015625, "learning_rate": 0.0003, "loss": 11.5571, "loss/aux_loss": 0.04808090459555388, "loss/crossentropy": 2.7178287625312807, "loss/logits": 0.8788865208625793, "step": 31330 }, { "epoch": 0.3134, "grad_norm": 15.75, "grad_norm_var": 1.0238932291666667, "learning_rate": 0.0003, "loss": 11.3747, "loss/aux_loss": 0.04807234313338995, "loss/crossentropy": 2.6713554739952086, "loss/logits": 0.8787687391042709, "step": 31340 }, { "epoch": 0.3135, "grad_norm": 12.8125, "grad_norm_var": 1.5390625, "learning_rate": 0.0003, "loss": 11.3459, "loss/aux_loss": 0.04808554369956255, "loss/crossentropy": 2.652766835689545, "loss/logits": 0.8742602497339249, "step": 31350 }, { "epoch": 0.3136, "grad_norm": 13.1875, "grad_norm_var": 1.2830729166666666, "learning_rate": 0.0003, "loss": 11.4199, "loss/aux_loss": 0.04807996340095997, "loss/crossentropy": 2.7023261964321135, "loss/logits": 0.8666334301233292, "step": 31360 }, { "epoch": 0.3137, "grad_norm": 11.9375, "grad_norm_var": 1.11953125, "learning_rate": 0.0003, "loss": 11.4297, "loss/aux_loss": 0.048081773333251476, "loss/crossentropy": 2.786700093746185, "loss/logits": 0.902243122458458, "step": 31370 }, { "epoch": 0.3138, "grad_norm": 13.0625, "grad_norm_var": 0.4800618489583333, "learning_rate": 0.0003, "loss": 11.4868, "loss/aux_loss": 0.048089880496263504, "loss/crossentropy": 2.808494824171066, "loss/logits": 0.8556656092405319, "step": 31380 }, { "epoch": 0.3139, "grad_norm": 14.1875, "grad_norm_var": 0.21484375, "learning_rate": 0.0003, "loss": 11.4171, "loss/aux_loss": 0.04808277599513531, "loss/crossentropy": 2.771555906534195, "loss/logits": 0.8765601277351379, "step": 31390 }, { "epoch": 0.314, "grad_norm": 12.6875, "grad_norm_var": 1.2710774739583333, "learning_rate": 0.0003, "loss": 11.6018, "loss/aux_loss": 0.048083768039941785, "loss/crossentropy": 2.888131785392761, "loss/logits": 0.8636642038822174, "step": 31400 }, { "epoch": 0.3141, "grad_norm": 13.3125, "grad_norm_var": 0.29375, "learning_rate": 0.0003, "loss": 11.3992, "loss/aux_loss": 0.048076036386191845, "loss/crossentropy": 2.5779885232448576, "loss/logits": 0.8284551709890365, "step": 31410 }, { "epoch": 0.3142, "grad_norm": 15.1875, "grad_norm_var": 0.45675455729166664, "learning_rate": 0.0003, "loss": 11.239, "loss/aux_loss": 0.0480815326794982, "loss/crossentropy": 2.857342076301575, "loss/logits": 0.866741943359375, "step": 31420 }, { "epoch": 0.3143, "grad_norm": 13.8125, "grad_norm_var": 0.4875, "learning_rate": 0.0003, "loss": 11.5045, "loss/aux_loss": 0.04808294028043747, "loss/crossentropy": 2.730056095123291, "loss/logits": 0.8772373676300049, "step": 31430 }, { "epoch": 0.3144, "grad_norm": 12.75, "grad_norm_var": 0.6671223958333333, "learning_rate": 0.0003, "loss": 11.4201, "loss/aux_loss": 0.0480757225304842, "loss/crossentropy": 2.8834362506866453, "loss/logits": 0.8785695016384125, "step": 31440 }, { "epoch": 0.3145, "grad_norm": 13.125, "grad_norm_var": 0.709375, "learning_rate": 0.0003, "loss": 11.2471, "loss/aux_loss": 0.048089978471398356, "loss/crossentropy": 2.6122034907341005, "loss/logits": 0.8332406580448151, "step": 31450 }, { "epoch": 0.3146, "grad_norm": 13.5625, "grad_norm_var": 0.6348307291666667, "learning_rate": 0.0003, "loss": 11.4156, "loss/aux_loss": 0.04808205291628838, "loss/crossentropy": 2.7532804131507875, "loss/logits": 0.8511408418416977, "step": 31460 }, { "epoch": 0.3147, "grad_norm": 12.5, "grad_norm_var": 1.6400390625, "learning_rate": 0.0003, "loss": 11.4242, "loss/aux_loss": 0.04809170886874199, "loss/crossentropy": 2.6764910399913786, "loss/logits": 0.8635757118463516, "step": 31470 }, { "epoch": 0.3148, "grad_norm": 13.0625, "grad_norm_var": 0.8389973958333333, "learning_rate": 0.0003, "loss": 11.2935, "loss/aux_loss": 0.04808271527290344, "loss/crossentropy": 2.6128583550453186, "loss/logits": 0.8351830154657364, "step": 31480 }, { "epoch": 0.3149, "grad_norm": 13.9375, "grad_norm_var": 3.76484375, "learning_rate": 0.0003, "loss": 11.6416, "loss/aux_loss": 0.048077508620917794, "loss/crossentropy": 2.8005987286567686, "loss/logits": 0.9104048877954483, "step": 31490 }, { "epoch": 0.315, "grad_norm": 14.75, "grad_norm_var": 7.100244140625, "learning_rate": 0.0003, "loss": 11.474, "loss/aux_loss": 0.04809277784079313, "loss/crossentropy": 2.739937108755112, "loss/logits": 0.8828804194927216, "step": 31500 }, { "epoch": 0.3151, "grad_norm": 13.75, "grad_norm_var": 5.307926432291667, "learning_rate": 0.0003, "loss": 11.4576, "loss/aux_loss": 0.04807978179305792, "loss/crossentropy": 2.912484383583069, "loss/logits": 0.9298472136259079, "step": 31510 }, { "epoch": 0.3152, "grad_norm": 13.3125, "grad_norm_var": 1.0136555989583333, "learning_rate": 0.0003, "loss": 11.4793, "loss/aux_loss": 0.04807396475225687, "loss/crossentropy": 2.864921712875366, "loss/logits": 0.8761105090379715, "step": 31520 }, { "epoch": 0.3153, "grad_norm": 13.0, "grad_norm_var": 0.35670572916666665, "learning_rate": 0.0003, "loss": 11.2979, "loss/aux_loss": 0.04808592237532139, "loss/crossentropy": 2.7697804093360903, "loss/logits": 0.8579605609178543, "step": 31530 }, { "epoch": 0.3154, "grad_norm": 14.5, "grad_norm_var": 0.5884765625, "learning_rate": 0.0003, "loss": 11.364, "loss/aux_loss": 0.048074756562709806, "loss/crossentropy": 2.720014047622681, "loss/logits": 0.8563840836286545, "step": 31540 }, { "epoch": 0.3155, "grad_norm": 15.3125, "grad_norm_var": 0.7206868489583333, "learning_rate": 0.0003, "loss": 11.3455, "loss/aux_loss": 0.048072342202067374, "loss/crossentropy": 2.643301236629486, "loss/logits": 0.8437513649463654, "step": 31550 }, { "epoch": 0.3156, "grad_norm": 38.5, "grad_norm_var": 39.195247395833334, "learning_rate": 0.0003, "loss": 11.5141, "loss/aux_loss": 0.04808412864804268, "loss/crossentropy": 2.726420682668686, "loss/logits": 0.8565041303634644, "step": 31560 }, { "epoch": 0.3157, "grad_norm": 13.3125, "grad_norm_var": 39.03489583333333, "learning_rate": 0.0003, "loss": 11.5841, "loss/aux_loss": 0.048086312040686606, "loss/crossentropy": 2.7990392088890075, "loss/logits": 0.866115254163742, "step": 31570 }, { "epoch": 0.3158, "grad_norm": 13.375, "grad_norm_var": 3.559830729166667, "learning_rate": 0.0003, "loss": 11.2449, "loss/aux_loss": 0.04807570930570364, "loss/crossentropy": 2.7598312139511108, "loss/logits": 0.848942118883133, "step": 31580 }, { "epoch": 0.3159, "grad_norm": 13.9375, "grad_norm_var": 0.4830729166666667, "learning_rate": 0.0003, "loss": 11.4392, "loss/aux_loss": 0.04808403309434652, "loss/crossentropy": 2.7267268300056458, "loss/logits": 0.8826425462961197, "step": 31590 }, { "epoch": 0.316, "grad_norm": 13.6875, "grad_norm_var": 0.37550455729166665, "learning_rate": 0.0003, "loss": 11.3998, "loss/aux_loss": 0.04807063583284617, "loss/crossentropy": 2.776882898807526, "loss/logits": 0.8546944618225097, "step": 31600 }, { "epoch": 0.3161, "grad_norm": 12.9375, "grad_norm_var": 0.8063639322916667, "learning_rate": 0.0003, "loss": 11.5461, "loss/aux_loss": 0.04809814915060997, "loss/crossentropy": 2.643150007724762, "loss/logits": 0.8487259536981583, "step": 31610 }, { "epoch": 0.3162, "grad_norm": 12.1875, "grad_norm_var": 0.8200358072916667, "learning_rate": 0.0003, "loss": 11.357, "loss/aux_loss": 0.0480756500735879, "loss/crossentropy": 2.7450030565261843, "loss/logits": 0.858446741104126, "step": 31620 }, { "epoch": 0.3163, "grad_norm": 12.5625, "grad_norm_var": 0.8395182291666666, "learning_rate": 0.0003, "loss": 11.4617, "loss/aux_loss": 0.04808495547622442, "loss/crossentropy": 2.8632676005363464, "loss/logits": 0.8507615506649018, "step": 31630 }, { "epoch": 0.3164, "grad_norm": 13.625, "grad_norm_var": 0.8075520833333333, "learning_rate": 0.0003, "loss": 11.3846, "loss/aux_loss": 0.048081559129059315, "loss/crossentropy": 2.681163477897644, "loss/logits": 0.8584868460893631, "step": 31640 }, { "epoch": 0.3165, "grad_norm": 14.0625, "grad_norm_var": 0.461572265625, "learning_rate": 0.0003, "loss": 11.3037, "loss/aux_loss": 0.04807944130152464, "loss/crossentropy": 2.757179379463196, "loss/logits": 0.8438855946063996, "step": 31650 }, { "epoch": 0.3166, "grad_norm": 13.125, "grad_norm_var": 0.30983072916666665, "learning_rate": 0.0003, "loss": 11.4289, "loss/aux_loss": 0.048075127974152566, "loss/crossentropy": 2.775540769100189, "loss/logits": 0.8543334901332855, "step": 31660 }, { "epoch": 0.3167, "grad_norm": 12.8125, "grad_norm_var": 0.24178059895833334, "learning_rate": 0.0003, "loss": 11.3283, "loss/aux_loss": 0.04808103535324335, "loss/crossentropy": 2.771780288219452, "loss/logits": 0.8589775919914245, "step": 31670 }, { "epoch": 0.3168, "grad_norm": 12.75, "grad_norm_var": 0.22146809895833333, "learning_rate": 0.0003, "loss": 11.5683, "loss/aux_loss": 0.04807479549199343, "loss/crossentropy": 2.889441192150116, "loss/logits": 0.8831599056720734, "step": 31680 }, { "epoch": 0.3169, "grad_norm": 14.5, "grad_norm_var": 0.4813639322916667, "learning_rate": 0.0003, "loss": 11.5118, "loss/aux_loss": 0.048088008724153045, "loss/crossentropy": 2.794116795063019, "loss/logits": 0.8760564774274826, "step": 31690 }, { "epoch": 0.317, "grad_norm": 17.875, "grad_norm_var": 60.6546875, "learning_rate": 0.0003, "loss": 11.4563, "loss/aux_loss": 0.04808449987322092, "loss/crossentropy": 2.7889586448669434, "loss/logits": 0.8560769230127334, "step": 31700 }, { "epoch": 0.3171, "grad_norm": 13.0, "grad_norm_var": 61.812434895833334, "learning_rate": 0.0003, "loss": 11.4129, "loss/aux_loss": 0.048075599037110804, "loss/crossentropy": 2.764506447315216, "loss/logits": 0.8688730716705322, "step": 31710 }, { "epoch": 0.3172, "grad_norm": 12.3125, "grad_norm_var": 1.903125, "learning_rate": 0.0003, "loss": 11.401, "loss/aux_loss": 0.04808210451155901, "loss/crossentropy": 2.6953455746173858, "loss/logits": 0.8976037502288818, "step": 31720 }, { "epoch": 0.3173, "grad_norm": 17.75, "grad_norm_var": 4.958968098958334, "learning_rate": 0.0003, "loss": 11.3902, "loss/aux_loss": 0.04808040298521519, "loss/crossentropy": 2.7925826787948607, "loss/logits": 0.8728821247816085, "step": 31730 }, { "epoch": 0.3174, "grad_norm": 12.4375, "grad_norm_var": 1.7452473958333334, "learning_rate": 0.0003, "loss": 11.5282, "loss/aux_loss": 0.048076373524963856, "loss/crossentropy": 2.8562302708625795, "loss/logits": 0.8475510686635971, "step": 31740 }, { "epoch": 0.3175, "grad_norm": 12.625, "grad_norm_var": 0.6619791666666667, "learning_rate": 0.0003, "loss": 11.366, "loss/aux_loss": 0.048085829429328444, "loss/crossentropy": 2.649816393852234, "loss/logits": 0.8313356369733811, "step": 31750 }, { "epoch": 0.3176, "grad_norm": 14.3125, "grad_norm_var": 0.8421223958333334, "learning_rate": 0.0003, "loss": 11.4549, "loss/aux_loss": 0.0480808213353157, "loss/crossentropy": 2.7275112867355347, "loss/logits": 0.8897455483675003, "step": 31760 }, { "epoch": 0.3177, "grad_norm": 13.5, "grad_norm_var": 0.5714680989583333, "learning_rate": 0.0003, "loss": 11.3229, "loss/aux_loss": 0.048082269914448264, "loss/crossentropy": 2.7526735663414, "loss/logits": 0.8474861830472946, "step": 31770 }, { "epoch": 0.3178, "grad_norm": 13.3125, "grad_norm_var": 0.47394205729166666, "learning_rate": 0.0003, "loss": 11.5803, "loss/aux_loss": 0.04808472171425819, "loss/crossentropy": 2.834140819311142, "loss/logits": 0.8541903674602509, "step": 31780 }, { "epoch": 0.3179, "grad_norm": 12.5, "grad_norm_var": 0.7118326822916666, "learning_rate": 0.0003, "loss": 11.3839, "loss/aux_loss": 0.04806725513190031, "loss/crossentropy": 2.5748830080032348, "loss/logits": 0.8210257202386856, "step": 31790 }, { "epoch": 0.318, "grad_norm": 13.25, "grad_norm_var": 0.6691243489583333, "learning_rate": 0.0003, "loss": 11.4991, "loss/aux_loss": 0.0480900140479207, "loss/crossentropy": 2.720593547821045, "loss/logits": 0.8570217370986939, "step": 31800 }, { "epoch": 0.3181, "grad_norm": 16.375, "grad_norm_var": 7.630208333333333, "learning_rate": 0.0003, "loss": 11.3505, "loss/aux_loss": 0.048079617135226724, "loss/crossentropy": 2.76233834028244, "loss/logits": 0.8391731053590774, "step": 31810 }, { "epoch": 0.3182, "grad_norm": 12.125, "grad_norm_var": 8.1328125, "learning_rate": 0.0003, "loss": 11.4925, "loss/aux_loss": 0.04807399399578571, "loss/crossentropy": 2.793236219882965, "loss/logits": 0.8872128367424011, "step": 31820 }, { "epoch": 0.3183, "grad_norm": 14.125, "grad_norm_var": 0.43527018229166664, "learning_rate": 0.0003, "loss": 11.4887, "loss/aux_loss": 0.048083842545747754, "loss/crossentropy": 2.860837161540985, "loss/logits": 0.8602665454149246, "step": 31830 }, { "epoch": 0.3184, "grad_norm": 14.125, "grad_norm_var": 0.341650390625, "learning_rate": 0.0003, "loss": 11.3578, "loss/aux_loss": 0.04807821288704872, "loss/crossentropy": 2.7253613471984863, "loss/logits": 0.8533663004636765, "step": 31840 }, { "epoch": 0.3185, "grad_norm": 15.75, "grad_norm_var": 625.615625, "learning_rate": 0.0003, "loss": 11.595, "loss/aux_loss": 0.04808409884572029, "loss/crossentropy": 2.833714520931244, "loss/logits": 0.8993043005466461, "step": 31850 }, { "epoch": 0.3186, "grad_norm": 13.5625, "grad_norm_var": 608.6546223958334, "learning_rate": 0.0003, "loss": 11.3319, "loss/aux_loss": 0.048097670264542106, "loss/crossentropy": 2.7307827293872835, "loss/logits": 0.8190259605646133, "step": 31860 }, { "epoch": 0.3187, "grad_norm": 14.375, "grad_norm_var": 4.886051432291667, "learning_rate": 0.0003, "loss": 11.449, "loss/aux_loss": 0.04808141943067312, "loss/crossentropy": 2.7856150209903716, "loss/logits": 0.851562550663948, "step": 31870 }, { "epoch": 0.3188, "grad_norm": 13.4375, "grad_norm_var": 0.31378580729166666, "learning_rate": 0.0003, "loss": 11.2744, "loss/aux_loss": 0.048080786131322384, "loss/crossentropy": 2.759511637687683, "loss/logits": 0.8260854959487915, "step": 31880 }, { "epoch": 0.3189, "grad_norm": 14.5, "grad_norm_var": 0.337353515625, "learning_rate": 0.0003, "loss": 11.4055, "loss/aux_loss": 0.048088025860488416, "loss/crossentropy": 2.7933689653873444, "loss/logits": 0.8317734956741333, "step": 31890 }, { "epoch": 0.319, "grad_norm": 13.8125, "grad_norm_var": 0.4176432291666667, "learning_rate": 0.0003, "loss": 11.4489, "loss/aux_loss": 0.04807045944035053, "loss/crossentropy": 2.671738988161087, "loss/logits": 0.8746799319982529, "step": 31900 }, { "epoch": 0.3191, "grad_norm": 13.125, "grad_norm_var": 0.5280598958333333, "learning_rate": 0.0003, "loss": 11.4261, "loss/aux_loss": 0.04807942863553762, "loss/crossentropy": 2.8266174256801606, "loss/logits": 0.9010621100664139, "step": 31910 }, { "epoch": 0.3192, "grad_norm": 14.1875, "grad_norm_var": 0.42823893229166665, "learning_rate": 0.0003, "loss": 11.3704, "loss/aux_loss": 0.04807835109531879, "loss/crossentropy": 2.8471252858638763, "loss/logits": 0.8496732413768768, "step": 31920 }, { "epoch": 0.3193, "grad_norm": 15.25, "grad_norm_var": 0.5960774739583333, "learning_rate": 0.0003, "loss": 11.3404, "loss/aux_loss": 0.04808904957026243, "loss/crossentropy": 2.7125583946704865, "loss/logits": 0.8533723443746567, "step": 31930 }, { "epoch": 0.3194, "grad_norm": 12.1875, "grad_norm_var": 0.6400390625, "learning_rate": 0.0003, "loss": 11.526, "loss/aux_loss": 0.048076958023011686, "loss/crossentropy": 2.847039544582367, "loss/logits": 0.8665654867887497, "step": 31940 }, { "epoch": 0.3195, "grad_norm": 12.6875, "grad_norm_var": 1.1645182291666667, "learning_rate": 0.0003, "loss": 11.3755, "loss/aux_loss": 0.04807892180979252, "loss/crossentropy": 2.6123467087745667, "loss/logits": 0.8488023519515991, "step": 31950 }, { "epoch": 0.3196, "grad_norm": 13.8125, "grad_norm_var": 0.5695149739583333, "learning_rate": 0.0003, "loss": 11.3545, "loss/aux_loss": 0.048081924021244046, "loss/crossentropy": 2.756511354446411, "loss/logits": 0.8485715210437774, "step": 31960 }, { "epoch": 0.3197, "grad_norm": 11.9375, "grad_norm_var": 0.6130045572916667, "learning_rate": 0.0003, "loss": 11.5651, "loss/aux_loss": 0.04808203242719174, "loss/crossentropy": 2.696575939655304, "loss/logits": 0.8660462826490403, "step": 31970 }, { "epoch": 0.3198, "grad_norm": 13.625, "grad_norm_var": 0.5766764322916667, "learning_rate": 0.0003, "loss": 11.5154, "loss/aux_loss": 0.048074822314083575, "loss/crossentropy": 2.7823503494262694, "loss/logits": 0.8631897240877151, "step": 31980 }, { "epoch": 0.3199, "grad_norm": 13.0625, "grad_norm_var": 0.6207682291666666, "learning_rate": 0.0003, "loss": 11.4817, "loss/aux_loss": 0.048085047490894796, "loss/crossentropy": 2.765849894285202, "loss/logits": 0.8870153099298477, "step": 31990 }, { "epoch": 0.32, "grad_norm": 13.3125, "grad_norm_var": 0.352197265625, "learning_rate": 0.0003, "loss": 11.4848, "loss/aux_loss": 0.048076517321169375, "loss/crossentropy": 2.818051886558533, "loss/logits": 0.8712541669607162, "step": 32000 }, { "epoch": 0.3201, "grad_norm": 14.0625, "grad_norm_var": 0.294384765625, "learning_rate": 0.0003, "loss": 11.4565, "loss/aux_loss": 0.0480777308344841, "loss/crossentropy": 2.7592976331710815, "loss/logits": 0.8439573287963867, "step": 32010 }, { "epoch": 0.3202, "grad_norm": 15.3125, "grad_norm_var": 759.9216145833333, "learning_rate": 0.0003, "loss": 11.679, "loss/aux_loss": 0.04811829086393118, "loss/crossentropy": 2.7429904997348786, "loss/logits": 0.87835733294487, "step": 32020 }, { "epoch": 0.3203, "grad_norm": 12.5625, "grad_norm_var": 14.346598307291666, "learning_rate": 0.0003, "loss": 11.4291, "loss/aux_loss": 0.04807977806776762, "loss/crossentropy": 2.733081966638565, "loss/logits": 0.8281262069940567, "step": 32030 }, { "epoch": 0.3204, "grad_norm": 13.0, "grad_norm_var": 0.744775390625, "learning_rate": 0.0003, "loss": 11.2668, "loss/aux_loss": 0.04808316174894571, "loss/crossentropy": 2.7543557405471804, "loss/logits": 0.8591379880905151, "step": 32040 }, { "epoch": 0.3205, "grad_norm": 13.8125, "grad_norm_var": 0.4337076822916667, "learning_rate": 0.0003, "loss": 11.4739, "loss/aux_loss": 0.04808379802852869, "loss/crossentropy": 2.8462088823318483, "loss/logits": 0.8858730256557464, "step": 32050 }, { "epoch": 0.3206, "grad_norm": 12.4375, "grad_norm_var": 0.350244140625, "learning_rate": 0.0003, "loss": 11.5575, "loss/aux_loss": 0.04808332268148661, "loss/crossentropy": 2.804873597621918, "loss/logits": 0.8518771290779114, "step": 32060 }, { "epoch": 0.3207, "grad_norm": 12.9375, "grad_norm_var": 0.7770182291666666, "learning_rate": 0.0003, "loss": 11.508, "loss/aux_loss": 0.0480858214199543, "loss/crossentropy": 2.7543485045433043, "loss/logits": 0.8625475555658341, "step": 32070 }, { "epoch": 0.3208, "grad_norm": 14.5625, "grad_norm_var": 0.5645182291666667, "learning_rate": 0.0003, "loss": 11.3667, "loss/aux_loss": 0.04808385856449604, "loss/crossentropy": 2.7623300909996034, "loss/logits": 0.838151478767395, "step": 32080 }, { "epoch": 0.3209, "grad_norm": 12.0625, "grad_norm_var": 0.557275390625, "learning_rate": 0.0003, "loss": 11.4308, "loss/aux_loss": 0.048071561940014364, "loss/crossentropy": 2.7351042151451113, "loss/logits": 0.8292459070682525, "step": 32090 }, { "epoch": 0.321, "grad_norm": 13.1875, "grad_norm_var": 0.52890625, "learning_rate": 0.0003, "loss": 11.4485, "loss/aux_loss": 0.048081991448998454, "loss/crossentropy": 2.742439067363739, "loss/logits": 0.8626155495643616, "step": 32100 }, { "epoch": 0.3211, "grad_norm": 13.4375, "grad_norm_var": 0.3035807291666667, "learning_rate": 0.0003, "loss": 11.2873, "loss/aux_loss": 0.048089691810309886, "loss/crossentropy": 2.7030389070510865, "loss/logits": 0.8663504242897033, "step": 32110 }, { "epoch": 0.3212, "grad_norm": 12.8125, "grad_norm_var": 2.847916666666667, "learning_rate": 0.0003, "loss": 11.4217, "loss/aux_loss": 0.04807127509266138, "loss/crossentropy": 2.885518616437912, "loss/logits": 0.8689032286405564, "step": 32120 }, { "epoch": 0.3213, "grad_norm": 13.0625, "grad_norm_var": 2.738655598958333, "learning_rate": 0.0003, "loss": 11.3219, "loss/aux_loss": 0.04809402395039797, "loss/crossentropy": 2.8164079904556276, "loss/logits": 0.8654607564210892, "step": 32130 }, { "epoch": 0.3214, "grad_norm": 16.0, "grad_norm_var": 0.8416015625, "learning_rate": 0.0003, "loss": 11.4046, "loss/aux_loss": 0.04806608557701111, "loss/crossentropy": 2.7131950318813325, "loss/logits": 0.8581384032964706, "step": 32140 }, { "epoch": 0.3215, "grad_norm": 13.375, "grad_norm_var": 0.9400390625, "learning_rate": 0.0003, "loss": 11.4208, "loss/aux_loss": 0.04809866081923246, "loss/crossentropy": 2.64786559343338, "loss/logits": 0.8615017741918564, "step": 32150 }, { "epoch": 0.3216, "grad_norm": 13.1875, "grad_norm_var": 1.1765625, "learning_rate": 0.0003, "loss": 11.3239, "loss/aux_loss": 0.04808507617563009, "loss/crossentropy": 2.799293911457062, "loss/logits": 0.8632352501153946, "step": 32160 }, { "epoch": 0.3217, "grad_norm": 13.0625, "grad_norm_var": 0.6638020833333333, "learning_rate": 0.0003, "loss": 11.6439, "loss/aux_loss": 0.048080069571733476, "loss/crossentropy": 2.813327169418335, "loss/logits": 0.8687012135982514, "step": 32170 }, { "epoch": 0.3218, "grad_norm": 14.125, "grad_norm_var": 0.4744140625, "learning_rate": 0.0003, "loss": 11.4474, "loss/aux_loss": 0.048083030991256236, "loss/crossentropy": 2.8561726570129395, "loss/logits": 0.848215913772583, "step": 32180 }, { "epoch": 0.3219, "grad_norm": 13.3125, "grad_norm_var": 20.351041666666667, "learning_rate": 0.0003, "loss": 11.3414, "loss/aux_loss": 0.04807794373482466, "loss/crossentropy": 2.7342350482940674, "loss/logits": 0.8707915544509888, "step": 32190 }, { "epoch": 0.322, "grad_norm": 12.875, "grad_norm_var": 19.792952473958334, "learning_rate": 0.0003, "loss": 11.4378, "loss/aux_loss": 0.04808533620089293, "loss/crossentropy": 2.751993161439896, "loss/logits": 0.8459922909736634, "step": 32200 }, { "epoch": 0.3221, "grad_norm": 14.0, "grad_norm_var": 178.95364583333333, "learning_rate": 0.0003, "loss": 11.4808, "loss/aux_loss": 0.048082702048122886, "loss/crossentropy": 2.855089473724365, "loss/logits": 0.8649806082248688, "step": 32210 }, { "epoch": 0.3222, "grad_norm": 14.9375, "grad_norm_var": 1.689306640625, "learning_rate": 0.0003, "loss": 11.4922, "loss/aux_loss": 0.04807514175772667, "loss/crossentropy": 2.5906433165073395, "loss/logits": 0.843293958902359, "step": 32220 }, { "epoch": 0.3223, "grad_norm": 12.9375, "grad_norm_var": 1.0794108072916666, "learning_rate": 0.0003, "loss": 11.2442, "loss/aux_loss": 0.048078662157058714, "loss/crossentropy": 2.815950345993042, "loss/logits": 0.8553645879030227, "step": 32230 }, { "epoch": 0.3224, "grad_norm": 13.875, "grad_norm_var": 0.636962890625, "learning_rate": 0.0003, "loss": 11.4003, "loss/aux_loss": 0.04807271771132946, "loss/crossentropy": 2.759478431940079, "loss/logits": 0.8785617917776107, "step": 32240 }, { "epoch": 0.3225, "grad_norm": 14.9375, "grad_norm_var": 0.6570149739583333, "learning_rate": 0.0003, "loss": 11.34, "loss/aux_loss": 0.048083205707371235, "loss/crossentropy": 2.76770259141922, "loss/logits": 0.8515851318836212, "step": 32250 }, { "epoch": 0.3226, "grad_norm": 14.4375, "grad_norm_var": 0.5434733072916667, "learning_rate": 0.0003, "loss": 11.3623, "loss/aux_loss": 0.04807707685977221, "loss/crossentropy": 2.7345820903778075, "loss/logits": 0.8831362873315811, "step": 32260 }, { "epoch": 0.3227, "grad_norm": 13.0, "grad_norm_var": 0.4853515625, "learning_rate": 0.0003, "loss": 11.4747, "loss/aux_loss": 0.04807059448212385, "loss/crossentropy": 2.7904665589332582, "loss/logits": 0.8971100717782974, "step": 32270 }, { "epoch": 0.3228, "grad_norm": 13.25, "grad_norm_var": 0.49114583333333334, "learning_rate": 0.0003, "loss": 11.4368, "loss/aux_loss": 0.04808034915477037, "loss/crossentropy": 2.7924468517303467, "loss/logits": 0.8524171829223632, "step": 32280 }, { "epoch": 0.3229, "grad_norm": 13.4375, "grad_norm_var": 3.166145833333333, "learning_rate": 0.0003, "loss": 11.3999, "loss/aux_loss": 0.04808408003300428, "loss/crossentropy": 2.6971071362495422, "loss/logits": 0.8586209654808045, "step": 32290 }, { "epoch": 0.323, "grad_norm": 14.3125, "grad_norm_var": 0.5126139322916666, "learning_rate": 0.0003, "loss": 11.4116, "loss/aux_loss": 0.048079677298665045, "loss/crossentropy": 2.803563630580902, "loss/logits": 0.8560720324516297, "step": 32300 }, { "epoch": 0.3231, "grad_norm": 12.0625, "grad_norm_var": 0.36712239583333334, "learning_rate": 0.0003, "loss": 11.3175, "loss/aux_loss": 0.04807909969240427, "loss/crossentropy": 2.779414027929306, "loss/logits": 0.8658244550228119, "step": 32310 }, { "epoch": 0.3232, "grad_norm": 12.9375, "grad_norm_var": 0.3101399739583333, "learning_rate": 0.0003, "loss": 11.2228, "loss/aux_loss": 0.048088131844997405, "loss/crossentropy": 2.6179952681064607, "loss/logits": 0.81593057513237, "step": 32320 }, { "epoch": 0.3233, "grad_norm": 13.3125, "grad_norm_var": 0.11521809895833333, "learning_rate": 0.0003, "loss": 11.397, "loss/aux_loss": 0.04808655325323343, "loss/crossentropy": 2.8150524377822874, "loss/logits": 0.8867930352687836, "step": 32330 }, { "epoch": 0.3234, "grad_norm": 13.125, "grad_norm_var": 0.21145833333333333, "learning_rate": 0.0003, "loss": 11.2842, "loss/aux_loss": 0.04808272738009691, "loss/crossentropy": 2.7270405888557434, "loss/logits": 0.8832971513271332, "step": 32340 }, { "epoch": 0.3235, "grad_norm": 13.625, "grad_norm_var": 1.1784993489583333, "learning_rate": 0.0003, "loss": 11.5892, "loss/aux_loss": 0.0480838356539607, "loss/crossentropy": 2.7016100168228148, "loss/logits": 0.8885600864887238, "step": 32350 }, { "epoch": 0.3236, "grad_norm": 12.9375, "grad_norm_var": 0.38605143229166666, "learning_rate": 0.0003, "loss": 11.2312, "loss/aux_loss": 0.0480771854519844, "loss/crossentropy": 2.814963674545288, "loss/logits": 0.8497733741998672, "step": 32360 }, { "epoch": 0.3237, "grad_norm": 13.5, "grad_norm_var": 0.35618489583333335, "learning_rate": 0.0003, "loss": 11.263, "loss/aux_loss": 0.048084933497011664, "loss/crossentropy": 2.7236967265605925, "loss/logits": 0.8645591795444488, "step": 32370 }, { "epoch": 0.3238, "grad_norm": 13.8125, "grad_norm_var": 0.5859212239583333, "learning_rate": 0.0003, "loss": 11.5835, "loss/aux_loss": 0.04807772319763899, "loss/crossentropy": 2.7548747062683105, "loss/logits": 0.8677338659763336, "step": 32380 }, { "epoch": 0.3239, "grad_norm": 14.375, "grad_norm_var": 0.41399739583333334, "learning_rate": 0.0003, "loss": 11.2919, "loss/aux_loss": 0.0480786906555295, "loss/crossentropy": 2.8514682233333586, "loss/logits": 0.850545859336853, "step": 32390 }, { "epoch": 0.324, "grad_norm": 12.875, "grad_norm_var": 0.6752604166666667, "learning_rate": 0.0003, "loss": 11.3948, "loss/aux_loss": 0.048079483583569525, "loss/crossentropy": 2.9048630833625793, "loss/logits": 0.8999389052391052, "step": 32400 }, { "epoch": 0.3241, "grad_norm": 12.9375, "grad_norm_var": 43.757747395833334, "learning_rate": 0.0003, "loss": 11.3345, "loss/aux_loss": 0.04807599056512117, "loss/crossentropy": 2.835081601142883, "loss/logits": 0.8689920961856842, "step": 32410 }, { "epoch": 0.3242, "grad_norm": 13.375, "grad_norm_var": 41.23292643229167, "learning_rate": 0.0003, "loss": 11.4935, "loss/aux_loss": 0.04808198846876621, "loss/crossentropy": 2.7409429788589477, "loss/logits": 0.852924308180809, "step": 32420 }, { "epoch": 0.3243, "grad_norm": 14.0625, "grad_norm_var": 0.7806640625, "learning_rate": 0.0003, "loss": 11.3053, "loss/aux_loss": 0.04807913806289434, "loss/crossentropy": 2.697390305995941, "loss/logits": 0.8261424988508225, "step": 32430 }, { "epoch": 0.3244, "grad_norm": 13.1875, "grad_norm_var": 0.3082682291666667, "learning_rate": 0.0003, "loss": 11.4026, "loss/aux_loss": 0.04807761088013649, "loss/crossentropy": 2.8476333379745484, "loss/logits": 0.8590798646211624, "step": 32440 }, { "epoch": 0.3245, "grad_norm": 13.0625, "grad_norm_var": 0.6376139322916666, "learning_rate": 0.0003, "loss": 11.3744, "loss/aux_loss": 0.0480758473277092, "loss/crossentropy": 2.7274765491485597, "loss/logits": 0.8521047949790954, "step": 32450 }, { "epoch": 0.3246, "grad_norm": 12.5, "grad_norm_var": 0.79140625, "learning_rate": 0.0003, "loss": 11.4257, "loss/aux_loss": 0.048076083324849604, "loss/crossentropy": 2.7871260046958923, "loss/logits": 0.8288904428482056, "step": 32460 }, { "epoch": 0.3247, "grad_norm": 12.9375, "grad_norm_var": 0.2708333333333333, "learning_rate": 0.0003, "loss": 11.2386, "loss/aux_loss": 0.048092946968972684, "loss/crossentropy": 2.6454286336898805, "loss/logits": 0.8405205219984054, "step": 32470 }, { "epoch": 0.3248, "grad_norm": 14.125, "grad_norm_var": 0.6605305989583333, "learning_rate": 0.0003, "loss": 11.4729, "loss/aux_loss": 0.048083856143057345, "loss/crossentropy": 2.8707290291786194, "loss/logits": 0.8646343678236008, "step": 32480 }, { "epoch": 0.3249, "grad_norm": 13.125, "grad_norm_var": 1.0945149739583333, "learning_rate": 0.0003, "loss": 11.4695, "loss/aux_loss": 0.04808031674474478, "loss/crossentropy": 2.734501177072525, "loss/logits": 0.8618703633546829, "step": 32490 }, { "epoch": 0.325, "grad_norm": 13.625, "grad_norm_var": 0.342822265625, "learning_rate": 0.0003, "loss": 11.3474, "loss/aux_loss": 0.0480826249346137, "loss/crossentropy": 2.4993535339832307, "loss/logits": 0.8169450134038925, "step": 32500 }, { "epoch": 0.3251, "grad_norm": 13.0625, "grad_norm_var": 0.24256184895833333, "learning_rate": 0.0003, "loss": 11.1817, "loss/aux_loss": 0.04808486234396696, "loss/crossentropy": 2.7927271008491514, "loss/logits": 0.8319184005260467, "step": 32510 }, { "epoch": 0.3252, "grad_norm": 13.4375, "grad_norm_var": 0.43333333333333335, "learning_rate": 0.0003, "loss": 11.4204, "loss/aux_loss": 0.048070120811462405, "loss/crossentropy": 2.7226893484592436, "loss/logits": 0.842848926782608, "step": 32520 }, { "epoch": 0.3253, "grad_norm": 14.25, "grad_norm_var": 2.196875, "learning_rate": 0.0003, "loss": 11.338, "loss/aux_loss": 0.04808647688478231, "loss/crossentropy": 2.754758191108704, "loss/logits": 0.8788342326879501, "step": 32530 }, { "epoch": 0.3254, "grad_norm": 13.3125, "grad_norm_var": 2.3486979166666666, "learning_rate": 0.0003, "loss": 11.4313, "loss/aux_loss": 0.04807833395898342, "loss/crossentropy": 2.6338735044002535, "loss/logits": 0.8634207069873809, "step": 32540 }, { "epoch": 0.3255, "grad_norm": 12.875, "grad_norm_var": 0.24036458333333333, "learning_rate": 0.0003, "loss": 11.3306, "loss/aux_loss": 0.04807445779442787, "loss/crossentropy": 2.74268923997879, "loss/logits": 0.8131024420261384, "step": 32550 }, { "epoch": 0.3256, "grad_norm": 13.5, "grad_norm_var": 0.3675618489583333, "learning_rate": 0.0003, "loss": 11.5403, "loss/aux_loss": 0.04809182155877352, "loss/crossentropy": 2.729319167137146, "loss/logits": 0.8604931205511093, "step": 32560 }, { "epoch": 0.3257, "grad_norm": 13.8125, "grad_norm_var": 0.37159830729166665, "learning_rate": 0.0003, "loss": 11.2922, "loss/aux_loss": 0.04806775096803904, "loss/crossentropy": 2.7165225446224213, "loss/logits": 0.8697218716144561, "step": 32570 }, { "epoch": 0.3258, "grad_norm": 13.0625, "grad_norm_var": 0.6143229166666667, "learning_rate": 0.0003, "loss": 11.5015, "loss/aux_loss": 0.04808111321181059, "loss/crossentropy": 2.6520249009132386, "loss/logits": 0.8677373945713043, "step": 32580 }, { "epoch": 0.3259, "grad_norm": 13.5, "grad_norm_var": 0.3104166666666667, "learning_rate": 0.0003, "loss": 11.2081, "loss/aux_loss": 0.04808580782264471, "loss/crossentropy": 2.626668655872345, "loss/logits": 0.8411953181028367, "step": 32590 }, { "epoch": 0.326, "grad_norm": 13.375, "grad_norm_var": 0.19869791666666667, "learning_rate": 0.0003, "loss": 11.4345, "loss/aux_loss": 0.04807950202375651, "loss/crossentropy": 2.6047201275825502, "loss/logits": 0.8363840937614441, "step": 32600 }, { "epoch": 0.3261, "grad_norm": 13.3125, "grad_norm_var": 0.225, "learning_rate": 0.0003, "loss": 11.4163, "loss/aux_loss": 0.04808168914169073, "loss/crossentropy": 2.745485466718674, "loss/logits": 0.855038857460022, "step": 32610 }, { "epoch": 0.3262, "grad_norm": 13.9375, "grad_norm_var": 66.59993489583333, "learning_rate": 0.0003, "loss": 11.5196, "loss/aux_loss": 0.048089150339365005, "loss/crossentropy": 2.6984958589076995, "loss/logits": 0.8720762193202972, "step": 32620 }, { "epoch": 0.3263, "grad_norm": 15.25, "grad_norm_var": 64.37161458333334, "learning_rate": 0.0003, "loss": 11.5008, "loss/aux_loss": 0.04807731341570616, "loss/crossentropy": 2.6679943084716795, "loss/logits": 0.859524542093277, "step": 32630 }, { "epoch": 0.3264, "grad_norm": 14.0, "grad_norm_var": 0.6309733072916667, "learning_rate": 0.0003, "loss": 11.4637, "loss/aux_loss": 0.04808408729732037, "loss/crossentropy": 2.699479818344116, "loss/logits": 0.84074946641922, "step": 32640 }, { "epoch": 0.3265, "grad_norm": 15.9375, "grad_norm_var": 0.9561848958333333, "learning_rate": 0.0003, "loss": 11.3639, "loss/aux_loss": 0.04807748533785343, "loss/crossentropy": 2.654457098245621, "loss/logits": 0.8376824676990509, "step": 32650 }, { "epoch": 0.3266, "grad_norm": 13.125, "grad_norm_var": 2.091259765625, "learning_rate": 0.0003, "loss": 11.4042, "loss/aux_loss": 0.04807919319719076, "loss/crossentropy": 2.6513519108295442, "loss/logits": 0.8539661675691604, "step": 32660 }, { "epoch": 0.3267, "grad_norm": 13.25, "grad_norm_var": 0.7520670572916667, "learning_rate": 0.0003, "loss": 11.331, "loss/aux_loss": 0.04807972591370344, "loss/crossentropy": 2.807789134979248, "loss/logits": 0.8730264127254486, "step": 32670 }, { "epoch": 0.3268, "grad_norm": 15.5, "grad_norm_var": 0.9223307291666667, "learning_rate": 0.0003, "loss": 11.4216, "loss/aux_loss": 0.048074715211987494, "loss/crossentropy": 2.852227210998535, "loss/logits": 0.8674792051315308, "step": 32680 }, { "epoch": 0.3269, "grad_norm": 12.5625, "grad_norm_var": 0.9957682291666666, "learning_rate": 0.0003, "loss": 11.4985, "loss/aux_loss": 0.04807224553078413, "loss/crossentropy": 2.6428285241127014, "loss/logits": 0.8319959819316864, "step": 32690 }, { "epoch": 0.327, "grad_norm": 12.8125, "grad_norm_var": 1.8821451822916666, "learning_rate": 0.0003, "loss": 11.3336, "loss/aux_loss": 0.048086940124630925, "loss/crossentropy": 2.7363753497600554, "loss/logits": 0.8253339737653732, "step": 32700 }, { "epoch": 0.3271, "grad_norm": 15.5625, "grad_norm_var": 1.7759765625, "learning_rate": 0.0003, "loss": 11.5912, "loss/aux_loss": 0.04807717055082321, "loss/crossentropy": 2.7939674854278564, "loss/logits": 0.8734579056501388, "step": 32710 }, { "epoch": 0.3272, "grad_norm": 13.8125, "grad_norm_var": 1.1382649739583333, "learning_rate": 0.0003, "loss": 11.3646, "loss/aux_loss": 0.04808651357889175, "loss/crossentropy": 2.657242488861084, "loss/logits": 0.8325454264879226, "step": 32720 }, { "epoch": 0.3273, "grad_norm": 13.3125, "grad_norm_var": 1.162353515625, "learning_rate": 0.0003, "loss": 11.3746, "loss/aux_loss": 0.048074505664408206, "loss/crossentropy": 2.7361601114273073, "loss/logits": 0.8744795680046081, "step": 32730 }, { "epoch": 0.3274, "grad_norm": 12.9375, "grad_norm_var": 0.37237955729166666, "learning_rate": 0.0003, "loss": 11.2877, "loss/aux_loss": 0.04808551985770464, "loss/crossentropy": 2.480491054058075, "loss/logits": 0.8157087236642837, "step": 32740 }, { "epoch": 0.3275, "grad_norm": 13.1875, "grad_norm_var": 0.95078125, "learning_rate": 0.0003, "loss": 11.2748, "loss/aux_loss": 0.04807721842080355, "loss/crossentropy": 2.7901974260807036, "loss/logits": 0.8725056558847427, "step": 32750 }, { "epoch": 0.3276, "grad_norm": 13.5625, "grad_norm_var": 0.8020670572916667, "learning_rate": 0.0003, "loss": 11.4644, "loss/aux_loss": 0.04809094499796629, "loss/crossentropy": 2.6982684254646303, "loss/logits": 0.8750581175088883, "step": 32760 }, { "epoch": 0.3277, "grad_norm": 14.125, "grad_norm_var": 0.9791015625, "learning_rate": 0.0003, "loss": 11.3499, "loss/aux_loss": 0.04807215426117182, "loss/crossentropy": 2.56101336479187, "loss/logits": 0.859020522236824, "step": 32770 }, { "epoch": 0.3278, "grad_norm": 13.4375, "grad_norm_var": 1.1011555989583333, "learning_rate": 0.0003, "loss": 11.3663, "loss/aux_loss": 0.048077127523720266, "loss/crossentropy": 2.7586211442947386, "loss/logits": 0.8794794708490372, "step": 32780 }, { "epoch": 0.3279, "grad_norm": 14.625, "grad_norm_var": 0.3056640625, "learning_rate": 0.0003, "loss": 11.575, "loss/aux_loss": 0.04807180892676115, "loss/crossentropy": 2.8552963852882387, "loss/logits": 0.9014603316783905, "step": 32790 }, { "epoch": 0.328, "grad_norm": 13.0625, "grad_norm_var": 0.448681640625, "learning_rate": 0.0003, "loss": 11.659, "loss/aux_loss": 0.048085894994437696, "loss/crossentropy": 2.7301037549972533, "loss/logits": 0.8675024837255478, "step": 32800 }, { "epoch": 0.3281, "grad_norm": 13.6875, "grad_norm_var": 0.2535807291666667, "learning_rate": 0.0003, "loss": 11.422, "loss/aux_loss": 0.04807611275464296, "loss/crossentropy": 2.61910994052887, "loss/logits": 0.8257275193929672, "step": 32810 }, { "epoch": 0.3282, "grad_norm": 13.0625, "grad_norm_var": 0.219384765625, "learning_rate": 0.0003, "loss": 11.3728, "loss/aux_loss": 0.04807970225811005, "loss/crossentropy": 2.5538667261600496, "loss/logits": 0.8033603578805923, "step": 32820 }, { "epoch": 0.3283, "grad_norm": 13.3125, "grad_norm_var": 0.544384765625, "learning_rate": 0.0003, "loss": 11.449, "loss/aux_loss": 0.048089250549674036, "loss/crossentropy": 2.8291757106781006, "loss/logits": 0.86662557721138, "step": 32830 }, { "epoch": 0.3284, "grad_norm": 13.3125, "grad_norm_var": 0.4356608072916667, "learning_rate": 0.0003, "loss": 11.3986, "loss/aux_loss": 0.048079658299684525, "loss/crossentropy": 2.6797396779060363, "loss/logits": 0.8524430304765701, "step": 32840 }, { "epoch": 0.3285, "grad_norm": 13.1875, "grad_norm_var": 0.3113932291666667, "learning_rate": 0.0003, "loss": 11.5521, "loss/aux_loss": 0.048092910647392274, "loss/crossentropy": 2.8970932602882384, "loss/logits": 0.8721674889326095, "step": 32850 }, { "epoch": 0.3286, "grad_norm": 13.3125, "grad_norm_var": 0.2041015625, "learning_rate": 0.0003, "loss": 11.3052, "loss/aux_loss": 0.04807873163372278, "loss/crossentropy": 2.6216281414031983, "loss/logits": 0.8397267490625382, "step": 32860 }, { "epoch": 0.3287, "grad_norm": 12.5, "grad_norm_var": 0.36404622395833336, "learning_rate": 0.0003, "loss": 11.4758, "loss/aux_loss": 0.048077932186424734, "loss/crossentropy": 2.8529594242572784, "loss/logits": 0.9037439674139023, "step": 32870 }, { "epoch": 0.3288, "grad_norm": 12.5, "grad_norm_var": 0.39088541666666665, "learning_rate": 0.0003, "loss": 11.2377, "loss/aux_loss": 0.04808652587234974, "loss/crossentropy": 2.665499210357666, "loss/logits": 0.8454530268907547, "step": 32880 }, { "epoch": 0.3289, "grad_norm": 12.9375, "grad_norm_var": 0.3072265625, "learning_rate": 0.0003, "loss": 11.3039, "loss/aux_loss": 0.04807148966938257, "loss/crossentropy": 2.602234035730362, "loss/logits": 0.8243951052427292, "step": 32890 }, { "epoch": 0.329, "grad_norm": 12.9375, "grad_norm_var": 0.2581868489583333, "learning_rate": 0.0003, "loss": 11.2624, "loss/aux_loss": 0.04808493070304394, "loss/crossentropy": 2.7614921808242796, "loss/logits": 0.8612099617719651, "step": 32900 }, { "epoch": 0.3291, "grad_norm": 14.5, "grad_norm_var": 0.5416666666666666, "learning_rate": 0.0003, "loss": 11.3729, "loss/aux_loss": 0.04808545112609863, "loss/crossentropy": 2.7853208422660827, "loss/logits": 0.8972157269716263, "step": 32910 }, { "epoch": 0.3292, "grad_norm": 14.0625, "grad_norm_var": 0.808837890625, "learning_rate": 0.0003, "loss": 11.4716, "loss/aux_loss": 0.04807188231498003, "loss/crossentropy": 2.709330898523331, "loss/logits": 0.8775971084833145, "step": 32920 }, { "epoch": 0.3293, "grad_norm": 12.5, "grad_norm_var": 0.28411458333333334, "learning_rate": 0.0003, "loss": 11.3775, "loss/aux_loss": 0.04807582814246416, "loss/crossentropy": 2.63877277970314, "loss/logits": 0.8797126650810242, "step": 32930 }, { "epoch": 0.3294, "grad_norm": 12.9375, "grad_norm_var": 0.2796875, "learning_rate": 0.0003, "loss": 11.3529, "loss/aux_loss": 0.04808872751891613, "loss/crossentropy": 2.681774044036865, "loss/logits": 0.862132015824318, "step": 32940 }, { "epoch": 0.3295, "grad_norm": 13.625, "grad_norm_var": 136.1453125, "learning_rate": 0.0003, "loss": 11.4708, "loss/aux_loss": 0.04807477127760649, "loss/crossentropy": 2.7079889833927155, "loss/logits": 0.8643805146217346, "step": 32950 }, { "epoch": 0.3296, "grad_norm": 14.5, "grad_norm_var": 133.14055989583332, "learning_rate": 0.0003, "loss": 11.4029, "loss/aux_loss": 0.04808342196047306, "loss/crossentropy": 2.7009809732437136, "loss/logits": 0.8897195219993591, "step": 32960 }, { "epoch": 0.3297, "grad_norm": 12.9375, "grad_norm_var": 0.55703125, "learning_rate": 0.0003, "loss": 11.4977, "loss/aux_loss": 0.04807784240692854, "loss/crossentropy": 2.888811504840851, "loss/logits": 0.9022614181041717, "step": 32970 }, { "epoch": 0.3298, "grad_norm": 13.25, "grad_norm_var": 0.7114420572916667, "learning_rate": 0.0003, "loss": 11.4451, "loss/aux_loss": 0.04807289652526379, "loss/crossentropy": 2.7488197565078734, "loss/logits": 0.8802861243486404, "step": 32980 }, { "epoch": 0.3299, "grad_norm": 14.875, "grad_norm_var": 1.1337890625, "learning_rate": 0.0003, "loss": 11.5285, "loss/aux_loss": 0.048081624880433084, "loss/crossentropy": 2.7710135102272035, "loss/logits": 0.8831702828407287, "step": 32990 }, { "epoch": 0.33, "grad_norm": 14.0, "grad_norm_var": 0.39334309895833336, "learning_rate": 0.0003, "loss": 11.5182, "loss/aux_loss": 0.048084603250026704, "loss/crossentropy": 2.920530825853348, "loss/logits": 0.8848973125219345, "step": 33000 }, { "epoch": 0.3301, "grad_norm": 14.125, "grad_norm_var": 0.30911458333333336, "learning_rate": 0.0003, "loss": 11.2578, "loss/aux_loss": 0.04807808380573988, "loss/crossentropy": 2.7349561214447022, "loss/logits": 0.8306137710809708, "step": 33010 }, { "epoch": 0.3302, "grad_norm": 12.8125, "grad_norm_var": 0.21013997395833334, "learning_rate": 0.0003, "loss": 11.3987, "loss/aux_loss": 0.04808585941791534, "loss/crossentropy": 2.7925686955451967, "loss/logits": 0.8563876241445542, "step": 33020 }, { "epoch": 0.3303, "grad_norm": 12.8125, "grad_norm_var": 0.4200520833333333, "learning_rate": 0.0003, "loss": 11.4525, "loss/aux_loss": 0.048077084310352805, "loss/crossentropy": 2.829834222793579, "loss/logits": 0.8569678455591202, "step": 33030 }, { "epoch": 0.3304, "grad_norm": 13.0625, "grad_norm_var": 2.3742024739583334, "learning_rate": 0.0003, "loss": 11.4428, "loss/aux_loss": 0.04808685947209597, "loss/crossentropy": 2.6605750918388367, "loss/logits": 0.8512579023838043, "step": 33040 }, { "epoch": 0.3305, "grad_norm": 13.25, "grad_norm_var": 2.8878743489583334, "learning_rate": 0.0003, "loss": 11.4905, "loss/aux_loss": 0.04808191433548927, "loss/crossentropy": 2.821686065196991, "loss/logits": 0.8521415889263153, "step": 33050 }, { "epoch": 0.3306, "grad_norm": 14.0625, "grad_norm_var": 1.6442545572916667, "learning_rate": 0.0003, "loss": 11.448, "loss/aux_loss": 0.048077190294861794, "loss/crossentropy": 2.784515953063965, "loss/logits": 0.8502372175455093, "step": 33060 }, { "epoch": 0.3307, "grad_norm": 14.625, "grad_norm_var": 1.2645182291666666, "learning_rate": 0.0003, "loss": 11.298, "loss/aux_loss": 0.04808208737522364, "loss/crossentropy": 2.7311050057411195, "loss/logits": 0.8578761130571365, "step": 33070 }, { "epoch": 0.3308, "grad_norm": 13.5625, "grad_norm_var": 0.60703125, "learning_rate": 0.0003, "loss": 11.5828, "loss/aux_loss": 0.0480745630338788, "loss/crossentropy": 2.7229528069496154, "loss/logits": 0.8834622859954834, "step": 33080 }, { "epoch": 0.3309, "grad_norm": 12.3125, "grad_norm_var": 0.2822265625, "learning_rate": 0.0003, "loss": 11.3736, "loss/aux_loss": 0.048079187795519826, "loss/crossentropy": 2.596200668811798, "loss/logits": 0.8260492444038391, "step": 33090 }, { "epoch": 0.331, "grad_norm": 13.25, "grad_norm_var": 0.3492024739583333, "learning_rate": 0.0003, "loss": 11.4958, "loss/aux_loss": 0.04808150306344032, "loss/crossentropy": 2.6033548295497893, "loss/logits": 0.8426315069198609, "step": 33100 }, { "epoch": 0.3311, "grad_norm": 13.3125, "grad_norm_var": 0.43019205729166665, "learning_rate": 0.0003, "loss": 11.1088, "loss/aux_loss": 0.048071819357573986, "loss/crossentropy": 2.667391049861908, "loss/logits": 0.8707249313592911, "step": 33110 }, { "epoch": 0.3312, "grad_norm": 13.625, "grad_norm_var": 0.5379557291666667, "learning_rate": 0.0003, "loss": 11.4546, "loss/aux_loss": 0.04808641467243433, "loss/crossentropy": 2.759235656261444, "loss/logits": 0.8834040969610214, "step": 33120 }, { "epoch": 0.3313, "grad_norm": 13.8125, "grad_norm_var": 0.4339680989583333, "learning_rate": 0.0003, "loss": 11.4916, "loss/aux_loss": 0.04807813689112663, "loss/crossentropy": 2.7543640404939653, "loss/logits": 0.8365322396159172, "step": 33130 }, { "epoch": 0.3314, "grad_norm": 14.125, "grad_norm_var": 15.084488932291666, "learning_rate": 0.0003, "loss": 11.4041, "loss/aux_loss": 0.048085601069033146, "loss/crossentropy": 2.870689940452576, "loss/logits": 0.850860208272934, "step": 33140 }, { "epoch": 0.3315, "grad_norm": 13.5, "grad_norm_var": 15.106705729166666, "learning_rate": 0.0003, "loss": 11.2983, "loss/aux_loss": 0.04808761551976204, "loss/crossentropy": 2.716045266389847, "loss/logits": 0.8434042870998383, "step": 33150 }, { "epoch": 0.3316, "grad_norm": 14.125, "grad_norm_var": 0.5707682291666667, "learning_rate": 0.0003, "loss": 11.5516, "loss/aux_loss": 0.04807737655937672, "loss/crossentropy": 2.8646560847759246, "loss/logits": 0.8435910433530808, "step": 33160 }, { "epoch": 0.3317, "grad_norm": 13.4375, "grad_norm_var": 1.0468098958333334, "learning_rate": 0.0003, "loss": 11.2852, "loss/aux_loss": 0.04808404687792063, "loss/crossentropy": 2.6018213868141173, "loss/logits": 0.8262585073709487, "step": 33170 }, { "epoch": 0.3318, "grad_norm": 12.75, "grad_norm_var": 0.6083333333333333, "learning_rate": 0.0003, "loss": 11.3755, "loss/aux_loss": 0.04808040447533131, "loss/crossentropy": 2.752035117149353, "loss/logits": 0.8619087725877762, "step": 33180 }, { "epoch": 0.3319, "grad_norm": 13.1875, "grad_norm_var": 0.5118326822916667, "learning_rate": 0.0003, "loss": 11.3865, "loss/aux_loss": 0.048074712976813316, "loss/crossentropy": 2.859244775772095, "loss/logits": 0.8745595574378967, "step": 33190 }, { "epoch": 0.332, "grad_norm": 14.125, "grad_norm_var": 0.625244140625, "learning_rate": 0.0003, "loss": 11.3773, "loss/aux_loss": 0.04807355534285307, "loss/crossentropy": 2.8394315361976625, "loss/logits": 0.8535150647163391, "step": 33200 }, { "epoch": 0.3321, "grad_norm": 14.5, "grad_norm_var": 0.6541015625, "learning_rate": 0.0003, "loss": 11.3814, "loss/aux_loss": 0.04808756597340107, "loss/crossentropy": 2.8331064164638518, "loss/logits": 0.8748747378587722, "step": 33210 }, { "epoch": 0.3322, "grad_norm": 14.8125, "grad_norm_var": 1.4541015625, "learning_rate": 0.0003, "loss": 11.6009, "loss/aux_loss": 0.048076229728758337, "loss/crossentropy": 2.8229152381420137, "loss/logits": 0.8776537507772446, "step": 33220 }, { "epoch": 0.3323, "grad_norm": 13.1875, "grad_norm_var": 0.5733723958333333, "learning_rate": 0.0003, "loss": 11.453, "loss/aux_loss": 0.04807769935578108, "loss/crossentropy": 2.744255816936493, "loss/logits": 0.8375910878181457, "step": 33230 }, { "epoch": 0.3324, "grad_norm": 15.25, "grad_norm_var": 1.1174479166666667, "learning_rate": 0.0003, "loss": 11.5114, "loss/aux_loss": 0.048086580075323584, "loss/crossentropy": 2.7748912930488587, "loss/logits": 0.8750550776720047, "step": 33240 }, { "epoch": 0.3325, "grad_norm": 12.75, "grad_norm_var": 1.4270182291666667, "learning_rate": 0.0003, "loss": 11.4482, "loss/aux_loss": 0.04807029981166124, "loss/crossentropy": 2.785758376121521, "loss/logits": 0.8546015530824661, "step": 33250 }, { "epoch": 0.3326, "grad_norm": 13.0625, "grad_norm_var": 0.21302083333333333, "learning_rate": 0.0003, "loss": 11.4774, "loss/aux_loss": 0.048079310730099675, "loss/crossentropy": 2.7421350955963133, "loss/logits": 0.858822014927864, "step": 33260 }, { "epoch": 0.3327, "grad_norm": 13.625, "grad_norm_var": 0.18229166666666666, "learning_rate": 0.0003, "loss": 11.3066, "loss/aux_loss": 0.04808879122138023, "loss/crossentropy": 2.7908874809741975, "loss/logits": 0.877889646589756, "step": 33270 }, { "epoch": 0.3328, "grad_norm": 13.625, "grad_norm_var": 0.5369791666666667, "learning_rate": 0.0003, "loss": 11.4317, "loss/aux_loss": 0.048072985000908376, "loss/crossentropy": 2.6791608691215516, "loss/logits": 0.8541211634874344, "step": 33280 }, { "epoch": 0.3329, "grad_norm": 56.0, "grad_norm_var": 114.448681640625, "learning_rate": 0.0003, "loss": 11.4201, "loss/aux_loss": 0.0480899965390563, "loss/crossentropy": 2.7793687105178835, "loss/logits": 0.8533193141222, "step": 33290 }, { "epoch": 0.333, "grad_norm": 13.375, "grad_norm_var": 113.77381184895833, "learning_rate": 0.0003, "loss": 11.5771, "loss/aux_loss": 0.04808140806853771, "loss/crossentropy": 2.759211188554764, "loss/logits": 0.847288829088211, "step": 33300 }, { "epoch": 0.3331, "grad_norm": 13.4375, "grad_norm_var": 0.36139322916666666, "learning_rate": 0.0003, "loss": 11.5233, "loss/aux_loss": 0.04808169547468424, "loss/crossentropy": 2.6220255315303804, "loss/logits": 0.8704487830400467, "step": 33310 }, { "epoch": 0.3332, "grad_norm": 13.4375, "grad_norm_var": 0.392431640625, "learning_rate": 0.0003, "loss": 11.3581, "loss/aux_loss": 0.0480834748595953, "loss/crossentropy": 2.7258807718753815, "loss/logits": 0.8605304449796677, "step": 33320 }, { "epoch": 0.3333, "grad_norm": 12.5625, "grad_norm_var": 0.46990559895833334, "learning_rate": 0.0003, "loss": 11.3325, "loss/aux_loss": 0.048091739602386954, "loss/crossentropy": 2.7206650257110594, "loss/logits": 0.8408534616231919, "step": 33330 }, { "epoch": 0.3334, "grad_norm": 13.5, "grad_norm_var": 0.253759765625, "learning_rate": 0.0003, "loss": 11.4986, "loss/aux_loss": 0.04806930366903543, "loss/crossentropy": 2.7812957525253297, "loss/logits": 0.855445483326912, "step": 33340 }, { "epoch": 0.3335, "grad_norm": 13.625, "grad_norm_var": 0.506494140625, "learning_rate": 0.0003, "loss": 11.4043, "loss/aux_loss": 0.04807830974459648, "loss/crossentropy": 2.7645578622817992, "loss/logits": 0.8752608805894851, "step": 33350 }, { "epoch": 0.3336, "grad_norm": 13.8125, "grad_norm_var": 0.8958333333333334, "learning_rate": 0.0003, "loss": 11.386, "loss/aux_loss": 0.04809688944369554, "loss/crossentropy": 2.757777750492096, "loss/logits": 0.8512715846300125, "step": 33360 }, { "epoch": 0.3337, "grad_norm": 15.5, "grad_norm_var": 0.689306640625, "learning_rate": 0.0003, "loss": 11.4614, "loss/aux_loss": 0.048079153336584565, "loss/crossentropy": 2.7252914190292357, "loss/logits": 0.8573682248592377, "step": 33370 }, { "epoch": 0.3338, "grad_norm": 13.625, "grad_norm_var": 1085.8733723958333, "learning_rate": 0.0003, "loss": 11.4558, "loss/aux_loss": 0.04808936920017004, "loss/crossentropy": 2.648952716588974, "loss/logits": 0.8360880434513092, "step": 33380 }, { "epoch": 0.3339, "grad_norm": 14.0, "grad_norm_var": 1096.3770182291667, "learning_rate": 0.0003, "loss": 11.2424, "loss/aux_loss": 0.04808183200657368, "loss/crossentropy": 2.7101231694221495, "loss/logits": 0.8388356804847718, "step": 33390 }, { "epoch": 0.334, "grad_norm": 14.0, "grad_norm_var": 46.952457682291666, "learning_rate": 0.0003, "loss": 11.4011, "loss/aux_loss": 0.048093979991972444, "loss/crossentropy": 2.7197480618953707, "loss/logits": 0.8777379095554352, "step": 33400 }, { "epoch": 0.3341, "grad_norm": 12.75, "grad_norm_var": 0.5402180989583333, "learning_rate": 0.0003, "loss": 11.3751, "loss/aux_loss": 0.04807923678308725, "loss/crossentropy": 2.874398422241211, "loss/logits": 0.908473339676857, "step": 33410 }, { "epoch": 0.3342, "grad_norm": 14.75, "grad_norm_var": 0.653759765625, "learning_rate": 0.0003, "loss": 11.3637, "loss/aux_loss": 0.04807525873184204, "loss/crossentropy": 2.8315866231918334, "loss/logits": 0.8731096774339676, "step": 33420 }, { "epoch": 0.3343, "grad_norm": 13.0, "grad_norm_var": 0.6478515625, "learning_rate": 0.0003, "loss": 11.4268, "loss/aux_loss": 0.04808218479156494, "loss/crossentropy": 2.957446539402008, "loss/logits": 0.869893753528595, "step": 33430 }, { "epoch": 0.3344, "grad_norm": 12.5, "grad_norm_var": 0.5835774739583334, "learning_rate": 0.0003, "loss": 11.4248, "loss/aux_loss": 0.04807921797037125, "loss/crossentropy": 2.8220800697803496, "loss/logits": 0.8761366009712219, "step": 33440 }, { "epoch": 0.3345, "grad_norm": 15.75, "grad_norm_var": 0.9004557291666667, "learning_rate": 0.0003, "loss": 11.4527, "loss/aux_loss": 0.04809036403894425, "loss/crossentropy": 2.7487100541591643, "loss/logits": 0.863110476732254, "step": 33450 }, { "epoch": 0.3346, "grad_norm": 14.9375, "grad_norm_var": 0.5707682291666667, "learning_rate": 0.0003, "loss": 11.388, "loss/aux_loss": 0.04808029551059008, "loss/crossentropy": 2.7007455945014955, "loss/logits": 0.8433271735906601, "step": 33460 }, { "epoch": 0.3347, "grad_norm": 15.0625, "grad_norm_var": 0.30911458333333336, "learning_rate": 0.0003, "loss": 11.2174, "loss/aux_loss": 0.04807520732283592, "loss/crossentropy": 2.7608300507068635, "loss/logits": 0.8681216955184936, "step": 33470 }, { "epoch": 0.3348, "grad_norm": 51.25, "grad_norm_var": 86.63619791666666, "learning_rate": 0.0003, "loss": 11.4039, "loss/aux_loss": 0.0480832202360034, "loss/crossentropy": 2.7710861444473265, "loss/logits": 0.8367562472820282, "step": 33480 }, { "epoch": 0.3349, "grad_norm": 12.25, "grad_norm_var": 88.547119140625, "learning_rate": 0.0003, "loss": 11.3695, "loss/aux_loss": 0.04809240307658911, "loss/crossentropy": 2.803847813606262, "loss/logits": 0.862313050031662, "step": 33490 }, { "epoch": 0.335, "grad_norm": 12.875, "grad_norm_var": 0.628369140625, "learning_rate": 0.0003, "loss": 11.3663, "loss/aux_loss": 0.04807665664702654, "loss/crossentropy": 2.6767329633235932, "loss/logits": 0.8725397795438766, "step": 33500 }, { "epoch": 0.3351, "grad_norm": 13.5625, "grad_norm_var": 0.30130208333333336, "learning_rate": 0.0003, "loss": 11.5062, "loss/aux_loss": 0.04808883797377348, "loss/crossentropy": 2.7930223047733307, "loss/logits": 0.9104775667190552, "step": 33510 }, { "epoch": 0.3352, "grad_norm": 13.5, "grad_norm_var": 0.264697265625, "learning_rate": 0.0003, "loss": 11.3642, "loss/aux_loss": 0.04807222187519074, "loss/crossentropy": 2.809762644767761, "loss/logits": 0.858039128780365, "step": 33520 }, { "epoch": 0.3353, "grad_norm": 13.3125, "grad_norm_var": 0.15755208333333334, "learning_rate": 0.0003, "loss": 11.3144, "loss/aux_loss": 0.04808267746120691, "loss/crossentropy": 2.903217875957489, "loss/logits": 0.8900675117969513, "step": 33530 }, { "epoch": 0.3354, "grad_norm": 13.375, "grad_norm_var": 0.2572265625, "learning_rate": 0.0003, "loss": 11.4675, "loss/aux_loss": 0.048075296357274054, "loss/crossentropy": 2.8689566016197205, "loss/logits": 0.8549737244844436, "step": 33540 }, { "epoch": 0.3355, "grad_norm": 13.1875, "grad_norm_var": 0.5383951822916667, "learning_rate": 0.0003, "loss": 11.259, "loss/aux_loss": 0.04808174092322588, "loss/crossentropy": 2.632620471715927, "loss/logits": 0.8442192494869232, "step": 33550 }, { "epoch": 0.3356, "grad_norm": 13.5, "grad_norm_var": 0.679150390625, "learning_rate": 0.0003, "loss": 11.506, "loss/aux_loss": 0.048078482039272785, "loss/crossentropy": 2.8332688093185423, "loss/logits": 0.8708831310272217, "step": 33560 }, { "epoch": 0.3357, "grad_norm": 13.75, "grad_norm_var": 0.7718098958333334, "learning_rate": 0.0003, "loss": 11.5518, "loss/aux_loss": 0.048078897222876546, "loss/crossentropy": 2.7509835004806518, "loss/logits": 0.8593619883060455, "step": 33570 }, { "epoch": 0.3358, "grad_norm": 14.0, "grad_norm_var": 1.4572265625, "learning_rate": 0.0003, "loss": 11.3659, "loss/aux_loss": 0.04807315096259117, "loss/crossentropy": 2.7531749844551086, "loss/logits": 0.8511229604482651, "step": 33580 }, { "epoch": 0.3359, "grad_norm": 12.875, "grad_norm_var": 0.23118489583333332, "learning_rate": 0.0003, "loss": 11.5134, "loss/aux_loss": 0.04808947648853064, "loss/crossentropy": 2.7343482613563537, "loss/logits": 0.8925404042005539, "step": 33590 }, { "epoch": 0.336, "grad_norm": 12.75, "grad_norm_var": 0.47708333333333336, "learning_rate": 0.0003, "loss": 11.3391, "loss/aux_loss": 0.04807898830622435, "loss/crossentropy": 2.814681512117386, "loss/logits": 0.8659243017435074, "step": 33600 }, { "epoch": 0.3361, "grad_norm": 15.0, "grad_norm_var": 0.6880208333333333, "learning_rate": 0.0003, "loss": 11.575, "loss/aux_loss": 0.04807465691119432, "loss/crossentropy": 2.740473783016205, "loss/logits": 0.8824987977743148, "step": 33610 }, { "epoch": 0.3362, "grad_norm": 14.5625, "grad_norm_var": 0.4078125, "learning_rate": 0.0003, "loss": 11.3833, "loss/aux_loss": 0.04808358158916235, "loss/crossentropy": 2.7260211586952208, "loss/logits": 0.8684123188257218, "step": 33620 }, { "epoch": 0.3363, "grad_norm": 14.125, "grad_norm_var": 0.2, "learning_rate": 0.0003, "loss": 11.3365, "loss/aux_loss": 0.048076849430799484, "loss/crossentropy": 2.7123505532741548, "loss/logits": 0.8516561061143875, "step": 33630 }, { "epoch": 0.3364, "grad_norm": 13.9375, "grad_norm_var": 0.17185872395833332, "learning_rate": 0.0003, "loss": 11.2675, "loss/aux_loss": 0.048073905520141125, "loss/crossentropy": 2.894002687931061, "loss/logits": 0.8935140758752823, "step": 33640 }, { "epoch": 0.3365, "grad_norm": 14.0, "grad_norm_var": 0.661962890625, "learning_rate": 0.0003, "loss": 11.4262, "loss/aux_loss": 0.04808433558791876, "loss/crossentropy": 2.725138372182846, "loss/logits": 0.8834821820259094, "step": 33650 }, { "epoch": 0.3366, "grad_norm": 13.75, "grad_norm_var": 0.515625, "learning_rate": 0.0003, "loss": 11.3247, "loss/aux_loss": 0.048078181222081184, "loss/crossentropy": 2.6882384717464447, "loss/logits": 0.8374345928430558, "step": 33660 }, { "epoch": 0.3367, "grad_norm": 12.6875, "grad_norm_var": 0.9149576822916666, "learning_rate": 0.0003, "loss": 11.3953, "loss/aux_loss": 0.048090609908103946, "loss/crossentropy": 2.7942125260829926, "loss/logits": 0.8645864456892014, "step": 33670 }, { "epoch": 0.3368, "grad_norm": 13.6875, "grad_norm_var": 0.50625, "learning_rate": 0.0003, "loss": 11.61, "loss/aux_loss": 0.04808273129165173, "loss/crossentropy": 2.7299344420433043, "loss/logits": 0.8819968163967132, "step": 33680 }, { "epoch": 0.3369, "grad_norm": 14.0625, "grad_norm_var": 4.205843098958334, "learning_rate": 0.0003, "loss": 11.5101, "loss/aux_loss": 0.048078842274844644, "loss/crossentropy": 2.6255062937736513, "loss/logits": 0.8747300773859024, "step": 33690 }, { "epoch": 0.337, "grad_norm": 13.1875, "grad_norm_var": 0.42916666666666664, "learning_rate": 0.0003, "loss": 11.4736, "loss/aux_loss": 0.04807350095361471, "loss/crossentropy": 2.7771036982536317, "loss/logits": 0.8716427236795425, "step": 33700 }, { "epoch": 0.3371, "grad_norm": 13.8125, "grad_norm_var": 0.4578125, "learning_rate": 0.0003, "loss": 11.5287, "loss/aux_loss": 0.04808547291904688, "loss/crossentropy": 2.8936782777309418, "loss/logits": 0.8637538403272629, "step": 33710 }, { "epoch": 0.3372, "grad_norm": 14.9375, "grad_norm_var": 0.39817708333333335, "learning_rate": 0.0003, "loss": 11.4074, "loss/aux_loss": 0.04807719625532627, "loss/crossentropy": 2.781727874279022, "loss/logits": 0.8768081456422806, "step": 33720 }, { "epoch": 0.3373, "grad_norm": 14.0625, "grad_norm_var": 6.6869140625, "learning_rate": 0.0003, "loss": 11.4137, "loss/aux_loss": 0.04808393493294716, "loss/crossentropy": 2.816925013065338, "loss/logits": 0.857237920165062, "step": 33730 }, { "epoch": 0.3374, "grad_norm": 12.625, "grad_norm_var": 0.618603515625, "learning_rate": 0.0003, "loss": 11.3942, "loss/aux_loss": 0.04807567745447159, "loss/crossentropy": 2.733104008436203, "loss/logits": 0.8575152397155762, "step": 33740 }, { "epoch": 0.3375, "grad_norm": 13.4375, "grad_norm_var": 0.35130208333333335, "learning_rate": 0.0003, "loss": 11.4799, "loss/aux_loss": 0.04807877670973539, "loss/crossentropy": 2.8148476839065553, "loss/logits": 0.8529479697346687, "step": 33750 }, { "epoch": 0.3376, "grad_norm": 14.5625, "grad_norm_var": 0.7265625, "learning_rate": 0.0003, "loss": 11.327, "loss/aux_loss": 0.0480769969522953, "loss/crossentropy": 2.7635378301143647, "loss/logits": 0.8825518250465393, "step": 33760 }, { "epoch": 0.3377, "grad_norm": 13.125, "grad_norm_var": 0.34479166666666666, "learning_rate": 0.0003, "loss": 11.2382, "loss/aux_loss": 0.04808934032917023, "loss/crossentropy": 2.5587519288063048, "loss/logits": 0.803919005393982, "step": 33770 }, { "epoch": 0.3378, "grad_norm": 14.9375, "grad_norm_var": 3.247900390625, "learning_rate": 0.0003, "loss": 11.5012, "loss/aux_loss": 0.04807521179318428, "loss/crossentropy": 2.668776106834412, "loss/logits": 0.8375656992197037, "step": 33780 }, { "epoch": 0.3379, "grad_norm": 13.75, "grad_norm_var": 3.1749837239583334, "learning_rate": 0.0003, "loss": 11.2613, "loss/aux_loss": 0.04808988273143768, "loss/crossentropy": 2.650000900030136, "loss/logits": 0.8299892216920852, "step": 33790 }, { "epoch": 0.338, "grad_norm": 14.75, "grad_norm_var": 7.923030598958333, "learning_rate": 0.0003, "loss": 11.405, "loss/aux_loss": 0.04808232057839632, "loss/crossentropy": 2.8221355438232423, "loss/logits": 0.8894154459238053, "step": 33800 }, { "epoch": 0.3381, "grad_norm": 12.625, "grad_norm_var": 0.47980143229166666, "learning_rate": 0.0003, "loss": 11.503, "loss/aux_loss": 0.04806968811899424, "loss/crossentropy": 2.8208815813064576, "loss/logits": 0.896739274263382, "step": 33810 }, { "epoch": 0.3382, "grad_norm": 13.375, "grad_norm_var": 0.46087239583333334, "learning_rate": 0.0003, "loss": 11.3597, "loss/aux_loss": 0.048082143254578116, "loss/crossentropy": 2.549110287427902, "loss/logits": 0.8261379420757293, "step": 33820 }, { "epoch": 0.3383, "grad_norm": 14.3125, "grad_norm_var": 0.6005208333333333, "learning_rate": 0.0003, "loss": 11.6718, "loss/aux_loss": 0.048074525967240336, "loss/crossentropy": 2.8008286237716673, "loss/logits": 0.9250722289085388, "step": 33830 }, { "epoch": 0.3384, "grad_norm": 13.6875, "grad_norm_var": 0.25128580729166666, "learning_rate": 0.0003, "loss": 11.343, "loss/aux_loss": 0.04808689635246992, "loss/crossentropy": 2.663837468624115, "loss/logits": 0.8424749076366425, "step": 33840 }, { "epoch": 0.3385, "grad_norm": 13.75, "grad_norm_var": 1.8880208333333333, "learning_rate": 0.0003, "loss": 11.5091, "loss/aux_loss": 0.04809404145926237, "loss/crossentropy": 2.8522875905036926, "loss/logits": 0.8729503244161606, "step": 33850 }, { "epoch": 0.3386, "grad_norm": 16.625, "grad_norm_var": 0.809228515625, "learning_rate": 0.0003, "loss": 11.449, "loss/aux_loss": 0.04807314351201057, "loss/crossentropy": 2.9294650077819826, "loss/logits": 0.8873317569494248, "step": 33860 }, { "epoch": 0.3387, "grad_norm": 15.0, "grad_norm_var": 1.1075358072916666, "learning_rate": 0.0003, "loss": 11.4419, "loss/aux_loss": 0.048078053072094916, "loss/crossentropy": 2.893061339855194, "loss/logits": 0.9111079752445221, "step": 33870 }, { "epoch": 0.3388, "grad_norm": 14.125, "grad_norm_var": 2.179541015625, "learning_rate": 0.0003, "loss": 11.3273, "loss/aux_loss": 0.04808342736214399, "loss/crossentropy": 2.602770173549652, "loss/logits": 0.81967893242836, "step": 33880 }, { "epoch": 0.3389, "grad_norm": 13.3125, "grad_norm_var": 0.9304524739583333, "learning_rate": 0.0003, "loss": 11.321, "loss/aux_loss": 0.048087199591100215, "loss/crossentropy": 2.5976479768753054, "loss/logits": 0.8418799489736557, "step": 33890 }, { "epoch": 0.339, "grad_norm": 13.5625, "grad_norm_var": 0.8169108072916667, "learning_rate": 0.0003, "loss": 11.3486, "loss/aux_loss": 0.04807840995490551, "loss/crossentropy": 2.9478099584579467, "loss/logits": 0.8819531232118607, "step": 33900 }, { "epoch": 0.3391, "grad_norm": 13.1875, "grad_norm_var": 0.41326497395833334, "learning_rate": 0.0003, "loss": 11.2091, "loss/aux_loss": 0.04807400442659855, "loss/crossentropy": 2.6751762211322783, "loss/logits": 0.8382753849029541, "step": 33910 }, { "epoch": 0.3392, "grad_norm": 12.5, "grad_norm_var": 0.4864420572916667, "learning_rate": 0.0003, "loss": 11.3863, "loss/aux_loss": 0.048082627542316916, "loss/crossentropy": 2.67808051109314, "loss/logits": 0.8579610645771026, "step": 33920 }, { "epoch": 0.3393, "grad_norm": 13.25, "grad_norm_var": 1.636572265625, "learning_rate": 0.0003, "loss": 11.5376, "loss/aux_loss": 0.04807865135371685, "loss/crossentropy": 2.8202176868915556, "loss/logits": 0.8423859208822251, "step": 33930 }, { "epoch": 0.3394, "grad_norm": 13.5625, "grad_norm_var": 0.4158854166666667, "learning_rate": 0.0003, "loss": 11.3614, "loss/aux_loss": 0.04808816146105528, "loss/crossentropy": 2.5449154317378997, "loss/logits": 0.8428879886865616, "step": 33940 }, { "epoch": 0.3395, "grad_norm": 13.25, "grad_norm_var": 0.5692057291666667, "learning_rate": 0.0003, "loss": 11.22, "loss/aux_loss": 0.04808717239648104, "loss/crossentropy": 2.7124799370765684, "loss/logits": 0.8479482620954514, "step": 33950 }, { "epoch": 0.3396, "grad_norm": 12.9375, "grad_norm_var": 0.35833333333333334, "learning_rate": 0.0003, "loss": 11.4288, "loss/aux_loss": 0.04808026142418385, "loss/crossentropy": 2.769565761089325, "loss/logits": 0.8855602651834488, "step": 33960 }, { "epoch": 0.3397, "grad_norm": 12.875, "grad_norm_var": 0.22849934895833332, "learning_rate": 0.0003, "loss": 11.3179, "loss/aux_loss": 0.04807202909141779, "loss/crossentropy": 2.7668771505355836, "loss/logits": 0.8770667523145675, "step": 33970 }, { "epoch": 0.3398, "grad_norm": 12.625, "grad_norm_var": 0.37303059895833335, "learning_rate": 0.0003, "loss": 11.3418, "loss/aux_loss": 0.04808063618838787, "loss/crossentropy": 2.774995541572571, "loss/logits": 0.8627175658941268, "step": 33980 }, { "epoch": 0.3399, "grad_norm": 12.625, "grad_norm_var": 0.19869791666666667, "learning_rate": 0.0003, "loss": 11.2925, "loss/aux_loss": 0.04808147568255663, "loss/crossentropy": 2.731263720989227, "loss/logits": 0.8523558408021927, "step": 33990 }, { "epoch": 0.34, "grad_norm": 12.6875, "grad_norm_var": 0.6572916666666667, "learning_rate": 0.0003, "loss": 11.4506, "loss/aux_loss": 0.048083949461579324, "loss/crossentropy": 2.6820335149765016, "loss/logits": 0.8677924752235413, "step": 34000 }, { "epoch": 0.3401, "grad_norm": 14.0, "grad_norm_var": 0.484619140625, "learning_rate": 0.0003, "loss": 11.49, "loss/aux_loss": 0.04807724487036467, "loss/crossentropy": 2.8576271653175356, "loss/logits": 0.886279183626175, "step": 34010 }, { "epoch": 0.3402, "grad_norm": 13.125, "grad_norm_var": 0.33203125, "learning_rate": 0.0003, "loss": 11.2666, "loss/aux_loss": 0.048078724555671214, "loss/crossentropy": 2.695602595806122, "loss/logits": 0.8482803136110306, "step": 34020 }, { "epoch": 0.3403, "grad_norm": 13.1875, "grad_norm_var": 0.272900390625, "learning_rate": 0.0003, "loss": 11.3761, "loss/aux_loss": 0.04807147961109877, "loss/crossentropy": 2.681200659275055, "loss/logits": 0.8647918730974198, "step": 34030 }, { "epoch": 0.3404, "grad_norm": 13.1875, "grad_norm_var": 21.075764973958332, "learning_rate": 0.0003, "loss": 11.387, "loss/aux_loss": 0.04808759596198797, "loss/crossentropy": 2.7159022450447083, "loss/logits": 0.8712036728858947, "step": 34040 }, { "epoch": 0.3405, "grad_norm": 13.5625, "grad_norm_var": 21.0384765625, "learning_rate": 0.0003, "loss": 11.5856, "loss/aux_loss": 0.048080033622682095, "loss/crossentropy": 2.847675824165344, "loss/logits": 0.8919312745332718, "step": 34050 }, { "epoch": 0.3406, "grad_norm": 13.75, "grad_norm_var": 0.22180989583333333, "learning_rate": 0.0003, "loss": 11.4877, "loss/aux_loss": 0.048076984100043775, "loss/crossentropy": 2.7297983527183534, "loss/logits": 0.8955170571804046, "step": 34060 }, { "epoch": 0.3407, "grad_norm": 12.6875, "grad_norm_var": 0.26712239583333336, "learning_rate": 0.0003, "loss": 11.4626, "loss/aux_loss": 0.048071731068193915, "loss/crossentropy": 2.777487003803253, "loss/logits": 0.8421605467796326, "step": 34070 }, { "epoch": 0.3408, "grad_norm": 16.75, "grad_norm_var": 1.576025390625, "learning_rate": 0.0003, "loss": 11.5644, "loss/aux_loss": 0.048079511150717735, "loss/crossentropy": 2.791468983888626, "loss/logits": 0.8447980105876922, "step": 34080 }, { "epoch": 0.3409, "grad_norm": 14.5625, "grad_norm_var": 113.454541015625, "learning_rate": 0.0003, "loss": 11.3856, "loss/aux_loss": 0.04809570461511612, "loss/crossentropy": 2.746097815036774, "loss/logits": 0.8729157716035842, "step": 34090 }, { "epoch": 0.341, "grad_norm": 13.125, "grad_norm_var": 114.9181640625, "learning_rate": 0.0003, "loss": 11.2094, "loss/aux_loss": 0.04807548206299543, "loss/crossentropy": 2.744466412067413, "loss/logits": 0.8620479941368103, "step": 34100 }, { "epoch": 0.3411, "grad_norm": 13.375, "grad_norm_var": 0.9307291666666667, "learning_rate": 0.0003, "loss": 11.2762, "loss/aux_loss": 0.04807662982493639, "loss/crossentropy": 2.7420936226844788, "loss/logits": 0.8486543864011764, "step": 34110 }, { "epoch": 0.3412, "grad_norm": 14.375, "grad_norm_var": 0.5926920572916666, "learning_rate": 0.0003, "loss": 11.3183, "loss/aux_loss": 0.04808496292680502, "loss/crossentropy": 2.7532592713832855, "loss/logits": 0.8521647185087204, "step": 34120 }, { "epoch": 0.3413, "grad_norm": 13.8125, "grad_norm_var": 0.19479166666666667, "learning_rate": 0.0003, "loss": 11.4643, "loss/aux_loss": 0.04806942287832498, "loss/crossentropy": 2.971810203790665, "loss/logits": 0.9140418171882629, "step": 34130 }, { "epoch": 0.3414, "grad_norm": 13.5, "grad_norm_var": 0.6016764322916667, "learning_rate": 0.0003, "loss": 11.4657, "loss/aux_loss": 0.048084068857133386, "loss/crossentropy": 2.714846724271774, "loss/logits": 0.8961813569068908, "step": 34140 }, { "epoch": 0.3415, "grad_norm": 13.125, "grad_norm_var": 0.86015625, "learning_rate": 0.0003, "loss": 11.4352, "loss/aux_loss": 0.048082141764461996, "loss/crossentropy": 2.7342415273189546, "loss/logits": 0.8455929309129715, "step": 34150 }, { "epoch": 0.3416, "grad_norm": 13.5625, "grad_norm_var": 0.6512858072916666, "learning_rate": 0.0003, "loss": 11.2724, "loss/aux_loss": 0.048071014508605, "loss/crossentropy": 2.6547606706619264, "loss/logits": 0.8492685943841934, "step": 34160 }, { "epoch": 0.3417, "grad_norm": 14.75, "grad_norm_var": 1.117431640625, "learning_rate": 0.0003, "loss": 11.502, "loss/aux_loss": 0.04807817898690701, "loss/crossentropy": 2.7856763303279877, "loss/logits": 0.8842839747667313, "step": 34170 }, { "epoch": 0.3418, "grad_norm": 13.375, "grad_norm_var": 0.5221354166666666, "learning_rate": 0.0003, "loss": 11.3981, "loss/aux_loss": 0.04807715006172657, "loss/crossentropy": 2.708691877126694, "loss/logits": 0.8723496258258819, "step": 34180 }, { "epoch": 0.3419, "grad_norm": 13.375, "grad_norm_var": 0.45362955729166665, "learning_rate": 0.0003, "loss": 11.3953, "loss/aux_loss": 0.04807659070938826, "loss/crossentropy": 2.7593763947486876, "loss/logits": 0.8531175792217255, "step": 34190 }, { "epoch": 0.342, "grad_norm": 13.875, "grad_norm_var": 0.21066080729166667, "learning_rate": 0.0003, "loss": 11.2668, "loss/aux_loss": 0.04807346910238266, "loss/crossentropy": 2.825400298833847, "loss/logits": 0.8504022687673569, "step": 34200 }, { "epoch": 0.3421, "grad_norm": 13.6875, "grad_norm_var": 0.6419108072916667, "learning_rate": 0.0003, "loss": 11.3772, "loss/aux_loss": 0.048073016293346885, "loss/crossentropy": 2.614541435241699, "loss/logits": 0.8537357658147812, "step": 34210 }, { "epoch": 0.3422, "grad_norm": 16.75, "grad_norm_var": 1.0940104166666667, "learning_rate": 0.0003, "loss": 11.2334, "loss/aux_loss": 0.04808624424040318, "loss/crossentropy": 2.588237798213959, "loss/logits": 0.8261738806962967, "step": 34220 }, { "epoch": 0.3423, "grad_norm": 13.6875, "grad_norm_var": 1.0035807291666667, "learning_rate": 0.0003, "loss": 11.3177, "loss/aux_loss": 0.04808156695216894, "loss/crossentropy": 2.766414910554886, "loss/logits": 0.8738790214061737, "step": 34230 }, { "epoch": 0.3424, "grad_norm": 13.6875, "grad_norm_var": 0.32024739583333334, "learning_rate": 0.0003, "loss": 11.1505, "loss/aux_loss": 0.04807690214365721, "loss/crossentropy": 2.6533170878887176, "loss/logits": 0.8381021320819855, "step": 34240 }, { "epoch": 0.3425, "grad_norm": 13.125, "grad_norm_var": 0.4025390625, "learning_rate": 0.0003, "loss": 11.3866, "loss/aux_loss": 0.048076437786221504, "loss/crossentropy": 2.852277064323425, "loss/logits": 0.8767792642116546, "step": 34250 }, { "epoch": 0.3426, "grad_norm": 12.875, "grad_norm_var": 0.3776041666666667, "learning_rate": 0.0003, "loss": 11.3834, "loss/aux_loss": 0.04808211978524923, "loss/crossentropy": 2.8015721797943116, "loss/logits": 0.8706277936697007, "step": 34260 }, { "epoch": 0.3427, "grad_norm": 13.125, "grad_norm_var": 2.2030598958333334, "learning_rate": 0.0003, "loss": 11.3862, "loss/aux_loss": 0.048078938759863375, "loss/crossentropy": 2.790551495552063, "loss/logits": 0.8879824995994567, "step": 34270 }, { "epoch": 0.3428, "grad_norm": 13.3125, "grad_norm_var": 2.0182291666666665, "learning_rate": 0.0003, "loss": 11.4202, "loss/aux_loss": 0.048073595948517324, "loss/crossentropy": 2.634516406059265, "loss/logits": 0.8375965476036071, "step": 34280 }, { "epoch": 0.3429, "grad_norm": 14.5, "grad_norm_var": 0.3700358072916667, "learning_rate": 0.0003, "loss": 11.1732, "loss/aux_loss": 0.048077768087387084, "loss/crossentropy": 2.8296406984329225, "loss/logits": 0.8416439831256867, "step": 34290 }, { "epoch": 0.343, "grad_norm": 14.8125, "grad_norm_var": 8.235139973958333, "learning_rate": 0.0003, "loss": 11.4395, "loss/aux_loss": 0.04807805363088846, "loss/crossentropy": 2.6985132932662963, "loss/logits": 0.8508864104747772, "step": 34300 }, { "epoch": 0.3431, "grad_norm": 12.8125, "grad_norm_var": 8.224934895833334, "learning_rate": 0.0003, "loss": 11.3689, "loss/aux_loss": 0.04808331541717052, "loss/crossentropy": 2.7489245235919952, "loss/logits": 0.8492704391479492, "step": 34310 }, { "epoch": 0.3432, "grad_norm": 15.5, "grad_norm_var": 0.6957682291666667, "learning_rate": 0.0003, "loss": 11.3776, "loss/aux_loss": 0.048074118047952655, "loss/crossentropy": 2.6718755304813384, "loss/logits": 0.8244736731052399, "step": 34320 }, { "epoch": 0.3433, "grad_norm": 14.0, "grad_norm_var": 0.699853515625, "learning_rate": 0.0003, "loss": 11.3257, "loss/aux_loss": 0.04808854255825281, "loss/crossentropy": 2.5519628286361695, "loss/logits": 0.8258508026599884, "step": 34330 }, { "epoch": 0.3434, "grad_norm": 13.5625, "grad_norm_var": 0.9551920572916667, "learning_rate": 0.0003, "loss": 11.2696, "loss/aux_loss": 0.048077429085969924, "loss/crossentropy": 2.7307616233825684, "loss/logits": 0.8781674951314926, "step": 34340 }, { "epoch": 0.3435, "grad_norm": 12.875, "grad_norm_var": 0.2400390625, "learning_rate": 0.0003, "loss": 11.5434, "loss/aux_loss": 0.0480849402025342, "loss/crossentropy": 2.7033145487308503, "loss/logits": 0.8667132765054703, "step": 34350 }, { "epoch": 0.3436, "grad_norm": 13.625, "grad_norm_var": 0.24348958333333334, "learning_rate": 0.0003, "loss": 11.2203, "loss/aux_loss": 0.048083757422864436, "loss/crossentropy": 2.7016442120075226, "loss/logits": 0.8575159192085267, "step": 34360 }, { "epoch": 0.3437, "grad_norm": 13.25, "grad_norm_var": 0.5801432291666667, "learning_rate": 0.0003, "loss": 11.4771, "loss/aux_loss": 0.04808876011520624, "loss/crossentropy": 2.7391174018383024, "loss/logits": 0.864966481924057, "step": 34370 }, { "epoch": 0.3438, "grad_norm": 13.0625, "grad_norm_var": 0.8673014322916667, "learning_rate": 0.0003, "loss": 11.489, "loss/aux_loss": 0.04807764030992985, "loss/crossentropy": 2.7683672428131105, "loss/logits": 0.8739204913377762, "step": 34380 }, { "epoch": 0.3439, "grad_norm": 13.1875, "grad_norm_var": 0.6416015625, "learning_rate": 0.0003, "loss": 11.2858, "loss/aux_loss": 0.04807857647538185, "loss/crossentropy": 2.7284740686416624, "loss/logits": 0.8791959375143051, "step": 34390 }, { "epoch": 0.344, "grad_norm": 13.9375, "grad_norm_var": 0.7230305989583333, "learning_rate": 0.0003, "loss": 11.4192, "loss/aux_loss": 0.04808525741100311, "loss/crossentropy": 2.701452487707138, "loss/logits": 0.8486621975898743, "step": 34400 }, { "epoch": 0.3441, "grad_norm": 14.0, "grad_norm_var": 0.49420572916666666, "learning_rate": 0.0003, "loss": 11.4331, "loss/aux_loss": 0.04808118660002947, "loss/crossentropy": 2.7283570528030396, "loss/logits": 0.8572055399417877, "step": 34410 }, { "epoch": 0.3442, "grad_norm": 15.4375, "grad_norm_var": 67.07941080729167, "learning_rate": 0.0003, "loss": 11.4184, "loss/aux_loss": 0.04808388836681843, "loss/crossentropy": 2.7710089802742006, "loss/logits": 0.9001825273036956, "step": 34420 }, { "epoch": 0.3443, "grad_norm": 14.375, "grad_norm_var": 66.71573893229167, "learning_rate": 0.0003, "loss": 11.4897, "loss/aux_loss": 0.04807852674275637, "loss/crossentropy": 2.9153838396072387, "loss/logits": 0.9058898121118546, "step": 34430 }, { "epoch": 0.3444, "grad_norm": 13.4375, "grad_norm_var": 0.4254557291666667, "learning_rate": 0.0003, "loss": 11.3548, "loss/aux_loss": 0.048092870600521566, "loss/crossentropy": 2.798048400878906, "loss/logits": 0.8726730048656464, "step": 34440 }, { "epoch": 0.3445, "grad_norm": 13.125, "grad_norm_var": 2.915478515625, "learning_rate": 0.0003, "loss": 11.3393, "loss/aux_loss": 0.04807271007448435, "loss/crossentropy": 2.8184443950653075, "loss/logits": 0.8590573251247406, "step": 34450 }, { "epoch": 0.3446, "grad_norm": 12.625, "grad_norm_var": 2.8291015625, "learning_rate": 0.0003, "loss": 11.3975, "loss/aux_loss": 0.048090783134102824, "loss/crossentropy": 2.57768235206604, "loss/logits": 0.8573962718248367, "step": 34460 }, { "epoch": 0.3447, "grad_norm": 15.3125, "grad_norm_var": 3.8590983072916667, "learning_rate": 0.0003, "loss": 11.4101, "loss/aux_loss": 0.0480766186490655, "loss/crossentropy": 2.7841560423374174, "loss/logits": 0.8446702927350997, "step": 34470 }, { "epoch": 0.3448, "grad_norm": 13.9375, "grad_norm_var": 497.44138997395834, "learning_rate": 0.0003, "loss": 11.3165, "loss/aux_loss": 0.04808448310941458, "loss/crossentropy": 2.6943975150585175, "loss/logits": 0.8504249632358551, "step": 34480 }, { "epoch": 0.3449, "grad_norm": 13.8125, "grad_norm_var": 500.03435872395835, "learning_rate": 0.0003, "loss": 11.4362, "loss/aux_loss": 0.04807149842381477, "loss/crossentropy": 2.736327660083771, "loss/logits": 0.900990754365921, "step": 34490 }, { "epoch": 0.345, "grad_norm": 14.0, "grad_norm_var": 0.38357747395833336, "learning_rate": 0.0003, "loss": 11.5657, "loss/aux_loss": 0.04809287562966347, "loss/crossentropy": 2.7342132091522218, "loss/logits": 0.864747279882431, "step": 34500 }, { "epoch": 0.3451, "grad_norm": 13.8125, "grad_norm_var": 0.5598958333333334, "learning_rate": 0.0003, "loss": 11.3414, "loss/aux_loss": 0.04807418640702963, "loss/crossentropy": 2.7356625139713286, "loss/logits": 0.8603394240140915, "step": 34510 }, { "epoch": 0.3452, "grad_norm": 14.75, "grad_norm_var": 15.267643229166667, "learning_rate": 0.0003, "loss": 11.451, "loss/aux_loss": 0.048086957447230814, "loss/crossentropy": 2.6491969525814056, "loss/logits": 0.8844427525997162, "step": 34520 }, { "epoch": 0.3453, "grad_norm": 13.125, "grad_norm_var": 16.376546223958332, "learning_rate": 0.0003, "loss": 11.39, "loss/aux_loss": 0.048073998652398586, "loss/crossentropy": 2.768836522102356, "loss/logits": 0.8581522196531296, "step": 34530 }, { "epoch": 0.3454, "grad_norm": 14.5625, "grad_norm_var": 0.8609212239583334, "learning_rate": 0.0003, "loss": 11.3267, "loss/aux_loss": 0.04808975532650948, "loss/crossentropy": 2.5816560626029967, "loss/logits": 0.8240507543087006, "step": 34540 }, { "epoch": 0.3455, "grad_norm": 13.3125, "grad_norm_var": 0.42967122395833335, "learning_rate": 0.0003, "loss": 11.3352, "loss/aux_loss": 0.048081275261938575, "loss/crossentropy": 2.775824022293091, "loss/logits": 0.8478355586528779, "step": 34550 }, { "epoch": 0.3456, "grad_norm": 52.75, "grad_norm_var": 95.89368489583333, "learning_rate": 0.0003, "loss": 11.4611, "loss/aux_loss": 0.04807286318391561, "loss/crossentropy": 2.7201479375362396, "loss/logits": 0.8709542602300644, "step": 34560 }, { "epoch": 0.3457, "grad_norm": 13.5625, "grad_norm_var": 94.72369791666667, "learning_rate": 0.0003, "loss": 11.3639, "loss/aux_loss": 0.04808838125318289, "loss/crossentropy": 2.6683280885219576, "loss/logits": 0.8334614604711532, "step": 34570 }, { "epoch": 0.3458, "grad_norm": 13.125, "grad_norm_var": 0.28828125, "learning_rate": 0.0003, "loss": 11.2168, "loss/aux_loss": 0.048079443350434306, "loss/crossentropy": 2.8578147292137146, "loss/logits": 0.851711419224739, "step": 34580 }, { "epoch": 0.3459, "grad_norm": 14.0, "grad_norm_var": 1.5541015625, "learning_rate": 0.0003, "loss": 11.5112, "loss/aux_loss": 0.048081373795866966, "loss/crossentropy": 2.717966139316559, "loss/logits": 0.8493025034666062, "step": 34590 }, { "epoch": 0.346, "grad_norm": 13.1875, "grad_norm_var": 1.81953125, "learning_rate": 0.0003, "loss": 11.344, "loss/aux_loss": 0.04807697702199221, "loss/crossentropy": 2.6692949771881103, "loss/logits": 0.8717973381280899, "step": 34600 }, { "epoch": 0.3461, "grad_norm": 12.6875, "grad_norm_var": 0.6989583333333333, "learning_rate": 0.0003, "loss": 11.3426, "loss/aux_loss": 0.048083077929913995, "loss/crossentropy": 2.7653361916542054, "loss/logits": 0.8600286096334457, "step": 34610 }, { "epoch": 0.3462, "grad_norm": 15.5, "grad_norm_var": 0.632275390625, "learning_rate": 0.0003, "loss": 11.5035, "loss/aux_loss": 0.04807848259806633, "loss/crossentropy": 2.6988938450813293, "loss/logits": 0.8438379615545273, "step": 34620 }, { "epoch": 0.3463, "grad_norm": 13.1875, "grad_norm_var": 0.7317057291666667, "learning_rate": 0.0003, "loss": 11.3984, "loss/aux_loss": 0.048082325235009195, "loss/crossentropy": 2.8675466597080232, "loss/logits": 0.8446835935115814, "step": 34630 }, { "epoch": 0.3464, "grad_norm": 13.625, "grad_norm_var": 0.482666015625, "learning_rate": 0.0003, "loss": 11.5275, "loss/aux_loss": 0.04808323420584202, "loss/crossentropy": 2.6933222889900206, "loss/logits": 0.8721669852733612, "step": 34640 }, { "epoch": 0.3465, "grad_norm": 13.6875, "grad_norm_var": 4.624983723958334, "learning_rate": 0.0003, "loss": 11.2929, "loss/aux_loss": 0.04808050952851772, "loss/crossentropy": 2.932732379436493, "loss/logits": 0.8866453051567078, "step": 34650 }, { "epoch": 0.3466, "grad_norm": 14.0, "grad_norm_var": 4.804801432291667, "learning_rate": 0.0003, "loss": 11.5469, "loss/aux_loss": 0.04808491580188275, "loss/crossentropy": 2.6847081184387207, "loss/logits": 0.8588018774986267, "step": 34660 }, { "epoch": 0.3467, "grad_norm": 12.375, "grad_norm_var": 0.226025390625, "learning_rate": 0.0003, "loss": 11.3049, "loss/aux_loss": 0.04807180892676115, "loss/crossentropy": 2.8481385111808777, "loss/logits": 0.8246441930532455, "step": 34670 }, { "epoch": 0.3468, "grad_norm": 13.5625, "grad_norm_var": 0.1884765625, "learning_rate": 0.0003, "loss": 11.4136, "loss/aux_loss": 0.04807733632624149, "loss/crossentropy": 2.7871821761131286, "loss/logits": 0.8697138547897338, "step": 34680 }, { "epoch": 0.3469, "grad_norm": 13.3125, "grad_norm_var": 0.16243489583333334, "learning_rate": 0.0003, "loss": 11.2759, "loss/aux_loss": 0.04808486551046372, "loss/crossentropy": 2.7706130504608155, "loss/logits": 0.8613585025072098, "step": 34690 }, { "epoch": 0.347, "grad_norm": 13.875, "grad_norm_var": 0.6391764322916667, "learning_rate": 0.0003, "loss": 11.2369, "loss/aux_loss": 0.048074539937078956, "loss/crossentropy": 2.725221812725067, "loss/logits": 0.8312882751226425, "step": 34700 }, { "epoch": 0.3471, "grad_norm": 14.75, "grad_norm_var": 1.1984375, "learning_rate": 0.0003, "loss": 11.3719, "loss/aux_loss": 0.048076304234564306, "loss/crossentropy": 2.8168802559375763, "loss/logits": 0.8541012018918991, "step": 34710 }, { "epoch": 0.3472, "grad_norm": 14.9375, "grad_norm_var": 0.8786295572916667, "learning_rate": 0.0003, "loss": 11.2599, "loss/aux_loss": 0.048082977347075936, "loss/crossentropy": 2.701399064064026, "loss/logits": 0.8401564180850982, "step": 34720 }, { "epoch": 0.3473, "grad_norm": 13.5625, "grad_norm_var": 0.941259765625, "learning_rate": 0.0003, "loss": 11.4601, "loss/aux_loss": 0.048081225156784056, "loss/crossentropy": 2.8854560017585755, "loss/logits": 0.8480129152536392, "step": 34730 }, { "epoch": 0.3474, "grad_norm": 14.375, "grad_norm_var": 0.24881184895833333, "learning_rate": 0.0003, "loss": 11.4022, "loss/aux_loss": 0.048076600581407544, "loss/crossentropy": 2.7241687536239625, "loss/logits": 0.8581605464220047, "step": 34740 }, { "epoch": 0.3475, "grad_norm": 13.3125, "grad_norm_var": 0.37472330729166664, "learning_rate": 0.0003, "loss": 11.3392, "loss/aux_loss": 0.04808560237288475, "loss/crossentropy": 2.6501355826854707, "loss/logits": 0.843683734536171, "step": 34750 }, { "epoch": 0.3476, "grad_norm": 14.0625, "grad_norm_var": 0.20792643229166666, "learning_rate": 0.0003, "loss": 11.4091, "loss/aux_loss": 0.04806531127542257, "loss/crossentropy": 2.753983849287033, "loss/logits": 0.8834265947341919, "step": 34760 }, { "epoch": 0.3477, "grad_norm": 14.0, "grad_norm_var": 0.3792805989583333, "learning_rate": 0.0003, "loss": 11.5056, "loss/aux_loss": 0.04808344487100839, "loss/crossentropy": 2.8421459555625916, "loss/logits": 0.9221995055675507, "step": 34770 }, { "epoch": 0.3478, "grad_norm": 14.0625, "grad_norm_var": 0.41119791666666666, "learning_rate": 0.0003, "loss": 11.3332, "loss/aux_loss": 0.04807017482817173, "loss/crossentropy": 2.630517715215683, "loss/logits": 0.8022065937519074, "step": 34780 }, { "epoch": 0.3479, "grad_norm": 13.25, "grad_norm_var": 0.37161458333333336, "learning_rate": 0.0003, "loss": 11.4773, "loss/aux_loss": 0.04808255229145288, "loss/crossentropy": 2.7792518377304076, "loss/logits": 0.8739649176597595, "step": 34790 }, { "epoch": 0.348, "grad_norm": 13.25, "grad_norm_var": 0.24998372395833332, "learning_rate": 0.0003, "loss": 11.2592, "loss/aux_loss": 0.048081311769783495, "loss/crossentropy": 2.8258360862731933, "loss/logits": 0.8601921498775482, "step": 34800 }, { "epoch": 0.3481, "grad_norm": 12.1875, "grad_norm_var": 0.3882649739583333, "learning_rate": 0.0003, "loss": 11.2427, "loss/aux_loss": 0.048077326826751234, "loss/crossentropy": 2.804766833782196, "loss/logits": 0.8551715075969696, "step": 34810 }, { "epoch": 0.3482, "grad_norm": 13.0625, "grad_norm_var": 0.44212239583333335, "learning_rate": 0.0003, "loss": 11.3048, "loss/aux_loss": 0.048078553192317484, "loss/crossentropy": 2.6608037412166596, "loss/logits": 0.8357015043497086, "step": 34820 }, { "epoch": 0.3483, "grad_norm": 13.9375, "grad_norm_var": 0.230712890625, "learning_rate": 0.0003, "loss": 11.3251, "loss/aux_loss": 0.04808176066726446, "loss/crossentropy": 2.824368530511856, "loss/logits": 0.8423573106527329, "step": 34830 }, { "epoch": 0.3484, "grad_norm": 13.1875, "grad_norm_var": 0.2775390625, "learning_rate": 0.0003, "loss": 11.5143, "loss/aux_loss": 0.04807648658752441, "loss/crossentropy": 2.8121955931186675, "loss/logits": 0.8731589168310165, "step": 34840 }, { "epoch": 0.3485, "grad_norm": 15.0625, "grad_norm_var": 1.0973958333333333, "learning_rate": 0.0003, "loss": 11.3939, "loss/aux_loss": 0.04807734619826078, "loss/crossentropy": 2.754815798997879, "loss/logits": 0.8677790522575378, "step": 34850 }, { "epoch": 0.3486, "grad_norm": 13.75, "grad_norm_var": 1.135791015625, "learning_rate": 0.0003, "loss": 11.3472, "loss/aux_loss": 0.04807490929961204, "loss/crossentropy": 2.728018116950989, "loss/logits": 0.8459836810827255, "step": 34860 }, { "epoch": 0.3487, "grad_norm": 13.75, "grad_norm_var": 0.6589680989583333, "learning_rate": 0.0003, "loss": 11.2925, "loss/aux_loss": 0.0480719706043601, "loss/crossentropy": 2.681806039810181, "loss/logits": 0.8287836849689484, "step": 34870 }, { "epoch": 0.3488, "grad_norm": 12.625, "grad_norm_var": 0.313916015625, "learning_rate": 0.0003, "loss": 11.2936, "loss/aux_loss": 0.04808789901435375, "loss/crossentropy": 2.5996453762054443, "loss/logits": 0.8564824372529983, "step": 34880 }, { "epoch": 0.3489, "grad_norm": 13.125, "grad_norm_var": 0.4479166666666667, "learning_rate": 0.0003, "loss": 11.417, "loss/aux_loss": 0.04807357657700777, "loss/crossentropy": 2.754497063159943, "loss/logits": 0.8746155887842179, "step": 34890 }, { "epoch": 0.349, "grad_norm": 13.9375, "grad_norm_var": 1.4731770833333333, "learning_rate": 0.0003, "loss": 11.3581, "loss/aux_loss": 0.04808343891054392, "loss/crossentropy": 2.722722589969635, "loss/logits": 0.8574351370334625, "step": 34900 }, { "epoch": 0.3491, "grad_norm": 13.4375, "grad_norm_var": 0.468994140625, "learning_rate": 0.0003, "loss": 11.2342, "loss/aux_loss": 0.04807738587260246, "loss/crossentropy": 2.85051429271698, "loss/logits": 0.8694952636957168, "step": 34910 }, { "epoch": 0.3492, "grad_norm": 13.8125, "grad_norm_var": 0.30935872395833336, "learning_rate": 0.0003, "loss": 11.2306, "loss/aux_loss": 0.04808169659227133, "loss/crossentropy": 2.581733113527298, "loss/logits": 0.8042867451906204, "step": 34920 }, { "epoch": 0.3493, "grad_norm": 13.8125, "grad_norm_var": 0.36380208333333336, "learning_rate": 0.0003, "loss": 11.3718, "loss/aux_loss": 0.04807847626507282, "loss/crossentropy": 2.7856172263622283, "loss/logits": 0.8864524632692337, "step": 34930 }, { "epoch": 0.3494, "grad_norm": 12.5625, "grad_norm_var": 0.5591145833333333, "learning_rate": 0.0003, "loss": 11.4262, "loss/aux_loss": 0.04807476550340652, "loss/crossentropy": 2.842118561267853, "loss/logits": 0.8961553603410721, "step": 34940 }, { "epoch": 0.3495, "grad_norm": 13.125, "grad_norm_var": 0.6594889322916667, "learning_rate": 0.0003, "loss": 11.2777, "loss/aux_loss": 0.04807927329093218, "loss/crossentropy": 2.732057309150696, "loss/logits": 0.8318034172058105, "step": 34950 }, { "epoch": 0.3496, "grad_norm": 13.5625, "grad_norm_var": 0.6219889322916666, "learning_rate": 0.0003, "loss": 11.2521, "loss/aux_loss": 0.04807781353592873, "loss/crossentropy": 2.712051713466644, "loss/logits": 0.8508867889642715, "step": 34960 }, { "epoch": 0.3497, "grad_norm": 14.5625, "grad_norm_var": 0.6476399739583333, "learning_rate": 0.0003, "loss": 11.2176, "loss/aux_loss": 0.04807377476245165, "loss/crossentropy": 2.7046292066574096, "loss/logits": 0.8390609532594681, "step": 34970 }, { "epoch": 0.3498, "grad_norm": 13.1875, "grad_norm_var": 0.25201822916666666, "learning_rate": 0.0003, "loss": 11.3613, "loss/aux_loss": 0.04808987118303776, "loss/crossentropy": 2.7671579003334044, "loss/logits": 0.8521817743778228, "step": 34980 }, { "epoch": 0.3499, "grad_norm": 13.125, "grad_norm_var": 0.21834309895833334, "learning_rate": 0.0003, "loss": 11.2822, "loss/aux_loss": 0.048079888336360455, "loss/crossentropy": 2.6444417238235474, "loss/logits": 0.842853182554245, "step": 34990 }, { "epoch": 0.35, "grad_norm": 13.125, "grad_norm_var": 0.212353515625, "learning_rate": 0.0003, "loss": 11.2254, "loss/aux_loss": 0.04808303378522396, "loss/crossentropy": 2.692433053255081, "loss/logits": 0.8417773574590683, "step": 35000 }, { "epoch": 0.3501, "grad_norm": 14.0, "grad_norm_var": 0.2947265625, "learning_rate": 0.0003, "loss": 11.4454, "loss/aux_loss": 0.04807858187705279, "loss/crossentropy": 2.7733002305030823, "loss/logits": 0.8693195581436157, "step": 35010 }, { "epoch": 0.3502, "grad_norm": 14.5625, "grad_norm_var": 0.8526041666666667, "learning_rate": 0.0003, "loss": 11.3941, "loss/aux_loss": 0.04808044396340847, "loss/crossentropy": 2.615969657897949, "loss/logits": 0.8622830808162689, "step": 35020 }, { "epoch": 0.3503, "grad_norm": 12.625, "grad_norm_var": 0.931494140625, "learning_rate": 0.0003, "loss": 11.2957, "loss/aux_loss": 0.04807266090065241, "loss/crossentropy": 2.801264774799347, "loss/logits": 0.8435007154941558, "step": 35030 }, { "epoch": 0.3504, "grad_norm": 13.3125, "grad_norm_var": 0.484619140625, "learning_rate": 0.0003, "loss": 11.3, "loss/aux_loss": 0.04808126632124186, "loss/crossentropy": 2.7451845824718477, "loss/logits": 0.867804229259491, "step": 35040 }, { "epoch": 0.3505, "grad_norm": 14.1875, "grad_norm_var": 0.151806640625, "learning_rate": 0.0003, "loss": 11.346, "loss/aux_loss": 0.04807517919689417, "loss/crossentropy": 2.8747010231018066, "loss/logits": 0.8618029087781907, "step": 35050 }, { "epoch": 0.3506, "grad_norm": 14.0, "grad_norm_var": 0.3374837239583333, "learning_rate": 0.0003, "loss": 11.4672, "loss/aux_loss": 0.048079409264028074, "loss/crossentropy": 2.8339676380157472, "loss/logits": 0.856550145149231, "step": 35060 }, { "epoch": 0.3507, "grad_norm": 13.5625, "grad_norm_var": 1.504931640625, "learning_rate": 0.0003, "loss": 11.4003, "loss/aux_loss": 0.04808185379952192, "loss/crossentropy": 2.8066389322280885, "loss/logits": 0.897555747628212, "step": 35070 }, { "epoch": 0.3508, "grad_norm": 14.875, "grad_norm_var": 0.5884765625, "learning_rate": 0.0003, "loss": 11.546, "loss/aux_loss": 0.04807468615472317, "loss/crossentropy": 2.8240845441818236, "loss/logits": 0.8684464514255523, "step": 35080 }, { "epoch": 0.3509, "grad_norm": 15.9375, "grad_norm_var": 5.7009765625, "learning_rate": 0.0003, "loss": 11.3058, "loss/aux_loss": 0.04807514958083629, "loss/crossentropy": 2.7188303232192994, "loss/logits": 0.8454073309898377, "step": 35090 }, { "epoch": 0.351, "grad_norm": 13.625, "grad_norm_var": 5.8384765625, "learning_rate": 0.0003, "loss": 11.3441, "loss/aux_loss": 0.04809205364435911, "loss/crossentropy": 2.6878881573677065, "loss/logits": 0.8072956264019012, "step": 35100 }, { "epoch": 0.3511, "grad_norm": 12.1875, "grad_norm_var": 1.4634765625, "learning_rate": 0.0003, "loss": 11.3553, "loss/aux_loss": 0.048081132024526595, "loss/crossentropy": 2.7471259951591493, "loss/logits": 0.871997344493866, "step": 35110 }, { "epoch": 0.3512, "grad_norm": 14.625, "grad_norm_var": 0.869775390625, "learning_rate": 0.0003, "loss": 11.4906, "loss/aux_loss": 0.04808983094990253, "loss/crossentropy": 2.767477738857269, "loss/logits": 0.8434695929288865, "step": 35120 }, { "epoch": 0.3513, "grad_norm": 13.125, "grad_norm_var": 0.463916015625, "learning_rate": 0.0003, "loss": 11.3584, "loss/aux_loss": 0.048065770603716376, "loss/crossentropy": 2.678995144367218, "loss/logits": 0.8505305916070938, "step": 35130 }, { "epoch": 0.3514, "grad_norm": 14.625, "grad_norm_var": 0.4921875, "learning_rate": 0.0003, "loss": 11.253, "loss/aux_loss": 0.04808024074882269, "loss/crossentropy": 2.7573612451553347, "loss/logits": 0.8538782745599747, "step": 35140 }, { "epoch": 0.3515, "grad_norm": 13.0625, "grad_norm_var": 0.5835774739583334, "learning_rate": 0.0003, "loss": 11.1857, "loss/aux_loss": 0.04808320011943579, "loss/crossentropy": 2.6276727855205535, "loss/logits": 0.8079499483108521, "step": 35150 }, { "epoch": 0.3516, "grad_norm": 14.3125, "grad_norm_var": 1.0212890625, "learning_rate": 0.0003, "loss": 11.4222, "loss/aux_loss": 0.0480812968686223, "loss/crossentropy": 2.6966384649276733, "loss/logits": 0.8357253611087799, "step": 35160 }, { "epoch": 0.3517, "grad_norm": 13.0, "grad_norm_var": 0.9255045572916667, "learning_rate": 0.0003, "loss": 11.3839, "loss/aux_loss": 0.04808384496718645, "loss/crossentropy": 2.7424313902854918, "loss/logits": 0.8915427207946778, "step": 35170 }, { "epoch": 0.3518, "grad_norm": 14.0625, "grad_norm_var": 10.929020182291667, "learning_rate": 0.0003, "loss": 11.3673, "loss/aux_loss": 0.048090758919715884, "loss/crossentropy": 2.70193572640419, "loss/logits": 0.8317285031080246, "step": 35180 }, { "epoch": 0.3519, "grad_norm": 13.6875, "grad_norm_var": 0.20338541666666668, "learning_rate": 0.0003, "loss": 11.3432, "loss/aux_loss": 0.048075008764863016, "loss/crossentropy": 2.724704682826996, "loss/logits": 0.8616194367408753, "step": 35190 }, { "epoch": 0.352, "grad_norm": 13.25, "grad_norm_var": 0.42706705729166666, "learning_rate": 0.0003, "loss": 11.3289, "loss/aux_loss": 0.04808625653386116, "loss/crossentropy": 2.6437718331813813, "loss/logits": 0.829070645570755, "step": 35200 }, { "epoch": 0.3521, "grad_norm": 14.625, "grad_norm_var": 0.65625, "learning_rate": 0.0003, "loss": 11.3784, "loss/aux_loss": 0.04808688312768936, "loss/crossentropy": 2.8204615235328676, "loss/logits": 0.9107112646102905, "step": 35210 }, { "epoch": 0.3522, "grad_norm": 15.0, "grad_norm_var": 0.6231770833333333, "learning_rate": 0.0003, "loss": 11.3746, "loss/aux_loss": 0.048073959164321424, "loss/crossentropy": 2.9584303498268127, "loss/logits": 0.8751641631126403, "step": 35220 }, { "epoch": 0.3523, "grad_norm": 12.6875, "grad_norm_var": 0.447900390625, "learning_rate": 0.0003, "loss": 11.3529, "loss/aux_loss": 0.048080052994191644, "loss/crossentropy": 2.8173577427864074, "loss/logits": 0.8873382836580277, "step": 35230 }, { "epoch": 0.3524, "grad_norm": 13.6875, "grad_norm_var": 0.5597493489583333, "learning_rate": 0.0003, "loss": 11.543, "loss/aux_loss": 0.048076377063989637, "loss/crossentropy": 2.825025236606598, "loss/logits": 0.8857592105865478, "step": 35240 }, { "epoch": 0.3525, "grad_norm": 13.3125, "grad_norm_var": 0.6442057291666666, "learning_rate": 0.0003, "loss": 11.3836, "loss/aux_loss": 0.048085327818989755, "loss/crossentropy": 2.6969442307949065, "loss/logits": 0.8094238936901093, "step": 35250 }, { "epoch": 0.3526, "grad_norm": 13.875, "grad_norm_var": 0.645556640625, "learning_rate": 0.0003, "loss": 11.3612, "loss/aux_loss": 0.04808722659945488, "loss/crossentropy": 2.6123400807380674, "loss/logits": 0.8106647431850433, "step": 35260 }, { "epoch": 0.3527, "grad_norm": 14.125, "grad_norm_var": 55.956624348958336, "learning_rate": 0.0003, "loss": 11.3009, "loss/aux_loss": 0.048074896819889544, "loss/crossentropy": 2.810643696784973, "loss/logits": 0.8637238830327988, "step": 35270 }, { "epoch": 0.3528, "grad_norm": 22.625, "grad_norm_var": 68.66764322916667, "learning_rate": 0.0003, "loss": 11.3654, "loss/aux_loss": 0.04808924626559019, "loss/crossentropy": 2.6293312191963194, "loss/logits": 0.8716479748487472, "step": 35280 }, { "epoch": 0.3529, "grad_norm": 13.875, "grad_norm_var": 7.793082682291667, "learning_rate": 0.0003, "loss": 11.2959, "loss/aux_loss": 0.04808361511677504, "loss/crossentropy": 2.6466811537742614, "loss/logits": 0.8509993731975556, "step": 35290 }, { "epoch": 0.353, "grad_norm": 14.5625, "grad_norm_var": 0.49114583333333334, "learning_rate": 0.0003, "loss": 11.2404, "loss/aux_loss": 0.048073071613907815, "loss/crossentropy": 2.7615082263946533, "loss/logits": 0.8562895059585571, "step": 35300 }, { "epoch": 0.3531, "grad_norm": 13.25, "grad_norm_var": 0.2009765625, "learning_rate": 0.0003, "loss": 11.2599, "loss/aux_loss": 0.04808885268867016, "loss/crossentropy": 2.7079634070396423, "loss/logits": 0.8336867898702621, "step": 35310 }, { "epoch": 0.3532, "grad_norm": 13.3125, "grad_norm_var": 0.14503580729166668, "learning_rate": 0.0003, "loss": 11.3066, "loss/aux_loss": 0.04807536732405424, "loss/crossentropy": 2.9801666378974914, "loss/logits": 0.8767634421586991, "step": 35320 }, { "epoch": 0.3533, "grad_norm": 12.625, "grad_norm_var": 0.30245768229166664, "learning_rate": 0.0003, "loss": 11.538, "loss/aux_loss": 0.048068761453032495, "loss/crossentropy": 2.752707290649414, "loss/logits": 0.878770825266838, "step": 35330 }, { "epoch": 0.3534, "grad_norm": 13.8125, "grad_norm_var": 0.4618326822916667, "learning_rate": 0.0003, "loss": 11.3234, "loss/aux_loss": 0.04809120837599039, "loss/crossentropy": 2.751829779148102, "loss/logits": 0.8367206364870071, "step": 35340 }, { "epoch": 0.3535, "grad_norm": 13.1875, "grad_norm_var": 0.6983723958333333, "learning_rate": 0.0003, "loss": 11.3473, "loss/aux_loss": 0.04807098787277937, "loss/crossentropy": 2.903922712802887, "loss/logits": 0.885872820019722, "step": 35350 }, { "epoch": 0.3536, "grad_norm": 13.4375, "grad_norm_var": 0.747900390625, "learning_rate": 0.0003, "loss": 11.2302, "loss/aux_loss": 0.04807676579803229, "loss/crossentropy": 2.7807978630065917, "loss/logits": 0.8654327929019928, "step": 35360 }, { "epoch": 0.3537, "grad_norm": 13.75, "grad_norm_var": 0.4561848958333333, "learning_rate": 0.0003, "loss": 11.4295, "loss/aux_loss": 0.04808081611990929, "loss/crossentropy": 2.8749902486801147, "loss/logits": 0.8648800730705262, "step": 35370 }, { "epoch": 0.3538, "grad_norm": 14.3125, "grad_norm_var": 10.038997395833333, "learning_rate": 0.0003, "loss": 11.2158, "loss/aux_loss": 0.04807778876274824, "loss/crossentropy": 2.7491287708282472, "loss/logits": 0.8350054025650024, "step": 35380 }, { "epoch": 0.3539, "grad_norm": 13.125, "grad_norm_var": 0.4544270833333333, "learning_rate": 0.0003, "loss": 11.3964, "loss/aux_loss": 0.048072314076125625, "loss/crossentropy": 2.757090598344803, "loss/logits": 0.8464928805828095, "step": 35390 }, { "epoch": 0.354, "grad_norm": 12.9375, "grad_norm_var": 0.3385416666666667, "learning_rate": 0.0003, "loss": 11.2412, "loss/aux_loss": 0.04808321315795183, "loss/crossentropy": 2.7094544529914857, "loss/logits": 0.8820542007684707, "step": 35400 }, { "epoch": 0.3541, "grad_norm": 15.0625, "grad_norm_var": 11.601416015625, "learning_rate": 0.0003, "loss": 11.3299, "loss/aux_loss": 0.04808483738452196, "loss/crossentropy": 2.7222808599472046, "loss/logits": 0.835541981458664, "step": 35410 }, { "epoch": 0.3542, "grad_norm": 13.0625, "grad_norm_var": 0.7833333333333333, "learning_rate": 0.0003, "loss": 11.3284, "loss/aux_loss": 0.04806767236441374, "loss/crossentropy": 2.756817102432251, "loss/logits": 0.8438375443220139, "step": 35420 }, { "epoch": 0.3543, "grad_norm": 13.6875, "grad_norm_var": 0.32076822916666664, "learning_rate": 0.0003, "loss": 11.1812, "loss/aux_loss": 0.04807917140424252, "loss/crossentropy": 2.6759197235107424, "loss/logits": 0.809447106719017, "step": 35430 }, { "epoch": 0.3544, "grad_norm": 13.625, "grad_norm_var": 0.32526041666666666, "learning_rate": 0.0003, "loss": 11.3935, "loss/aux_loss": 0.048069739155471324, "loss/crossentropy": 2.920281636714935, "loss/logits": 0.8890509903430939, "step": 35440 }, { "epoch": 0.3545, "grad_norm": 12.75, "grad_norm_var": 0.43748372395833335, "learning_rate": 0.0003, "loss": 11.2958, "loss/aux_loss": 0.048088513500988486, "loss/crossentropy": 2.5987202882766725, "loss/logits": 0.8323431223630905, "step": 35450 }, { "epoch": 0.3546, "grad_norm": 20.0, "grad_norm_var": 367.611181640625, "learning_rate": 0.0003, "loss": 11.345, "loss/aux_loss": 0.04808755200356245, "loss/crossentropy": 2.6423967361450194, "loss/logits": 0.849226924777031, "step": 35460 }, { "epoch": 0.3547, "grad_norm": 13.125, "grad_norm_var": 2.9330729166666667, "learning_rate": 0.0003, "loss": 11.3632, "loss/aux_loss": 0.04807810019701719, "loss/crossentropy": 2.7577707767486572, "loss/logits": 0.8662696242332458, "step": 35470 }, { "epoch": 0.3548, "grad_norm": 13.875, "grad_norm_var": 0.524853515625, "learning_rate": 0.0003, "loss": 11.4028, "loss/aux_loss": 0.048086337931454184, "loss/crossentropy": 2.615101230144501, "loss/logits": 0.836988553404808, "step": 35480 }, { "epoch": 0.3549, "grad_norm": 15.125, "grad_norm_var": 1.8457682291666666, "learning_rate": 0.0003, "loss": 11.2295, "loss/aux_loss": 0.04806696325540542, "loss/crossentropy": 2.6185821652412415, "loss/logits": 0.8607719987630844, "step": 35490 }, { "epoch": 0.355, "grad_norm": 14.875, "grad_norm_var": 1.1585774739583334, "learning_rate": 0.0003, "loss": 11.3416, "loss/aux_loss": 0.04807763248682022, "loss/crossentropy": 2.7731160163879394, "loss/logits": 0.8407220751047134, "step": 35500 }, { "epoch": 0.3551, "grad_norm": 13.1875, "grad_norm_var": 0.5738932291666666, "learning_rate": 0.0003, "loss": 11.3636, "loss/aux_loss": 0.04808692019432783, "loss/crossentropy": 2.7722087264060975, "loss/logits": 0.8629306703805923, "step": 35510 }, { "epoch": 0.3552, "grad_norm": 13.875, "grad_norm_var": 0.29322916666666665, "learning_rate": 0.0003, "loss": 11.4107, "loss/aux_loss": 0.04807148296386003, "loss/crossentropy": 2.8003060460090636, "loss/logits": 0.8875281304121018, "step": 35520 }, { "epoch": 0.3553, "grad_norm": 12.4375, "grad_norm_var": 0.35323893229166664, "learning_rate": 0.0003, "loss": 11.4067, "loss/aux_loss": 0.04808224029839039, "loss/crossentropy": 2.701129513978958, "loss/logits": 0.8584021329879761, "step": 35530 }, { "epoch": 0.3554, "grad_norm": 13.875, "grad_norm_var": 0.42888997395833334, "learning_rate": 0.0003, "loss": 11.4743, "loss/aux_loss": 0.048079540766775605, "loss/crossentropy": 2.85890337228775, "loss/logits": 0.8821686983108521, "step": 35540 }, { "epoch": 0.3555, "grad_norm": 13.5625, "grad_norm_var": 0.37473958333333335, "learning_rate": 0.0003, "loss": 11.3944, "loss/aux_loss": 0.04807381983846426, "loss/crossentropy": 2.7811784029006956, "loss/logits": 0.8604174524545669, "step": 35550 }, { "epoch": 0.3556, "grad_norm": 14.125, "grad_norm_var": 0.4749348958333333, "learning_rate": 0.0003, "loss": 11.2323, "loss/aux_loss": 0.04808265678584576, "loss/crossentropy": 2.76348534822464, "loss/logits": 0.8644289672374725, "step": 35560 }, { "epoch": 0.3557, "grad_norm": 13.0625, "grad_norm_var": 0.3317057291666667, "learning_rate": 0.0003, "loss": 11.3451, "loss/aux_loss": 0.048073519952595234, "loss/crossentropy": 2.821522521972656, "loss/logits": 0.8591786533594131, "step": 35570 }, { "epoch": 0.3558, "grad_norm": 16.875, "grad_norm_var": 1.5110514322916666, "learning_rate": 0.0003, "loss": 11.1857, "loss/aux_loss": 0.048086483217775824, "loss/crossentropy": 2.685242211818695, "loss/logits": 0.807622566819191, "step": 35580 }, { "epoch": 0.3559, "grad_norm": 11.8125, "grad_norm_var": 2.0515625, "learning_rate": 0.0003, "loss": 11.3026, "loss/aux_loss": 0.04807799514383078, "loss/crossentropy": 2.5974143624305723, "loss/logits": 0.8406628459692002, "step": 35590 }, { "epoch": 0.356, "grad_norm": 14.5, "grad_norm_var": 0.5382649739583333, "learning_rate": 0.0003, "loss": 11.3024, "loss/aux_loss": 0.04808833636343479, "loss/crossentropy": 2.5895915746688845, "loss/logits": 0.8398721873760223, "step": 35600 }, { "epoch": 0.3561, "grad_norm": 13.9375, "grad_norm_var": 0.4663899739583333, "learning_rate": 0.0003, "loss": 11.2555, "loss/aux_loss": 0.04808176904916763, "loss/crossentropy": 2.6462597012519837, "loss/logits": 0.8560863435268402, "step": 35610 }, { "epoch": 0.3562, "grad_norm": 13.4375, "grad_norm_var": 0.3993326822916667, "learning_rate": 0.0003, "loss": 11.3956, "loss/aux_loss": 0.04806818459182978, "loss/crossentropy": 2.619718074798584, "loss/logits": 0.8831523567438125, "step": 35620 }, { "epoch": 0.3563, "grad_norm": 14.5625, "grad_norm_var": 0.44733072916666666, "learning_rate": 0.0003, "loss": 11.4719, "loss/aux_loss": 0.04808085449039936, "loss/crossentropy": 2.7037489295005797, "loss/logits": 0.8706602722406387, "step": 35630 }, { "epoch": 0.3564, "grad_norm": 14.25, "grad_norm_var": 0.613134765625, "learning_rate": 0.0003, "loss": 11.3119, "loss/aux_loss": 0.04807729236781597, "loss/crossentropy": 2.698741543292999, "loss/logits": 0.877493503689766, "step": 35640 }, { "epoch": 0.3565, "grad_norm": 13.5, "grad_norm_var": 0.6702962239583333, "learning_rate": 0.0003, "loss": 11.2573, "loss/aux_loss": 0.04807835165411234, "loss/crossentropy": 2.6732171416282653, "loss/logits": 0.8277383238077164, "step": 35650 }, { "epoch": 0.3566, "grad_norm": 13.125, "grad_norm_var": 0.20701497395833332, "learning_rate": 0.0003, "loss": 11.3619, "loss/aux_loss": 0.04807893112301827, "loss/crossentropy": 2.7110289692878724, "loss/logits": 0.8260849803686142, "step": 35660 }, { "epoch": 0.3567, "grad_norm": 13.0625, "grad_norm_var": 0.23671875, "learning_rate": 0.0003, "loss": 11.1931, "loss/aux_loss": 0.0480891864746809, "loss/crossentropy": 2.6819321513175964, "loss/logits": 0.8022189557552337, "step": 35670 }, { "epoch": 0.3568, "grad_norm": 12.8125, "grad_norm_var": 0.42706705729166666, "learning_rate": 0.0003, "loss": 11.4875, "loss/aux_loss": 0.04807867780327797, "loss/crossentropy": 2.8046223700046538, "loss/logits": 0.8751489996910096, "step": 35680 }, { "epoch": 0.3569, "grad_norm": 14.3125, "grad_norm_var": 0.2921223958333333, "learning_rate": 0.0003, "loss": 11.6221, "loss/aux_loss": 0.04808136597275734, "loss/crossentropy": 2.7549788117408753, "loss/logits": 0.8519262999296189, "step": 35690 }, { "epoch": 0.357, "grad_norm": 13.6875, "grad_norm_var": 0.12630208333333334, "learning_rate": 0.0003, "loss": 11.265, "loss/aux_loss": 0.04808007068932056, "loss/crossentropy": 2.647283446788788, "loss/logits": 0.8637819319963456, "step": 35700 }, { "epoch": 0.3571, "grad_norm": 12.5625, "grad_norm_var": 0.23984375, "learning_rate": 0.0003, "loss": 11.3778, "loss/aux_loss": 0.048076775297522543, "loss/crossentropy": 2.6670687079429625, "loss/logits": 0.8395314335823059, "step": 35710 }, { "epoch": 0.3572, "grad_norm": 12.8125, "grad_norm_var": 0.4837890625, "learning_rate": 0.0003, "loss": 11.3448, "loss/aux_loss": 0.0480807974934578, "loss/crossentropy": 2.8054326295852663, "loss/logits": 0.8705591022968292, "step": 35720 }, { "epoch": 0.3573, "grad_norm": 13.125, "grad_norm_var": 0.4630208333333333, "learning_rate": 0.0003, "loss": 11.4336, "loss/aux_loss": 0.048079059645533564, "loss/crossentropy": 2.64428573846817, "loss/logits": 0.8815708011388779, "step": 35730 }, { "epoch": 0.3574, "grad_norm": 14.125, "grad_norm_var": 0.5306640625, "learning_rate": 0.0003, "loss": 11.3834, "loss/aux_loss": 0.048077475652098654, "loss/crossentropy": 2.6877181112766264, "loss/logits": 0.8534661501646041, "step": 35740 }, { "epoch": 0.3575, "grad_norm": 12.4375, "grad_norm_var": 0.5773274739583333, "learning_rate": 0.0003, "loss": 11.1881, "loss/aux_loss": 0.04806953519582748, "loss/crossentropy": 2.6530270755290983, "loss/logits": 0.8448708355426788, "step": 35750 }, { "epoch": 0.3576, "grad_norm": 14.5, "grad_norm_var": 0.7054524739583333, "learning_rate": 0.0003, "loss": 11.1465, "loss/aux_loss": 0.048080852068960664, "loss/crossentropy": 2.6025227308273315, "loss/logits": 0.8209551721811295, "step": 35760 }, { "epoch": 0.3577, "grad_norm": 13.6875, "grad_norm_var": 0.5111979166666667, "learning_rate": 0.0003, "loss": 11.354, "loss/aux_loss": 0.04807685352861881, "loss/crossentropy": 2.7108654737472535, "loss/logits": 0.8405659079551697, "step": 35770 }, { "epoch": 0.3578, "grad_norm": 14.0, "grad_norm_var": 0.7598795572916667, "learning_rate": 0.0003, "loss": 11.3892, "loss/aux_loss": 0.048074676841497424, "loss/crossentropy": 2.8448933243751524, "loss/logits": 0.8365987449884414, "step": 35780 }, { "epoch": 0.3579, "grad_norm": 14.4375, "grad_norm_var": 0.4903645833333333, "learning_rate": 0.0003, "loss": 11.2817, "loss/aux_loss": 0.04807745218276978, "loss/crossentropy": 2.8002660870552063, "loss/logits": 0.8614138662815094, "step": 35790 }, { "epoch": 0.358, "grad_norm": 12.75, "grad_norm_var": 0.8332682291666667, "learning_rate": 0.0003, "loss": 11.3633, "loss/aux_loss": 0.048078560084104535, "loss/crossentropy": 2.7766624689102173, "loss/logits": 0.8491496801376343, "step": 35800 }, { "epoch": 0.3581, "grad_norm": 13.375, "grad_norm_var": 3.879541015625, "learning_rate": 0.0003, "loss": 11.3036, "loss/aux_loss": 0.04808343965560198, "loss/crossentropy": 2.7654669165611265, "loss/logits": 0.8556907385587692, "step": 35810 }, { "epoch": 0.3582, "grad_norm": 13.375, "grad_norm_var": 0.29099934895833335, "learning_rate": 0.0003, "loss": 11.3808, "loss/aux_loss": 0.04808389656245708, "loss/crossentropy": 2.712459546327591, "loss/logits": 0.854012405872345, "step": 35820 }, { "epoch": 0.3583, "grad_norm": 13.0, "grad_norm_var": 0.18097330729166666, "learning_rate": 0.0003, "loss": 11.3502, "loss/aux_loss": 0.04807974956929684, "loss/crossentropy": 2.6321381747722628, "loss/logits": 0.850381875038147, "step": 35830 }, { "epoch": 0.3584, "grad_norm": 12.75, "grad_norm_var": 0.41451822916666664, "learning_rate": 0.0003, "loss": 11.1751, "loss/aux_loss": 0.04808203168213367, "loss/crossentropy": 2.6016912758350372, "loss/logits": 0.8554532587528229, "step": 35840 }, { "epoch": 0.3585, "grad_norm": 12.5, "grad_norm_var": 0.7824055989583333, "learning_rate": 0.0003, "loss": 11.4128, "loss/aux_loss": 0.04807893205434084, "loss/crossentropy": 2.8680613577365874, "loss/logits": 0.8510574102401733, "step": 35850 }, { "epoch": 0.3586, "grad_norm": 12.8125, "grad_norm_var": 0.679541015625, "learning_rate": 0.0003, "loss": 11.2393, "loss/aux_loss": 0.048077529110014436, "loss/crossentropy": 2.755670565366745, "loss/logits": 0.8600349962711334, "step": 35860 }, { "epoch": 0.3587, "grad_norm": 12.4375, "grad_norm_var": 0.9173014322916667, "learning_rate": 0.0003, "loss": 11.3892, "loss/aux_loss": 0.048092295043170454, "loss/crossentropy": 2.7275496542453768, "loss/logits": 0.8351588726043702, "step": 35870 }, { "epoch": 0.3588, "grad_norm": 13.0, "grad_norm_var": 0.9158854166666667, "learning_rate": 0.0003, "loss": 11.2862, "loss/aux_loss": 0.0480747377499938, "loss/crossentropy": 2.7512252271175384, "loss/logits": 0.8427632987499237, "step": 35880 }, { "epoch": 0.3589, "grad_norm": 13.5625, "grad_norm_var": 0.29322916666666665, "learning_rate": 0.0003, "loss": 11.3548, "loss/aux_loss": 0.04807990454137325, "loss/crossentropy": 2.873078280687332, "loss/logits": 0.8730248123407364, "step": 35890 }, { "epoch": 0.359, "grad_norm": 13.5, "grad_norm_var": 0.3624348958333333, "learning_rate": 0.0003, "loss": 11.2026, "loss/aux_loss": 0.048082062415778636, "loss/crossentropy": 2.740494179725647, "loss/logits": 0.8442809909582139, "step": 35900 }, { "epoch": 0.3591, "grad_norm": 14.375, "grad_norm_var": 0.512744140625, "learning_rate": 0.0003, "loss": 11.23, "loss/aux_loss": 0.048080322705209254, "loss/crossentropy": 2.7258496403694155, "loss/logits": 0.8964686661958694, "step": 35910 }, { "epoch": 0.3592, "grad_norm": 14.0625, "grad_norm_var": 0.6156087239583333, "learning_rate": 0.0003, "loss": 11.332, "loss/aux_loss": 0.048071503080427645, "loss/crossentropy": 2.664508581161499, "loss/logits": 0.8316588670015335, "step": 35920 }, { "epoch": 0.3593, "grad_norm": 13.6875, "grad_norm_var": 0.14178059895833334, "learning_rate": 0.0003, "loss": 11.3462, "loss/aux_loss": 0.0480843897908926, "loss/crossentropy": 2.8054317951202394, "loss/logits": 0.8416423499584198, "step": 35930 }, { "epoch": 0.3594, "grad_norm": 13.5625, "grad_norm_var": 0.1541015625, "learning_rate": 0.0003, "loss": 11.4479, "loss/aux_loss": 0.04807868916541338, "loss/crossentropy": 2.6694052278995515, "loss/logits": 0.8517626136541366, "step": 35940 }, { "epoch": 0.3595, "grad_norm": 17.375, "grad_norm_var": 312.1473795572917, "learning_rate": 0.0003, "loss": 11.422, "loss/aux_loss": 0.04807343017309904, "loss/crossentropy": 2.7886245131492613, "loss/logits": 0.8560446441173554, "step": 35950 }, { "epoch": 0.3596, "grad_norm": 15.3125, "grad_norm_var": 306.50983072916665, "learning_rate": 0.0003, "loss": 11.528, "loss/aux_loss": 0.0480889055877924, "loss/crossentropy": 2.7681180238723755, "loss/logits": 0.8745385199785233, "step": 35960 }, { "epoch": 0.3597, "grad_norm": 15.0625, "grad_norm_var": 0.3260416666666667, "learning_rate": 0.0003, "loss": 11.4073, "loss/aux_loss": 0.048074235394597056, "loss/crossentropy": 2.7186142563819886, "loss/logits": 0.8708992570638656, "step": 35970 }, { "epoch": 0.3598, "grad_norm": 13.1875, "grad_norm_var": 0.390087890625, "learning_rate": 0.0003, "loss": 11.256, "loss/aux_loss": 0.048084134608507155, "loss/crossentropy": 2.6606498062610626, "loss/logits": 0.8529165148735046, "step": 35980 }, { "epoch": 0.3599, "grad_norm": 13.8125, "grad_norm_var": 0.6249837239583333, "learning_rate": 0.0003, "loss": 11.4559, "loss/aux_loss": 0.04807859268039465, "loss/crossentropy": 2.862497079372406, "loss/logits": 0.8499436527490616, "step": 35990 }, { "epoch": 0.36, "grad_norm": 14.125, "grad_norm_var": 0.48899739583333335, "learning_rate": 0.0003, "loss": 11.4658, "loss/aux_loss": 0.04807100892066955, "loss/crossentropy": 2.797253680229187, "loss/logits": 0.8588318228721619, "step": 36000 }, { "epoch": 0.3601, "grad_norm": 13.6875, "grad_norm_var": 0.3251139322916667, "learning_rate": 0.0003, "loss": 11.2969, "loss/aux_loss": 0.04807574506849051, "loss/crossentropy": 2.5939278662204743, "loss/logits": 0.8101026326417923, "step": 36010 }, { "epoch": 0.3602, "grad_norm": 14.3125, "grad_norm_var": 0.73828125, "learning_rate": 0.0003, "loss": 11.3092, "loss/aux_loss": 0.048069524206221105, "loss/crossentropy": 2.8990219116210936, "loss/logits": 0.8483186364173889, "step": 36020 }, { "epoch": 0.3603, "grad_norm": 13.25, "grad_norm_var": 0.36599934895833336, "learning_rate": 0.0003, "loss": 11.3561, "loss/aux_loss": 0.04808562994003296, "loss/crossentropy": 2.6316673278808596, "loss/logits": 0.8744110763072968, "step": 36030 }, { "epoch": 0.3604, "grad_norm": 13.3125, "grad_norm_var": 0.48385416666666664, "learning_rate": 0.0003, "loss": 11.2467, "loss/aux_loss": 0.04806984197348356, "loss/crossentropy": 2.8506676077842714, "loss/logits": 0.8674321442842483, "step": 36040 }, { "epoch": 0.3605, "grad_norm": 13.625, "grad_norm_var": 0.3028645833333333, "learning_rate": 0.0003, "loss": 11.374, "loss/aux_loss": 0.048074014112353326, "loss/crossentropy": 2.6722546577453614, "loss/logits": 0.8741536557674408, "step": 36050 }, { "epoch": 0.3606, "grad_norm": 14.75, "grad_norm_var": 0.3251139322916667, "learning_rate": 0.0003, "loss": 11.1968, "loss/aux_loss": 0.04807903002947569, "loss/crossentropy": 2.777270722389221, "loss/logits": 0.8541721493005753, "step": 36060 }, { "epoch": 0.3607, "grad_norm": 14.0625, "grad_norm_var": 0.480322265625, "learning_rate": 0.0003, "loss": 11.3552, "loss/aux_loss": 0.04808016233146191, "loss/crossentropy": 2.692441987991333, "loss/logits": 0.8523607522249221, "step": 36070 }, { "epoch": 0.3608, "grad_norm": 12.8125, "grad_norm_var": 0.40896809895833336, "learning_rate": 0.0003, "loss": 11.2746, "loss/aux_loss": 0.048079789616167545, "loss/crossentropy": 2.746452260017395, "loss/logits": 0.863274747133255, "step": 36080 }, { "epoch": 0.3609, "grad_norm": 12.9375, "grad_norm_var": 0.583837890625, "learning_rate": 0.0003, "loss": 11.4223, "loss/aux_loss": 0.048079486936330795, "loss/crossentropy": 2.662570732831955, "loss/logits": 0.8952972948551178, "step": 36090 }, { "epoch": 0.361, "grad_norm": 14.875, "grad_norm_var": 1.0411295572916666, "learning_rate": 0.0003, "loss": 11.3986, "loss/aux_loss": 0.048073232360184195, "loss/crossentropy": 2.687173879146576, "loss/logits": 0.8554467290639878, "step": 36100 }, { "epoch": 0.3611, "grad_norm": 14.0, "grad_norm_var": 0.997509765625, "learning_rate": 0.0003, "loss": 11.2722, "loss/aux_loss": 0.048092739656567574, "loss/crossentropy": 2.521233695745468, "loss/logits": 0.7955525845289231, "step": 36110 }, { "epoch": 0.3612, "grad_norm": 15.125, "grad_norm_var": 0.9139973958333333, "learning_rate": 0.0003, "loss": 11.4461, "loss/aux_loss": 0.04807361625134945, "loss/crossentropy": 2.970944273471832, "loss/logits": 0.8794440478086472, "step": 36120 }, { "epoch": 0.3613, "grad_norm": 12.4375, "grad_norm_var": 1.252587890625, "learning_rate": 0.0003, "loss": 11.2798, "loss/aux_loss": 0.04807492271065712, "loss/crossentropy": 2.717639720439911, "loss/logits": 0.8793477922677994, "step": 36130 }, { "epoch": 0.3614, "grad_norm": 13.625, "grad_norm_var": 0.8056640625, "learning_rate": 0.0003, "loss": 11.2061, "loss/aux_loss": 0.04807124081999063, "loss/crossentropy": 2.7465671420097353, "loss/logits": 0.8570981532335281, "step": 36140 }, { "epoch": 0.3615, "grad_norm": 14.0, "grad_norm_var": 0.7410807291666667, "learning_rate": 0.0003, "loss": 11.4171, "loss/aux_loss": 0.04809042625129223, "loss/crossentropy": 2.874330496788025, "loss/logits": 0.8536765873432159, "step": 36150 }, { "epoch": 0.3616, "grad_norm": 15.1875, "grad_norm_var": 0.9301920572916667, "learning_rate": 0.0003, "loss": 11.3739, "loss/aux_loss": 0.048077494464814664, "loss/crossentropy": 2.586655741930008, "loss/logits": 0.8711060285568237, "step": 36160 }, { "epoch": 0.3617, "grad_norm": 14.4375, "grad_norm_var": 1.490869140625, "learning_rate": 0.0003, "loss": 11.3623, "loss/aux_loss": 0.04806613698601723, "loss/crossentropy": 2.5908707082271576, "loss/logits": 0.8134723126888275, "step": 36170 }, { "epoch": 0.3618, "grad_norm": 13.9375, "grad_norm_var": 1.3150390625, "learning_rate": 0.0003, "loss": 11.4247, "loss/aux_loss": 0.048084022291004655, "loss/crossentropy": 2.795676851272583, "loss/logits": 0.8587139397859573, "step": 36180 }, { "epoch": 0.3619, "grad_norm": 13.6875, "grad_norm_var": 0.7551920572916667, "learning_rate": 0.0003, "loss": 11.2165, "loss/aux_loss": 0.048082707822322844, "loss/crossentropy": 2.8844053208827973, "loss/logits": 0.8785182237625122, "step": 36190 }, { "epoch": 0.362, "grad_norm": 13.0625, "grad_norm_var": 0.3738932291666667, "learning_rate": 0.0003, "loss": 11.2659, "loss/aux_loss": 0.04806621167808771, "loss/crossentropy": 2.760159510374069, "loss/logits": 0.8577248483896256, "step": 36200 }, { "epoch": 0.3621, "grad_norm": 13.375, "grad_norm_var": 0.31808268229166664, "learning_rate": 0.0003, "loss": 11.2366, "loss/aux_loss": 0.04807536099106073, "loss/crossentropy": 2.608304864168167, "loss/logits": 0.829086622595787, "step": 36210 }, { "epoch": 0.3622, "grad_norm": 13.875, "grad_norm_var": 0.2072265625, "learning_rate": 0.0003, "loss": 11.4158, "loss/aux_loss": 0.04807885363698006, "loss/crossentropy": 2.6055088222026823, "loss/logits": 0.825168663263321, "step": 36220 }, { "epoch": 0.3623, "grad_norm": 13.4375, "grad_norm_var": 0.39334309895833336, "learning_rate": 0.0003, "loss": 11.553, "loss/aux_loss": 0.0480809198692441, "loss/crossentropy": 2.9127083659172057, "loss/logits": 0.8658655256032943, "step": 36230 }, { "epoch": 0.3624, "grad_norm": 14.25, "grad_norm_var": 0.37213541666666666, "learning_rate": 0.0003, "loss": 11.2686, "loss/aux_loss": 0.048089105263352395, "loss/crossentropy": 2.8726187229156492, "loss/logits": 0.8607639342546463, "step": 36240 }, { "epoch": 0.3625, "grad_norm": 12.9375, "grad_norm_var": 1.0445149739583333, "learning_rate": 0.0003, "loss": 11.2932, "loss/aux_loss": 0.04807120338082314, "loss/crossentropy": 2.898715019226074, "loss/logits": 0.8681466579437256, "step": 36250 }, { "epoch": 0.3626, "grad_norm": 12.9375, "grad_norm_var": 0.8946451822916667, "learning_rate": 0.0003, "loss": 11.373, "loss/aux_loss": 0.0480785708874464, "loss/crossentropy": 2.6757899284362794, "loss/logits": 0.8399546831846237, "step": 36260 }, { "epoch": 0.3627, "grad_norm": 12.625, "grad_norm_var": 0.471728515625, "learning_rate": 0.0003, "loss": 11.2656, "loss/aux_loss": 0.048072914406657216, "loss/crossentropy": 2.7273074328899383, "loss/logits": 0.834012359380722, "step": 36270 }, { "epoch": 0.3628, "grad_norm": 15.625, "grad_norm_var": 0.5853515625, "learning_rate": 0.0003, "loss": 11.3074, "loss/aux_loss": 0.04808451887220144, "loss/crossentropy": 2.661761927604675, "loss/logits": 0.8267664194107056, "step": 36280 }, { "epoch": 0.3629, "grad_norm": 13.875, "grad_norm_var": 7.678645833333333, "learning_rate": 0.0003, "loss": 11.1024, "loss/aux_loss": 0.04808289129287004, "loss/crossentropy": 2.8848253428936004, "loss/logits": 0.8689317673444747, "step": 36290 }, { "epoch": 0.363, "grad_norm": 13.875, "grad_norm_var": 7.61328125, "learning_rate": 0.0003, "loss": 11.3154, "loss/aux_loss": 0.048081953264772895, "loss/crossentropy": 2.703933322429657, "loss/logits": 0.860775688290596, "step": 36300 }, { "epoch": 0.3631, "grad_norm": 12.875, "grad_norm_var": 0.6884765625, "learning_rate": 0.0003, "loss": 11.2708, "loss/aux_loss": 0.04807724803686142, "loss/crossentropy": 2.715017533302307, "loss/logits": 0.8319126725196838, "step": 36310 }, { "epoch": 0.3632, "grad_norm": 22.625, "grad_norm_var": 5.701936848958334, "learning_rate": 0.0003, "loss": 11.2363, "loss/aux_loss": 0.04807513263076544, "loss/crossentropy": 2.829657733440399, "loss/logits": 0.8587293684482574, "step": 36320 }, { "epoch": 0.3633, "grad_norm": 14.25, "grad_norm_var": 4.881184895833333, "learning_rate": 0.0003, "loss": 11.408, "loss/aux_loss": 0.04807958342134953, "loss/crossentropy": 2.7696733355522154, "loss/logits": 0.8509972572326661, "step": 36330 }, { "epoch": 0.3634, "grad_norm": 14.625, "grad_norm_var": 0.25388997395833335, "learning_rate": 0.0003, "loss": 11.3764, "loss/aux_loss": 0.04807343930006027, "loss/crossentropy": 2.824759781360626, "loss/logits": 0.8681064277887345, "step": 36340 }, { "epoch": 0.3635, "grad_norm": 13.375, "grad_norm_var": 0.49386393229166664, "learning_rate": 0.0003, "loss": 11.3524, "loss/aux_loss": 0.04807184562087059, "loss/crossentropy": 2.8086614489555357, "loss/logits": 0.8541026085615158, "step": 36350 }, { "epoch": 0.3636, "grad_norm": 14.4375, "grad_norm_var": 0.5608723958333334, "learning_rate": 0.0003, "loss": 11.0975, "loss/aux_loss": 0.04808126986026764, "loss/crossentropy": 2.7107265830039977, "loss/logits": 0.8230546474456787, "step": 36360 }, { "epoch": 0.3637, "grad_norm": 15.5, "grad_norm_var": 0.6462076822916667, "learning_rate": 0.0003, "loss": 11.2573, "loss/aux_loss": 0.048082560300827026, "loss/crossentropy": 2.660491919517517, "loss/logits": 0.8427970826625824, "step": 36370 }, { "epoch": 0.3638, "grad_norm": 12.6875, "grad_norm_var": 0.799072265625, "learning_rate": 0.0003, "loss": 11.3318, "loss/aux_loss": 0.048079566471278666, "loss/crossentropy": 2.788001722097397, "loss/logits": 0.8650757223367691, "step": 36380 }, { "epoch": 0.3639, "grad_norm": 12.9375, "grad_norm_var": 0.584375, "learning_rate": 0.0003, "loss": 11.3163, "loss/aux_loss": 0.048077472113072874, "loss/crossentropy": 2.7954628705978393, "loss/logits": 0.8578163594007492, "step": 36390 }, { "epoch": 0.364, "grad_norm": 14.0, "grad_norm_var": 0.38743489583333335, "learning_rate": 0.0003, "loss": 11.3133, "loss/aux_loss": 0.04807708989828825, "loss/crossentropy": 2.665116882324219, "loss/logits": 0.8234895557165146, "step": 36400 }, { "epoch": 0.3641, "grad_norm": 14.875, "grad_norm_var": 0.7328125, "learning_rate": 0.0003, "loss": 11.5322, "loss/aux_loss": 0.04808671064674854, "loss/crossentropy": 2.858789348602295, "loss/logits": 0.8716346949338913, "step": 36410 }, { "epoch": 0.3642, "grad_norm": 14.0625, "grad_norm_var": 0.6462890625, "learning_rate": 0.0003, "loss": 11.1745, "loss/aux_loss": 0.048071319982409474, "loss/crossentropy": 2.877179265022278, "loss/logits": 0.8559047758579255, "step": 36420 }, { "epoch": 0.3643, "grad_norm": 13.3125, "grad_norm_var": 0.7098958333333333, "learning_rate": 0.0003, "loss": 11.3668, "loss/aux_loss": 0.04808102864772081, "loss/crossentropy": 2.699104994535446, "loss/logits": 0.8286954373121261, "step": 36430 }, { "epoch": 0.3644, "grad_norm": 13.75, "grad_norm_var": 0.3348795572916667, "learning_rate": 0.0003, "loss": 11.3764, "loss/aux_loss": 0.04807946030050516, "loss/crossentropy": 2.7624664068222047, "loss/logits": 0.8778378039598465, "step": 36440 }, { "epoch": 0.3645, "grad_norm": 13.5625, "grad_norm_var": 0.21614583333333334, "learning_rate": 0.0003, "loss": 11.4262, "loss/aux_loss": 0.04808296486735344, "loss/crossentropy": 2.7352758646011353, "loss/logits": 0.8656487733125686, "step": 36450 }, { "epoch": 0.3646, "grad_norm": 14.3125, "grad_norm_var": 0.4103515625, "learning_rate": 0.0003, "loss": 11.4924, "loss/aux_loss": 0.04807944241911173, "loss/crossentropy": 2.774025857448578, "loss/logits": 0.8716156959533692, "step": 36460 }, { "epoch": 0.3647, "grad_norm": 14.125, "grad_norm_var": 0.39791666666666664, "learning_rate": 0.0003, "loss": 11.2264, "loss/aux_loss": 0.04807451739907265, "loss/crossentropy": 2.8603923201560972, "loss/logits": 0.8951089948415756, "step": 36470 }, { "epoch": 0.3648, "grad_norm": 13.5625, "grad_norm_var": 0.35201822916666664, "learning_rate": 0.0003, "loss": 11.4953, "loss/aux_loss": 0.04807769488543272, "loss/crossentropy": 2.6446187674999235, "loss/logits": 0.8907380670309066, "step": 36480 }, { "epoch": 0.3649, "grad_norm": 12.8125, "grad_norm_var": 6.081363932291667, "learning_rate": 0.0003, "loss": 11.185, "loss/aux_loss": 0.048073522932827475, "loss/crossentropy": 2.7823033690452577, "loss/logits": 0.8572315156459809, "step": 36490 }, { "epoch": 0.365, "grad_norm": 14.125, "grad_norm_var": 0.465087890625, "learning_rate": 0.0003, "loss": 11.3351, "loss/aux_loss": 0.048076865077018735, "loss/crossentropy": 2.7182795643806457, "loss/logits": 0.8255507349967957, "step": 36500 }, { "epoch": 0.3651, "grad_norm": 13.4375, "grad_norm_var": 0.5874348958333333, "learning_rate": 0.0003, "loss": 11.275, "loss/aux_loss": 0.048083111830055716, "loss/crossentropy": 2.7402828454971315, "loss/logits": 0.8623090296983719, "step": 36510 }, { "epoch": 0.3652, "grad_norm": 14.0, "grad_norm_var": 0.3395182291666667, "learning_rate": 0.0003, "loss": 11.3483, "loss/aux_loss": 0.04808528944849968, "loss/crossentropy": 2.7780889511108398, "loss/logits": 0.8970998287200928, "step": 36520 }, { "epoch": 0.3653, "grad_norm": 13.75, "grad_norm_var": 912.176416015625, "learning_rate": 0.0003, "loss": 11.3774, "loss/aux_loss": 0.04809322264045477, "loss/crossentropy": 2.5786903738975524, "loss/logits": 0.8551579564809799, "step": 36530 }, { "epoch": 0.3654, "grad_norm": 13.5625, "grad_norm_var": 0.26666666666666666, "learning_rate": 0.0003, "loss": 11.3992, "loss/aux_loss": 0.04808025564998388, "loss/crossentropy": 2.6024239301681518, "loss/logits": 0.8405012160539627, "step": 36540 }, { "epoch": 0.3655, "grad_norm": 13.3125, "grad_norm_var": 0.9311848958333333, "learning_rate": 0.0003, "loss": 11.1519, "loss/aux_loss": 0.04808431137353182, "loss/crossentropy": 2.628155159950256, "loss/logits": 0.8049672454595566, "step": 36550 }, { "epoch": 0.3656, "grad_norm": 15.0, "grad_norm_var": 0.49420572916666666, "learning_rate": 0.0003, "loss": 11.3941, "loss/aux_loss": 0.048075790517032146, "loss/crossentropy": 2.7668872237205506, "loss/logits": 0.838652953505516, "step": 36560 }, { "epoch": 0.3657, "grad_norm": 12.625, "grad_norm_var": 0.6994140625, "learning_rate": 0.0003, "loss": 11.3062, "loss/aux_loss": 0.04808170460164547, "loss/crossentropy": 2.7823184549808504, "loss/logits": 0.9078426092863083, "step": 36570 }, { "epoch": 0.3658, "grad_norm": 13.0, "grad_norm_var": 0.8878743489583333, "learning_rate": 0.0003, "loss": 11.0968, "loss/aux_loss": 0.048066372610628606, "loss/crossentropy": 2.6771502017974855, "loss/logits": 0.8519851267337799, "step": 36580 }, { "epoch": 0.3659, "grad_norm": 13.625, "grad_norm_var": 0.6288899739583333, "learning_rate": 0.0003, "loss": 11.4529, "loss/aux_loss": 0.048079500906169415, "loss/crossentropy": 2.7219568133354186, "loss/logits": 0.8512856423854828, "step": 36590 }, { "epoch": 0.366, "grad_norm": 13.375, "grad_norm_var": 0.48587239583333336, "learning_rate": 0.0003, "loss": 11.3979, "loss/aux_loss": 0.048080523125827314, "loss/crossentropy": 2.677553081512451, "loss/logits": 0.8542275846004486, "step": 36600 }, { "epoch": 0.3661, "grad_norm": 13.0625, "grad_norm_var": 0.35442708333333334, "learning_rate": 0.0003, "loss": 11.2516, "loss/aux_loss": 0.04807436354458332, "loss/crossentropy": 2.7150216817855837, "loss/logits": 0.8340162307024002, "step": 36610 }, { "epoch": 0.3662, "grad_norm": 13.5, "grad_norm_var": 13.445686848958333, "learning_rate": 0.0003, "loss": 11.2813, "loss/aux_loss": 0.04808293953537941, "loss/crossentropy": 2.7328949213027953, "loss/logits": 0.8716106861829758, "step": 36620 }, { "epoch": 0.3663, "grad_norm": 13.6875, "grad_norm_var": 0.9638020833333333, "learning_rate": 0.0003, "loss": 11.2869, "loss/aux_loss": 0.04808443430811167, "loss/crossentropy": 2.728735291957855, "loss/logits": 0.8548354119062423, "step": 36630 }, { "epoch": 0.3664, "grad_norm": 13.5625, "grad_norm_var": 4.690087890625, "learning_rate": 0.0003, "loss": 11.3094, "loss/aux_loss": 0.048085299693048, "loss/crossentropy": 2.6517282664775848, "loss/logits": 0.8119051426649093, "step": 36640 }, { "epoch": 0.3665, "grad_norm": 14.1875, "grad_norm_var": 4.356705729166666, "learning_rate": 0.0003, "loss": 11.3516, "loss/aux_loss": 0.04806904401630163, "loss/crossentropy": 2.6928380608558653, "loss/logits": 0.8894807904958725, "step": 36650 }, { "epoch": 0.3666, "grad_norm": 13.25, "grad_norm_var": 0.4025390625, "learning_rate": 0.0003, "loss": 11.1876, "loss/aux_loss": 0.04807957727462053, "loss/crossentropy": 2.6671720802783967, "loss/logits": 0.8575594484806061, "step": 36660 }, { "epoch": 0.3667, "grad_norm": 13.625, "grad_norm_var": 0.6534993489583333, "learning_rate": 0.0003, "loss": 11.3217, "loss/aux_loss": 0.04807212818413973, "loss/crossentropy": 2.920051896572113, "loss/logits": 0.9000935316085815, "step": 36670 }, { "epoch": 0.3668, "grad_norm": 13.0625, "grad_norm_var": 0.578759765625, "learning_rate": 0.0003, "loss": 11.2754, "loss/aux_loss": 0.04808471836149693, "loss/crossentropy": 2.7112753033638, "loss/logits": 0.8980684787034988, "step": 36680 }, { "epoch": 0.3669, "grad_norm": 14.5625, "grad_norm_var": 0.46339518229166665, "learning_rate": 0.0003, "loss": 11.175, "loss/aux_loss": 0.048070876859128475, "loss/crossentropy": 2.8545451045036314, "loss/logits": 0.8624769806861877, "step": 36690 }, { "epoch": 0.367, "grad_norm": 14.25, "grad_norm_var": 0.3614583333333333, "learning_rate": 0.0003, "loss": 11.419, "loss/aux_loss": 0.048080637119710445, "loss/crossentropy": 2.584076887369156, "loss/logits": 0.8492877304553985, "step": 36700 }, { "epoch": 0.3671, "grad_norm": 14.5, "grad_norm_var": 0.46901041666666665, "learning_rate": 0.0003, "loss": 11.2005, "loss/aux_loss": 0.04807239808142185, "loss/crossentropy": 2.8296147108078005, "loss/logits": 0.8446451902389527, "step": 36710 }, { "epoch": 0.3672, "grad_norm": 13.625, "grad_norm_var": 0.3153483072916667, "learning_rate": 0.0003, "loss": 11.2948, "loss/aux_loss": 0.04807722382247448, "loss/crossentropy": 2.8139419972896578, "loss/logits": 0.8666492760181427, "step": 36720 }, { "epoch": 0.3673, "grad_norm": 13.0, "grad_norm_var": 0.2140625, "learning_rate": 0.0003, "loss": 11.2456, "loss/aux_loss": 0.04808667413890362, "loss/crossentropy": 2.7662573993206023, "loss/logits": 0.8440918147563934, "step": 36730 }, { "epoch": 0.3674, "grad_norm": 13.3125, "grad_norm_var": 0.48162434895833334, "learning_rate": 0.0003, "loss": 11.1766, "loss/aux_loss": 0.048073191195726395, "loss/crossentropy": 2.733067828416824, "loss/logits": 0.8450499773025513, "step": 36740 }, { "epoch": 0.3675, "grad_norm": 13.25, "grad_norm_var": 0.408837890625, "learning_rate": 0.0003, "loss": 11.1612, "loss/aux_loss": 0.04808583315461874, "loss/crossentropy": 2.661664068698883, "loss/logits": 0.8497480273246765, "step": 36750 }, { "epoch": 0.3676, "grad_norm": 15.3125, "grad_norm_var": 1.2481608072916666, "learning_rate": 0.0003, "loss": 11.3239, "loss/aux_loss": 0.048081538453698155, "loss/crossentropy": 2.836627209186554, "loss/logits": 0.8409482598304748, "step": 36760 }, { "epoch": 0.3677, "grad_norm": 14.125, "grad_norm_var": 0.6759765625, "learning_rate": 0.0003, "loss": 11.2629, "loss/aux_loss": 0.048070738464593886, "loss/crossentropy": 2.7837388277053834, "loss/logits": 0.8499901384115219, "step": 36770 }, { "epoch": 0.3678, "grad_norm": 13.75, "grad_norm_var": 0.29713541666666665, "learning_rate": 0.0003, "loss": 11.4489, "loss/aux_loss": 0.04807408787310123, "loss/crossentropy": 2.8329702377319337, "loss/logits": 0.8994654446840287, "step": 36780 }, { "epoch": 0.3679, "grad_norm": 13.1875, "grad_norm_var": 0.3837076822916667, "learning_rate": 0.0003, "loss": 11.1841, "loss/aux_loss": 0.04807285293936729, "loss/crossentropy": 2.7574662566184998, "loss/logits": 0.8559128046035767, "step": 36790 }, { "epoch": 0.368, "grad_norm": 14.6875, "grad_norm_var": 0.5015462239583334, "learning_rate": 0.0003, "loss": 11.2499, "loss/aux_loss": 0.04807376656681299, "loss/crossentropy": 2.684080684185028, "loss/logits": 0.8515265494585037, "step": 36800 }, { "epoch": 0.3681, "grad_norm": 13.9375, "grad_norm_var": 0.609228515625, "learning_rate": 0.0003, "loss": 11.3147, "loss/aux_loss": 0.04808369651436806, "loss/crossentropy": 2.618355232477188, "loss/logits": 0.8441348135471344, "step": 36810 }, { "epoch": 0.3682, "grad_norm": 13.4375, "grad_norm_var": 5.31953125, "learning_rate": 0.0003, "loss": 11.2577, "loss/aux_loss": 0.04807754773646593, "loss/crossentropy": 2.5288033723831176, "loss/logits": 0.8119451552629471, "step": 36820 }, { "epoch": 0.3683, "grad_norm": 13.4375, "grad_norm_var": 8.267708333333333, "learning_rate": 0.0003, "loss": 11.2319, "loss/aux_loss": 0.048084663413465026, "loss/crossentropy": 2.753344786167145, "loss/logits": 0.8751234143972397, "step": 36830 }, { "epoch": 0.3684, "grad_norm": 14.8125, "grad_norm_var": 0.308447265625, "learning_rate": 0.0003, "loss": 11.4226, "loss/aux_loss": 0.04807217866182327, "loss/crossentropy": 2.67869313955307, "loss/logits": 0.8474486947059632, "step": 36840 }, { "epoch": 0.3685, "grad_norm": 13.9375, "grad_norm_var": 3.936051432291667, "learning_rate": 0.0003, "loss": 11.1881, "loss/aux_loss": 0.048085474967956544, "loss/crossentropy": 2.6841680705547333, "loss/logits": 0.8220482736825943, "step": 36850 }, { "epoch": 0.3686, "grad_norm": 13.75, "grad_norm_var": 4.113997395833334, "learning_rate": 0.0003, "loss": 11.2414, "loss/aux_loss": 0.04807091634720564, "loss/crossentropy": 2.6126440107822417, "loss/logits": 0.8671426773071289, "step": 36860 }, { "epoch": 0.3687, "grad_norm": 13.3125, "grad_norm_var": 0.36432291666666666, "learning_rate": 0.0003, "loss": 11.4133, "loss/aux_loss": 0.048081329092383385, "loss/crossentropy": 2.675434243679047, "loss/logits": 0.8265480697154999, "step": 36870 }, { "epoch": 0.3688, "grad_norm": 13.1875, "grad_norm_var": 0.42902018229166666, "learning_rate": 0.0003, "loss": 11.2284, "loss/aux_loss": 0.048071084544062614, "loss/crossentropy": 2.649240803718567, "loss/logits": 0.858475786447525, "step": 36880 }, { "epoch": 0.3689, "grad_norm": 14.0, "grad_norm_var": 0.5214680989583333, "learning_rate": 0.0003, "loss": 11.453, "loss/aux_loss": 0.04807976335287094, "loss/crossentropy": 2.740285503864288, "loss/logits": 0.8892779976129532, "step": 36890 }, { "epoch": 0.369, "grad_norm": 14.1875, "grad_norm_var": 0.5254557291666667, "learning_rate": 0.0003, "loss": 11.4388, "loss/aux_loss": 0.048085974715650084, "loss/crossentropy": 2.597354656457901, "loss/logits": 0.852543905377388, "step": 36900 }, { "epoch": 0.3691, "grad_norm": 13.6875, "grad_norm_var": 1.0181640625, "learning_rate": 0.0003, "loss": 11.34, "loss/aux_loss": 0.04807802941650152, "loss/crossentropy": 2.7012298822402956, "loss/logits": 0.8767822653055191, "step": 36910 }, { "epoch": 0.3692, "grad_norm": 13.6875, "grad_norm_var": 0.36171875, "learning_rate": 0.0003, "loss": 11.251, "loss/aux_loss": 0.048073366098105905, "loss/crossentropy": 2.560819482803345, "loss/logits": 0.8467923909425735, "step": 36920 }, { "epoch": 0.3693, "grad_norm": 15.0, "grad_norm_var": 0.342041015625, "learning_rate": 0.0003, "loss": 11.148, "loss/aux_loss": 0.04808372184634209, "loss/crossentropy": 2.713025426864624, "loss/logits": 0.8554750919342041, "step": 36930 }, { "epoch": 0.3694, "grad_norm": 14.3125, "grad_norm_var": 0.5695149739583333, "learning_rate": 0.0003, "loss": 11.1105, "loss/aux_loss": 0.048074273765087126, "loss/crossentropy": 2.560827577114105, "loss/logits": 0.837596133351326, "step": 36940 }, { "epoch": 0.3695, "grad_norm": 13.8125, "grad_norm_var": 0.4853515625, "learning_rate": 0.0003, "loss": 11.237, "loss/aux_loss": 0.04807106014341116, "loss/crossentropy": 2.7133314967155457, "loss/logits": 0.8286193758249283, "step": 36950 }, { "epoch": 0.3696, "grad_norm": 13.9375, "grad_norm_var": 0.28683268229166664, "learning_rate": 0.0003, "loss": 11.3376, "loss/aux_loss": 0.04807455725967884, "loss/crossentropy": 2.6688225150108336, "loss/logits": 0.8450548857450485, "step": 36960 }, { "epoch": 0.3697, "grad_norm": 14.375, "grad_norm_var": 0.458056640625, "learning_rate": 0.0003, "loss": 11.2668, "loss/aux_loss": 0.048079893365502356, "loss/crossentropy": 2.7559533953666686, "loss/logits": 0.8815567016601562, "step": 36970 }, { "epoch": 0.3698, "grad_norm": 14.5625, "grad_norm_var": 0.4317708333333333, "learning_rate": 0.0003, "loss": 11.4765, "loss/aux_loss": 0.04807979427278042, "loss/crossentropy": 2.837592136859894, "loss/logits": 0.855880606174469, "step": 36980 }, { "epoch": 0.3699, "grad_norm": 13.4375, "grad_norm_var": 0.477197265625, "learning_rate": 0.0003, "loss": 11.3164, "loss/aux_loss": 0.048071004822850226, "loss/crossentropy": 2.743358498811722, "loss/logits": 0.8194127559661866, "step": 36990 }, { "epoch": 0.37, "grad_norm": 14.625, "grad_norm_var": 0.527587890625, "learning_rate": 0.0003, "loss": 11.5576, "loss/aux_loss": 0.04808164816349745, "loss/crossentropy": 2.8244762778282166, "loss/logits": 0.8719862341880799, "step": 37000 }, { "epoch": 0.3701, "grad_norm": 13.8125, "grad_norm_var": 0.5921712239583333, "learning_rate": 0.0003, "loss": 11.2832, "loss/aux_loss": 0.048074906878173354, "loss/crossentropy": 2.8368687868118285, "loss/logits": 0.8303968459367752, "step": 37010 }, { "epoch": 0.3702, "grad_norm": 12.625, "grad_norm_var": 59.25792643229167, "learning_rate": 0.0003, "loss": 11.1296, "loss/aux_loss": 0.048082617297768596, "loss/crossentropy": 2.7732748210430147, "loss/logits": 0.8518129020929337, "step": 37020 }, { "epoch": 0.3703, "grad_norm": 13.5625, "grad_norm_var": 157.23645833333333, "learning_rate": 0.0003, "loss": 11.4101, "loss/aux_loss": 0.04808836504817009, "loss/crossentropy": 2.71566726565361, "loss/logits": 0.8372407227754592, "step": 37030 }, { "epoch": 0.3704, "grad_norm": 14.5, "grad_norm_var": 115.4890625, "learning_rate": 0.0003, "loss": 11.4471, "loss/aux_loss": 0.04808253161609173, "loss/crossentropy": 2.8112912774086, "loss/logits": 0.8763896584510803, "step": 37040 }, { "epoch": 0.3705, "grad_norm": 13.5625, "grad_norm_var": 0.4981770833333333, "learning_rate": 0.0003, "loss": 11.2863, "loss/aux_loss": 0.048083126358687875, "loss/crossentropy": 2.740783101320267, "loss/logits": 0.8048513799905777, "step": 37050 }, { "epoch": 0.3706, "grad_norm": 13.9375, "grad_norm_var": 0.5832682291666667, "learning_rate": 0.0003, "loss": 11.2289, "loss/aux_loss": 0.04807225782424211, "loss/crossentropy": 2.809919023513794, "loss/logits": 0.8596052765846253, "step": 37060 }, { "epoch": 0.3707, "grad_norm": 13.625, "grad_norm_var": 0.32667643229166665, "learning_rate": 0.0003, "loss": 11.2248, "loss/aux_loss": 0.04808387588709593, "loss/crossentropy": 2.8138983845710754, "loss/logits": 0.8710784047842026, "step": 37070 }, { "epoch": 0.3708, "grad_norm": 13.1875, "grad_norm_var": 0.35989583333333336, "learning_rate": 0.0003, "loss": 11.4458, "loss/aux_loss": 0.04807316083461046, "loss/crossentropy": 2.647478461265564, "loss/logits": 0.8652923613786697, "step": 37080 }, { "epoch": 0.3709, "grad_norm": 14.125, "grad_norm_var": 90.54837239583334, "learning_rate": 0.0003, "loss": 11.3368, "loss/aux_loss": 0.04807783383876085, "loss/crossentropy": 2.7728021681308745, "loss/logits": 0.8556511580944062, "step": 37090 }, { "epoch": 0.371, "grad_norm": 13.6875, "grad_norm_var": 0.5078125, "learning_rate": 0.0003, "loss": 11.2261, "loss/aux_loss": 0.04807612337172031, "loss/crossentropy": 2.7843292593955993, "loss/logits": 0.8542019307613373, "step": 37100 }, { "epoch": 0.3711, "grad_norm": 13.5625, "grad_norm_var": 0.3551432291666667, "learning_rate": 0.0003, "loss": 11.4133, "loss/aux_loss": 0.048083677515387536, "loss/crossentropy": 2.768745648860931, "loss/logits": 0.8667060941457748, "step": 37110 }, { "epoch": 0.3712, "grad_norm": 12.9375, "grad_norm_var": 0.38748372395833336, "learning_rate": 0.0003, "loss": 11.2222, "loss/aux_loss": 0.04808000139892101, "loss/crossentropy": 2.6551915645599364, "loss/logits": 0.8254688054323196, "step": 37120 }, { "epoch": 0.3713, "grad_norm": 13.3125, "grad_norm_var": 0.41534830729166666, "learning_rate": 0.0003, "loss": 11.1694, "loss/aux_loss": 0.048072263970971106, "loss/crossentropy": 2.6525086402893066, "loss/logits": 0.8595420539379119, "step": 37130 }, { "epoch": 0.3714, "grad_norm": 12.8125, "grad_norm_var": 0.739306640625, "learning_rate": 0.0003, "loss": 11.2952, "loss/aux_loss": 0.04807467870414257, "loss/crossentropy": 2.77539883852005, "loss/logits": 0.8893805712461471, "step": 37140 }, { "epoch": 0.3715, "grad_norm": 13.8125, "grad_norm_var": 0.8153645833333333, "learning_rate": 0.0003, "loss": 11.3194, "loss/aux_loss": 0.04808569923043251, "loss/crossentropy": 2.7440546989440917, "loss/logits": 0.855570039153099, "step": 37150 }, { "epoch": 0.3716, "grad_norm": 13.5625, "grad_norm_var": 0.5109212239583333, "learning_rate": 0.0003, "loss": 11.1757, "loss/aux_loss": 0.04810119271278381, "loss/crossentropy": 2.6668431758880615, "loss/logits": 0.8302851766347885, "step": 37160 }, { "epoch": 0.3717, "grad_norm": 14.0, "grad_norm_var": 1.3648274739583333, "learning_rate": 0.0003, "loss": 11.3603, "loss/aux_loss": 0.048070420511066914, "loss/crossentropy": 2.657299679517746, "loss/logits": 0.8717033207416535, "step": 37170 }, { "epoch": 0.3718, "grad_norm": 13.0625, "grad_norm_var": 1.5200358072916667, "learning_rate": 0.0003, "loss": 11.2216, "loss/aux_loss": 0.048081215284764764, "loss/crossentropy": 2.9752244472503664, "loss/logits": 0.8586607486009598, "step": 37180 }, { "epoch": 0.3719, "grad_norm": 12.75, "grad_norm_var": 0.29108072916666666, "learning_rate": 0.0003, "loss": 11.3228, "loss/aux_loss": 0.04807878099381924, "loss/crossentropy": 2.8252045154571532, "loss/logits": 0.8735492646694183, "step": 37190 }, { "epoch": 0.372, "grad_norm": 16.375, "grad_norm_var": 0.9677083333333333, "learning_rate": 0.0003, "loss": 11.4145, "loss/aux_loss": 0.04807718005031347, "loss/crossentropy": 2.825979804992676, "loss/logits": 0.8712354183197022, "step": 37200 }, { "epoch": 0.3721, "grad_norm": 12.8125, "grad_norm_var": 1.1072916666666666, "learning_rate": 0.0003, "loss": 11.1937, "loss/aux_loss": 0.04808374121785164, "loss/crossentropy": 2.5838040828704836, "loss/logits": 0.8319817185401917, "step": 37210 }, { "epoch": 0.3722, "grad_norm": 14.0, "grad_norm_var": 0.42120768229166666, "learning_rate": 0.0003, "loss": 11.3134, "loss/aux_loss": 0.04806477259844542, "loss/crossentropy": 2.7815585494041444, "loss/logits": 0.8544807106256485, "step": 37220 }, { "epoch": 0.3723, "grad_norm": 13.125, "grad_norm_var": 0.433056640625, "learning_rate": 0.0003, "loss": 11.2287, "loss/aux_loss": 0.04808630477637053, "loss/crossentropy": 2.6713213086128236, "loss/logits": 0.8477931290864944, "step": 37230 }, { "epoch": 0.3724, "grad_norm": 13.0625, "grad_norm_var": 0.4471354166666667, "learning_rate": 0.0003, "loss": 11.3342, "loss/aux_loss": 0.04807670786976814, "loss/crossentropy": 2.6989696443080904, "loss/logits": 0.8390361964702606, "step": 37240 }, { "epoch": 0.3725, "grad_norm": 14.25, "grad_norm_var": 0.36354166666666665, "learning_rate": 0.0003, "loss": 11.1887, "loss/aux_loss": 0.048072361201047895, "loss/crossentropy": 2.6670637369155883, "loss/logits": 0.8481419175863266, "step": 37250 }, { "epoch": 0.3726, "grad_norm": 14.3125, "grad_norm_var": 0.30670572916666666, "learning_rate": 0.0003, "loss": 11.2535, "loss/aux_loss": 0.04808654896914959, "loss/crossentropy": 2.7662817001342774, "loss/logits": 0.8623171299695969, "step": 37260 }, { "epoch": 0.3727, "grad_norm": 14.625, "grad_norm_var": 0.3473307291666667, "learning_rate": 0.0003, "loss": 11.313, "loss/aux_loss": 0.04807217847555876, "loss/crossentropy": 2.655113381147385, "loss/logits": 0.8379829883575439, "step": 37270 }, { "epoch": 0.3728, "grad_norm": 13.625, "grad_norm_var": 0.364306640625, "learning_rate": 0.0003, "loss": 11.3619, "loss/aux_loss": 0.04807236138731241, "loss/crossentropy": 2.7911306262016295, "loss/logits": 0.8533078819513321, "step": 37280 }, { "epoch": 0.3729, "grad_norm": 13.6875, "grad_norm_var": 0.4390462239583333, "learning_rate": 0.0003, "loss": 11.2455, "loss/aux_loss": 0.048087391443550585, "loss/crossentropy": 2.675192391872406, "loss/logits": 0.8323038935661315, "step": 37290 }, { "epoch": 0.373, "grad_norm": 13.125, "grad_norm_var": 0.30909830729166665, "learning_rate": 0.0003, "loss": 11.2423, "loss/aux_loss": 0.048072326742112634, "loss/crossentropy": 2.807281959056854, "loss/logits": 0.8477059155702591, "step": 37300 }, { "epoch": 0.3731, "grad_norm": 13.625, "grad_norm_var": 0.7629557291666667, "learning_rate": 0.0003, "loss": 11.139, "loss/aux_loss": 0.04808030799031258, "loss/crossentropy": 2.8326464533805846, "loss/logits": 0.8708844691514969, "step": 37310 }, { "epoch": 0.3732, "grad_norm": 14.3125, "grad_norm_var": 0.43430989583333335, "learning_rate": 0.0003, "loss": 11.3773, "loss/aux_loss": 0.04808178097009659, "loss/crossentropy": 2.7206430673599242, "loss/logits": 0.9048705369234085, "step": 37320 }, { "epoch": 0.3733, "grad_norm": 14.0, "grad_norm_var": 0.294775390625, "learning_rate": 0.0003, "loss": 11.1414, "loss/aux_loss": 0.04806458819657564, "loss/crossentropy": 2.6413770437240602, "loss/logits": 0.8203712821006774, "step": 37330 }, { "epoch": 0.3734, "grad_norm": 14.125, "grad_norm_var": 0.38515625, "learning_rate": 0.0003, "loss": 11.193, "loss/aux_loss": 0.04807915184646845, "loss/crossentropy": 2.800459563732147, "loss/logits": 0.8518052160739898, "step": 37340 }, { "epoch": 0.3735, "grad_norm": 14.3125, "grad_norm_var": 0.5583170572916667, "learning_rate": 0.0003, "loss": 11.2843, "loss/aux_loss": 0.04809236507862806, "loss/crossentropy": 2.648477429151535, "loss/logits": 0.8216162532567978, "step": 37350 }, { "epoch": 0.3736, "grad_norm": 17.125, "grad_norm_var": 1.143212890625, "learning_rate": 0.0003, "loss": 11.4077, "loss/aux_loss": 0.048064970411360264, "loss/crossentropy": 2.7521123051643372, "loss/logits": 0.8703978210687637, "step": 37360 }, { "epoch": 0.3737, "grad_norm": 14.25, "grad_norm_var": 1.049462890625, "learning_rate": 0.0003, "loss": 11.4275, "loss/aux_loss": 0.04807972647249699, "loss/crossentropy": 2.737633216381073, "loss/logits": 0.8671592533588409, "step": 37370 }, { "epoch": 0.3738, "grad_norm": 13.0625, "grad_norm_var": 0.2950520833333333, "learning_rate": 0.0003, "loss": 11.1968, "loss/aux_loss": 0.04807094354182482, "loss/crossentropy": 2.7768781900405886, "loss/logits": 0.8548236817121506, "step": 37380 }, { "epoch": 0.3739, "grad_norm": 14.0625, "grad_norm_var": 0.4281087239583333, "learning_rate": 0.0003, "loss": 11.4145, "loss/aux_loss": 0.048079249635338786, "loss/crossentropy": 2.779543364048004, "loss/logits": 0.82299225628376, "step": 37390 }, { "epoch": 0.374, "grad_norm": 14.5625, "grad_norm_var": 0.34524739583333336, "learning_rate": 0.0003, "loss": 11.385, "loss/aux_loss": 0.04807200077921152, "loss/crossentropy": 2.843193084001541, "loss/logits": 0.8840054035186767, "step": 37400 }, { "epoch": 0.3741, "grad_norm": 15.125, "grad_norm_var": 0.2822265625, "learning_rate": 0.0003, "loss": 11.3311, "loss/aux_loss": 0.04806772284209728, "loss/crossentropy": 2.692880618572235, "loss/logits": 0.827517831325531, "step": 37410 }, { "epoch": 0.3742, "grad_norm": 13.0, "grad_norm_var": 0.3374837239583333, "learning_rate": 0.0003, "loss": 11.3374, "loss/aux_loss": 0.04807442184537649, "loss/crossentropy": 2.8666168451309204, "loss/logits": 0.8572170734405518, "step": 37420 }, { "epoch": 0.3743, "grad_norm": 13.875, "grad_norm_var": 0.5565104166666667, "learning_rate": 0.0003, "loss": 11.4877, "loss/aux_loss": 0.04808829519897699, "loss/crossentropy": 2.800356590747833, "loss/logits": 0.8869089663028717, "step": 37430 }, { "epoch": 0.3744, "grad_norm": 14.125, "grad_norm_var": 0.56875, "learning_rate": 0.0003, "loss": 11.1438, "loss/aux_loss": 0.04806892778724432, "loss/crossentropy": 2.55942959189415, "loss/logits": 0.8205576926469803, "step": 37440 }, { "epoch": 0.3745, "grad_norm": 14.375, "grad_norm_var": 3.4124348958333335, "learning_rate": 0.0003, "loss": 11.219, "loss/aux_loss": 0.04807998221367597, "loss/crossentropy": 2.665431547164917, "loss/logits": 0.8188263595104217, "step": 37450 }, { "epoch": 0.3746, "grad_norm": 13.3125, "grad_norm_var": 0.7863932291666667, "learning_rate": 0.0003, "loss": 11.2692, "loss/aux_loss": 0.04808599669486284, "loss/crossentropy": 2.5414306223392487, "loss/logits": 0.8146803647279739, "step": 37460 }, { "epoch": 0.3747, "grad_norm": 13.5625, "grad_norm_var": 0.5702473958333333, "learning_rate": 0.0003, "loss": 11.1549, "loss/aux_loss": 0.04807412289083004, "loss/crossentropy": 2.700755310058594, "loss/logits": 0.8417092651128769, "step": 37470 }, { "epoch": 0.3748, "grad_norm": 13.4375, "grad_norm_var": 0.292041015625, "learning_rate": 0.0003, "loss": 11.2612, "loss/aux_loss": 0.04807847496122122, "loss/crossentropy": 2.6125539779663085, "loss/logits": 0.8809157848358155, "step": 37480 }, { "epoch": 0.3749, "grad_norm": 13.8125, "grad_norm_var": 0.49264322916666664, "learning_rate": 0.0003, "loss": 11.1776, "loss/aux_loss": 0.0480781901627779, "loss/crossentropy": 2.731597048044205, "loss/logits": 0.8342852920293808, "step": 37490 }, { "epoch": 0.375, "grad_norm": 13.375, "grad_norm_var": 0.9214680989583334, "learning_rate": 0.0003, "loss": 11.1111, "loss/aux_loss": 0.04808135274797678, "loss/crossentropy": 2.7630858182907105, "loss/logits": 0.8406100690364837, "step": 37500 }, { "epoch": 0.3751, "grad_norm": 13.5625, "grad_norm_var": 0.7954264322916667, "learning_rate": 0.0003, "loss": 11.2801, "loss/aux_loss": 0.04807340987026691, "loss/crossentropy": 2.808059513568878, "loss/logits": 0.8589271575212478, "step": 37510 }, { "epoch": 0.3752, "grad_norm": 14.0625, "grad_norm_var": 0.18409830729166668, "learning_rate": 0.0003, "loss": 11.3453, "loss/aux_loss": 0.04808524418622255, "loss/crossentropy": 2.832816928625107, "loss/logits": 0.87020343542099, "step": 37520 }, { "epoch": 0.3753, "grad_norm": 12.6875, "grad_norm_var": 0.19895833333333332, "learning_rate": 0.0003, "loss": 11.2984, "loss/aux_loss": 0.0480803806334734, "loss/crossentropy": 2.7448639810085296, "loss/logits": 0.8650804668664932, "step": 37530 }, { "epoch": 0.3754, "grad_norm": 13.0625, "grad_norm_var": 0.29713541666666665, "learning_rate": 0.0003, "loss": 11.2711, "loss/aux_loss": 0.04808024540543556, "loss/crossentropy": 2.723136955499649, "loss/logits": 0.8429204732179642, "step": 37540 }, { "epoch": 0.3755, "grad_norm": 13.125, "grad_norm_var": 0.4122233072916667, "learning_rate": 0.0003, "loss": 11.3523, "loss/aux_loss": 0.04806947018951178, "loss/crossentropy": 2.7606225490570067, "loss/logits": 0.8683151304721832, "step": 37550 }, { "epoch": 0.3756, "grad_norm": 12.8125, "grad_norm_var": 2.58828125, "learning_rate": 0.0003, "loss": 11.3345, "loss/aux_loss": 0.048083030991256236, "loss/crossentropy": 2.5333042323589323, "loss/logits": 0.8388000845909118, "step": 37560 }, { "epoch": 0.3757, "grad_norm": 18.375, "grad_norm_var": 295.15208333333334, "learning_rate": 0.0003, "loss": 11.4007, "loss/aux_loss": 0.04808737169951201, "loss/crossentropy": 2.724211460351944, "loss/logits": 0.8297827035188675, "step": 37570 }, { "epoch": 0.3758, "grad_norm": 13.9375, "grad_norm_var": 292.0377604166667, "learning_rate": 0.0003, "loss": 11.2395, "loss/aux_loss": 0.048087266832590105, "loss/crossentropy": 2.724383169412613, "loss/logits": 0.8737955868244172, "step": 37580 }, { "epoch": 0.3759, "grad_norm": 14.0, "grad_norm_var": 0.43865559895833334, "learning_rate": 0.0003, "loss": 11.2987, "loss/aux_loss": 0.04807658027857542, "loss/crossentropy": 2.810201585292816, "loss/logits": 0.8624020755290985, "step": 37590 }, { "epoch": 0.376, "grad_norm": 13.0625, "grad_norm_var": 0.5176920572916667, "learning_rate": 0.0003, "loss": 11.3521, "loss/aux_loss": 0.048078496009111404, "loss/crossentropy": 2.75020290017128, "loss/logits": 0.8415679961442948, "step": 37600 }, { "epoch": 0.3761, "grad_norm": 14.875, "grad_norm_var": 0.5782389322916667, "learning_rate": 0.0003, "loss": 11.1843, "loss/aux_loss": 0.04807473961263895, "loss/crossentropy": 2.7164148449897767, "loss/logits": 0.8833770871162414, "step": 37610 }, { "epoch": 0.3762, "grad_norm": 15.375, "grad_norm_var": 0.7687337239583333, "learning_rate": 0.0003, "loss": 11.2037, "loss/aux_loss": 0.04808453526347876, "loss/crossentropy": 2.697547745704651, "loss/logits": 0.8366124957799912, "step": 37620 }, { "epoch": 0.3763, "grad_norm": 14.0625, "grad_norm_var": 0.7102701822916667, "learning_rate": 0.0003, "loss": 11.4474, "loss/aux_loss": 0.04807381797581911, "loss/crossentropy": 2.698218834400177, "loss/logits": 0.8631418794393539, "step": 37630 }, { "epoch": 0.3764, "grad_norm": 13.625, "grad_norm_var": 0.265087890625, "learning_rate": 0.0003, "loss": 11.2557, "loss/aux_loss": 0.04807877913117409, "loss/crossentropy": 2.5806887984275817, "loss/logits": 0.8300289899110794, "step": 37640 }, { "epoch": 0.3765, "grad_norm": 13.625, "grad_norm_var": 0.4354166666666667, "learning_rate": 0.0003, "loss": 11.2533, "loss/aux_loss": 0.048092398792505264, "loss/crossentropy": 2.6587139785289766, "loss/logits": 0.8522565513849258, "step": 37650 }, { "epoch": 0.3766, "grad_norm": 14.6875, "grad_norm_var": 23.364957682291667, "learning_rate": 0.0003, "loss": 11.3313, "loss/aux_loss": 0.048093941807746884, "loss/crossentropy": 2.806702709197998, "loss/logits": 0.8423727869987487, "step": 37660 }, { "epoch": 0.3767, "grad_norm": 17.75, "grad_norm_var": 442.132275390625, "learning_rate": 0.0003, "loss": 11.307, "loss/aux_loss": 0.04808268621563912, "loss/crossentropy": 2.6064475953578947, "loss/logits": 0.8261544018983841, "step": 37670 }, { "epoch": 0.3768, "grad_norm": 15.875, "grad_norm_var": 14.4119140625, "learning_rate": 0.0003, "loss": 11.3099, "loss/aux_loss": 0.04807278923690319, "loss/crossentropy": 2.6616617262363436, "loss/logits": 0.8287598133087158, "step": 37680 }, { "epoch": 0.3769, "grad_norm": 15.1875, "grad_norm_var": 2.6393229166666665, "learning_rate": 0.0003, "loss": 11.2747, "loss/aux_loss": 0.04807351883500814, "loss/crossentropy": 2.8991053104400635, "loss/logits": 0.8774980515241623, "step": 37690 }, { "epoch": 0.377, "grad_norm": 13.5, "grad_norm_var": 1.1721354166666667, "learning_rate": 0.0003, "loss": 11.3018, "loss/aux_loss": 0.04807792901992798, "loss/crossentropy": 2.793347454071045, "loss/logits": 0.8624837636947632, "step": 37700 }, { "epoch": 0.3771, "grad_norm": 13.5625, "grad_norm_var": 0.9067057291666667, "learning_rate": 0.0003, "loss": 11.1449, "loss/aux_loss": 0.04807579685002565, "loss/crossentropy": 2.564465194940567, "loss/logits": 0.8291731148958206, "step": 37710 }, { "epoch": 0.3772, "grad_norm": 12.1875, "grad_norm_var": 0.49733072916666665, "learning_rate": 0.0003, "loss": 11.1825, "loss/aux_loss": 0.0480663126334548, "loss/crossentropy": 2.8131748914718626, "loss/logits": 0.8564931780099869, "step": 37720 }, { "epoch": 0.3773, "grad_norm": 14.3125, "grad_norm_var": 1.3179524739583333, "learning_rate": 0.0003, "loss": 11.1927, "loss/aux_loss": 0.04809189885854721, "loss/crossentropy": 2.8035511016845702, "loss/logits": 0.8402773588895798, "step": 37730 }, { "epoch": 0.3774, "grad_norm": 13.375, "grad_norm_var": 1.3494140625, "learning_rate": 0.0003, "loss": 11.2049, "loss/aux_loss": 0.048071368038654326, "loss/crossentropy": 2.788879954814911, "loss/logits": 0.8656217336654664, "step": 37740 }, { "epoch": 0.3775, "grad_norm": 13.1875, "grad_norm_var": 0.861962890625, "learning_rate": 0.0003, "loss": 11.0996, "loss/aux_loss": 0.04808103609830141, "loss/crossentropy": 2.627894651889801, "loss/logits": 0.8183565735816956, "step": 37750 }, { "epoch": 0.3776, "grad_norm": 14.4375, "grad_norm_var": 1.5488932291666666, "learning_rate": 0.0003, "loss": 11.2273, "loss/aux_loss": 0.048079018481075764, "loss/crossentropy": 2.7429580628871917, "loss/logits": 0.8102349221706391, "step": 37760 }, { "epoch": 0.3777, "grad_norm": 14.0, "grad_norm_var": 1.4503743489583334, "learning_rate": 0.0003, "loss": 11.3375, "loss/aux_loss": 0.0480774000287056, "loss/crossentropy": 2.7857055068016052, "loss/logits": 0.8557232707738877, "step": 37770 }, { "epoch": 0.3778, "grad_norm": 14.4375, "grad_norm_var": 0.51875, "learning_rate": 0.0003, "loss": 11.3197, "loss/aux_loss": 0.04807243067771196, "loss/crossentropy": 2.889014649391174, "loss/logits": 0.8674527406692505, "step": 37780 }, { "epoch": 0.3779, "grad_norm": 14.0, "grad_norm_var": 0.2916015625, "learning_rate": 0.0003, "loss": 11.1796, "loss/aux_loss": 0.04807484410703182, "loss/crossentropy": 2.7089039623737334, "loss/logits": 0.8426949769258499, "step": 37790 }, { "epoch": 0.378, "grad_norm": 13.5625, "grad_norm_var": 0.39576822916666665, "learning_rate": 0.0003, "loss": 11.2755, "loss/aux_loss": 0.048075446113944056, "loss/crossentropy": 2.6602770924568175, "loss/logits": 0.8283806025981904, "step": 37800 }, { "epoch": 0.3781, "grad_norm": 12.75, "grad_norm_var": 0.42962239583333334, "learning_rate": 0.0003, "loss": 11.205, "loss/aux_loss": 0.04808063935488462, "loss/crossentropy": 2.594762307405472, "loss/logits": 0.8218467265367508, "step": 37810 }, { "epoch": 0.3782, "grad_norm": 13.6875, "grad_norm_var": 0.49933268229166666, "learning_rate": 0.0003, "loss": 11.1313, "loss/aux_loss": 0.048066642694175245, "loss/crossentropy": 2.5958735227584837, "loss/logits": 0.8756007015705108, "step": 37820 }, { "epoch": 0.3783, "grad_norm": 17.625, "grad_norm_var": 1.3032389322916667, "learning_rate": 0.0003, "loss": 11.283, "loss/aux_loss": 0.048081421107053754, "loss/crossentropy": 2.6480916321277617, "loss/logits": 0.8332589745521546, "step": 37830 }, { "epoch": 0.3784, "grad_norm": 15.125, "grad_norm_var": 1.1004557291666666, "learning_rate": 0.0003, "loss": 11.3192, "loss/aux_loss": 0.048070864751935005, "loss/crossentropy": 2.741181659698486, "loss/logits": 0.8588764518499374, "step": 37840 }, { "epoch": 0.3785, "grad_norm": 16.25, "grad_norm_var": 0.9374348958333333, "learning_rate": 0.0003, "loss": 11.4032, "loss/aux_loss": 0.048078233189880845, "loss/crossentropy": 2.9572018921375274, "loss/logits": 0.8699509769678115, "step": 37850 }, { "epoch": 0.3786, "grad_norm": 13.5625, "grad_norm_var": 0.9024576822916667, "learning_rate": 0.0003, "loss": 11.2661, "loss/aux_loss": 0.048085720464587214, "loss/crossentropy": 2.580213463306427, "loss/logits": 0.7983238309621811, "step": 37860 }, { "epoch": 0.3787, "grad_norm": 14.125, "grad_norm_var": 0.342041015625, "learning_rate": 0.0003, "loss": 11.4754, "loss/aux_loss": 0.04807271733880043, "loss/crossentropy": 2.709409844875336, "loss/logits": 0.8600565820932389, "step": 37870 }, { "epoch": 0.3788, "grad_norm": 13.9375, "grad_norm_var": 0.4671223958333333, "learning_rate": 0.0003, "loss": 11.281, "loss/aux_loss": 0.048080405406653884, "loss/crossentropy": 2.82181898355484, "loss/logits": 0.8688194662332535, "step": 37880 }, { "epoch": 0.3789, "grad_norm": 14.25, "grad_norm_var": 1.0020182291666666, "learning_rate": 0.0003, "loss": 11.2233, "loss/aux_loss": 0.04807996470481157, "loss/crossentropy": 2.773501121997833, "loss/logits": 0.8502937823534011, "step": 37890 }, { "epoch": 0.379, "grad_norm": 13.875, "grad_norm_var": 0.25467122395833336, "learning_rate": 0.0003, "loss": 11.2839, "loss/aux_loss": 0.048080760054290295, "loss/crossentropy": 2.6579030215740205, "loss/logits": 0.8489834278821945, "step": 37900 }, { "epoch": 0.3791, "grad_norm": 13.9375, "grad_norm_var": 1.21328125, "learning_rate": 0.0003, "loss": 11.2377, "loss/aux_loss": 0.04807602297514677, "loss/crossentropy": 2.6808858156204223, "loss/logits": 0.85300872027874, "step": 37910 }, { "epoch": 0.3792, "grad_norm": 13.125, "grad_norm_var": 1.8402180989583334, "learning_rate": 0.0003, "loss": 11.2778, "loss/aux_loss": 0.04807765781879425, "loss/crossentropy": 2.6116097033023835, "loss/logits": 0.8364583939313889, "step": 37920 }, { "epoch": 0.3793, "grad_norm": 13.0, "grad_norm_var": 1.376025390625, "learning_rate": 0.0003, "loss": 11.3479, "loss/aux_loss": 0.04807660095393658, "loss/crossentropy": 2.7459771037101746, "loss/logits": 0.8774673551321029, "step": 37930 }, { "epoch": 0.3794, "grad_norm": 14.3125, "grad_norm_var": 1.4880208333333333, "learning_rate": 0.0003, "loss": 11.1755, "loss/aux_loss": 0.0480880094692111, "loss/crossentropy": 2.7383559942245483, "loss/logits": 0.8331858664751053, "step": 37940 }, { "epoch": 0.3795, "grad_norm": 14.6875, "grad_norm_var": 1.3161458333333333, "learning_rate": 0.0003, "loss": 11.2232, "loss/aux_loss": 0.04807455353438854, "loss/crossentropy": 2.7862467050552366, "loss/logits": 0.8778936117887497, "step": 37950 }, { "epoch": 0.3796, "grad_norm": 14.0625, "grad_norm_var": 0.651025390625, "learning_rate": 0.0003, "loss": 11.2377, "loss/aux_loss": 0.04807548765093088, "loss/crossentropy": 2.6477761268615723, "loss/logits": 0.8515429794788361, "step": 37960 }, { "epoch": 0.3797, "grad_norm": 14.875, "grad_norm_var": 0.5752604166666667, "learning_rate": 0.0003, "loss": 11.5044, "loss/aux_loss": 0.04807682652026415, "loss/crossentropy": 2.8267282128334044, "loss/logits": 0.8833474934101104, "step": 37970 }, { "epoch": 0.3798, "grad_norm": 13.625, "grad_norm_var": 0.4791666666666667, "learning_rate": 0.0003, "loss": 11.1736, "loss/aux_loss": 0.048078049533069135, "loss/crossentropy": 2.788653367757797, "loss/logits": 0.8569782227277756, "step": 37980 }, { "epoch": 0.3799, "grad_norm": 12.8125, "grad_norm_var": 0.4383951822916667, "learning_rate": 0.0003, "loss": 11.267, "loss/aux_loss": 0.04807330220937729, "loss/crossentropy": 2.745168626308441, "loss/logits": 0.8513006120920181, "step": 37990 }, { "epoch": 0.38, "grad_norm": 14.25, "grad_norm_var": 0.7040201822916666, "learning_rate": 0.0003, "loss": 11.312, "loss/aux_loss": 0.048071262612938884, "loss/crossentropy": 2.828562021255493, "loss/logits": 0.8815089613199234, "step": 38000 }, { "epoch": 0.3801, "grad_norm": 13.5625, "grad_norm_var": 0.5681640625, "learning_rate": 0.0003, "loss": 11.2651, "loss/aux_loss": 0.04807847216725349, "loss/crossentropy": 2.6953054130077363, "loss/logits": 0.841254535317421, "step": 38010 }, { "epoch": 0.3802, "grad_norm": 13.75, "grad_norm_var": 0.350634765625, "learning_rate": 0.0003, "loss": 11.2093, "loss/aux_loss": 0.04808027595281601, "loss/crossentropy": 2.7304549276828767, "loss/logits": 0.8305320262908935, "step": 38020 }, { "epoch": 0.3803, "grad_norm": 12.4375, "grad_norm_var": 0.2353515625, "learning_rate": 0.0003, "loss": 11.2175, "loss/aux_loss": 0.04807380642741919, "loss/crossentropy": 2.672012412548065, "loss/logits": 0.8408935517072678, "step": 38030 }, { "epoch": 0.3804, "grad_norm": 14.125, "grad_norm_var": 0.23748372395833334, "learning_rate": 0.0003, "loss": 11.3549, "loss/aux_loss": 0.04807410296052694, "loss/crossentropy": 2.8609830141067505, "loss/logits": 0.8738586813211441, "step": 38040 }, { "epoch": 0.3805, "grad_norm": 14.5, "grad_norm_var": 1.24921875, "learning_rate": 0.0003, "loss": 11.2491, "loss/aux_loss": 0.048076963238418105, "loss/crossentropy": 2.8962836384773256, "loss/logits": 0.8688966006040573, "step": 38050 }, { "epoch": 0.3806, "grad_norm": 14.0625, "grad_norm_var": 0.5122395833333333, "learning_rate": 0.0003, "loss": 11.361, "loss/aux_loss": 0.04807176198810339, "loss/crossentropy": 2.8302778005599976, "loss/logits": 0.8801500231027604, "step": 38060 }, { "epoch": 0.3807, "grad_norm": 14.125, "grad_norm_var": 0.35480143229166666, "learning_rate": 0.0003, "loss": 11.2694, "loss/aux_loss": 0.048076769523322585, "loss/crossentropy": 2.6821465611457826, "loss/logits": 0.8561849266290664, "step": 38070 }, { "epoch": 0.3808, "grad_norm": 14.9375, "grad_norm_var": 0.4009765625, "learning_rate": 0.0003, "loss": 11.2842, "loss/aux_loss": 0.04807623084634542, "loss/crossentropy": 2.868058133125305, "loss/logits": 0.8749160617589951, "step": 38080 }, { "epoch": 0.3809, "grad_norm": 14.625, "grad_norm_var": 0.6916015625, "learning_rate": 0.0003, "loss": 11.2862, "loss/aux_loss": 0.0480802733451128, "loss/crossentropy": 2.762987458705902, "loss/logits": 0.8203055411577225, "step": 38090 }, { "epoch": 0.381, "grad_norm": 14.75, "grad_norm_var": 0.3472493489583333, "learning_rate": 0.0003, "loss": 11.267, "loss/aux_loss": 0.0480813367292285, "loss/crossentropy": 2.5167156994342803, "loss/logits": 0.8391987591981888, "step": 38100 }, { "epoch": 0.3811, "grad_norm": 14.5, "grad_norm_var": 0.3890462239583333, "learning_rate": 0.0003, "loss": 11.3582, "loss/aux_loss": 0.04806756749749184, "loss/crossentropy": 2.9812386274337768, "loss/logits": 0.8828330308198928, "step": 38110 }, { "epoch": 0.3812, "grad_norm": 15.375, "grad_norm_var": 0.619775390625, "learning_rate": 0.0003, "loss": 11.2353, "loss/aux_loss": 0.048090960085392, "loss/crossentropy": 2.782704734802246, "loss/logits": 0.8270899027585983, "step": 38120 }, { "epoch": 0.3813, "grad_norm": 12.9375, "grad_norm_var": 0.73515625, "learning_rate": 0.0003, "loss": 11.2608, "loss/aux_loss": 0.048075029626488686, "loss/crossentropy": 2.669895362854004, "loss/logits": 0.8388209640979767, "step": 38130 }, { "epoch": 0.3814, "grad_norm": 13.3125, "grad_norm_var": 0.5979166666666667, "learning_rate": 0.0003, "loss": 11.3802, "loss/aux_loss": 0.048083779774606226, "loss/crossentropy": 2.775761139392853, "loss/logits": 0.8643197298049927, "step": 38140 }, { "epoch": 0.3815, "grad_norm": 12.9375, "grad_norm_var": 0.6613932291666667, "learning_rate": 0.0003, "loss": 11.0835, "loss/aux_loss": 0.04807844534516335, "loss/crossentropy": 2.769335401058197, "loss/logits": 0.822916254401207, "step": 38150 }, { "epoch": 0.3816, "grad_norm": 12.8125, "grad_norm_var": 0.9258951822916667, "learning_rate": 0.0003, "loss": 11.2522, "loss/aux_loss": 0.0480781301856041, "loss/crossentropy": 2.5931221723556517, "loss/logits": 0.8241954296827316, "step": 38160 }, { "epoch": 0.3817, "grad_norm": 13.5, "grad_norm_var": 0.6152180989583333, "learning_rate": 0.0003, "loss": 11.2332, "loss/aux_loss": 0.048073183931410315, "loss/crossentropy": 2.7164094507694245, "loss/logits": 0.8017847687005997, "step": 38170 }, { "epoch": 0.3818, "grad_norm": 13.9375, "grad_norm_var": 0.450244140625, "learning_rate": 0.0003, "loss": 11.2502, "loss/aux_loss": 0.04807348102331162, "loss/crossentropy": 2.7940221190452577, "loss/logits": 0.8343671351671219, "step": 38180 }, { "epoch": 0.3819, "grad_norm": 13.0625, "grad_norm_var": 0.46608072916666665, "learning_rate": 0.0003, "loss": 11.1612, "loss/aux_loss": 0.04808283261954784, "loss/crossentropy": 2.659856015443802, "loss/logits": 0.80497907102108, "step": 38190 }, { "epoch": 0.382, "grad_norm": 12.875, "grad_norm_var": 0.581884765625, "learning_rate": 0.0003, "loss": 11.3037, "loss/aux_loss": 0.04806670006364584, "loss/crossentropy": 2.79791459441185, "loss/logits": 0.8519628554582596, "step": 38200 }, { "epoch": 0.3821, "grad_norm": 13.25, "grad_norm_var": 1.0557291666666666, "learning_rate": 0.0003, "loss": 11.0594, "loss/aux_loss": 0.04808285720646381, "loss/crossentropy": 2.640339195728302, "loss/logits": 0.7928971499204636, "step": 38210 }, { "epoch": 0.3822, "grad_norm": 12.9375, "grad_norm_var": 0.9680826822916667, "learning_rate": 0.0003, "loss": 11.2579, "loss/aux_loss": 0.048081761412322524, "loss/crossentropy": 2.681269496679306, "loss/logits": 0.8393559873104095, "step": 38220 }, { "epoch": 0.3823, "grad_norm": 14.0625, "grad_norm_var": 0.4390625, "learning_rate": 0.0003, "loss": 11.2653, "loss/aux_loss": 0.04807662703096867, "loss/crossentropy": 2.575548267364502, "loss/logits": 0.8309787482023239, "step": 38230 }, { "epoch": 0.3824, "grad_norm": 14.5625, "grad_norm_var": 0.42967122395833335, "learning_rate": 0.0003, "loss": 11.2071, "loss/aux_loss": 0.04807767011225224, "loss/crossentropy": 2.598121851682663, "loss/logits": 0.8631124287843704, "step": 38240 }, { "epoch": 0.3825, "grad_norm": 14.0, "grad_norm_var": 0.17550455729166667, "learning_rate": 0.0003, "loss": 11.1865, "loss/aux_loss": 0.04808439090847969, "loss/crossentropy": 2.6111572325229644, "loss/logits": 0.7978465467691421, "step": 38250 }, { "epoch": 0.3826, "grad_norm": 13.6875, "grad_norm_var": 0.5291015625, "learning_rate": 0.0003, "loss": 11.3335, "loss/aux_loss": 0.048076186701655386, "loss/crossentropy": 2.824894219636917, "loss/logits": 0.8665509730577469, "step": 38260 }, { "epoch": 0.3827, "grad_norm": 14.0625, "grad_norm_var": 0.43826497395833336, "learning_rate": 0.0003, "loss": 11.3737, "loss/aux_loss": 0.04807574283331632, "loss/crossentropy": 2.7032552480697634, "loss/logits": 0.8640910536050797, "step": 38270 }, { "epoch": 0.3828, "grad_norm": 14.0625, "grad_norm_var": 1.1030598958333333, "learning_rate": 0.0003, "loss": 11.3598, "loss/aux_loss": 0.04807320572435856, "loss/crossentropy": 2.7154432415962217, "loss/logits": 0.868793374300003, "step": 38280 }, { "epoch": 0.3829, "grad_norm": 14.3125, "grad_norm_var": 0.3275390625, "learning_rate": 0.0003, "loss": 11.1598, "loss/aux_loss": 0.04809410870075226, "loss/crossentropy": 2.817612624168396, "loss/logits": 0.863050663471222, "step": 38290 }, { "epoch": 0.383, "grad_norm": 14.9375, "grad_norm_var": 0.40675455729166665, "learning_rate": 0.0003, "loss": 11.2505, "loss/aux_loss": 0.04806933347135782, "loss/crossentropy": 2.6598378300666807, "loss/logits": 0.8606748700141906, "step": 38300 }, { "epoch": 0.3831, "grad_norm": 13.25, "grad_norm_var": 13.406705729166667, "learning_rate": 0.0003, "loss": 11.2905, "loss/aux_loss": 0.04808672070503235, "loss/crossentropy": 2.607471966743469, "loss/logits": 0.8459422647953033, "step": 38310 }, { "epoch": 0.3832, "grad_norm": 13.375, "grad_norm_var": 13.0916015625, "learning_rate": 0.0003, "loss": 11.3664, "loss/aux_loss": 0.04807449951767921, "loss/crossentropy": 2.73186194896698, "loss/logits": 0.8628045409917832, "step": 38320 }, { "epoch": 0.3833, "grad_norm": 15.3125, "grad_norm_var": 3.707145182291667, "learning_rate": 0.0003, "loss": 11.5162, "loss/aux_loss": 0.04808128159493208, "loss/crossentropy": 2.7736764550209045, "loss/logits": 0.8548513650894165, "step": 38330 }, { "epoch": 0.3834, "grad_norm": 14.625, "grad_norm_var": 0.7597493489583333, "learning_rate": 0.0003, "loss": 11.3973, "loss/aux_loss": 0.0480838356539607, "loss/crossentropy": 2.8341826438903808, "loss/logits": 0.8437968879938126, "step": 38340 }, { "epoch": 0.3835, "grad_norm": 16.625, "grad_norm_var": 0.7883951822916667, "learning_rate": 0.0003, "loss": 11.2896, "loss/aux_loss": 0.04808208886533975, "loss/crossentropy": 2.6580508768558504, "loss/logits": 0.8282568514347076, "step": 38350 }, { "epoch": 0.3836, "grad_norm": 13.125, "grad_norm_var": 0.77578125, "learning_rate": 0.0003, "loss": 11.4887, "loss/aux_loss": 0.04807929620146752, "loss/crossentropy": 2.7363623082637787, "loss/logits": 0.8577351301908493, "step": 38360 }, { "epoch": 0.3837, "grad_norm": 13.6875, "grad_norm_var": 0.44244791666666666, "learning_rate": 0.0003, "loss": 11.4202, "loss/aux_loss": 0.04807289559394121, "loss/crossentropy": 2.8419145464897158, "loss/logits": 0.8924077719449997, "step": 38370 }, { "epoch": 0.3838, "grad_norm": 13.9375, "grad_norm_var": 1.4704264322916667, "learning_rate": 0.0003, "loss": 11.2463, "loss/aux_loss": 0.0480803145095706, "loss/crossentropy": 2.823737806081772, "loss/logits": 0.8809779584407806, "step": 38380 }, { "epoch": 0.3839, "grad_norm": 13.9375, "grad_norm_var": 1.5296875, "learning_rate": 0.0003, "loss": 11.3235, "loss/aux_loss": 0.04806930013000965, "loss/crossentropy": 2.7565455436706543, "loss/logits": 0.8279327541589737, "step": 38390 }, { "epoch": 0.384, "grad_norm": 13.6875, "grad_norm_var": 0.4903483072916667, "learning_rate": 0.0003, "loss": 11.0918, "loss/aux_loss": 0.04807766154408455, "loss/crossentropy": 2.6135290563106537, "loss/logits": 0.8393938690423965, "step": 38400 }, { "epoch": 0.3841, "grad_norm": 33.0, "grad_norm_var": 24.82421875, "learning_rate": 0.0003, "loss": 11.1982, "loss/aux_loss": 0.04807633478194475, "loss/crossentropy": 2.6710329234600065, "loss/logits": 0.8311236262321472, "step": 38410 }, { "epoch": 0.3842, "grad_norm": 14.9375, "grad_norm_var": 22.696875, "learning_rate": 0.0003, "loss": 11.3847, "loss/aux_loss": 0.048079009726643564, "loss/crossentropy": 2.6408372461795806, "loss/logits": 0.8508718222379684, "step": 38420 }, { "epoch": 0.3843, "grad_norm": 14.5, "grad_norm_var": 0.7051432291666667, "learning_rate": 0.0003, "loss": 11.3346, "loss/aux_loss": 0.04808001890778542, "loss/crossentropy": 2.7175046026706697, "loss/logits": 0.8537416934967041, "step": 38430 }, { "epoch": 0.3844, "grad_norm": 13.4375, "grad_norm_var": 0.5770833333333333, "learning_rate": 0.0003, "loss": 11.2761, "loss/aux_loss": 0.048079208470880985, "loss/crossentropy": 2.7895686745643617, "loss/logits": 0.8444579422473908, "step": 38440 }, { "epoch": 0.3845, "grad_norm": 14.4375, "grad_norm_var": 0.8260416666666667, "learning_rate": 0.0003, "loss": 11.2037, "loss/aux_loss": 0.04807562418282032, "loss/crossentropy": 2.6479432761669157, "loss/logits": 0.839043453335762, "step": 38450 }, { "epoch": 0.3846, "grad_norm": 14.125, "grad_norm_var": 0.5632649739583333, "learning_rate": 0.0003, "loss": 11.2972, "loss/aux_loss": 0.04808216225355864, "loss/crossentropy": 2.7740365862846375, "loss/logits": 0.8312047332525253, "step": 38460 }, { "epoch": 0.3847, "grad_norm": 14.0, "grad_norm_var": 0.5236979166666667, "learning_rate": 0.0003, "loss": 11.2348, "loss/aux_loss": 0.04807409662753344, "loss/crossentropy": 2.7867905139923095, "loss/logits": 0.8606914162635804, "step": 38470 }, { "epoch": 0.3848, "grad_norm": 13.625, "grad_norm_var": 0.8204264322916667, "learning_rate": 0.0003, "loss": 11.1963, "loss/aux_loss": 0.04807875119149685, "loss/crossentropy": 2.7122581124305727, "loss/logits": 0.8468606352806092, "step": 38480 }, { "epoch": 0.3849, "grad_norm": 14.25, "grad_norm_var": 0.9686848958333333, "learning_rate": 0.0003, "loss": 11.2863, "loss/aux_loss": 0.04807748645544052, "loss/crossentropy": 2.779924100637436, "loss/logits": 0.8382695466279984, "step": 38490 }, { "epoch": 0.385, "grad_norm": 12.8125, "grad_norm_var": 0.420556640625, "learning_rate": 0.0003, "loss": 11.3427, "loss/aux_loss": 0.0480777844786644, "loss/crossentropy": 2.725245749950409, "loss/logits": 0.8357455193996429, "step": 38500 }, { "epoch": 0.3851, "grad_norm": 14.9375, "grad_norm_var": 0.518994140625, "learning_rate": 0.0003, "loss": 11.3015, "loss/aux_loss": 0.04808611571788788, "loss/crossentropy": 2.803503179550171, "loss/logits": 0.8664654195308685, "step": 38510 }, { "epoch": 0.3852, "grad_norm": 12.9375, "grad_norm_var": 0.5083170572916667, "learning_rate": 0.0003, "loss": 11.0804, "loss/aux_loss": 0.048075488209724425, "loss/crossentropy": 2.8052771151065827, "loss/logits": 0.8602871984243393, "step": 38520 }, { "epoch": 0.3853, "grad_norm": 15.0625, "grad_norm_var": 0.361181640625, "learning_rate": 0.0003, "loss": 11.2198, "loss/aux_loss": 0.04808064680546522, "loss/crossentropy": 2.808874398469925, "loss/logits": 0.8875895857810974, "step": 38530 }, { "epoch": 0.3854, "grad_norm": 13.25, "grad_norm_var": 0.572900390625, "learning_rate": 0.0003, "loss": 11.2937, "loss/aux_loss": 0.048069310747087, "loss/crossentropy": 2.7475598096847533, "loss/logits": 0.8482136219739914, "step": 38540 }, { "epoch": 0.3855, "grad_norm": 14.1875, "grad_norm_var": 0.7632649739583334, "learning_rate": 0.0003, "loss": 11.2942, "loss/aux_loss": 0.04807958360761404, "loss/crossentropy": 2.6134680569171906, "loss/logits": 0.8251173198223114, "step": 38550 }, { "epoch": 0.3856, "grad_norm": 13.6875, "grad_norm_var": 0.48587239583333336, "learning_rate": 0.0003, "loss": 11.2606, "loss/aux_loss": 0.048083121702075, "loss/crossentropy": 2.7685590624809264, "loss/logits": 0.852242037653923, "step": 38560 }, { "epoch": 0.3857, "grad_norm": 13.8125, "grad_norm_var": 0.13802083333333334, "learning_rate": 0.0003, "loss": 11.3004, "loss/aux_loss": 0.04806161895394325, "loss/crossentropy": 2.70892972946167, "loss/logits": 0.8436130315065384, "step": 38570 }, { "epoch": 0.3858, "grad_norm": 15.0, "grad_norm_var": 0.8285807291666667, "learning_rate": 0.0003, "loss": 11.36, "loss/aux_loss": 0.04808020200580358, "loss/crossentropy": 2.798497807979584, "loss/logits": 0.8555373579263688, "step": 38580 }, { "epoch": 0.3859, "grad_norm": 14.625, "grad_norm_var": 0.5480305989583333, "learning_rate": 0.0003, "loss": 11.1952, "loss/aux_loss": 0.04807691927999258, "loss/crossentropy": 2.627361184358597, "loss/logits": 0.854537034034729, "step": 38590 }, { "epoch": 0.386, "grad_norm": 13.75, "grad_norm_var": 0.245556640625, "learning_rate": 0.0003, "loss": 11.1977, "loss/aux_loss": 0.04807463120669127, "loss/crossentropy": 2.816139954328537, "loss/logits": 0.8892535716295242, "step": 38600 }, { "epoch": 0.3861, "grad_norm": 13.9375, "grad_norm_var": 0.33670247395833336, "learning_rate": 0.0003, "loss": 11.2819, "loss/aux_loss": 0.04806935954838991, "loss/crossentropy": 2.7068843841552734, "loss/logits": 0.8351662307977676, "step": 38610 }, { "epoch": 0.3862, "grad_norm": 15.3125, "grad_norm_var": 0.5954264322916667, "learning_rate": 0.0003, "loss": 11.3311, "loss/aux_loss": 0.048080881126224995, "loss/crossentropy": 2.768476206064224, "loss/logits": 0.8442646831274032, "step": 38620 }, { "epoch": 0.3863, "grad_norm": 13.625, "grad_norm_var": 0.31756184895833334, "learning_rate": 0.0003, "loss": 11.1192, "loss/aux_loss": 0.04808139074593783, "loss/crossentropy": 2.7400481700897217, "loss/logits": 0.8542584419250489, "step": 38630 }, { "epoch": 0.3864, "grad_norm": 12.9375, "grad_norm_var": 13.647395833333333, "learning_rate": 0.0003, "loss": 11.0828, "loss/aux_loss": 0.04807969853281975, "loss/crossentropy": 2.630267012119293, "loss/logits": 0.8385035455226898, "step": 38640 }, { "epoch": 0.3865, "grad_norm": 14.25, "grad_norm_var": 0.44921875, "learning_rate": 0.0003, "loss": 11.1954, "loss/aux_loss": 0.048078606836497784, "loss/crossentropy": 2.6999772429466247, "loss/logits": 0.8357056826353073, "step": 38650 }, { "epoch": 0.3866, "grad_norm": 15.375, "grad_norm_var": 0.5769368489583333, "learning_rate": 0.0003, "loss": 11.3088, "loss/aux_loss": 0.0480708010494709, "loss/crossentropy": 2.7469111561775206, "loss/logits": 0.8259833127260208, "step": 38660 }, { "epoch": 0.3867, "grad_norm": 13.0625, "grad_norm_var": 0.7369140625, "learning_rate": 0.0003, "loss": 11.2538, "loss/aux_loss": 0.04808108452707529, "loss/crossentropy": 2.8682199835777284, "loss/logits": 0.8337242752313614, "step": 38670 }, { "epoch": 0.3868, "grad_norm": 13.8125, "grad_norm_var": 0.902197265625, "learning_rate": 0.0003, "loss": 11.2038, "loss/aux_loss": 0.0480692382901907, "loss/crossentropy": 2.748265969753265, "loss/logits": 0.8487885296344757, "step": 38680 }, { "epoch": 0.3869, "grad_norm": 13.25, "grad_norm_var": 0.3020833333333333, "learning_rate": 0.0003, "loss": 11.2714, "loss/aux_loss": 0.04808627497404814, "loss/crossentropy": 2.6985159516334534, "loss/logits": 0.8388892740011216, "step": 38690 }, { "epoch": 0.387, "grad_norm": 18.25, "grad_norm_var": 1.5707682291666667, "learning_rate": 0.0003, "loss": 11.1927, "loss/aux_loss": 0.048079338297247885, "loss/crossentropy": 2.7121275901794433, "loss/logits": 0.8318122088909149, "step": 38700 }, { "epoch": 0.3871, "grad_norm": 14.1875, "grad_norm_var": 1.6813639322916667, "learning_rate": 0.0003, "loss": 11.3357, "loss/aux_loss": 0.04807351864874363, "loss/crossentropy": 2.8397586047649384, "loss/logits": 0.8729697972536087, "step": 38710 }, { "epoch": 0.3872, "grad_norm": 14.125, "grad_norm_var": 0.5071451822916667, "learning_rate": 0.0003, "loss": 11.2126, "loss/aux_loss": 0.04808269124478102, "loss/crossentropy": 2.6068269073963166, "loss/logits": 0.8011586487293243, "step": 38720 }, { "epoch": 0.3873, "grad_norm": 13.875, "grad_norm_var": 0.4791015625, "learning_rate": 0.0003, "loss": 11.1212, "loss/aux_loss": 0.04808108098804951, "loss/crossentropy": 2.8975651144981383, "loss/logits": 0.847677406668663, "step": 38730 }, { "epoch": 0.3874, "grad_norm": 13.5, "grad_norm_var": 0.2749837239583333, "learning_rate": 0.0003, "loss": 11.2524, "loss/aux_loss": 0.04808564744889736, "loss/crossentropy": 2.6788780450820924, "loss/logits": 0.8401986241340638, "step": 38740 }, { "epoch": 0.3875, "grad_norm": 14.1875, "grad_norm_var": 0.49073893229166665, "learning_rate": 0.0003, "loss": 11.335, "loss/aux_loss": 0.048062538541853425, "loss/crossentropy": 2.7721718668937685, "loss/logits": 0.8494657784700393, "step": 38750 }, { "epoch": 0.3876, "grad_norm": 14.3125, "grad_norm_var": 0.7044270833333334, "learning_rate": 0.0003, "loss": 11.243, "loss/aux_loss": 0.04808050319552422, "loss/crossentropy": 2.6818510770797728, "loss/logits": 0.8431717932224274, "step": 38760 }, { "epoch": 0.3877, "grad_norm": 13.875, "grad_norm_var": 0.43828125, "learning_rate": 0.0003, "loss": 11.1333, "loss/aux_loss": 0.04807931166142225, "loss/crossentropy": 2.8285969376564024, "loss/logits": 0.8329048067331314, "step": 38770 }, { "epoch": 0.3878, "grad_norm": 13.5, "grad_norm_var": 0.30390625, "learning_rate": 0.0003, "loss": 11.207, "loss/aux_loss": 0.04807636775076389, "loss/crossentropy": 2.7491527557373048, "loss/logits": 0.8234784305095673, "step": 38780 }, { "epoch": 0.3879, "grad_norm": 13.25, "grad_norm_var": 0.6166666666666667, "learning_rate": 0.0003, "loss": 11.101, "loss/aux_loss": 0.048082555457949636, "loss/crossentropy": 2.4799464106559754, "loss/logits": 0.8477719098329544, "step": 38790 }, { "epoch": 0.388, "grad_norm": 13.3125, "grad_norm_var": 0.5782389322916667, "learning_rate": 0.0003, "loss": 11.3399, "loss/aux_loss": 0.0480673098936677, "loss/crossentropy": 2.77501580119133, "loss/logits": 0.8746285647153854, "step": 38800 }, { "epoch": 0.3881, "grad_norm": 13.5625, "grad_norm_var": 0.7794270833333333, "learning_rate": 0.0003, "loss": 11.1978, "loss/aux_loss": 0.04808091875165701, "loss/crossentropy": 2.731118106842041, "loss/logits": 0.860284361243248, "step": 38810 }, { "epoch": 0.3882, "grad_norm": 13.375, "grad_norm_var": 0.436962890625, "learning_rate": 0.0003, "loss": 11.2145, "loss/aux_loss": 0.04807815104722977, "loss/crossentropy": 2.6639424443244932, "loss/logits": 0.8353655904531478, "step": 38820 }, { "epoch": 0.3883, "grad_norm": 13.6875, "grad_norm_var": 0.42511393229166666, "learning_rate": 0.0003, "loss": 11.3399, "loss/aux_loss": 0.04808487202972174, "loss/crossentropy": 2.8214931964874266, "loss/logits": 0.8728219717741013, "step": 38830 }, { "epoch": 0.3884, "grad_norm": 14.1875, "grad_norm_var": 0.315869140625, "learning_rate": 0.0003, "loss": 11.3803, "loss/aux_loss": 0.048068196326494214, "loss/crossentropy": 2.951774549484253, "loss/logits": 0.8737129330635071, "step": 38840 }, { "epoch": 0.3885, "grad_norm": 13.625, "grad_norm_var": 0.19036458333333334, "learning_rate": 0.0003, "loss": 11.3304, "loss/aux_loss": 0.048076307959854604, "loss/crossentropy": 2.614348477125168, "loss/logits": 0.8102818191051483, "step": 38850 }, { "epoch": 0.3886, "grad_norm": 14.5, "grad_norm_var": 0.199853515625, "learning_rate": 0.0003, "loss": 11.1863, "loss/aux_loss": 0.04808367285877466, "loss/crossentropy": 2.670736050605774, "loss/logits": 0.8290715306997299, "step": 38860 }, { "epoch": 0.3887, "grad_norm": 13.0625, "grad_norm_var": 0.3790201822916667, "learning_rate": 0.0003, "loss": 11.4597, "loss/aux_loss": 0.04807464815676212, "loss/crossentropy": 2.7976260662078856, "loss/logits": 0.8721549570560455, "step": 38870 }, { "epoch": 0.3888, "grad_norm": 14.0625, "grad_norm_var": 4.417822265625, "learning_rate": 0.0003, "loss": 11.4346, "loss/aux_loss": 0.048079947382211684, "loss/crossentropy": 2.7102751970291137, "loss/logits": 0.8924416452646255, "step": 38880 }, { "epoch": 0.3889, "grad_norm": 14.6875, "grad_norm_var": 3.782275390625, "learning_rate": 0.0003, "loss": 11.3206, "loss/aux_loss": 0.048079301975667475, "loss/crossentropy": 2.900500977039337, "loss/logits": 0.8390416592359543, "step": 38890 }, { "epoch": 0.389, "grad_norm": 14.125, "grad_norm_var": 0.547119140625, "learning_rate": 0.0003, "loss": 11.1748, "loss/aux_loss": 0.04807066544890404, "loss/crossentropy": 2.6859599113464356, "loss/logits": 0.8484610259532929, "step": 38900 }, { "epoch": 0.3891, "grad_norm": 12.9375, "grad_norm_var": 0.5132649739583334, "learning_rate": 0.0003, "loss": 11.1113, "loss/aux_loss": 0.04807322956621647, "loss/crossentropy": 2.678993618488312, "loss/logits": 0.8367465615272522, "step": 38910 }, { "epoch": 0.3892, "grad_norm": 13.9375, "grad_norm_var": 0.694775390625, "learning_rate": 0.0003, "loss": 11.2919, "loss/aux_loss": 0.04807643033564091, "loss/crossentropy": 2.71058109998703, "loss/logits": 0.8457289397716522, "step": 38920 }, { "epoch": 0.3893, "grad_norm": 13.9375, "grad_norm_var": 0.654541015625, "learning_rate": 0.0003, "loss": 11.2503, "loss/aux_loss": 0.048072323016822335, "loss/crossentropy": 2.544221270084381, "loss/logits": 0.8313421994447708, "step": 38930 }, { "epoch": 0.3894, "grad_norm": 14.1875, "grad_norm_var": 0.22862955729166667, "learning_rate": 0.0003, "loss": 11.3605, "loss/aux_loss": 0.04807420931756497, "loss/crossentropy": 2.8579561948776244, "loss/logits": 0.8661619156599045, "step": 38940 }, { "epoch": 0.3895, "grad_norm": 14.625, "grad_norm_var": 0.7454264322916667, "learning_rate": 0.0003, "loss": 11.3917, "loss/aux_loss": 0.048077373020350936, "loss/crossentropy": 2.728842890262604, "loss/logits": 0.8730567246675491, "step": 38950 }, { "epoch": 0.3896, "grad_norm": 13.5625, "grad_norm_var": 0.541259765625, "learning_rate": 0.0003, "loss": 11.3586, "loss/aux_loss": 0.04807371459901333, "loss/crossentropy": 2.8443053007125854, "loss/logits": 0.8811523258686066, "step": 38960 }, { "epoch": 0.3897, "grad_norm": 14.25, "grad_norm_var": 0.5179524739583333, "learning_rate": 0.0003, "loss": 11.3437, "loss/aux_loss": 0.04806583281606436, "loss/crossentropy": 2.693267875909805, "loss/logits": 0.8548869907855987, "step": 38970 }, { "epoch": 0.3898, "grad_norm": 15.375, "grad_norm_var": 0.6398274739583333, "learning_rate": 0.0003, "loss": 11.2203, "loss/aux_loss": 0.04806962329894304, "loss/crossentropy": 2.8249629139900208, "loss/logits": 0.8568162739276886, "step": 38980 }, { "epoch": 0.3899, "grad_norm": 13.75, "grad_norm_var": 0.39724934895833336, "learning_rate": 0.0003, "loss": 11.2541, "loss/aux_loss": 0.04808248896151781, "loss/crossentropy": 2.7104422807693482, "loss/logits": 0.8193158626556396, "step": 38990 }, { "epoch": 0.39, "grad_norm": 13.9375, "grad_norm_var": 0.6528645833333333, "learning_rate": 0.0003, "loss": 11.336, "loss/aux_loss": 0.04807611163705587, "loss/crossentropy": 2.734510087966919, "loss/logits": 0.880244129896164, "step": 39000 }, { "epoch": 0.3901, "grad_norm": 13.8125, "grad_norm_var": 0.8196451822916667, "learning_rate": 0.0003, "loss": 11.2298, "loss/aux_loss": 0.048070698976516724, "loss/crossentropy": 2.7943927884101867, "loss/logits": 0.8071790516376496, "step": 39010 }, { "epoch": 0.3902, "grad_norm": 14.0, "grad_norm_var": 0.4534993489583333, "learning_rate": 0.0003, "loss": 11.3262, "loss/aux_loss": 0.048071658983826634, "loss/crossentropy": 2.7959813237190247, "loss/logits": 0.8696642935276031, "step": 39020 }, { "epoch": 0.3903, "grad_norm": 13.1875, "grad_norm_var": 0.39264322916666666, "learning_rate": 0.0003, "loss": 11.2013, "loss/aux_loss": 0.04807510618120432, "loss/crossentropy": 2.625903457403183, "loss/logits": 0.8251208335161209, "step": 39030 }, { "epoch": 0.3904, "grad_norm": 13.6875, "grad_norm_var": 0.2674479166666667, "learning_rate": 0.0003, "loss": 11.2806, "loss/aux_loss": 0.04807391464710235, "loss/crossentropy": 2.740411990880966, "loss/logits": 0.8526921212673187, "step": 39040 }, { "epoch": 0.3905, "grad_norm": 14.0, "grad_norm_var": 2.7739583333333333, "learning_rate": 0.0003, "loss": 11.1987, "loss/aux_loss": 0.048082204163074495, "loss/crossentropy": 2.744875466823578, "loss/logits": 0.8375120222568512, "step": 39050 }, { "epoch": 0.3906, "grad_norm": 15.5625, "grad_norm_var": 3.3329264322916665, "learning_rate": 0.0003, "loss": 11.092, "loss/aux_loss": 0.04808939266949892, "loss/crossentropy": 2.5393874824047087, "loss/logits": 0.8006115674972534, "step": 39060 }, { "epoch": 0.3907, "grad_norm": 13.9375, "grad_norm_var": 0.78125, "learning_rate": 0.0003, "loss": 11.3496, "loss/aux_loss": 0.048069536313414575, "loss/crossentropy": 2.8732733964920043, "loss/logits": 0.8897728711366654, "step": 39070 }, { "epoch": 0.3908, "grad_norm": 14.9375, "grad_norm_var": 0.6104166666666667, "learning_rate": 0.0003, "loss": 11.2304, "loss/aux_loss": 0.048077582754194735, "loss/crossentropy": 2.779140567779541, "loss/logits": 0.8601418375968933, "step": 39080 }, { "epoch": 0.3909, "grad_norm": 13.3125, "grad_norm_var": 0.6387858072916667, "learning_rate": 0.0003, "loss": 11.2849, "loss/aux_loss": 0.04808152187615633, "loss/crossentropy": 2.901885849237442, "loss/logits": 0.8603871166706085, "step": 39090 }, { "epoch": 0.391, "grad_norm": 14.3125, "grad_norm_var": 1.2986979166666666, "learning_rate": 0.0003, "loss": 11.1638, "loss/aux_loss": 0.048083712719380854, "loss/crossentropy": 2.6146180272102355, "loss/logits": 0.8246536731719971, "step": 39100 }, { "epoch": 0.3911, "grad_norm": 13.3125, "grad_norm_var": 0.5108723958333333, "learning_rate": 0.0003, "loss": 11.2838, "loss/aux_loss": 0.04807220734655857, "loss/crossentropy": 2.606696993112564, "loss/logits": 0.8235841602087021, "step": 39110 }, { "epoch": 0.3912, "grad_norm": 14.1875, "grad_norm_var": 0.5723307291666667, "learning_rate": 0.0003, "loss": 11.1847, "loss/aux_loss": 0.04808733835816383, "loss/crossentropy": 2.591922175884247, "loss/logits": 0.7960670560598373, "step": 39120 }, { "epoch": 0.3913, "grad_norm": 13.5625, "grad_norm_var": 1.135791015625, "learning_rate": 0.0003, "loss": 11.1785, "loss/aux_loss": 0.048079108074307444, "loss/crossentropy": 2.8134935319423677, "loss/logits": 0.8420876532793045, "step": 39130 }, { "epoch": 0.3914, "grad_norm": 14.0625, "grad_norm_var": 0.5067708333333333, "learning_rate": 0.0003, "loss": 11.2383, "loss/aux_loss": 0.048073142766952515, "loss/crossentropy": 2.853075420856476, "loss/logits": 0.8272636830806732, "step": 39140 }, { "epoch": 0.3915, "grad_norm": 14.5, "grad_norm_var": 0.858837890625, "learning_rate": 0.0003, "loss": 11.2376, "loss/aux_loss": 0.04809119720011949, "loss/crossentropy": 2.522566032409668, "loss/logits": 0.8258247703313828, "step": 39150 }, { "epoch": 0.3916, "grad_norm": 13.875, "grad_norm_var": 0.737744140625, "learning_rate": 0.0003, "loss": 11.2268, "loss/aux_loss": 0.048074861988425255, "loss/crossentropy": 2.750867176055908, "loss/logits": 0.846402308344841, "step": 39160 }, { "epoch": 0.3917, "grad_norm": 14.625, "grad_norm_var": 1.01171875, "learning_rate": 0.0003, "loss": 11.2082, "loss/aux_loss": 0.048072229884564874, "loss/crossentropy": 2.7354251742362976, "loss/logits": 0.8645006984472274, "step": 39170 }, { "epoch": 0.3918, "grad_norm": 13.625, "grad_norm_var": 1.2765462239583334, "learning_rate": 0.0003, "loss": 11.2339, "loss/aux_loss": 0.04807515200227499, "loss/crossentropy": 2.7877457082271575, "loss/logits": 0.8666865587234497, "step": 39180 }, { "epoch": 0.3919, "grad_norm": 13.875, "grad_norm_var": 0.6962076822916666, "learning_rate": 0.0003, "loss": 11.5099, "loss/aux_loss": 0.04808229543268681, "loss/crossentropy": 2.6917248964309692, "loss/logits": 0.8847535520792007, "step": 39190 }, { "epoch": 0.392, "grad_norm": 15.375, "grad_norm_var": 0.5378743489583333, "learning_rate": 0.0003, "loss": 11.3657, "loss/aux_loss": 0.048071438632905486, "loss/crossentropy": 2.789354109764099, "loss/logits": 0.8606502175331116, "step": 39200 }, { "epoch": 0.3921, "grad_norm": 13.25, "grad_norm_var": 1.1359212239583334, "learning_rate": 0.0003, "loss": 11.3121, "loss/aux_loss": 0.048081927560269834, "loss/crossentropy": 2.780927097797394, "loss/logits": 0.886367890238762, "step": 39210 }, { "epoch": 0.3922, "grad_norm": 13.8125, "grad_norm_var": 0.908447265625, "learning_rate": 0.0003, "loss": 11.2079, "loss/aux_loss": 0.048087149113416675, "loss/crossentropy": 2.6847366988658905, "loss/logits": 0.8129809975624085, "step": 39220 }, { "epoch": 0.3923, "grad_norm": 13.6875, "grad_norm_var": 0.5606608072916667, "learning_rate": 0.0003, "loss": 11.3817, "loss/aux_loss": 0.0480810409411788, "loss/crossentropy": 2.937886118888855, "loss/logits": 0.906218609213829, "step": 39230 }, { "epoch": 0.3924, "grad_norm": 13.9375, "grad_norm_var": 0.51640625, "learning_rate": 0.0003, "loss": 11.266, "loss/aux_loss": 0.04808030817657709, "loss/crossentropy": 2.6783434629440306, "loss/logits": 0.8466577887535095, "step": 39240 }, { "epoch": 0.3925, "grad_norm": 13.625, "grad_norm_var": 0.551025390625, "learning_rate": 0.0003, "loss": 11.2733, "loss/aux_loss": 0.048078637942671774, "loss/crossentropy": 2.7514628052711485, "loss/logits": 0.851484876871109, "step": 39250 }, { "epoch": 0.3926, "grad_norm": 14.4375, "grad_norm_var": 0.8516764322916667, "learning_rate": 0.0003, "loss": 11.1715, "loss/aux_loss": 0.04808551203459501, "loss/crossentropy": 2.4610378623008726, "loss/logits": 0.8220183670520782, "step": 39260 }, { "epoch": 0.3927, "grad_norm": 14.3125, "grad_norm_var": 0.5624348958333333, "learning_rate": 0.0003, "loss": 11.2718, "loss/aux_loss": 0.04808190818876028, "loss/crossentropy": 2.7207518577575684, "loss/logits": 0.8513909667730332, "step": 39270 }, { "epoch": 0.3928, "grad_norm": 13.9375, "grad_norm_var": 0.5618326822916667, "learning_rate": 0.0003, "loss": 11.2583, "loss/aux_loss": 0.04807283375412226, "loss/crossentropy": 2.514565271139145, "loss/logits": 0.8464554220438003, "step": 39280 }, { "epoch": 0.3929, "grad_norm": 14.0, "grad_norm_var": 0.5028645833333333, "learning_rate": 0.0003, "loss": 11.1733, "loss/aux_loss": 0.04806869979947805, "loss/crossentropy": 2.7700137376785277, "loss/logits": 0.849734765291214, "step": 39290 }, { "epoch": 0.393, "grad_norm": 13.25, "grad_norm_var": 0.7659993489583333, "learning_rate": 0.0003, "loss": 11.274, "loss/aux_loss": 0.048083323240280154, "loss/crossentropy": 2.843100357055664, "loss/logits": 0.8784733712673187, "step": 39300 }, { "epoch": 0.3931, "grad_norm": 12.625, "grad_norm_var": 0.6634765625, "learning_rate": 0.0003, "loss": 11.4241, "loss/aux_loss": 0.04807077012956142, "loss/crossentropy": 2.799287849664688, "loss/logits": 0.8680036425590515, "step": 39310 }, { "epoch": 0.3932, "grad_norm": 13.5, "grad_norm_var": 0.7202473958333333, "learning_rate": 0.0003, "loss": 11.2232, "loss/aux_loss": 0.04807591922581196, "loss/crossentropy": 2.7692100405693054, "loss/logits": 0.8555681079626083, "step": 39320 }, { "epoch": 0.3933, "grad_norm": 14.6875, "grad_norm_var": 0.97578125, "learning_rate": 0.0003, "loss": 11.3526, "loss/aux_loss": 0.04808627963066101, "loss/crossentropy": 2.7542243778705595, "loss/logits": 0.842135438323021, "step": 39330 }, { "epoch": 0.3934, "grad_norm": 14.4375, "grad_norm_var": 0.45358072916666664, "learning_rate": 0.0003, "loss": 11.3093, "loss/aux_loss": 0.048074960522353646, "loss/crossentropy": 2.8112044095993043, "loss/logits": 0.8553465873003006, "step": 39340 }, { "epoch": 0.3935, "grad_norm": 13.6875, "grad_norm_var": 0.32545572916666665, "learning_rate": 0.0003, "loss": 11.2175, "loss/aux_loss": 0.048079118691384794, "loss/crossentropy": 2.6633784532547, "loss/logits": 0.8331804633140564, "step": 39350 }, { "epoch": 0.3936, "grad_norm": 14.1875, "grad_norm_var": 0.35494791666666664, "learning_rate": 0.0003, "loss": 11.2346, "loss/aux_loss": 0.04808082692325115, "loss/crossentropy": 2.67893762588501, "loss/logits": 0.8055594295263291, "step": 39360 }, { "epoch": 0.3937, "grad_norm": 13.875, "grad_norm_var": 0.5494140625, "learning_rate": 0.0003, "loss": 11.3957, "loss/aux_loss": 0.04807113204151392, "loss/crossentropy": 2.6513688981533052, "loss/logits": 0.8550222337245941, "step": 39370 }, { "epoch": 0.3938, "grad_norm": 13.5, "grad_norm_var": 0.5113932291666666, "learning_rate": 0.0003, "loss": 11.1742, "loss/aux_loss": 0.04808781389147043, "loss/crossentropy": 2.5786080420017243, "loss/logits": 0.8316751003265381, "step": 39380 }, { "epoch": 0.3939, "grad_norm": 14.125, "grad_norm_var": 0.38274739583333334, "learning_rate": 0.0003, "loss": 11.3349, "loss/aux_loss": 0.048061727173626424, "loss/crossentropy": 2.815303325653076, "loss/logits": 0.8787429064512253, "step": 39390 }, { "epoch": 0.394, "grad_norm": 13.375, "grad_norm_var": 0.4574055989583333, "learning_rate": 0.0003, "loss": 11.1497, "loss/aux_loss": 0.04808071050792932, "loss/crossentropy": 2.8457810401916506, "loss/logits": 0.8681068003177643, "step": 39400 }, { "epoch": 0.3941, "grad_norm": 13.875, "grad_norm_var": 0.637744140625, "learning_rate": 0.0003, "loss": 11.1906, "loss/aux_loss": 0.048074718564748764, "loss/crossentropy": 2.536262887716293, "loss/logits": 0.8039017617702484, "step": 39410 }, { "epoch": 0.3942, "grad_norm": 13.125, "grad_norm_var": 0.6919270833333333, "learning_rate": 0.0003, "loss": 11.2357, "loss/aux_loss": 0.04807904493063688, "loss/crossentropy": 2.7414814889431, "loss/logits": 0.8549802154302597, "step": 39420 }, { "epoch": 0.3943, "grad_norm": 16.375, "grad_norm_var": 0.8254557291666667, "learning_rate": 0.0003, "loss": 11.1561, "loss/aux_loss": 0.048071026988327506, "loss/crossentropy": 2.9144181966781617, "loss/logits": 0.8272378146648407, "step": 39430 }, { "epoch": 0.3944, "grad_norm": 13.875, "grad_norm_var": 0.8355305989583334, "learning_rate": 0.0003, "loss": 11.3736, "loss/aux_loss": 0.048078907653689384, "loss/crossentropy": 2.664820075035095, "loss/logits": 0.8304022997617722, "step": 39440 }, { "epoch": 0.3945, "grad_norm": 14.875, "grad_norm_var": 0.34088541666666666, "learning_rate": 0.0003, "loss": 11.2582, "loss/aux_loss": 0.04807480573654175, "loss/crossentropy": 2.5339000284671784, "loss/logits": 0.7781210362911224, "step": 39450 }, { "epoch": 0.3946, "grad_norm": 15.0625, "grad_norm_var": 0.42337239583333336, "learning_rate": 0.0003, "loss": 11.3047, "loss/aux_loss": 0.0480910299345851, "loss/crossentropy": 2.610448843240738, "loss/logits": 0.8010566890239715, "step": 39460 }, { "epoch": 0.3947, "grad_norm": 13.625, "grad_norm_var": 0.4361979166666667, "learning_rate": 0.0003, "loss": 11.2501, "loss/aux_loss": 0.04807810541242361, "loss/crossentropy": 2.775594508647919, "loss/logits": 0.8565292507410049, "step": 39470 }, { "epoch": 0.3948, "grad_norm": 13.25, "grad_norm_var": 0.44073893229166666, "learning_rate": 0.0003, "loss": 11.1435, "loss/aux_loss": 0.048072817362844945, "loss/crossentropy": 2.6863482356071473, "loss/logits": 0.8157978534698487, "step": 39480 }, { "epoch": 0.3949, "grad_norm": 13.4375, "grad_norm_var": 0.40305989583333335, "learning_rate": 0.0003, "loss": 11.3558, "loss/aux_loss": 0.04808029588311911, "loss/crossentropy": 2.8376736283302306, "loss/logits": 0.8666150987148284, "step": 39490 }, { "epoch": 0.395, "grad_norm": 14.5, "grad_norm_var": 0.4551920572916667, "learning_rate": 0.0003, "loss": 11.2658, "loss/aux_loss": 0.048076699860394, "loss/crossentropy": 2.607940810918808, "loss/logits": 0.8273808121681213, "step": 39500 }, { "epoch": 0.3951, "grad_norm": 12.8125, "grad_norm_var": 0.7525390625, "learning_rate": 0.0003, "loss": 11.1388, "loss/aux_loss": 0.04806460794061422, "loss/crossentropy": 2.6791930377483366, "loss/logits": 0.8224393516778946, "step": 39510 }, { "epoch": 0.3952, "grad_norm": 12.625, "grad_norm_var": 0.6700358072916667, "learning_rate": 0.0003, "loss": 10.9948, "loss/aux_loss": 0.04807190522551537, "loss/crossentropy": 2.6685730695724486, "loss/logits": 0.819244459271431, "step": 39520 }, { "epoch": 0.3953, "grad_norm": 13.875, "grad_norm_var": 0.38014322916666665, "learning_rate": 0.0003, "loss": 11.1964, "loss/aux_loss": 0.048081899993121624, "loss/crossentropy": 2.6292571663856505, "loss/logits": 0.8497846484184265, "step": 39530 }, { "epoch": 0.3954, "grad_norm": 13.75, "grad_norm_var": 0.25462239583333335, "learning_rate": 0.0003, "loss": 11.359, "loss/aux_loss": 0.048076131381094456, "loss/crossentropy": 2.8662326276302337, "loss/logits": 0.8483193576335907, "step": 39540 }, { "epoch": 0.3955, "grad_norm": 13.9375, "grad_norm_var": 0.47805989583333336, "learning_rate": 0.0003, "loss": 11.1232, "loss/aux_loss": 0.048073952086269855, "loss/crossentropy": 2.7887901782989504, "loss/logits": 0.8364946961402893, "step": 39550 }, { "epoch": 0.3956, "grad_norm": 13.3125, "grad_norm_var": 0.3306640625, "learning_rate": 0.0003, "loss": 11.4657, "loss/aux_loss": 0.04808053988963366, "loss/crossentropy": 2.7247639894485474, "loss/logits": 0.8865299373865128, "step": 39560 }, { "epoch": 0.3957, "grad_norm": 14.3125, "grad_norm_var": 0.25, "learning_rate": 0.0003, "loss": 11.1221, "loss/aux_loss": 0.048066049627959725, "loss/crossentropy": 2.7538771450519564, "loss/logits": 0.8528720825910568, "step": 39570 }, { "epoch": 0.3958, "grad_norm": 13.875, "grad_norm_var": 0.3035807291666667, "learning_rate": 0.0003, "loss": 11.1495, "loss/aux_loss": 0.04807343017309904, "loss/crossentropy": 2.7472833156585694, "loss/logits": 0.8405825644731522, "step": 39580 }, { "epoch": 0.3959, "grad_norm": 14.1875, "grad_norm_var": 0.24713541666666666, "learning_rate": 0.0003, "loss": 11.1893, "loss/aux_loss": 0.0480898505076766, "loss/crossentropy": 2.8660534262657165, "loss/logits": 0.8727923810482026, "step": 39590 }, { "epoch": 0.396, "grad_norm": 13.75, "grad_norm_var": 0.3712076822916667, "learning_rate": 0.0003, "loss": 11.3245, "loss/aux_loss": 0.04807707834988832, "loss/crossentropy": 2.6662731945514677, "loss/logits": 0.8399159997701645, "step": 39600 }, { "epoch": 0.3961, "grad_norm": 13.1875, "grad_norm_var": 0.5163899739583333, "learning_rate": 0.0003, "loss": 11.2979, "loss/aux_loss": 0.04807784650474787, "loss/crossentropy": 2.767361307144165, "loss/logits": 0.8635453820228577, "step": 39610 }, { "epoch": 0.3962, "grad_norm": 13.8125, "grad_norm_var": 0.44217122395833336, "learning_rate": 0.0003, "loss": 11.3056, "loss/aux_loss": 0.048077255859971045, "loss/crossentropy": 2.885894167423248, "loss/logits": 0.85841805934906, "step": 39620 }, { "epoch": 0.3963, "grad_norm": 14.5, "grad_norm_var": 13.565478515625, "learning_rate": 0.0003, "loss": 11.2025, "loss/aux_loss": 0.04807215016335249, "loss/crossentropy": 2.854511320590973, "loss/logits": 0.8788524448871613, "step": 39630 }, { "epoch": 0.3964, "grad_norm": 13.4375, "grad_norm_var": 13.962093098958333, "learning_rate": 0.0003, "loss": 11.3555, "loss/aux_loss": 0.04809546619653702, "loss/crossentropy": 2.764549750089645, "loss/logits": 0.8647037327289582, "step": 39640 }, { "epoch": 0.3965, "grad_norm": 13.0625, "grad_norm_var": 0.37578125, "learning_rate": 0.0003, "loss": 11.1444, "loss/aux_loss": 0.04806444570422173, "loss/crossentropy": 2.7343260645866394, "loss/logits": 0.8285282194614411, "step": 39650 }, { "epoch": 0.3966, "grad_norm": 14.0, "grad_norm_var": 0.4197265625, "learning_rate": 0.0003, "loss": 11.271, "loss/aux_loss": 0.04809067714959383, "loss/crossentropy": 2.6628151297569276, "loss/logits": 0.8381938517093659, "step": 39660 }, { "epoch": 0.3967, "grad_norm": 13.1875, "grad_norm_var": 0.468994140625, "learning_rate": 0.0003, "loss": 11.26, "loss/aux_loss": 0.04807262271642685, "loss/crossentropy": 2.827323651313782, "loss/logits": 0.8302334070205688, "step": 39670 }, { "epoch": 0.3968, "grad_norm": 13.625, "grad_norm_var": 0.2494140625, "learning_rate": 0.0003, "loss": 11.2455, "loss/aux_loss": 0.04807976856827736, "loss/crossentropy": 2.863471567630768, "loss/logits": 0.8567991226911544, "step": 39680 }, { "epoch": 0.3969, "grad_norm": 13.875, "grad_norm_var": 0.7364583333333333, "learning_rate": 0.0003, "loss": 11.3192, "loss/aux_loss": 0.04808229897171259, "loss/crossentropy": 2.7523205041885377, "loss/logits": 0.8334077000617981, "step": 39690 }, { "epoch": 0.397, "grad_norm": 14.9375, "grad_norm_var": 0.7137858072916666, "learning_rate": 0.0003, "loss": 11.1529, "loss/aux_loss": 0.04808085970580578, "loss/crossentropy": 2.8388813376426696, "loss/logits": 0.8468579053878784, "step": 39700 }, { "epoch": 0.3971, "grad_norm": 13.625, "grad_norm_var": 0.283056640625, "learning_rate": 0.0003, "loss": 11.1723, "loss/aux_loss": 0.04806781094521284, "loss/crossentropy": 2.8035045742988585, "loss/logits": 0.8642447054386139, "step": 39710 }, { "epoch": 0.3972, "grad_norm": 12.3125, "grad_norm_var": 0.39842122395833335, "learning_rate": 0.0003, "loss": 11.1913, "loss/aux_loss": 0.04807733949273825, "loss/crossentropy": 2.6676317691802978, "loss/logits": 0.8248802542686462, "step": 39720 }, { "epoch": 0.3973, "grad_norm": 14.1875, "grad_norm_var": 3.1890462239583335, "learning_rate": 0.0003, "loss": 11.419, "loss/aux_loss": 0.048076121136546135, "loss/crossentropy": 2.783367484807968, "loss/logits": 0.8631105840206146, "step": 39730 }, { "epoch": 0.3974, "grad_norm": 12.6875, "grad_norm_var": 0.56953125, "learning_rate": 0.0003, "loss": 11.2753, "loss/aux_loss": 0.04807474035769701, "loss/crossentropy": 2.648731881380081, "loss/logits": 0.8296503305435181, "step": 39740 }, { "epoch": 0.3975, "grad_norm": 13.8125, "grad_norm_var": 0.4315104166666667, "learning_rate": 0.0003, "loss": 11.3218, "loss/aux_loss": 0.048075980879366396, "loss/crossentropy": 2.709260368347168, "loss/logits": 0.8509759098291397, "step": 39750 }, { "epoch": 0.3976, "grad_norm": 16.875, "grad_norm_var": 0.8150390625, "learning_rate": 0.0003, "loss": 11.3212, "loss/aux_loss": 0.048085262067615986, "loss/crossentropy": 2.561914938688278, "loss/logits": 0.8712035864591599, "step": 39760 }, { "epoch": 0.3977, "grad_norm": 14.25, "grad_norm_var": 1.005322265625, "learning_rate": 0.0003, "loss": 11.353, "loss/aux_loss": 0.048075600527226925, "loss/crossentropy": 2.821903848648071, "loss/logits": 0.8585720628499984, "step": 39770 }, { "epoch": 0.3978, "grad_norm": 13.4375, "grad_norm_var": 0.6304524739583334, "learning_rate": 0.0003, "loss": 11.1229, "loss/aux_loss": 0.04808585401624441, "loss/crossentropy": 2.5138413667678834, "loss/logits": 0.7884344816207886, "step": 39780 }, { "epoch": 0.3979, "grad_norm": 13.25, "grad_norm_var": 0.49347330729166666, "learning_rate": 0.0003, "loss": 11.1641, "loss/aux_loss": 0.04808144606649876, "loss/crossentropy": 2.563122200965881, "loss/logits": 0.8173367559909821, "step": 39790 }, { "epoch": 0.398, "grad_norm": 13.75, "grad_norm_var": 0.30388997395833334, "learning_rate": 0.0003, "loss": 10.9662, "loss/aux_loss": 0.048083293810486795, "loss/crossentropy": 2.514444661140442, "loss/logits": 0.8048440098762513, "step": 39800 }, { "epoch": 0.3981, "grad_norm": 14.0625, "grad_norm_var": 21.170572916666668, "learning_rate": 0.0003, "loss": 11.246, "loss/aux_loss": 0.0480785084888339, "loss/crossentropy": 2.842600917816162, "loss/logits": 0.8754805415868759, "step": 39810 }, { "epoch": 0.3982, "grad_norm": 13.8125, "grad_norm_var": 20.811442057291668, "learning_rate": 0.0003, "loss": 11.2918, "loss/aux_loss": 0.048076963610947133, "loss/crossentropy": 2.7346277594566346, "loss/logits": 0.8496310234069824, "step": 39820 }, { "epoch": 0.3983, "grad_norm": 13.625, "grad_norm_var": 1.0786458333333333, "learning_rate": 0.0003, "loss": 11.1372, "loss/aux_loss": 0.048078888468444346, "loss/crossentropy": 2.68115548491478, "loss/logits": 0.8213476330041886, "step": 39830 }, { "epoch": 0.3984, "grad_norm": 14.6875, "grad_norm_var": 0.44998372395833336, "learning_rate": 0.0003, "loss": 11.1956, "loss/aux_loss": 0.048077587597072126, "loss/crossentropy": 2.689275288581848, "loss/logits": 0.8431656301021576, "step": 39840 }, { "epoch": 0.3985, "grad_norm": 13.75, "grad_norm_var": 0.36521809895833335, "learning_rate": 0.0003, "loss": 11.3385, "loss/aux_loss": 0.04807688985019922, "loss/crossentropy": 2.775956404209137, "loss/logits": 0.8679609030485154, "step": 39850 }, { "epoch": 0.3986, "grad_norm": 13.5625, "grad_norm_var": 0.36692708333333335, "learning_rate": 0.0003, "loss": 11.3861, "loss/aux_loss": 0.04808139931410551, "loss/crossentropy": 2.6501555681228637, "loss/logits": 0.8278191804885864, "step": 39860 }, { "epoch": 0.3987, "grad_norm": 13.8125, "grad_norm_var": 0.327587890625, "learning_rate": 0.0003, "loss": 11.3386, "loss/aux_loss": 0.04808625839650631, "loss/crossentropy": 2.752862584590912, "loss/logits": 0.8313428431749343, "step": 39870 }, { "epoch": 0.3988, "grad_norm": 15.0, "grad_norm_var": 0.49973958333333335, "learning_rate": 0.0003, "loss": 11.1974, "loss/aux_loss": 0.04805862847715616, "loss/crossentropy": 2.766802215576172, "loss/logits": 0.8339938923716546, "step": 39880 }, { "epoch": 0.3989, "grad_norm": 14.9375, "grad_norm_var": 0.4676432291666667, "learning_rate": 0.0003, "loss": 11.4879, "loss/aux_loss": 0.0480917839333415, "loss/crossentropy": 2.7454636096954346, "loss/logits": 0.8500055640935897, "step": 39890 }, { "epoch": 0.399, "grad_norm": 14.125, "grad_norm_var": 0.5358723958333333, "learning_rate": 0.0003, "loss": 11.3118, "loss/aux_loss": 0.04806092549115419, "loss/crossentropy": 2.7647584557533262, "loss/logits": 0.8545819491147995, "step": 39900 }, { "epoch": 0.3991, "grad_norm": 15.375, "grad_norm_var": 11.383707682291666, "learning_rate": 0.0003, "loss": 11.2725, "loss/aux_loss": 0.04808534067124128, "loss/crossentropy": 2.7508405685424804, "loss/logits": 0.8540914624929428, "step": 39910 }, { "epoch": 0.3992, "grad_norm": 15.125, "grad_norm_var": 9.876676432291667, "learning_rate": 0.0003, "loss": 11.3259, "loss/aux_loss": 0.04807797037065029, "loss/crossentropy": 2.67775102853775, "loss/logits": 0.8853228390216827, "step": 39920 }, { "epoch": 0.3993, "grad_norm": 13.875, "grad_norm_var": 1.1197265625, "learning_rate": 0.0003, "loss": 11.1964, "loss/aux_loss": 0.04806930739432573, "loss/crossentropy": 2.670240956544876, "loss/logits": 0.8302730619907379, "step": 39930 }, { "epoch": 0.3994, "grad_norm": 14.4375, "grad_norm_var": 1.28984375, "learning_rate": 0.0003, "loss": 11.3559, "loss/aux_loss": 0.04808368775993586, "loss/crossentropy": 2.7974973797798155, "loss/logits": 0.8431978434324264, "step": 39940 }, { "epoch": 0.3995, "grad_norm": 14.1875, "grad_norm_var": 0.9925618489583333, "learning_rate": 0.0003, "loss": 11.3789, "loss/aux_loss": 0.04807193577289581, "loss/crossentropy": 2.8047056078910826, "loss/logits": 0.8324245274066925, "step": 39950 }, { "epoch": 0.3996, "grad_norm": 15.0, "grad_norm_var": 0.4400390625, "learning_rate": 0.0003, "loss": 11.295, "loss/aux_loss": 0.048078315891325475, "loss/crossentropy": 2.907421922683716, "loss/logits": 0.8765753865242004, "step": 39960 }, { "epoch": 0.3997, "grad_norm": 14.8125, "grad_norm_var": 0.3042805989583333, "learning_rate": 0.0003, "loss": 11.2693, "loss/aux_loss": 0.04807358868420124, "loss/crossentropy": 2.6865237832069395, "loss/logits": 0.8515175133943558, "step": 39970 }, { "epoch": 0.3998, "grad_norm": 13.5625, "grad_norm_var": 0.33839518229166665, "learning_rate": 0.0003, "loss": 11.2747, "loss/aux_loss": 0.04806916173547506, "loss/crossentropy": 2.7106878042221068, "loss/logits": 0.8739930838346481, "step": 39980 }, { "epoch": 0.3999, "grad_norm": 13.5625, "grad_norm_var": 0.31640625, "learning_rate": 0.0003, "loss": 11.2064, "loss/aux_loss": 0.04807684104889631, "loss/crossentropy": 2.7278328776359557, "loss/logits": 0.8172307670116424, "step": 39990 }, { "epoch": 0.4, "grad_norm": 13.125, "grad_norm_var": 0.15701497395833333, "learning_rate": 0.0003, "loss": 11.2938, "loss/aux_loss": 0.04807619974017143, "loss/crossentropy": 2.7106220006942747, "loss/logits": 0.8441434442996979, "step": 40000 }, { "epoch": 0.4001, "grad_norm": 14.4375, "grad_norm_var": 0.45358072916666664, "learning_rate": 0.0003, "loss": 11.172, "loss/aux_loss": 0.048076309636235236, "loss/crossentropy": 2.640831911563873, "loss/logits": 0.8590665191411972, "step": 40010 }, { "epoch": 0.4002, "grad_norm": 13.875, "grad_norm_var": 0.545556640625, "learning_rate": 0.0003, "loss": 11.414, "loss/aux_loss": 0.048071987740695474, "loss/crossentropy": 2.869738209247589, "loss/logits": 0.866798147559166, "step": 40020 }, { "epoch": 0.4003, "grad_norm": 13.3125, "grad_norm_var": 0.4275390625, "learning_rate": 0.0003, "loss": 11.328, "loss/aux_loss": 0.048072556219995025, "loss/crossentropy": 2.6954082608222962, "loss/logits": 0.831238204240799, "step": 40030 }, { "epoch": 0.4004, "grad_norm": 14.5625, "grad_norm_var": 0.32962239583333336, "learning_rate": 0.0003, "loss": 11.3336, "loss/aux_loss": 0.04808360133320093, "loss/crossentropy": 2.5370292246341704, "loss/logits": 0.8065064072608947, "step": 40040 }, { "epoch": 0.4005, "grad_norm": 14.25, "grad_norm_var": 0.6997395833333333, "learning_rate": 0.0003, "loss": 11.2486, "loss/aux_loss": 0.04807292725890875, "loss/crossentropy": 2.687315058708191, "loss/logits": 0.8650965690612793, "step": 40050 }, { "epoch": 0.4006, "grad_norm": 14.0, "grad_norm_var": 0.3633951822916667, "learning_rate": 0.0003, "loss": 11.0689, "loss/aux_loss": 0.048067699931561944, "loss/crossentropy": 2.8071807265281676, "loss/logits": 0.8397237300872803, "step": 40060 }, { "epoch": 0.4007, "grad_norm": 14.25, "grad_norm_var": 0.4202473958333333, "learning_rate": 0.0003, "loss": 11.4838, "loss/aux_loss": 0.04807199016213417, "loss/crossentropy": 2.72553288936615, "loss/logits": 0.8591938436031341, "step": 40070 }, { "epoch": 0.4008, "grad_norm": 14.5625, "grad_norm_var": 0.3973795572916667, "learning_rate": 0.0003, "loss": 11.1278, "loss/aux_loss": 0.048072010092437265, "loss/crossentropy": 2.725685381889343, "loss/logits": 0.8271927177906037, "step": 40080 }, { "epoch": 0.4009, "grad_norm": 14.6875, "grad_norm_var": 0.3041015625, "learning_rate": 0.0003, "loss": 11.2294, "loss/aux_loss": 0.048078466951847074, "loss/crossentropy": 2.692145121097565, "loss/logits": 0.8675953030586243, "step": 40090 }, { "epoch": 0.401, "grad_norm": 15.25, "grad_norm_var": 0.2764973958333333, "learning_rate": 0.0003, "loss": 11.3525, "loss/aux_loss": 0.048074528202414514, "loss/crossentropy": 2.8597113609313967, "loss/logits": 0.8692526042461395, "step": 40100 }, { "epoch": 0.4011, "grad_norm": 13.5, "grad_norm_var": 0.262744140625, "learning_rate": 0.0003, "loss": 11.1604, "loss/aux_loss": 0.048076745681464673, "loss/crossentropy": 2.693953478336334, "loss/logits": 0.8202391982078552, "step": 40110 }, { "epoch": 0.4012, "grad_norm": 13.125, "grad_norm_var": 0.21443684895833334, "learning_rate": 0.0003, "loss": 11.1184, "loss/aux_loss": 0.04807984083890915, "loss/crossentropy": 2.7698384284973145, "loss/logits": 0.8715376138687134, "step": 40120 }, { "epoch": 0.4013, "grad_norm": 14.25, "grad_norm_var": 0.7333333333333333, "learning_rate": 0.0003, "loss": 11.2354, "loss/aux_loss": 0.04807922802865505, "loss/crossentropy": 2.58315287232399, "loss/logits": 0.8681640088558197, "step": 40130 }, { "epoch": 0.4014, "grad_norm": 12.9375, "grad_norm_var": 0.46067708333333335, "learning_rate": 0.0003, "loss": 11.2891, "loss/aux_loss": 0.048069264926016333, "loss/crossentropy": 2.6770537555217744, "loss/logits": 0.8575960993766785, "step": 40140 }, { "epoch": 0.4015, "grad_norm": 14.8125, "grad_norm_var": 0.368994140625, "learning_rate": 0.0003, "loss": 11.1327, "loss/aux_loss": 0.048076451011002067, "loss/crossentropy": 2.8826801657676695, "loss/logits": 0.8579352647066116, "step": 40150 }, { "epoch": 0.4016, "grad_norm": 13.8125, "grad_norm_var": 0.5153483072916667, "learning_rate": 0.0003, "loss": 11.2343, "loss/aux_loss": 0.04809125438332558, "loss/crossentropy": 2.618731087446213, "loss/logits": 0.7966024458408356, "step": 40160 }, { "epoch": 0.4017, "grad_norm": 15.0625, "grad_norm_var": 0.2830729166666667, "learning_rate": 0.0003, "loss": 11.1517, "loss/aux_loss": 0.04808546844869852, "loss/crossentropy": 2.646185064315796, "loss/logits": 0.8252136647701264, "step": 40170 }, { "epoch": 0.4018, "grad_norm": 14.3125, "grad_norm_var": 0.3223795572916667, "learning_rate": 0.0003, "loss": 11.1737, "loss/aux_loss": 0.048075793869793416, "loss/crossentropy": 2.519075998663902, "loss/logits": 0.8385014414787293, "step": 40180 }, { "epoch": 0.4019, "grad_norm": 14.25, "grad_norm_var": 0.4786295572916667, "learning_rate": 0.0003, "loss": 11.3532, "loss/aux_loss": 0.0480783874168992, "loss/crossentropy": 2.806821274757385, "loss/logits": 0.841489189863205, "step": 40190 }, { "epoch": 0.402, "grad_norm": 12.6875, "grad_norm_var": 8.168082682291667, "learning_rate": 0.0003, "loss": 11.3093, "loss/aux_loss": 0.04808771722018719, "loss/crossentropy": 2.7427878618240356, "loss/logits": 0.8800740391016006, "step": 40200 }, { "epoch": 0.4021, "grad_norm": 13.3125, "grad_norm_var": 0.27024739583333335, "learning_rate": 0.0003, "loss": 11.1774, "loss/aux_loss": 0.048070460185408594, "loss/crossentropy": 2.5837554335594177, "loss/logits": 0.855616545677185, "step": 40210 }, { "epoch": 0.4022, "grad_norm": 13.8125, "grad_norm_var": 0.6893229166666667, "learning_rate": 0.0003, "loss": 11.2717, "loss/aux_loss": 0.04808332584798336, "loss/crossentropy": 2.7743981003761293, "loss/logits": 0.8714166820049286, "step": 40220 }, { "epoch": 0.4023, "grad_norm": 13.375, "grad_norm_var": 0.743994140625, "learning_rate": 0.0003, "loss": 11.2791, "loss/aux_loss": 0.04807800035923719, "loss/crossentropy": 2.73896102309227, "loss/logits": 0.8382152438163757, "step": 40230 }, { "epoch": 0.4024, "grad_norm": 15.5625, "grad_norm_var": 2.974739583333333, "learning_rate": 0.0003, "loss": 11.133, "loss/aux_loss": 0.0480761282145977, "loss/crossentropy": 2.6960204541683197, "loss/logits": 0.8285995244979858, "step": 40240 }, { "epoch": 0.4025, "grad_norm": 14.6875, "grad_norm_var": 3.1540201822916667, "learning_rate": 0.0003, "loss": 11.3377, "loss/aux_loss": 0.048080798238515854, "loss/crossentropy": 2.774839425086975, "loss/logits": 0.8541697800159455, "step": 40250 }, { "epoch": 0.4026, "grad_norm": 15.9375, "grad_norm_var": 0.5161458333333333, "learning_rate": 0.0003, "loss": 11.2857, "loss/aux_loss": 0.04807197824120522, "loss/crossentropy": 2.7606529712677004, "loss/logits": 0.8577615320682526, "step": 40260 }, { "epoch": 0.4027, "grad_norm": 13.8125, "grad_norm_var": 0.4900390625, "learning_rate": 0.0003, "loss": 11.2021, "loss/aux_loss": 0.04807481914758682, "loss/crossentropy": 2.809233945608139, "loss/logits": 0.8515638172626495, "step": 40270 }, { "epoch": 0.4028, "grad_norm": 15.4375, "grad_norm_var": 0.39920247395833336, "learning_rate": 0.0003, "loss": 11.1816, "loss/aux_loss": 0.04807350169867277, "loss/crossentropy": 2.748124420642853, "loss/logits": 0.8494113475084305, "step": 40280 }, { "epoch": 0.4029, "grad_norm": 15.8125, "grad_norm_var": 0.5233723958333333, "learning_rate": 0.0003, "loss": 11.4207, "loss/aux_loss": 0.04807033948600292, "loss/crossentropy": 2.719420325756073, "loss/logits": 0.8667916238307953, "step": 40290 }, { "epoch": 0.403, "grad_norm": 14.6875, "grad_norm_var": 0.27858072916666665, "learning_rate": 0.0003, "loss": 11.3065, "loss/aux_loss": 0.048076901398599145, "loss/crossentropy": 2.6295212328433992, "loss/logits": 0.8683276027441025, "step": 40300 }, { "epoch": 0.4031, "grad_norm": 16.625, "grad_norm_var": 16.3119140625, "learning_rate": 0.0003, "loss": 11.1657, "loss/aux_loss": 0.048084932193160054, "loss/crossentropy": 2.8191932320594786, "loss/logits": 0.8459627896547317, "step": 40310 }, { "epoch": 0.4032, "grad_norm": 14.0625, "grad_norm_var": 16.589957682291665, "learning_rate": 0.0003, "loss": 11.2229, "loss/aux_loss": 0.04807272832840681, "loss/crossentropy": 2.8436803817749023, "loss/logits": 0.8772078216075897, "step": 40320 }, { "epoch": 0.4033, "grad_norm": 13.125, "grad_norm_var": 0.17962239583333334, "learning_rate": 0.0003, "loss": 11.2633, "loss/aux_loss": 0.04807746745646, "loss/crossentropy": 2.7854455411434174, "loss/logits": 0.8320712119340896, "step": 40330 }, { "epoch": 0.4034, "grad_norm": 13.8125, "grad_norm_var": 0.3485514322916667, "learning_rate": 0.0003, "loss": 11.2072, "loss/aux_loss": 0.04806860648095608, "loss/crossentropy": 2.8128843665122987, "loss/logits": 0.8971195042133331, "step": 40340 }, { "epoch": 0.4035, "grad_norm": 12.9375, "grad_norm_var": 0.5788899739583333, "learning_rate": 0.0003, "loss": 11.2615, "loss/aux_loss": 0.04807734172791243, "loss/crossentropy": 2.719151735305786, "loss/logits": 0.8351715385913849, "step": 40350 }, { "epoch": 0.4036, "grad_norm": 13.8125, "grad_norm_var": 0.6541666666666667, "learning_rate": 0.0003, "loss": 11.217, "loss/aux_loss": 0.04807971119880676, "loss/crossentropy": 2.65439595580101, "loss/logits": 0.8389561653137207, "step": 40360 }, { "epoch": 0.4037, "grad_norm": 12.875, "grad_norm_var": 0.7363932291666667, "learning_rate": 0.0003, "loss": 11.3419, "loss/aux_loss": 0.048067951761186126, "loss/crossentropy": 2.8152174830436705, "loss/logits": 0.8660207390785217, "step": 40370 }, { "epoch": 0.4038, "grad_norm": 14.75, "grad_norm_var": 0.5778645833333333, "learning_rate": 0.0003, "loss": 11.2324, "loss/aux_loss": 0.04808384161442518, "loss/crossentropy": 2.682347524166107, "loss/logits": 0.8513666987419128, "step": 40380 }, { "epoch": 0.4039, "grad_norm": 14.0, "grad_norm_var": 0.4456868489583333, "learning_rate": 0.0003, "loss": 11.1544, "loss/aux_loss": 0.04806650690734386, "loss/crossentropy": 2.735366094112396, "loss/logits": 0.861262845993042, "step": 40390 }, { "epoch": 0.404, "grad_norm": 13.3125, "grad_norm_var": 0.6650390625, "learning_rate": 0.0003, "loss": 11.442, "loss/aux_loss": 0.0480755427852273, "loss/crossentropy": 2.8482566595077516, "loss/logits": 0.8759390920400619, "step": 40400 }, { "epoch": 0.4041, "grad_norm": 13.875, "grad_norm_var": 0.29620768229166666, "learning_rate": 0.0003, "loss": 11.2389, "loss/aux_loss": 0.04807413425296545, "loss/crossentropy": 2.877766025066376, "loss/logits": 0.8527081072330475, "step": 40410 }, { "epoch": 0.4042, "grad_norm": 15.6875, "grad_norm_var": 1855.3328125, "learning_rate": 0.0003, "loss": 11.212, "loss/aux_loss": 0.04808369241654873, "loss/crossentropy": 2.6940404534339906, "loss/logits": 0.8131880909204483, "step": 40420 }, { "epoch": 0.4043, "grad_norm": 15.8125, "grad_norm_var": 1844.8794270833334, "learning_rate": 0.0003, "loss": 11.2414, "loss/aux_loss": 0.048076518811285496, "loss/crossentropy": 2.773360276222229, "loss/logits": 0.8594042271375656, "step": 40430 }, { "epoch": 0.4044, "grad_norm": 13.75, "grad_norm_var": 3.066650390625, "learning_rate": 0.0003, "loss": 11.2978, "loss/aux_loss": 0.048078188113868235, "loss/crossentropy": 2.703492206335068, "loss/logits": 0.8389413356781006, "step": 40440 }, { "epoch": 0.4045, "grad_norm": 13.8125, "grad_norm_var": 0.389306640625, "learning_rate": 0.0003, "loss": 11.2174, "loss/aux_loss": 0.04808528777211905, "loss/crossentropy": 2.682606953382492, "loss/logits": 0.8352272599935532, "step": 40450 }, { "epoch": 0.4046, "grad_norm": 13.1875, "grad_norm_var": 56.533056640625, "learning_rate": 0.0003, "loss": 11.2459, "loss/aux_loss": 0.04807485770434141, "loss/crossentropy": 2.8166627526283263, "loss/logits": 0.907360565662384, "step": 40460 }, { "epoch": 0.4047, "grad_norm": 13.5625, "grad_norm_var": 1.5634765625, "learning_rate": 0.0003, "loss": 11.1853, "loss/aux_loss": 0.048092107847332956, "loss/crossentropy": 2.744275617599487, "loss/logits": 0.8467898726463318, "step": 40470 }, { "epoch": 0.4048, "grad_norm": 13.3125, "grad_norm_var": 0.5330729166666667, "learning_rate": 0.0003, "loss": 11.3042, "loss/aux_loss": 0.04806965496391058, "loss/crossentropy": 2.959182548522949, "loss/logits": 0.8552993059158325, "step": 40480 }, { "epoch": 0.4049, "grad_norm": 12.75, "grad_norm_var": 0.3731608072916667, "learning_rate": 0.0003, "loss": 11.3252, "loss/aux_loss": 0.04807450994849205, "loss/crossentropy": 2.826492565870285, "loss/logits": 0.8594222873449325, "step": 40490 }, { "epoch": 0.405, "grad_norm": 16.25, "grad_norm_var": 1.2333333333333334, "learning_rate": 0.0003, "loss": 11.1622, "loss/aux_loss": 0.04808164164423943, "loss/crossentropy": 2.7364363431930543, "loss/logits": 0.8271835565567016, "step": 40500 }, { "epoch": 0.4051, "grad_norm": 12.875, "grad_norm_var": 0.9535807291666667, "learning_rate": 0.0003, "loss": 11.2023, "loss/aux_loss": 0.048075484670698644, "loss/crossentropy": 2.6764685451984405, "loss/logits": 0.8407616734504699, "step": 40510 }, { "epoch": 0.4052, "grad_norm": 14.375, "grad_norm_var": 53.86302083333333, "learning_rate": 0.0003, "loss": 11.1894, "loss/aux_loss": 0.048087388090789315, "loss/crossentropy": 2.6650672793388366, "loss/logits": 0.8593515366315841, "step": 40520 }, { "epoch": 0.4053, "grad_norm": 19.625, "grad_norm_var": 38.290478515625, "learning_rate": 0.0003, "loss": 11.304, "loss/aux_loss": 0.04808492045849562, "loss/crossentropy": 2.5504296123981476, "loss/logits": 0.8223045408725739, "step": 40530 }, { "epoch": 0.4054, "grad_norm": 13.25, "grad_norm_var": 5.257535807291666, "learning_rate": 0.0003, "loss": 11.2125, "loss/aux_loss": 0.04807108696550131, "loss/crossentropy": 2.6947197139263155, "loss/logits": 0.8361944794654846, "step": 40540 }, { "epoch": 0.4055, "grad_norm": 14.1875, "grad_norm_var": 0.5137858072916667, "learning_rate": 0.0003, "loss": 11.1638, "loss/aux_loss": 0.048065942153334616, "loss/crossentropy": 2.8109546184539793, "loss/logits": 0.829085710644722, "step": 40550 }, { "epoch": 0.4056, "grad_norm": 14.0, "grad_norm_var": 0.6329264322916667, "learning_rate": 0.0003, "loss": 11.1777, "loss/aux_loss": 0.048072488605976106, "loss/crossentropy": 2.786225712299347, "loss/logits": 0.8105708062648773, "step": 40560 }, { "epoch": 0.4057, "grad_norm": 13.5, "grad_norm_var": 6.563395182291667, "learning_rate": 0.0003, "loss": 11.3507, "loss/aux_loss": 0.04808564819395542, "loss/crossentropy": 2.751372504234314, "loss/logits": 0.8564148962497711, "step": 40570 }, { "epoch": 0.4058, "grad_norm": 13.0625, "grad_norm_var": 0.8296223958333333, "learning_rate": 0.0003, "loss": 11.215, "loss/aux_loss": 0.04807268865406513, "loss/crossentropy": 2.8208558201789855, "loss/logits": 0.8638029783964157, "step": 40580 }, { "epoch": 0.4059, "grad_norm": 15.5625, "grad_norm_var": 0.5239420572916667, "learning_rate": 0.0003, "loss": 11.28, "loss/aux_loss": 0.04807143602520227, "loss/crossentropy": 2.665737110376358, "loss/logits": 0.8453109055757523, "step": 40590 }, { "epoch": 0.406, "grad_norm": 15.375, "grad_norm_var": 0.7660807291666667, "learning_rate": 0.0003, "loss": 11.1757, "loss/aux_loss": 0.04807765483856201, "loss/crossentropy": 2.630817985534668, "loss/logits": 0.8509970605373383, "step": 40600 }, { "epoch": 0.4061, "grad_norm": 15.4375, "grad_norm_var": 0.9417805989583333, "learning_rate": 0.0003, "loss": 11.1785, "loss/aux_loss": 0.04807794988155365, "loss/crossentropy": 2.5632822811603546, "loss/logits": 0.8242575019598007, "step": 40610 }, { "epoch": 0.4062, "grad_norm": 16.5, "grad_norm_var": 0.7395182291666667, "learning_rate": 0.0003, "loss": 11.1997, "loss/aux_loss": 0.04807608798146248, "loss/crossentropy": 2.808782720565796, "loss/logits": 0.8660883277654647, "step": 40620 }, { "epoch": 0.4063, "grad_norm": 15.6875, "grad_norm_var": 1.1874348958333334, "learning_rate": 0.0003, "loss": 11.2832, "loss/aux_loss": 0.04806890748441219, "loss/crossentropy": 2.926540124416351, "loss/logits": 0.86942158639431, "step": 40630 }, { "epoch": 0.4064, "grad_norm": 14.75, "grad_norm_var": 131.5337890625, "learning_rate": 0.0003, "loss": 11.3468, "loss/aux_loss": 0.04808539636433125, "loss/crossentropy": 2.8163744449615478, "loss/logits": 0.8719450891017914, "step": 40640 }, { "epoch": 0.4065, "grad_norm": 13.8125, "grad_norm_var": 1.8114583333333334, "learning_rate": 0.0003, "loss": 11.3543, "loss/aux_loss": 0.04807546809315681, "loss/crossentropy": 2.686785101890564, "loss/logits": 0.877001416683197, "step": 40650 }, { "epoch": 0.4066, "grad_norm": 13.125, "grad_norm_var": 0.5839680989583333, "learning_rate": 0.0003, "loss": 11.1328, "loss/aux_loss": 0.04807546567171812, "loss/crossentropy": 2.637565851211548, "loss/logits": 0.8497596830129623, "step": 40660 }, { "epoch": 0.4067, "grad_norm": 12.6875, "grad_norm_var": 0.59375, "learning_rate": 0.0003, "loss": 11.3324, "loss/aux_loss": 0.04807331208139658, "loss/crossentropy": 2.5887813806533813, "loss/logits": 0.8211749017238616, "step": 40670 }, { "epoch": 0.4068, "grad_norm": 14.0, "grad_norm_var": 0.9645182291666666, "learning_rate": 0.0003, "loss": 11.2639, "loss/aux_loss": 0.0480765325948596, "loss/crossentropy": 2.6423826932907106, "loss/logits": 0.852640700340271, "step": 40680 }, { "epoch": 0.4069, "grad_norm": 14.375, "grad_norm_var": 0.35792643229166665, "learning_rate": 0.0003, "loss": 11.0373, "loss/aux_loss": 0.0480792922899127, "loss/crossentropy": 2.738467514514923, "loss/logits": 0.8378350138664246, "step": 40690 }, { "epoch": 0.407, "grad_norm": 14.0, "grad_norm_var": 3.504280598958333, "learning_rate": 0.0003, "loss": 11.5276, "loss/aux_loss": 0.0480826161801815, "loss/crossentropy": 2.7524060189723967, "loss/logits": 0.8455175578594207, "step": 40700 }, { "epoch": 0.4071, "grad_norm": 13.625, "grad_norm_var": 0.8868326822916667, "learning_rate": 0.0003, "loss": 11.1276, "loss/aux_loss": 0.04806787483394146, "loss/crossentropy": 2.55394446849823, "loss/logits": 0.8222862929105759, "step": 40710 }, { "epoch": 0.4072, "grad_norm": 14.625, "grad_norm_var": 0.6627604166666666, "learning_rate": 0.0003, "loss": 11.2755, "loss/aux_loss": 0.04808096699416638, "loss/crossentropy": 2.7554591298103333, "loss/logits": 0.8580325931310654, "step": 40720 }, { "epoch": 0.4073, "grad_norm": 15.0, "grad_norm_var": 1.2700358072916667, "learning_rate": 0.0003, "loss": 11.2255, "loss/aux_loss": 0.04807104617357254, "loss/crossentropy": 2.79397691488266, "loss/logits": 0.8811484813690186, "step": 40730 }, { "epoch": 0.4074, "grad_norm": 15.25, "grad_norm_var": 1.4535807291666667, "learning_rate": 0.0003, "loss": 11.267, "loss/aux_loss": 0.04806910958141088, "loss/crossentropy": 2.837631583213806, "loss/logits": 0.8389610022306442, "step": 40740 }, { "epoch": 0.4075, "grad_norm": 14.25, "grad_norm_var": 1.6921712239583333, "learning_rate": 0.0003, "loss": 11.1731, "loss/aux_loss": 0.04807141162455082, "loss/crossentropy": 2.658374536037445, "loss/logits": 0.834993302822113, "step": 40750 }, { "epoch": 0.4076, "grad_norm": 14.6875, "grad_norm_var": 1.2516764322916667, "learning_rate": 0.0003, "loss": 11.1559, "loss/aux_loss": 0.04808028191328049, "loss/crossentropy": 2.8203831791877745, "loss/logits": 0.849945318698883, "step": 40760 }, { "epoch": 0.4077, "grad_norm": 14.8125, "grad_norm_var": 0.5962076822916667, "learning_rate": 0.0003, "loss": 11.2235, "loss/aux_loss": 0.04808716755360365, "loss/crossentropy": 2.6665258586406706, "loss/logits": 0.8106503874063492, "step": 40770 }, { "epoch": 0.4078, "grad_norm": 14.375, "grad_norm_var": 0.47994791666666664, "learning_rate": 0.0003, "loss": 11.1902, "loss/aux_loss": 0.048067199811339376, "loss/crossentropy": 2.8551108717918394, "loss/logits": 0.8341933101415634, "step": 40780 }, { "epoch": 0.4079, "grad_norm": 13.6875, "grad_norm_var": 0.46404622395833334, "learning_rate": 0.0003, "loss": 11.3637, "loss/aux_loss": 0.048095259629189965, "loss/crossentropy": 2.789784300327301, "loss/logits": 0.8456574827432632, "step": 40790 }, { "epoch": 0.408, "grad_norm": 14.625, "grad_norm_var": 0.36692708333333335, "learning_rate": 0.0003, "loss": 11.353, "loss/aux_loss": 0.04808163102716208, "loss/crossentropy": 2.769635444879532, "loss/logits": 0.8221473515033721, "step": 40800 }, { "epoch": 0.4081, "grad_norm": 14.4375, "grad_norm_var": 0.235791015625, "learning_rate": 0.0003, "loss": 11.0647, "loss/aux_loss": 0.04807380121201277, "loss/crossentropy": 2.565002143383026, "loss/logits": 0.80843525826931, "step": 40810 }, { "epoch": 0.4082, "grad_norm": 13.4375, "grad_norm_var": 0.17630208333333333, "learning_rate": 0.0003, "loss": 11.2184, "loss/aux_loss": 0.04807892981916666, "loss/crossentropy": 2.784299910068512, "loss/logits": 0.8424362123012543, "step": 40820 }, { "epoch": 0.4083, "grad_norm": 15.8125, "grad_norm_var": 0.6523274739583333, "learning_rate": 0.0003, "loss": 11.1706, "loss/aux_loss": 0.048079443722963335, "loss/crossentropy": 2.6999199271202086, "loss/logits": 0.8214786738157273, "step": 40830 }, { "epoch": 0.4084, "grad_norm": 14.25, "grad_norm_var": 0.57265625, "learning_rate": 0.0003, "loss": 11.2877, "loss/aux_loss": 0.04807170610874891, "loss/crossentropy": 2.830400151014328, "loss/logits": 0.8450867384672165, "step": 40840 }, { "epoch": 0.4085, "grad_norm": 14.125, "grad_norm_var": 0.26555989583333334, "learning_rate": 0.0003, "loss": 11.3547, "loss/aux_loss": 0.048065755516290665, "loss/crossentropy": 2.723574197292328, "loss/logits": 0.8342130482196808, "step": 40850 }, { "epoch": 0.4086, "grad_norm": 13.875, "grad_norm_var": 0.4051432291666667, "learning_rate": 0.0003, "loss": 11.1298, "loss/aux_loss": 0.0480728205293417, "loss/crossentropy": 2.7340495467185972, "loss/logits": 0.8313911110162735, "step": 40860 }, { "epoch": 0.4087, "grad_norm": 15.4375, "grad_norm_var": 0.48880208333333336, "learning_rate": 0.0003, "loss": 11.1538, "loss/aux_loss": 0.04808170907199383, "loss/crossentropy": 2.6556981980800627, "loss/logits": 0.8474443554878235, "step": 40870 }, { "epoch": 0.4088, "grad_norm": 14.25, "grad_norm_var": 0.42962239583333334, "learning_rate": 0.0003, "loss": 11.0847, "loss/aux_loss": 0.04807398784905672, "loss/crossentropy": 2.612995356321335, "loss/logits": 0.8423753798007965, "step": 40880 }, { "epoch": 0.4089, "grad_norm": 16.375, "grad_norm_var": 1.2555826822916667, "learning_rate": 0.0003, "loss": 11.2143, "loss/aux_loss": 0.04807846713811159, "loss/crossentropy": 2.7332702219486236, "loss/logits": 0.864795908331871, "step": 40890 }, { "epoch": 0.409, "grad_norm": 13.6875, "grad_norm_var": 10.117692057291666, "learning_rate": 0.0003, "loss": 11.2962, "loss/aux_loss": 0.0480788690969348, "loss/crossentropy": 2.662673217058182, "loss/logits": 0.8612865924835205, "step": 40900 }, { "epoch": 0.4091, "grad_norm": 14.5, "grad_norm_var": 0.31131184895833336, "learning_rate": 0.0003, "loss": 11.1592, "loss/aux_loss": 0.04808458536863327, "loss/crossentropy": 2.793060463666916, "loss/logits": 0.8244423866271973, "step": 40910 }, { "epoch": 0.4092, "grad_norm": 18.0, "grad_norm_var": 1.2817545572916667, "learning_rate": 0.0003, "loss": 11.3325, "loss/aux_loss": 0.048066679015755655, "loss/crossentropy": 2.822656285762787, "loss/logits": 0.8820368677377701, "step": 40920 }, { "epoch": 0.4093, "grad_norm": 14.1875, "grad_norm_var": 1.397900390625, "learning_rate": 0.0003, "loss": 11.1258, "loss/aux_loss": 0.04807949531823397, "loss/crossentropy": 2.837810254096985, "loss/logits": 0.8587910264730454, "step": 40930 }, { "epoch": 0.4094, "grad_norm": 15.5625, "grad_norm_var": 0.8061848958333333, "learning_rate": 0.0003, "loss": 11.2212, "loss/aux_loss": 0.04807252325117588, "loss/crossentropy": 2.607957309484482, "loss/logits": 0.8224194586277008, "step": 40940 }, { "epoch": 0.4095, "grad_norm": 13.6875, "grad_norm_var": 0.6403483072916667, "learning_rate": 0.0003, "loss": 11.1002, "loss/aux_loss": 0.048077072761952874, "loss/crossentropy": 2.6987807989120483, "loss/logits": 0.8149879366159439, "step": 40950 }, { "epoch": 0.4096, "grad_norm": 14.125, "grad_norm_var": 1.686962890625, "learning_rate": 0.0003, "loss": 11.3598, "loss/aux_loss": 0.048078333213925364, "loss/crossentropy": 2.849722057580948, "loss/logits": 0.8561419308185577, "step": 40960 }, { "epoch": 0.4097, "grad_norm": 13.5, "grad_norm_var": 1.6792805989583333, "learning_rate": 0.0003, "loss": 11.2066, "loss/aux_loss": 0.04807558581233025, "loss/crossentropy": 2.783593249320984, "loss/logits": 0.8805976897478104, "step": 40970 }, { "epoch": 0.4098, "grad_norm": 16.75, "grad_norm_var": 0.8942057291666666, "learning_rate": 0.0003, "loss": 11.2792, "loss/aux_loss": 0.04807433895766735, "loss/crossentropy": 2.619139677286148, "loss/logits": 0.8347267210483551, "step": 40980 }, { "epoch": 0.4099, "grad_norm": 16.125, "grad_norm_var": 1.1270833333333334, "learning_rate": 0.0003, "loss": 11.2101, "loss/aux_loss": 0.04807988088577986, "loss/crossentropy": 2.6135978281497954, "loss/logits": 0.8248639732599259, "step": 40990 }, { "epoch": 0.41, "grad_norm": 15.125, "grad_norm_var": 1.1157389322916667, "learning_rate": 0.0003, "loss": 11.1235, "loss/aux_loss": 0.04808098264038563, "loss/crossentropy": 2.711561453342438, "loss/logits": 0.8340432167053222, "step": 41000 }, { "epoch": 0.4101, "grad_norm": 15.0, "grad_norm_var": 0.7129557291666667, "learning_rate": 0.0003, "loss": 11.141, "loss/aux_loss": 0.04807994924485683, "loss/crossentropy": 2.672397243976593, "loss/logits": 0.8090786308050155, "step": 41010 }, { "epoch": 0.4102, "grad_norm": 13.3125, "grad_norm_var": 0.5226399739583333, "learning_rate": 0.0003, "loss": 11.2172, "loss/aux_loss": 0.04806915018707514, "loss/crossentropy": 2.7911486864089965, "loss/logits": 0.8176318496465683, "step": 41020 }, { "epoch": 0.4103, "grad_norm": 13.8125, "grad_norm_var": 0.27029622395833336, "learning_rate": 0.0003, "loss": 11.0946, "loss/aux_loss": 0.04807211291044951, "loss/crossentropy": 2.7278398156166075, "loss/logits": 0.837305772304535, "step": 41030 }, { "epoch": 0.4104, "grad_norm": 13.75, "grad_norm_var": 0.372900390625, "learning_rate": 0.0003, "loss": 11.1227, "loss/aux_loss": 0.04808787349611521, "loss/crossentropy": 2.6341135680675505, "loss/logits": 0.8195017322897911, "step": 41040 }, { "epoch": 0.4105, "grad_norm": 13.1875, "grad_norm_var": 2.482796223958333, "learning_rate": 0.0003, "loss": 11.2554, "loss/aux_loss": 0.04807057995349169, "loss/crossentropy": 2.843722766637802, "loss/logits": 0.8595122218132019, "step": 41050 }, { "epoch": 0.4106, "grad_norm": 14.125, "grad_norm_var": 2.5208333333333335, "learning_rate": 0.0003, "loss": 11.1853, "loss/aux_loss": 0.04808246102184057, "loss/crossentropy": 2.6582208454608915, "loss/logits": 0.8346160590648651, "step": 41060 }, { "epoch": 0.4107, "grad_norm": 15.3125, "grad_norm_var": 0.71171875, "learning_rate": 0.0003, "loss": 11.3008, "loss/aux_loss": 0.04806876853108406, "loss/crossentropy": 2.6965928435325623, "loss/logits": 0.8599708110094071, "step": 41070 }, { "epoch": 0.4108, "grad_norm": 14.125, "grad_norm_var": 0.8223307291666667, "learning_rate": 0.0003, "loss": 11.0954, "loss/aux_loss": 0.048069384321570395, "loss/crossentropy": 2.714770442247391, "loss/logits": 0.8336487352848053, "step": 41080 }, { "epoch": 0.4109, "grad_norm": 13.8125, "grad_norm_var": 0.6469889322916667, "learning_rate": 0.0003, "loss": 11.2912, "loss/aux_loss": 0.04807863663882017, "loss/crossentropy": 2.7766244173049928, "loss/logits": 0.8574995458126068, "step": 41090 }, { "epoch": 0.411, "grad_norm": 14.75, "grad_norm_var": 1.143603515625, "learning_rate": 0.0003, "loss": 10.9719, "loss/aux_loss": 0.04807050917297602, "loss/crossentropy": 2.6750208139419556, "loss/logits": 0.8000975757837295, "step": 41100 }, { "epoch": 0.4111, "grad_norm": 14.0625, "grad_norm_var": 1.7415201822916666, "learning_rate": 0.0003, "loss": 11.0615, "loss/aux_loss": 0.048084990307688716, "loss/crossentropy": 2.8587915897369385, "loss/logits": 0.859082692861557, "step": 41110 }, { "epoch": 0.4112, "grad_norm": 12.9375, "grad_norm_var": 0.441650390625, "learning_rate": 0.0003, "loss": 11.2455, "loss/aux_loss": 0.04807133413851261, "loss/crossentropy": 2.6838557541370394, "loss/logits": 0.8341993808746337, "step": 41120 }, { "epoch": 0.4113, "grad_norm": 14.5, "grad_norm_var": 0.8056640625, "learning_rate": 0.0003, "loss": 11.3317, "loss/aux_loss": 0.04806279819458723, "loss/crossentropy": 2.934316062927246, "loss/logits": 0.852023234963417, "step": 41130 }, { "epoch": 0.4114, "grad_norm": 13.1875, "grad_norm_var": 0.5887858072916666, "learning_rate": 0.0003, "loss": 11.1059, "loss/aux_loss": 0.0480830904096365, "loss/crossentropy": 2.8082756876945494, "loss/logits": 0.814395149052143, "step": 41140 }, { "epoch": 0.4115, "grad_norm": 14.1875, "grad_norm_var": 3.896354166666667, "learning_rate": 0.0003, "loss": 11.2447, "loss/aux_loss": 0.048078673891723156, "loss/crossentropy": 2.7707399845123293, "loss/logits": 0.8573799431324005, "step": 41150 }, { "epoch": 0.4116, "grad_norm": 13.75, "grad_norm_var": 3.89609375, "learning_rate": 0.0003, "loss": 11.2491, "loss/aux_loss": 0.0480762155726552, "loss/crossentropy": 2.9101900935173033, "loss/logits": 0.8609474629163743, "step": 41160 }, { "epoch": 0.4117, "grad_norm": 14.1875, "grad_norm_var": 1.0238932291666667, "learning_rate": 0.0003, "loss": 11.1961, "loss/aux_loss": 0.048080237582325935, "loss/crossentropy": 2.606840658187866, "loss/logits": 0.8273939996957779, "step": 41170 }, { "epoch": 0.4118, "grad_norm": 14.5, "grad_norm_var": 0.1619140625, "learning_rate": 0.0003, "loss": 11.2267, "loss/aux_loss": 0.04807858131825924, "loss/crossentropy": 2.684722530841827, "loss/logits": 0.840096390247345, "step": 41180 }, { "epoch": 0.4119, "grad_norm": 14.375, "grad_norm_var": 0.6071451822916667, "learning_rate": 0.0003, "loss": 11.2679, "loss/aux_loss": 0.04808023814111948, "loss/crossentropy": 2.697424811124802, "loss/logits": 0.8633444011211395, "step": 41190 }, { "epoch": 0.412, "grad_norm": 14.0625, "grad_norm_var": 0.28274739583333336, "learning_rate": 0.0003, "loss": 11.0866, "loss/aux_loss": 0.04807175993919373, "loss/crossentropy": 2.634129375219345, "loss/logits": 0.8138844251632691, "step": 41200 }, { "epoch": 0.4121, "grad_norm": 14.1875, "grad_norm_var": 0.5567057291666667, "learning_rate": 0.0003, "loss": 11.0525, "loss/aux_loss": 0.048080427944660185, "loss/crossentropy": 2.6594059228897096, "loss/logits": 0.8541360199451447, "step": 41210 }, { "epoch": 0.4122, "grad_norm": 14.4375, "grad_norm_var": 1.0149576822916666, "learning_rate": 0.0003, "loss": 11.2432, "loss/aux_loss": 0.04808235038071871, "loss/crossentropy": 2.797593057155609, "loss/logits": 0.886846199631691, "step": 41220 }, { "epoch": 0.4123, "grad_norm": 13.4375, "grad_norm_var": 0.8604166666666667, "learning_rate": 0.0003, "loss": 11.0625, "loss/aux_loss": 0.04806336238980293, "loss/crossentropy": 2.474899399280548, "loss/logits": 0.7937245279550552, "step": 41230 }, { "epoch": 0.4124, "grad_norm": 13.625, "grad_norm_var": 0.5230305989583334, "learning_rate": 0.0003, "loss": 11.2082, "loss/aux_loss": 0.048086734302341935, "loss/crossentropy": 2.6535877227783202, "loss/logits": 0.8522655874490738, "step": 41240 }, { "epoch": 0.4125, "grad_norm": 13.25, "grad_norm_var": 13.269124348958334, "learning_rate": 0.0003, "loss": 11.2381, "loss/aux_loss": 0.048068624176085, "loss/crossentropy": 2.7681680560112, "loss/logits": 0.8193393349647522, "step": 41250 }, { "epoch": 0.4126, "grad_norm": 13.0, "grad_norm_var": 14.011962890625, "learning_rate": 0.0003, "loss": 11.1778, "loss/aux_loss": 0.04807403292506933, "loss/crossentropy": 2.693704390525818, "loss/logits": 0.861787760257721, "step": 41260 }, { "epoch": 0.4127, "grad_norm": 14.6875, "grad_norm_var": 0.5087076822916666, "learning_rate": 0.0003, "loss": 11.2949, "loss/aux_loss": 0.04806809015572071, "loss/crossentropy": 2.527338033914566, "loss/logits": 0.8236821800470352, "step": 41270 }, { "epoch": 0.4128, "grad_norm": 13.9375, "grad_norm_var": 0.5129557291666667, "learning_rate": 0.0003, "loss": 11.22, "loss/aux_loss": 0.04808800853788853, "loss/crossentropy": 2.7711110353469848, "loss/logits": 0.8424245923757553, "step": 41280 }, { "epoch": 0.4129, "grad_norm": 14.375, "grad_norm_var": 0.23878580729166668, "learning_rate": 0.0003, "loss": 11.1104, "loss/aux_loss": 0.04806681144982576, "loss/crossentropy": 2.7462151408195496, "loss/logits": 0.878471040725708, "step": 41290 }, { "epoch": 0.413, "grad_norm": 13.6875, "grad_norm_var": 0.24386393229166667, "learning_rate": 0.0003, "loss": 11.2019, "loss/aux_loss": 0.048082958348095416, "loss/crossentropy": 2.857834202051163, "loss/logits": 0.8067145884037018, "step": 41300 }, { "epoch": 0.4131, "grad_norm": 13.6875, "grad_norm_var": 0.2322265625, "learning_rate": 0.0003, "loss": 11.0486, "loss/aux_loss": 0.048069640435278414, "loss/crossentropy": 2.736476743221283, "loss/logits": 0.8467221200466156, "step": 41310 }, { "epoch": 0.4132, "grad_norm": 13.4375, "grad_norm_var": 1.1536295572916666, "learning_rate": 0.0003, "loss": 11.3639, "loss/aux_loss": 0.048079471290111545, "loss/crossentropy": 2.822791963815689, "loss/logits": 0.891073489189148, "step": 41320 }, { "epoch": 0.4133, "grad_norm": 16.25, "grad_norm_var": 1.016650390625, "learning_rate": 0.0003, "loss": 11.2361, "loss/aux_loss": 0.04807520154863596, "loss/crossentropy": 2.7339930176734923, "loss/logits": 0.8536212533712387, "step": 41330 }, { "epoch": 0.4134, "grad_norm": 14.5, "grad_norm_var": 0.8587076822916667, "learning_rate": 0.0003, "loss": 11.0921, "loss/aux_loss": 0.048079288192093374, "loss/crossentropy": 2.6249010980129244, "loss/logits": 0.8314791291952133, "step": 41340 }, { "epoch": 0.4135, "grad_norm": 15.3125, "grad_norm_var": 0.3551432291666667, "learning_rate": 0.0003, "loss": 11.1947, "loss/aux_loss": 0.04807145558297634, "loss/crossentropy": 2.6940083622932436, "loss/logits": 0.8695379942655563, "step": 41350 }, { "epoch": 0.4136, "grad_norm": 13.625, "grad_norm_var": 0.5160807291666667, "learning_rate": 0.0003, "loss": 11.1861, "loss/aux_loss": 0.04807656276971102, "loss/crossentropy": 2.5916694521903993, "loss/logits": 0.844970840215683, "step": 41360 }, { "epoch": 0.4137, "grad_norm": 13.8125, "grad_norm_var": 0.5186848958333333, "learning_rate": 0.0003, "loss": 11.362, "loss/aux_loss": 0.04807461556047201, "loss/crossentropy": 2.6328794419765473, "loss/logits": 0.8364063590765, "step": 41370 }, { "epoch": 0.4138, "grad_norm": 14.6875, "grad_norm_var": 0.237744140625, "learning_rate": 0.0003, "loss": 11.201, "loss/aux_loss": 0.04807669036090374, "loss/crossentropy": 2.700971281528473, "loss/logits": 0.8363195568323135, "step": 41380 }, { "epoch": 0.4139, "grad_norm": 14.0625, "grad_norm_var": 0.34524739583333336, "learning_rate": 0.0003, "loss": 11.3147, "loss/aux_loss": 0.04808150418102741, "loss/crossentropy": 2.7341397404670715, "loss/logits": 0.8220134526491165, "step": 41390 }, { "epoch": 0.414, "grad_norm": 13.75, "grad_norm_var": 0.3525390625, "learning_rate": 0.0003, "loss": 11.303, "loss/aux_loss": 0.04807320982217789, "loss/crossentropy": 2.7358368039131165, "loss/logits": 0.8331804543733596, "step": 41400 }, { "epoch": 0.4141, "grad_norm": 15.0625, "grad_norm_var": 1.0388020833333333, "learning_rate": 0.0003, "loss": 11.2284, "loss/aux_loss": 0.04808115866035223, "loss/crossentropy": 2.6575556874275206, "loss/logits": 0.8795315742492675, "step": 41410 }, { "epoch": 0.4142, "grad_norm": 14.5625, "grad_norm_var": 0.8374348958333333, "learning_rate": 0.0003, "loss": 11.2057, "loss/aux_loss": 0.04808524567633867, "loss/crossentropy": 2.639425593614578, "loss/logits": 0.8381777286529541, "step": 41420 }, { "epoch": 0.4143, "grad_norm": 13.375, "grad_norm_var": 0.7231608072916667, "learning_rate": 0.0003, "loss": 11.2503, "loss/aux_loss": 0.04807372502982617, "loss/crossentropy": 2.7047315418720244, "loss/logits": 0.8312118053436279, "step": 41430 }, { "epoch": 0.4144, "grad_norm": 14.3125, "grad_norm_var": 0.2953125, "learning_rate": 0.0003, "loss": 11.1798, "loss/aux_loss": 0.048076724819839003, "loss/crossentropy": 2.651015895605087, "loss/logits": 0.840973848104477, "step": 41440 }, { "epoch": 0.4145, "grad_norm": 13.625, "grad_norm_var": 0.5132649739583334, "learning_rate": 0.0003, "loss": 11.2366, "loss/aux_loss": 0.04806795343756676, "loss/crossentropy": 2.8247196197509767, "loss/logits": 0.8217742323875428, "step": 41450 }, { "epoch": 0.4146, "grad_norm": 14.1875, "grad_norm_var": 0.46027018229166666, "learning_rate": 0.0003, "loss": 11.0727, "loss/aux_loss": 0.04807949978858232, "loss/crossentropy": 2.5847329258918763, "loss/logits": 0.8199368387460708, "step": 41460 }, { "epoch": 0.4147, "grad_norm": 13.8125, "grad_norm_var": 0.314697265625, "learning_rate": 0.0003, "loss": 11.2538, "loss/aux_loss": 0.04806741494685411, "loss/crossentropy": 2.833824622631073, "loss/logits": 0.8637136548757554, "step": 41470 }, { "epoch": 0.4148, "grad_norm": 13.25, "grad_norm_var": 0.16380208333333332, "learning_rate": 0.0003, "loss": 11.1968, "loss/aux_loss": 0.04808125514537096, "loss/crossentropy": 2.6305019736289976, "loss/logits": 0.8417465597391128, "step": 41480 }, { "epoch": 0.4149, "grad_norm": 25.375, "grad_norm_var": 8.584749348958333, "learning_rate": 0.0003, "loss": 11.1024, "loss/aux_loss": 0.0480692395940423, "loss/crossentropy": 2.78939009308815, "loss/logits": 0.8338617235422134, "step": 41490 }, { "epoch": 0.415, "grad_norm": 15.625, "grad_norm_var": 8.855452473958334, "learning_rate": 0.0003, "loss": 11.1944, "loss/aux_loss": 0.04809089172631502, "loss/crossentropy": 2.633754700422287, "loss/logits": 0.8260821491479874, "step": 41500 }, { "epoch": 0.4151, "grad_norm": 15.125, "grad_norm_var": 1.2046223958333333, "learning_rate": 0.0003, "loss": 11.1703, "loss/aux_loss": 0.048080214858055116, "loss/crossentropy": 2.872915321588516, "loss/logits": 0.847340676188469, "step": 41510 }, { "epoch": 0.4152, "grad_norm": 15.4375, "grad_norm_var": 1.2669108072916666, "learning_rate": 0.0003, "loss": 11.0204, "loss/aux_loss": 0.04807033985853195, "loss/crossentropy": 2.750588357448578, "loss/logits": 0.8613759696483612, "step": 41520 }, { "epoch": 0.4153, "grad_norm": 13.3125, "grad_norm_var": 0.5091145833333334, "learning_rate": 0.0003, "loss": 11.2335, "loss/aux_loss": 0.04808351546525955, "loss/crossentropy": 2.702675199508667, "loss/logits": 0.8650101304054261, "step": 41530 }, { "epoch": 0.4154, "grad_norm": 15.75, "grad_norm_var": 0.498291015625, "learning_rate": 0.0003, "loss": 11.2771, "loss/aux_loss": 0.048078674264252184, "loss/crossentropy": 2.69321893453598, "loss/logits": 0.8795695304870605, "step": 41540 }, { "epoch": 0.4155, "grad_norm": 13.9375, "grad_norm_var": 1.3921223958333333, "learning_rate": 0.0003, "loss": 11.2882, "loss/aux_loss": 0.0480771878734231, "loss/crossentropy": 2.849241554737091, "loss/logits": 0.8312081456184387, "step": 41550 }, { "epoch": 0.4156, "grad_norm": 13.875, "grad_norm_var": 1.506884765625, "learning_rate": 0.0003, "loss": 11.2257, "loss/aux_loss": 0.04807361774146557, "loss/crossentropy": 2.63561954498291, "loss/logits": 0.8598904728889465, "step": 41560 }, { "epoch": 0.4157, "grad_norm": 13.5, "grad_norm_var": 0.570947265625, "learning_rate": 0.0003, "loss": 11.2278, "loss/aux_loss": 0.04808267373591661, "loss/crossentropy": 2.668130397796631, "loss/logits": 0.8185748666524887, "step": 41570 }, { "epoch": 0.4158, "grad_norm": 14.4375, "grad_norm_var": 0.470166015625, "learning_rate": 0.0003, "loss": 11.1306, "loss/aux_loss": 0.04807985983788967, "loss/crossentropy": 2.642909526824951, "loss/logits": 0.8164368301630021, "step": 41580 }, { "epoch": 0.4159, "grad_norm": 14.6875, "grad_norm_var": 0.4727701822916667, "learning_rate": 0.0003, "loss": 11.1133, "loss/aux_loss": 0.04805450364947319, "loss/crossentropy": 2.6953525304794312, "loss/logits": 0.8492616504430771, "step": 41590 }, { "epoch": 0.416, "grad_norm": 15.375, "grad_norm_var": 50.5171875, "learning_rate": 0.0003, "loss": 11.4191, "loss/aux_loss": 0.048086438328027725, "loss/crossentropy": 2.696820414066315, "loss/logits": 0.8366290658712388, "step": 41600 }, { "epoch": 0.4161, "grad_norm": 14.875, "grad_norm_var": 0.4183430989583333, "learning_rate": 0.0003, "loss": 11.178, "loss/aux_loss": 0.04807772561907768, "loss/crossentropy": 2.730132043361664, "loss/logits": 0.8532672584056854, "step": 41610 }, { "epoch": 0.4162, "grad_norm": 13.6875, "grad_norm_var": 0.45625, "learning_rate": 0.0003, "loss": 11.2598, "loss/aux_loss": 0.04807155448943377, "loss/crossentropy": 2.8405850529670715, "loss/logits": 0.8775646090507507, "step": 41620 }, { "epoch": 0.4163, "grad_norm": 14.25, "grad_norm_var": 3.5476399739583333, "learning_rate": 0.0003, "loss": 11.392, "loss/aux_loss": 0.048077188059687616, "loss/crossentropy": 2.6101203083992006, "loss/logits": 0.8570107728242874, "step": 41630 }, { "epoch": 0.4164, "grad_norm": 15.0625, "grad_norm_var": 2.9973795572916666, "learning_rate": 0.0003, "loss": 11.3679, "loss/aux_loss": 0.0480748301371932, "loss/crossentropy": 2.6226659595966337, "loss/logits": 0.8122975617647171, "step": 41640 }, { "epoch": 0.4165, "grad_norm": 15.0, "grad_norm_var": 0.42233072916666664, "learning_rate": 0.0003, "loss": 11.2247, "loss/aux_loss": 0.04807394295930863, "loss/crossentropy": 2.675206708908081, "loss/logits": 0.8600716292858124, "step": 41650 }, { "epoch": 0.4166, "grad_norm": 14.1875, "grad_norm_var": 0.31951497395833334, "learning_rate": 0.0003, "loss": 11.3153, "loss/aux_loss": 0.04808421637862921, "loss/crossentropy": 2.7114802062511445, "loss/logits": 0.8375776976346969, "step": 41660 }, { "epoch": 0.4167, "grad_norm": 13.6875, "grad_norm_var": 3.4596354166666665, "learning_rate": 0.0003, "loss": 11.1687, "loss/aux_loss": 0.048071696795523165, "loss/crossentropy": 2.6611205101013184, "loss/logits": 0.8244008392095565, "step": 41670 }, { "epoch": 0.4168, "grad_norm": 15.4375, "grad_norm_var": 3.692122395833333, "learning_rate": 0.0003, "loss": 11.1502, "loss/aux_loss": 0.048073360323905946, "loss/crossentropy": 2.6855955958366393, "loss/logits": 0.8512616366147995, "step": 41680 }, { "epoch": 0.4169, "grad_norm": 12.75, "grad_norm_var": 0.6722493489583333, "learning_rate": 0.0003, "loss": 11.2267, "loss/aux_loss": 0.04807688854634762, "loss/crossentropy": 2.8731314897537232, "loss/logits": 0.8567210525274277, "step": 41690 }, { "epoch": 0.417, "grad_norm": 14.3125, "grad_norm_var": 0.6924479166666667, "learning_rate": 0.0003, "loss": 11.4032, "loss/aux_loss": 0.04807659108191729, "loss/crossentropy": 2.819017004966736, "loss/logits": 0.8509372651576996, "step": 41700 }, { "epoch": 0.4171, "grad_norm": 14.4375, "grad_norm_var": 0.627978515625, "learning_rate": 0.0003, "loss": 11.1128, "loss/aux_loss": 0.04807431064546108, "loss/crossentropy": 2.587670737504959, "loss/logits": 0.8356228917837143, "step": 41710 }, { "epoch": 0.4172, "grad_norm": 13.9375, "grad_norm_var": 0.8402180989583333, "learning_rate": 0.0003, "loss": 11.3018, "loss/aux_loss": 0.048075992986559866, "loss/crossentropy": 2.6217161655426025, "loss/logits": 0.8416286587715149, "step": 41720 }, { "epoch": 0.4173, "grad_norm": 12.5, "grad_norm_var": 1.4369140625, "learning_rate": 0.0003, "loss": 11.2056, "loss/aux_loss": 0.048069687001407145, "loss/crossentropy": 2.705968415737152, "loss/logits": 0.8546758621931076, "step": 41730 }, { "epoch": 0.4174, "grad_norm": 14.6875, "grad_norm_var": 1.9641764322916666, "learning_rate": 0.0003, "loss": 11.0999, "loss/aux_loss": 0.04808080643415451, "loss/crossentropy": 2.729911983013153, "loss/logits": 0.8501161009073257, "step": 41740 }, { "epoch": 0.4175, "grad_norm": 14.6875, "grad_norm_var": 4.536051432291667, "learning_rate": 0.0003, "loss": 11.2002, "loss/aux_loss": 0.0480776721611619, "loss/crossentropy": 2.471704250574112, "loss/logits": 0.8195729270577431, "step": 41750 }, { "epoch": 0.4176, "grad_norm": 15.75, "grad_norm_var": 0.6348307291666667, "learning_rate": 0.0003, "loss": 11.1175, "loss/aux_loss": 0.04808681774884462, "loss/crossentropy": 2.607026255130768, "loss/logits": 0.8329499930143356, "step": 41760 }, { "epoch": 0.4177, "grad_norm": 13.6875, "grad_norm_var": 0.5440104166666667, "learning_rate": 0.0003, "loss": 11.2311, "loss/aux_loss": 0.048072993755340576, "loss/crossentropy": 2.548939037322998, "loss/logits": 0.8520324468612671, "step": 41770 }, { "epoch": 0.4178, "grad_norm": 14.0, "grad_norm_var": 0.30130208333333336, "learning_rate": 0.0003, "loss": 11.1902, "loss/aux_loss": 0.04807652682065964, "loss/crossentropy": 2.70545357465744, "loss/logits": 0.8085658639669419, "step": 41780 }, { "epoch": 0.4179, "grad_norm": 14.1875, "grad_norm_var": 0.785791015625, "learning_rate": 0.0003, "loss": 11.2129, "loss/aux_loss": 0.04808551725000143, "loss/crossentropy": 2.7015809535980226, "loss/logits": 0.8569774001836776, "step": 41790 }, { "epoch": 0.418, "grad_norm": 13.5625, "grad_norm_var": 0.7012858072916667, "learning_rate": 0.0003, "loss": 11.1873, "loss/aux_loss": 0.04806754495948553, "loss/crossentropy": 2.7126809656620026, "loss/logits": 0.8494727402925492, "step": 41800 }, { "epoch": 0.4181, "grad_norm": 15.4375, "grad_norm_var": 0.334375, "learning_rate": 0.0003, "loss": 11.2021, "loss/aux_loss": 0.04807236008346081, "loss/crossentropy": 2.6086998522281646, "loss/logits": 0.854032838344574, "step": 41810 }, { "epoch": 0.4182, "grad_norm": 14.4375, "grad_norm_var": 0.279541015625, "learning_rate": 0.0003, "loss": 11.2497, "loss/aux_loss": 0.048076076060533525, "loss/crossentropy": 2.7394097089767455, "loss/logits": 0.8530152827501297, "step": 41820 }, { "epoch": 0.4183, "grad_norm": 13.5625, "grad_norm_var": 0.3042805989583333, "learning_rate": 0.0003, "loss": 11.2131, "loss/aux_loss": 0.048075619898736474, "loss/crossentropy": 2.693272775411606, "loss/logits": 0.8179311394691468, "step": 41830 }, { "epoch": 0.4184, "grad_norm": 14.875, "grad_norm_var": 0.6973958333333333, "learning_rate": 0.0003, "loss": 11.2869, "loss/aux_loss": 0.048083697259426114, "loss/crossentropy": 2.731438684463501, "loss/logits": 0.8441088706254959, "step": 41840 }, { "epoch": 0.4185, "grad_norm": 14.8125, "grad_norm_var": 0.5348795572916667, "learning_rate": 0.0003, "loss": 11.2476, "loss/aux_loss": 0.04807062391191721, "loss/crossentropy": 2.7216593980789185, "loss/logits": 0.8569325089454651, "step": 41850 }, { "epoch": 0.4186, "grad_norm": 14.375, "grad_norm_var": 0.318994140625, "learning_rate": 0.0003, "loss": 11.1622, "loss/aux_loss": 0.04807673562318086, "loss/crossentropy": 2.7688432216644285, "loss/logits": 0.868677607178688, "step": 41860 }, { "epoch": 0.4187, "grad_norm": 14.375, "grad_norm_var": 0.549853515625, "learning_rate": 0.0003, "loss": 11.1659, "loss/aux_loss": 0.04807185679674149, "loss/crossentropy": 2.743114960193634, "loss/logits": 0.8016538411378861, "step": 41870 }, { "epoch": 0.4188, "grad_norm": 14.4375, "grad_norm_var": 0.633056640625, "learning_rate": 0.0003, "loss": 11.4034, "loss/aux_loss": 0.04806502480059862, "loss/crossentropy": 2.7593605399131773, "loss/logits": 0.8573271870613098, "step": 41880 }, { "epoch": 0.4189, "grad_norm": 14.8125, "grad_norm_var": 0.4806640625, "learning_rate": 0.0003, "loss": 11.2272, "loss/aux_loss": 0.048074369132518766, "loss/crossentropy": 2.729891151189804, "loss/logits": 0.877751037478447, "step": 41890 }, { "epoch": 0.419, "grad_norm": 15.5, "grad_norm_var": 0.5794108072916667, "learning_rate": 0.0003, "loss": 11.2764, "loss/aux_loss": 0.04806425198912621, "loss/crossentropy": 2.7819466471672056, "loss/logits": 0.8514897584915161, "step": 41900 }, { "epoch": 0.4191, "grad_norm": 15.375, "grad_norm_var": 0.35149739583333334, "learning_rate": 0.0003, "loss": 11.1274, "loss/aux_loss": 0.048081301525235175, "loss/crossentropy": 2.7535706400871276, "loss/logits": 0.862214544415474, "step": 41910 }, { "epoch": 0.4192, "grad_norm": 13.5, "grad_norm_var": 0.5299479166666666, "learning_rate": 0.0003, "loss": 11.2672, "loss/aux_loss": 0.0480682335793972, "loss/crossentropy": 2.62730153799057, "loss/logits": 0.8124941200017929, "step": 41920 }, { "epoch": 0.4193, "grad_norm": 14.875, "grad_norm_var": 0.52109375, "learning_rate": 0.0003, "loss": 11.2227, "loss/aux_loss": 0.04808680079877377, "loss/crossentropy": 2.6323707461357118, "loss/logits": 0.8262597292661666, "step": 41930 }, { "epoch": 0.4194, "grad_norm": 14.5, "grad_norm_var": 0.290625, "learning_rate": 0.0003, "loss": 11.0658, "loss/aux_loss": 0.048071213997900486, "loss/crossentropy": 2.8325400054454803, "loss/logits": 0.8364947497844696, "step": 41940 }, { "epoch": 0.4195, "grad_norm": 13.75, "grad_norm_var": 1.2986979166666666, "learning_rate": 0.0003, "loss": 11.3139, "loss/aux_loss": 0.048080979473888875, "loss/crossentropy": 2.669241964817047, "loss/logits": 0.853290992975235, "step": 41950 }, { "epoch": 0.4196, "grad_norm": 13.5625, "grad_norm_var": 1.460791015625, "learning_rate": 0.0003, "loss": 11.1538, "loss/aux_loss": 0.048079793155193326, "loss/crossentropy": 2.7789398312568663, "loss/logits": 0.8316181004047394, "step": 41960 }, { "epoch": 0.4197, "grad_norm": 15.0625, "grad_norm_var": 0.3932291666666667, "learning_rate": 0.0003, "loss": 11.411, "loss/aux_loss": 0.04805813655257225, "loss/crossentropy": 2.850307047367096, "loss/logits": 0.8470256596803665, "step": 41970 }, { "epoch": 0.4198, "grad_norm": 14.125, "grad_norm_var": 0.590087890625, "learning_rate": 0.0003, "loss": 11.2341, "loss/aux_loss": 0.04809781014919281, "loss/crossentropy": 2.73042853474617, "loss/logits": 0.8153778612613678, "step": 41980 }, { "epoch": 0.4199, "grad_norm": 14.875, "grad_norm_var": 1.03515625, "learning_rate": 0.0003, "loss": 11.2463, "loss/aux_loss": 0.048078482411801814, "loss/crossentropy": 2.771644139289856, "loss/logits": 0.8651615500450134, "step": 41990 }, { "epoch": 0.42, "grad_norm": 13.9375, "grad_norm_var": 1.0833170572916666, "learning_rate": 0.0003, "loss": 11.2312, "loss/aux_loss": 0.04807273019105196, "loss/crossentropy": 2.685261583328247, "loss/logits": 0.8609261780977249, "step": 42000 }, { "epoch": 0.4201, "grad_norm": 15.1875, "grad_norm_var": 0.44581705729166665, "learning_rate": 0.0003, "loss": 11.23, "loss/aux_loss": 0.048074769973754886, "loss/crossentropy": 2.605684131383896, "loss/logits": 0.8102221429347992, "step": 42010 }, { "epoch": 0.4202, "grad_norm": 13.5625, "grad_norm_var": 0.5038899739583333, "learning_rate": 0.0003, "loss": 11.2462, "loss/aux_loss": 0.04807645082473755, "loss/crossentropy": 2.828294575214386, "loss/logits": 0.8302065849304199, "step": 42020 }, { "epoch": 0.4203, "grad_norm": 14.1875, "grad_norm_var": 0.4630208333333333, "learning_rate": 0.0003, "loss": 11.2458, "loss/aux_loss": 0.04807501658797264, "loss/crossentropy": 2.6530769050121306, "loss/logits": 0.8315863937139512, "step": 42030 }, { "epoch": 0.4204, "grad_norm": 14.3125, "grad_norm_var": 0.361962890625, "learning_rate": 0.0003, "loss": 11.1785, "loss/aux_loss": 0.0480683233588934, "loss/crossentropy": 2.774942231178284, "loss/logits": 0.8557955861091614, "step": 42040 }, { "epoch": 0.4205, "grad_norm": 15.8125, "grad_norm_var": 1.2765625, "learning_rate": 0.0003, "loss": 11.1787, "loss/aux_loss": 0.04808812700212002, "loss/crossentropy": 2.7394619226455688, "loss/logits": 0.8685553550720215, "step": 42050 }, { "epoch": 0.4206, "grad_norm": 15.3125, "grad_norm_var": 1.025244140625, "learning_rate": 0.0003, "loss": 11.141, "loss/aux_loss": 0.04807517230510712, "loss/crossentropy": 2.646501141786575, "loss/logits": 0.8364500343799591, "step": 42060 }, { "epoch": 0.4207, "grad_norm": 15.4375, "grad_norm_var": 6.862223307291667, "learning_rate": 0.0003, "loss": 11.1773, "loss/aux_loss": 0.048075311444699766, "loss/crossentropy": 2.95753812789917, "loss/logits": 0.8557416766881942, "step": 42070 }, { "epoch": 0.4208, "grad_norm": 13.25, "grad_norm_var": 2.1541015625, "learning_rate": 0.0003, "loss": 11.1619, "loss/aux_loss": 0.04807597082108259, "loss/crossentropy": 2.747110295295715, "loss/logits": 0.8265171319246292, "step": 42080 }, { "epoch": 0.4209, "grad_norm": 13.3125, "grad_norm_var": 0.35651041666666666, "learning_rate": 0.0003, "loss": 11.2256, "loss/aux_loss": 0.048070704378187654, "loss/crossentropy": 2.6773354530334474, "loss/logits": 0.8335412830114365, "step": 42090 }, { "epoch": 0.421, "grad_norm": 14.0, "grad_norm_var": 0.7171223958333334, "learning_rate": 0.0003, "loss": 11.3195, "loss/aux_loss": 0.04807304628193378, "loss/crossentropy": 2.7497352182865145, "loss/logits": 0.8508964985609054, "step": 42100 }, { "epoch": 0.4211, "grad_norm": 15.5, "grad_norm_var": 3.123177083333333, "learning_rate": 0.0003, "loss": 11.2999, "loss/aux_loss": 0.0480742210522294, "loss/crossentropy": 2.733784317970276, "loss/logits": 0.8499576389789582, "step": 42110 }, { "epoch": 0.4212, "grad_norm": 14.625, "grad_norm_var": 3.007926432291667, "learning_rate": 0.0003, "loss": 11.1922, "loss/aux_loss": 0.04807363022118807, "loss/crossentropy": 2.723239630460739, "loss/logits": 0.8264488846063613, "step": 42120 }, { "epoch": 0.4213, "grad_norm": 14.3125, "grad_norm_var": 0.3167805989583333, "learning_rate": 0.0003, "loss": 11.2428, "loss/aux_loss": 0.04807157460600138, "loss/crossentropy": 2.740461474657059, "loss/logits": 0.8402502328157425, "step": 42130 }, { "epoch": 0.4214, "grad_norm": 14.5, "grad_norm_var": 0.317822265625, "learning_rate": 0.0003, "loss": 11.3028, "loss/aux_loss": 0.04807231556624174, "loss/crossentropy": 2.737111634016037, "loss/logits": 0.8425071030855179, "step": 42140 }, { "epoch": 0.4215, "grad_norm": 13.875, "grad_norm_var": 0.472509765625, "learning_rate": 0.0003, "loss": 11.1031, "loss/aux_loss": 0.048074459098279475, "loss/crossentropy": 2.6582253992557527, "loss/logits": 0.8121216595172882, "step": 42150 }, { "epoch": 0.4216, "grad_norm": 15.1875, "grad_norm_var": 0.7161295572916667, "learning_rate": 0.0003, "loss": 11.2392, "loss/aux_loss": 0.04806816950440407, "loss/crossentropy": 2.573642885684967, "loss/logits": 0.8212338477373123, "step": 42160 }, { "epoch": 0.4217, "grad_norm": 14.25, "grad_norm_var": 0.7730305989583334, "learning_rate": 0.0003, "loss": 11.1449, "loss/aux_loss": 0.048084497638046744, "loss/crossentropy": 2.7038708448410036, "loss/logits": 0.8355427473783493, "step": 42170 }, { "epoch": 0.4218, "grad_norm": 13.8125, "grad_norm_var": 0.9291015625, "learning_rate": 0.0003, "loss": 11.1934, "loss/aux_loss": 0.048070290684700014, "loss/crossentropy": 2.932389295101166, "loss/logits": 0.8793515950441361, "step": 42180 }, { "epoch": 0.4219, "grad_norm": 13.75, "grad_norm_var": 0.6808430989583333, "learning_rate": 0.0003, "loss": 11.2596, "loss/aux_loss": 0.04806995559483766, "loss/crossentropy": 2.848974609375, "loss/logits": 0.887108889222145, "step": 42190 }, { "epoch": 0.422, "grad_norm": 14.3125, "grad_norm_var": 0.36139322916666666, "learning_rate": 0.0003, "loss": 11.0734, "loss/aux_loss": 0.04808047562837601, "loss/crossentropy": 2.7708349883556367, "loss/logits": 0.8702079772949218, "step": 42200 }, { "epoch": 0.4221, "grad_norm": 13.6875, "grad_norm_var": 0.3931640625, "learning_rate": 0.0003, "loss": 11.0987, "loss/aux_loss": 0.04806628059595823, "loss/crossentropy": 2.75937157869339, "loss/logits": 0.8796556890010834, "step": 42210 }, { "epoch": 0.4222, "grad_norm": 14.6875, "grad_norm_var": 0.392822265625, "learning_rate": 0.0003, "loss": 11.296, "loss/aux_loss": 0.0480836084112525, "loss/crossentropy": 2.686858814954758, "loss/logits": 0.8247960180044174, "step": 42220 }, { "epoch": 0.4223, "grad_norm": 14.5625, "grad_norm_var": 0.758837890625, "learning_rate": 0.0003, "loss": 11.1395, "loss/aux_loss": 0.048082459904253486, "loss/crossentropy": 2.888724946975708, "loss/logits": 0.840698453783989, "step": 42230 }, { "epoch": 0.4224, "grad_norm": 15.4375, "grad_norm_var": 2.5072265625, "learning_rate": 0.0003, "loss": 11.3138, "loss/aux_loss": 0.04806939046829939, "loss/crossentropy": 2.7978740334510803, "loss/logits": 0.8392590701580047, "step": 42240 }, { "epoch": 0.4225, "grad_norm": 14.6875, "grad_norm_var": 1.006494140625, "learning_rate": 0.0003, "loss": 11.2709, "loss/aux_loss": 0.048084151558578016, "loss/crossentropy": 2.694107210636139, "loss/logits": 0.8511635422706604, "step": 42250 }, { "epoch": 0.4226, "grad_norm": 13.8125, "grad_norm_var": 0.407275390625, "learning_rate": 0.0003, "loss": 11.1712, "loss/aux_loss": 0.04807136338204145, "loss/crossentropy": 2.8651691317558288, "loss/logits": 0.8362912058830261, "step": 42260 }, { "epoch": 0.4227, "grad_norm": 13.8125, "grad_norm_var": 0.4098307291666667, "learning_rate": 0.0003, "loss": 11.301, "loss/aux_loss": 0.04806836117058992, "loss/crossentropy": 2.8899617552757264, "loss/logits": 0.8420800924301147, "step": 42270 }, { "epoch": 0.4228, "grad_norm": 14.0, "grad_norm_var": 0.4931640625, "learning_rate": 0.0003, "loss": 11.2737, "loss/aux_loss": 0.048084226250648496, "loss/crossentropy": 2.731697905063629, "loss/logits": 0.8795881062746048, "step": 42280 }, { "epoch": 0.4229, "grad_norm": 14.5625, "grad_norm_var": 0.454150390625, "learning_rate": 0.0003, "loss": 11.0708, "loss/aux_loss": 0.048074116744101045, "loss/crossentropy": 2.6348713278770446, "loss/logits": 0.8292662829160691, "step": 42290 }, { "epoch": 0.423, "grad_norm": 14.625, "grad_norm_var": 0.9536295572916667, "learning_rate": 0.0003, "loss": 11.159, "loss/aux_loss": 0.04808458648622036, "loss/crossentropy": 2.6694875180721285, "loss/logits": 0.8599371790885926, "step": 42300 }, { "epoch": 0.4231, "grad_norm": 15.3125, "grad_norm_var": 0.34308268229166666, "learning_rate": 0.0003, "loss": 11.0977, "loss/aux_loss": 0.04808226190507412, "loss/crossentropy": 2.7208563089370728, "loss/logits": 0.8512184768915176, "step": 42310 }, { "epoch": 0.4232, "grad_norm": 14.1875, "grad_norm_var": 0.48631184895833335, "learning_rate": 0.0003, "loss": 11.0967, "loss/aux_loss": 0.04806763473898172, "loss/crossentropy": 2.7380436182022097, "loss/logits": 0.8502931475639344, "step": 42320 }, { "epoch": 0.4233, "grad_norm": 13.5625, "grad_norm_var": 0.8145182291666667, "learning_rate": 0.0003, "loss": 11.0071, "loss/aux_loss": 0.048071845807135104, "loss/crossentropy": 2.7763596057891844, "loss/logits": 0.8628914952278137, "step": 42330 }, { "epoch": 0.4234, "grad_norm": 14.375, "grad_norm_var": 2.943603515625, "learning_rate": 0.0003, "loss": 10.9369, "loss/aux_loss": 0.04808341804891825, "loss/crossentropy": 2.7524753272533418, "loss/logits": 0.829263374209404, "step": 42340 }, { "epoch": 0.4235, "grad_norm": 12.8125, "grad_norm_var": 2.596337890625, "learning_rate": 0.0003, "loss": 11.1457, "loss/aux_loss": 0.04806433003395796, "loss/crossentropy": 2.7292973041534423, "loss/logits": 0.8203612565994263, "step": 42350 }, { "epoch": 0.4236, "grad_norm": 15.1875, "grad_norm_var": 0.830322265625, "learning_rate": 0.0003, "loss": 11.224, "loss/aux_loss": 0.048083837144076824, "loss/crossentropy": 2.7010737299919128, "loss/logits": 0.8339439123868942, "step": 42360 }, { "epoch": 0.4237, "grad_norm": 17.75, "grad_norm_var": 132.42135416666667, "learning_rate": 0.0003, "loss": 11.3355, "loss/aux_loss": 0.04808040820062161, "loss/crossentropy": 2.748473286628723, "loss/logits": 0.8279530495405197, "step": 42370 }, { "epoch": 0.4238, "grad_norm": 14.125, "grad_norm_var": 134.461962890625, "learning_rate": 0.0003, "loss": 11.1351, "loss/aux_loss": 0.04808204211294651, "loss/crossentropy": 2.726352107524872, "loss/logits": 0.8515573889017105, "step": 42380 }, { "epoch": 0.4239, "grad_norm": 14.3125, "grad_norm_var": 0.803369140625, "learning_rate": 0.0003, "loss": 11.1572, "loss/aux_loss": 0.04808192439377308, "loss/crossentropy": 2.593681287765503, "loss/logits": 0.8259778410196305, "step": 42390 }, { "epoch": 0.424, "grad_norm": 14.6875, "grad_norm_var": 0.7634765625, "learning_rate": 0.0003, "loss": 11.1284, "loss/aux_loss": 0.04806700516492128, "loss/crossentropy": 2.8635907411575316, "loss/logits": 0.8664378643035888, "step": 42400 }, { "epoch": 0.4241, "grad_norm": 14.75, "grad_norm_var": 1.23203125, "learning_rate": 0.0003, "loss": 11.085, "loss/aux_loss": 0.048076603934168814, "loss/crossentropy": 2.645522326231003, "loss/logits": 0.8040447324514389, "step": 42410 }, { "epoch": 0.4242, "grad_norm": 14.25, "grad_norm_var": 0.42120768229166666, "learning_rate": 0.0003, "loss": 11.172, "loss/aux_loss": 0.04807595741003752, "loss/crossentropy": 2.6108031809329986, "loss/logits": 0.8477170407772064, "step": 42420 }, { "epoch": 0.4243, "grad_norm": 14.0, "grad_norm_var": 0.3681640625, "learning_rate": 0.0003, "loss": 11.1488, "loss/aux_loss": 0.04806941151618958, "loss/crossentropy": 2.6864835619926453, "loss/logits": 0.8509948909282684, "step": 42430 }, { "epoch": 0.4244, "grad_norm": 13.375, "grad_norm_var": 0.464697265625, "learning_rate": 0.0003, "loss": 11.3073, "loss/aux_loss": 0.04807401150465011, "loss/crossentropy": 2.864110291004181, "loss/logits": 0.8419386476278305, "step": 42440 }, { "epoch": 0.4245, "grad_norm": 14.6875, "grad_norm_var": 1.1081868489583333, "learning_rate": 0.0003, "loss": 11.109, "loss/aux_loss": 0.04807962235063314, "loss/crossentropy": 2.665892016887665, "loss/logits": 0.8466892153024673, "step": 42450 }, { "epoch": 0.4246, "grad_norm": 14.8125, "grad_norm_var": 0.9630045572916667, "learning_rate": 0.0003, "loss": 11.1531, "loss/aux_loss": 0.048076581209897995, "loss/crossentropy": 2.7316180169582367, "loss/logits": 0.8452065467834473, "step": 42460 }, { "epoch": 0.4247, "grad_norm": 13.4375, "grad_norm_var": 0.8898274739583333, "learning_rate": 0.0003, "loss": 11.1563, "loss/aux_loss": 0.04807441867887974, "loss/crossentropy": 2.7544716358184815, "loss/logits": 0.8261888146400451, "step": 42470 }, { "epoch": 0.4248, "grad_norm": 15.3125, "grad_norm_var": 0.7120930989583333, "learning_rate": 0.0003, "loss": 11.3566, "loss/aux_loss": 0.04807336274534464, "loss/crossentropy": 2.8429744720458983, "loss/logits": 0.8528753489255905, "step": 42480 }, { "epoch": 0.4249, "grad_norm": 13.9375, "grad_norm_var": 0.7978515625, "learning_rate": 0.0003, "loss": 11.2925, "loss/aux_loss": 0.04807384237647057, "loss/crossentropy": 2.8680081605911254, "loss/logits": 0.8669378757476807, "step": 42490 }, { "epoch": 0.425, "grad_norm": 15.0, "grad_norm_var": 1.0207682291666667, "learning_rate": 0.0003, "loss": 11.3442, "loss/aux_loss": 0.04807020053267479, "loss/crossentropy": 2.7715225398540495, "loss/logits": 0.8685531944036484, "step": 42500 }, { "epoch": 0.4251, "grad_norm": 14.3125, "grad_norm_var": 0.5525390625, "learning_rate": 0.0003, "loss": 11.0004, "loss/aux_loss": 0.048080707900226116, "loss/crossentropy": 2.6281380653381348, "loss/logits": 0.8210119009017944, "step": 42510 }, { "epoch": 0.4252, "grad_norm": 15.125, "grad_norm_var": 0.4669108072916667, "learning_rate": 0.0003, "loss": 11.2617, "loss/aux_loss": 0.048080474697053434, "loss/crossentropy": 2.61398241519928, "loss/logits": 0.8228483706712723, "step": 42520 }, { "epoch": 0.4253, "grad_norm": 12.6875, "grad_norm_var": 1.0641764322916667, "learning_rate": 0.0003, "loss": 11.0279, "loss/aux_loss": 0.04806278124451637, "loss/crossentropy": 2.4613637685775758, "loss/logits": 0.8284233272075653, "step": 42530 }, { "epoch": 0.4254, "grad_norm": 15.375, "grad_norm_var": 67.23683268229166, "learning_rate": 0.0003, "loss": 10.9845, "loss/aux_loss": 0.048084072582423684, "loss/crossentropy": 2.721170890331268, "loss/logits": 0.8446434617042542, "step": 42540 }, { "epoch": 0.4255, "grad_norm": 13.75, "grad_norm_var": 67.12888997395834, "learning_rate": 0.0003, "loss": 11.0296, "loss/aux_loss": 0.048082982562482356, "loss/crossentropy": 2.51130490899086, "loss/logits": 0.7815639197826385, "step": 42550 }, { "epoch": 0.4256, "grad_norm": 14.875, "grad_norm_var": 15.626302083333334, "learning_rate": 0.0003, "loss": 11.2562, "loss/aux_loss": 0.04806295093148947, "loss/crossentropy": 2.7693334579467774, "loss/logits": 0.8466608166694641, "step": 42560 }, { "epoch": 0.4257, "grad_norm": 14.5, "grad_norm_var": 15.836572265625, "learning_rate": 0.0003, "loss": 11.2921, "loss/aux_loss": 0.04808557108044624, "loss/crossentropy": 2.6812986373901366, "loss/logits": 0.841954892873764, "step": 42570 }, { "epoch": 0.4258, "grad_norm": 14.3125, "grad_norm_var": 0.978369140625, "learning_rate": 0.0003, "loss": 11.252, "loss/aux_loss": 0.048081206530332564, "loss/crossentropy": 2.687948948144913, "loss/logits": 0.8682124525308609, "step": 42580 }, { "epoch": 0.4259, "grad_norm": 15.4375, "grad_norm_var": 0.7228515625, "learning_rate": 0.0003, "loss": 11.1966, "loss/aux_loss": 0.0480744980275631, "loss/crossentropy": 2.668538528680801, "loss/logits": 0.8255208849906921, "step": 42590 }, { "epoch": 0.426, "grad_norm": 14.75, "grad_norm_var": 0.9821451822916667, "learning_rate": 0.0003, "loss": 11.1911, "loss/aux_loss": 0.04807751923799515, "loss/crossentropy": 2.661841082572937, "loss/logits": 0.8377692878246308, "step": 42600 }, { "epoch": 0.4261, "grad_norm": 15.6875, "grad_norm_var": 0.9484375, "learning_rate": 0.0003, "loss": 11.3501, "loss/aux_loss": 0.04808114971965551, "loss/crossentropy": 2.902574121952057, "loss/logits": 0.8539043575525284, "step": 42610 }, { "epoch": 0.4262, "grad_norm": 13.5625, "grad_norm_var": 0.9540201822916666, "learning_rate": 0.0003, "loss": 11.1911, "loss/aux_loss": 0.048068844713270664, "loss/crossentropy": 2.7202011168003084, "loss/logits": 0.8331751823425293, "step": 42620 }, { "epoch": 0.4263, "grad_norm": 13.9375, "grad_norm_var": 0.24140625, "learning_rate": 0.0003, "loss": 11.3027, "loss/aux_loss": 0.048070861399173735, "loss/crossentropy": 2.889012670516968, "loss/logits": 0.8704992473125458, "step": 42630 }, { "epoch": 0.4264, "grad_norm": 15.125, "grad_norm_var": 0.6125, "learning_rate": 0.0003, "loss": 11.1455, "loss/aux_loss": 0.04808156322687864, "loss/crossentropy": 2.7083074033260344, "loss/logits": 0.8564732939004898, "step": 42640 }, { "epoch": 0.4265, "grad_norm": 13.625, "grad_norm_var": 0.61484375, "learning_rate": 0.0003, "loss": 11.2121, "loss/aux_loss": 0.048069928959012034, "loss/crossentropy": 2.73454931974411, "loss/logits": 0.8486188590526581, "step": 42650 }, { "epoch": 0.4266, "grad_norm": 12.875, "grad_norm_var": 0.33984375, "learning_rate": 0.0003, "loss": 11.1947, "loss/aux_loss": 0.04808113239705562, "loss/crossentropy": 2.792097818851471, "loss/logits": 0.8488514006137848, "step": 42660 }, { "epoch": 0.4267, "grad_norm": 13.5, "grad_norm_var": 1.0273274739583333, "learning_rate": 0.0003, "loss": 11.0177, "loss/aux_loss": 0.04808440897613764, "loss/crossentropy": 2.567507326602936, "loss/logits": 0.8366076290607453, "step": 42670 }, { "epoch": 0.4268, "grad_norm": 13.3125, "grad_norm_var": 0.37941080729166665, "learning_rate": 0.0003, "loss": 11.0272, "loss/aux_loss": 0.048066616617143156, "loss/crossentropy": 2.7124005913734437, "loss/logits": 0.8338280886411666, "step": 42680 }, { "epoch": 0.4269, "grad_norm": 14.0625, "grad_norm_var": 0.2353515625, "learning_rate": 0.0003, "loss": 11.178, "loss/aux_loss": 0.048080355115234855, "loss/crossentropy": 2.7802015602588654, "loss/logits": 0.8339311271905899, "step": 42690 }, { "epoch": 0.427, "grad_norm": 15.0625, "grad_norm_var": 0.48956705729166666, "learning_rate": 0.0003, "loss": 11.0708, "loss/aux_loss": 0.048078592866659164, "loss/crossentropy": 2.6688977122306823, "loss/logits": 0.7996150583028794, "step": 42700 }, { "epoch": 0.4271, "grad_norm": 14.625, "grad_norm_var": 6.747379557291667, "learning_rate": 0.0003, "loss": 11.1714, "loss/aux_loss": 0.048077211156487464, "loss/crossentropy": 2.715333503484726, "loss/logits": 0.8404178529977798, "step": 42710 }, { "epoch": 0.4272, "grad_norm": 13.5625, "grad_norm_var": 0.27734375, "learning_rate": 0.0003, "loss": 11.0897, "loss/aux_loss": 0.04808191582560539, "loss/crossentropy": 2.709022808074951, "loss/logits": 0.8491158545017242, "step": 42720 }, { "epoch": 0.4273, "grad_norm": 12.9375, "grad_norm_var": 0.72578125, "learning_rate": 0.0003, "loss": 11.0368, "loss/aux_loss": 0.048076875135302545, "loss/crossentropy": 2.7228225231170655, "loss/logits": 0.8443385303020478, "step": 42730 }, { "epoch": 0.4274, "grad_norm": 16.25, "grad_norm_var": 1.623291015625, "learning_rate": 0.0003, "loss": 11.2172, "loss/aux_loss": 0.04807112403213978, "loss/crossentropy": 2.7902682304382322, "loss/logits": 0.863338616490364, "step": 42740 }, { "epoch": 0.4275, "grad_norm": 14.25, "grad_norm_var": 1.578125, "learning_rate": 0.0003, "loss": 11.2052, "loss/aux_loss": 0.048067062720656396, "loss/crossentropy": 2.757642900943756, "loss/logits": 0.8275944203138351, "step": 42750 }, { "epoch": 0.4276, "grad_norm": 15.8125, "grad_norm_var": 0.6692708333333334, "learning_rate": 0.0003, "loss": 11.0956, "loss/aux_loss": 0.04808202516287565, "loss/crossentropy": 2.728620332479477, "loss/logits": 0.8508161783218384, "step": 42760 }, { "epoch": 0.4277, "grad_norm": 13.4375, "grad_norm_var": 0.5416015625, "learning_rate": 0.0003, "loss": 11.0911, "loss/aux_loss": 0.04807655792683363, "loss/crossentropy": 2.6857302367687224, "loss/logits": 0.8251177936792373, "step": 42770 }, { "epoch": 0.4278, "grad_norm": 14.25, "grad_norm_var": 0.3346354166666667, "learning_rate": 0.0003, "loss": 11.2185, "loss/aux_loss": 0.048068818263709547, "loss/crossentropy": 2.7882674872875213, "loss/logits": 0.8415611743927002, "step": 42780 }, { "epoch": 0.4279, "grad_norm": 13.6875, "grad_norm_var": 0.393603515625, "learning_rate": 0.0003, "loss": 11.1943, "loss/aux_loss": 0.048084525391459465, "loss/crossentropy": 2.8016934394836426, "loss/logits": 0.8504360228776932, "step": 42790 }, { "epoch": 0.428, "grad_norm": 15.5625, "grad_norm_var": 0.8610514322916667, "learning_rate": 0.0003, "loss": 11.0533, "loss/aux_loss": 0.0480705926194787, "loss/crossentropy": 2.7852961301803587, "loss/logits": 0.8427935183048249, "step": 42800 }, { "epoch": 0.4281, "grad_norm": 14.375, "grad_norm_var": 0.5283854166666667, "learning_rate": 0.0003, "loss": 11.1688, "loss/aux_loss": 0.048075289465487, "loss/crossentropy": 2.851541531085968, "loss/logits": 0.8648887991905212, "step": 42810 }, { "epoch": 0.4282, "grad_norm": 15.5625, "grad_norm_var": 0.6245930989583334, "learning_rate": 0.0003, "loss": 11.1358, "loss/aux_loss": 0.04808152187615633, "loss/crossentropy": 2.728837323188782, "loss/logits": 0.8126325309276581, "step": 42820 }, { "epoch": 0.4283, "grad_norm": 14.0625, "grad_norm_var": 5.513655598958334, "learning_rate": 0.0003, "loss": 11.3191, "loss/aux_loss": 0.048070829920470716, "loss/crossentropy": 2.9222333669662475, "loss/logits": 0.8803920924663544, "step": 42830 }, { "epoch": 0.4284, "grad_norm": 14.125, "grad_norm_var": 5.320556640625, "learning_rate": 0.0003, "loss": 11.1858, "loss/aux_loss": 0.04807902462780476, "loss/crossentropy": 2.750357246398926, "loss/logits": 0.8127260476350784, "step": 42840 }, { "epoch": 0.4285, "grad_norm": 15.125, "grad_norm_var": 0.3848795572916667, "learning_rate": 0.0003, "loss": 11.2752, "loss/aux_loss": 0.048070596531033516, "loss/crossentropy": 2.6048742115497587, "loss/logits": 0.8551869869232178, "step": 42850 }, { "epoch": 0.4286, "grad_norm": 14.25, "grad_norm_var": 0.5794108072916667, "learning_rate": 0.0003, "loss": 11.132, "loss/aux_loss": 0.048073590733110905, "loss/crossentropy": 2.8263909220695496, "loss/logits": 0.8633887559175492, "step": 42860 }, { "epoch": 0.4287, "grad_norm": 13.375, "grad_norm_var": 1.9244140625, "learning_rate": 0.0003, "loss": 11.3004, "loss/aux_loss": 0.04807244185358286, "loss/crossentropy": 2.7152198910713197, "loss/logits": 0.8596496641635895, "step": 42870 }, { "epoch": 0.4288, "grad_norm": 15.25, "grad_norm_var": 1.720556640625, "learning_rate": 0.0003, "loss": 11.3347, "loss/aux_loss": 0.04807446151971817, "loss/crossentropy": 2.721926176548004, "loss/logits": 0.8560007959604263, "step": 42880 }, { "epoch": 0.4289, "grad_norm": 14.0, "grad_norm_var": 2.203108723958333, "learning_rate": 0.0003, "loss": 11.0291, "loss/aux_loss": 0.04807375371456146, "loss/crossentropy": 2.8105222463607786, "loss/logits": 0.8752927869558335, "step": 42890 }, { "epoch": 0.429, "grad_norm": 15.75, "grad_norm_var": 1.431494140625, "learning_rate": 0.0003, "loss": 10.9854, "loss/aux_loss": 0.04806910492479801, "loss/crossentropy": 2.690297359228134, "loss/logits": 0.837578096985817, "step": 42900 }, { "epoch": 0.4291, "grad_norm": 13.25, "grad_norm_var": 1.439697265625, "learning_rate": 0.0003, "loss": 11.1458, "loss/aux_loss": 0.04807917848229408, "loss/crossentropy": 2.812906527519226, "loss/logits": 0.839617344737053, "step": 42910 }, { "epoch": 0.4292, "grad_norm": 14.125, "grad_norm_var": 0.37185872395833336, "learning_rate": 0.0003, "loss": 11.2282, "loss/aux_loss": 0.04807562492787838, "loss/crossentropy": 2.778420227766037, "loss/logits": 0.8231628626585007, "step": 42920 }, { "epoch": 0.4293, "grad_norm": 14.5625, "grad_norm_var": 0.30520833333333336, "learning_rate": 0.0003, "loss": 10.9583, "loss/aux_loss": 0.048068897984921935, "loss/crossentropy": 2.7826973259449006, "loss/logits": 0.8252352714538574, "step": 42930 }, { "epoch": 0.4294, "grad_norm": 14.5625, "grad_norm_var": 0.5637858072916667, "learning_rate": 0.0003, "loss": 11.159, "loss/aux_loss": 0.04807413574308157, "loss/crossentropy": 2.5992193698883055, "loss/logits": 0.8467897325754166, "step": 42940 }, { "epoch": 0.4295, "grad_norm": 15.5, "grad_norm_var": 0.315478515625, "learning_rate": 0.0003, "loss": 10.9422, "loss/aux_loss": 0.04807993993163109, "loss/crossentropy": 2.524407982826233, "loss/logits": 0.8164256751537323, "step": 42950 }, { "epoch": 0.4296, "grad_norm": 14.25, "grad_norm_var": 0.6348795572916667, "learning_rate": 0.0003, "loss": 11.2791, "loss/aux_loss": 0.04807226173579693, "loss/crossentropy": 2.6596532464027405, "loss/logits": 0.8530319899320602, "step": 42960 }, { "epoch": 0.4297, "grad_norm": 13.9375, "grad_norm_var": 1.3340983072916666, "learning_rate": 0.0003, "loss": 11.0176, "loss/aux_loss": 0.04807053208351135, "loss/crossentropy": 2.718638336658478, "loss/logits": 0.8549129962921143, "step": 42970 }, { "epoch": 0.4298, "grad_norm": 15.5, "grad_norm_var": 1.595947265625, "learning_rate": 0.0003, "loss": 11.2805, "loss/aux_loss": 0.04808430094271898, "loss/crossentropy": 2.637483465671539, "loss/logits": 0.8228224605321884, "step": 42980 }, { "epoch": 0.4299, "grad_norm": 13.8125, "grad_norm_var": 0.708056640625, "learning_rate": 0.0003, "loss": 11.1181, "loss/aux_loss": 0.04807401914149523, "loss/crossentropy": 2.7925415635108948, "loss/logits": 0.82631796002388, "step": 42990 }, { "epoch": 0.43, "grad_norm": 15.3125, "grad_norm_var": 0.6119140625, "learning_rate": 0.0003, "loss": 11.188, "loss/aux_loss": 0.04808122981339693, "loss/crossentropy": 2.739946460723877, "loss/logits": 0.833682969212532, "step": 43000 }, { "epoch": 0.4301, "grad_norm": 14.375, "grad_norm_var": 1809.0212890625, "learning_rate": 0.0003, "loss": 11.3295, "loss/aux_loss": 0.0480794845148921, "loss/crossentropy": 2.7596442997455597, "loss/logits": 0.8646740794181824, "step": 43010 }, { "epoch": 0.4302, "grad_norm": 13.25, "grad_norm_var": 0.8145670572916667, "learning_rate": 0.0003, "loss": 11.0717, "loss/aux_loss": 0.04808532949537039, "loss/crossentropy": 2.585456448793411, "loss/logits": 0.8001091122627259, "step": 43020 }, { "epoch": 0.4303, "grad_norm": 13.4375, "grad_norm_var": 0.20572916666666666, "learning_rate": 0.0003, "loss": 11.1748, "loss/aux_loss": 0.04807361718267202, "loss/crossentropy": 2.6641751885414124, "loss/logits": 0.8602477341890336, "step": 43030 }, { "epoch": 0.4304, "grad_norm": 14.0625, "grad_norm_var": 1.0601399739583333, "learning_rate": 0.0003, "loss": 11.2242, "loss/aux_loss": 0.04806696530431509, "loss/crossentropy": 2.7215495467185975, "loss/logits": 0.8751588940620423, "step": 43040 }, { "epoch": 0.4305, "grad_norm": 14.625, "grad_norm_var": 0.760400390625, "learning_rate": 0.0003, "loss": 11.0104, "loss/aux_loss": 0.04807785041630268, "loss/crossentropy": 2.6973119556903837, "loss/logits": 0.8398984521627426, "step": 43050 }, { "epoch": 0.4306, "grad_norm": 13.625, "grad_norm_var": 0.7916015625, "learning_rate": 0.0003, "loss": 11.4002, "loss/aux_loss": 0.04808125030249357, "loss/crossentropy": 2.758901071548462, "loss/logits": 0.8731786936521531, "step": 43060 }, { "epoch": 0.4307, "grad_norm": 13.25, "grad_norm_var": 0.610009765625, "learning_rate": 0.0003, "loss": 11.1299, "loss/aux_loss": 0.04807206802070141, "loss/crossentropy": 2.708112859725952, "loss/logits": 0.7979910880327225, "step": 43070 }, { "epoch": 0.4308, "grad_norm": 14.0625, "grad_norm_var": 0.7150390625, "learning_rate": 0.0003, "loss": 11.145, "loss/aux_loss": 0.04807481300085783, "loss/crossentropy": 2.7164962589740753, "loss/logits": 0.8307441174983978, "step": 43080 }, { "epoch": 0.4309, "grad_norm": 14.3125, "grad_norm_var": 0.603369140625, "learning_rate": 0.0003, "loss": 11.2082, "loss/aux_loss": 0.048082906566560266, "loss/crossentropy": 2.830370819568634, "loss/logits": 0.8672911942005157, "step": 43090 }, { "epoch": 0.431, "grad_norm": 16.0, "grad_norm_var": 0.6677083333333333, "learning_rate": 0.0003, "loss": 11.2983, "loss/aux_loss": 0.048063176684081554, "loss/crossentropy": 2.7665525555610655, "loss/logits": 0.8384849548339843, "step": 43100 }, { "epoch": 0.4311, "grad_norm": 13.75, "grad_norm_var": 0.8820149739583333, "learning_rate": 0.0003, "loss": 11.3458, "loss/aux_loss": 0.0480750685557723, "loss/crossentropy": 2.6808901131153107, "loss/logits": 0.8562900602817536, "step": 43110 }, { "epoch": 0.4312, "grad_norm": 15.0, "grad_norm_var": 0.675244140625, "learning_rate": 0.0003, "loss": 11.2144, "loss/aux_loss": 0.048073401860892775, "loss/crossentropy": 2.792336130142212, "loss/logits": 0.8535761684179306, "step": 43120 }, { "epoch": 0.4313, "grad_norm": 13.0, "grad_norm_var": 0.264306640625, "learning_rate": 0.0003, "loss": 11.3156, "loss/aux_loss": 0.048070548288524154, "loss/crossentropy": 2.618429493904114, "loss/logits": 0.8526875019073487, "step": 43130 }, { "epoch": 0.4314, "grad_norm": 14.6875, "grad_norm_var": 0.3589680989583333, "learning_rate": 0.0003, "loss": 10.973, "loss/aux_loss": 0.048080014809966085, "loss/crossentropy": 2.6525802075862885, "loss/logits": 0.8168764710426331, "step": 43140 }, { "epoch": 0.4315, "grad_norm": 14.3125, "grad_norm_var": 0.3770670572916667, "learning_rate": 0.0003, "loss": 11.207, "loss/aux_loss": 0.048070788569748404, "loss/crossentropy": 2.828217601776123, "loss/logits": 0.8801011204719543, "step": 43150 }, { "epoch": 0.4316, "grad_norm": 13.8125, "grad_norm_var": 0.122509765625, "learning_rate": 0.0003, "loss": 11.0918, "loss/aux_loss": 0.04807632640004158, "loss/crossentropy": 2.74559006690979, "loss/logits": 0.8384597927331925, "step": 43160 }, { "epoch": 0.4317, "grad_norm": 14.375, "grad_norm_var": 0.35149739583333334, "learning_rate": 0.0003, "loss": 11.1667, "loss/aux_loss": 0.0480788629502058, "loss/crossentropy": 2.6675486505031585, "loss/logits": 0.858903244137764, "step": 43170 }, { "epoch": 0.4318, "grad_norm": 15.625, "grad_norm_var": 0.9427083333333334, "learning_rate": 0.0003, "loss": 11.1853, "loss/aux_loss": 0.04807556346058846, "loss/crossentropy": 2.764018404483795, "loss/logits": 0.8240988850593567, "step": 43180 }, { "epoch": 0.4319, "grad_norm": 14.0, "grad_norm_var": 0.96875, "learning_rate": 0.0003, "loss": 11.271, "loss/aux_loss": 0.04806935228407383, "loss/crossentropy": 2.8441020369529726, "loss/logits": 0.8296251714229583, "step": 43190 }, { "epoch": 0.432, "grad_norm": 15.0, "grad_norm_var": 0.245556640625, "learning_rate": 0.0003, "loss": 11.2131, "loss/aux_loss": 0.04808467049151659, "loss/crossentropy": 2.6359627187252044, "loss/logits": 0.8088801056146622, "step": 43200 }, { "epoch": 0.4321, "grad_norm": 13.375, "grad_norm_var": 0.28370768229166665, "learning_rate": 0.0003, "loss": 11.1807, "loss/aux_loss": 0.04807442501187324, "loss/crossentropy": 2.629367303848267, "loss/logits": 0.8538803130388259, "step": 43210 }, { "epoch": 0.4322, "grad_norm": 13.9375, "grad_norm_var": 0.3994140625, "learning_rate": 0.0003, "loss": 11.1458, "loss/aux_loss": 0.0480644728988409, "loss/crossentropy": 2.70165359377861, "loss/logits": 0.8403062671422958, "step": 43220 }, { "epoch": 0.4323, "grad_norm": 14.0, "grad_norm_var": 0.28253580729166666, "learning_rate": 0.0003, "loss": 11.2449, "loss/aux_loss": 0.048089561983942986, "loss/crossentropy": 2.599137383699417, "loss/logits": 0.8236678332090378, "step": 43230 }, { "epoch": 0.4324, "grad_norm": 13.375, "grad_norm_var": 0.305322265625, "learning_rate": 0.0003, "loss": 11.2337, "loss/aux_loss": 0.04807605054229498, "loss/crossentropy": 2.8088149547576906, "loss/logits": 0.8818845838308335, "step": 43240 }, { "epoch": 0.4325, "grad_norm": 15.0, "grad_norm_var": 0.4161295572916667, "learning_rate": 0.0003, "loss": 11.204, "loss/aux_loss": 0.04806992541998625, "loss/crossentropy": 2.8428435802459715, "loss/logits": 0.8899227410554886, "step": 43250 }, { "epoch": 0.4326, "grad_norm": 14.875, "grad_norm_var": 0.388134765625, "learning_rate": 0.0003, "loss": 11.1815, "loss/aux_loss": 0.04808182567358017, "loss/crossentropy": 2.685653477907181, "loss/logits": 0.8344039708375931, "step": 43260 }, { "epoch": 0.4327, "grad_norm": 13.6875, "grad_norm_var": 0.3270182291666667, "learning_rate": 0.0003, "loss": 11.3153, "loss/aux_loss": 0.04808889031410217, "loss/crossentropy": 2.976236271858215, "loss/logits": 0.8441417008638382, "step": 43270 }, { "epoch": 0.4328, "grad_norm": 14.125, "grad_norm_var": 0.93203125, "learning_rate": 0.0003, "loss": 11.0959, "loss/aux_loss": 0.04806580301374197, "loss/crossentropy": 2.711184060573578, "loss/logits": 0.840970367193222, "step": 43280 }, { "epoch": 0.4329, "grad_norm": 16.25, "grad_norm_var": 0.7880208333333333, "learning_rate": 0.0003, "loss": 11.2848, "loss/aux_loss": 0.04807568024843931, "loss/crossentropy": 2.764776086807251, "loss/logits": 0.8647139281034469, "step": 43290 }, { "epoch": 0.433, "grad_norm": 13.5, "grad_norm_var": 58.57421875, "learning_rate": 0.0003, "loss": 11.2579, "loss/aux_loss": 0.0480875076726079, "loss/crossentropy": 2.639911252260208, "loss/logits": 0.855933940410614, "step": 43300 }, { "epoch": 0.4331, "grad_norm": 13.625, "grad_norm_var": 58.1744140625, "learning_rate": 0.0003, "loss": 11.1596, "loss/aux_loss": 0.048066693171858785, "loss/crossentropy": 2.7509276986122133, "loss/logits": 0.8506129652261734, "step": 43310 }, { "epoch": 0.4332, "grad_norm": 14.125, "grad_norm_var": 0.2994140625, "learning_rate": 0.0003, "loss": 11.2364, "loss/aux_loss": 0.04807441793382168, "loss/crossentropy": 2.7721996307373047, "loss/logits": 0.841002207994461, "step": 43320 }, { "epoch": 0.4333, "grad_norm": 12.9375, "grad_norm_var": 0.411962890625, "learning_rate": 0.0003, "loss": 11.0887, "loss/aux_loss": 0.04807002730667591, "loss/crossentropy": 2.702434003353119, "loss/logits": 0.8071956008672714, "step": 43330 }, { "epoch": 0.4334, "grad_norm": 15.125, "grad_norm_var": 0.3262858072916667, "learning_rate": 0.0003, "loss": 11.1554, "loss/aux_loss": 0.04807128868997097, "loss/crossentropy": 2.727681612968445, "loss/logits": 0.8188419610261917, "step": 43340 }, { "epoch": 0.4335, "grad_norm": 13.875, "grad_norm_var": 0.27447916666666666, "learning_rate": 0.0003, "loss": 11.196, "loss/aux_loss": 0.04806978348642588, "loss/crossentropy": 2.779603922367096, "loss/logits": 0.8584359914064408, "step": 43350 }, { "epoch": 0.4336, "grad_norm": 14.125, "grad_norm_var": 0.5494140625, "learning_rate": 0.0003, "loss": 11.2599, "loss/aux_loss": 0.04807741772383452, "loss/crossentropy": 2.7377023220062258, "loss/logits": 0.8400143414735795, "step": 43360 }, { "epoch": 0.4337, "grad_norm": 15.9375, "grad_norm_var": 1.5880208333333334, "learning_rate": 0.0003, "loss": 11.14, "loss/aux_loss": 0.04808063004165888, "loss/crossentropy": 2.7512278735637663, "loss/logits": 0.8345052689313889, "step": 43370 }, { "epoch": 0.4338, "grad_norm": 13.875, "grad_norm_var": 1.5528645833333334, "learning_rate": 0.0003, "loss": 11.0079, "loss/aux_loss": 0.04806357547640801, "loss/crossentropy": 2.7596873223781584, "loss/logits": 0.8277094513177872, "step": 43380 }, { "epoch": 0.4339, "grad_norm": 14.4375, "grad_norm_var": 0.5911458333333334, "learning_rate": 0.0003, "loss": 11.0468, "loss/aux_loss": 0.04807555004954338, "loss/crossentropy": 2.7843938052654265, "loss/logits": 0.8598534375429153, "step": 43390 }, { "epoch": 0.434, "grad_norm": 14.375, "grad_norm_var": 0.410400390625, "learning_rate": 0.0003, "loss": 11.1959, "loss/aux_loss": 0.048078845627605914, "loss/crossentropy": 2.5064321935176848, "loss/logits": 0.8137326329946518, "step": 43400 }, { "epoch": 0.4341, "grad_norm": 14.5, "grad_norm_var": 0.521728515625, "learning_rate": 0.0003, "loss": 11.2099, "loss/aux_loss": 0.04807638339698315, "loss/crossentropy": 2.8900754928588865, "loss/logits": 0.8562443405389786, "step": 43410 }, { "epoch": 0.4342, "grad_norm": 14.875, "grad_norm_var": 0.3265462239583333, "learning_rate": 0.0003, "loss": 11.3643, "loss/aux_loss": 0.04807485099881888, "loss/crossentropy": 2.76791490316391, "loss/logits": 0.8608255743980407, "step": 43420 }, { "epoch": 0.4343, "grad_norm": 13.75, "grad_norm_var": 0.7968098958333333, "learning_rate": 0.0003, "loss": 11.0301, "loss/aux_loss": 0.04807616826146841, "loss/crossentropy": 2.6704135179519652, "loss/logits": 0.8353795439004899, "step": 43430 }, { "epoch": 0.4344, "grad_norm": 14.4375, "grad_norm_var": 0.878369140625, "learning_rate": 0.0003, "loss": 11.1472, "loss/aux_loss": 0.04807905219495297, "loss/crossentropy": 2.6787062883377075, "loss/logits": 0.8412859380245209, "step": 43440 }, { "epoch": 0.4345, "grad_norm": 14.75, "grad_norm_var": 0.46027018229166666, "learning_rate": 0.0003, "loss": 11.1762, "loss/aux_loss": 0.04807356093078852, "loss/crossentropy": 2.7421591579914093, "loss/logits": 0.8316115468740464, "step": 43450 }, { "epoch": 0.4346, "grad_norm": 14.8125, "grad_norm_var": 0.47732747395833336, "learning_rate": 0.0003, "loss": 11.0474, "loss/aux_loss": 0.04807029739022255, "loss/crossentropy": 2.731302946805954, "loss/logits": 0.8437435656785965, "step": 43460 }, { "epoch": 0.4347, "grad_norm": 14.4375, "grad_norm_var": 0.42864583333333334, "learning_rate": 0.0003, "loss": 11.1795, "loss/aux_loss": 0.048083835281431675, "loss/crossentropy": 2.633001279830933, "loss/logits": 0.8292666167020798, "step": 43470 }, { "epoch": 0.4348, "grad_norm": 13.75, "grad_norm_var": 0.6476399739583333, "learning_rate": 0.0003, "loss": 11.071, "loss/aux_loss": 0.04806802663952112, "loss/crossentropy": 2.698196220397949, "loss/logits": 0.811674302816391, "step": 43480 }, { "epoch": 0.4349, "grad_norm": 15.8125, "grad_norm_var": 0.7286295572916667, "learning_rate": 0.0003, "loss": 11.106, "loss/aux_loss": 0.04807435814291239, "loss/crossentropy": 2.8671145260334017, "loss/logits": 0.8537077218294143, "step": 43490 }, { "epoch": 0.435, "grad_norm": 15.375, "grad_norm_var": 0.6048014322916667, "learning_rate": 0.0003, "loss": 11.0003, "loss/aux_loss": 0.048080187290906906, "loss/crossentropy": 2.722633057832718, "loss/logits": 0.8278620541095734, "step": 43500 }, { "epoch": 0.4351, "grad_norm": 14.3125, "grad_norm_var": 0.3934733072916667, "learning_rate": 0.0003, "loss": 11.2418, "loss/aux_loss": 0.048065092600882056, "loss/crossentropy": 2.733147954940796, "loss/logits": 0.8363703429698944, "step": 43510 }, { "epoch": 0.4352, "grad_norm": 13.875, "grad_norm_var": 0.43162434895833335, "learning_rate": 0.0003, "loss": 11.1544, "loss/aux_loss": 0.04809010047465563, "loss/crossentropy": 2.6833800315856933, "loss/logits": 0.8280310302972793, "step": 43520 }, { "epoch": 0.4353, "grad_norm": 14.0625, "grad_norm_var": 0.5497395833333333, "learning_rate": 0.0003, "loss": 11.153, "loss/aux_loss": 0.04807169977575541, "loss/crossentropy": 2.723561632633209, "loss/logits": 0.8542778968811036, "step": 43530 }, { "epoch": 0.4354, "grad_norm": 15.1875, "grad_norm_var": 0.8591145833333333, "learning_rate": 0.0003, "loss": 11.1498, "loss/aux_loss": 0.0480776134878397, "loss/crossentropy": 2.627856492996216, "loss/logits": 0.8360980361700058, "step": 43540 }, { "epoch": 0.4355, "grad_norm": 14.5, "grad_norm_var": 0.4905598958333333, "learning_rate": 0.0003, "loss": 11.2714, "loss/aux_loss": 0.04807308316230774, "loss/crossentropy": 2.766408783197403, "loss/logits": 0.8315641492605209, "step": 43550 }, { "epoch": 0.4356, "grad_norm": 14.75, "grad_norm_var": 0.3055826822916667, "learning_rate": 0.0003, "loss": 11.3886, "loss/aux_loss": 0.048083190061151984, "loss/crossentropy": 2.806036615371704, "loss/logits": 0.883603885769844, "step": 43560 }, { "epoch": 0.4357, "grad_norm": 15.0, "grad_norm_var": 1.5067057291666666, "learning_rate": 0.0003, "loss": 11.0006, "loss/aux_loss": 0.0480629924684763, "loss/crossentropy": 2.5885447025299073, "loss/logits": 0.8374526888132096, "step": 43570 }, { "epoch": 0.4358, "grad_norm": 14.625, "grad_norm_var": 1.2667805989583334, "learning_rate": 0.0003, "loss": 11.0437, "loss/aux_loss": 0.04808878097683191, "loss/crossentropy": 2.7568553149700166, "loss/logits": 0.8364752948284149, "step": 43580 }, { "epoch": 0.4359, "grad_norm": 14.875, "grad_norm_var": 0.2431640625, "learning_rate": 0.0003, "loss": 11.2293, "loss/aux_loss": 0.048077457770705226, "loss/crossentropy": 2.718920850753784, "loss/logits": 0.8596648782491684, "step": 43590 }, { "epoch": 0.436, "grad_norm": 13.625, "grad_norm_var": 0.323681640625, "learning_rate": 0.0003, "loss": 11.2943, "loss/aux_loss": 0.04807487428188324, "loss/crossentropy": 2.6981576442718507, "loss/logits": 0.8179612189531327, "step": 43600 }, { "epoch": 0.4361, "grad_norm": 14.1875, "grad_norm_var": 0.404931640625, "learning_rate": 0.0003, "loss": 11.187, "loss/aux_loss": 0.04809190686792135, "loss/crossentropy": 2.6059759140014647, "loss/logits": 0.8273712396621704, "step": 43610 }, { "epoch": 0.4362, "grad_norm": 14.5, "grad_norm_var": 0.39646809895833335, "learning_rate": 0.0003, "loss": 11.1114, "loss/aux_loss": 0.048068244755268094, "loss/crossentropy": 2.7903677105903624, "loss/logits": 0.859819746017456, "step": 43620 }, { "epoch": 0.4363, "grad_norm": 13.6875, "grad_norm_var": 0.8854166666666666, "learning_rate": 0.0003, "loss": 11.1303, "loss/aux_loss": 0.048077091202139856, "loss/crossentropy": 2.811820614337921, "loss/logits": 0.8357854694128036, "step": 43630 }, { "epoch": 0.4364, "grad_norm": 14.3125, "grad_norm_var": 0.396875, "learning_rate": 0.0003, "loss": 11.2525, "loss/aux_loss": 0.048074031434953216, "loss/crossentropy": 2.7376105189323425, "loss/logits": 0.8477279067039489, "step": 43640 }, { "epoch": 0.4365, "grad_norm": 15.4375, "grad_norm_var": 0.595166015625, "learning_rate": 0.0003, "loss": 11.1445, "loss/aux_loss": 0.048072155378758905, "loss/crossentropy": 2.633346253633499, "loss/logits": 0.8097257345914841, "step": 43650 }, { "epoch": 0.4366, "grad_norm": 14.25, "grad_norm_var": 0.479541015625, "learning_rate": 0.0003, "loss": 11.4783, "loss/aux_loss": 0.04806961789727211, "loss/crossentropy": 2.7709633708000183, "loss/logits": 0.8549257487058639, "step": 43660 }, { "epoch": 0.4367, "grad_norm": 14.4375, "grad_norm_var": 0.6348795572916667, "learning_rate": 0.0003, "loss": 11.2698, "loss/aux_loss": 0.04807803872972727, "loss/crossentropy": 2.8225900530815125, "loss/logits": 0.860035040974617, "step": 43670 }, { "epoch": 0.4368, "grad_norm": 14.75, "grad_norm_var": 1.0791666666666666, "learning_rate": 0.0003, "loss": 10.9831, "loss/aux_loss": 0.0480777345597744, "loss/crossentropy": 2.673772931098938, "loss/logits": 0.8094230264425277, "step": 43680 }, { "epoch": 0.4369, "grad_norm": 13.0, "grad_norm_var": 0.876025390625, "learning_rate": 0.0003, "loss": 11.1929, "loss/aux_loss": 0.048065176233649254, "loss/crossentropy": 2.7113537013530733, "loss/logits": 0.8660049647092819, "step": 43690 }, { "epoch": 0.437, "grad_norm": 14.3125, "grad_norm_var": 0.5234375, "learning_rate": 0.0003, "loss": 11.1811, "loss/aux_loss": 0.048079431615769865, "loss/crossentropy": 2.649705445766449, "loss/logits": 0.7998382925987244, "step": 43700 }, { "epoch": 0.4371, "grad_norm": 14.0, "grad_norm_var": 0.822119140625, "learning_rate": 0.0003, "loss": 11.2201, "loss/aux_loss": 0.04807834941893816, "loss/crossentropy": 2.8499518752098085, "loss/logits": 0.8855004251003266, "step": 43710 }, { "epoch": 0.4372, "grad_norm": 15.3125, "grad_norm_var": 0.8173014322916666, "learning_rate": 0.0003, "loss": 11.035, "loss/aux_loss": 0.04806433636695147, "loss/crossentropy": 2.820804786682129, "loss/logits": 0.85684075653553, "step": 43720 }, { "epoch": 0.4373, "grad_norm": 15.1875, "grad_norm_var": 0.5952962239583334, "learning_rate": 0.0003, "loss": 11.0527, "loss/aux_loss": 0.04808305986225605, "loss/crossentropy": 2.71166330575943, "loss/logits": 0.8343310207128525, "step": 43730 }, { "epoch": 0.4374, "grad_norm": 14.125, "grad_norm_var": 0.485791015625, "learning_rate": 0.0003, "loss": 11.2295, "loss/aux_loss": 0.04806539099663496, "loss/crossentropy": 2.766424697637558, "loss/logits": 0.8757703483104706, "step": 43740 }, { "epoch": 0.4375, "grad_norm": 14.375, "grad_norm_var": 0.6057291666666667, "learning_rate": 0.0003, "loss": 11.1771, "loss/aux_loss": 0.04808200504630804, "loss/crossentropy": 2.7064111471176147, "loss/logits": 0.8637802988290787, "step": 43750 }, { "epoch": 0.4376, "grad_norm": 18.625, "grad_norm_var": 1.48203125, "learning_rate": 0.0003, "loss": 11.1127, "loss/aux_loss": 0.04807017743587494, "loss/crossentropy": 2.653383284807205, "loss/logits": 0.85880506336689, "step": 43760 }, { "epoch": 0.4377, "grad_norm": 14.125, "grad_norm_var": 1.5556640625, "learning_rate": 0.0003, "loss": 11.3311, "loss/aux_loss": 0.04808002356439829, "loss/crossentropy": 2.712994170188904, "loss/logits": 0.853711587190628, "step": 43770 }, { "epoch": 0.4378, "grad_norm": 14.3125, "grad_norm_var": 0.3636555989583333, "learning_rate": 0.0003, "loss": 10.9948, "loss/aux_loss": 0.04807909522205591, "loss/crossentropy": 2.5925404846668245, "loss/logits": 0.804463854432106, "step": 43780 }, { "epoch": 0.4379, "grad_norm": 14.25, "grad_norm_var": 0.32420247395833335, "learning_rate": 0.0003, "loss": 11.23, "loss/aux_loss": 0.04806744996458292, "loss/crossentropy": 2.6414481580257414, "loss/logits": 0.8415878742933274, "step": 43790 }, { "epoch": 0.438, "grad_norm": 14.4375, "grad_norm_var": 0.140625, "learning_rate": 0.0003, "loss": 11.2518, "loss/aux_loss": 0.04808863271027804, "loss/crossentropy": 2.6875229835510255, "loss/logits": 0.8200345158576965, "step": 43800 }, { "epoch": 0.4381, "grad_norm": 16.5, "grad_norm_var": 0.6113932291666667, "learning_rate": 0.0003, "loss": 11.152, "loss/aux_loss": 0.04806707743555307, "loss/crossentropy": 2.513974744081497, "loss/logits": 0.8057096034288407, "step": 43810 }, { "epoch": 0.4382, "grad_norm": 14.25, "grad_norm_var": 0.7825520833333334, "learning_rate": 0.0003, "loss": 11.2658, "loss/aux_loss": 0.04807666204869747, "loss/crossentropy": 2.51427965760231, "loss/logits": 0.7930530071258545, "step": 43820 }, { "epoch": 0.4383, "grad_norm": 13.875, "grad_norm_var": 0.5705729166666667, "learning_rate": 0.0003, "loss": 11.2529, "loss/aux_loss": 0.048075707629323006, "loss/crossentropy": 2.711172878742218, "loss/logits": 0.8392647117376327, "step": 43830 }, { "epoch": 0.4384, "grad_norm": 14.5, "grad_norm_var": 0.6738118489583333, "learning_rate": 0.0003, "loss": 11.2097, "loss/aux_loss": 0.04807080589234829, "loss/crossentropy": 2.796042335033417, "loss/logits": 0.8655966311693192, "step": 43840 }, { "epoch": 0.4385, "grad_norm": 14.0625, "grad_norm_var": 0.6041015625, "learning_rate": 0.0003, "loss": 11.1667, "loss/aux_loss": 0.048078172095119956, "loss/crossentropy": 2.5107653200626374, "loss/logits": 0.7942128717899323, "step": 43850 }, { "epoch": 0.4386, "grad_norm": 14.8125, "grad_norm_var": 0.18411458333333333, "learning_rate": 0.0003, "loss": 10.9717, "loss/aux_loss": 0.048072948679327966, "loss/crossentropy": 2.6562119662761687, "loss/logits": 0.839997673034668, "step": 43860 }, { "epoch": 0.4387, "grad_norm": 13.875, "grad_norm_var": 0.5127604166666667, "learning_rate": 0.0003, "loss": 11.1122, "loss/aux_loss": 0.048071831464767456, "loss/crossentropy": 2.632074463367462, "loss/logits": 0.8598015516996383, "step": 43870 }, { "epoch": 0.4388, "grad_norm": 14.625, "grad_norm_var": 0.656884765625, "learning_rate": 0.0003, "loss": 11.1054, "loss/aux_loss": 0.04807578288018703, "loss/crossentropy": 2.6886400461196898, "loss/logits": 0.8491002053022385, "step": 43880 }, { "epoch": 0.4389, "grad_norm": 15.0, "grad_norm_var": 0.3563639322916667, "learning_rate": 0.0003, "loss": 10.9558, "loss/aux_loss": 0.0480809960514307, "loss/crossentropy": 2.4997453689575195, "loss/logits": 0.7939290121197701, "step": 43890 }, { "epoch": 0.439, "grad_norm": 13.25, "grad_norm_var": 0.34765625, "learning_rate": 0.0003, "loss": 11.3424, "loss/aux_loss": 0.048082989640533924, "loss/crossentropy": 2.65654296875, "loss/logits": 0.8427457630634307, "step": 43900 }, { "epoch": 0.4391, "grad_norm": 14.0625, "grad_norm_var": 0.14542643229166666, "learning_rate": 0.0003, "loss": 11.0091, "loss/aux_loss": 0.04807342197746038, "loss/crossentropy": 2.6360219061374663, "loss/logits": 0.8580428868532181, "step": 43910 }, { "epoch": 0.4392, "grad_norm": 13.75, "grad_norm_var": 1.5449055989583333, "learning_rate": 0.0003, "loss": 11.1625, "loss/aux_loss": 0.04808421973139047, "loss/crossentropy": 2.6617501974105835, "loss/logits": 0.8398198932409286, "step": 43920 }, { "epoch": 0.4393, "grad_norm": 14.375, "grad_norm_var": 1.5051920572916666, "learning_rate": 0.0003, "loss": 11.1698, "loss/aux_loss": 0.048086178675293925, "loss/crossentropy": 2.554797637462616, "loss/logits": 0.7996778011322021, "step": 43930 }, { "epoch": 0.4394, "grad_norm": 13.75, "grad_norm_var": 0.4822265625, "learning_rate": 0.0003, "loss": 11.0295, "loss/aux_loss": 0.04808267541229725, "loss/crossentropy": 2.7181775331497193, "loss/logits": 0.8317540198564529, "step": 43940 }, { "epoch": 0.4395, "grad_norm": 13.125, "grad_norm_var": 0.399072265625, "learning_rate": 0.0003, "loss": 11.0659, "loss/aux_loss": 0.0480749236419797, "loss/crossentropy": 2.7755655884742736, "loss/logits": 0.8220769613981247, "step": 43950 }, { "epoch": 0.4396, "grad_norm": 14.375, "grad_norm_var": 0.241259765625, "learning_rate": 0.0003, "loss": 11.143, "loss/aux_loss": 0.04807134997099638, "loss/crossentropy": 2.8343628644943237, "loss/logits": 0.8345601588487626, "step": 43960 }, { "epoch": 0.4397, "grad_norm": 13.5625, "grad_norm_var": 0.24894205729166666, "learning_rate": 0.0003, "loss": 11.0354, "loss/aux_loss": 0.04807570315897465, "loss/crossentropy": 2.7424150824546816, "loss/logits": 0.8381778568029403, "step": 43970 }, { "epoch": 0.4398, "grad_norm": 13.9375, "grad_norm_var": 0.27493489583333336, "learning_rate": 0.0003, "loss": 11.1642, "loss/aux_loss": 0.048077446036040784, "loss/crossentropy": 2.688514918088913, "loss/logits": 0.8474517434835434, "step": 43980 }, { "epoch": 0.4399, "grad_norm": 13.625, "grad_norm_var": 1.0007649739583333, "learning_rate": 0.0003, "loss": 11.216, "loss/aux_loss": 0.04808139279484749, "loss/crossentropy": 2.829476696252823, "loss/logits": 0.8397254914045333, "step": 43990 }, { "epoch": 0.44, "grad_norm": 13.8125, "grad_norm_var": 0.865869140625, "learning_rate": 0.0003, "loss": 11.2128, "loss/aux_loss": 0.04807093515992165, "loss/crossentropy": 2.7557824432849882, "loss/logits": 0.8229851201176643, "step": 44000 }, { "epoch": 0.4401, "grad_norm": 15.8125, "grad_norm_var": 0.5528645833333333, "learning_rate": 0.0003, "loss": 11.1385, "loss/aux_loss": 0.04807540941983461, "loss/crossentropy": 2.7282972991466523, "loss/logits": 0.8258565187454223, "step": 44010 }, { "epoch": 0.4402, "grad_norm": 13.1875, "grad_norm_var": 0.6098307291666667, "learning_rate": 0.0003, "loss": 11.3424, "loss/aux_loss": 0.048072741366922855, "loss/crossentropy": 2.8332776546478273, "loss/logits": 0.8698825478553772, "step": 44020 }, { "epoch": 0.4403, "grad_norm": 13.875, "grad_norm_var": 0.6869140625, "learning_rate": 0.0003, "loss": 11.1079, "loss/aux_loss": 0.04808550868183374, "loss/crossentropy": 2.728848767280579, "loss/logits": 0.878032585978508, "step": 44030 }, { "epoch": 0.4404, "grad_norm": 13.5, "grad_norm_var": 0.10983072916666667, "learning_rate": 0.0003, "loss": 11.182, "loss/aux_loss": 0.04806646145880222, "loss/crossentropy": 2.660983008146286, "loss/logits": 0.8294680565595627, "step": 44040 }, { "epoch": 0.4405, "grad_norm": 15.0, "grad_norm_var": 1346.16953125, "learning_rate": 0.0003, "loss": 11.2939, "loss/aux_loss": 0.04808670189231634, "loss/crossentropy": 2.8212135076522826, "loss/logits": 0.8692754089832306, "step": 44050 }, { "epoch": 0.4406, "grad_norm": 14.75, "grad_norm_var": 1335.9011555989584, "learning_rate": 0.0003, "loss": 10.9897, "loss/aux_loss": 0.04808017909526825, "loss/crossentropy": 2.73088259100914, "loss/logits": 0.8444351434707642, "step": 44060 }, { "epoch": 0.4407, "grad_norm": 16.25, "grad_norm_var": 0.8820149739583333, "learning_rate": 0.0003, "loss": 11.2521, "loss/aux_loss": 0.04806812740862369, "loss/crossentropy": 2.777199387550354, "loss/logits": 0.8744244068861008, "step": 44070 }, { "epoch": 0.4408, "grad_norm": 15.5, "grad_norm_var": 0.5431640625, "learning_rate": 0.0003, "loss": 11.0439, "loss/aux_loss": 0.04805951733142137, "loss/crossentropy": 2.804098057746887, "loss/logits": 0.864593580365181, "step": 44080 }, { "epoch": 0.4409, "grad_norm": 13.375, "grad_norm_var": 0.634375, "learning_rate": 0.0003, "loss": 11.1746, "loss/aux_loss": 0.048096229508519175, "loss/crossentropy": 2.6453644156455995, "loss/logits": 0.8177176743745804, "step": 44090 }, { "epoch": 0.441, "grad_norm": 15.25, "grad_norm_var": 0.5972493489583334, "learning_rate": 0.0003, "loss": 11.1598, "loss/aux_loss": 0.048072378523647784, "loss/crossentropy": 2.8311945855617524, "loss/logits": 0.8428573668003082, "step": 44100 }, { "epoch": 0.4411, "grad_norm": 14.0, "grad_norm_var": 0.32693684895833336, "learning_rate": 0.0003, "loss": 11.1579, "loss/aux_loss": 0.048064601607620716, "loss/crossentropy": 2.780519354343414, "loss/logits": 0.8639049649238586, "step": 44110 }, { "epoch": 0.4412, "grad_norm": 14.1875, "grad_norm_var": 0.214697265625, "learning_rate": 0.0003, "loss": 10.999, "loss/aux_loss": 0.048081880807876586, "loss/crossentropy": 2.7817383885383604, "loss/logits": 0.8632400244474411, "step": 44120 }, { "epoch": 0.4413, "grad_norm": 13.625, "grad_norm_var": 7.629427083333334, "learning_rate": 0.0003, "loss": 11.0469, "loss/aux_loss": 0.048066666908562185, "loss/crossentropy": 2.7238622844219207, "loss/logits": 0.8448743641376495, "step": 44130 }, { "epoch": 0.4414, "grad_norm": 13.9375, "grad_norm_var": 0.6567057291666667, "learning_rate": 0.0003, "loss": 11.154, "loss/aux_loss": 0.04808225836604833, "loss/crossentropy": 2.7381427764892576, "loss/logits": 0.8270679324865341, "step": 44140 }, { "epoch": 0.4415, "grad_norm": 14.9375, "grad_norm_var": 0.5067708333333333, "learning_rate": 0.0003, "loss": 11.2288, "loss/aux_loss": 0.04807639848440885, "loss/crossentropy": 2.7136885285377503, "loss/logits": 0.8549921065568924, "step": 44150 }, { "epoch": 0.4416, "grad_norm": 14.625, "grad_norm_var": 0.46295572916666666, "learning_rate": 0.0003, "loss": 11.2024, "loss/aux_loss": 0.04807550571858883, "loss/crossentropy": 2.909850722551346, "loss/logits": 0.8444527328014374, "step": 44160 }, { "epoch": 0.4417, "grad_norm": 15.0, "grad_norm_var": 0.3348307291666667, "learning_rate": 0.0003, "loss": 11.2275, "loss/aux_loss": 0.04807161632925272, "loss/crossentropy": 2.8793214321136475, "loss/logits": 0.8599152326583862, "step": 44170 }, { "epoch": 0.4418, "grad_norm": 15.4375, "grad_norm_var": 0.7601399739583333, "learning_rate": 0.0003, "loss": 11.1919, "loss/aux_loss": 0.04808258600533009, "loss/crossentropy": 2.723651033639908, "loss/logits": 0.8421857535839081, "step": 44180 }, { "epoch": 0.4419, "grad_norm": 16.625, "grad_norm_var": 1.0032389322916666, "learning_rate": 0.0003, "loss": 11.1651, "loss/aux_loss": 0.04808426704257727, "loss/crossentropy": 2.745072239637375, "loss/logits": 0.8335127264261246, "step": 44190 }, { "epoch": 0.442, "grad_norm": 13.4375, "grad_norm_var": 0.7898274739583333, "learning_rate": 0.0003, "loss": 11.0781, "loss/aux_loss": 0.048068450205028056, "loss/crossentropy": 2.478744846582413, "loss/logits": 0.8013067185878754, "step": 44200 }, { "epoch": 0.4421, "grad_norm": 15.1875, "grad_norm_var": 0.43136393229166664, "learning_rate": 0.0003, "loss": 11.0637, "loss/aux_loss": 0.04808264952152967, "loss/crossentropy": 2.5987396478652953, "loss/logits": 0.8236001014709473, "step": 44210 }, { "epoch": 0.4422, "grad_norm": 14.3125, "grad_norm_var": 0.348291015625, "learning_rate": 0.0003, "loss": 11.243, "loss/aux_loss": 0.04807177521288395, "loss/crossentropy": 2.930016368627548, "loss/logits": 0.8525474965572357, "step": 44220 }, { "epoch": 0.4423, "grad_norm": 14.8125, "grad_norm_var": 2.8152180989583333, "learning_rate": 0.0003, "loss": 11.1347, "loss/aux_loss": 0.04807257354259491, "loss/crossentropy": 2.85026136636734, "loss/logits": 0.8361983984708786, "step": 44230 }, { "epoch": 0.4424, "grad_norm": 14.25, "grad_norm_var": 3.3347493489583333, "learning_rate": 0.0003, "loss": 11.3583, "loss/aux_loss": 0.04808483067899942, "loss/crossentropy": 2.742392921447754, "loss/logits": 0.8808601886034012, "step": 44240 }, { "epoch": 0.4425, "grad_norm": 14.3125, "grad_norm_var": 1.027587890625, "learning_rate": 0.0003, "loss": 11.1345, "loss/aux_loss": 0.0480776023119688, "loss/crossentropy": 2.7934012949466704, "loss/logits": 0.8520541161298751, "step": 44250 }, { "epoch": 0.4426, "grad_norm": 12.6875, "grad_norm_var": 0.8328125, "learning_rate": 0.0003, "loss": 10.9229, "loss/aux_loss": 0.0480777820572257, "loss/crossentropy": 2.8462532997131347, "loss/logits": 0.8383017539978027, "step": 44260 }, { "epoch": 0.4427, "grad_norm": 14.6875, "grad_norm_var": 0.5344889322916667, "learning_rate": 0.0003, "loss": 11.1232, "loss/aux_loss": 0.04807592108845711, "loss/crossentropy": 2.671667981147766, "loss/logits": 0.8324110358953476, "step": 44270 }, { "epoch": 0.4428, "grad_norm": 14.6875, "grad_norm_var": 1.0883951822916667, "learning_rate": 0.0003, "loss": 11.0707, "loss/aux_loss": 0.04807809516787529, "loss/crossentropy": 2.7182459354400637, "loss/logits": 0.8334134668111801, "step": 44280 }, { "epoch": 0.4429, "grad_norm": 13.9375, "grad_norm_var": 1.4891764322916667, "learning_rate": 0.0003, "loss": 11.3104, "loss/aux_loss": 0.04806990176439285, "loss/crossentropy": 2.6499986171722414, "loss/logits": 0.8422462284564972, "step": 44290 }, { "epoch": 0.443, "grad_norm": 16.125, "grad_norm_var": 0.8169270833333333, "learning_rate": 0.0003, "loss": 11.087, "loss/aux_loss": 0.048081264831125736, "loss/crossentropy": 2.6629028499126433, "loss/logits": 0.7944082587957382, "step": 44300 }, { "epoch": 0.4431, "grad_norm": 14.25, "grad_norm_var": 1.045556640625, "learning_rate": 0.0003, "loss": 11.1663, "loss/aux_loss": 0.04805992990732193, "loss/crossentropy": 2.6393331587314606, "loss/logits": 0.8219176232814789, "step": 44310 }, { "epoch": 0.4432, "grad_norm": 13.625, "grad_norm_var": 1.5378743489583333, "learning_rate": 0.0003, "loss": 11.1893, "loss/aux_loss": 0.048083177767693996, "loss/crossentropy": 2.829824334383011, "loss/logits": 0.8309338241815567, "step": 44320 }, { "epoch": 0.4433, "grad_norm": 35.75, "grad_norm_var": 28.234309895833334, "learning_rate": 0.0003, "loss": 10.9533, "loss/aux_loss": 0.048075624741613865, "loss/crossentropy": 2.6464429974555967, "loss/logits": 0.8063034623861313, "step": 44330 }, { "epoch": 0.4434, "grad_norm": 14.4375, "grad_norm_var": 28.2, "learning_rate": 0.0003, "loss": 11.0942, "loss/aux_loss": 0.048067497089505196, "loss/crossentropy": 2.6193343341350555, "loss/logits": 0.8304236233234406, "step": 44340 }, { "epoch": 0.4435, "grad_norm": 14.9375, "grad_norm_var": 0.934375, "learning_rate": 0.0003, "loss": 10.9155, "loss/aux_loss": 0.04807949960231781, "loss/crossentropy": 2.7103063344955443, "loss/logits": 0.8495049208402634, "step": 44350 }, { "epoch": 0.4436, "grad_norm": 14.625, "grad_norm_var": 0.452587890625, "learning_rate": 0.0003, "loss": 11.1538, "loss/aux_loss": 0.048082873411476615, "loss/crossentropy": 2.634609413146973, "loss/logits": 0.8501360476016998, "step": 44360 }, { "epoch": 0.4437, "grad_norm": 14.0, "grad_norm_var": 0.25974934895833335, "learning_rate": 0.0003, "loss": 11.1608, "loss/aux_loss": 0.04807829111814499, "loss/crossentropy": 2.619146168231964, "loss/logits": 0.8419374793767929, "step": 44370 }, { "epoch": 0.4438, "grad_norm": 14.625, "grad_norm_var": 0.38644205729166664, "learning_rate": 0.0003, "loss": 11.1946, "loss/aux_loss": 0.04806345794349909, "loss/crossentropy": 2.7630446314811707, "loss/logits": 0.8234979271888733, "step": 44380 }, { "epoch": 0.4439, "grad_norm": 14.8125, "grad_norm_var": 188.98396809895834, "learning_rate": 0.0003, "loss": 11.1839, "loss/aux_loss": 0.04808218106627464, "loss/crossentropy": 2.7055815279483797, "loss/logits": 0.8304955214262009, "step": 44390 }, { "epoch": 0.444, "grad_norm": 14.625, "grad_norm_var": 0.49347330729166666, "learning_rate": 0.0003, "loss": 11.1252, "loss/aux_loss": 0.048086220771074294, "loss/crossentropy": 2.6687645077705384, "loss/logits": 0.8254845380783081, "step": 44400 }, { "epoch": 0.4441, "grad_norm": 14.1875, "grad_norm_var": 22.872249348958334, "learning_rate": 0.0003, "loss": 11.2087, "loss/aux_loss": 0.04806323740631342, "loss/crossentropy": 2.7094571113586428, "loss/logits": 0.8551447689533234, "step": 44410 }, { "epoch": 0.4442, "grad_norm": 13.375, "grad_norm_var": 0.490478515625, "learning_rate": 0.0003, "loss": 11.0191, "loss/aux_loss": 0.0480901513248682, "loss/crossentropy": 2.6127541959285736, "loss/logits": 0.7744009613990783, "step": 44420 }, { "epoch": 0.4443, "grad_norm": 16.125, "grad_norm_var": 0.8528483072916667, "learning_rate": 0.0003, "loss": 11.1025, "loss/aux_loss": 0.04808180164545774, "loss/crossentropy": 2.7499490082263947, "loss/logits": 0.8446706473827362, "step": 44430 }, { "epoch": 0.4444, "grad_norm": 13.75, "grad_norm_var": 0.781103515625, "learning_rate": 0.0003, "loss": 11.3893, "loss/aux_loss": 0.048070714622735974, "loss/crossentropy": 2.789631450176239, "loss/logits": 0.8639173865318298, "step": 44440 }, { "epoch": 0.4445, "grad_norm": 13.6875, "grad_norm_var": 0.14855143229166667, "learning_rate": 0.0003, "loss": 11.1676, "loss/aux_loss": 0.0480786357074976, "loss/crossentropy": 2.794544792175293, "loss/logits": 0.8530236780643463, "step": 44450 }, { "epoch": 0.4446, "grad_norm": 13.1875, "grad_norm_var": 0.23430989583333334, "learning_rate": 0.0003, "loss": 10.9838, "loss/aux_loss": 0.04807055927813053, "loss/crossentropy": 2.6851485848426817, "loss/logits": 0.8450033336877822, "step": 44460 }, { "epoch": 0.4447, "grad_norm": 13.375, "grad_norm_var": 0.5980305989583333, "learning_rate": 0.0003, "loss": 11.1546, "loss/aux_loss": 0.04807008933275938, "loss/crossentropy": 2.6952660202980043, "loss/logits": 0.806543692946434, "step": 44470 }, { "epoch": 0.4448, "grad_norm": 14.6875, "grad_norm_var": 1.6238932291666666, "learning_rate": 0.0003, "loss": 11.2524, "loss/aux_loss": 0.04807232767343521, "loss/crossentropy": 2.8130900621414185, "loss/logits": 0.8416864901781083, "step": 44480 }, { "epoch": 0.4449, "grad_norm": 14.6875, "grad_norm_var": 1.2067708333333333, "learning_rate": 0.0003, "loss": 11.2551, "loss/aux_loss": 0.04807587340474129, "loss/crossentropy": 2.982305383682251, "loss/logits": 0.8985978931188583, "step": 44490 }, { "epoch": 0.445, "grad_norm": 14.3125, "grad_norm_var": 0.45358072916666664, "learning_rate": 0.0003, "loss": 11.1725, "loss/aux_loss": 0.048066375963389876, "loss/crossentropy": 2.6484339118003843, "loss/logits": 0.8086622357368469, "step": 44500 }, { "epoch": 0.4451, "grad_norm": 15.1875, "grad_norm_var": 0.7822265625, "learning_rate": 0.0003, "loss": 11.2345, "loss/aux_loss": 0.04808136448264122, "loss/crossentropy": 2.829063284397125, "loss/logits": 0.8414050981402397, "step": 44510 }, { "epoch": 0.4452, "grad_norm": 13.8125, "grad_norm_var": 0.3245930989583333, "learning_rate": 0.0003, "loss": 11.2001, "loss/aux_loss": 0.0480697114020586, "loss/crossentropy": 2.8107310473918914, "loss/logits": 0.8371037811040878, "step": 44520 }, { "epoch": 0.4453, "grad_norm": 14.0, "grad_norm_var": 0.4483723958333333, "learning_rate": 0.0003, "loss": 11.3441, "loss/aux_loss": 0.04806445110589266, "loss/crossentropy": 2.7447421967983248, "loss/logits": 0.8555226683616638, "step": 44530 }, { "epoch": 0.4454, "grad_norm": 14.5625, "grad_norm_var": 0.9785807291666667, "learning_rate": 0.0003, "loss": 11.0857, "loss/aux_loss": 0.04807347375899553, "loss/crossentropy": 2.7596997022628784, "loss/logits": 0.8404556185007095, "step": 44540 }, { "epoch": 0.4455, "grad_norm": 14.625, "grad_norm_var": 0.675634765625, "learning_rate": 0.0003, "loss": 11.1599, "loss/aux_loss": 0.048077495954930785, "loss/crossentropy": 2.838477683067322, "loss/logits": 0.8429495930671692, "step": 44550 }, { "epoch": 0.4456, "grad_norm": 14.1875, "grad_norm_var": 0.159619140625, "learning_rate": 0.0003, "loss": 11.005, "loss/aux_loss": 0.04807081557810307, "loss/crossentropy": 2.608337712287903, "loss/logits": 0.8179334878921509, "step": 44560 }, { "epoch": 0.4457, "grad_norm": 13.25, "grad_norm_var": 0.2166015625, "learning_rate": 0.0003, "loss": 11.0062, "loss/aux_loss": 0.04806958455592394, "loss/crossentropy": 2.4894912481307983, "loss/logits": 0.8116365820169449, "step": 44570 }, { "epoch": 0.4458, "grad_norm": 14.9375, "grad_norm_var": 1.2166015625, "learning_rate": 0.0003, "loss": 11.0503, "loss/aux_loss": 0.04808680806308985, "loss/crossentropy": 2.543110156059265, "loss/logits": 0.8014299184083938, "step": 44580 }, { "epoch": 0.4459, "grad_norm": 13.625, "grad_norm_var": 0.38743489583333335, "learning_rate": 0.0003, "loss": 11.2837, "loss/aux_loss": 0.048072746768593785, "loss/crossentropy": 2.6654117584228514, "loss/logits": 0.8336873948574066, "step": 44590 }, { "epoch": 0.446, "grad_norm": 14.0625, "grad_norm_var": 0.6465983072916667, "learning_rate": 0.0003, "loss": 11.1985, "loss/aux_loss": 0.04807266443967819, "loss/crossentropy": 2.6742038309574125, "loss/logits": 0.8307890117168426, "step": 44600 }, { "epoch": 0.4461, "grad_norm": 19.0, "grad_norm_var": 1.7292805989583333, "learning_rate": 0.0003, "loss": 11.1619, "loss/aux_loss": 0.04808099921792745, "loss/crossentropy": 2.602944529056549, "loss/logits": 0.8340393453836441, "step": 44610 }, { "epoch": 0.4462, "grad_norm": 14.5, "grad_norm_var": 1.643603515625, "learning_rate": 0.0003, "loss": 11.2413, "loss/aux_loss": 0.048074822127819064, "loss/crossentropy": 2.7373409271240234, "loss/logits": 0.8392162501811982, "step": 44620 }, { "epoch": 0.4463, "grad_norm": 13.875, "grad_norm_var": 0.268603515625, "learning_rate": 0.0003, "loss": 11.2579, "loss/aux_loss": 0.04809278659522533, "loss/crossentropy": 2.75430805683136, "loss/logits": 0.8329770535230636, "step": 44630 }, { "epoch": 0.4464, "grad_norm": 13.9375, "grad_norm_var": 0.543603515625, "learning_rate": 0.0003, "loss": 11.333, "loss/aux_loss": 0.04807210359722376, "loss/crossentropy": 2.8144919753074644, "loss/logits": 0.8301558136940003, "step": 44640 }, { "epoch": 0.4465, "grad_norm": 14.25, "grad_norm_var": 0.38800455729166666, "learning_rate": 0.0003, "loss": 11.0804, "loss/aux_loss": 0.048076588474214074, "loss/crossentropy": 2.5720925986766816, "loss/logits": 0.8426287531852722, "step": 44650 }, { "epoch": 0.4466, "grad_norm": 14.0625, "grad_norm_var": 0.9827962239583333, "learning_rate": 0.0003, "loss": 11.1989, "loss/aux_loss": 0.04807624667882919, "loss/crossentropy": 2.7645800590515135, "loss/logits": 0.867130133509636, "step": 44660 }, { "epoch": 0.4467, "grad_norm": 13.4375, "grad_norm_var": 0.46087239583333334, "learning_rate": 0.0003, "loss": 11.1464, "loss/aux_loss": 0.04807679317891598, "loss/crossentropy": 2.819456601142883, "loss/logits": 0.8306093007326126, "step": 44670 }, { "epoch": 0.4468, "grad_norm": 14.125, "grad_norm_var": 0.261181640625, "learning_rate": 0.0003, "loss": 11.1236, "loss/aux_loss": 0.04808004982769489, "loss/crossentropy": 2.7177935242652893, "loss/logits": 0.8448736160993576, "step": 44680 }, { "epoch": 0.4469, "grad_norm": 14.4375, "grad_norm_var": 0.390869140625, "learning_rate": 0.0003, "loss": 11.2744, "loss/aux_loss": 0.04807827845215797, "loss/crossentropy": 2.7277093112468718, "loss/logits": 0.8623002141714096, "step": 44690 }, { "epoch": 0.447, "grad_norm": 15.375, "grad_norm_var": 0.3541015625, "learning_rate": 0.0003, "loss": 11.0419, "loss/aux_loss": 0.04807532671838999, "loss/crossentropy": 2.633826696872711, "loss/logits": 0.8072617381811142, "step": 44700 }, { "epoch": 0.4471, "grad_norm": 13.0625, "grad_norm_var": 0.8296875, "learning_rate": 0.0003, "loss": 11.1652, "loss/aux_loss": 0.04807002525776625, "loss/crossentropy": 2.8819324254989622, "loss/logits": 0.8609935432672501, "step": 44710 }, { "epoch": 0.4472, "grad_norm": 12.8125, "grad_norm_var": 67.39420572916667, "learning_rate": 0.0003, "loss": 11.1658, "loss/aux_loss": 0.04808681160211563, "loss/crossentropy": 2.6865515530109407, "loss/logits": 0.8306647807359695, "step": 44720 }, { "epoch": 0.4473, "grad_norm": 15.0, "grad_norm_var": 1.3202473958333334, "learning_rate": 0.0003, "loss": 11.311, "loss/aux_loss": 0.04807041622698307, "loss/crossentropy": 2.785504710674286, "loss/logits": 0.8608017772436142, "step": 44730 }, { "epoch": 0.4474, "grad_norm": 14.25, "grad_norm_var": 0.5462076822916667, "learning_rate": 0.0003, "loss": 11.0018, "loss/aux_loss": 0.048068745993077755, "loss/crossentropy": 2.671027088165283, "loss/logits": 0.8593548953533172, "step": 44740 }, { "epoch": 0.4475, "grad_norm": 14.6875, "grad_norm_var": 0.5098795572916667, "learning_rate": 0.0003, "loss": 11.244, "loss/aux_loss": 0.0480734009295702, "loss/crossentropy": 2.8455959856510162, "loss/logits": 0.8765670835971833, "step": 44750 }, { "epoch": 0.4476, "grad_norm": 15.0625, "grad_norm_var": 0.29609375, "learning_rate": 0.0003, "loss": 11.3258, "loss/aux_loss": 0.04808003343641758, "loss/crossentropy": 2.717009627819061, "loss/logits": 0.8516732335090638, "step": 44760 }, { "epoch": 0.4477, "grad_norm": 13.875, "grad_norm_var": 0.2508951822916667, "learning_rate": 0.0003, "loss": 11.0886, "loss/aux_loss": 0.04807702694088221, "loss/crossentropy": 2.591119593381882, "loss/logits": 0.8043858855962753, "step": 44770 }, { "epoch": 0.4478, "grad_norm": 13.75, "grad_norm_var": 0.37701822916666666, "learning_rate": 0.0003, "loss": 11.2574, "loss/aux_loss": 0.04807428196072579, "loss/crossentropy": 2.826832854747772, "loss/logits": 0.8612869143486023, "step": 44780 }, { "epoch": 0.4479, "grad_norm": 14.5625, "grad_norm_var": 2.029801432291667, "learning_rate": 0.0003, "loss": 11.0258, "loss/aux_loss": 0.048076750710606575, "loss/crossentropy": 2.607262873649597, "loss/logits": 0.8060549914836883, "step": 44790 }, { "epoch": 0.448, "grad_norm": 15.0625, "grad_norm_var": 0.7778645833333333, "learning_rate": 0.0003, "loss": 11.1719, "loss/aux_loss": 0.04807757344096899, "loss/crossentropy": 2.5432204246520995, "loss/logits": 0.7854818969964981, "step": 44800 }, { "epoch": 0.4481, "grad_norm": 14.0625, "grad_norm_var": 0.18274739583333333, "learning_rate": 0.0003, "loss": 11.2529, "loss/aux_loss": 0.048074766620993616, "loss/crossentropy": 2.85523384809494, "loss/logits": 0.8906599700450897, "step": 44810 }, { "epoch": 0.4482, "grad_norm": 14.0625, "grad_norm_var": 0.38723958333333336, "learning_rate": 0.0003, "loss": 11.3863, "loss/aux_loss": 0.04806411787867546, "loss/crossentropy": 2.7882557988166807, "loss/logits": 0.8717421501874923, "step": 44820 }, { "epoch": 0.4483, "grad_norm": 14.4375, "grad_norm_var": 0.31027018229166664, "learning_rate": 0.0003, "loss": 11.1229, "loss/aux_loss": 0.04807241186499596, "loss/crossentropy": 2.737172317504883, "loss/logits": 0.8286905974149704, "step": 44830 }, { "epoch": 0.4484, "grad_norm": 13.5625, "grad_norm_var": 0.32810872395833335, "learning_rate": 0.0003, "loss": 11.3924, "loss/aux_loss": 0.048084205389022826, "loss/crossentropy": 2.8003673791885375, "loss/logits": 0.8761254161596298, "step": 44840 }, { "epoch": 0.4485, "grad_norm": 15.625, "grad_norm_var": 7.253580729166667, "learning_rate": 0.0003, "loss": 11.2739, "loss/aux_loss": 0.04808361791074276, "loss/crossentropy": 2.7707901895046234, "loss/logits": 0.8379460781812668, "step": 44850 }, { "epoch": 0.4486, "grad_norm": 13.75, "grad_norm_var": 7.713134765625, "learning_rate": 0.0003, "loss": 11.1027, "loss/aux_loss": 0.04806753098964691, "loss/crossentropy": 2.857259654998779, "loss/logits": 0.8505940139293671, "step": 44860 }, { "epoch": 0.4487, "grad_norm": 14.5625, "grad_norm_var": 0.3846354166666667, "learning_rate": 0.0003, "loss": 11.0449, "loss/aux_loss": 0.048075702600181104, "loss/crossentropy": 2.698404437303543, "loss/logits": 0.817566591501236, "step": 44870 }, { "epoch": 0.4488, "grad_norm": 13.4375, "grad_norm_var": 0.4984212239583333, "learning_rate": 0.0003, "loss": 11.0498, "loss/aux_loss": 0.04807261247187853, "loss/crossentropy": 2.7336514472961424, "loss/logits": 0.8175310790538788, "step": 44880 }, { "epoch": 0.4489, "grad_norm": 13.5625, "grad_norm_var": 0.32493489583333335, "learning_rate": 0.0003, "loss": 11.2769, "loss/aux_loss": 0.0480836022645235, "loss/crossentropy": 2.6234305024147035, "loss/logits": 0.7813559800386429, "step": 44890 }, { "epoch": 0.449, "grad_norm": 13.8125, "grad_norm_var": 3.256103515625, "learning_rate": 0.0003, "loss": 11.2918, "loss/aux_loss": 0.048073142766952515, "loss/crossentropy": 2.686808633804321, "loss/logits": 0.831533208489418, "step": 44900 }, { "epoch": 0.4491, "grad_norm": 13.75, "grad_norm_var": 0.213525390625, "learning_rate": 0.0003, "loss": 11.2277, "loss/aux_loss": 0.0480730053037405, "loss/crossentropy": 2.5610205233097076, "loss/logits": 0.8396010220050811, "step": 44910 }, { "epoch": 0.4492, "grad_norm": 15.1875, "grad_norm_var": 0.4244140625, "learning_rate": 0.0003, "loss": 11.3523, "loss/aux_loss": 0.04808488227427006, "loss/crossentropy": 2.7543952822685243, "loss/logits": 0.8435162544250489, "step": 44920 }, { "epoch": 0.4493, "grad_norm": 13.5, "grad_norm_var": 0.6640625, "learning_rate": 0.0003, "loss": 11.066, "loss/aux_loss": 0.04807263296097517, "loss/crossentropy": 2.755419361591339, "loss/logits": 0.8490778416395187, "step": 44930 }, { "epoch": 0.4494, "grad_norm": 13.875, "grad_norm_var": 0.21053059895833334, "learning_rate": 0.0003, "loss": 11.2204, "loss/aux_loss": 0.048071037791669366, "loss/crossentropy": 2.6401973962783813, "loss/logits": 0.8432885766029358, "step": 44940 }, { "epoch": 0.4495, "grad_norm": 14.8125, "grad_norm_var": 0.7831868489583333, "learning_rate": 0.0003, "loss": 11.0892, "loss/aux_loss": 0.04807609617710114, "loss/crossentropy": 2.685221529006958, "loss/logits": 0.8308875828981399, "step": 44950 }, { "epoch": 0.4496, "grad_norm": 13.1875, "grad_norm_var": 0.596337890625, "learning_rate": 0.0003, "loss": 11.2131, "loss/aux_loss": 0.048079296760261056, "loss/crossentropy": 2.694732528924942, "loss/logits": 0.8388465225696564, "step": 44960 }, { "epoch": 0.4497, "grad_norm": 13.625, "grad_norm_var": 0.5374348958333334, "learning_rate": 0.0003, "loss": 11.0489, "loss/aux_loss": 0.048061837814748286, "loss/crossentropy": 2.737008786201477, "loss/logits": 0.850694689154625, "step": 44970 }, { "epoch": 0.4498, "grad_norm": 13.6875, "grad_norm_var": 0.3843587239583333, "learning_rate": 0.0003, "loss": 11.1391, "loss/aux_loss": 0.048087633959949014, "loss/crossentropy": 2.8430655121803285, "loss/logits": 0.8736658453941345, "step": 44980 }, { "epoch": 0.4499, "grad_norm": 13.6875, "grad_norm_var": 0.2945149739583333, "learning_rate": 0.0003, "loss": 11.1745, "loss/aux_loss": 0.04807091951370239, "loss/crossentropy": 2.856264519691467, "loss/logits": 0.8631505787372589, "step": 44990 }, { "epoch": 0.45, "grad_norm": 14.4375, "grad_norm_var": 0.34576822916666666, "learning_rate": 0.0003, "loss": 11.1792, "loss/aux_loss": 0.048074228875339034, "loss/crossentropy": 2.7000016987323763, "loss/logits": 0.8304085314273835, "step": 45000 }, { "epoch": 0.4501, "grad_norm": 19.375, "grad_norm_var": 1.9964680989583334, "learning_rate": 0.0003, "loss": 11.201, "loss/aux_loss": 0.04807187356054783, "loss/crossentropy": 2.5905265331268312, "loss/logits": 0.8301917672157287, "step": 45010 }, { "epoch": 0.4502, "grad_norm": 14.5625, "grad_norm_var": 1.995166015625, "learning_rate": 0.0003, "loss": 11.3012, "loss/aux_loss": 0.048079838044941425, "loss/crossentropy": 2.655366039276123, "loss/logits": 0.8250725924968719, "step": 45020 }, { "epoch": 0.4503, "grad_norm": 13.9375, "grad_norm_var": 0.453125, "learning_rate": 0.0003, "loss": 11.0779, "loss/aux_loss": 0.048072745092213154, "loss/crossentropy": 2.873497819900513, "loss/logits": 0.8643653631210327, "step": 45030 }, { "epoch": 0.4504, "grad_norm": 13.875, "grad_norm_var": 0.2747395833333333, "learning_rate": 0.0003, "loss": 11.0926, "loss/aux_loss": 0.04807302486151457, "loss/crossentropy": 2.816511571407318, "loss/logits": 0.8430682748556138, "step": 45040 }, { "epoch": 0.4505, "grad_norm": 15.0, "grad_norm_var": 9.468994140625, "learning_rate": 0.0003, "loss": 11.2874, "loss/aux_loss": 0.0480752307921648, "loss/crossentropy": 2.796900761127472, "loss/logits": 0.8639124810695649, "step": 45050 }, { "epoch": 0.4506, "grad_norm": 15.4375, "grad_norm_var": 0.5764973958333334, "learning_rate": 0.0003, "loss": 11.2776, "loss/aux_loss": 0.048078755289316176, "loss/crossentropy": 2.770525109767914, "loss/logits": 0.8221762269735337, "step": 45060 }, { "epoch": 0.4507, "grad_norm": 14.0625, "grad_norm_var": 0.5235514322916667, "learning_rate": 0.0003, "loss": 11.0756, "loss/aux_loss": 0.048073511384427545, "loss/crossentropy": 2.642447865009308, "loss/logits": 0.823766753077507, "step": 45070 }, { "epoch": 0.4508, "grad_norm": 14.25, "grad_norm_var": 0.667822265625, "learning_rate": 0.0003, "loss": 11.2534, "loss/aux_loss": 0.04807865601032972, "loss/crossentropy": 2.773304843902588, "loss/logits": 0.8674245417118073, "step": 45080 }, { "epoch": 0.4509, "grad_norm": 14.0625, "grad_norm_var": 1.1044108072916667, "learning_rate": 0.0003, "loss": 10.9283, "loss/aux_loss": 0.04806472901254892, "loss/crossentropy": 2.545005625486374, "loss/logits": 0.7938053220510483, "step": 45090 }, { "epoch": 0.451, "grad_norm": 14.625, "grad_norm_var": 0.41287434895833336, "learning_rate": 0.0003, "loss": 10.9847, "loss/aux_loss": 0.04808530602604151, "loss/crossentropy": 2.6405605256557463, "loss/logits": 0.8240345329046249, "step": 45100 }, { "epoch": 0.4511, "grad_norm": 14.1875, "grad_norm_var": 0.48605143229166664, "learning_rate": 0.0003, "loss": 11.2396, "loss/aux_loss": 0.04806354120373726, "loss/crossentropy": 2.7839693784713746, "loss/logits": 0.8868257701396942, "step": 45110 }, { "epoch": 0.4512, "grad_norm": 16.875, "grad_norm_var": 0.9191243489583333, "learning_rate": 0.0003, "loss": 11.0655, "loss/aux_loss": 0.048075102269649506, "loss/crossentropy": 2.752034366130829, "loss/logits": 0.840661883354187, "step": 45120 }, { "epoch": 0.4513, "grad_norm": 16.25, "grad_norm_var": 1.6202473958333334, "learning_rate": 0.0003, "loss": 11.2822, "loss/aux_loss": 0.048065843246877196, "loss/crossentropy": 2.7595421195030214, "loss/logits": 0.8664416402578354, "step": 45130 }, { "epoch": 0.4514, "grad_norm": 14.5625, "grad_norm_var": 0.522509765625, "learning_rate": 0.0003, "loss": 11.1742, "loss/aux_loss": 0.04807279203087091, "loss/crossentropy": 2.7141676127910612, "loss/logits": 0.8610961318016053, "step": 45140 }, { "epoch": 0.4515, "grad_norm": 15.8125, "grad_norm_var": 0.7348307291666667, "learning_rate": 0.0003, "loss": 11.1127, "loss/aux_loss": 0.04807740245014429, "loss/crossentropy": 2.674371284246445, "loss/logits": 0.8470130562782288, "step": 45150 }, { "epoch": 0.4516, "grad_norm": 13.25, "grad_norm_var": 5.373958333333333, "learning_rate": 0.0003, "loss": 11.1149, "loss/aux_loss": 0.04808497317135334, "loss/crossentropy": 2.658644849061966, "loss/logits": 0.8162630528211594, "step": 45160 }, { "epoch": 0.4517, "grad_norm": 14.75, "grad_norm_var": 4.355322265625, "learning_rate": 0.0003, "loss": 11.1443, "loss/aux_loss": 0.048073196038603785, "loss/crossentropy": 2.693699061870575, "loss/logits": 0.8589479506015778, "step": 45170 }, { "epoch": 0.4518, "grad_norm": 16.875, "grad_norm_var": 0.7890625, "learning_rate": 0.0003, "loss": 11.2395, "loss/aux_loss": 0.04807229600846767, "loss/crossentropy": 2.809789764881134, "loss/logits": 0.8285229980945588, "step": 45180 }, { "epoch": 0.4519, "grad_norm": 14.0625, "grad_norm_var": 0.75, "learning_rate": 0.0003, "loss": 10.9856, "loss/aux_loss": 0.048070633225142954, "loss/crossentropy": 2.75104238986969, "loss/logits": 0.8169887810945511, "step": 45190 }, { "epoch": 0.452, "grad_norm": 14.0, "grad_norm_var": 0.47239583333333335, "learning_rate": 0.0003, "loss": 11.2444, "loss/aux_loss": 0.04806809239089489, "loss/crossentropy": 2.7128111243247988, "loss/logits": 0.8189259111881256, "step": 45200 }, { "epoch": 0.4521, "grad_norm": 14.0625, "grad_norm_var": 0.6692057291666667, "learning_rate": 0.0003, "loss": 10.9503, "loss/aux_loss": 0.0480776846408844, "loss/crossentropy": 2.7245679616928102, "loss/logits": 0.8289970546960831, "step": 45210 }, { "epoch": 0.4522, "grad_norm": 13.3125, "grad_norm_var": 0.5874348958333333, "learning_rate": 0.0003, "loss": 11.094, "loss/aux_loss": 0.04808888658881187, "loss/crossentropy": 2.482409542798996, "loss/logits": 0.8053638786077499, "step": 45220 }, { "epoch": 0.4523, "grad_norm": 13.875, "grad_norm_var": 0.5699055989583334, "learning_rate": 0.0003, "loss": 11.1009, "loss/aux_loss": 0.04806402549147606, "loss/crossentropy": 2.8999637961387634, "loss/logits": 0.8342925250530243, "step": 45230 }, { "epoch": 0.4524, "grad_norm": 13.0, "grad_norm_var": 0.3544108072916667, "learning_rate": 0.0003, "loss": 11.1598, "loss/aux_loss": 0.04807737078517675, "loss/crossentropy": 2.8522120237350466, "loss/logits": 0.8593276113271713, "step": 45240 }, { "epoch": 0.4525, "grad_norm": 14.875, "grad_norm_var": 0.5133951822916667, "learning_rate": 0.0003, "loss": 10.9334, "loss/aux_loss": 0.048071842454373834, "loss/crossentropy": 2.540039598941803, "loss/logits": 0.8037934333086014, "step": 45250 }, { "epoch": 0.4526, "grad_norm": 14.625, "grad_norm_var": 0.62109375, "learning_rate": 0.0003, "loss": 11.1593, "loss/aux_loss": 0.04807714056223631, "loss/crossentropy": 2.7473401188850404, "loss/logits": 0.8323444128036499, "step": 45260 }, { "epoch": 0.4527, "grad_norm": 13.375, "grad_norm_var": 0.5430826822916667, "learning_rate": 0.0003, "loss": 11.1556, "loss/aux_loss": 0.048075577989220616, "loss/crossentropy": 2.639462560415268, "loss/logits": 0.8283006697893143, "step": 45270 }, { "epoch": 0.4528, "grad_norm": 14.5, "grad_norm_var": 0.459619140625, "learning_rate": 0.0003, "loss": 11.0925, "loss/aux_loss": 0.04807265214622021, "loss/crossentropy": 2.733612394332886, "loss/logits": 0.8260679453611374, "step": 45280 }, { "epoch": 0.4529, "grad_norm": 15.25, "grad_norm_var": 0.3228515625, "learning_rate": 0.0003, "loss": 11.2664, "loss/aux_loss": 0.04807320795953274, "loss/crossentropy": 2.817549741268158, "loss/logits": 0.872169628739357, "step": 45290 }, { "epoch": 0.453, "grad_norm": 14.625, "grad_norm_var": 1.0290201822916667, "learning_rate": 0.0003, "loss": 11.2883, "loss/aux_loss": 0.04808189887553453, "loss/crossentropy": 2.7291213452816008, "loss/logits": 0.8284125924110413, "step": 45300 }, { "epoch": 0.4531, "grad_norm": 14.25, "grad_norm_var": 0.38619791666666664, "learning_rate": 0.0003, "loss": 11.3393, "loss/aux_loss": 0.04806639589369297, "loss/crossentropy": 2.888676828145981, "loss/logits": 0.8628419786691666, "step": 45310 }, { "epoch": 0.4532, "grad_norm": 16.125, "grad_norm_var": 30.763395182291667, "learning_rate": 0.0003, "loss": 11.3253, "loss/aux_loss": 0.04808086268603802, "loss/crossentropy": 2.8755642414093017, "loss/logits": 0.8577650129795075, "step": 45320 }, { "epoch": 0.4533, "grad_norm": 14.625, "grad_norm_var": 26.501936848958334, "learning_rate": 0.0003, "loss": 11.3604, "loss/aux_loss": 0.04807670023292303, "loss/crossentropy": 2.795285141468048, "loss/logits": 0.8813230514526367, "step": 45330 }, { "epoch": 0.4534, "grad_norm": 14.1875, "grad_norm_var": 8.334830729166667, "learning_rate": 0.0003, "loss": 10.9839, "loss/aux_loss": 0.048065191879868505, "loss/crossentropy": 2.9003730535507204, "loss/logits": 0.839951154589653, "step": 45340 }, { "epoch": 0.4535, "grad_norm": 13.25, "grad_norm_var": 0.870166015625, "learning_rate": 0.0003, "loss": 11.0878, "loss/aux_loss": 0.048059961013495925, "loss/crossentropy": 2.8457574963569643, "loss/logits": 0.8674950510263443, "step": 45350 }, { "epoch": 0.4536, "grad_norm": 14.8125, "grad_norm_var": 2.363264973958333, "learning_rate": 0.0003, "loss": 11.1738, "loss/aux_loss": 0.04807946924120188, "loss/crossentropy": 2.8464213252067565, "loss/logits": 0.8665098369121551, "step": 45360 }, { "epoch": 0.4537, "grad_norm": 13.25, "grad_norm_var": 2.085400390625, "learning_rate": 0.0003, "loss": 11.1825, "loss/aux_loss": 0.04806684292852879, "loss/crossentropy": 2.7150739192962647, "loss/logits": 0.8413305938243866, "step": 45370 }, { "epoch": 0.4538, "grad_norm": 13.3125, "grad_norm_var": 1.294775390625, "learning_rate": 0.0003, "loss": 11.2586, "loss/aux_loss": 0.048075356893241404, "loss/crossentropy": 2.809218281507492, "loss/logits": 0.8536065101623536, "step": 45380 }, { "epoch": 0.4539, "grad_norm": 13.8125, "grad_norm_var": 1.880712890625, "learning_rate": 0.0003, "loss": 11.1555, "loss/aux_loss": 0.04809560999274254, "loss/crossentropy": 2.602170443534851, "loss/logits": 0.8421902984380722, "step": 45390 }, { "epoch": 0.454, "grad_norm": 13.25, "grad_norm_var": 2.2202962239583335, "learning_rate": 0.0003, "loss": 11.1732, "loss/aux_loss": 0.04805552512407303, "loss/crossentropy": 2.8673832774162293, "loss/logits": 0.8450622230768203, "step": 45400 }, { "epoch": 0.4541, "grad_norm": 13.75, "grad_norm_var": 0.9009765625, "learning_rate": 0.0003, "loss": 11.023, "loss/aux_loss": 0.04807879459112883, "loss/crossentropy": 2.6993161380290984, "loss/logits": 0.8065011203289032, "step": 45410 }, { "epoch": 0.4542, "grad_norm": 13.5, "grad_norm_var": 0.83046875, "learning_rate": 0.0003, "loss": 11.0644, "loss/aux_loss": 0.04806725066155195, "loss/crossentropy": 2.628107964992523, "loss/logits": 0.8442522406578064, "step": 45420 }, { "epoch": 0.4543, "grad_norm": 14.0, "grad_norm_var": 0.7358723958333333, "learning_rate": 0.0003, "loss": 11.1131, "loss/aux_loss": 0.04807357750833034, "loss/crossentropy": 2.7379313945770263, "loss/logits": 0.8388034462928772, "step": 45430 }, { "epoch": 0.4544, "grad_norm": 13.8125, "grad_norm_var": 0.3941243489583333, "learning_rate": 0.0003, "loss": 11.2006, "loss/aux_loss": 0.04807491805404425, "loss/crossentropy": 2.750547635555267, "loss/logits": 0.8313853859901428, "step": 45440 }, { "epoch": 0.4545, "grad_norm": 13.875, "grad_norm_var": 0.5580729166666667, "learning_rate": 0.0003, "loss": 11.2105, "loss/aux_loss": 0.04807582795619965, "loss/crossentropy": 2.771452808380127, "loss/logits": 0.8088362455368042, "step": 45450 }, { "epoch": 0.4546, "grad_norm": 13.6875, "grad_norm_var": 0.7884765625, "learning_rate": 0.0003, "loss": 11.1884, "loss/aux_loss": 0.04807773567736149, "loss/crossentropy": 2.833453130722046, "loss/logits": 0.8409688085317611, "step": 45460 }, { "epoch": 0.4547, "grad_norm": 16.875, "grad_norm_var": 1.33984375, "learning_rate": 0.0003, "loss": 11.1772, "loss/aux_loss": 0.04806816857308149, "loss/crossentropy": 2.5675463676452637, "loss/logits": 0.8408284574747086, "step": 45470 }, { "epoch": 0.4548, "grad_norm": 13.3125, "grad_norm_var": 1.1582682291666666, "learning_rate": 0.0003, "loss": 11.3571, "loss/aux_loss": 0.04808399137109518, "loss/crossentropy": 2.719745373725891, "loss/logits": 0.8130148202180862, "step": 45480 }, { "epoch": 0.4549, "grad_norm": 14.8125, "grad_norm_var": 0.7718587239583333, "learning_rate": 0.0003, "loss": 11.2929, "loss/aux_loss": 0.048072703368961814, "loss/crossentropy": 2.8237855315208433, "loss/logits": 0.886102220416069, "step": 45490 }, { "epoch": 0.455, "grad_norm": 14.75, "grad_norm_var": 0.449072265625, "learning_rate": 0.0003, "loss": 11.0946, "loss/aux_loss": 0.04806439485400915, "loss/crossentropy": 2.6629326224327086, "loss/logits": 0.8396694749593735, "step": 45500 }, { "epoch": 0.4551, "grad_norm": 14.125, "grad_norm_var": 0.4596354166666667, "learning_rate": 0.0003, "loss": 11.3091, "loss/aux_loss": 0.04807718340307474, "loss/crossentropy": 2.878407192230225, "loss/logits": 0.8687193512916564, "step": 45510 }, { "epoch": 0.4552, "grad_norm": 13.375, "grad_norm_var": 0.4332682291666667, "learning_rate": 0.0003, "loss": 11.1584, "loss/aux_loss": 0.04807638432830572, "loss/crossentropy": 2.7596149682998656, "loss/logits": 0.8342228949069976, "step": 45520 }, { "epoch": 0.4553, "grad_norm": 13.5625, "grad_norm_var": 0.2872395833333333, "learning_rate": 0.0003, "loss": 11.2693, "loss/aux_loss": 0.04807369913905859, "loss/crossentropy": 3.037293183803558, "loss/logits": 0.8476502895355225, "step": 45530 }, { "epoch": 0.4554, "grad_norm": 14.375, "grad_norm_var": 0.8960774739583334, "learning_rate": 0.0003, "loss": 10.9162, "loss/aux_loss": 0.04806880354881286, "loss/crossentropy": 2.5978680908679963, "loss/logits": 0.8057895511388778, "step": 45540 }, { "epoch": 0.4555, "grad_norm": 14.5625, "grad_norm_var": 0.954931640625, "learning_rate": 0.0003, "loss": 11.0929, "loss/aux_loss": 0.04807190727442503, "loss/crossentropy": 2.7636757493019104, "loss/logits": 0.8655385166406632, "step": 45550 }, { "epoch": 0.4556, "grad_norm": 14.875, "grad_norm_var": 0.3963541666666667, "learning_rate": 0.0003, "loss": 11.1835, "loss/aux_loss": 0.048070300556719306, "loss/crossentropy": 2.5805073499679567, "loss/logits": 0.846220064163208, "step": 45560 }, { "epoch": 0.4557, "grad_norm": 13.875, "grad_norm_var": 0.7149576822916667, "learning_rate": 0.0003, "loss": 11.0291, "loss/aux_loss": 0.04808596391230822, "loss/crossentropy": 2.6298948764801025, "loss/logits": 0.8278081536293029, "step": 45570 }, { "epoch": 0.4558, "grad_norm": 13.4375, "grad_norm_var": 0.8462890625, "learning_rate": 0.0003, "loss": 11.0205, "loss/aux_loss": 0.04807253833860159, "loss/crossentropy": 2.6736935675144196, "loss/logits": 0.8378624528646469, "step": 45580 }, { "epoch": 0.4559, "grad_norm": 13.9375, "grad_norm_var": 0.6683430989583333, "learning_rate": 0.0003, "loss": 11.2032, "loss/aux_loss": 0.04806935098022223, "loss/crossentropy": 2.6951875925064086, "loss/logits": 0.818251371383667, "step": 45590 }, { "epoch": 0.456, "grad_norm": 14.3125, "grad_norm_var": 0.2919108072916667, "learning_rate": 0.0003, "loss": 11.2407, "loss/aux_loss": 0.04809526577591896, "loss/crossentropy": 2.7838382720947266, "loss/logits": 0.8391630411148071, "step": 45600 }, { "epoch": 0.4561, "grad_norm": 14.5625, "grad_norm_var": 0.6301432291666667, "learning_rate": 0.0003, "loss": 11.1217, "loss/aux_loss": 0.048053614981472495, "loss/crossentropy": 2.632568824291229, "loss/logits": 0.845693039894104, "step": 45610 }, { "epoch": 0.4562, "grad_norm": 15.8125, "grad_norm_var": 0.6426920572916667, "learning_rate": 0.0003, "loss": 11.1653, "loss/aux_loss": 0.048069927655160424, "loss/crossentropy": 2.8206692337989807, "loss/logits": 0.8522645890712738, "step": 45620 }, { "epoch": 0.4563, "grad_norm": 14.75, "grad_norm_var": 0.8895833333333333, "learning_rate": 0.0003, "loss": 11.037, "loss/aux_loss": 0.048098363913595676, "loss/crossentropy": 2.769987952709198, "loss/logits": 0.8345672219991684, "step": 45630 }, { "epoch": 0.4564, "grad_norm": 16.625, "grad_norm_var": 0.988525390625, "learning_rate": 0.0003, "loss": 11.143, "loss/aux_loss": 0.04806884527206421, "loss/crossentropy": 2.755461257696152, "loss/logits": 0.8266684681177139, "step": 45640 }, { "epoch": 0.4565, "grad_norm": 13.8125, "grad_norm_var": 1.1817708333333334, "learning_rate": 0.0003, "loss": 11.2068, "loss/aux_loss": 0.04807171430438757, "loss/crossentropy": 2.7919964730739593, "loss/logits": 0.8207480728626251, "step": 45650 }, { "epoch": 0.4566, "grad_norm": 13.125, "grad_norm_var": 0.7269368489583333, "learning_rate": 0.0003, "loss": 11.0707, "loss/aux_loss": 0.04808006528764963, "loss/crossentropy": 2.6854530811309814, "loss/logits": 0.8648782402276993, "step": 45660 }, { "epoch": 0.4567, "grad_norm": 13.6875, "grad_norm_var": 0.7372395833333333, "learning_rate": 0.0003, "loss": 11.2434, "loss/aux_loss": 0.04807576704770326, "loss/crossentropy": 2.7998494148254394, "loss/logits": 0.812621483206749, "step": 45670 }, { "epoch": 0.4568, "grad_norm": 16.625, "grad_norm_var": 0.718603515625, "learning_rate": 0.0003, "loss": 11.2177, "loss/aux_loss": 0.04807013794779778, "loss/crossentropy": 2.7176717042922975, "loss/logits": 0.8751024842262268, "step": 45680 }, { "epoch": 0.4569, "grad_norm": 14.25, "grad_norm_var": 0.5817057291666666, "learning_rate": 0.0003, "loss": 11.094, "loss/aux_loss": 0.0480797715485096, "loss/crossentropy": 2.630698436498642, "loss/logits": 0.8169205486774445, "step": 45690 }, { "epoch": 0.457, "grad_norm": 15.0, "grad_norm_var": 0.6380208333333334, "learning_rate": 0.0003, "loss": 11.2579, "loss/aux_loss": 0.048068609088659286, "loss/crossentropy": 2.7162768125534056, "loss/logits": 0.8566293030977249, "step": 45700 }, { "epoch": 0.4571, "grad_norm": 14.1875, "grad_norm_var": 0.6869791666666667, "learning_rate": 0.0003, "loss": 11.0956, "loss/aux_loss": 0.048069410026073456, "loss/crossentropy": 2.666369599103928, "loss/logits": 0.846130108833313, "step": 45710 }, { "epoch": 0.4572, "grad_norm": 14.875, "grad_norm_var": 0.4239420572916667, "learning_rate": 0.0003, "loss": 11.3907, "loss/aux_loss": 0.04808373041450977, "loss/crossentropy": 2.7648482978343965, "loss/logits": 0.8433201760053635, "step": 45720 }, { "epoch": 0.4573, "grad_norm": 14.3125, "grad_norm_var": 0.3359375, "learning_rate": 0.0003, "loss": 10.9668, "loss/aux_loss": 0.04806965310126543, "loss/crossentropy": 2.628385591506958, "loss/logits": 0.8232957303524018, "step": 45730 }, { "epoch": 0.4574, "grad_norm": 13.5625, "grad_norm_var": 0.17526041666666667, "learning_rate": 0.0003, "loss": 11.028, "loss/aux_loss": 0.0480685856193304, "loss/crossentropy": 2.8586939454078673, "loss/logits": 0.8752603858709336, "step": 45740 }, { "epoch": 0.4575, "grad_norm": 14.1875, "grad_norm_var": 1.039697265625, "learning_rate": 0.0003, "loss": 11.2328, "loss/aux_loss": 0.04807407390326261, "loss/crossentropy": 2.7650853276252745, "loss/logits": 0.8581269145011902, "step": 45750 }, { "epoch": 0.4576, "grad_norm": 14.25, "grad_norm_var": 1.2577962239583333, "learning_rate": 0.0003, "loss": 11.2529, "loss/aux_loss": 0.04806558396667242, "loss/crossentropy": 2.670504766702652, "loss/logits": 0.8334173530340194, "step": 45760 }, { "epoch": 0.4577, "grad_norm": 13.25, "grad_norm_var": 0.2718098958333333, "learning_rate": 0.0003, "loss": 10.9535, "loss/aux_loss": 0.04807530362159014, "loss/crossentropy": 2.6863086402416227, "loss/logits": 0.7867847800254821, "step": 45770 }, { "epoch": 0.4578, "grad_norm": 14.75, "grad_norm_var": 0.29739583333333336, "learning_rate": 0.0003, "loss": 11.0572, "loss/aux_loss": 0.04807478487491608, "loss/crossentropy": 2.7828167259693144, "loss/logits": 0.8462360620498657, "step": 45780 }, { "epoch": 0.4579, "grad_norm": 14.25, "grad_norm_var": 0.5660807291666666, "learning_rate": 0.0003, "loss": 11.212, "loss/aux_loss": 0.04808699581772089, "loss/crossentropy": 2.7481481969356536, "loss/logits": 0.8268178194761276, "step": 45790 }, { "epoch": 0.458, "grad_norm": 15.4375, "grad_norm_var": 0.563525390625, "learning_rate": 0.0003, "loss": 11.1529, "loss/aux_loss": 0.0480684332549572, "loss/crossentropy": 2.6542268633842467, "loss/logits": 0.8398558348417282, "step": 45800 }, { "epoch": 0.4581, "grad_norm": 14.5625, "grad_norm_var": 0.570556640625, "learning_rate": 0.0003, "loss": 11.0782, "loss/aux_loss": 0.04807002916932106, "loss/crossentropy": 2.724322813749313, "loss/logits": 0.8166536599397659, "step": 45810 }, { "epoch": 0.4582, "grad_norm": 15.25, "grad_norm_var": 0.9817545572916667, "learning_rate": 0.0003, "loss": 11.1127, "loss/aux_loss": 0.04808499440550804, "loss/crossentropy": 2.623278909921646, "loss/logits": 0.8362865537405014, "step": 45820 }, { "epoch": 0.4583, "grad_norm": 14.875, "grad_norm_var": 1.3544270833333334, "learning_rate": 0.0003, "loss": 11.1638, "loss/aux_loss": 0.048072373308241365, "loss/crossentropy": 2.7822245001792907, "loss/logits": 0.8362233757972717, "step": 45830 }, { "epoch": 0.4584, "grad_norm": 15.25, "grad_norm_var": 0.7874837239583333, "learning_rate": 0.0003, "loss": 11.0392, "loss/aux_loss": 0.04805925581604242, "loss/crossentropy": 2.83905189037323, "loss/logits": 0.8411778301000595, "step": 45840 }, { "epoch": 0.4585, "grad_norm": 14.0, "grad_norm_var": 0.319384765625, "learning_rate": 0.0003, "loss": 11.136, "loss/aux_loss": 0.048078327998518945, "loss/crossentropy": 2.6659455597400665, "loss/logits": 0.8175705790519714, "step": 45850 }, { "epoch": 0.4586, "grad_norm": 14.0625, "grad_norm_var": 0.1556640625, "learning_rate": 0.0003, "loss": 11.1928, "loss/aux_loss": 0.04805846642702818, "loss/crossentropy": 2.7064037203788756, "loss/logits": 0.8587689280509949, "step": 45860 }, { "epoch": 0.4587, "grad_norm": 14.8125, "grad_norm_var": 0.6977701822916667, "learning_rate": 0.0003, "loss": 11.2205, "loss/aux_loss": 0.04808104075491428, "loss/crossentropy": 2.6719759106636047, "loss/logits": 0.8492055386304855, "step": 45870 }, { "epoch": 0.4588, "grad_norm": 14.1875, "grad_norm_var": 68.06521809895834, "learning_rate": 0.0003, "loss": 11.3008, "loss/aux_loss": 0.048071546480059624, "loss/crossentropy": 2.944806432723999, "loss/logits": 0.8660283535718918, "step": 45880 }, { "epoch": 0.4589, "grad_norm": 13.1875, "grad_norm_var": 0.8034993489583333, "learning_rate": 0.0003, "loss": 11.1377, "loss/aux_loss": 0.04807901885360479, "loss/crossentropy": 2.7466224670410155, "loss/logits": 0.8269301950931549, "step": 45890 }, { "epoch": 0.459, "grad_norm": 14.5, "grad_norm_var": 1.1934733072916666, "learning_rate": 0.0003, "loss": 10.9827, "loss/aux_loss": 0.04806364104151726, "loss/crossentropy": 2.5919273018836977, "loss/logits": 0.8305024951696396, "step": 45900 }, { "epoch": 0.4591, "grad_norm": 14.75, "grad_norm_var": 0.9688639322916667, "learning_rate": 0.0003, "loss": 11.1109, "loss/aux_loss": 0.04807860106229782, "loss/crossentropy": 2.6946555733680726, "loss/logits": 0.8430137366056443, "step": 45910 }, { "epoch": 0.4592, "grad_norm": 13.8125, "grad_norm_var": 0.5634765625, "learning_rate": 0.0003, "loss": 10.9809, "loss/aux_loss": 0.048061893321573734, "loss/crossentropy": 2.7854873657226564, "loss/logits": 0.8397706598043442, "step": 45920 }, { "epoch": 0.4593, "grad_norm": 14.3125, "grad_norm_var": 0.615869140625, "learning_rate": 0.0003, "loss": 11.193, "loss/aux_loss": 0.04807352740317583, "loss/crossentropy": 2.7639912247657774, "loss/logits": 0.8295485734939575, "step": 45930 }, { "epoch": 0.4594, "grad_norm": 14.3125, "grad_norm_var": 0.585400390625, "learning_rate": 0.0003, "loss": 11.0833, "loss/aux_loss": 0.04807315729558468, "loss/crossentropy": 2.7072408556938172, "loss/logits": 0.83205626308918, "step": 45940 }, { "epoch": 0.4595, "grad_norm": 13.375, "grad_norm_var": 0.324462890625, "learning_rate": 0.0003, "loss": 11.1237, "loss/aux_loss": 0.04807840902358294, "loss/crossentropy": 2.775273883342743, "loss/logits": 0.8464349508285522, "step": 45950 }, { "epoch": 0.4596, "grad_norm": 14.3125, "grad_norm_var": 0.7369791666666666, "learning_rate": 0.0003, "loss": 11.1625, "loss/aux_loss": 0.048066251538693906, "loss/crossentropy": 2.761694145202637, "loss/logits": 0.8378835052251816, "step": 45960 }, { "epoch": 0.4597, "grad_norm": 13.875, "grad_norm_var": 0.39453125, "learning_rate": 0.0003, "loss": 11.16, "loss/aux_loss": 0.048080692254006865, "loss/crossentropy": 2.6108572721481322, "loss/logits": 0.8397493064403534, "step": 45970 }, { "epoch": 0.4598, "grad_norm": 14.25, "grad_norm_var": 0.30572916666666666, "learning_rate": 0.0003, "loss": 11.1456, "loss/aux_loss": 0.04807419683784246, "loss/crossentropy": 2.7630624175071716, "loss/logits": 0.8382513612508774, "step": 45980 }, { "epoch": 0.4599, "grad_norm": 13.875, "grad_norm_var": 0.295556640625, "learning_rate": 0.0003, "loss": 11.0327, "loss/aux_loss": 0.048076769709587096, "loss/crossentropy": 2.7021873712539675, "loss/logits": 0.8546818345785141, "step": 45990 }, { "epoch": 0.46, "grad_norm": 14.4375, "grad_norm_var": 0.33932291666666664, "learning_rate": 0.0003, "loss": 11.1386, "loss/aux_loss": 0.04808690138161183, "loss/crossentropy": 2.621007192134857, "loss/logits": 0.8594667464494705, "step": 46000 }, { "epoch": 0.4601, "grad_norm": 17.5, "grad_norm_var": 0.825, "learning_rate": 0.0003, "loss": 11.2569, "loss/aux_loss": 0.04806713555008173, "loss/crossentropy": 2.6976438403129577, "loss/logits": 0.8213419556617737, "step": 46010 }, { "epoch": 0.4602, "grad_norm": 13.875, "grad_norm_var": 1.0632649739583333, "learning_rate": 0.0003, "loss": 11.113, "loss/aux_loss": 0.048088106140494344, "loss/crossentropy": 2.520746982097626, "loss/logits": 0.8219372004270553, "step": 46020 }, { "epoch": 0.4603, "grad_norm": 15.9375, "grad_norm_var": 0.590478515625, "learning_rate": 0.0003, "loss": 11.2881, "loss/aux_loss": 0.048078617081046104, "loss/crossentropy": 2.840721619129181, "loss/logits": 0.8715362250804901, "step": 46030 }, { "epoch": 0.4604, "grad_norm": 13.4375, "grad_norm_var": 0.7445149739583333, "learning_rate": 0.0003, "loss": 10.9794, "loss/aux_loss": 0.048061452060937884, "loss/crossentropy": 2.6260744273662566, "loss/logits": 0.8213737875223159, "step": 46040 }, { "epoch": 0.4605, "grad_norm": 14.0625, "grad_norm_var": 0.2738118489583333, "learning_rate": 0.0003, "loss": 11.3335, "loss/aux_loss": 0.048074982687830926, "loss/crossentropy": 2.783993864059448, "loss/logits": 0.8809378027915955, "step": 46050 }, { "epoch": 0.4606, "grad_norm": 14.0625, "grad_norm_var": 1.7400390625, "learning_rate": 0.0003, "loss": 11.1385, "loss/aux_loss": 0.0480780715122819, "loss/crossentropy": 2.767115068435669, "loss/logits": 0.8657530009746551, "step": 46060 }, { "epoch": 0.4607, "grad_norm": 14.5625, "grad_norm_var": 1.1791015625, "learning_rate": 0.0003, "loss": 11.0435, "loss/aux_loss": 0.04808122143149376, "loss/crossentropy": 2.465041011571884, "loss/logits": 0.8050199329853058, "step": 46070 }, { "epoch": 0.4608, "grad_norm": 14.8125, "grad_norm_var": 0.37135416666666665, "learning_rate": 0.0003, "loss": 11.1579, "loss/aux_loss": 0.04807000420987606, "loss/crossentropy": 2.614718121290207, "loss/logits": 0.8386980295181274, "step": 46080 }, { "epoch": 0.4609, "grad_norm": 14.8125, "grad_norm_var": 2.321728515625, "learning_rate": 0.0003, "loss": 11.1314, "loss/aux_loss": 0.0480863306671381, "loss/crossentropy": 2.7782493591308595, "loss/logits": 0.8749098181724548, "step": 46090 }, { "epoch": 0.461, "grad_norm": 13.8125, "grad_norm_var": 0.5106770833333333, "learning_rate": 0.0003, "loss": 11.1323, "loss/aux_loss": 0.048072931729257105, "loss/crossentropy": 2.8069660782814028, "loss/logits": 0.827642685174942, "step": 46100 }, { "epoch": 0.4611, "grad_norm": 13.75, "grad_norm_var": 0.5239583333333333, "learning_rate": 0.0003, "loss": 11.1249, "loss/aux_loss": 0.048074934631586075, "loss/crossentropy": 2.5645545959472655, "loss/logits": 0.8122676819562912, "step": 46110 }, { "epoch": 0.4612, "grad_norm": 14.375, "grad_norm_var": 0.8458170572916667, "learning_rate": 0.0003, "loss": 11.1389, "loss/aux_loss": 0.04807667378336191, "loss/crossentropy": 2.7756105303764342, "loss/logits": 0.8374488890171051, "step": 46120 }, { "epoch": 0.4613, "grad_norm": 14.5, "grad_norm_var": 0.643212890625, "learning_rate": 0.0003, "loss": 11.1476, "loss/aux_loss": 0.048084722831845284, "loss/crossentropy": 2.862342894077301, "loss/logits": 0.8817748308181763, "step": 46130 }, { "epoch": 0.4614, "grad_norm": 19.375, "grad_norm_var": 185.560791015625, "learning_rate": 0.0003, "loss": 11.1808, "loss/aux_loss": 0.048067561350762844, "loss/crossentropy": 2.596503585577011, "loss/logits": 0.8253944367170334, "step": 46140 }, { "epoch": 0.4615, "grad_norm": 14.5, "grad_norm_var": 185.8166015625, "learning_rate": 0.0003, "loss": 11.1384, "loss/aux_loss": 0.04807464778423309, "loss/crossentropy": 2.654829728603363, "loss/logits": 0.8233345150947571, "step": 46150 }, { "epoch": 0.4616, "grad_norm": 17.125, "grad_norm_var": 0.84609375, "learning_rate": 0.0003, "loss": 11.0069, "loss/aux_loss": 0.048072910867631435, "loss/crossentropy": 2.758739471435547, "loss/logits": 0.8712568372488022, "step": 46160 }, { "epoch": 0.4617, "grad_norm": 13.875, "grad_norm_var": 0.951025390625, "learning_rate": 0.0003, "loss": 11.0499, "loss/aux_loss": 0.0480669179931283, "loss/crossentropy": 2.609746116399765, "loss/logits": 0.8174644142389298, "step": 46170 }, { "epoch": 0.4618, "grad_norm": 16.5, "grad_norm_var": 0.5794270833333334, "learning_rate": 0.0003, "loss": 11.1406, "loss/aux_loss": 0.04809269942343235, "loss/crossentropy": 2.7125259757041933, "loss/logits": 0.829924201965332, "step": 46180 }, { "epoch": 0.4619, "grad_norm": 14.6875, "grad_norm_var": 0.4805826822916667, "learning_rate": 0.0003, "loss": 11.194, "loss/aux_loss": 0.04806003961712122, "loss/crossentropy": 2.7480635344982147, "loss/logits": 0.844669246673584, "step": 46190 }, { "epoch": 0.462, "grad_norm": 14.25, "grad_norm_var": 0.4934895833333333, "learning_rate": 0.0003, "loss": 11.2594, "loss/aux_loss": 0.0480703953653574, "loss/crossentropy": 2.7278804779052734, "loss/logits": 0.8578185975551605, "step": 46200 }, { "epoch": 0.4621, "grad_norm": 15.125, "grad_norm_var": 0.4827473958333333, "learning_rate": 0.0003, "loss": 11.1531, "loss/aux_loss": 0.04808447137475014, "loss/crossentropy": 2.583157116174698, "loss/logits": 0.8229734599590302, "step": 46210 }, { "epoch": 0.4622, "grad_norm": 14.375, "grad_norm_var": 0.181884765625, "learning_rate": 0.0003, "loss": 11.2071, "loss/aux_loss": 0.048074091970920566, "loss/crossentropy": 2.6130159497261047, "loss/logits": 0.8274956196546555, "step": 46220 }, { "epoch": 0.4623, "grad_norm": 14.875, "grad_norm_var": 0.326806640625, "learning_rate": 0.0003, "loss": 11.1961, "loss/aux_loss": 0.04808221161365509, "loss/crossentropy": 2.7028493165969847, "loss/logits": 0.8648825436830521, "step": 46230 }, { "epoch": 0.4624, "grad_norm": 13.9375, "grad_norm_var": 0.405712890625, "learning_rate": 0.0003, "loss": 11.0438, "loss/aux_loss": 0.04807335082441568, "loss/crossentropy": 2.5999584436416625, "loss/logits": 0.8028821110725403, "step": 46240 }, { "epoch": 0.4625, "grad_norm": 13.0, "grad_norm_var": 0.47263997395833335, "learning_rate": 0.0003, "loss": 11.0311, "loss/aux_loss": 0.04807595033198595, "loss/crossentropy": 2.7554810464382173, "loss/logits": 0.841679847240448, "step": 46250 }, { "epoch": 0.4626, "grad_norm": 13.9375, "grad_norm_var": 0.47858072916666666, "learning_rate": 0.0003, "loss": 11.0339, "loss/aux_loss": 0.04807290639728308, "loss/crossentropy": 2.826436698436737, "loss/logits": 0.8312930345535279, "step": 46260 }, { "epoch": 0.4627, "grad_norm": 14.875, "grad_norm_var": 0.5843587239583333, "learning_rate": 0.0003, "loss": 11.194, "loss/aux_loss": 0.04807629976421594, "loss/crossentropy": 2.6330519676208497, "loss/logits": 0.8189602941274643, "step": 46270 }, { "epoch": 0.4628, "grad_norm": 14.0, "grad_norm_var": 0.36912434895833335, "learning_rate": 0.0003, "loss": 11.1749, "loss/aux_loss": 0.048078407719731334, "loss/crossentropy": 2.704203653335571, "loss/logits": 0.8706455767154694, "step": 46280 }, { "epoch": 0.4629, "grad_norm": 14.3125, "grad_norm_var": 0.361181640625, "learning_rate": 0.0003, "loss": 11.0766, "loss/aux_loss": 0.04808397404849529, "loss/crossentropy": 2.679097306728363, "loss/logits": 0.8224625796079635, "step": 46290 }, { "epoch": 0.463, "grad_norm": 14.25, "grad_norm_var": 0.33982747395833335, "learning_rate": 0.0003, "loss": 11.0087, "loss/aux_loss": 0.04806742705404758, "loss/crossentropy": 2.6513377904891966, "loss/logits": 0.7877238169312477, "step": 46300 }, { "epoch": 0.4631, "grad_norm": 14.875, "grad_norm_var": 0.46484375, "learning_rate": 0.0003, "loss": 11.1393, "loss/aux_loss": 0.04807655327022076, "loss/crossentropy": 2.542707550525665, "loss/logits": 0.8172414094209671, "step": 46310 }, { "epoch": 0.4632, "grad_norm": 14.1875, "grad_norm_var": 0.9903483072916667, "learning_rate": 0.0003, "loss": 11.1643, "loss/aux_loss": 0.04806794133037329, "loss/crossentropy": 2.8341934442520142, "loss/logits": 0.8416117280721664, "step": 46320 }, { "epoch": 0.4633, "grad_norm": 14.25, "grad_norm_var": 0.546875, "learning_rate": 0.0003, "loss": 11.1235, "loss/aux_loss": 0.04808292984962463, "loss/crossentropy": 2.7436033606529238, "loss/logits": 0.8215555369853973, "step": 46330 }, { "epoch": 0.4634, "grad_norm": 15.125, "grad_norm_var": 0.32472330729166665, "learning_rate": 0.0003, "loss": 11.4896, "loss/aux_loss": 0.04807242415845394, "loss/crossentropy": 2.7257355570793154, "loss/logits": 0.8585422575473786, "step": 46340 }, { "epoch": 0.4635, "grad_norm": 14.1875, "grad_norm_var": 7.888004557291667, "learning_rate": 0.0003, "loss": 11.112, "loss/aux_loss": 0.0480771217495203, "loss/crossentropy": 2.5921223521232606, "loss/logits": 0.8013135939836502, "step": 46350 }, { "epoch": 0.4636, "grad_norm": 15.625, "grad_norm_var": 6.84375, "learning_rate": 0.0003, "loss": 11.2715, "loss/aux_loss": 0.04807939790189266, "loss/crossentropy": 2.74160099029541, "loss/logits": 0.8368293017148971, "step": 46360 }, { "epoch": 0.4637, "grad_norm": 14.4375, "grad_norm_var": 0.4964680989583333, "learning_rate": 0.0003, "loss": 11.2342, "loss/aux_loss": 0.04807531572878361, "loss/crossentropy": 2.690195268392563, "loss/logits": 0.8286543309688568, "step": 46370 }, { "epoch": 0.4638, "grad_norm": 13.8125, "grad_norm_var": 0.37161458333333336, "learning_rate": 0.0003, "loss": 11.1557, "loss/aux_loss": 0.048067241348326205, "loss/crossentropy": 2.800563335418701, "loss/logits": 0.8472881704568863, "step": 46380 }, { "epoch": 0.4639, "grad_norm": 14.875, "grad_norm_var": 0.4212890625, "learning_rate": 0.0003, "loss": 11.2161, "loss/aux_loss": 0.048077549785375595, "loss/crossentropy": 2.8395881056785583, "loss/logits": 0.8345950871706009, "step": 46390 }, { "epoch": 0.464, "grad_norm": 13.4375, "grad_norm_var": 0.580712890625, "learning_rate": 0.0003, "loss": 11.2054, "loss/aux_loss": 0.048074125126004216, "loss/crossentropy": 2.6557404458522798, "loss/logits": 0.8473072737455368, "step": 46400 }, { "epoch": 0.4641, "grad_norm": 14.25, "grad_norm_var": 0.5900390625, "learning_rate": 0.0003, "loss": 11.1693, "loss/aux_loss": 0.04806842841207981, "loss/crossentropy": 2.796481668949127, "loss/logits": 0.8443670809268952, "step": 46410 }, { "epoch": 0.4642, "grad_norm": 14.0625, "grad_norm_var": 0.6135416666666667, "learning_rate": 0.0003, "loss": 11.1548, "loss/aux_loss": 0.048084372840821746, "loss/crossentropy": 2.760413956642151, "loss/logits": 0.8425527215003967, "step": 46420 }, { "epoch": 0.4643, "grad_norm": 15.3125, "grad_norm_var": 0.633056640625, "learning_rate": 0.0003, "loss": 11.2473, "loss/aux_loss": 0.048075103759765626, "loss/crossentropy": 2.7352624416351317, "loss/logits": 0.8507170170545578, "step": 46430 }, { "epoch": 0.4644, "grad_norm": 14.5625, "grad_norm_var": 1.6200358072916667, "learning_rate": 0.0003, "loss": 11.2188, "loss/aux_loss": 0.048076110705733296, "loss/crossentropy": 2.759999096393585, "loss/logits": 0.8683469414710998, "step": 46440 }, { "epoch": 0.4645, "grad_norm": 14.0, "grad_norm_var": 0.44296875, "learning_rate": 0.0003, "loss": 11.1575, "loss/aux_loss": 0.04807174541056156, "loss/crossentropy": 2.7848057746887207, "loss/logits": 0.8197889029979706, "step": 46450 }, { "epoch": 0.4646, "grad_norm": 14.5, "grad_norm_var": 0.5098307291666667, "learning_rate": 0.0003, "loss": 11.0128, "loss/aux_loss": 0.04808044023811817, "loss/crossentropy": 2.6938924133777618, "loss/logits": 0.8240805625915527, "step": 46460 }, { "epoch": 0.4647, "grad_norm": 14.0, "grad_norm_var": 0.403125, "learning_rate": 0.0003, "loss": 11.2575, "loss/aux_loss": 0.04806338362395764, "loss/crossentropy": 2.746562212705612, "loss/logits": 0.8269091069698333, "step": 46470 }, { "epoch": 0.4648, "grad_norm": 13.125, "grad_norm_var": 0.37057291666666664, "learning_rate": 0.0003, "loss": 11.1455, "loss/aux_loss": 0.048089843057096, "loss/crossentropy": 2.7687614023685456, "loss/logits": 0.8314522951841354, "step": 46480 }, { "epoch": 0.4649, "grad_norm": 13.6875, "grad_norm_var": 60.507747395833334, "learning_rate": 0.0003, "loss": 11.321, "loss/aux_loss": 0.04807740245014429, "loss/crossentropy": 2.713161385059357, "loss/logits": 0.8670831322669983, "step": 46490 }, { "epoch": 0.465, "grad_norm": 15.75, "grad_norm_var": 59.106770833333336, "learning_rate": 0.0003, "loss": 11.1114, "loss/aux_loss": 0.04807022921741009, "loss/crossentropy": 2.7916195511817934, "loss/logits": 0.8448772758245469, "step": 46500 }, { "epoch": 0.4651, "grad_norm": 14.375, "grad_norm_var": 0.8102701822916667, "learning_rate": 0.0003, "loss": 11.2403, "loss/aux_loss": 0.048087266460061076, "loss/crossentropy": 2.7339360535144808, "loss/logits": 0.8473565667867661, "step": 46510 }, { "epoch": 0.4652, "grad_norm": 14.1875, "grad_norm_var": 1.5085774739583333, "learning_rate": 0.0003, "loss": 11.1092, "loss/aux_loss": 0.04807992558926344, "loss/crossentropy": 2.879719001054764, "loss/logits": 0.8366673439741135, "step": 46520 }, { "epoch": 0.4653, "grad_norm": 14.1875, "grad_norm_var": 1.065478515625, "learning_rate": 0.0003, "loss": 11.0443, "loss/aux_loss": 0.04807272665202618, "loss/crossentropy": 2.8038435697555544, "loss/logits": 0.8307929456233978, "step": 46530 }, { "epoch": 0.4654, "grad_norm": 15.3125, "grad_norm_var": 1.0056640625, "learning_rate": 0.0003, "loss": 11.02, "loss/aux_loss": 0.048068790882825854, "loss/crossentropy": 2.8201247453689575, "loss/logits": 0.8609393984079361, "step": 46540 }, { "epoch": 0.4655, "grad_norm": 15.0625, "grad_norm_var": 0.7639973958333334, "learning_rate": 0.0003, "loss": 10.9846, "loss/aux_loss": 0.04807808455079794, "loss/crossentropy": 2.641385281085968, "loss/logits": 0.8092559695243835, "step": 46550 }, { "epoch": 0.4656, "grad_norm": 14.75, "grad_norm_var": 0.5659993489583334, "learning_rate": 0.0003, "loss": 11.0114, "loss/aux_loss": 0.048075922578573224, "loss/crossentropy": 2.549112868309021, "loss/logits": 0.7909631967544556, "step": 46560 }, { "epoch": 0.4657, "grad_norm": 14.875, "grad_norm_var": 0.309375, "learning_rate": 0.0003, "loss": 11.1391, "loss/aux_loss": 0.04808427933603525, "loss/crossentropy": 2.796935510635376, "loss/logits": 0.8834913015365601, "step": 46570 }, { "epoch": 0.4658, "grad_norm": 13.4375, "grad_norm_var": 0.385400390625, "learning_rate": 0.0003, "loss": 10.9412, "loss/aux_loss": 0.04806541427969933, "loss/crossentropy": 2.649865931272507, "loss/logits": 0.8530022531747818, "step": 46580 }, { "epoch": 0.4659, "grad_norm": 13.625, "grad_norm_var": 0.34609375, "learning_rate": 0.0003, "loss": 11.3094, "loss/aux_loss": 0.04808234199881554, "loss/crossentropy": 2.7041312396526336, "loss/logits": 0.8479943692684173, "step": 46590 }, { "epoch": 0.466, "grad_norm": 14.0625, "grad_norm_var": 0.37810872395833334, "learning_rate": 0.0003, "loss": 11.2404, "loss/aux_loss": 0.04807773306965828, "loss/crossentropy": 2.6949519872665406, "loss/logits": 0.828350055217743, "step": 46600 }, { "epoch": 0.4661, "grad_norm": 14.3125, "grad_norm_var": 0.5723307291666667, "learning_rate": 0.0003, "loss": 11.1212, "loss/aux_loss": 0.048074806481599806, "loss/crossentropy": 2.8176488399505617, "loss/logits": 0.8534007757902146, "step": 46610 }, { "epoch": 0.4662, "grad_norm": 15.625, "grad_norm_var": 0.8304524739583333, "learning_rate": 0.0003, "loss": 10.9068, "loss/aux_loss": 0.04807550571858883, "loss/crossentropy": 2.683753031492233, "loss/logits": 0.8132462590932846, "step": 46620 }, { "epoch": 0.4663, "grad_norm": 14.6875, "grad_norm_var": 0.7883951822916667, "learning_rate": 0.0003, "loss": 11.1563, "loss/aux_loss": 0.04807454776018858, "loss/crossentropy": 2.742726969718933, "loss/logits": 0.8348707377910614, "step": 46630 }, { "epoch": 0.4664, "grad_norm": 14.1875, "grad_norm_var": 0.3150390625, "learning_rate": 0.0003, "loss": 11.1891, "loss/aux_loss": 0.04808441940695048, "loss/crossentropy": 2.798567849397659, "loss/logits": 0.8553645014762878, "step": 46640 }, { "epoch": 0.4665, "grad_norm": 15.0625, "grad_norm_var": 0.252978515625, "learning_rate": 0.0003, "loss": 11.0609, "loss/aux_loss": 0.04807608053088188, "loss/crossentropy": 2.6830251634120943, "loss/logits": 0.8196864813566208, "step": 46650 }, { "epoch": 0.4666, "grad_norm": 13.5625, "grad_norm_var": 0.5020833333333333, "learning_rate": 0.0003, "loss": 11.0233, "loss/aux_loss": 0.048064926825463775, "loss/crossentropy": 2.7553923606872557, "loss/logits": 0.853671881556511, "step": 46660 }, { "epoch": 0.4667, "grad_norm": 14.0625, "grad_norm_var": 0.645166015625, "learning_rate": 0.0003, "loss": 11.1551, "loss/aux_loss": 0.0480745829641819, "loss/crossentropy": 2.7849517345428465, "loss/logits": 0.8592245787382126, "step": 46670 }, { "epoch": 0.4668, "grad_norm": 13.5, "grad_norm_var": 0.5312337239583333, "learning_rate": 0.0003, "loss": 11.0834, "loss/aux_loss": 0.04807850234210491, "loss/crossentropy": 2.877470552921295, "loss/logits": 0.8332111418247223, "step": 46680 }, { "epoch": 0.4669, "grad_norm": 15.375, "grad_norm_var": 0.5258951822916667, "learning_rate": 0.0003, "loss": 11.1315, "loss/aux_loss": 0.048073893412947655, "loss/crossentropy": 2.6425564885139465, "loss/logits": 0.834293258190155, "step": 46690 }, { "epoch": 0.467, "grad_norm": 13.9375, "grad_norm_var": 0.4778645833333333, "learning_rate": 0.0003, "loss": 11.1072, "loss/aux_loss": 0.048084541223943233, "loss/crossentropy": 2.7174774527549745, "loss/logits": 0.8334024339914322, "step": 46700 }, { "epoch": 0.4671, "grad_norm": 14.5, "grad_norm_var": 0.657275390625, "learning_rate": 0.0003, "loss": 11.0016, "loss/aux_loss": 0.04806803483515978, "loss/crossentropy": 2.5842558205127717, "loss/logits": 0.8057064324617386, "step": 46710 }, { "epoch": 0.4672, "grad_norm": 13.9375, "grad_norm_var": 0.7184733072916667, "learning_rate": 0.0003, "loss": 11.0592, "loss/aux_loss": 0.04807887524366379, "loss/crossentropy": 2.8033472537994384, "loss/logits": 0.827720096707344, "step": 46720 }, { "epoch": 0.4673, "grad_norm": 14.6875, "grad_norm_var": 1.1671223958333334, "learning_rate": 0.0003, "loss": 11.0515, "loss/aux_loss": 0.04807654283940792, "loss/crossentropy": 2.6938624501228334, "loss/logits": 0.8351145356893539, "step": 46730 }, { "epoch": 0.4674, "grad_norm": 14.125, "grad_norm_var": 0.8932291666666666, "learning_rate": 0.0003, "loss": 11.1301, "loss/aux_loss": 0.0480727557092905, "loss/crossentropy": 2.7404383420944214, "loss/logits": 0.8500682055950165, "step": 46740 }, { "epoch": 0.4675, "grad_norm": 15.25, "grad_norm_var": 0.9286458333333333, "learning_rate": 0.0003, "loss": 11.1978, "loss/aux_loss": 0.04807423073798418, "loss/crossentropy": 2.6602605104446413, "loss/logits": 0.847180300951004, "step": 46750 }, { "epoch": 0.4676, "grad_norm": 14.375, "grad_norm_var": 0.8609375, "learning_rate": 0.0003, "loss": 11.2002, "loss/aux_loss": 0.048065530881285665, "loss/crossentropy": 2.704084634780884, "loss/logits": 0.8421103477478027, "step": 46760 }, { "epoch": 0.4677, "grad_norm": 14.6875, "grad_norm_var": 0.30859375, "learning_rate": 0.0003, "loss": 11.1246, "loss/aux_loss": 0.048081099055707455, "loss/crossentropy": 2.7489688992500305, "loss/logits": 0.8322398364543915, "step": 46770 }, { "epoch": 0.4678, "grad_norm": 13.5, "grad_norm_var": 0.24412434895833332, "learning_rate": 0.0003, "loss": 10.8646, "loss/aux_loss": 0.04807508103549481, "loss/crossentropy": 2.630194664001465, "loss/logits": 0.8167033612728118, "step": 46780 }, { "epoch": 0.4679, "grad_norm": 13.5625, "grad_norm_var": 0.66875, "learning_rate": 0.0003, "loss": 11.1296, "loss/aux_loss": 0.04808248318731785, "loss/crossentropy": 2.754181432723999, "loss/logits": 0.8682912677526474, "step": 46790 }, { "epoch": 0.468, "grad_norm": 13.9375, "grad_norm_var": 0.7300618489583334, "learning_rate": 0.0003, "loss": 11.1366, "loss/aux_loss": 0.04807512406259775, "loss/crossentropy": 2.807320773601532, "loss/logits": 0.8542409300804138, "step": 46800 }, { "epoch": 0.4681, "grad_norm": 13.625, "grad_norm_var": 0.6877604166666667, "learning_rate": 0.0003, "loss": 11.0965, "loss/aux_loss": 0.04807012528181076, "loss/crossentropy": 2.7169011294841767, "loss/logits": 0.8518575340509414, "step": 46810 }, { "epoch": 0.4682, "grad_norm": 15.0, "grad_norm_var": 0.5700358072916667, "learning_rate": 0.0003, "loss": 11.1799, "loss/aux_loss": 0.048083074390888214, "loss/crossentropy": 2.6793820321559907, "loss/logits": 0.8517037600278854, "step": 46820 }, { "epoch": 0.4683, "grad_norm": 15.6875, "grad_norm_var": 0.6593098958333333, "learning_rate": 0.0003, "loss": 10.9502, "loss/aux_loss": 0.04807259049266577, "loss/crossentropy": 2.71062428355217, "loss/logits": 0.8300145417451859, "step": 46830 }, { "epoch": 0.4684, "grad_norm": 13.1875, "grad_norm_var": 0.7988118489583333, "learning_rate": 0.0003, "loss": 11.1379, "loss/aux_loss": 0.04807367566972971, "loss/crossentropy": 2.77940719127655, "loss/logits": 0.8435407996177673, "step": 46840 }, { "epoch": 0.4685, "grad_norm": 14.0625, "grad_norm_var": 0.5723795572916667, "learning_rate": 0.0003, "loss": 11.0454, "loss/aux_loss": 0.048079926520586014, "loss/crossentropy": 2.5133470952510835, "loss/logits": 0.8150019586086273, "step": 46850 }, { "epoch": 0.4686, "grad_norm": 14.0, "grad_norm_var": 0.6298014322916666, "learning_rate": 0.0003, "loss": 11.1473, "loss/aux_loss": 0.04806667976081371, "loss/crossentropy": 2.8036882996559145, "loss/logits": 0.8526762962341309, "step": 46860 }, { "epoch": 0.4687, "grad_norm": 13.9375, "grad_norm_var": 0.437353515625, "learning_rate": 0.0003, "loss": 11.0997, "loss/aux_loss": 0.04806803800165653, "loss/crossentropy": 2.7410527229309083, "loss/logits": 0.8402520000934601, "step": 46870 }, { "epoch": 0.4688, "grad_norm": 15.4375, "grad_norm_var": 0.5257649739583333, "learning_rate": 0.0003, "loss": 11.2314, "loss/aux_loss": 0.0480783374980092, "loss/crossentropy": 2.5751788198947905, "loss/logits": 0.8502856940031052, "step": 46880 }, { "epoch": 0.4689, "grad_norm": 14.625, "grad_norm_var": 0.39055989583333334, "learning_rate": 0.0003, "loss": 11.1735, "loss/aux_loss": 0.048073621653020385, "loss/crossentropy": 2.7046410202980042, "loss/logits": 0.8371324121952057, "step": 46890 }, { "epoch": 0.469, "grad_norm": 15.125, "grad_norm_var": 0.6059895833333333, "learning_rate": 0.0003, "loss": 10.9607, "loss/aux_loss": 0.04807861391454935, "loss/crossentropy": 2.786002492904663, "loss/logits": 0.8466158479452133, "step": 46900 }, { "epoch": 0.4691, "grad_norm": 14.375, "grad_norm_var": 0.49920247395833334, "learning_rate": 0.0003, "loss": 11.2057, "loss/aux_loss": 0.04806822370737791, "loss/crossentropy": 2.6618904173374176, "loss/logits": 0.8399115055799484, "step": 46910 }, { "epoch": 0.4692, "grad_norm": 12.875, "grad_norm_var": 0.8572265625, "learning_rate": 0.0003, "loss": 11.0673, "loss/aux_loss": 0.048087282478809355, "loss/crossentropy": 2.707621991634369, "loss/logits": 0.8286347270011902, "step": 46920 }, { "epoch": 0.4693, "grad_norm": 14.9375, "grad_norm_var": 0.9629557291666667, "learning_rate": 0.0003, "loss": 11.0683, "loss/aux_loss": 0.048071885108947755, "loss/crossentropy": 2.7881909012794495, "loss/logits": 0.8593872129917145, "step": 46930 }, { "epoch": 0.4694, "grad_norm": 14.6875, "grad_norm_var": 0.6229166666666667, "learning_rate": 0.0003, "loss": 11.2006, "loss/aux_loss": 0.048074318841099736, "loss/crossentropy": 2.8505053877830506, "loss/logits": 0.824359530210495, "step": 46940 }, { "epoch": 0.4695, "grad_norm": 13.4375, "grad_norm_var": 0.9066243489583333, "learning_rate": 0.0003, "loss": 10.9586, "loss/aux_loss": 0.04807030726224184, "loss/crossentropy": 2.7617590546607973, "loss/logits": 0.8363817751407623, "step": 46950 }, { "epoch": 0.4696, "grad_norm": 15.0625, "grad_norm_var": 0.5494140625, "learning_rate": 0.0003, "loss": 11.2257, "loss/aux_loss": 0.048083371855318545, "loss/crossentropy": 2.7488768696784973, "loss/logits": 0.8112467706203461, "step": 46960 }, { "epoch": 0.4697, "grad_norm": 14.625, "grad_norm_var": 0.5541015625, "learning_rate": 0.0003, "loss": 11.1934, "loss/aux_loss": 0.048067718744277954, "loss/crossentropy": 2.713601952791214, "loss/logits": 0.8483431667089463, "step": 46970 }, { "epoch": 0.4698, "grad_norm": 14.375, "grad_norm_var": 1.0541666666666667, "learning_rate": 0.0003, "loss": 11.2815, "loss/aux_loss": 0.04806978404521942, "loss/crossentropy": 2.7652095437049864, "loss/logits": 0.899308231472969, "step": 46980 }, { "epoch": 0.4699, "grad_norm": 16.5, "grad_norm_var": 0.4723307291666667, "learning_rate": 0.0003, "loss": 11.1383, "loss/aux_loss": 0.04807682782411575, "loss/crossentropy": 2.7637027978897093, "loss/logits": 0.8606138914823532, "step": 46990 }, { "epoch": 0.47, "grad_norm": 13.875, "grad_norm_var": 0.6843587239583333, "learning_rate": 0.0003, "loss": 11.0771, "loss/aux_loss": 0.04807099532335997, "loss/crossentropy": 2.546469062566757, "loss/logits": 0.8381029695272446, "step": 47000 }, { "epoch": 0.4701, "grad_norm": 13.625, "grad_norm_var": 0.7808430989583334, "learning_rate": 0.0003, "loss": 11.2179, "loss/aux_loss": 0.048072867281734946, "loss/crossentropy": 2.650752639770508, "loss/logits": 0.808814725279808, "step": 47010 }, { "epoch": 0.4702, "grad_norm": 15.25, "grad_norm_var": 3.489697265625, "learning_rate": 0.0003, "loss": 11.1539, "loss/aux_loss": 0.04807664155960083, "loss/crossentropy": 2.579551470279694, "loss/logits": 0.8188108772039413, "step": 47020 }, { "epoch": 0.4703, "grad_norm": 15.375, "grad_norm_var": 3.3722493489583334, "learning_rate": 0.0003, "loss": 11.1198, "loss/aux_loss": 0.04807888753712177, "loss/crossentropy": 2.577794688940048, "loss/logits": 0.7898141339421272, "step": 47030 }, { "epoch": 0.4704, "grad_norm": 13.4375, "grad_norm_var": 0.8204264322916667, "learning_rate": 0.0003, "loss": 10.9596, "loss/aux_loss": 0.04807488694787025, "loss/crossentropy": 2.592330676317215, "loss/logits": 0.8327637195587159, "step": 47040 }, { "epoch": 0.4705, "grad_norm": 14.0625, "grad_norm_var": 0.6800618489583333, "learning_rate": 0.0003, "loss": 11.0693, "loss/aux_loss": 0.048071105033159256, "loss/crossentropy": 2.6126275599002837, "loss/logits": 0.7871117860078811, "step": 47050 }, { "epoch": 0.4706, "grad_norm": 13.625, "grad_norm_var": 0.5578125, "learning_rate": 0.0003, "loss": 11.225, "loss/aux_loss": 0.04807652160525322, "loss/crossentropy": 2.8688260078430177, "loss/logits": 0.874985545873642, "step": 47060 }, { "epoch": 0.4707, "grad_norm": 13.8125, "grad_norm_var": 0.5534993489583333, "learning_rate": 0.0003, "loss": 11.0904, "loss/aux_loss": 0.04806956704705954, "loss/crossentropy": 2.6465198278427122, "loss/logits": 0.8147805094718933, "step": 47070 }, { "epoch": 0.4708, "grad_norm": 14.375, "grad_norm_var": 0.702978515625, "learning_rate": 0.0003, "loss": 11.1015, "loss/aux_loss": 0.04807991813868284, "loss/crossentropy": 2.845957559347153, "loss/logits": 0.8246800363063812, "step": 47080 }, { "epoch": 0.4709, "grad_norm": 15.25, "grad_norm_var": 1.0942057291666667, "learning_rate": 0.0003, "loss": 10.9138, "loss/aux_loss": 0.04807828050106764, "loss/crossentropy": 2.6598312139511107, "loss/logits": 0.7844241559505463, "step": 47090 }, { "epoch": 0.471, "grad_norm": 15.0, "grad_norm_var": 4.145833333333333, "learning_rate": 0.0003, "loss": 10.998, "loss/aux_loss": 0.0480682672932744, "loss/crossentropy": 2.7192655980587004, "loss/logits": 0.8166234135627747, "step": 47100 }, { "epoch": 0.4711, "grad_norm": 14.5, "grad_norm_var": 3.998421223958333, "learning_rate": 0.0003, "loss": 11.1524, "loss/aux_loss": 0.04806790165603161, "loss/crossentropy": 2.8544438123703, "loss/logits": 0.8227733701467514, "step": 47110 }, { "epoch": 0.4712, "grad_norm": 20.375, "grad_norm_var": 2.564957682291667, "learning_rate": 0.0003, "loss": 10.9704, "loss/aux_loss": 0.04809897541999817, "loss/crossentropy": 2.6712704062461854, "loss/logits": 0.8468765825033188, "step": 47120 }, { "epoch": 0.4713, "grad_norm": 14.5, "grad_norm_var": 2.6884765625, "learning_rate": 0.0003, "loss": 11.1002, "loss/aux_loss": 0.04806103594601154, "loss/crossentropy": 2.654213637113571, "loss/logits": 0.8358179897069931, "step": 47130 }, { "epoch": 0.4714, "grad_norm": 14.0625, "grad_norm_var": 0.2447265625, "learning_rate": 0.0003, "loss": 11.2835, "loss/aux_loss": 0.048075387999415395, "loss/crossentropy": 2.828604817390442, "loss/logits": 0.8241938591003418, "step": 47140 }, { "epoch": 0.4715, "grad_norm": 16.125, "grad_norm_var": 1.079541015625, "learning_rate": 0.0003, "loss": 11.092, "loss/aux_loss": 0.04808494281023741, "loss/crossentropy": 2.693093776702881, "loss/logits": 0.8137997329235077, "step": 47150 }, { "epoch": 0.4716, "grad_norm": 14.875, "grad_norm_var": 0.7993326822916667, "learning_rate": 0.0003, "loss": 11.0348, "loss/aux_loss": 0.04807308670133352, "loss/crossentropy": 2.648904633522034, "loss/logits": 0.8604849994182586, "step": 47160 }, { "epoch": 0.4717, "grad_norm": 14.3125, "grad_norm_var": 0.6218098958333333, "learning_rate": 0.0003, "loss": 11.0803, "loss/aux_loss": 0.04807374849915504, "loss/crossentropy": 2.6493657648563387, "loss/logits": 0.8324615597724915, "step": 47170 }, { "epoch": 0.4718, "grad_norm": 15.75, "grad_norm_var": 2.496468098958333, "learning_rate": 0.0003, "loss": 10.9996, "loss/aux_loss": 0.048081782087683676, "loss/crossentropy": 2.64060292840004, "loss/logits": 0.8085512012243271, "step": 47180 }, { "epoch": 0.4719, "grad_norm": 14.25, "grad_norm_var": 2.939957682291667, "learning_rate": 0.0003, "loss": 11.111, "loss/aux_loss": 0.04806481916457415, "loss/crossentropy": 2.6764299035072328, "loss/logits": 0.8713664382696151, "step": 47190 }, { "epoch": 0.472, "grad_norm": 14.9375, "grad_norm_var": 0.23709309895833333, "learning_rate": 0.0003, "loss": 10.9265, "loss/aux_loss": 0.048081744089722635, "loss/crossentropy": 2.605439066886902, "loss/logits": 0.8055230677127838, "step": 47200 }, { "epoch": 0.4721, "grad_norm": 15.125, "grad_norm_var": 0.211572265625, "learning_rate": 0.0003, "loss": 11.0814, "loss/aux_loss": 0.048065853863954545, "loss/crossentropy": 2.8016534447669983, "loss/logits": 0.821602874994278, "step": 47210 }, { "epoch": 0.4722, "grad_norm": 14.5625, "grad_norm_var": 1.84140625, "learning_rate": 0.0003, "loss": 11.0479, "loss/aux_loss": 0.048074455559253694, "loss/crossentropy": 2.6670961678028107, "loss/logits": 0.8343064039945602, "step": 47220 }, { "epoch": 0.4723, "grad_norm": 14.5625, "grad_norm_var": 1.4374837239583333, "learning_rate": 0.0003, "loss": 11.1253, "loss/aux_loss": 0.0480674734339118, "loss/crossentropy": 2.7862078309059144, "loss/logits": 0.8591863840818406, "step": 47230 }, { "epoch": 0.4724, "grad_norm": 14.3125, "grad_norm_var": 0.7679524739583333, "learning_rate": 0.0003, "loss": 11.1595, "loss/aux_loss": 0.048076837323606014, "loss/crossentropy": 2.742043745517731, "loss/logits": 0.8124003469944, "step": 47240 }, { "epoch": 0.4725, "grad_norm": 14.5, "grad_norm_var": 0.8222493489583333, "learning_rate": 0.0003, "loss": 11.1575, "loss/aux_loss": 0.04806795790791511, "loss/crossentropy": 2.688308924436569, "loss/logits": 0.8328756958246231, "step": 47250 }, { "epoch": 0.4726, "grad_norm": 13.9375, "grad_norm_var": 0.24947916666666667, "learning_rate": 0.0003, "loss": 11.1297, "loss/aux_loss": 0.04808314982801676, "loss/crossentropy": 2.64586056470871, "loss/logits": 0.8115894719958305, "step": 47260 }, { "epoch": 0.4727, "grad_norm": 14.875, "grad_norm_var": 0.315869140625, "learning_rate": 0.0003, "loss": 11.1337, "loss/aux_loss": 0.048072290048003195, "loss/crossentropy": 2.638070636987686, "loss/logits": 0.8229748249053955, "step": 47270 }, { "epoch": 0.4728, "grad_norm": 15.0, "grad_norm_var": 0.5014973958333333, "learning_rate": 0.0003, "loss": 11.2275, "loss/aux_loss": 0.04807377941906452, "loss/crossentropy": 2.794324481487274, "loss/logits": 0.8639295071363449, "step": 47280 }, { "epoch": 0.4729, "grad_norm": 14.625, "grad_norm_var": 0.2816243489583333, "learning_rate": 0.0003, "loss": 11.087, "loss/aux_loss": 0.04807295482605696, "loss/crossentropy": 2.8186910152435303, "loss/logits": 0.8727422952651978, "step": 47290 }, { "epoch": 0.473, "grad_norm": 15.3125, "grad_norm_var": 0.7311848958333333, "learning_rate": 0.0003, "loss": 11.0382, "loss/aux_loss": 0.04807542134076357, "loss/crossentropy": 2.7608199238777162, "loss/logits": 0.8439187675714492, "step": 47300 }, { "epoch": 0.4731, "grad_norm": 13.125, "grad_norm_var": 0.4222493489583333, "learning_rate": 0.0003, "loss": 11.0159, "loss/aux_loss": 0.04807105585932732, "loss/crossentropy": 2.8253512501716616, "loss/logits": 0.8332709580659866, "step": 47310 }, { "epoch": 0.4732, "grad_norm": 14.9375, "grad_norm_var": 0.3082682291666667, "learning_rate": 0.0003, "loss": 11.2657, "loss/aux_loss": 0.04807895310223102, "loss/crossentropy": 2.7635598182678223, "loss/logits": 0.862557715177536, "step": 47320 }, { "epoch": 0.4733, "grad_norm": 13.875, "grad_norm_var": 0.43214518229166665, "learning_rate": 0.0003, "loss": 11.1253, "loss/aux_loss": 0.04807691089808941, "loss/crossentropy": 2.7444288194179536, "loss/logits": 0.8490573525428772, "step": 47330 }, { "epoch": 0.4734, "grad_norm": 14.875, "grad_norm_var": 0.39034830729166664, "learning_rate": 0.0003, "loss": 11.0076, "loss/aux_loss": 0.04806413035839796, "loss/crossentropy": 2.6496753454208375, "loss/logits": 0.8270862758159637, "step": 47340 }, { "epoch": 0.4735, "grad_norm": 15.125, "grad_norm_var": 0.543994140625, "learning_rate": 0.0003, "loss": 11.0769, "loss/aux_loss": 0.04807628635317087, "loss/crossentropy": 2.6999243259429933, "loss/logits": 0.8481123268604278, "step": 47350 }, { "epoch": 0.4736, "grad_norm": 13.75, "grad_norm_var": 0.37838541666666664, "learning_rate": 0.0003, "loss": 11.0257, "loss/aux_loss": 0.04807580206543207, "loss/crossentropy": 2.6780034720897676, "loss/logits": 0.7950571686029434, "step": 47360 }, { "epoch": 0.4737, "grad_norm": 13.3125, "grad_norm_var": 0.3567708333333333, "learning_rate": 0.0003, "loss": 11.0902, "loss/aux_loss": 0.048067801631987095, "loss/crossentropy": 2.508834218978882, "loss/logits": 0.8000911891460418, "step": 47370 }, { "epoch": 0.4738, "grad_norm": 13.75, "grad_norm_var": 1.1479166666666667, "learning_rate": 0.0003, "loss": 11.1167, "loss/aux_loss": 0.04807884600013494, "loss/crossentropy": 2.683100324869156, "loss/logits": 0.8076074302196503, "step": 47380 }, { "epoch": 0.4739, "grad_norm": 14.375, "grad_norm_var": 0.7238118489583333, "learning_rate": 0.0003, "loss": 11.2828, "loss/aux_loss": 0.048065136186778545, "loss/crossentropy": 2.7504623413085936, "loss/logits": 0.8754628151655197, "step": 47390 }, { "epoch": 0.474, "grad_norm": 14.375, "grad_norm_var": 3.4449055989583335, "learning_rate": 0.0003, "loss": 11.2768, "loss/aux_loss": 0.048078180849552156, "loss/crossentropy": 2.7185048401355743, "loss/logits": 0.8097441285848618, "step": 47400 }, { "epoch": 0.4741, "grad_norm": 15.0, "grad_norm_var": 3.372509765625, "learning_rate": 0.0003, "loss": 11.0728, "loss/aux_loss": 0.04806989543139935, "loss/crossentropy": 2.5462702572345735, "loss/logits": 0.8237005978822708, "step": 47410 }, { "epoch": 0.4742, "grad_norm": 14.875, "grad_norm_var": 0.3337890625, "learning_rate": 0.0003, "loss": 11.1406, "loss/aux_loss": 0.04807539358735084, "loss/crossentropy": 2.801107907295227, "loss/logits": 0.8503676056861877, "step": 47420 }, { "epoch": 0.4743, "grad_norm": 14.0, "grad_norm_var": 1.088916015625, "learning_rate": 0.0003, "loss": 11.3099, "loss/aux_loss": 0.04808101002126932, "loss/crossentropy": 2.658205211162567, "loss/logits": 0.8350113093852997, "step": 47430 }, { "epoch": 0.4744, "grad_norm": 13.1875, "grad_norm_var": 1.206103515625, "learning_rate": 0.0003, "loss": 10.9774, "loss/aux_loss": 0.048071620799601075, "loss/crossentropy": 2.6002185225486754, "loss/logits": 0.8004775673151017, "step": 47440 }, { "epoch": 0.4745, "grad_norm": 15.375, "grad_norm_var": 0.4525390625, "learning_rate": 0.0003, "loss": 11.0775, "loss/aux_loss": 0.04807077720761299, "loss/crossentropy": 2.785941880941391, "loss/logits": 0.8646159768104553, "step": 47450 }, { "epoch": 0.4746, "grad_norm": 15.0625, "grad_norm_var": 0.427197265625, "learning_rate": 0.0003, "loss": 11.056, "loss/aux_loss": 0.048071554861962795, "loss/crossentropy": 2.7326180696487428, "loss/logits": 0.8212353408336639, "step": 47460 }, { "epoch": 0.4747, "grad_norm": 14.5, "grad_norm_var": 0.28567708333333336, "learning_rate": 0.0003, "loss": 11.1585, "loss/aux_loss": 0.04807238578796387, "loss/crossentropy": 2.755151998996735, "loss/logits": 0.8459553897380829, "step": 47470 }, { "epoch": 0.4748, "grad_norm": 14.5, "grad_norm_var": 0.6296223958333333, "learning_rate": 0.0003, "loss": 11.1363, "loss/aux_loss": 0.04808005690574646, "loss/crossentropy": 2.7062522768974304, "loss/logits": 0.823663991689682, "step": 47480 }, { "epoch": 0.4749, "grad_norm": 12.8125, "grad_norm_var": 1.0574055989583333, "learning_rate": 0.0003, "loss": 11.078, "loss/aux_loss": 0.04807120095938444, "loss/crossentropy": 2.7074629366397858, "loss/logits": 0.8275787591934204, "step": 47490 }, { "epoch": 0.475, "grad_norm": 14.0, "grad_norm_var": 0.7460774739583333, "learning_rate": 0.0003, "loss": 11.2052, "loss/aux_loss": 0.04807096980512142, "loss/crossentropy": 2.7104438126087187, "loss/logits": 0.8310169786214828, "step": 47500 }, { "epoch": 0.4751, "grad_norm": 14.625, "grad_norm_var": 0.5322265625, "learning_rate": 0.0003, "loss": 11.1362, "loss/aux_loss": 0.04808025192469358, "loss/crossentropy": 2.709762120246887, "loss/logits": 0.8251390606164932, "step": 47510 }, { "epoch": 0.4752, "grad_norm": 14.6875, "grad_norm_var": 0.620166015625, "learning_rate": 0.0003, "loss": 11.1076, "loss/aux_loss": 0.04807448033243418, "loss/crossentropy": 2.773437148332596, "loss/logits": 0.8604608118534088, "step": 47520 }, { "epoch": 0.4753, "grad_norm": 15.3125, "grad_norm_var": 0.2972493489583333, "learning_rate": 0.0003, "loss": 11.2152, "loss/aux_loss": 0.048080704919993875, "loss/crossentropy": 2.793582892417908, "loss/logits": 0.8527662813663482, "step": 47530 }, { "epoch": 0.4754, "grad_norm": 17.5, "grad_norm_var": 0.8360514322916667, "learning_rate": 0.0003, "loss": 11.1323, "loss/aux_loss": 0.04807116650044918, "loss/crossentropy": 2.6637362360954286, "loss/logits": 0.8446590304374695, "step": 47540 }, { "epoch": 0.4755, "grad_norm": 13.9375, "grad_norm_var": 0.8988118489583333, "learning_rate": 0.0003, "loss": 11.0892, "loss/aux_loss": 0.04808674175292253, "loss/crossentropy": 2.6643483340740204, "loss/logits": 0.8172670543193817, "step": 47550 }, { "epoch": 0.4756, "grad_norm": 15.0625, "grad_norm_var": 0.41534830729166666, "learning_rate": 0.0003, "loss": 11.0397, "loss/aux_loss": 0.0480627054348588, "loss/crossentropy": 2.537232494354248, "loss/logits": 0.8236280262470246, "step": 47560 }, { "epoch": 0.4757, "grad_norm": 13.25, "grad_norm_var": 0.5806640625, "learning_rate": 0.0003, "loss": 11.1531, "loss/aux_loss": 0.04807612933218479, "loss/crossentropy": 2.6730732560157775, "loss/logits": 0.8434269517660141, "step": 47570 }, { "epoch": 0.4758, "grad_norm": 13.625, "grad_norm_var": 0.6679524739583333, "learning_rate": 0.0003, "loss": 11.4003, "loss/aux_loss": 0.04807697795331478, "loss/crossentropy": 2.80619136095047, "loss/logits": 0.8598318874835968, "step": 47580 }, { "epoch": 0.4759, "grad_norm": 15.75, "grad_norm_var": 0.7728515625, "learning_rate": 0.0003, "loss": 11.1052, "loss/aux_loss": 0.04807619452476501, "loss/crossentropy": 2.746232843399048, "loss/logits": 0.8442793190479279, "step": 47590 }, { "epoch": 0.476, "grad_norm": 15.3125, "grad_norm_var": 0.46920572916666664, "learning_rate": 0.0003, "loss": 10.9343, "loss/aux_loss": 0.04806558098644018, "loss/crossentropy": 2.794121563434601, "loss/logits": 0.8108414888381958, "step": 47600 }, { "epoch": 0.4761, "grad_norm": 13.75, "grad_norm_var": 0.22962239583333333, "learning_rate": 0.0003, "loss": 11.1234, "loss/aux_loss": 0.048085267655551434, "loss/crossentropy": 2.7552334010601043, "loss/logits": 0.8350756138563156, "step": 47610 }, { "epoch": 0.4762, "grad_norm": 14.6875, "grad_norm_var": 0.37180989583333335, "learning_rate": 0.0003, "loss": 11.0146, "loss/aux_loss": 0.048061388358473775, "loss/crossentropy": 2.721570539474487, "loss/logits": 0.8332007586956024, "step": 47620 }, { "epoch": 0.4763, "grad_norm": 28.0, "grad_norm_var": 11.769010416666667, "learning_rate": 0.0003, "loss": 11.1894, "loss/aux_loss": 0.048081564158201216, "loss/crossentropy": 2.689694482088089, "loss/logits": 0.8350923985242844, "step": 47630 }, { "epoch": 0.4764, "grad_norm": 14.5625, "grad_norm_var": 15.7728515625, "learning_rate": 0.0003, "loss": 11.178, "loss/aux_loss": 0.048070958070456984, "loss/crossentropy": 2.8211479425430297, "loss/logits": 0.8629825711250305, "step": 47640 }, { "epoch": 0.4765, "grad_norm": 14.25, "grad_norm_var": 1.2567057291666666, "learning_rate": 0.0003, "loss": 11.0979, "loss/aux_loss": 0.04807667341083288, "loss/crossentropy": 2.6792636036872866, "loss/logits": 0.8190094619989395, "step": 47650 }, { "epoch": 0.4766, "grad_norm": 15.0, "grad_norm_var": 0.68828125, "learning_rate": 0.0003, "loss": 11.2105, "loss/aux_loss": 0.04806965459138155, "loss/crossentropy": 2.6743164896965026, "loss/logits": 0.8335071861743927, "step": 47660 }, { "epoch": 0.4767, "grad_norm": 15.9375, "grad_norm_var": 3.9567057291666665, "learning_rate": 0.0003, "loss": 11.1965, "loss/aux_loss": 0.04807929284870625, "loss/crossentropy": 2.7867653131484986, "loss/logits": 0.8356254577636719, "step": 47670 }, { "epoch": 0.4768, "grad_norm": 15.875, "grad_norm_var": 0.7306640625, "learning_rate": 0.0003, "loss": 11.3107, "loss/aux_loss": 0.04806722085922956, "loss/crossentropy": 2.628416657447815, "loss/logits": 0.827005535364151, "step": 47680 }, { "epoch": 0.4769, "grad_norm": 14.9375, "grad_norm_var": 2.5714680989583334, "learning_rate": 0.0003, "loss": 11.065, "loss/aux_loss": 0.048084712401032445, "loss/crossentropy": 2.7183729648590087, "loss/logits": 0.798021674156189, "step": 47690 }, { "epoch": 0.477, "grad_norm": 13.25, "grad_norm_var": 2.953108723958333, "learning_rate": 0.0003, "loss": 11.115, "loss/aux_loss": 0.04808032158762217, "loss/crossentropy": 2.842688000202179, "loss/logits": 0.8267738074064255, "step": 47700 }, { "epoch": 0.4771, "grad_norm": 15.0625, "grad_norm_var": 0.9541015625, "learning_rate": 0.0003, "loss": 11.1761, "loss/aux_loss": 0.04806661438196898, "loss/crossentropy": 2.811937117576599, "loss/logits": 0.8213659793138504, "step": 47710 }, { "epoch": 0.4772, "grad_norm": 15.3125, "grad_norm_var": 0.8820149739583333, "learning_rate": 0.0003, "loss": 11.1166, "loss/aux_loss": 0.048074382916092874, "loss/crossentropy": 2.6995534360408784, "loss/logits": 0.8364298850297928, "step": 47720 }, { "epoch": 0.4773, "grad_norm": 17.625, "grad_norm_var": 51.601497395833334, "learning_rate": 0.0003, "loss": 11.2294, "loss/aux_loss": 0.04806399717926979, "loss/crossentropy": 2.6920024275779726, "loss/logits": 0.8450191617012024, "step": 47730 }, { "epoch": 0.4774, "grad_norm": 13.8125, "grad_norm_var": 51.483317057291664, "learning_rate": 0.0003, "loss": 11.047, "loss/aux_loss": 0.04807861316949129, "loss/crossentropy": 2.694988691806793, "loss/logits": 0.8258244037628174, "step": 47740 }, { "epoch": 0.4775, "grad_norm": 14.8125, "grad_norm_var": 0.6005045572916666, "learning_rate": 0.0003, "loss": 11.1232, "loss/aux_loss": 0.048065470159053804, "loss/crossentropy": 2.887584125995636, "loss/logits": 0.8933877527713776, "step": 47750 }, { "epoch": 0.4776, "grad_norm": 16.75, "grad_norm_var": 0.788525390625, "learning_rate": 0.0003, "loss": 10.8924, "loss/aux_loss": 0.04808261953294277, "loss/crossentropy": 2.6200126349925994, "loss/logits": 0.8102252304553985, "step": 47760 }, { "epoch": 0.4777, "grad_norm": 14.0625, "grad_norm_var": 0.9286458333333333, "learning_rate": 0.0003, "loss": 10.9407, "loss/aux_loss": 0.04806542750447988, "loss/crossentropy": 2.6927346110343935, "loss/logits": 0.8065591782331467, "step": 47770 }, { "epoch": 0.4778, "grad_norm": 16.625, "grad_norm_var": 0.6200520833333333, "learning_rate": 0.0003, "loss": 10.9372, "loss/aux_loss": 0.0480722114443779, "loss/crossentropy": 2.7056717574596405, "loss/logits": 0.825093024969101, "step": 47780 }, { "epoch": 0.4779, "grad_norm": 14.75, "grad_norm_var": 0.5973795572916667, "learning_rate": 0.0003, "loss": 11.1361, "loss/aux_loss": 0.04807592462748289, "loss/crossentropy": 2.7489038705825806, "loss/logits": 0.8381727159023284, "step": 47790 }, { "epoch": 0.478, "grad_norm": 14.1875, "grad_norm_var": 0.3947265625, "learning_rate": 0.0003, "loss": 11.0163, "loss/aux_loss": 0.04807247947901487, "loss/crossentropy": 2.7729135751724243, "loss/logits": 0.8364667683839798, "step": 47800 }, { "epoch": 0.4781, "grad_norm": 14.8125, "grad_norm_var": 0.43826497395833336, "learning_rate": 0.0003, "loss": 11.1055, "loss/aux_loss": 0.04807865787297487, "loss/crossentropy": 2.699937582015991, "loss/logits": 0.8402740955352783, "step": 47810 }, { "epoch": 0.4782, "grad_norm": 14.8125, "grad_norm_var": 0.33489583333333334, "learning_rate": 0.0003, "loss": 11.0608, "loss/aux_loss": 0.04806937780231237, "loss/crossentropy": 2.7677336633205414, "loss/logits": 0.8421557247638702, "step": 47820 }, { "epoch": 0.4783, "grad_norm": 14.5625, "grad_norm_var": 0.29373372395833336, "learning_rate": 0.0003, "loss": 11.1592, "loss/aux_loss": 0.04806809443980455, "loss/crossentropy": 2.534024041891098, "loss/logits": 0.7998458266258239, "step": 47830 }, { "epoch": 0.4784, "grad_norm": 14.9375, "grad_norm_var": 0.7176432291666667, "learning_rate": 0.0003, "loss": 11.1525, "loss/aux_loss": 0.04807998463511467, "loss/crossentropy": 2.6285907328128815, "loss/logits": 0.8426926136016846, "step": 47840 }, { "epoch": 0.4785, "grad_norm": 14.3125, "grad_norm_var": 0.3020833333333333, "learning_rate": 0.0003, "loss": 11.071, "loss/aux_loss": 0.04807542841881514, "loss/crossentropy": 2.613954132795334, "loss/logits": 0.8264847338199616, "step": 47850 }, { "epoch": 0.4786, "grad_norm": 14.25, "grad_norm_var": 0.42849934895833336, "learning_rate": 0.0003, "loss": 10.9981, "loss/aux_loss": 0.04806617666035891, "loss/crossentropy": 2.7965017437934874, "loss/logits": 0.8339490979909897, "step": 47860 }, { "epoch": 0.4787, "grad_norm": 14.4375, "grad_norm_var": 0.98203125, "learning_rate": 0.0003, "loss": 11.0962, "loss/aux_loss": 0.048079381324350835, "loss/crossentropy": 2.7282164812088014, "loss/logits": 0.8202213078737259, "step": 47870 }, { "epoch": 0.4788, "grad_norm": 15.375, "grad_norm_var": 0.6374348958333333, "learning_rate": 0.0003, "loss": 11.1136, "loss/aux_loss": 0.048078177496790886, "loss/crossentropy": 2.6539010763168336, "loss/logits": 0.812344890832901, "step": 47880 }, { "epoch": 0.4789, "grad_norm": 15.125, "grad_norm_var": 1.1622233072916666, "learning_rate": 0.0003, "loss": 10.9553, "loss/aux_loss": 0.048067126609385016, "loss/crossentropy": 2.7369919776916505, "loss/logits": 0.8319634586572647, "step": 47890 }, { "epoch": 0.479, "grad_norm": 15.6875, "grad_norm_var": 7.665559895833334, "learning_rate": 0.0003, "loss": 11.136, "loss/aux_loss": 0.048075918667018415, "loss/crossentropy": 2.895642626285553, "loss/logits": 0.8190632820129394, "step": 47900 }, { "epoch": 0.4791, "grad_norm": 15.0, "grad_norm_var": 7.870768229166667, "learning_rate": 0.0003, "loss": 11.0985, "loss/aux_loss": 0.048073847964406016, "loss/crossentropy": 2.734054809808731, "loss/logits": 0.8439525783061981, "step": 47910 }, { "epoch": 0.4792, "grad_norm": 14.8125, "grad_norm_var": 0.39088541666666665, "learning_rate": 0.0003, "loss": 11.0615, "loss/aux_loss": 0.048062325455248356, "loss/crossentropy": 2.5045742869377134, "loss/logits": 0.8234647005796433, "step": 47920 }, { "epoch": 0.4793, "grad_norm": 14.9375, "grad_norm_var": 0.32198893229166664, "learning_rate": 0.0003, "loss": 11.0936, "loss/aux_loss": 0.048081851191818716, "loss/crossentropy": 2.55519557595253, "loss/logits": 0.8111729115247727, "step": 47930 }, { "epoch": 0.4794, "grad_norm": 14.1875, "grad_norm_var": 0.4494140625, "learning_rate": 0.0003, "loss": 11.0921, "loss/aux_loss": 0.04807233922183514, "loss/crossentropy": 2.8699767351150514, "loss/logits": 0.8300641059875489, "step": 47940 }, { "epoch": 0.4795, "grad_norm": 14.0, "grad_norm_var": 1.0027180989583333, "learning_rate": 0.0003, "loss": 11.0988, "loss/aux_loss": 0.048071389086544514, "loss/crossentropy": 2.839382266998291, "loss/logits": 0.8115487396717072, "step": 47950 }, { "epoch": 0.4796, "grad_norm": 13.4375, "grad_norm_var": 0.6723795572916667, "learning_rate": 0.0003, "loss": 11.0085, "loss/aux_loss": 0.048065657168626784, "loss/crossentropy": 2.7501906633377073, "loss/logits": 0.8143287628889084, "step": 47960 }, { "epoch": 0.4797, "grad_norm": 16.25, "grad_norm_var": 0.8417805989583333, "learning_rate": 0.0003, "loss": 11.0563, "loss/aux_loss": 0.048076402582228187, "loss/crossentropy": 2.676505321264267, "loss/logits": 0.8360762178897858, "step": 47970 }, { "epoch": 0.4798, "grad_norm": 14.6875, "grad_norm_var": 0.9072265625, "learning_rate": 0.0003, "loss": 11.1315, "loss/aux_loss": 0.04806589502841234, "loss/crossentropy": 2.6232878804206847, "loss/logits": 0.8199890315532684, "step": 47980 }, { "epoch": 0.4799, "grad_norm": 13.5625, "grad_norm_var": 0.6792805989583334, "learning_rate": 0.0003, "loss": 11.1561, "loss/aux_loss": 0.048061208054423335, "loss/crossentropy": 2.7454289555549622, "loss/logits": 0.8468107730150223, "step": 47990 }, { "epoch": 0.48, "grad_norm": 14.3125, "grad_norm_var": 0.32447916666666665, "learning_rate": 0.0003, "loss": 11.0602, "loss/aux_loss": 0.0480829494073987, "loss/crossentropy": 2.7469854950904846, "loss/logits": 0.8431436151266098, "step": 48000 }, { "epoch": 0.4801, "grad_norm": 14.75, "grad_norm_var": 0.6816243489583333, "learning_rate": 0.0003, "loss": 11.2187, "loss/aux_loss": 0.04806633796542883, "loss/crossentropy": 2.800862890481949, "loss/logits": 0.8245023936033249, "step": 48010 }, { "epoch": 0.4802, "grad_norm": 14.9375, "grad_norm_var": 0.6129557291666666, "learning_rate": 0.0003, "loss": 11.0638, "loss/aux_loss": 0.04809022005647421, "loss/crossentropy": 2.6285562753677367, "loss/logits": 0.8116117030382156, "step": 48020 }, { "epoch": 0.4803, "grad_norm": 14.6875, "grad_norm_var": 6.8197265625, "learning_rate": 0.0003, "loss": 11.0978, "loss/aux_loss": 0.048075268231332305, "loss/crossentropy": 2.732567811012268, "loss/logits": 0.844546177983284, "step": 48030 }, { "epoch": 0.4804, "grad_norm": 15.25, "grad_norm_var": 6.534879557291666, "learning_rate": 0.0003, "loss": 11.0056, "loss/aux_loss": 0.048071599751710894, "loss/crossentropy": 2.695717829465866, "loss/logits": 0.7987852036952973, "step": 48040 }, { "epoch": 0.4805, "grad_norm": 14.8125, "grad_norm_var": 0.43776041666666665, "learning_rate": 0.0003, "loss": 11.0473, "loss/aux_loss": 0.04807654451578856, "loss/crossentropy": 2.877095127105713, "loss/logits": 0.8336584985256195, "step": 48050 }, { "epoch": 0.4806, "grad_norm": 14.75, "grad_norm_var": 0.758447265625, "learning_rate": 0.0003, "loss": 11.0729, "loss/aux_loss": 0.04807484894990921, "loss/crossentropy": 2.8576854825019837, "loss/logits": 0.8484826743602752, "step": 48060 }, { "epoch": 0.4807, "grad_norm": 14.5625, "grad_norm_var": 0.6466145833333333, "learning_rate": 0.0003, "loss": 10.9455, "loss/aux_loss": 0.04806608278304338, "loss/crossentropy": 2.7044803380966185, "loss/logits": 0.8198621302843094, "step": 48070 }, { "epoch": 0.4808, "grad_norm": 14.0625, "grad_norm_var": 0.234228515625, "learning_rate": 0.0003, "loss": 11.2132, "loss/aux_loss": 0.048075415566563605, "loss/crossentropy": 2.8697200059890746, "loss/logits": 0.8511811017990112, "step": 48080 }, { "epoch": 0.4809, "grad_norm": 15.3125, "grad_norm_var": 0.3804524739583333, "learning_rate": 0.0003, "loss": 11.1382, "loss/aux_loss": 0.04807231742888689, "loss/crossentropy": 2.600096642971039, "loss/logits": 0.8244834512472152, "step": 48090 }, { "epoch": 0.481, "grad_norm": 16.5, "grad_norm_var": 0.5700520833333333, "learning_rate": 0.0003, "loss": 10.9186, "loss/aux_loss": 0.04807635005563497, "loss/crossentropy": 2.604368954896927, "loss/logits": 0.8412629932165145, "step": 48100 }, { "epoch": 0.4811, "grad_norm": 14.0625, "grad_norm_var": 0.5356770833333333, "learning_rate": 0.0003, "loss": 11.1471, "loss/aux_loss": 0.04806556645780802, "loss/crossentropy": 2.717805975675583, "loss/logits": 0.829289898276329, "step": 48110 }, { "epoch": 0.4812, "grad_norm": 13.875, "grad_norm_var": 0.698291015625, "learning_rate": 0.0003, "loss": 11.1148, "loss/aux_loss": 0.04808323364704847, "loss/crossentropy": 2.707306903600693, "loss/logits": 0.8816626042127609, "step": 48120 }, { "epoch": 0.4813, "grad_norm": 14.8125, "grad_norm_var": 0.7191243489583333, "learning_rate": 0.0003, "loss": 11.1623, "loss/aux_loss": 0.04807705953717232, "loss/crossentropy": 2.7037087202072145, "loss/logits": 0.8497424215078354, "step": 48130 }, { "epoch": 0.4814, "grad_norm": 13.8125, "grad_norm_var": 0.7626139322916666, "learning_rate": 0.0003, "loss": 11.158, "loss/aux_loss": 0.04806127455085516, "loss/crossentropy": 2.565345358848572, "loss/logits": 0.8115527182817459, "step": 48140 }, { "epoch": 0.4815, "grad_norm": 18.0, "grad_norm_var": 63.133447265625, "learning_rate": 0.0003, "loss": 11.2609, "loss/aux_loss": 0.04808726757764816, "loss/crossentropy": 2.6367207527160645, "loss/logits": 0.831730630993843, "step": 48150 }, { "epoch": 0.4816, "grad_norm": 15.875, "grad_norm_var": 177.33839518229166, "learning_rate": 0.0003, "loss": 11.1555, "loss/aux_loss": 0.04808530658483505, "loss/crossentropy": 2.6289633989334105, "loss/logits": 0.846698772907257, "step": 48160 }, { "epoch": 0.4817, "grad_norm": 14.125, "grad_norm_var": 135.46302083333333, "learning_rate": 0.0003, "loss": 10.9952, "loss/aux_loss": 0.0480637326836586, "loss/crossentropy": 2.6193889021873473, "loss/logits": 0.8518955647945404, "step": 48170 }, { "epoch": 0.4818, "grad_norm": 14.6875, "grad_norm_var": 0.23631184895833332, "learning_rate": 0.0003, "loss": 10.9472, "loss/aux_loss": 0.04807006679475308, "loss/crossentropy": 2.5161637544631956, "loss/logits": 0.7855435490608216, "step": 48180 }, { "epoch": 0.4819, "grad_norm": 15.75, "grad_norm_var": 1.4041015625, "learning_rate": 0.0003, "loss": 11.1728, "loss/aux_loss": 0.04807739406824112, "loss/crossentropy": 2.578041511774063, "loss/logits": 0.8267721891403198, "step": 48190 }, { "epoch": 0.482, "grad_norm": 14.75, "grad_norm_var": 1.2307291666666667, "learning_rate": 0.0003, "loss": 11.1619, "loss/aux_loss": 0.04807015117257833, "loss/crossentropy": 2.851909363269806, "loss/logits": 0.862451794743538, "step": 48200 }, { "epoch": 0.4821, "grad_norm": 15.625, "grad_norm_var": 0.6079264322916667, "learning_rate": 0.0003, "loss": 11.1798, "loss/aux_loss": 0.04807384721934795, "loss/crossentropy": 2.7018039345741274, "loss/logits": 0.8276410967111587, "step": 48210 }, { "epoch": 0.4822, "grad_norm": 14.9375, "grad_norm_var": 0.8374837239583334, "learning_rate": 0.0003, "loss": 11.1851, "loss/aux_loss": 0.04808012768626213, "loss/crossentropy": 2.792585861682892, "loss/logits": 0.8121782958507537, "step": 48220 }, { "epoch": 0.4823, "grad_norm": 14.25, "grad_norm_var": 0.5713541666666667, "learning_rate": 0.0003, "loss": 11.0181, "loss/aux_loss": 0.04806952588260174, "loss/crossentropy": 2.7548343539237976, "loss/logits": 0.8300218850374221, "step": 48230 }, { "epoch": 0.4824, "grad_norm": 13.625, "grad_norm_var": 0.5208333333333334, "learning_rate": 0.0003, "loss": 10.8143, "loss/aux_loss": 0.04807311985641718, "loss/crossentropy": 2.582223576307297, "loss/logits": 0.8206641644239425, "step": 48240 }, { "epoch": 0.4825, "grad_norm": 14.125, "grad_norm_var": 0.4598307291666667, "learning_rate": 0.0003, "loss": 11.1807, "loss/aux_loss": 0.04807979855686426, "loss/crossentropy": 2.6816562175750733, "loss/logits": 0.8363949626684188, "step": 48250 }, { "epoch": 0.4826, "grad_norm": 14.125, "grad_norm_var": 0.2869791666666667, "learning_rate": 0.0003, "loss": 11.0489, "loss/aux_loss": 0.04806763082742691, "loss/crossentropy": 2.7296660900115968, "loss/logits": 0.8218880474567414, "step": 48260 }, { "epoch": 0.4827, "grad_norm": 13.9375, "grad_norm_var": 0.29347330729166665, "learning_rate": 0.0003, "loss": 11.1751, "loss/aux_loss": 0.04807308837771416, "loss/crossentropy": 2.7679852724075316, "loss/logits": 0.8572757095098495, "step": 48270 }, { "epoch": 0.4828, "grad_norm": 14.75, "grad_norm_var": 0.2908854166666667, "learning_rate": 0.0003, "loss": 10.9314, "loss/aux_loss": 0.04806960169225931, "loss/crossentropy": 2.6580540001392365, "loss/logits": 0.8273628979921341, "step": 48280 }, { "epoch": 0.4829, "grad_norm": 14.625, "grad_norm_var": 0.41795247395833335, "learning_rate": 0.0003, "loss": 11.0294, "loss/aux_loss": 0.04808035921305418, "loss/crossentropy": 2.713185727596283, "loss/logits": 0.8252017825841904, "step": 48290 }, { "epoch": 0.483, "grad_norm": 13.9375, "grad_norm_var": 0.28592122395833336, "learning_rate": 0.0003, "loss": 11.0288, "loss/aux_loss": 0.04807136319577694, "loss/crossentropy": 2.6470188081264494, "loss/logits": 0.8096356302499771, "step": 48300 }, { "epoch": 0.4831, "grad_norm": 14.6875, "grad_norm_var": 0.33865559895833336, "learning_rate": 0.0003, "loss": 11.0124, "loss/aux_loss": 0.048075702227652076, "loss/crossentropy": 2.628252637386322, "loss/logits": 0.8306295484304428, "step": 48310 }, { "epoch": 0.4832, "grad_norm": 13.9375, "grad_norm_var": 0.333837890625, "learning_rate": 0.0003, "loss": 11.2257, "loss/aux_loss": 0.04808404166251421, "loss/crossentropy": 2.575061935186386, "loss/logits": 0.8175129801034927, "step": 48320 }, { "epoch": 0.4833, "grad_norm": 14.0625, "grad_norm_var": 0.39386393229166666, "learning_rate": 0.0003, "loss": 11.2456, "loss/aux_loss": 0.04806752149015665, "loss/crossentropy": 2.6643171072006226, "loss/logits": 0.8468601524829864, "step": 48330 }, { "epoch": 0.4834, "grad_norm": 16.375, "grad_norm_var": 0.9049479166666666, "learning_rate": 0.0003, "loss": 11.0414, "loss/aux_loss": 0.048077189922332765, "loss/crossentropy": 2.659744346141815, "loss/logits": 0.7988389104604721, "step": 48340 }, { "epoch": 0.4835, "grad_norm": 14.375, "grad_norm_var": 367.3979166666667, "learning_rate": 0.0003, "loss": 11.1791, "loss/aux_loss": 0.04807696957141161, "loss/crossentropy": 2.824947530031204, "loss/logits": 0.8144560337066651, "step": 48350 }, { "epoch": 0.4836, "grad_norm": 14.75, "grad_norm_var": 2.502067057291667, "learning_rate": 0.0003, "loss": 11.0857, "loss/aux_loss": 0.048078769072890284, "loss/crossentropy": 2.700359559059143, "loss/logits": 0.8216308414936065, "step": 48360 }, { "epoch": 0.4837, "grad_norm": 15.0625, "grad_norm_var": 0.2712890625, "learning_rate": 0.0003, "loss": 11.0512, "loss/aux_loss": 0.048067495599389075, "loss/crossentropy": 2.7094205021858215, "loss/logits": 0.848452877998352, "step": 48370 }, { "epoch": 0.4838, "grad_norm": 14.3125, "grad_norm_var": 0.5499348958333333, "learning_rate": 0.0003, "loss": 11.2143, "loss/aux_loss": 0.04807147514075041, "loss/crossentropy": 2.610717463493347, "loss/logits": 0.8028295308351516, "step": 48380 }, { "epoch": 0.4839, "grad_norm": 15.875, "grad_norm_var": 0.8441243489583333, "learning_rate": 0.0003, "loss": 11.0868, "loss/aux_loss": 0.04808458909392357, "loss/crossentropy": 2.649008184671402, "loss/logits": 0.7971860766410828, "step": 48390 }, { "epoch": 0.484, "grad_norm": 16.875, "grad_norm_var": 0.6166666666666667, "learning_rate": 0.0003, "loss": 11.1314, "loss/aux_loss": 0.048060483299195766, "loss/crossentropy": 2.60991570353508, "loss/logits": 0.8266617238521576, "step": 48400 }, { "epoch": 0.4841, "grad_norm": 14.5, "grad_norm_var": 0.7341145833333333, "learning_rate": 0.0003, "loss": 11.2854, "loss/aux_loss": 0.04807337708771229, "loss/crossentropy": 2.8312358379364015, "loss/logits": 0.8423346072435379, "step": 48410 }, { "epoch": 0.4842, "grad_norm": 14.25, "grad_norm_var": 0.461572265625, "learning_rate": 0.0003, "loss": 11.0738, "loss/aux_loss": 0.048069039918482305, "loss/crossentropy": 2.6287239670753477, "loss/logits": 0.8132491081953048, "step": 48420 }, { "epoch": 0.4843, "grad_norm": 14.375, "grad_norm_var": 0.38409830729166666, "learning_rate": 0.0003, "loss": 11.1052, "loss/aux_loss": 0.04807084016501904, "loss/crossentropy": 2.6560630083084105, "loss/logits": 0.8387343198060989, "step": 48430 }, { "epoch": 0.4844, "grad_norm": 15.3125, "grad_norm_var": 0.22786458333333334, "learning_rate": 0.0003, "loss": 11.218, "loss/aux_loss": 0.04808529261499643, "loss/crossentropy": 2.574995279312134, "loss/logits": 0.8190798044204712, "step": 48440 }, { "epoch": 0.4845, "grad_norm": 14.5, "grad_norm_var": 1.1051920572916667, "learning_rate": 0.0003, "loss": 11.2403, "loss/aux_loss": 0.04806236382573843, "loss/crossentropy": 2.641158491373062, "loss/logits": 0.8146007388830185, "step": 48450 }, { "epoch": 0.4846, "grad_norm": 13.125, "grad_norm_var": 0.8126139322916667, "learning_rate": 0.0003, "loss": 11.1928, "loss/aux_loss": 0.048080836050212385, "loss/crossentropy": 2.9179535865783692, "loss/logits": 0.8601721286773681, "step": 48460 }, { "epoch": 0.4847, "grad_norm": 14.5625, "grad_norm_var": 0.4988932291666667, "learning_rate": 0.0003, "loss": 11.1159, "loss/aux_loss": 0.04806997887790203, "loss/crossentropy": 2.536268186569214, "loss/logits": 0.8264925092458725, "step": 48470 }, { "epoch": 0.4848, "grad_norm": 13.3125, "grad_norm_var": 1.3744791666666667, "learning_rate": 0.0003, "loss": 11.0528, "loss/aux_loss": 0.0480733098462224, "loss/crossentropy": 2.5422776341438293, "loss/logits": 0.7887350648641587, "step": 48480 }, { "epoch": 0.4849, "grad_norm": 14.125, "grad_norm_var": 0.5436848958333333, "learning_rate": 0.0003, "loss": 11.1683, "loss/aux_loss": 0.048076307587325576, "loss/crossentropy": 2.744097375869751, "loss/logits": 0.8400337219238281, "step": 48490 }, { "epoch": 0.485, "grad_norm": 14.3125, "grad_norm_var": 0.22473958333333333, "learning_rate": 0.0003, "loss": 11.0867, "loss/aux_loss": 0.048070046678185464, "loss/crossentropy": 2.8123038172721864, "loss/logits": 0.8171926707029342, "step": 48500 }, { "epoch": 0.4851, "grad_norm": 14.0, "grad_norm_var": 0.318994140625, "learning_rate": 0.0003, "loss": 10.9393, "loss/aux_loss": 0.048068254627287386, "loss/crossentropy": 2.693702256679535, "loss/logits": 0.8272054940462112, "step": 48510 }, { "epoch": 0.4852, "grad_norm": 15.4375, "grad_norm_var": 0.492822265625, "learning_rate": 0.0003, "loss": 11.2476, "loss/aux_loss": 0.04807813167572021, "loss/crossentropy": 2.7948949217796324, "loss/logits": 0.8314435452222824, "step": 48520 }, { "epoch": 0.4853, "grad_norm": 14.0625, "grad_norm_var": 0.4557291666666667, "learning_rate": 0.0003, "loss": 11.1876, "loss/aux_loss": 0.04806694649159908, "loss/crossentropy": 2.935932195186615, "loss/logits": 0.8360010713338852, "step": 48530 }, { "epoch": 0.4854, "grad_norm": 14.625, "grad_norm_var": 59.37890625, "learning_rate": 0.0003, "loss": 11.0565, "loss/aux_loss": 0.048075059242546556, "loss/crossentropy": 2.6938049614429476, "loss/logits": 0.8414293229579926, "step": 48540 }, { "epoch": 0.4855, "grad_norm": 14.0, "grad_norm_var": 51.1009765625, "learning_rate": 0.0003, "loss": 11.2106, "loss/aux_loss": 0.0480806240811944, "loss/crossentropy": 2.7439634084701536, "loss/logits": 0.8203217297792434, "step": 48550 }, { "epoch": 0.4856, "grad_norm": 15.4375, "grad_norm_var": 0.3509765625, "learning_rate": 0.0003, "loss": 11.0734, "loss/aux_loss": 0.04806556981056929, "loss/crossentropy": 2.665660631656647, "loss/logits": 0.8277056187391281, "step": 48560 }, { "epoch": 0.4857, "grad_norm": 14.4375, "grad_norm_var": 0.25128580729166666, "learning_rate": 0.0003, "loss": 11.0559, "loss/aux_loss": 0.04807299673557282, "loss/crossentropy": 2.8438867926597595, "loss/logits": 0.8425880312919617, "step": 48570 }, { "epoch": 0.4858, "grad_norm": 13.5625, "grad_norm_var": 0.641650390625, "learning_rate": 0.0003, "loss": 10.9983, "loss/aux_loss": 0.04807940311729908, "loss/crossentropy": 2.7427866578102114, "loss/logits": 0.8035318404436111, "step": 48580 }, { "epoch": 0.4859, "grad_norm": 14.5, "grad_norm_var": 0.7212890625, "learning_rate": 0.0003, "loss": 10.964, "loss/aux_loss": 0.04807807970792055, "loss/crossentropy": 2.6292437076568604, "loss/logits": 0.800726181268692, "step": 48590 }, { "epoch": 0.486, "grad_norm": 15.625, "grad_norm_var": 0.913916015625, "learning_rate": 0.0003, "loss": 11.1338, "loss/aux_loss": 0.04807404633611441, "loss/crossentropy": 2.686028057336807, "loss/logits": 0.8427057951688767, "step": 48600 }, { "epoch": 0.4861, "grad_norm": 14.875, "grad_norm_var": 0.8051432291666667, "learning_rate": 0.0003, "loss": 10.9843, "loss/aux_loss": 0.04807655718177557, "loss/crossentropy": 2.5984533965587615, "loss/logits": 0.7909109711647033, "step": 48610 }, { "epoch": 0.4862, "grad_norm": 16.625, "grad_norm_var": 1.9972493489583334, "learning_rate": 0.0003, "loss": 10.9925, "loss/aux_loss": 0.04807711597532034, "loss/crossentropy": 2.593876451253891, "loss/logits": 0.831505474448204, "step": 48620 }, { "epoch": 0.4863, "grad_norm": 13.9375, "grad_norm_var": 1.0317545572916667, "learning_rate": 0.0003, "loss": 11.0305, "loss/aux_loss": 0.04807915035635233, "loss/crossentropy": 2.761583888530731, "loss/logits": 0.8605498760938645, "step": 48630 }, { "epoch": 0.4864, "grad_norm": 14.8125, "grad_norm_var": 0.37233072916666665, "learning_rate": 0.0003, "loss": 11.1418, "loss/aux_loss": 0.04806447252631187, "loss/crossentropy": 2.606149101257324, "loss/logits": 0.8358212620019912, "step": 48640 }, { "epoch": 0.4865, "grad_norm": 15.0, "grad_norm_var": 0.32962239583333336, "learning_rate": 0.0003, "loss": 10.9811, "loss/aux_loss": 0.04808054771274328, "loss/crossentropy": 2.6689969122409822, "loss/logits": 0.8446838974952697, "step": 48650 }, { "epoch": 0.4866, "grad_norm": 15.1875, "grad_norm_var": 2.2749348958333333, "learning_rate": 0.0003, "loss": 11.2319, "loss/aux_loss": 0.048064498230814934, "loss/crossentropy": 2.760690987110138, "loss/logits": 0.8313882291316986, "step": 48660 }, { "epoch": 0.4867, "grad_norm": 14.5, "grad_norm_var": 0.6442057291666666, "learning_rate": 0.0003, "loss": 11.0473, "loss/aux_loss": 0.04807339608669281, "loss/crossentropy": 2.789001631736755, "loss/logits": 0.8379965245723724, "step": 48670 }, { "epoch": 0.4868, "grad_norm": 14.5625, "grad_norm_var": 0.3728515625, "learning_rate": 0.0003, "loss": 11.193, "loss/aux_loss": 0.04807340279221535, "loss/crossentropy": 2.65660617351532, "loss/logits": 0.8233199805021286, "step": 48680 }, { "epoch": 0.4869, "grad_norm": 15.625, "grad_norm_var": 0.5338541666666666, "learning_rate": 0.0003, "loss": 10.828, "loss/aux_loss": 0.04807305708527565, "loss/crossentropy": 2.6044947862625123, "loss/logits": 0.7953520357608795, "step": 48690 }, { "epoch": 0.487, "grad_norm": 14.8125, "grad_norm_var": 0.7331868489583333, "learning_rate": 0.0003, "loss": 11.0447, "loss/aux_loss": 0.04807252772152424, "loss/crossentropy": 2.5909561276435853, "loss/logits": 0.8053539365530014, "step": 48700 }, { "epoch": 0.4871, "grad_norm": 17.0, "grad_norm_var": 0.653759765625, "learning_rate": 0.0003, "loss": 11.1363, "loss/aux_loss": 0.04807318150997162, "loss/crossentropy": 2.6690219819545744, "loss/logits": 0.8558798760175705, "step": 48710 }, { "epoch": 0.4872, "grad_norm": 13.1875, "grad_norm_var": 0.7322916666666667, "learning_rate": 0.0003, "loss": 10.9482, "loss/aux_loss": 0.048087730258703235, "loss/crossentropy": 2.7921680390834807, "loss/logits": 0.8220372408628464, "step": 48720 }, { "epoch": 0.4873, "grad_norm": 14.3125, "grad_norm_var": 0.6102701822916666, "learning_rate": 0.0003, "loss": 11.0286, "loss/aux_loss": 0.04806871749460697, "loss/crossentropy": 2.6840153992176057, "loss/logits": 0.850041389465332, "step": 48730 }, { "epoch": 0.4874, "grad_norm": 15.5625, "grad_norm_var": 0.7270670572916667, "learning_rate": 0.0003, "loss": 11.1639, "loss/aux_loss": 0.04806739930063486, "loss/crossentropy": 2.858708620071411, "loss/logits": 0.8481850981712341, "step": 48740 }, { "epoch": 0.4875, "grad_norm": 16.25, "grad_norm_var": 3.2315104166666666, "learning_rate": 0.0003, "loss": 10.9632, "loss/aux_loss": 0.04807026702910662, "loss/crossentropy": 2.6791675448417664, "loss/logits": 0.8031369209289551, "step": 48750 }, { "epoch": 0.4876, "grad_norm": 13.375, "grad_norm_var": 0.8124837239583333, "learning_rate": 0.0003, "loss": 10.9988, "loss/aux_loss": 0.04808725789189339, "loss/crossentropy": 2.6807437360286714, "loss/logits": 0.8061759442090988, "step": 48760 }, { "epoch": 0.4877, "grad_norm": 15.5625, "grad_norm_var": 0.7791666666666667, "learning_rate": 0.0003, "loss": 11.1572, "loss/aux_loss": 0.04806392826139927, "loss/crossentropy": 2.44208277463913, "loss/logits": 0.8261926531791687, "step": 48770 }, { "epoch": 0.4878, "grad_norm": 15.625, "grad_norm_var": 0.9025390625, "learning_rate": 0.0003, "loss": 11.1327, "loss/aux_loss": 0.04807030875235796, "loss/crossentropy": 2.6741757929325103, "loss/logits": 0.8217558234930038, "step": 48780 }, { "epoch": 0.4879, "grad_norm": 14.9375, "grad_norm_var": 0.49412434895833335, "learning_rate": 0.0003, "loss": 11.1311, "loss/aux_loss": 0.04807980488985777, "loss/crossentropy": 2.5993297338485717, "loss/logits": 0.8183979272842408, "step": 48790 }, { "epoch": 0.488, "grad_norm": 15.0625, "grad_norm_var": 0.2634765625, "learning_rate": 0.0003, "loss": 11.1117, "loss/aux_loss": 0.04807808380573988, "loss/crossentropy": 2.6094757199287413, "loss/logits": 0.7935949236154556, "step": 48800 }, { "epoch": 0.4881, "grad_norm": 15.6875, "grad_norm_var": 0.7231770833333333, "learning_rate": 0.0003, "loss": 11.2077, "loss/aux_loss": 0.04806241802871227, "loss/crossentropy": 2.876955544948578, "loss/logits": 0.8346400111913681, "step": 48810 }, { "epoch": 0.4882, "grad_norm": 15.9375, "grad_norm_var": 0.9645182291666666, "learning_rate": 0.0003, "loss": 11.1004, "loss/aux_loss": 0.048071075975894925, "loss/crossentropy": 2.8417654395103455, "loss/logits": 0.8361264318227768, "step": 48820 }, { "epoch": 0.4883, "grad_norm": 14.1875, "grad_norm_var": 0.551416015625, "learning_rate": 0.0003, "loss": 11.171, "loss/aux_loss": 0.04807602632790804, "loss/crossentropy": 2.799322694540024, "loss/logits": 0.8414522469043731, "step": 48830 }, { "epoch": 0.4884, "grad_norm": 15.875, "grad_norm_var": 0.8311848958333333, "learning_rate": 0.0003, "loss": 11.2567, "loss/aux_loss": 0.04806129559874535, "loss/crossentropy": 2.689255505800247, "loss/logits": 0.8547393798828125, "step": 48840 }, { "epoch": 0.4885, "grad_norm": 13.9375, "grad_norm_var": 0.563916015625, "learning_rate": 0.0003, "loss": 11.1285, "loss/aux_loss": 0.04807682503014803, "loss/crossentropy": 2.797248286008835, "loss/logits": 0.8470206201076508, "step": 48850 }, { "epoch": 0.4886, "grad_norm": 14.625, "grad_norm_var": 0.78046875, "learning_rate": 0.0003, "loss": 11.0018, "loss/aux_loss": 0.04807422161102295, "loss/crossentropy": 2.511183685064316, "loss/logits": 0.7976685196161271, "step": 48860 }, { "epoch": 0.4887, "grad_norm": 14.9375, "grad_norm_var": 7.047509765625, "learning_rate": 0.0003, "loss": 11.0695, "loss/aux_loss": 0.04808024186640978, "loss/crossentropy": 2.8470928072929382, "loss/logits": 0.8435232043266296, "step": 48870 }, { "epoch": 0.4888, "grad_norm": 15.625, "grad_norm_var": 7.532666015625, "learning_rate": 0.0003, "loss": 11.0045, "loss/aux_loss": 0.048071150295436384, "loss/crossentropy": 2.7967050075531006, "loss/logits": 0.8275404214859009, "step": 48880 }, { "epoch": 0.4889, "grad_norm": 14.5625, "grad_norm_var": 0.4337890625, "learning_rate": 0.0003, "loss": 11.0734, "loss/aux_loss": 0.04806635808199644, "loss/crossentropy": 2.776483827829361, "loss/logits": 0.8333809942007064, "step": 48890 }, { "epoch": 0.489, "grad_norm": 13.8125, "grad_norm_var": 0.5416015625, "learning_rate": 0.0003, "loss": 11.1054, "loss/aux_loss": 0.04807385727763176, "loss/crossentropy": 2.7456183671951293, "loss/logits": 0.8244105398654937, "step": 48900 }, { "epoch": 0.4891, "grad_norm": 14.5625, "grad_norm_var": 0.9114420572916667, "learning_rate": 0.0003, "loss": 11.2572, "loss/aux_loss": 0.04807461760938168, "loss/crossentropy": 2.9388389587402344, "loss/logits": 0.8438066065311431, "step": 48910 }, { "epoch": 0.4892, "grad_norm": 14.5625, "grad_norm_var": 0.9707682291666667, "learning_rate": 0.0003, "loss": 11.3584, "loss/aux_loss": 0.04808568153530359, "loss/crossentropy": 2.8007669508457185, "loss/logits": 0.8510422587394715, "step": 48920 }, { "epoch": 0.4893, "grad_norm": 14.1875, "grad_norm_var": 0.916650390625, "learning_rate": 0.0003, "loss": 10.971, "loss/aux_loss": 0.0480602802708745, "loss/crossentropy": 2.614273113012314, "loss/logits": 0.8163867175579071, "step": 48930 }, { "epoch": 0.4894, "grad_norm": 15.125, "grad_norm_var": 3.5825520833333333, "learning_rate": 0.0003, "loss": 11.1483, "loss/aux_loss": 0.04808527324348688, "loss/crossentropy": 2.74897957444191, "loss/logits": 0.8206755816936493, "step": 48940 }, { "epoch": 0.4895, "grad_norm": 14.5, "grad_norm_var": 2.756705729166667, "learning_rate": 0.0003, "loss": 11.139, "loss/aux_loss": 0.04805995114147663, "loss/crossentropy": 2.8490783333778382, "loss/logits": 0.8311156839132309, "step": 48950 }, { "epoch": 0.4896, "grad_norm": 14.5, "grad_norm_var": 0.779150390625, "learning_rate": 0.0003, "loss": 11.1204, "loss/aux_loss": 0.04806854724884033, "loss/crossentropy": 2.634866565465927, "loss/logits": 0.8357069045305252, "step": 48960 }, { "epoch": 0.4897, "grad_norm": 14.875, "grad_norm_var": 0.4479166666666667, "learning_rate": 0.0003, "loss": 10.908, "loss/aux_loss": 0.04807532802224159, "loss/crossentropy": 2.716035795211792, "loss/logits": 0.8380373746156693, "step": 48970 }, { "epoch": 0.4898, "grad_norm": 20.375, "grad_norm_var": 2.3684895833333335, "learning_rate": 0.0003, "loss": 11.1729, "loss/aux_loss": 0.04806866105645895, "loss/crossentropy": 2.523072302341461, "loss/logits": 0.809576940536499, "step": 48980 }, { "epoch": 0.4899, "grad_norm": 14.5625, "grad_norm_var": 2.4712890625, "learning_rate": 0.0003, "loss": 10.9139, "loss/aux_loss": 0.0480740724131465, "loss/crossentropy": 2.5105146706104278, "loss/logits": 0.8070461362600326, "step": 48990 }, { "epoch": 0.49, "grad_norm": 14.125, "grad_norm_var": 0.3726399739583333, "learning_rate": 0.0003, "loss": 10.9705, "loss/aux_loss": 0.048071536049246785, "loss/crossentropy": 2.8065383076667785, "loss/logits": 0.8255183070898056, "step": 49000 }, { "epoch": 0.4901, "grad_norm": 14.9375, "grad_norm_var": 0.33917643229166666, "learning_rate": 0.0003, "loss": 11.1304, "loss/aux_loss": 0.048070686869323254, "loss/crossentropy": 2.783307147026062, "loss/logits": 0.817973655462265, "step": 49010 }, { "epoch": 0.4902, "grad_norm": 13.25, "grad_norm_var": 0.7018229166666666, "learning_rate": 0.0003, "loss": 11.1179, "loss/aux_loss": 0.048075612261891366, "loss/crossentropy": 2.7615358352661135, "loss/logits": 0.8785318732261658, "step": 49020 }, { "epoch": 0.4903, "grad_norm": 14.0, "grad_norm_var": 0.8627604166666667, "learning_rate": 0.0003, "loss": 11.1486, "loss/aux_loss": 0.04808089416474104, "loss/crossentropy": 2.7252781689167023, "loss/logits": 0.8219867736101151, "step": 49030 }, { "epoch": 0.4904, "grad_norm": 15.0625, "grad_norm_var": 0.7822916666666667, "learning_rate": 0.0003, "loss": 10.9766, "loss/aux_loss": 0.048066638968884946, "loss/crossentropy": 2.6021959662437437, "loss/logits": 0.8107183337211609, "step": 49040 }, { "epoch": 0.4905, "grad_norm": 14.25, "grad_norm_var": 0.6675618489583334, "learning_rate": 0.0003, "loss": 11.0778, "loss/aux_loss": 0.04807810839265585, "loss/crossentropy": 2.7399057030677794, "loss/logits": 0.8506060719490052, "step": 49050 }, { "epoch": 0.4906, "grad_norm": 14.0625, "grad_norm_var": 0.299853515625, "learning_rate": 0.0003, "loss": 10.9571, "loss/aux_loss": 0.04807314686477184, "loss/crossentropy": 2.8067448258399965, "loss/logits": 0.8424109250307084, "step": 49060 }, { "epoch": 0.4907, "grad_norm": 15.125, "grad_norm_var": 0.6353515625, "learning_rate": 0.0003, "loss": 11.0461, "loss/aux_loss": 0.048080555908381936, "loss/crossentropy": 2.6311775505542756, "loss/logits": 0.8175216227769851, "step": 49070 }, { "epoch": 0.4908, "grad_norm": 13.125, "grad_norm_var": 1538.88359375, "learning_rate": 0.0003, "loss": 11.1342, "loss/aux_loss": 0.04807594697922468, "loss/crossentropy": 2.646185690164566, "loss/logits": 0.8125636070966721, "step": 49080 }, { "epoch": 0.4909, "grad_norm": 13.75, "grad_norm_var": 0.865087890625, "learning_rate": 0.0003, "loss": 11.1334, "loss/aux_loss": 0.04806858729571104, "loss/crossentropy": 2.7989614844322204, "loss/logits": 0.825323560833931, "step": 49090 }, { "epoch": 0.491, "grad_norm": 14.75, "grad_norm_var": 0.31197916666666664, "learning_rate": 0.0003, "loss": 11.1018, "loss/aux_loss": 0.04807522725313902, "loss/crossentropy": 2.722189944982529, "loss/logits": 0.8272465378046036, "step": 49100 }, { "epoch": 0.4911, "grad_norm": 15.3125, "grad_norm_var": 0.37180989583333335, "learning_rate": 0.0003, "loss": 11.0668, "loss/aux_loss": 0.048069358244538306, "loss/crossentropy": 2.7910789966583254, "loss/logits": 0.8452953428030014, "step": 49110 }, { "epoch": 0.4912, "grad_norm": 14.5625, "grad_norm_var": 0.38430989583333336, "learning_rate": 0.0003, "loss": 11.0669, "loss/aux_loss": 0.04807008057832718, "loss/crossentropy": 2.660131776332855, "loss/logits": 0.8045534908771514, "step": 49120 }, { "epoch": 0.4913, "grad_norm": 13.875, "grad_norm_var": 1.168603515625, "learning_rate": 0.0003, "loss": 10.9811, "loss/aux_loss": 0.048069944977760314, "loss/crossentropy": 2.664647787809372, "loss/logits": 0.8101317912340165, "step": 49130 }, { "epoch": 0.4914, "grad_norm": 14.875, "grad_norm_var": 1.0976399739583333, "learning_rate": 0.0003, "loss": 11.0029, "loss/aux_loss": 0.04808374773710966, "loss/crossentropy": 2.6824153780937197, "loss/logits": 0.8375868052244186, "step": 49140 }, { "epoch": 0.4915, "grad_norm": 14.0625, "grad_norm_var": 1.1554524739583334, "learning_rate": 0.0003, "loss": 11.1305, "loss/aux_loss": 0.04806366134434938, "loss/crossentropy": 2.7819727063179016, "loss/logits": 0.848738157749176, "step": 49150 }, { "epoch": 0.4916, "grad_norm": 15.8125, "grad_norm_var": 0.650634765625, "learning_rate": 0.0003, "loss": 11.2014, "loss/aux_loss": 0.048080661334097385, "loss/crossentropy": 2.6901016354560854, "loss/logits": 0.850712725520134, "step": 49160 }, { "epoch": 0.4917, "grad_norm": 14.1875, "grad_norm_var": 7.600634765625, "learning_rate": 0.0003, "loss": 11.1302, "loss/aux_loss": 0.04806759636849165, "loss/crossentropy": 2.750749206542969, "loss/logits": 0.8357772469520569, "step": 49170 }, { "epoch": 0.4918, "grad_norm": 16.625, "grad_norm_var": 15.297330729166667, "learning_rate": 0.0003, "loss": 11.0257, "loss/aux_loss": 0.048100476153194904, "loss/crossentropy": 2.585780292749405, "loss/logits": 0.7976143449544907, "step": 49180 }, { "epoch": 0.4919, "grad_norm": 16.875, "grad_norm_var": 8.941259765625, "learning_rate": 0.0003, "loss": 11.1719, "loss/aux_loss": 0.048067899979650976, "loss/crossentropy": 2.7343214392662047, "loss/logits": 0.835987788438797, "step": 49190 }, { "epoch": 0.492, "grad_norm": 15.375, "grad_norm_var": 0.9044270833333333, "learning_rate": 0.0003, "loss": 11.0691, "loss/aux_loss": 0.04807320646941662, "loss/crossentropy": 2.6477014422416687, "loss/logits": 0.8445640474557876, "step": 49200 }, { "epoch": 0.4921, "grad_norm": 14.6875, "grad_norm_var": 0.6184733072916667, "learning_rate": 0.0003, "loss": 11.0543, "loss/aux_loss": 0.048070452734828, "loss/crossentropy": 2.7898465573787687, "loss/logits": 0.8426847785711289, "step": 49210 }, { "epoch": 0.4922, "grad_norm": 16.75, "grad_norm_var": 0.6379557291666667, "learning_rate": 0.0003, "loss": 11.094, "loss/aux_loss": 0.04807724431157112, "loss/crossentropy": 2.6970956563949584, "loss/logits": 0.8135352551937103, "step": 49220 }, { "epoch": 0.4923, "grad_norm": 16.125, "grad_norm_var": 1.0978515625, "learning_rate": 0.0003, "loss": 11.1644, "loss/aux_loss": 0.048074241168797015, "loss/crossentropy": 2.668863868713379, "loss/logits": 0.803708478808403, "step": 49230 }, { "epoch": 0.4924, "grad_norm": 14.25, "grad_norm_var": 0.9801432291666666, "learning_rate": 0.0003, "loss": 11.1072, "loss/aux_loss": 0.04808567836880684, "loss/crossentropy": 2.7105720579624175, "loss/logits": 0.8392592817544937, "step": 49240 }, { "epoch": 0.4925, "grad_norm": 14.0625, "grad_norm_var": 0.61328125, "learning_rate": 0.0003, "loss": 11.1155, "loss/aux_loss": 0.04807297699153423, "loss/crossentropy": 2.7127108812332152, "loss/logits": 0.8241129338741302, "step": 49250 }, { "epoch": 0.4926, "grad_norm": 13.875, "grad_norm_var": 1.4054524739583334, "learning_rate": 0.0003, "loss": 10.8193, "loss/aux_loss": 0.04807514287531376, "loss/crossentropy": 2.4936522424221037, "loss/logits": 0.7675803631544114, "step": 49260 }, { "epoch": 0.4927, "grad_norm": 14.4375, "grad_norm_var": 0.5218098958333334, "learning_rate": 0.0003, "loss": 11.0741, "loss/aux_loss": 0.0480735182762146, "loss/crossentropy": 2.5747238457202912, "loss/logits": 0.825323086977005, "step": 49270 }, { "epoch": 0.4928, "grad_norm": 14.75, "grad_norm_var": 0.5516764322916666, "learning_rate": 0.0003, "loss": 10.9145, "loss/aux_loss": 0.048077659122645854, "loss/crossentropy": 2.6410838782787325, "loss/logits": 0.7915389269590378, "step": 49280 }, { "epoch": 0.4929, "grad_norm": 14.25, "grad_norm_var": 1.9891764322916667, "learning_rate": 0.0003, "loss": 11.1417, "loss/aux_loss": 0.04807187095284462, "loss/crossentropy": 2.687062478065491, "loss/logits": 0.842128136754036, "step": 49290 }, { "epoch": 0.493, "grad_norm": 14.5625, "grad_norm_var": 1.7817545572916667, "learning_rate": 0.0003, "loss": 11.1977, "loss/aux_loss": 0.048072554357349875, "loss/crossentropy": 2.6798668265342713, "loss/logits": 0.8448872178792953, "step": 49300 }, { "epoch": 0.4931, "grad_norm": 15.625, "grad_norm_var": 2.8541666666666665, "learning_rate": 0.0003, "loss": 11.2154, "loss/aux_loss": 0.04807398393750191, "loss/crossentropy": 2.708115738630295, "loss/logits": 0.8346493154764175, "step": 49310 }, { "epoch": 0.4932, "grad_norm": 15.4375, "grad_norm_var": 3.0263020833333334, "learning_rate": 0.0003, "loss": 11.0153, "loss/aux_loss": 0.04807023722678423, "loss/crossentropy": 2.7580654978752137, "loss/logits": 0.8456075847148895, "step": 49320 }, { "epoch": 0.4933, "grad_norm": 14.25, "grad_norm_var": 0.3138020833333333, "learning_rate": 0.0003, "loss": 10.9872, "loss/aux_loss": 0.04807710256427526, "loss/crossentropy": 2.983762502670288, "loss/logits": 0.8636047869920731, "step": 49330 }, { "epoch": 0.4934, "grad_norm": 14.8125, "grad_norm_var": 1.1015462239583333, "learning_rate": 0.0003, "loss": 10.9637, "loss/aux_loss": 0.04806807395070791, "loss/crossentropy": 2.467972230911255, "loss/logits": 0.7947597026824951, "step": 49340 }, { "epoch": 0.4935, "grad_norm": 13.1875, "grad_norm_var": 0.37628580729166666, "learning_rate": 0.0003, "loss": 11.0743, "loss/aux_loss": 0.04806096330285072, "loss/crossentropy": 2.7554059624671936, "loss/logits": 0.8382692068815232, "step": 49350 }, { "epoch": 0.4936, "grad_norm": 14.4375, "grad_norm_var": 0.47708333333333336, "learning_rate": 0.0003, "loss": 10.9667, "loss/aux_loss": 0.04808943476527929, "loss/crossentropy": 2.6665013074874877, "loss/logits": 0.8158300817012787, "step": 49360 }, { "epoch": 0.4937, "grad_norm": 14.5, "grad_norm_var": 0.49152018229166666, "learning_rate": 0.0003, "loss": 10.8597, "loss/aux_loss": 0.0480616694316268, "loss/crossentropy": 2.788412946462631, "loss/logits": 0.8223504841327667, "step": 49370 }, { "epoch": 0.4938, "grad_norm": 13.375, "grad_norm_var": 0.46087239583333334, "learning_rate": 0.0003, "loss": 11.2213, "loss/aux_loss": 0.04808288011699915, "loss/crossentropy": 2.643638551235199, "loss/logits": 0.8171029478311539, "step": 49380 }, { "epoch": 0.4939, "grad_norm": 14.4375, "grad_norm_var": 0.3973795572916667, "learning_rate": 0.0003, "loss": 10.9418, "loss/aux_loss": 0.04807625114917755, "loss/crossentropy": 2.7338321208953857, "loss/logits": 0.8375842243432998, "step": 49390 }, { "epoch": 0.494, "grad_norm": 14.6875, "grad_norm_var": 2.592431640625, "learning_rate": 0.0003, "loss": 11.3181, "loss/aux_loss": 0.04807534031569958, "loss/crossentropy": 2.6576287031173704, "loss/logits": 0.8461039811372757, "step": 49400 }, { "epoch": 0.4941, "grad_norm": 14.1875, "grad_norm_var": 0.3700358072916667, "learning_rate": 0.0003, "loss": 10.883, "loss/aux_loss": 0.04808159098029137, "loss/crossentropy": 2.8762070536613464, "loss/logits": 0.8369950473308563, "step": 49410 }, { "epoch": 0.4942, "grad_norm": 14.25, "grad_norm_var": 0.33326822916666665, "learning_rate": 0.0003, "loss": 11.0212, "loss/aux_loss": 0.04807743299752474, "loss/crossentropy": 2.612814891338348, "loss/logits": 0.7918889284133911, "step": 49420 }, { "epoch": 0.4943, "grad_norm": 14.8125, "grad_norm_var": 6.32265625, "learning_rate": 0.0003, "loss": 11.0833, "loss/aux_loss": 0.04807397872209549, "loss/crossentropy": 2.7480882346630096, "loss/logits": 0.837815847992897, "step": 49430 }, { "epoch": 0.4944, "grad_norm": 14.5625, "grad_norm_var": 5.894905598958333, "learning_rate": 0.0003, "loss": 11.1483, "loss/aux_loss": 0.04806387610733509, "loss/crossentropy": 2.7095935344696045, "loss/logits": 0.8372041195631027, "step": 49440 }, { "epoch": 0.4945, "grad_norm": 15.5625, "grad_norm_var": 0.7262858072916667, "learning_rate": 0.0003, "loss": 10.9738, "loss/aux_loss": 0.048079794831573965, "loss/crossentropy": 2.595053482055664, "loss/logits": 0.8303221762180328, "step": 49450 }, { "epoch": 0.4946, "grad_norm": 15.0, "grad_norm_var": 0.693212890625, "learning_rate": 0.0003, "loss": 11.1594, "loss/aux_loss": 0.04807852059602737, "loss/crossentropy": 2.8263864398002623, "loss/logits": 0.8446054220199585, "step": 49460 }, { "epoch": 0.4947, "grad_norm": 15.1875, "grad_norm_var": 0.38865559895833335, "learning_rate": 0.0003, "loss": 11.0476, "loss/aux_loss": 0.04807306993752718, "loss/crossentropy": 2.781277060508728, "loss/logits": 0.8279220938682557, "step": 49470 }, { "epoch": 0.4948, "grad_norm": 13.25, "grad_norm_var": 0.8374837239583334, "learning_rate": 0.0003, "loss": 11.0972, "loss/aux_loss": 0.0480706974864006, "loss/crossentropy": 2.7130113363265993, "loss/logits": 0.8263016819953919, "step": 49480 }, { "epoch": 0.4949, "grad_norm": 15.375, "grad_norm_var": 0.3653645833333333, "learning_rate": 0.0003, "loss": 11.1089, "loss/aux_loss": 0.04807698726654053, "loss/crossentropy": 2.870068061351776, "loss/logits": 0.8862683087587356, "step": 49490 }, { "epoch": 0.495, "grad_norm": 14.625, "grad_norm_var": 0.790625, "learning_rate": 0.0003, "loss": 11.0814, "loss/aux_loss": 0.04808134399354458, "loss/crossentropy": 2.724984419345856, "loss/logits": 0.8414475739002227, "step": 49500 }, { "epoch": 0.4951, "grad_norm": 15.0625, "grad_norm_var": 0.5382649739583333, "learning_rate": 0.0003, "loss": 11.0471, "loss/aux_loss": 0.04806778896600008, "loss/crossentropy": 2.652865248918533, "loss/logits": 0.8197323232889175, "step": 49510 }, { "epoch": 0.4952, "grad_norm": 14.625, "grad_norm_var": 0.4546712239583333, "learning_rate": 0.0003, "loss": 11.051, "loss/aux_loss": 0.04808030594140291, "loss/crossentropy": 2.7049909591674806, "loss/logits": 0.8460562914609909, "step": 49520 }, { "epoch": 0.4953, "grad_norm": 14.5625, "grad_norm_var": 1.0590983072916667, "learning_rate": 0.0003, "loss": 10.8739, "loss/aux_loss": 0.0480698561295867, "loss/crossentropy": 2.702311968803406, "loss/logits": 0.8319470345973968, "step": 49530 }, { "epoch": 0.4954, "grad_norm": 14.6875, "grad_norm_var": 1.3048014322916666, "learning_rate": 0.0003, "loss": 11.0651, "loss/aux_loss": 0.048078288696706294, "loss/crossentropy": 2.6818348348140715, "loss/logits": 0.8396122336387635, "step": 49540 }, { "epoch": 0.4955, "grad_norm": 14.0625, "grad_norm_var": 0.73203125, "learning_rate": 0.0003, "loss": 11.0647, "loss/aux_loss": 0.04807879626750946, "loss/crossentropy": 2.7367110908031465, "loss/logits": 0.824239781498909, "step": 49550 }, { "epoch": 0.4956, "grad_norm": 14.9375, "grad_norm_var": 0.648291015625, "learning_rate": 0.0003, "loss": 10.9588, "loss/aux_loss": 0.04806343484669924, "loss/crossentropy": 2.6144358277320863, "loss/logits": 0.8504854917526246, "step": 49560 }, { "epoch": 0.4957, "grad_norm": 14.5, "grad_norm_var": 0.9452473958333333, "learning_rate": 0.0003, "loss": 11.0769, "loss/aux_loss": 0.0480787593871355, "loss/crossentropy": 2.649821126461029, "loss/logits": 0.8459902018308639, "step": 49570 }, { "epoch": 0.4958, "grad_norm": 15.375, "grad_norm_var": 0.488134765625, "learning_rate": 0.0003, "loss": 11.1721, "loss/aux_loss": 0.04807934109121561, "loss/crossentropy": 2.710836374759674, "loss/logits": 0.8465037196874619, "step": 49580 }, { "epoch": 0.4959, "grad_norm": 14.25, "grad_norm_var": 0.59921875, "learning_rate": 0.0003, "loss": 10.9767, "loss/aux_loss": 0.04806499667465687, "loss/crossentropy": 2.8792532682418823, "loss/logits": 0.8230845898389816, "step": 49590 }, { "epoch": 0.496, "grad_norm": 16.25, "grad_norm_var": 1.7556640625, "learning_rate": 0.0003, "loss": 11.0328, "loss/aux_loss": 0.04807982686907053, "loss/crossentropy": 2.6688818752765657, "loss/logits": 0.8213403493165969, "step": 49600 }, { "epoch": 0.4961, "grad_norm": 13.375, "grad_norm_var": 1.2947265625, "learning_rate": 0.0003, "loss": 11.1903, "loss/aux_loss": 0.04807685986161232, "loss/crossentropy": 2.7829610109329224, "loss/logits": 0.8443383306264878, "step": 49610 }, { "epoch": 0.4962, "grad_norm": 14.0, "grad_norm_var": 0.4874348958333333, "learning_rate": 0.0003, "loss": 10.9827, "loss/aux_loss": 0.048072018660604954, "loss/crossentropy": 2.6003858983516692, "loss/logits": 0.823526531457901, "step": 49620 }, { "epoch": 0.4963, "grad_norm": 14.375, "grad_norm_var": 0.306494140625, "learning_rate": 0.0003, "loss": 11.0844, "loss/aux_loss": 0.04806656241416931, "loss/crossentropy": 2.573668730258942, "loss/logits": 0.818437111377716, "step": 49630 }, { "epoch": 0.4964, "grad_norm": 14.4375, "grad_norm_var": 0.44073893229166666, "learning_rate": 0.0003, "loss": 10.9656, "loss/aux_loss": 0.04807706866413355, "loss/crossentropy": 2.759202075004578, "loss/logits": 0.840299728512764, "step": 49640 }, { "epoch": 0.4965, "grad_norm": 14.375, "grad_norm_var": 0.4557291666666667, "learning_rate": 0.0003, "loss": 11.0086, "loss/aux_loss": 0.0480681087821722, "loss/crossentropy": 2.6497737407684325, "loss/logits": 0.8393264710903168, "step": 49650 }, { "epoch": 0.4966, "grad_norm": 14.0, "grad_norm_var": 0.43474934895833334, "learning_rate": 0.0003, "loss": 11.1813, "loss/aux_loss": 0.048070674762129784, "loss/crossentropy": 2.611664170026779, "loss/logits": 0.8550961494445801, "step": 49660 }, { "epoch": 0.4967, "grad_norm": 15.375, "grad_norm_var": 10.99765625, "learning_rate": 0.0003, "loss": 11.1897, "loss/aux_loss": 0.048074728436768056, "loss/crossentropy": 2.723458409309387, "loss/logits": 0.842160576581955, "step": 49670 }, { "epoch": 0.4968, "grad_norm": 15.625, "grad_norm_var": 0.83046875, "learning_rate": 0.0003, "loss": 11.0535, "loss/aux_loss": 0.048070961609482765, "loss/crossentropy": 2.845665168762207, "loss/logits": 0.8630867063999176, "step": 49680 }, { "epoch": 0.4969, "grad_norm": 15.5, "grad_norm_var": 3.896614583333333, "learning_rate": 0.0003, "loss": 11.1466, "loss/aux_loss": 0.048077776283025744, "loss/crossentropy": 2.748256707191467, "loss/logits": 0.8359945237636566, "step": 49690 }, { "epoch": 0.497, "grad_norm": 14.25, "grad_norm_var": 1.504931640625, "learning_rate": 0.0003, "loss": 11.1832, "loss/aux_loss": 0.04807632230222225, "loss/crossentropy": 2.6894050359725954, "loss/logits": 0.8393000155687332, "step": 49700 }, { "epoch": 0.4971, "grad_norm": 15.3125, "grad_norm_var": 1.5921223958333333, "learning_rate": 0.0003, "loss": 11.2199, "loss/aux_loss": 0.04806073512881994, "loss/crossentropy": 2.835462886095047, "loss/logits": 0.8458648949861527, "step": 49710 }, { "epoch": 0.4972, "grad_norm": 15.3125, "grad_norm_var": 0.45078125, "learning_rate": 0.0003, "loss": 11.07, "loss/aux_loss": 0.048081550374627115, "loss/crossentropy": 2.640725481510162, "loss/logits": 0.8050734728574753, "step": 49720 }, { "epoch": 0.4973, "grad_norm": 14.8125, "grad_norm_var": 0.299853515625, "learning_rate": 0.0003, "loss": 10.9709, "loss/aux_loss": 0.04808553606271744, "loss/crossentropy": 2.663204771280289, "loss/logits": 0.7960788905620575, "step": 49730 }, { "epoch": 0.4974, "grad_norm": 15.25, "grad_norm_var": 60.1416015625, "learning_rate": 0.0003, "loss": 11.0414, "loss/aux_loss": 0.04805216509848833, "loss/crossentropy": 2.5938608229160307, "loss/logits": 0.8149016201496124, "step": 49740 }, { "epoch": 0.4975, "grad_norm": 13.875, "grad_norm_var": 59.84060872395833, "learning_rate": 0.0003, "loss": 11.2352, "loss/aux_loss": 0.048083586245775224, "loss/crossentropy": 2.6643555045127867, "loss/logits": 0.815577107667923, "step": 49750 }, { "epoch": 0.4976, "grad_norm": 14.6875, "grad_norm_var": 0.23645833333333333, "learning_rate": 0.0003, "loss": 10.9069, "loss/aux_loss": 0.048074550181627276, "loss/crossentropy": 2.618529570102692, "loss/logits": 0.8274006098508835, "step": 49760 }, { "epoch": 0.4977, "grad_norm": 14.8125, "grad_norm_var": 0.4942708333333333, "learning_rate": 0.0003, "loss": 11.1073, "loss/aux_loss": 0.04806795679032803, "loss/crossentropy": 2.694086503982544, "loss/logits": 0.8305206030607224, "step": 49770 }, { "epoch": 0.4978, "grad_norm": 14.25, "grad_norm_var": 0.37107747395833335, "learning_rate": 0.0003, "loss": 11.1821, "loss/aux_loss": 0.048074840754270556, "loss/crossentropy": 2.715044713020325, "loss/logits": 0.8461979001760482, "step": 49780 }, { "epoch": 0.4979, "grad_norm": 14.5, "grad_norm_var": 0.7588541666666667, "learning_rate": 0.0003, "loss": 11.1215, "loss/aux_loss": 0.048070978559553626, "loss/crossentropy": 2.643089586496353, "loss/logits": 0.806346595287323, "step": 49790 }, { "epoch": 0.498, "grad_norm": 28.625, "grad_norm_var": 12.72578125, "learning_rate": 0.0003, "loss": 11.1285, "loss/aux_loss": 0.048064558580517766, "loss/crossentropy": 2.699583125114441, "loss/logits": 0.8594643115997315, "step": 49800 }, { "epoch": 0.4981, "grad_norm": 14.6875, "grad_norm_var": 13.309309895833334, "learning_rate": 0.0003, "loss": 11.0075, "loss/aux_loss": 0.04808203764259815, "loss/crossentropy": 2.7328962683677673, "loss/logits": 0.8031840980052948, "step": 49810 }, { "epoch": 0.4982, "grad_norm": 14.6875, "grad_norm_var": 0.5218098958333334, "learning_rate": 0.0003, "loss": 11.2001, "loss/aux_loss": 0.04806886278092861, "loss/crossentropy": 2.808696722984314, "loss/logits": 0.8115378528833389, "step": 49820 }, { "epoch": 0.4983, "grad_norm": 15.0, "grad_norm_var": 0.32745768229166666, "learning_rate": 0.0003, "loss": 10.9526, "loss/aux_loss": 0.04807125814259052, "loss/crossentropy": 2.6571006894111635, "loss/logits": 0.8229701191186904, "step": 49830 }, { "epoch": 0.4984, "grad_norm": 13.5625, "grad_norm_var": 0.766259765625, "learning_rate": 0.0003, "loss": 11.1019, "loss/aux_loss": 0.04808720909059048, "loss/crossentropy": 2.7945067286491394, "loss/logits": 0.8268558502197265, "step": 49840 }, { "epoch": 0.4985, "grad_norm": 15.375, "grad_norm_var": 0.4476399739583333, "learning_rate": 0.0003, "loss": 10.9448, "loss/aux_loss": 0.04806675110012293, "loss/crossentropy": 2.5788372695446014, "loss/logits": 0.7906678229570389, "step": 49850 }, { "epoch": 0.4986, "grad_norm": 14.8125, "grad_norm_var": 0.4019368489583333, "learning_rate": 0.0003, "loss": 11.1355, "loss/aux_loss": 0.04806358329951763, "loss/crossentropy": 2.6663641929626465, "loss/logits": 0.8542564064264297, "step": 49860 }, { "epoch": 0.4987, "grad_norm": 13.875, "grad_norm_var": 0.32864583333333336, "learning_rate": 0.0003, "loss": 11.111, "loss/aux_loss": 0.04807950519025326, "loss/crossentropy": 2.5420661509037017, "loss/logits": 0.8125041216611862, "step": 49870 }, { "epoch": 0.4988, "grad_norm": 14.9375, "grad_norm_var": 0.3811848958333333, "learning_rate": 0.0003, "loss": 11.1871, "loss/aux_loss": 0.04807094018906355, "loss/crossentropy": 2.7237226247787474, "loss/logits": 0.8434429466724396, "step": 49880 }, { "epoch": 0.4989, "grad_norm": 13.5625, "grad_norm_var": 0.6645670572916667, "learning_rate": 0.0003, "loss": 11.0105, "loss/aux_loss": 0.0480699697509408, "loss/crossentropy": 2.797856557369232, "loss/logits": 0.8193605899810791, "step": 49890 }, { "epoch": 0.499, "grad_norm": 15.0625, "grad_norm_var": 0.38800455729166666, "learning_rate": 0.0003, "loss": 11.002, "loss/aux_loss": 0.048073521442711355, "loss/crossentropy": 2.7964406251907348, "loss/logits": 0.8461063802242279, "step": 49900 }, { "epoch": 0.4991, "grad_norm": 14.0625, "grad_norm_var": 0.3094889322916667, "learning_rate": 0.0003, "loss": 11.2589, "loss/aux_loss": 0.04806688260287047, "loss/crossentropy": 2.6836509346961974, "loss/logits": 0.821695277094841, "step": 49910 }, { "epoch": 0.4992, "grad_norm": 15.125, "grad_norm_var": 0.240478515625, "learning_rate": 0.0003, "loss": 11.0785, "loss/aux_loss": 0.048078537732362744, "loss/crossentropy": 2.5942283451557158, "loss/logits": 0.8679295003414154, "step": 49920 }, { "epoch": 0.4993, "grad_norm": 15.0, "grad_norm_var": 0.33058268229166665, "learning_rate": 0.0003, "loss": 11.1418, "loss/aux_loss": 0.0480769170448184, "loss/crossentropy": 2.7477990865707396, "loss/logits": 0.8367729008197784, "step": 49930 }, { "epoch": 0.4994, "grad_norm": 14.5625, "grad_norm_var": 0.8858723958333333, "learning_rate": 0.0003, "loss": 10.927, "loss/aux_loss": 0.04806565400213003, "loss/crossentropy": 2.6473158240318297, "loss/logits": 0.8119089126586914, "step": 49940 }, { "epoch": 0.4995, "grad_norm": 14.9375, "grad_norm_var": 0.5980305989583333, "learning_rate": 0.0003, "loss": 11.1737, "loss/aux_loss": 0.04807810839265585, "loss/crossentropy": 2.774595522880554, "loss/logits": 0.8349743068218232, "step": 49950 }, { "epoch": 0.4996, "grad_norm": 15.1875, "grad_norm_var": 0.6206868489583334, "learning_rate": 0.0003, "loss": 11.195, "loss/aux_loss": 0.04806303158402443, "loss/crossentropy": 2.6357213258743286, "loss/logits": 0.8456666976213455, "step": 49960 }, { "epoch": 0.4997, "grad_norm": 15.1875, "grad_norm_var": 0.9304524739583333, "learning_rate": 0.0003, "loss": 10.9898, "loss/aux_loss": 0.048077212646603584, "loss/crossentropy": 2.6961658537387847, "loss/logits": 0.8146316468715668, "step": 49970 }, { "epoch": 0.4998, "grad_norm": 13.25, "grad_norm_var": 0.326025390625, "learning_rate": 0.0003, "loss": 11.0497, "loss/aux_loss": 0.0480698449537158, "loss/crossentropy": 2.809601533412933, "loss/logits": 0.8374509602785111, "step": 49980 }, { "epoch": 0.4999, "grad_norm": 13.75, "grad_norm_var": 0.523291015625, "learning_rate": 0.0003, "loss": 11.1053, "loss/aux_loss": 0.0480751309543848, "loss/crossentropy": 2.734011006355286, "loss/logits": 0.8388842344284058, "step": 49990 }, { "epoch": 0.5, "grad_norm": 15.5, "grad_norm_var": 0.29791666666666666, "learning_rate": 0.0003, "loss": 11.0715, "loss/aux_loss": 0.04807236734777689, "loss/crossentropy": 2.8549141943454743, "loss/logits": 0.8532051771879197, "step": 50000 }, { "epoch": 0.5001, "grad_norm": 15.375, "grad_norm_var": 0.7484212239583333, "learning_rate": 0.0003, "loss": 11.1863, "loss/aux_loss": 0.0480760183185339, "loss/crossentropy": 2.526221138238907, "loss/logits": 0.8279460847377778, "step": 50010 }, { "epoch": 0.5002, "grad_norm": 14.75, "grad_norm_var": 0.8884765625, "learning_rate": 0.0003, "loss": 11.0844, "loss/aux_loss": 0.04807603172957897, "loss/crossentropy": 2.786004549264908, "loss/logits": 0.8343773394823074, "step": 50020 }, { "epoch": 0.5003, "grad_norm": 14.75, "grad_norm_var": 0.40234375, "learning_rate": 0.0003, "loss": 11.1362, "loss/aux_loss": 0.04807902593165636, "loss/crossentropy": 2.614196312427521, "loss/logits": 0.8386217921972274, "step": 50030 }, { "epoch": 0.5004, "grad_norm": 14.25, "grad_norm_var": 0.4337076822916667, "learning_rate": 0.0003, "loss": 11.0241, "loss/aux_loss": 0.048064802400767805, "loss/crossentropy": 2.7048224210739136, "loss/logits": 0.8228438705205917, "step": 50040 }, { "epoch": 0.5005, "grad_norm": 15.4375, "grad_norm_var": 188.100244140625, "learning_rate": 0.0003, "loss": 11.1056, "loss/aux_loss": 0.04807979743927717, "loss/crossentropy": 2.6778744578361513, "loss/logits": 0.8488940119743347, "step": 50050 }, { "epoch": 0.5006, "grad_norm": 16.875, "grad_norm_var": 2.1015625, "learning_rate": 0.0003, "loss": 11.0635, "loss/aux_loss": 0.0480663301423192, "loss/crossentropy": 2.6144404113292694, "loss/logits": 0.7824886530637741, "step": 50060 }, { "epoch": 0.5007, "grad_norm": 16.5, "grad_norm_var": 0.9775390625, "learning_rate": 0.0003, "loss": 11.13, "loss/aux_loss": 0.048070177994668485, "loss/crossentropy": 2.738340699672699, "loss/logits": 0.823601758480072, "step": 50070 }, { "epoch": 0.5008, "grad_norm": 13.0, "grad_norm_var": 0.8587076822916667, "learning_rate": 0.0003, "loss": 11.0701, "loss/aux_loss": 0.0480703879147768, "loss/crossentropy": 2.7732195377349855, "loss/logits": 0.8485528379678726, "step": 50080 }, { "epoch": 0.5009, "grad_norm": 15.9375, "grad_norm_var": 0.47902018229166665, "learning_rate": 0.0003, "loss": 10.9742, "loss/aux_loss": 0.04807235468178987, "loss/crossentropy": 2.750224161148071, "loss/logits": 0.8227509766817093, "step": 50090 }, { "epoch": 0.501, "grad_norm": 13.0625, "grad_norm_var": 0.8212076822916666, "learning_rate": 0.0003, "loss": 11.0486, "loss/aux_loss": 0.048071041516959664, "loss/crossentropy": 2.774020862579346, "loss/logits": 0.8568490296602249, "step": 50100 }, { "epoch": 0.5011, "grad_norm": 14.25, "grad_norm_var": 0.4593587239583333, "learning_rate": 0.0003, "loss": 11.04, "loss/aux_loss": 0.048077203519642356, "loss/crossentropy": 2.6621095538139343, "loss/logits": 0.8019068986177444, "step": 50110 }, { "epoch": 0.5012, "grad_norm": 15.0, "grad_norm_var": 15.922395833333333, "learning_rate": 0.0003, "loss": 11.2264, "loss/aux_loss": 0.04807031415402889, "loss/crossentropy": 2.7066645860671996, "loss/logits": 0.8399304032325745, "step": 50120 }, { "epoch": 0.5013, "grad_norm": 14.3125, "grad_norm_var": 157.83645833333333, "learning_rate": 0.0003, "loss": 11.1837, "loss/aux_loss": 0.048072899132966994, "loss/crossentropy": 2.6874527156353, "loss/logits": 0.7938042402267456, "step": 50130 }, { "epoch": 0.5014, "grad_norm": 14.9375, "grad_norm_var": 1.9280598958333333, "learning_rate": 0.0003, "loss": 11.0273, "loss/aux_loss": 0.04807997718453407, "loss/crossentropy": 2.8585541009902955, "loss/logits": 0.8428541749715805, "step": 50140 }, { "epoch": 0.5015, "grad_norm": 13.9375, "grad_norm_var": 0.8619140625, "learning_rate": 0.0003, "loss": 11.1604, "loss/aux_loss": 0.04806944746524096, "loss/crossentropy": 2.7434488892555238, "loss/logits": 0.809849202632904, "step": 50150 }, { "epoch": 0.5016, "grad_norm": 14.5, "grad_norm_var": 0.5499837239583333, "learning_rate": 0.0003, "loss": 10.9876, "loss/aux_loss": 0.04807947650551796, "loss/crossentropy": 2.6629399359226227, "loss/logits": 0.8340989917516708, "step": 50160 }, { "epoch": 0.5017, "grad_norm": 14.1875, "grad_norm_var": 0.3348307291666667, "learning_rate": 0.0003, "loss": 10.9673, "loss/aux_loss": 0.048072699643671515, "loss/crossentropy": 2.620541423559189, "loss/logits": 0.7972109645605088, "step": 50170 }, { "epoch": 0.5018, "grad_norm": 14.9375, "grad_norm_var": 0.36451822916666665, "learning_rate": 0.0003, "loss": 11.0375, "loss/aux_loss": 0.04806852545589209, "loss/crossentropy": 2.6545013010501863, "loss/logits": 0.7923622548580169, "step": 50180 }, { "epoch": 0.5019, "grad_norm": 13.8125, "grad_norm_var": 0.321875, "learning_rate": 0.0003, "loss": 11.1828, "loss/aux_loss": 0.048075834102928636, "loss/crossentropy": 2.8539741396903993, "loss/logits": 0.8799011826515197, "step": 50190 }, { "epoch": 0.502, "grad_norm": 13.875, "grad_norm_var": 0.4442545572916667, "learning_rate": 0.0003, "loss": 10.9104, "loss/aux_loss": 0.04806615300476551, "loss/crossentropy": 2.641600948572159, "loss/logits": 0.7879884839057922, "step": 50200 }, { "epoch": 0.5021, "grad_norm": 16.0, "grad_norm_var": 1.0354166666666667, "learning_rate": 0.0003, "loss": 10.984, "loss/aux_loss": 0.04808139093220234, "loss/crossentropy": 2.5606437027454376, "loss/logits": 0.8123639971017838, "step": 50210 }, { "epoch": 0.5022, "grad_norm": 13.75, "grad_norm_var": 0.7514973958333333, "learning_rate": 0.0003, "loss": 11.198, "loss/aux_loss": 0.0480662377551198, "loss/crossentropy": 2.697253167629242, "loss/logits": 0.8449769735336303, "step": 50220 }, { "epoch": 0.5023, "grad_norm": 16.375, "grad_norm_var": 0.793994140625, "learning_rate": 0.0003, "loss": 11.1197, "loss/aux_loss": 0.04807373005896807, "loss/crossentropy": 2.7797034323215484, "loss/logits": 0.8115016400814057, "step": 50230 }, { "epoch": 0.5024, "grad_norm": 14.5625, "grad_norm_var": 269.56243489583335, "learning_rate": 0.0003, "loss": 11.1053, "loss/aux_loss": 0.0480750922113657, "loss/crossentropy": 2.799087393283844, "loss/logits": 0.8153054699301719, "step": 50240 }, { "epoch": 0.5025, "grad_norm": 15.625, "grad_norm_var": 267.090625, "learning_rate": 0.0003, "loss": 11.0676, "loss/aux_loss": 0.04806772004812956, "loss/crossentropy": 2.7093150496482847, "loss/logits": 0.8238262414932251, "step": 50250 }, { "epoch": 0.5026, "grad_norm": 16.25, "grad_norm_var": 0.7735514322916667, "learning_rate": 0.0003, "loss": 11.1285, "loss/aux_loss": 0.04807775299996138, "loss/crossentropy": 2.690925532579422, "loss/logits": 0.8347889751195907, "step": 50260 }, { "epoch": 0.5027, "grad_norm": 16.5, "grad_norm_var": 0.8484212239583333, "learning_rate": 0.0003, "loss": 11.0363, "loss/aux_loss": 0.04806351810693741, "loss/crossentropy": 2.7472257018089294, "loss/logits": 0.8274175226688385, "step": 50270 }, { "epoch": 0.5028, "grad_norm": 18.375, "grad_norm_var": 233.536572265625, "learning_rate": 0.0003, "loss": 11.3035, "loss/aux_loss": 0.048083963245153426, "loss/crossentropy": 2.9165143728256226, "loss/logits": 0.8531142026185989, "step": 50280 }, { "epoch": 0.5029, "grad_norm": 13.875, "grad_norm_var": 228.95670572916666, "learning_rate": 0.0003, "loss": 11.0395, "loss/aux_loss": 0.048072627559304235, "loss/crossentropy": 2.9179752588272097, "loss/logits": 0.8547284364700317, "step": 50290 }, { "epoch": 0.503, "grad_norm": 14.1875, "grad_norm_var": 1.7687337239583334, "learning_rate": 0.0003, "loss": 11.0327, "loss/aux_loss": 0.048072236590087414, "loss/crossentropy": 2.6800991177558897, "loss/logits": 0.8386821538209915, "step": 50300 }, { "epoch": 0.5031, "grad_norm": 14.4375, "grad_norm_var": 0.724462890625, "learning_rate": 0.0003, "loss": 11.1287, "loss/aux_loss": 0.04807893894612789, "loss/crossentropy": 2.683143067359924, "loss/logits": 0.8309021919965744, "step": 50310 }, { "epoch": 0.5032, "grad_norm": 13.8125, "grad_norm_var": 0.3902180989583333, "learning_rate": 0.0003, "loss": 10.955, "loss/aux_loss": 0.048069473914802076, "loss/crossentropy": 2.6512105405330657, "loss/logits": 0.8183946311473846, "step": 50320 }, { "epoch": 0.5033, "grad_norm": 14.8125, "grad_norm_var": 150.9041015625, "learning_rate": 0.0003, "loss": 11.2423, "loss/aux_loss": 0.04808805175125599, "loss/crossentropy": 2.5937359273433684, "loss/logits": 0.8347382307052612, "step": 50330 }, { "epoch": 0.5034, "grad_norm": 13.9375, "grad_norm_var": 0.5128743489583333, "learning_rate": 0.0003, "loss": 11.0916, "loss/aux_loss": 0.04806785080581903, "loss/crossentropy": 2.7667890906333925, "loss/logits": 0.8234162241220474, "step": 50340 }, { "epoch": 0.5035, "grad_norm": 13.6875, "grad_norm_var": 0.48020833333333335, "learning_rate": 0.0003, "loss": 10.9453, "loss/aux_loss": 0.04806566257029772, "loss/crossentropy": 2.5693565726280214, "loss/logits": 0.8084887236356735, "step": 50350 }, { "epoch": 0.5036, "grad_norm": 14.9375, "grad_norm_var": 1.7186848958333334, "learning_rate": 0.0003, "loss": 11.1153, "loss/aux_loss": 0.04807969201356173, "loss/crossentropy": 2.739818775653839, "loss/logits": 0.8364595293998718, "step": 50360 }, { "epoch": 0.5037, "grad_norm": 14.8125, "grad_norm_var": 1.5858723958333334, "learning_rate": 0.0003, "loss": 11.1912, "loss/aux_loss": 0.04808463733643294, "loss/crossentropy": 2.6995759308338165, "loss/logits": 0.8050726383924485, "step": 50370 }, { "epoch": 0.5038, "grad_norm": 13.8125, "grad_norm_var": 0.5778645833333333, "learning_rate": 0.0003, "loss": 10.9566, "loss/aux_loss": 0.048061018250882624, "loss/crossentropy": 2.6875993072986604, "loss/logits": 0.8345743596553803, "step": 50380 }, { "epoch": 0.5039, "grad_norm": 13.9375, "grad_norm_var": 0.45362955729166665, "learning_rate": 0.0003, "loss": 11.041, "loss/aux_loss": 0.04808121174573898, "loss/crossentropy": 2.851317548751831, "loss/logits": 0.8381609439849853, "step": 50390 }, { "epoch": 0.504, "grad_norm": 14.9375, "grad_norm_var": 0.32317708333333334, "learning_rate": 0.0003, "loss": 10.9708, "loss/aux_loss": 0.04808126352727413, "loss/crossentropy": 2.739950382709503, "loss/logits": 0.8330327928066253, "step": 50400 }, { "epoch": 0.5041, "grad_norm": 14.625, "grad_norm_var": 0.2955729166666667, "learning_rate": 0.0003, "loss": 11.0153, "loss/aux_loss": 0.04806138556450605, "loss/crossentropy": 2.7034616589546205, "loss/logits": 0.851497569680214, "step": 50410 }, { "epoch": 0.5042, "grad_norm": 15.3125, "grad_norm_var": 0.2337890625, "learning_rate": 0.0003, "loss": 11.1483, "loss/aux_loss": 0.04807694610208273, "loss/crossentropy": 2.784847009181976, "loss/logits": 0.8578290939331055, "step": 50420 }, { "epoch": 0.5043, "grad_norm": 14.125, "grad_norm_var": 0.3153483072916667, "learning_rate": 0.0003, "loss": 10.8493, "loss/aux_loss": 0.048078867606818676, "loss/crossentropy": 2.6146656930446626, "loss/logits": 0.7973528385162354, "step": 50430 }, { "epoch": 0.5044, "grad_norm": 14.75, "grad_norm_var": 0.341650390625, "learning_rate": 0.0003, "loss": 11.0265, "loss/aux_loss": 0.048073571361601355, "loss/crossentropy": 2.8554004311561583, "loss/logits": 0.8406151056289672, "step": 50440 }, { "epoch": 0.5045, "grad_norm": 14.0625, "grad_norm_var": 0.4567057291666667, "learning_rate": 0.0003, "loss": 10.9926, "loss/aux_loss": 0.048083682730793956, "loss/crossentropy": 2.713830453157425, "loss/logits": 0.8488957345485687, "step": 50450 }, { "epoch": 0.5046, "grad_norm": 13.0625, "grad_norm_var": 1.5556640625, "learning_rate": 0.0003, "loss": 11.11, "loss/aux_loss": 0.048066365718841556, "loss/crossentropy": 2.902998661994934, "loss/logits": 0.8279580295085907, "step": 50460 }, { "epoch": 0.5047, "grad_norm": 15.5625, "grad_norm_var": 0.7299479166666667, "learning_rate": 0.0003, "loss": 10.9543, "loss/aux_loss": 0.04807742275297642, "loss/crossentropy": 2.7396446764469147, "loss/logits": 0.8536162942647934, "step": 50470 }, { "epoch": 0.5048, "grad_norm": 13.625, "grad_norm_var": 0.7792805989583333, "learning_rate": 0.0003, "loss": 10.9662, "loss/aux_loss": 0.048072476498782636, "loss/crossentropy": 2.8252785921096804, "loss/logits": 0.869564825296402, "step": 50480 }, { "epoch": 0.5049, "grad_norm": 14.25, "grad_norm_var": 0.6042805989583333, "learning_rate": 0.0003, "loss": 11.0867, "loss/aux_loss": 0.04807360861450434, "loss/crossentropy": 2.7099923491477966, "loss/logits": 0.8268058747053146, "step": 50490 }, { "epoch": 0.505, "grad_norm": 14.5625, "grad_norm_var": 0.49138997395833334, "learning_rate": 0.0003, "loss": 10.9612, "loss/aux_loss": 0.04806963559240103, "loss/crossentropy": 2.7237312316894533, "loss/logits": 0.8322340279817582, "step": 50500 }, { "epoch": 0.5051, "grad_norm": 16.75, "grad_norm_var": 0.5423014322916667, "learning_rate": 0.0003, "loss": 11.0486, "loss/aux_loss": 0.04807584658265114, "loss/crossentropy": 2.7208118796348573, "loss/logits": 0.8030982494354248, "step": 50510 }, { "epoch": 0.5052, "grad_norm": 15.6875, "grad_norm_var": 0.9128743489583333, "learning_rate": 0.0003, "loss": 11.1668, "loss/aux_loss": 0.04807027783244848, "loss/crossentropy": 2.654416823387146, "loss/logits": 0.8475210994482041, "step": 50520 }, { "epoch": 0.5053, "grad_norm": 14.5625, "grad_norm_var": 15.680143229166667, "learning_rate": 0.0003, "loss": 10.9435, "loss/aux_loss": 0.04807609729468822, "loss/crossentropy": 2.5730921030044556, "loss/logits": 0.838240772485733, "step": 50530 }, { "epoch": 0.5054, "grad_norm": 13.9375, "grad_norm_var": 15.242171223958334, "learning_rate": 0.0003, "loss": 11.1737, "loss/aux_loss": 0.04807768948376179, "loss/crossentropy": 2.771801221370697, "loss/logits": 0.8654070168733596, "step": 50540 }, { "epoch": 0.5055, "grad_norm": 15.1875, "grad_norm_var": 0.717822265625, "learning_rate": 0.0003, "loss": 11.2251, "loss/aux_loss": 0.048080765083432196, "loss/crossentropy": 2.7945044159889223, "loss/logits": 0.838275796175003, "step": 50550 }, { "epoch": 0.5056, "grad_norm": 13.9375, "grad_norm_var": 0.4163899739583333, "learning_rate": 0.0003, "loss": 11.2706, "loss/aux_loss": 0.048064058646559715, "loss/crossentropy": 2.9519375801086425, "loss/logits": 0.8616402268409729, "step": 50560 }, { "epoch": 0.5057, "grad_norm": 14.875, "grad_norm_var": 0.3119140625, "learning_rate": 0.0003, "loss": 11.0596, "loss/aux_loss": 0.04807540029287338, "loss/crossentropy": 2.5487895905971527, "loss/logits": 0.7579917728900909, "step": 50570 }, { "epoch": 0.5058, "grad_norm": 15.375, "grad_norm_var": 0.15857747395833333, "learning_rate": 0.0003, "loss": 11.0143, "loss/aux_loss": 0.04808062519878149, "loss/crossentropy": 2.639469766616821, "loss/logits": 0.8144773453474045, "step": 50580 }, { "epoch": 0.5059, "grad_norm": 13.5625, "grad_norm_var": 0.40358072916666665, "learning_rate": 0.0003, "loss": 11.0849, "loss/aux_loss": 0.048076164163649085, "loss/crossentropy": 2.898840081691742, "loss/logits": 0.8714124709367752, "step": 50590 }, { "epoch": 0.506, "grad_norm": 15.9375, "grad_norm_var": 0.6910807291666666, "learning_rate": 0.0003, "loss": 11.1215, "loss/aux_loss": 0.048068304732441905, "loss/crossentropy": 2.8098879933357237, "loss/logits": 0.8603219360113143, "step": 50600 }, { "epoch": 0.5061, "grad_norm": 14.6875, "grad_norm_var": 0.397119140625, "learning_rate": 0.0003, "loss": 10.9947, "loss/aux_loss": 0.04808787330985069, "loss/crossentropy": 2.6829119682312013, "loss/logits": 0.8325252383947372, "step": 50610 }, { "epoch": 0.5062, "grad_norm": 14.9375, "grad_norm_var": 0.3572916666666667, "learning_rate": 0.0003, "loss": 11.0705, "loss/aux_loss": 0.04808214660733938, "loss/crossentropy": 2.7047139048576354, "loss/logits": 0.8234624296426774, "step": 50620 }, { "epoch": 0.5063, "grad_norm": 14.125, "grad_norm_var": 0.484375, "learning_rate": 0.0003, "loss": 10.9123, "loss/aux_loss": 0.04805761631578207, "loss/crossentropy": 2.766616940498352, "loss/logits": 0.8159997165203094, "step": 50630 }, { "epoch": 0.5064, "grad_norm": 14.8125, "grad_norm_var": 0.30857747395833335, "learning_rate": 0.0003, "loss": 11.0885, "loss/aux_loss": 0.048085050843656066, "loss/crossentropy": 2.773435640335083, "loss/logits": 0.8345916509628296, "step": 50640 }, { "epoch": 0.5065, "grad_norm": 14.625, "grad_norm_var": 0.511181640625, "learning_rate": 0.0003, "loss": 11.0337, "loss/aux_loss": 0.048079907149076465, "loss/crossentropy": 2.7371358036994935, "loss/logits": 0.8105407744646073, "step": 50650 }, { "epoch": 0.5066, "grad_norm": 14.4375, "grad_norm_var": 0.6534993489583333, "learning_rate": 0.0003, "loss": 10.9133, "loss/aux_loss": 0.048066824488341806, "loss/crossentropy": 2.6571763515472413, "loss/logits": 0.800066152215004, "step": 50660 }, { "epoch": 0.5067, "grad_norm": 14.875, "grad_norm_var": 0.4369140625, "learning_rate": 0.0003, "loss": 11.2353, "loss/aux_loss": 0.0480772802606225, "loss/crossentropy": 2.7930760741233827, "loss/logits": 0.8640264600515366, "step": 50670 }, { "epoch": 0.5068, "grad_norm": 14.5625, "grad_norm_var": 0.313916015625, "learning_rate": 0.0003, "loss": 11.1041, "loss/aux_loss": 0.04806650523096323, "loss/crossentropy": 2.7191444516181944, "loss/logits": 0.8551998734474182, "step": 50680 }, { "epoch": 0.5069, "grad_norm": 16.0, "grad_norm_var": 0.5488118489583333, "learning_rate": 0.0003, "loss": 11.088, "loss/aux_loss": 0.04807362388819456, "loss/crossentropy": 2.662330609560013, "loss/logits": 0.8373664259910584, "step": 50690 }, { "epoch": 0.507, "grad_norm": 17.0, "grad_norm_var": 0.87109375, "learning_rate": 0.0003, "loss": 10.8954, "loss/aux_loss": 0.04806805476546287, "loss/crossentropy": 2.762675553560257, "loss/logits": 0.8527992933988571, "step": 50700 }, { "epoch": 0.5071, "grad_norm": 14.1875, "grad_norm_var": 0.7587890625, "learning_rate": 0.0003, "loss": 11.0398, "loss/aux_loss": 0.04807548895478249, "loss/crossentropy": 2.6396145045757295, "loss/logits": 0.8134666383266449, "step": 50710 }, { "epoch": 0.5072, "grad_norm": 14.6875, "grad_norm_var": 0.17805989583333334, "learning_rate": 0.0003, "loss": 10.959, "loss/aux_loss": 0.04807210974395275, "loss/crossentropy": 2.8681382477283477, "loss/logits": 0.8086955964565277, "step": 50720 }, { "epoch": 0.5073, "grad_norm": 15.5625, "grad_norm_var": 0.2509765625, "learning_rate": 0.0003, "loss": 11.1709, "loss/aux_loss": 0.04807695783674717, "loss/crossentropy": 2.681786209344864, "loss/logits": 0.820175650715828, "step": 50730 }, { "epoch": 0.5074, "grad_norm": 15.375, "grad_norm_var": 0.4979166666666667, "learning_rate": 0.0003, "loss": 10.9011, "loss/aux_loss": 0.0480791661888361, "loss/crossentropy": 2.6313997209072113, "loss/logits": 0.8222554922103882, "step": 50740 }, { "epoch": 0.5075, "grad_norm": 14.375, "grad_norm_var": 0.42303059895833334, "learning_rate": 0.0003, "loss": 11.059, "loss/aux_loss": 0.048068783991038797, "loss/crossentropy": 2.8632388710975647, "loss/logits": 0.8794094920158386, "step": 50750 }, { "epoch": 0.5076, "grad_norm": 16.0, "grad_norm_var": 2.855712890625, "learning_rate": 0.0003, "loss": 10.9921, "loss/aux_loss": 0.04807401727885008, "loss/crossentropy": 2.740087425708771, "loss/logits": 0.8258580267429352, "step": 50760 }, { "epoch": 0.5077, "grad_norm": 14.0, "grad_norm_var": 3.0181640625, "learning_rate": 0.0003, "loss": 11.2036, "loss/aux_loss": 0.04808244872838259, "loss/crossentropy": 2.6240702331066132, "loss/logits": 0.8118566811084748, "step": 50770 }, { "epoch": 0.5078, "grad_norm": 15.5625, "grad_norm_var": 0.7315104166666667, "learning_rate": 0.0003, "loss": 11.0808, "loss/aux_loss": 0.04807658027857542, "loss/crossentropy": 2.756423282623291, "loss/logits": 0.8248686224222184, "step": 50780 }, { "epoch": 0.5079, "grad_norm": 16.5, "grad_norm_var": 0.6945149739583333, "learning_rate": 0.0003, "loss": 10.9433, "loss/aux_loss": 0.04807685222476721, "loss/crossentropy": 2.8291961908340455, "loss/logits": 0.8037528693675995, "step": 50790 }, { "epoch": 0.508, "grad_norm": 15.1875, "grad_norm_var": 0.467431640625, "learning_rate": 0.0003, "loss": 11.0704, "loss/aux_loss": 0.04805314373224974, "loss/crossentropy": 2.850136566162109, "loss/logits": 0.8551195234060287, "step": 50800 }, { "epoch": 0.5081, "grad_norm": 15.75, "grad_norm_var": 0.4166666666666667, "learning_rate": 0.0003, "loss": 11.0691, "loss/aux_loss": 0.04807635135948658, "loss/crossentropy": 2.7100765228271486, "loss/logits": 0.8092273443937301, "step": 50810 }, { "epoch": 0.5082, "grad_norm": 15.8125, "grad_norm_var": 0.768603515625, "learning_rate": 0.0003, "loss": 11.1062, "loss/aux_loss": 0.04807732086628676, "loss/crossentropy": 2.6739711463451385, "loss/logits": 0.8558676153421402, "step": 50820 }, { "epoch": 0.5083, "grad_norm": 15.0, "grad_norm_var": 0.3651041666666667, "learning_rate": 0.0003, "loss": 11.1696, "loss/aux_loss": 0.048065226152539255, "loss/crossentropy": 2.7774511337280274, "loss/logits": 0.8358054220676422, "step": 50830 }, { "epoch": 0.5084, "grad_norm": 14.5, "grad_norm_var": 1.025244140625, "learning_rate": 0.0003, "loss": 11.0029, "loss/aux_loss": 0.04808473084121943, "loss/crossentropy": 2.6720672845840454, "loss/logits": 0.8290561676025391, "step": 50840 }, { "epoch": 0.5085, "grad_norm": 15.75, "grad_norm_var": 0.6338541666666667, "learning_rate": 0.0003, "loss": 11.2649, "loss/aux_loss": 0.04807058796286583, "loss/crossentropy": 2.7967530369758604, "loss/logits": 0.8311424374580383, "step": 50850 }, { "epoch": 0.5086, "grad_norm": 14.875, "grad_norm_var": 0.503125, "learning_rate": 0.0003, "loss": 11.1975, "loss/aux_loss": 0.048081024549901485, "loss/crossentropy": 2.813568663597107, "loss/logits": 0.858703076839447, "step": 50860 }, { "epoch": 0.5087, "grad_norm": 14.5625, "grad_norm_var": 0.4905598958333333, "learning_rate": 0.0003, "loss": 10.9619, "loss/aux_loss": 0.04806181099265814, "loss/crossentropy": 2.6827987372875213, "loss/logits": 0.8177167236804962, "step": 50870 }, { "epoch": 0.5088, "grad_norm": 13.9375, "grad_norm_var": 0.7285807291666667, "learning_rate": 0.0003, "loss": 11.0417, "loss/aux_loss": 0.04808285180479288, "loss/crossentropy": 2.5345280170440674, "loss/logits": 0.7993739306926727, "step": 50880 }, { "epoch": 0.5089, "grad_norm": 14.625, "grad_norm_var": 0.7882649739583333, "learning_rate": 0.0003, "loss": 11.1877, "loss/aux_loss": 0.04806562829762697, "loss/crossentropy": 2.7045423090457916, "loss/logits": 0.8371960252523423, "step": 50890 }, { "epoch": 0.509, "grad_norm": 14.5, "grad_norm_var": 0.4561848958333333, "learning_rate": 0.0003, "loss": 11.0157, "loss/aux_loss": 0.048074452206492424, "loss/crossentropy": 2.830974745750427, "loss/logits": 0.8685533732175827, "step": 50900 }, { "epoch": 0.5091, "grad_norm": 13.4375, "grad_norm_var": 0.30441080729166664, "learning_rate": 0.0003, "loss": 11.0398, "loss/aux_loss": 0.04807122685015201, "loss/crossentropy": 2.736586630344391, "loss/logits": 0.8220911502838135, "step": 50910 }, { "epoch": 0.5092, "grad_norm": 14.125, "grad_norm_var": 0.612353515625, "learning_rate": 0.0003, "loss": 10.9015, "loss/aux_loss": 0.04807872846722603, "loss/crossentropy": 2.7401693642139433, "loss/logits": 0.8360977441072464, "step": 50920 }, { "epoch": 0.5093, "grad_norm": 16.25, "grad_norm_var": 0.44698893229166664, "learning_rate": 0.0003, "loss": 10.9974, "loss/aux_loss": 0.04807762745767832, "loss/crossentropy": 2.7443562030792235, "loss/logits": 0.7941692680120468, "step": 50930 }, { "epoch": 0.5094, "grad_norm": 15.625, "grad_norm_var": 0.503369140625, "learning_rate": 0.0003, "loss": 11.2314, "loss/aux_loss": 0.048069654405117034, "loss/crossentropy": 2.642306762933731, "loss/logits": 0.8203934520483017, "step": 50940 }, { "epoch": 0.5095, "grad_norm": 14.625, "grad_norm_var": 0.38697916666666665, "learning_rate": 0.0003, "loss": 11.1083, "loss/aux_loss": 0.0480804480612278, "loss/crossentropy": 2.747548055648804, "loss/logits": 0.8185478031635285, "step": 50950 }, { "epoch": 0.5096, "grad_norm": 16.25, "grad_norm_var": 0.4195149739583333, "learning_rate": 0.0003, "loss": 10.9981, "loss/aux_loss": 0.048063276521861556, "loss/crossentropy": 2.8265784859657286, "loss/logits": 0.8263007819652557, "step": 50960 }, { "epoch": 0.5097, "grad_norm": 14.25, "grad_norm_var": 0.6395182291666667, "learning_rate": 0.0003, "loss": 11.0577, "loss/aux_loss": 0.048073398880660534, "loss/crossentropy": 2.760447859764099, "loss/logits": 0.8406284034252167, "step": 50970 }, { "epoch": 0.5098, "grad_norm": 13.375, "grad_norm_var": 0.5361979166666667, "learning_rate": 0.0003, "loss": 11.0733, "loss/aux_loss": 0.04807597603648901, "loss/crossentropy": 2.785585403442383, "loss/logits": 0.8304526567459106, "step": 50980 }, { "epoch": 0.5099, "grad_norm": 13.1875, "grad_norm_var": 0.4671223958333333, "learning_rate": 0.0003, "loss": 11.1251, "loss/aux_loss": 0.0480725534260273, "loss/crossentropy": 2.7471178472042084, "loss/logits": 0.8607824087142945, "step": 50990 }, { "epoch": 0.51, "grad_norm": 14.75, "grad_norm_var": 0.396875, "learning_rate": 0.0003, "loss": 11.0672, "loss/aux_loss": 0.04808218758553266, "loss/crossentropy": 2.7346564173698424, "loss/logits": 0.8275700658559799, "step": 51000 }, { "epoch": 0.5101, "grad_norm": 14.875, "grad_norm_var": 0.26015625, "learning_rate": 0.0003, "loss": 11.1911, "loss/aux_loss": 0.048065698333084585, "loss/crossentropy": 2.800529360771179, "loss/logits": 0.8583203822374343, "step": 51010 }, { "epoch": 0.5102, "grad_norm": 15.4375, "grad_norm_var": 0.20545247395833333, "learning_rate": 0.0003, "loss": 11.1362, "loss/aux_loss": 0.04808482229709625, "loss/crossentropy": 2.708675539493561, "loss/logits": 0.8227375984191895, "step": 51020 }, { "epoch": 0.5103, "grad_norm": 13.5, "grad_norm_var": 0.45514322916666666, "learning_rate": 0.0003, "loss": 10.9633, "loss/aux_loss": 0.04807134531438351, "loss/crossentropy": 2.5863205909729006, "loss/logits": 0.833231994509697, "step": 51030 }, { "epoch": 0.5104, "grad_norm": 15.625, "grad_norm_var": 0.8641764322916666, "learning_rate": 0.0003, "loss": 10.992, "loss/aux_loss": 0.04806938972324133, "loss/crossentropy": 2.5987710535526274, "loss/logits": 0.8221124142408371, "step": 51040 }, { "epoch": 0.5105, "grad_norm": 15.4375, "grad_norm_var": 2.3384765625, "learning_rate": 0.0003, "loss": 11.1323, "loss/aux_loss": 0.04807955361902714, "loss/crossentropy": 2.7755866408348084, "loss/logits": 0.8378236562013626, "step": 51050 }, { "epoch": 0.5106, "grad_norm": 14.9375, "grad_norm_var": 0.4400390625, "learning_rate": 0.0003, "loss": 11.0057, "loss/aux_loss": 0.048068666271865367, "loss/crossentropy": 2.6365352988243105, "loss/logits": 0.8126265555620193, "step": 51060 }, { "epoch": 0.5107, "grad_norm": 14.4375, "grad_norm_var": 0.4025390625, "learning_rate": 0.0003, "loss": 11.1041, "loss/aux_loss": 0.0480704678222537, "loss/crossentropy": 2.6664901852607725, "loss/logits": 0.826370707154274, "step": 51070 }, { "epoch": 0.5108, "grad_norm": 14.625, "grad_norm_var": 0.30911458333333336, "learning_rate": 0.0003, "loss": 11.0334, "loss/aux_loss": 0.048087571002542975, "loss/crossentropy": 2.511679470539093, "loss/logits": 0.8164161443710327, "step": 51080 }, { "epoch": 0.5109, "grad_norm": 15.3125, "grad_norm_var": 0.208837890625, "learning_rate": 0.0003, "loss": 11.0154, "loss/aux_loss": 0.04806430675089359, "loss/crossentropy": 2.721172201633453, "loss/logits": 0.7954764574766159, "step": 51090 }, { "epoch": 0.511, "grad_norm": 12.9375, "grad_norm_var": 0.5202962239583333, "learning_rate": 0.0003, "loss": 11.0233, "loss/aux_loss": 0.04807245638221502, "loss/crossentropy": 2.619920516014099, "loss/logits": 0.8742161899805069, "step": 51100 }, { "epoch": 0.5111, "grad_norm": 14.375, "grad_norm_var": 75.28722330729167, "learning_rate": 0.0003, "loss": 11.1598, "loss/aux_loss": 0.04808569718152285, "loss/crossentropy": 2.64174947142601, "loss/logits": 0.7941539883613586, "step": 51110 }, { "epoch": 0.5112, "grad_norm": 14.75, "grad_norm_var": 2.0212890625, "learning_rate": 0.0003, "loss": 10.969, "loss/aux_loss": 0.04807308297604322, "loss/crossentropy": 2.7143703937530517, "loss/logits": 0.8080418884754181, "step": 51120 }, { "epoch": 0.5113, "grad_norm": 15.75, "grad_norm_var": 0.504541015625, "learning_rate": 0.0003, "loss": 11.1646, "loss/aux_loss": 0.0480702068656683, "loss/crossentropy": 2.8439256310462953, "loss/logits": 0.849370151758194, "step": 51130 }, { "epoch": 0.5114, "grad_norm": 14.1875, "grad_norm_var": 0.309228515625, "learning_rate": 0.0003, "loss": 11.2208, "loss/aux_loss": 0.048074052482843396, "loss/crossentropy": 2.691173183917999, "loss/logits": 0.8135675758123397, "step": 51140 }, { "epoch": 0.5115, "grad_norm": 14.625, "grad_norm_var": 1.3165201822916666, "learning_rate": 0.0003, "loss": 11.055, "loss/aux_loss": 0.04807727038860321, "loss/crossentropy": 2.8615395545959474, "loss/logits": 0.833811953663826, "step": 51150 }, { "epoch": 0.5116, "grad_norm": 14.3125, "grad_norm_var": 1.818994140625, "learning_rate": 0.0003, "loss": 11.23, "loss/aux_loss": 0.04806257952004671, "loss/crossentropy": 2.637695002555847, "loss/logits": 0.8264784872531891, "step": 51160 }, { "epoch": 0.5117, "grad_norm": 15.9375, "grad_norm_var": 1.2104166666666667, "learning_rate": 0.0003, "loss": 11.1183, "loss/aux_loss": 0.0480752969160676, "loss/crossentropy": 2.65714670419693, "loss/logits": 0.8472563087940216, "step": 51170 }, { "epoch": 0.5118, "grad_norm": 14.375, "grad_norm_var": 0.372900390625, "learning_rate": 0.0003, "loss": 11.0255, "loss/aux_loss": 0.04808554705232382, "loss/crossentropy": 2.6965773463249207, "loss/logits": 0.7938098013401031, "step": 51180 }, { "epoch": 0.5119, "grad_norm": 14.375, "grad_norm_var": 0.45826822916666665, "learning_rate": 0.0003, "loss": 11.0631, "loss/aux_loss": 0.04806725028902292, "loss/crossentropy": 2.699047327041626, "loss/logits": 0.8511703968048095, "step": 51190 }, { "epoch": 0.512, "grad_norm": 20.0, "grad_norm_var": 2.3203125, "learning_rate": 0.0003, "loss": 11.016, "loss/aux_loss": 0.04806656241416931, "loss/crossentropy": 2.580675709247589, "loss/logits": 0.7733906388282776, "step": 51200 }, { "epoch": 0.5121, "grad_norm": 15.3125, "grad_norm_var": 2.1278483072916665, "learning_rate": 0.0003, "loss": 11.2027, "loss/aux_loss": 0.04808239191770554, "loss/crossentropy": 2.8701157569885254, "loss/logits": 0.8477713167667389, "step": 51210 }, { "epoch": 0.5122, "grad_norm": 14.25, "grad_norm_var": 0.595166015625, "learning_rate": 0.0003, "loss": 11.1915, "loss/aux_loss": 0.04806038942188025, "loss/crossentropy": 2.67539005279541, "loss/logits": 0.8615487456321717, "step": 51220 }, { "epoch": 0.5123, "grad_norm": 14.3125, "grad_norm_var": 1.3512858072916667, "learning_rate": 0.0003, "loss": 10.9781, "loss/aux_loss": 0.04807022716850042, "loss/crossentropy": 2.6567323327064516, "loss/logits": 0.8351798057556152, "step": 51230 }, { "epoch": 0.5124, "grad_norm": 15.25, "grad_norm_var": 0.6555826822916667, "learning_rate": 0.0003, "loss": 11.0322, "loss/aux_loss": 0.04807539042085409, "loss/crossentropy": 2.5088176906108854, "loss/logits": 0.8250403732061387, "step": 51240 }, { "epoch": 0.5125, "grad_norm": 15.5, "grad_norm_var": 0.8641764322916666, "learning_rate": 0.0003, "loss": 10.8031, "loss/aux_loss": 0.0480835122987628, "loss/crossentropy": 2.4206930220127107, "loss/logits": 0.7768561899662018, "step": 51250 }, { "epoch": 0.5126, "grad_norm": 14.1875, "grad_norm_var": 0.865087890625, "learning_rate": 0.0003, "loss": 11.0423, "loss/aux_loss": 0.04806485194712877, "loss/crossentropy": 2.7578662991523744, "loss/logits": 0.8342026203870774, "step": 51260 }, { "epoch": 0.5127, "grad_norm": 15.0625, "grad_norm_var": 0.5695149739583333, "learning_rate": 0.0003, "loss": 11.0093, "loss/aux_loss": 0.04807938933372498, "loss/crossentropy": 2.6144285678863524, "loss/logits": 0.8025279492139816, "step": 51270 }, { "epoch": 0.5128, "grad_norm": 14.375, "grad_norm_var": 0.41901041666666666, "learning_rate": 0.0003, "loss": 11.0911, "loss/aux_loss": 0.04807790834456682, "loss/crossentropy": 2.751217710971832, "loss/logits": 0.8257042407989502, "step": 51280 }, { "epoch": 0.5129, "grad_norm": 16.875, "grad_norm_var": 0.5968098958333333, "learning_rate": 0.0003, "loss": 11.0336, "loss/aux_loss": 0.04806835390627384, "loss/crossentropy": 2.7564366936683653, "loss/logits": 0.8366163045167923, "step": 51290 }, { "epoch": 0.513, "grad_norm": 15.9375, "grad_norm_var": 1.1509765625, "learning_rate": 0.0003, "loss": 11.0734, "loss/aux_loss": 0.04808133132755756, "loss/crossentropy": 2.557454949617386, "loss/logits": 0.8112009972333908, "step": 51300 }, { "epoch": 0.5131, "grad_norm": 14.625, "grad_norm_var": 0.8239583333333333, "learning_rate": 0.0003, "loss": 11.1618, "loss/aux_loss": 0.04807974435389042, "loss/crossentropy": 2.628247785568237, "loss/logits": 0.820238995552063, "step": 51310 }, { "epoch": 0.5132, "grad_norm": 15.1875, "grad_norm_var": 0.7140462239583333, "learning_rate": 0.0003, "loss": 11.0499, "loss/aux_loss": 0.048069828934967515, "loss/crossentropy": 2.68018000125885, "loss/logits": 0.8339303702116012, "step": 51320 }, { "epoch": 0.5133, "grad_norm": 13.75, "grad_norm_var": 0.470947265625, "learning_rate": 0.0003, "loss": 11.0399, "loss/aux_loss": 0.04807551633566618, "loss/crossentropy": 2.8822931230068205, "loss/logits": 0.8533807754516601, "step": 51330 }, { "epoch": 0.5134, "grad_norm": 14.8125, "grad_norm_var": 0.5947916666666667, "learning_rate": 0.0003, "loss": 11.0277, "loss/aux_loss": 0.04806902166455984, "loss/crossentropy": 2.6886990547180174, "loss/logits": 0.8332021862268448, "step": 51340 }, { "epoch": 0.5135, "grad_norm": 14.1875, "grad_norm_var": 0.4669108072916667, "learning_rate": 0.0003, "loss": 11.0988, "loss/aux_loss": 0.04808459766209126, "loss/crossentropy": 2.685576003789902, "loss/logits": 0.8132620543241501, "step": 51350 }, { "epoch": 0.5136, "grad_norm": 14.5625, "grad_norm_var": 0.240478515625, "learning_rate": 0.0003, "loss": 11.1777, "loss/aux_loss": 0.04807017575949431, "loss/crossentropy": 2.7257566928863524, "loss/logits": 0.8060571432113648, "step": 51360 }, { "epoch": 0.5137, "grad_norm": 16.75, "grad_norm_var": 0.6994140625, "learning_rate": 0.0003, "loss": 10.858, "loss/aux_loss": 0.0480698412284255, "loss/crossentropy": 2.5365270376205444, "loss/logits": 0.816512593626976, "step": 51370 }, { "epoch": 0.5138, "grad_norm": 14.5, "grad_norm_var": 0.754541015625, "learning_rate": 0.0003, "loss": 11.1839, "loss/aux_loss": 0.0480722613632679, "loss/crossentropy": 2.6611290633678437, "loss/logits": 0.8183812767267227, "step": 51380 }, { "epoch": 0.5139, "grad_norm": 14.5625, "grad_norm_var": 1.1984375, "learning_rate": 0.0003, "loss": 10.9507, "loss/aux_loss": 0.04807326439768076, "loss/crossentropy": 2.77123561501503, "loss/logits": 0.8091616094112396, "step": 51390 }, { "epoch": 0.514, "grad_norm": 14.5, "grad_norm_var": 49.47433268229167, "learning_rate": 0.0003, "loss": 11.1145, "loss/aux_loss": 0.04806629903614521, "loss/crossentropy": 2.870545446872711, "loss/logits": 0.8608437448740005, "step": 51400 }, { "epoch": 0.5141, "grad_norm": 14.75, "grad_norm_var": 48.878580729166664, "learning_rate": 0.0003, "loss": 10.9984, "loss/aux_loss": 0.04807921946048736, "loss/crossentropy": 2.808860683441162, "loss/logits": 0.8428457826375961, "step": 51410 }, { "epoch": 0.5142, "grad_norm": 15.0, "grad_norm_var": 0.7305826822916667, "learning_rate": 0.0003, "loss": 11.1773, "loss/aux_loss": 0.04806988965719938, "loss/crossentropy": 2.8295932352542876, "loss/logits": 0.8579282373189926, "step": 51420 }, { "epoch": 0.5143, "grad_norm": 15.0625, "grad_norm_var": 0.3619140625, "learning_rate": 0.0003, "loss": 11.0726, "loss/aux_loss": 0.048077551648020744, "loss/crossentropy": 2.625492978096008, "loss/logits": 0.8137675523757935, "step": 51430 }, { "epoch": 0.5144, "grad_norm": 15.1875, "grad_norm_var": 0.2757649739583333, "learning_rate": 0.0003, "loss": 11.1048, "loss/aux_loss": 0.0480729004368186, "loss/crossentropy": 2.861330282688141, "loss/logits": 0.8448628783226013, "step": 51440 }, { "epoch": 0.5145, "grad_norm": 14.8125, "grad_norm_var": 0.32472330729166665, "learning_rate": 0.0003, "loss": 11.0903, "loss/aux_loss": 0.04806978609412908, "loss/crossentropy": 2.701605361700058, "loss/logits": 0.7941394478082657, "step": 51450 }, { "epoch": 0.5146, "grad_norm": 15.3125, "grad_norm_var": 0.33670247395833336, "learning_rate": 0.0003, "loss": 11.028, "loss/aux_loss": 0.04808089081197977, "loss/crossentropy": 2.683489578962326, "loss/logits": 0.8333400577306748, "step": 51460 }, { "epoch": 0.5147, "grad_norm": 15.1875, "grad_norm_var": 0.9140462239583333, "learning_rate": 0.0003, "loss": 10.9866, "loss/aux_loss": 0.04807344228029251, "loss/crossentropy": 2.7665489315986633, "loss/logits": 0.8190008670091629, "step": 51470 }, { "epoch": 0.5148, "grad_norm": 13.8125, "grad_norm_var": 1.077978515625, "learning_rate": 0.0003, "loss": 10.8875, "loss/aux_loss": 0.04807010628283024, "loss/crossentropy": 2.58920761346817, "loss/logits": 0.8068946480751038, "step": 51480 }, { "epoch": 0.5149, "grad_norm": 15.5625, "grad_norm_var": 1.1469889322916667, "learning_rate": 0.0003, "loss": 11.0627, "loss/aux_loss": 0.04806904457509518, "loss/crossentropy": 2.661976617574692, "loss/logits": 0.8472563207149506, "step": 51490 }, { "epoch": 0.515, "grad_norm": 15.0625, "grad_norm_var": 1.404931640625, "learning_rate": 0.0003, "loss": 11.0402, "loss/aux_loss": 0.048074154369533065, "loss/crossentropy": 2.8123088240623475, "loss/logits": 0.8572196811437607, "step": 51500 }, { "epoch": 0.5151, "grad_norm": 13.875, "grad_norm_var": 1.3155598958333334, "learning_rate": 0.0003, "loss": 10.9436, "loss/aux_loss": 0.04807008523494005, "loss/crossentropy": 2.711523699760437, "loss/logits": 0.8486291140317916, "step": 51510 }, { "epoch": 0.5152, "grad_norm": 14.125, "grad_norm_var": 0.4384765625, "learning_rate": 0.0003, "loss": 11.0391, "loss/aux_loss": 0.04808166529983282, "loss/crossentropy": 2.766609239578247, "loss/logits": 0.8237248331308364, "step": 51520 }, { "epoch": 0.5153, "grad_norm": 15.8125, "grad_norm_var": 2.97265625, "learning_rate": 0.0003, "loss": 11.0567, "loss/aux_loss": 0.048059662245213984, "loss/crossentropy": 2.716182154417038, "loss/logits": 0.8255683243274688, "step": 51530 }, { "epoch": 0.5154, "grad_norm": 17.75, "grad_norm_var": 1760.974853515625, "learning_rate": 0.0003, "loss": 11.0691, "loss/aux_loss": 0.048087839223444465, "loss/crossentropy": 2.7150439620018005, "loss/logits": 0.8070930659770965, "step": 51540 }, { "epoch": 0.5155, "grad_norm": 15.625, "grad_norm_var": 8.13046875, "learning_rate": 0.0003, "loss": 11.0962, "loss/aux_loss": 0.048063835315406325, "loss/crossentropy": 2.81580011844635, "loss/logits": 0.8323242962360382, "step": 51550 }, { "epoch": 0.5156, "grad_norm": 15.0625, "grad_norm_var": 0.43331705729166664, "learning_rate": 0.0003, "loss": 11.0541, "loss/aux_loss": 0.04805823341012001, "loss/crossentropy": 2.722984766960144, "loss/logits": 0.8511014252901077, "step": 51560 }, { "epoch": 0.5157, "grad_norm": 15.1875, "grad_norm_var": 0.193603515625, "learning_rate": 0.0003, "loss": 11.0758, "loss/aux_loss": 0.04808205440640449, "loss/crossentropy": 2.716118276119232, "loss/logits": 0.8201917320489883, "step": 51570 }, { "epoch": 0.5158, "grad_norm": 13.9375, "grad_norm_var": 0.5306640625, "learning_rate": 0.0003, "loss": 11.2349, "loss/aux_loss": 0.048078343458473685, "loss/crossentropy": 2.661349093914032, "loss/logits": 0.8327278316020965, "step": 51580 }, { "epoch": 0.5159, "grad_norm": 15.3125, "grad_norm_var": 0.6602701822916667, "learning_rate": 0.0003, "loss": 10.9827, "loss/aux_loss": 0.04806844424456358, "loss/crossentropy": 2.665369528532028, "loss/logits": 0.8237587451934815, "step": 51590 }, { "epoch": 0.516, "grad_norm": 14.1875, "grad_norm_var": 0.8885416666666667, "learning_rate": 0.0003, "loss": 10.9778, "loss/aux_loss": 0.04806563388556242, "loss/crossentropy": 2.744618034362793, "loss/logits": 0.8178337156772614, "step": 51600 }, { "epoch": 0.5161, "grad_norm": 14.375, "grad_norm_var": 0.753759765625, "learning_rate": 0.0003, "loss": 10.9837, "loss/aux_loss": 0.04807935301214457, "loss/crossentropy": 2.6647940456867216, "loss/logits": 0.8174872279167176, "step": 51610 }, { "epoch": 0.5162, "grad_norm": 14.4375, "grad_norm_var": 0.8535807291666667, "learning_rate": 0.0003, "loss": 11.1282, "loss/aux_loss": 0.0480736693367362, "loss/crossentropy": 2.6697156190872193, "loss/logits": 0.8190987050533295, "step": 51620 }, { "epoch": 0.5163, "grad_norm": 14.3125, "grad_norm_var": 0.8660807291666667, "learning_rate": 0.0003, "loss": 10.9985, "loss/aux_loss": 0.04808506760746241, "loss/crossentropy": 2.6137089908123015, "loss/logits": 0.8341957181692123, "step": 51630 }, { "epoch": 0.5164, "grad_norm": 19.375, "grad_norm_var": 1.9468587239583333, "learning_rate": 0.0003, "loss": 11.0263, "loss/aux_loss": 0.048055645637214185, "loss/crossentropy": 2.7922864675521852, "loss/logits": 0.8405203580856323, "step": 51640 }, { "epoch": 0.5165, "grad_norm": 15.8125, "grad_norm_var": 1.5942545572916667, "learning_rate": 0.0003, "loss": 11.0421, "loss/aux_loss": 0.04808124210685492, "loss/crossentropy": 2.6083596289157867, "loss/logits": 0.7906621545553207, "step": 51650 }, { "epoch": 0.5166, "grad_norm": 14.6875, "grad_norm_var": 0.3421223958333333, "learning_rate": 0.0003, "loss": 11.1676, "loss/aux_loss": 0.04807032104581595, "loss/crossentropy": 2.7878468513488768, "loss/logits": 0.8364533364772797, "step": 51660 }, { "epoch": 0.5167, "grad_norm": 16.125, "grad_norm_var": 0.48240559895833335, "learning_rate": 0.0003, "loss": 11.083, "loss/aux_loss": 0.04807390999048948, "loss/crossentropy": 2.8989575624465944, "loss/logits": 0.7967435866594315, "step": 51670 }, { "epoch": 0.5168, "grad_norm": 15.3125, "grad_norm_var": 0.4363932291666667, "learning_rate": 0.0003, "loss": 10.8368, "loss/aux_loss": 0.04807572904974222, "loss/crossentropy": 2.6682200372219085, "loss/logits": 0.7805459082126618, "step": 51680 }, { "epoch": 0.5169, "grad_norm": 14.3125, "grad_norm_var": 0.5583170572916667, "learning_rate": 0.0003, "loss": 10.9825, "loss/aux_loss": 0.048071629367768764, "loss/crossentropy": 2.6587085843086244, "loss/logits": 0.8401948183774948, "step": 51690 }, { "epoch": 0.517, "grad_norm": 16.5, "grad_norm_var": 0.7034993489583333, "learning_rate": 0.0003, "loss": 11.1472, "loss/aux_loss": 0.04806802216917276, "loss/crossentropy": 2.8146503806114196, "loss/logits": 0.8239098250865936, "step": 51700 }, { "epoch": 0.5171, "grad_norm": 16.0, "grad_norm_var": 187.33723958333334, "learning_rate": 0.0003, "loss": 11.1429, "loss/aux_loss": 0.04808622244745493, "loss/crossentropy": 2.851152813434601, "loss/logits": 0.8723404318094253, "step": 51710 }, { "epoch": 0.5172, "grad_norm": 14.375, "grad_norm_var": 188.23019205729167, "learning_rate": 0.0003, "loss": 10.9794, "loss/aux_loss": 0.04806773141026497, "loss/crossentropy": 2.7131691336631776, "loss/logits": 0.8205563336610794, "step": 51720 }, { "epoch": 0.5173, "grad_norm": 14.5625, "grad_norm_var": 0.41380208333333335, "learning_rate": 0.0003, "loss": 11.0507, "loss/aux_loss": 0.04806727990508079, "loss/crossentropy": 2.589705538749695, "loss/logits": 0.8440980285406112, "step": 51730 }, { "epoch": 0.5174, "grad_norm": 14.5, "grad_norm_var": 0.5446451822916667, "learning_rate": 0.0003, "loss": 10.9806, "loss/aux_loss": 0.04807034377008677, "loss/crossentropy": 2.791779488325119, "loss/logits": 0.8750499516725541, "step": 51740 }, { "epoch": 0.5175, "grad_norm": 13.375, "grad_norm_var": 0.39791666666666664, "learning_rate": 0.0003, "loss": 10.8982, "loss/aux_loss": 0.048070017248392105, "loss/crossentropy": 2.7478320360183717, "loss/logits": 0.8556131899356842, "step": 51750 }, { "epoch": 0.5176, "grad_norm": 13.4375, "grad_norm_var": 0.6917805989583333, "learning_rate": 0.0003, "loss": 11.1149, "loss/aux_loss": 0.04807256907224655, "loss/crossentropy": 2.7422623872756957, "loss/logits": 0.8193183451890945, "step": 51760 }, { "epoch": 0.5177, "grad_norm": 14.5, "grad_norm_var": 0.597900390625, "learning_rate": 0.0003, "loss": 11.1238, "loss/aux_loss": 0.048071306012570855, "loss/crossentropy": 2.6727042496204376, "loss/logits": 0.8390705615282059, "step": 51770 }, { "epoch": 0.5178, "grad_norm": 15.4375, "grad_norm_var": 0.311962890625, "learning_rate": 0.0003, "loss": 10.9609, "loss/aux_loss": 0.04806660022586584, "loss/crossentropy": 2.560292327404022, "loss/logits": 0.8443563103675842, "step": 51780 }, { "epoch": 0.5179, "grad_norm": 13.9375, "grad_norm_var": 0.5431640625, "learning_rate": 0.0003, "loss": 10.9787, "loss/aux_loss": 0.04807846024632454, "loss/crossentropy": 2.757317876815796, "loss/logits": 0.841183426976204, "step": 51790 }, { "epoch": 0.518, "grad_norm": 15.4375, "grad_norm_var": 0.5400390625, "learning_rate": 0.0003, "loss": 11.0011, "loss/aux_loss": 0.04806059673428535, "loss/crossentropy": 2.6568395853042603, "loss/logits": 0.8394730240106583, "step": 51800 }, { "epoch": 0.5181, "grad_norm": 14.0, "grad_norm_var": 0.3941243489583333, "learning_rate": 0.0003, "loss": 10.9822, "loss/aux_loss": 0.04807980302721262, "loss/crossentropy": 2.7917647838592528, "loss/logits": 0.8249937295913696, "step": 51810 }, { "epoch": 0.5182, "grad_norm": 14.125, "grad_norm_var": 0.6313639322916667, "learning_rate": 0.0003, "loss": 11.235, "loss/aux_loss": 0.04807339478284121, "loss/crossentropy": 2.748648017644882, "loss/logits": 0.8033011108636856, "step": 51820 }, { "epoch": 0.5183, "grad_norm": 15.3125, "grad_norm_var": 0.4869140625, "learning_rate": 0.0003, "loss": 10.9818, "loss/aux_loss": 0.04806631077080965, "loss/crossentropy": 2.7276119709014894, "loss/logits": 0.8400139749050141, "step": 51830 }, { "epoch": 0.5184, "grad_norm": 13.5, "grad_norm_var": 0.6833170572916667, "learning_rate": 0.0003, "loss": 11.1312, "loss/aux_loss": 0.0480747552588582, "loss/crossentropy": 2.6863146901130674, "loss/logits": 0.8191409975290298, "step": 51840 }, { "epoch": 0.5185, "grad_norm": 14.375, "grad_norm_var": 0.418603515625, "learning_rate": 0.0003, "loss": 11.0812, "loss/aux_loss": 0.048067673854529855, "loss/crossentropy": 2.724157619476318, "loss/logits": 0.8137524396181106, "step": 51850 }, { "epoch": 0.5186, "grad_norm": 14.4375, "grad_norm_var": 0.5212076822916667, "learning_rate": 0.0003, "loss": 10.9867, "loss/aux_loss": 0.04808175042271614, "loss/crossentropy": 2.7627050638198853, "loss/logits": 0.8452069222927093, "step": 51860 }, { "epoch": 0.5187, "grad_norm": 14.75, "grad_norm_var": 0.8152180989583333, "learning_rate": 0.0003, "loss": 10.9896, "loss/aux_loss": 0.048070698603987695, "loss/crossentropy": 2.6193623900413514, "loss/logits": 0.8333276480436325, "step": 51870 }, { "epoch": 0.5188, "grad_norm": 14.1875, "grad_norm_var": 0.5051432291666667, "learning_rate": 0.0003, "loss": 11.0474, "loss/aux_loss": 0.04807662758976221, "loss/crossentropy": 2.6479109644889833, "loss/logits": 0.8272971555590629, "step": 51880 }, { "epoch": 0.5189, "grad_norm": 14.9375, "grad_norm_var": 0.41087239583333335, "learning_rate": 0.0003, "loss": 10.9789, "loss/aux_loss": 0.048069034889340403, "loss/crossentropy": 2.781693035364151, "loss/logits": 0.7891067415475845, "step": 51890 }, { "epoch": 0.519, "grad_norm": 14.8125, "grad_norm_var": 0.46920572916666664, "learning_rate": 0.0003, "loss": 11.0487, "loss/aux_loss": 0.04807375390082598, "loss/crossentropy": 2.760209488868713, "loss/logits": 0.8277522176504135, "step": 51900 }, { "epoch": 0.5191, "grad_norm": 14.4375, "grad_norm_var": 0.6077473958333334, "learning_rate": 0.0003, "loss": 11.1482, "loss/aux_loss": 0.04807181041687727, "loss/crossentropy": 2.803401565551758, "loss/logits": 0.8468029230833054, "step": 51910 }, { "epoch": 0.5192, "grad_norm": 13.6875, "grad_norm_var": 0.8886555989583333, "learning_rate": 0.0003, "loss": 11.1805, "loss/aux_loss": 0.048076972179114816, "loss/crossentropy": 2.638881093263626, "loss/logits": 0.8475582480430603, "step": 51920 }, { "epoch": 0.5193, "grad_norm": 15.1875, "grad_norm_var": 0.7930826822916667, "learning_rate": 0.0003, "loss": 11.147, "loss/aux_loss": 0.048069387674331665, "loss/crossentropy": 2.719567573070526, "loss/logits": 0.8314665377140045, "step": 51930 }, { "epoch": 0.5194, "grad_norm": 14.3125, "grad_norm_var": 0.32916666666666666, "learning_rate": 0.0003, "loss": 11.2553, "loss/aux_loss": 0.0480784498155117, "loss/crossentropy": 2.7691810011863707, "loss/logits": 0.8145269155502319, "step": 51940 }, { "epoch": 0.5195, "grad_norm": 14.75, "grad_norm_var": 0.9473307291666667, "learning_rate": 0.0003, "loss": 11.2179, "loss/aux_loss": 0.04808474984019995, "loss/crossentropy": 2.6275469183921816, "loss/logits": 0.8544179648160934, "step": 51950 }, { "epoch": 0.5196, "grad_norm": 14.5625, "grad_norm_var": 0.9541666666666667, "learning_rate": 0.0003, "loss": 11.0911, "loss/aux_loss": 0.04806585069745779, "loss/crossentropy": 2.7665723621845246, "loss/logits": 0.839416640996933, "step": 51960 }, { "epoch": 0.5197, "grad_norm": 14.125, "grad_norm_var": 0.9515625, "learning_rate": 0.0003, "loss": 11.0934, "loss/aux_loss": 0.04807109031826258, "loss/crossentropy": 2.7955354332923887, "loss/logits": 0.8091706037521362, "step": 51970 }, { "epoch": 0.5198, "grad_norm": 14.375, "grad_norm_var": 0.790869140625, "learning_rate": 0.0003, "loss": 10.9905, "loss/aux_loss": 0.04806825909763575, "loss/crossentropy": 2.7529439866542815, "loss/logits": 0.8357030868530273, "step": 51980 }, { "epoch": 0.5199, "grad_norm": 13.8125, "grad_norm_var": 0.36013997395833336, "learning_rate": 0.0003, "loss": 10.9674, "loss/aux_loss": 0.048064617440104485, "loss/crossentropy": 2.542526823282242, "loss/logits": 0.7883663177490234, "step": 51990 }, { "epoch": 0.52, "grad_norm": 14.5625, "grad_norm_var": 0.5002604166666667, "learning_rate": 0.0003, "loss": 11.1598, "loss/aux_loss": 0.048075161315500735, "loss/crossentropy": 2.624401843547821, "loss/logits": 0.8276967614889145, "step": 52000 }, { "epoch": 0.5201, "grad_norm": 13.25, "grad_norm_var": 0.3551432291666667, "learning_rate": 0.0003, "loss": 11.0968, "loss/aux_loss": 0.048081882484257224, "loss/crossentropy": 2.6817555725574493, "loss/logits": 0.8237560451030731, "step": 52010 }, { "epoch": 0.5202, "grad_norm": 14.4375, "grad_norm_var": 0.49724934895833334, "learning_rate": 0.0003, "loss": 10.8817, "loss/aux_loss": 0.0480725109577179, "loss/crossentropy": 2.782274627685547, "loss/logits": 0.8277134209871292, "step": 52020 }, { "epoch": 0.5203, "grad_norm": 14.4375, "grad_norm_var": 0.7645182291666667, "learning_rate": 0.0003, "loss": 11.0936, "loss/aux_loss": 0.04807168003171682, "loss/crossentropy": 2.7582414865493776, "loss/logits": 0.825353017449379, "step": 52030 }, { "epoch": 0.5204, "grad_norm": 16.0, "grad_norm_var": 0.22120768229166668, "learning_rate": 0.0003, "loss": 11.1796, "loss/aux_loss": 0.04807770941406488, "loss/crossentropy": 2.7298890888690948, "loss/logits": 0.8350825905799866, "step": 52040 }, { "epoch": 0.5205, "grad_norm": 15.3125, "grad_norm_var": 0.30857747395833335, "learning_rate": 0.0003, "loss": 11.1765, "loss/aux_loss": 0.048072948679327966, "loss/crossentropy": 2.7066911339759825, "loss/logits": 0.79820456802845, "step": 52050 }, { "epoch": 0.5206, "grad_norm": 15.1875, "grad_norm_var": 0.22029622395833334, "learning_rate": 0.0003, "loss": 11.0453, "loss/aux_loss": 0.04807343035936355, "loss/crossentropy": 2.80033460855484, "loss/logits": 0.8346506953239441, "step": 52060 }, { "epoch": 0.5207, "grad_norm": 14.375, "grad_norm_var": 0.9656087239583333, "learning_rate": 0.0003, "loss": 11.0548, "loss/aux_loss": 0.048069695197045806, "loss/crossentropy": 2.9161171913146973, "loss/logits": 0.8307078570127487, "step": 52070 }, { "epoch": 0.5208, "grad_norm": 13.4375, "grad_norm_var": 1.5013020833333333, "learning_rate": 0.0003, "loss": 10.9583, "loss/aux_loss": 0.048071876727044584, "loss/crossentropy": 2.7011435866355895, "loss/logits": 0.8153011113405227, "step": 52080 }, { "epoch": 0.5209, "grad_norm": 13.875, "grad_norm_var": 0.499853515625, "learning_rate": 0.0003, "loss": 11.1305, "loss/aux_loss": 0.048079690895974636, "loss/crossentropy": 2.848330098390579, "loss/logits": 0.8801573872566223, "step": 52090 }, { "epoch": 0.521, "grad_norm": 17.0, "grad_norm_var": 0.8848307291666667, "learning_rate": 0.0003, "loss": 11.2081, "loss/aux_loss": 0.04806297663599253, "loss/crossentropy": 2.7441537618637084, "loss/logits": 0.8512663036584854, "step": 52100 }, { "epoch": 0.5211, "grad_norm": 13.75, "grad_norm_var": 0.8066243489583333, "learning_rate": 0.0003, "loss": 11.0919, "loss/aux_loss": 0.048075188882648945, "loss/crossentropy": 2.779297721385956, "loss/logits": 0.8286562114953995, "step": 52110 }, { "epoch": 0.5212, "grad_norm": 14.875, "grad_norm_var": 0.6700520833333333, "learning_rate": 0.0003, "loss": 11.0529, "loss/aux_loss": 0.04808066878467798, "loss/crossentropy": 2.675219976902008, "loss/logits": 0.822179701924324, "step": 52120 }, { "epoch": 0.5213, "grad_norm": 15.4375, "grad_norm_var": 3.655143229166667, "learning_rate": 0.0003, "loss": 11.055, "loss/aux_loss": 0.04806961119174957, "loss/crossentropy": 2.6856570720672606, "loss/logits": 0.8347759008407593, "step": 52130 }, { "epoch": 0.5214, "grad_norm": 13.875, "grad_norm_var": 1.320947265625, "learning_rate": 0.0003, "loss": 11.083, "loss/aux_loss": 0.048076645098626615, "loss/crossentropy": 2.7049236536026, "loss/logits": 0.8613912463188171, "step": 52140 }, { "epoch": 0.5215, "grad_norm": 15.8125, "grad_norm_var": 1.0139973958333333, "learning_rate": 0.0003, "loss": 11.1862, "loss/aux_loss": 0.04807023461908102, "loss/crossentropy": 2.8382157564163206, "loss/logits": 0.8311042636632919, "step": 52150 }, { "epoch": 0.5216, "grad_norm": 14.5625, "grad_norm_var": 0.28274739583333336, "learning_rate": 0.0003, "loss": 10.8209, "loss/aux_loss": 0.04806796368211508, "loss/crossentropy": 2.614904749393463, "loss/logits": 0.820676788687706, "step": 52160 }, { "epoch": 0.5217, "grad_norm": 17.375, "grad_norm_var": 432.25670572916664, "learning_rate": 0.0003, "loss": 11.0011, "loss/aux_loss": 0.04808936510235071, "loss/crossentropy": 2.7092471361160277, "loss/logits": 0.8464861899614334, "step": 52170 }, { "epoch": 0.5218, "grad_norm": 14.75, "grad_norm_var": 422.3980305989583, "learning_rate": 0.0003, "loss": 10.9375, "loss/aux_loss": 0.0480745954439044, "loss/crossentropy": 2.670332300662994, "loss/logits": 0.8062131941318512, "step": 52180 }, { "epoch": 0.5219, "grad_norm": 19.75, "grad_norm_var": 66.319775390625, "learning_rate": 0.0003, "loss": 10.9526, "loss/aux_loss": 0.04807987660169601, "loss/crossentropy": 2.55713204741478, "loss/logits": 0.7974629938602448, "step": 52190 }, { "epoch": 0.522, "grad_norm": 16.0, "grad_norm_var": 65.07649739583333, "learning_rate": 0.0003, "loss": 11.1056, "loss/aux_loss": 0.04808241315186024, "loss/crossentropy": 2.6287878811359406, "loss/logits": 0.8144524425268174, "step": 52200 }, { "epoch": 0.5221, "grad_norm": 14.875, "grad_norm_var": 6.874983723958334, "learning_rate": 0.0003, "loss": 11.0077, "loss/aux_loss": 0.04806690067052841, "loss/crossentropy": 2.838245689868927, "loss/logits": 0.8343179583549499, "step": 52210 }, { "epoch": 0.5222, "grad_norm": 16.375, "grad_norm_var": 173.25271809895833, "learning_rate": 0.0003, "loss": 11.0835, "loss/aux_loss": 0.0480758348479867, "loss/crossentropy": 2.793542319536209, "loss/logits": 0.8210146844387054, "step": 52220 }, { "epoch": 0.5223, "grad_norm": 16.625, "grad_norm_var": 169.347119140625, "learning_rate": 0.0003, "loss": 11.2643, "loss/aux_loss": 0.048066737875342366, "loss/crossentropy": 2.8591265738010407, "loss/logits": 0.8664484679698944, "step": 52230 }, { "epoch": 0.5224, "grad_norm": 16.875, "grad_norm_var": 1.5620930989583333, "learning_rate": 0.0003, "loss": 11.0528, "loss/aux_loss": 0.048073151335120204, "loss/crossentropy": 2.916612446308136, "loss/logits": 0.8306013375520707, "step": 52240 }, { "epoch": 0.5225, "grad_norm": 15.75, "grad_norm_var": 0.9489583333333333, "learning_rate": 0.0003, "loss": 10.9975, "loss/aux_loss": 0.048077428713440895, "loss/crossentropy": 2.7287715911865233, "loss/logits": 0.8001983672380447, "step": 52250 }, { "epoch": 0.5226, "grad_norm": 15.3125, "grad_norm_var": 1.1489583333333333, "learning_rate": 0.0003, "loss": 11.1489, "loss/aux_loss": 0.04807046465575695, "loss/crossentropy": 2.690057897567749, "loss/logits": 0.8564533293247223, "step": 52260 }, { "epoch": 0.5227, "grad_norm": 14.6875, "grad_norm_var": 1.4202962239583334, "learning_rate": 0.0003, "loss": 11.1189, "loss/aux_loss": 0.04805558752268553, "loss/crossentropy": 2.7192665219306944, "loss/logits": 0.8252882838249207, "step": 52270 }, { "epoch": 0.5228, "grad_norm": 14.0625, "grad_norm_var": 1.149853515625, "learning_rate": 0.0003, "loss": 11.0902, "loss/aux_loss": 0.048078200593590734, "loss/crossentropy": 2.6997458934783936, "loss/logits": 0.8123593002557754, "step": 52280 }, { "epoch": 0.5229, "grad_norm": 14.3125, "grad_norm_var": 0.503759765625, "learning_rate": 0.0003, "loss": 11.0166, "loss/aux_loss": 0.04806804172694683, "loss/crossentropy": 2.7590546131134035, "loss/logits": 0.8631124138832093, "step": 52290 }, { "epoch": 0.523, "grad_norm": 15.125, "grad_norm_var": 0.36744791666666665, "learning_rate": 0.0003, "loss": 11.065, "loss/aux_loss": 0.048063941113650796, "loss/crossentropy": 2.7753712356090547, "loss/logits": 0.8426102191209793, "step": 52300 }, { "epoch": 0.5231, "grad_norm": 14.875, "grad_norm_var": 0.5161295572916667, "learning_rate": 0.0003, "loss": 11.0001, "loss/aux_loss": 0.04806868564337492, "loss/crossentropy": 2.659641718864441, "loss/logits": 0.7856981217861175, "step": 52310 }, { "epoch": 0.5232, "grad_norm": 15.5625, "grad_norm_var": 0.32784830729166664, "learning_rate": 0.0003, "loss": 10.9511, "loss/aux_loss": 0.04808425325900316, "loss/crossentropy": 2.805215048789978, "loss/logits": 0.8224257946014404, "step": 52320 }, { "epoch": 0.5233, "grad_norm": 14.1875, "grad_norm_var": 0.6854166666666667, "learning_rate": 0.0003, "loss": 11.0145, "loss/aux_loss": 0.04806450065225363, "loss/crossentropy": 2.7425873398780825, "loss/logits": 0.8594145178794861, "step": 52330 }, { "epoch": 0.5234, "grad_norm": 14.4375, "grad_norm_var": 0.8325520833333333, "learning_rate": 0.0003, "loss": 10.9849, "loss/aux_loss": 0.048070778325200084, "loss/crossentropy": 2.6495799660682677, "loss/logits": 0.8561073631048203, "step": 52340 }, { "epoch": 0.5235, "grad_norm": 14.4375, "grad_norm_var": 0.3733723958333333, "learning_rate": 0.0003, "loss": 11.1114, "loss/aux_loss": 0.048080655932426455, "loss/crossentropy": 2.7500119626522066, "loss/logits": 0.8257864147424698, "step": 52350 }, { "epoch": 0.5236, "grad_norm": 13.625, "grad_norm_var": 0.33098958333333334, "learning_rate": 0.0003, "loss": 11.1795, "loss/aux_loss": 0.04806145485490561, "loss/crossentropy": 2.7029913425445558, "loss/logits": 0.8572422236204147, "step": 52360 }, { "epoch": 0.5237, "grad_norm": 14.75, "grad_norm_var": 0.5702962239583333, "learning_rate": 0.0003, "loss": 10.9993, "loss/aux_loss": 0.04806832876056433, "loss/crossentropy": 2.8137829422950746, "loss/logits": 0.8569367885589599, "step": 52370 }, { "epoch": 0.5238, "grad_norm": 15.3125, "grad_norm_var": 0.3067708333333333, "learning_rate": 0.0003, "loss": 11.19, "loss/aux_loss": 0.04807415381073952, "loss/crossentropy": 2.7384074926376343, "loss/logits": 0.8470358967781066, "step": 52380 }, { "epoch": 0.5239, "grad_norm": 15.0625, "grad_norm_var": 0.51953125, "learning_rate": 0.0003, "loss": 11.1412, "loss/aux_loss": 0.048076053522527216, "loss/crossentropy": 2.646379691362381, "loss/logits": 0.8290688633918762, "step": 52390 }, { "epoch": 0.524, "grad_norm": 14.5, "grad_norm_var": 1.0010416666666666, "learning_rate": 0.0003, "loss": 11.132, "loss/aux_loss": 0.04807012863457203, "loss/crossentropy": 2.5831472992897035, "loss/logits": 0.8385468900203705, "step": 52400 }, { "epoch": 0.5241, "grad_norm": 14.0625, "grad_norm_var": 0.63046875, "learning_rate": 0.0003, "loss": 11.025, "loss/aux_loss": 0.0480716010555625, "loss/crossentropy": 2.7688582479953765, "loss/logits": 0.81949682533741, "step": 52410 }, { "epoch": 0.5242, "grad_norm": 14.875, "grad_norm_var": 1.2026041666666667, "learning_rate": 0.0003, "loss": 11.0372, "loss/aux_loss": 0.04806106220930815, "loss/crossentropy": 2.662490212917328, "loss/logits": 0.8316513210535049, "step": 52420 }, { "epoch": 0.5243, "grad_norm": 14.5625, "grad_norm_var": 0.7013020833333333, "learning_rate": 0.0003, "loss": 11.0859, "loss/aux_loss": 0.04808855298906565, "loss/crossentropy": 2.754934787750244, "loss/logits": 0.8487011790275574, "step": 52430 }, { "epoch": 0.5244, "grad_norm": 15.25, "grad_norm_var": 0.5528645833333333, "learning_rate": 0.0003, "loss": 11.0998, "loss/aux_loss": 0.04806870725005865, "loss/crossentropy": 2.7495794236660003, "loss/logits": 0.8377079129219055, "step": 52440 }, { "epoch": 0.5245, "grad_norm": 14.4375, "grad_norm_var": 0.6384765625, "learning_rate": 0.0003, "loss": 11.0026, "loss/aux_loss": 0.04806353971362114, "loss/crossentropy": 2.7716871798038483, "loss/logits": 0.8267540782690048, "step": 52450 }, { "epoch": 0.5246, "grad_norm": 15.6875, "grad_norm_var": 0.8356608072916667, "learning_rate": 0.0003, "loss": 11.0121, "loss/aux_loss": 0.048077926598489286, "loss/crossentropy": 2.6334754884243012, "loss/logits": 0.8087111979722976, "step": 52460 }, { "epoch": 0.5247, "grad_norm": 14.375, "grad_norm_var": 0.5186848958333333, "learning_rate": 0.0003, "loss": 11.1926, "loss/aux_loss": 0.048081686906516555, "loss/crossentropy": 2.869077742099762, "loss/logits": 0.8490048706531524, "step": 52470 }, { "epoch": 0.5248, "grad_norm": 58.5, "grad_norm_var": 120.93463541666667, "learning_rate": 0.0003, "loss": 11.1115, "loss/aux_loss": 0.04806402996182442, "loss/crossentropy": 2.7439518332481385, "loss/logits": 0.8263050705194473, "step": 52480 }, { "epoch": 0.5249, "grad_norm": 13.75, "grad_norm_var": 120.603369140625, "learning_rate": 0.0003, "loss": 11.1591, "loss/aux_loss": 0.04807528704404831, "loss/crossentropy": 2.756017154455185, "loss/logits": 0.8585342705249787, "step": 52490 }, { "epoch": 0.525, "grad_norm": 17.0, "grad_norm_var": 0.8096354166666667, "learning_rate": 0.0003, "loss": 10.8914, "loss/aux_loss": 0.04808173086494207, "loss/crossentropy": 2.5397191107273103, "loss/logits": 0.7909625247120857, "step": 52500 }, { "epoch": 0.5251, "grad_norm": 15.0625, "grad_norm_var": 0.6671223958333333, "learning_rate": 0.0003, "loss": 11.1209, "loss/aux_loss": 0.04808140583336353, "loss/crossentropy": 2.77059742808342, "loss/logits": 0.8521383255720139, "step": 52510 }, { "epoch": 0.5252, "grad_norm": 14.875, "grad_norm_var": 0.5299479166666666, "learning_rate": 0.0003, "loss": 11.0222, "loss/aux_loss": 0.048074524849653244, "loss/crossentropy": 2.839930164813995, "loss/logits": 0.8667486757040024, "step": 52520 }, { "epoch": 0.5253, "grad_norm": 14.9375, "grad_norm_var": 0.49347330729166666, "learning_rate": 0.0003, "loss": 11.1244, "loss/aux_loss": 0.048075276613235476, "loss/crossentropy": 2.7688077211380007, "loss/logits": 0.8387952595949173, "step": 52530 }, { "epoch": 0.5254, "grad_norm": 14.625, "grad_norm_var": 0.20774739583333332, "learning_rate": 0.0003, "loss": 10.9861, "loss/aux_loss": 0.048070489801466464, "loss/crossentropy": 2.698532724380493, "loss/logits": 0.8255835890769958, "step": 52540 }, { "epoch": 0.5255, "grad_norm": 14.625, "grad_norm_var": 0.23587239583333333, "learning_rate": 0.0003, "loss": 10.9713, "loss/aux_loss": 0.048068863339722157, "loss/crossentropy": 2.6561999797821043, "loss/logits": 0.8281907647848129, "step": 52550 }, { "epoch": 0.5256, "grad_norm": 14.9375, "grad_norm_var": 0.49635416666666665, "learning_rate": 0.0003, "loss": 11.1839, "loss/aux_loss": 0.04807021860033274, "loss/crossentropy": 2.6908550024032594, "loss/logits": 0.8255572736263275, "step": 52560 }, { "epoch": 0.5257, "grad_norm": 14.875, "grad_norm_var": 0.877978515625, "learning_rate": 0.0003, "loss": 11.1567, "loss/aux_loss": 0.04808451551944017, "loss/crossentropy": 2.6696718633174896, "loss/logits": 0.8380467757582665, "step": 52570 }, { "epoch": 0.5258, "grad_norm": 14.4375, "grad_norm_var": 0.8130208333333333, "learning_rate": 0.0003, "loss": 11.2055, "loss/aux_loss": 0.048067509196698666, "loss/crossentropy": 2.7303407311439516, "loss/logits": 0.8700813353061676, "step": 52580 }, { "epoch": 0.5259, "grad_norm": 14.125, "grad_norm_var": 0.3790201822916667, "learning_rate": 0.0003, "loss": 11.0618, "loss/aux_loss": 0.04807270802557469, "loss/crossentropy": 2.6694105565547943, "loss/logits": 0.7754775255918502, "step": 52590 }, { "epoch": 0.526, "grad_norm": 14.375, "grad_norm_var": 0.6511555989583333, "learning_rate": 0.0003, "loss": 10.9769, "loss/aux_loss": 0.048081257939338685, "loss/crossentropy": 2.6837186098098753, "loss/logits": 0.8063045144081116, "step": 52600 }, { "epoch": 0.5261, "grad_norm": 14.0, "grad_norm_var": 0.396728515625, "learning_rate": 0.0003, "loss": 11.0115, "loss/aux_loss": 0.04805648773908615, "loss/crossentropy": 2.7159491300582888, "loss/logits": 0.8259630739688874, "step": 52610 }, { "epoch": 0.5262, "grad_norm": 14.1875, "grad_norm_var": 0.3636555989583333, "learning_rate": 0.0003, "loss": 11.1381, "loss/aux_loss": 0.048068524710834025, "loss/crossentropy": 2.7253064274787904, "loss/logits": 0.8619579613208771, "step": 52620 }, { "epoch": 0.5263, "grad_norm": 16.625, "grad_norm_var": 0.5613932291666667, "learning_rate": 0.0003, "loss": 11.0564, "loss/aux_loss": 0.048091832920908927, "loss/crossentropy": 2.6312204539775848, "loss/logits": 0.8314170449972152, "step": 52630 }, { "epoch": 0.5264, "grad_norm": 15.6875, "grad_norm_var": 0.6113118489583333, "learning_rate": 0.0003, "loss": 10.9914, "loss/aux_loss": 0.04807051923125982, "loss/crossentropy": 2.6146963119506834, "loss/logits": 0.8094371676445007, "step": 52640 }, { "epoch": 0.5265, "grad_norm": 14.625, "grad_norm_var": 0.5033854166666667, "learning_rate": 0.0003, "loss": 11.0347, "loss/aux_loss": 0.048073076829314235, "loss/crossentropy": 2.6342477977275847, "loss/logits": 0.8295989811420441, "step": 52650 }, { "epoch": 0.5266, "grad_norm": 14.3125, "grad_norm_var": 0.3854166666666667, "learning_rate": 0.0003, "loss": 11.14, "loss/aux_loss": 0.04807663932442665, "loss/crossentropy": 2.7020954489707947, "loss/logits": 0.8581462055444717, "step": 52660 }, { "epoch": 0.5267, "grad_norm": 14.125, "grad_norm_var": 0.30831705729166664, "learning_rate": 0.0003, "loss": 11.1963, "loss/aux_loss": 0.04808024019002914, "loss/crossentropy": 2.7205568671226503, "loss/logits": 0.8558012962341308, "step": 52670 }, { "epoch": 0.5268, "grad_norm": 14.5625, "grad_norm_var": 0.48483072916666664, "learning_rate": 0.0003, "loss": 11.2696, "loss/aux_loss": 0.04806710928678513, "loss/crossentropy": 2.6928990364074705, "loss/logits": 0.8565327882766723, "step": 52680 }, { "epoch": 0.5269, "grad_norm": 13.9375, "grad_norm_var": 0.33326822916666665, "learning_rate": 0.0003, "loss": 11.0155, "loss/aux_loss": 0.04807606115937233, "loss/crossentropy": 2.7816842436790465, "loss/logits": 0.8381152004003525, "step": 52690 }, { "epoch": 0.527, "grad_norm": 14.3125, "grad_norm_var": 0.5822265625, "learning_rate": 0.0003, "loss": 10.9005, "loss/aux_loss": 0.04807661436498165, "loss/crossentropy": 2.7815606117248537, "loss/logits": 0.8124045938253402, "step": 52700 }, { "epoch": 0.5271, "grad_norm": 19.125, "grad_norm_var": 126.51925455729166, "learning_rate": 0.0003, "loss": 11.0246, "loss/aux_loss": 0.04807195011526346, "loss/crossentropy": 2.7958267748355867, "loss/logits": 0.8013758540153504, "step": 52710 }, { "epoch": 0.5272, "grad_norm": 15.5625, "grad_norm_var": 1.7684895833333334, "learning_rate": 0.0003, "loss": 10.8688, "loss/aux_loss": 0.04807553049176931, "loss/crossentropy": 2.6325803816318514, "loss/logits": 0.774325168132782, "step": 52720 }, { "epoch": 0.5273, "grad_norm": 16.25, "grad_norm_var": 0.6153483072916667, "learning_rate": 0.0003, "loss": 10.9838, "loss/aux_loss": 0.048076539672911166, "loss/crossentropy": 2.693459987640381, "loss/logits": 0.8326463222503662, "step": 52730 }, { "epoch": 0.5274, "grad_norm": 14.9375, "grad_norm_var": 0.92890625, "learning_rate": 0.0003, "loss": 11.0022, "loss/aux_loss": 0.048076620884239675, "loss/crossentropy": 2.664745795726776, "loss/logits": 0.7966249287128448, "step": 52740 }, { "epoch": 0.5275, "grad_norm": 14.25, "grad_norm_var": 0.7697265625, "learning_rate": 0.0003, "loss": 11.0997, "loss/aux_loss": 0.048067349940538406, "loss/crossentropy": 2.7236247181892397, "loss/logits": 0.8551719903945922, "step": 52750 }, { "epoch": 0.5276, "grad_norm": 16.125, "grad_norm_var": 1.3202473958333334, "learning_rate": 0.0003, "loss": 11.1669, "loss/aux_loss": 0.04806178268045187, "loss/crossentropy": 2.6296289205551147, "loss/logits": 0.8232845932245254, "step": 52760 }, { "epoch": 0.5277, "grad_norm": 14.125, "grad_norm_var": 1.2356608072916666, "learning_rate": 0.0003, "loss": 10.9658, "loss/aux_loss": 0.048072172701358794, "loss/crossentropy": 2.7454709470272065, "loss/logits": 0.8835760146379471, "step": 52770 }, { "epoch": 0.5278, "grad_norm": 13.8125, "grad_norm_var": 0.46608072916666665, "learning_rate": 0.0003, "loss": 10.9155, "loss/aux_loss": 0.04808282610028982, "loss/crossentropy": 2.6181671559810638, "loss/logits": 0.7878573626279831, "step": 52780 }, { "epoch": 0.5279, "grad_norm": 14.0, "grad_norm_var": 0.4754557291666667, "learning_rate": 0.0003, "loss": 11.1934, "loss/aux_loss": 0.04805999156087637, "loss/crossentropy": 2.5984963536262513, "loss/logits": 0.827095377445221, "step": 52790 }, { "epoch": 0.528, "grad_norm": 15.8125, "grad_norm_var": 3.491650390625, "learning_rate": 0.0003, "loss": 10.891, "loss/aux_loss": 0.048077487759292124, "loss/crossentropy": 2.659852463006973, "loss/logits": 0.8111390471458435, "step": 52800 }, { "epoch": 0.5281, "grad_norm": 17.0, "grad_norm_var": 1.5148274739583334, "learning_rate": 0.0003, "loss": 11.0344, "loss/aux_loss": 0.048082652315497396, "loss/crossentropy": 2.7374175548553468, "loss/logits": 0.8587904393672943, "step": 52810 }, { "epoch": 0.5282, "grad_norm": 15.0, "grad_norm_var": 0.9212890625, "learning_rate": 0.0003, "loss": 11.0007, "loss/aux_loss": 0.048067308217287066, "loss/crossentropy": 2.6461476027965545, "loss/logits": 0.8455465078353882, "step": 52820 }, { "epoch": 0.5283, "grad_norm": 14.6875, "grad_norm_var": 0.19791666666666666, "learning_rate": 0.0003, "loss": 11.1109, "loss/aux_loss": 0.04806644786149263, "loss/crossentropy": 2.6328052401542665, "loss/logits": 0.8342522650957107, "step": 52830 }, { "epoch": 0.5284, "grad_norm": 14.375, "grad_norm_var": 0.9562337239583333, "learning_rate": 0.0003, "loss": 11.086, "loss/aux_loss": 0.04808475598692894, "loss/crossentropy": 2.6256862759590147, "loss/logits": 0.7829153060913085, "step": 52840 }, { "epoch": 0.5285, "grad_norm": 17.0, "grad_norm_var": 1.2145670572916667, "learning_rate": 0.0003, "loss": 10.9496, "loss/aux_loss": 0.04806415122002363, "loss/crossentropy": 2.6609319686889648, "loss/logits": 0.8291169613599777, "step": 52850 }, { "epoch": 0.5286, "grad_norm": 15.25, "grad_norm_var": 0.740478515625, "learning_rate": 0.0003, "loss": 11.0984, "loss/aux_loss": 0.04807539191097021, "loss/crossentropy": 2.7916451573371885, "loss/logits": 0.8835980743169785, "step": 52860 }, { "epoch": 0.5287, "grad_norm": 15.0625, "grad_norm_var": 0.3244140625, "learning_rate": 0.0003, "loss": 11.1493, "loss/aux_loss": 0.04807141460478306, "loss/crossentropy": 2.6395734310150147, "loss/logits": 0.8369731396436692, "step": 52870 }, { "epoch": 0.5288, "grad_norm": 15.0625, "grad_norm_var": 0.3999837239583333, "learning_rate": 0.0003, "loss": 11.1769, "loss/aux_loss": 0.0480716809630394, "loss/crossentropy": 2.646906042098999, "loss/logits": 0.8252363950014114, "step": 52880 }, { "epoch": 0.5289, "grad_norm": 14.6875, "grad_norm_var": 0.53359375, "learning_rate": 0.0003, "loss": 10.8501, "loss/aux_loss": 0.0480677381157875, "loss/crossentropy": 2.6961144506931305, "loss/logits": 0.8162854909896851, "step": 52890 }, { "epoch": 0.529, "grad_norm": 14.3125, "grad_norm_var": 0.9093098958333333, "learning_rate": 0.0003, "loss": 11.0589, "loss/aux_loss": 0.04807200450450182, "loss/crossentropy": 2.698295068740845, "loss/logits": 0.8631285429000854, "step": 52900 }, { "epoch": 0.5291, "grad_norm": 13.5, "grad_norm_var": 0.7817708333333333, "learning_rate": 0.0003, "loss": 11.1834, "loss/aux_loss": 0.048069717921316625, "loss/crossentropy": 2.6712867975234986, "loss/logits": 0.8359966963529587, "step": 52910 }, { "epoch": 0.5292, "grad_norm": 14.75, "grad_norm_var": 0.6963541666666667, "learning_rate": 0.0003, "loss": 11.0659, "loss/aux_loss": 0.04808163065463304, "loss/crossentropy": 2.8286949574947355, "loss/logits": 0.8520474523305893, "step": 52920 }, { "epoch": 0.5293, "grad_norm": 13.375, "grad_norm_var": 218.29959309895833, "learning_rate": 0.0003, "loss": 11.0394, "loss/aux_loss": 0.04807936865836382, "loss/crossentropy": 2.6493199944496153, "loss/logits": 0.8226024299860001, "step": 52930 }, { "epoch": 0.5294, "grad_norm": 14.375, "grad_norm_var": 1.834375, "learning_rate": 0.0003, "loss": 11.0009, "loss/aux_loss": 0.04807878416031599, "loss/crossentropy": 2.7264573156833647, "loss/logits": 0.8180064380168914, "step": 52940 }, { "epoch": 0.5295, "grad_norm": 17.875, "grad_norm_var": 209.07029622395834, "learning_rate": 0.0003, "loss": 11.0623, "loss/aux_loss": 0.04807962104678154, "loss/crossentropy": 2.6582289695739747, "loss/logits": 0.8249925941228866, "step": 52950 }, { "epoch": 0.5296, "grad_norm": 16.125, "grad_norm_var": 205.12589518229166, "learning_rate": 0.0003, "loss": 11.0608, "loss/aux_loss": 0.048064501583576204, "loss/crossentropy": 2.701625847816467, "loss/logits": 0.8246585041284561, "step": 52960 }, { "epoch": 0.5297, "grad_norm": 15.5625, "grad_norm_var": 1.485400390625, "learning_rate": 0.0003, "loss": 11.118, "loss/aux_loss": 0.04807926844805479, "loss/crossentropy": 2.733833837509155, "loss/logits": 0.8345234960317611, "step": 52970 }, { "epoch": 0.5298, "grad_norm": 14.875, "grad_norm_var": 10.382747395833333, "learning_rate": 0.0003, "loss": 11.1176, "loss/aux_loss": 0.04808099288493395, "loss/crossentropy": 2.7490680694580076, "loss/logits": 0.838299173116684, "step": 52980 }, { "epoch": 0.5299, "grad_norm": 13.625, "grad_norm_var": 10.414827473958333, "learning_rate": 0.0003, "loss": 11.0292, "loss/aux_loss": 0.048076806217432023, "loss/crossentropy": 2.852311670780182, "loss/logits": 0.8578749477863312, "step": 52990 }, { "epoch": 0.53, "grad_norm": 14.5, "grad_norm_var": 1.9286295572916667, "learning_rate": 0.0003, "loss": 11.0788, "loss/aux_loss": 0.04808791987597942, "loss/crossentropy": 2.48682958483696, "loss/logits": 0.7837358355522156, "step": 53000 }, { "epoch": 0.5301, "grad_norm": 15.25, "grad_norm_var": 0.5119791666666667, "learning_rate": 0.0003, "loss": 11.0287, "loss/aux_loss": 0.04806363768875599, "loss/crossentropy": 2.7360428392887117, "loss/logits": 0.8066596657037735, "step": 53010 }, { "epoch": 0.5302, "grad_norm": 14.0625, "grad_norm_var": 1.45078125, "learning_rate": 0.0003, "loss": 11.0826, "loss/aux_loss": 0.048080742731690405, "loss/crossentropy": 2.624992382526398, "loss/logits": 0.8372664958238601, "step": 53020 }, { "epoch": 0.5303, "grad_norm": 17.125, "grad_norm_var": 1.5770833333333334, "learning_rate": 0.0003, "loss": 10.9728, "loss/aux_loss": 0.0480709882453084, "loss/crossentropy": 2.6210521042346953, "loss/logits": 0.8308377593755722, "step": 53030 }, { "epoch": 0.5304, "grad_norm": 14.75, "grad_norm_var": 1.211962890625, "learning_rate": 0.0003, "loss": 11.0566, "loss/aux_loss": 0.04806949980556965, "loss/crossentropy": 2.679835093021393, "loss/logits": 0.8029073655605317, "step": 53040 }, { "epoch": 0.5305, "grad_norm": 15.3125, "grad_norm_var": 0.3603515625, "learning_rate": 0.0003, "loss": 11.064, "loss/aux_loss": 0.04807988815009594, "loss/crossentropy": 2.649093449115753, "loss/logits": 0.8222986310720444, "step": 53050 }, { "epoch": 0.5306, "grad_norm": 15.1875, "grad_norm_var": 0.5082682291666667, "learning_rate": 0.0003, "loss": 11.0953, "loss/aux_loss": 0.04807553198188543, "loss/crossentropy": 2.7045671463012697, "loss/logits": 0.815049484372139, "step": 53060 }, { "epoch": 0.5307, "grad_norm": 14.875, "grad_norm_var": 0.33255208333333336, "learning_rate": 0.0003, "loss": 11.0753, "loss/aux_loss": 0.04806431755423546, "loss/crossentropy": 2.7456027269363403, "loss/logits": 0.8385494530200959, "step": 53070 }, { "epoch": 0.5308, "grad_norm": 14.3125, "grad_norm_var": 0.260009765625, "learning_rate": 0.0003, "loss": 10.9512, "loss/aux_loss": 0.04808401577174663, "loss/crossentropy": 2.5376985907554626, "loss/logits": 0.8007230907678604, "step": 53080 }, { "epoch": 0.5309, "grad_norm": 15.0625, "grad_norm_var": 3.8843098958333333, "learning_rate": 0.0003, "loss": 10.9647, "loss/aux_loss": 0.04807156920433044, "loss/crossentropy": 2.8426152527332307, "loss/logits": 0.8263060122728347, "step": 53090 }, { "epoch": 0.531, "grad_norm": 14.25, "grad_norm_var": 3.9544270833333335, "learning_rate": 0.0003, "loss": 11.1311, "loss/aux_loss": 0.048070313036441804, "loss/crossentropy": 2.679327297210693, "loss/logits": 0.8237017244100571, "step": 53100 }, { "epoch": 0.5311, "grad_norm": 14.1875, "grad_norm_var": 0.3791015625, "learning_rate": 0.0003, "loss": 11.1016, "loss/aux_loss": 0.04808187987655401, "loss/crossentropy": 2.571546399593353, "loss/logits": 0.8291330844163894, "step": 53110 }, { "epoch": 0.5312, "grad_norm": 14.25, "grad_norm_var": 0.44244791666666666, "learning_rate": 0.0003, "loss": 11.0984, "loss/aux_loss": 0.04806428123265505, "loss/crossentropy": 2.682811915874481, "loss/logits": 0.8410823673009873, "step": 53120 }, { "epoch": 0.5313, "grad_norm": 14.0625, "grad_norm_var": 0.443994140625, "learning_rate": 0.0003, "loss": 11.0374, "loss/aux_loss": 0.04808186236768961, "loss/crossentropy": 2.8723417639732363, "loss/logits": 0.8567765563726425, "step": 53130 }, { "epoch": 0.5314, "grad_norm": 14.0, "grad_norm_var": 0.48943684895833334, "learning_rate": 0.0003, "loss": 11.0291, "loss/aux_loss": 0.048074983060359955, "loss/crossentropy": 2.634695219993591, "loss/logits": 0.836324593424797, "step": 53140 }, { "epoch": 0.5315, "grad_norm": 14.625, "grad_norm_var": 0.8124837239583333, "learning_rate": 0.0003, "loss": 11.1395, "loss/aux_loss": 0.04806621633470058, "loss/crossentropy": 2.740517848730087, "loss/logits": 0.8308217048645019, "step": 53150 }, { "epoch": 0.5316, "grad_norm": 13.1875, "grad_norm_var": 0.7023274739583333, "learning_rate": 0.0003, "loss": 11.0551, "loss/aux_loss": 0.04807417429983616, "loss/crossentropy": 2.7302970767021177, "loss/logits": 0.8507604837417603, "step": 53160 }, { "epoch": 0.5317, "grad_norm": 15.1875, "grad_norm_var": 1.5652180989583333, "learning_rate": 0.0003, "loss": 11.0137, "loss/aux_loss": 0.04807768277823925, "loss/crossentropy": 2.676066642999649, "loss/logits": 0.8240761816501617, "step": 53170 }, { "epoch": 0.5318, "grad_norm": 14.3125, "grad_norm_var": 0.4925618489583333, "learning_rate": 0.0003, "loss": 11.1415, "loss/aux_loss": 0.04806169308722019, "loss/crossentropy": 2.6013071119785307, "loss/logits": 0.8342882752418518, "step": 53180 }, { "epoch": 0.5319, "grad_norm": 14.625, "grad_norm_var": 0.3927083333333333, "learning_rate": 0.0003, "loss": 11.129, "loss/aux_loss": 0.04808182790875435, "loss/crossentropy": 2.7180078864097594, "loss/logits": 0.8595794111490249, "step": 53190 }, { "epoch": 0.532, "grad_norm": 15.0625, "grad_norm_var": 0.19212239583333332, "learning_rate": 0.0003, "loss": 11.0407, "loss/aux_loss": 0.04806814473122358, "loss/crossentropy": 2.6401141822338103, "loss/logits": 0.7996371448040008, "step": 53200 }, { "epoch": 0.5321, "grad_norm": 14.4375, "grad_norm_var": 0.2530598958333333, "learning_rate": 0.0003, "loss": 11.0913, "loss/aux_loss": 0.04807532113045454, "loss/crossentropy": 2.7931397438049315, "loss/logits": 0.7978598833084106, "step": 53210 }, { "epoch": 0.5322, "grad_norm": 14.875, "grad_norm_var": 0.28854166666666664, "learning_rate": 0.0003, "loss": 11.0714, "loss/aux_loss": 0.04807628560811281, "loss/crossentropy": 2.6044381737709044, "loss/logits": 0.8384490758180618, "step": 53220 }, { "epoch": 0.5323, "grad_norm": 15.0625, "grad_norm_var": 0.3311848958333333, "learning_rate": 0.0003, "loss": 11.0452, "loss/aux_loss": 0.048069493100047114, "loss/crossentropy": 2.725711923837662, "loss/logits": 0.8274922966957092, "step": 53230 }, { "epoch": 0.5324, "grad_norm": 14.3125, "grad_norm_var": 0.6875, "learning_rate": 0.0003, "loss": 11.0122, "loss/aux_loss": 0.0480713777244091, "loss/crossentropy": 2.7457379400730133, "loss/logits": 0.8154986947774887, "step": 53240 }, { "epoch": 0.5325, "grad_norm": 14.3125, "grad_norm_var": 0.4058430989583333, "learning_rate": 0.0003, "loss": 11.0645, "loss/aux_loss": 0.048076901398599145, "loss/crossentropy": 2.644556438922882, "loss/logits": 0.7974002599716187, "step": 53250 }, { "epoch": 0.5326, "grad_norm": 14.5, "grad_norm_var": 1.065478515625, "learning_rate": 0.0003, "loss": 11.1355, "loss/aux_loss": 0.04806389715522528, "loss/crossentropy": 2.6610435485839843, "loss/logits": 0.8192419022321701, "step": 53260 }, { "epoch": 0.5327, "grad_norm": 14.3125, "grad_norm_var": 1.362353515625, "learning_rate": 0.0003, "loss": 11.0868, "loss/aux_loss": 0.04807983003556728, "loss/crossentropy": 2.726958382129669, "loss/logits": 0.8118180692195892, "step": 53270 }, { "epoch": 0.5328, "grad_norm": 15.5625, "grad_norm_var": 0.5813639322916667, "learning_rate": 0.0003, "loss": 11.1905, "loss/aux_loss": 0.048070806078612804, "loss/crossentropy": 2.5736697733402254, "loss/logits": 0.8248791873455048, "step": 53280 }, { "epoch": 0.5329, "grad_norm": 14.3125, "grad_norm_var": 0.5400390625, "learning_rate": 0.0003, "loss": 11.117, "loss/aux_loss": 0.04807869885116815, "loss/crossentropy": 2.789425587654114, "loss/logits": 0.8584134370088577, "step": 53290 }, { "epoch": 0.533, "grad_norm": 14.25, "grad_norm_var": 0.3907389322916667, "learning_rate": 0.0003, "loss": 11.0787, "loss/aux_loss": 0.048071909509599206, "loss/crossentropy": 2.6537403225898744, "loss/logits": 0.8524068266153335, "step": 53300 }, { "epoch": 0.5331, "grad_norm": 16.25, "grad_norm_var": 1.6032389322916667, "learning_rate": 0.0003, "loss": 11.2028, "loss/aux_loss": 0.04807873219251633, "loss/crossentropy": 2.86398446559906, "loss/logits": 0.84943727850914, "step": 53310 }, { "epoch": 0.5332, "grad_norm": 15.4375, "grad_norm_var": 0.3870930989583333, "learning_rate": 0.0003, "loss": 11.0856, "loss/aux_loss": 0.048069454915821555, "loss/crossentropy": 2.6376845240592957, "loss/logits": 0.845120832324028, "step": 53320 }, { "epoch": 0.5333, "grad_norm": 14.0625, "grad_norm_var": 0.5899576822916667, "learning_rate": 0.0003, "loss": 11.0704, "loss/aux_loss": 0.04807726014405489, "loss/crossentropy": 2.852811598777771, "loss/logits": 0.808278375864029, "step": 53330 }, { "epoch": 0.5334, "grad_norm": 15.0625, "grad_norm_var": 0.6477701822916667, "learning_rate": 0.0003, "loss": 10.8892, "loss/aux_loss": 0.04806978944689035, "loss/crossentropy": 2.7639645457267763, "loss/logits": 0.8498774021863937, "step": 53340 }, { "epoch": 0.5335, "grad_norm": 15.125, "grad_norm_var": 0.7806640625, "learning_rate": 0.0003, "loss": 10.8939, "loss/aux_loss": 0.048068863339722157, "loss/crossentropy": 2.472467356920242, "loss/logits": 0.7923805028200149, "step": 53350 }, { "epoch": 0.5336, "grad_norm": 14.4375, "grad_norm_var": 0.6285807291666666, "learning_rate": 0.0003, "loss": 10.9191, "loss/aux_loss": 0.04807096607983112, "loss/crossentropy": 2.535427379608154, "loss/logits": 0.8171293288469315, "step": 53360 }, { "epoch": 0.5337, "grad_norm": 15.3125, "grad_norm_var": 0.27029622395833336, "learning_rate": 0.0003, "loss": 10.9995, "loss/aux_loss": 0.04807757455855608, "loss/crossentropy": 2.699053144454956, "loss/logits": 0.8358200043439865, "step": 53370 }, { "epoch": 0.5338, "grad_norm": 14.5625, "grad_norm_var": 0.37916666666666665, "learning_rate": 0.0003, "loss": 10.9628, "loss/aux_loss": 0.04807063937187195, "loss/crossentropy": 2.595501071214676, "loss/logits": 0.8261586040258407, "step": 53380 }, { "epoch": 0.5339, "grad_norm": 14.3125, "grad_norm_var": 0.746875, "learning_rate": 0.0003, "loss": 11.0618, "loss/aux_loss": 0.04807272292673588, "loss/crossentropy": 2.6923603653907775, "loss/logits": 0.8231137573719025, "step": 53390 }, { "epoch": 0.534, "grad_norm": 15.1875, "grad_norm_var": 0.44869791666666664, "learning_rate": 0.0003, "loss": 10.979, "loss/aux_loss": 0.04807500522583723, "loss/crossentropy": 2.829385429620743, "loss/logits": 0.8415767669677734, "step": 53400 }, { "epoch": 0.5341, "grad_norm": 14.125, "grad_norm_var": 0.3702473958333333, "learning_rate": 0.0003, "loss": 10.9503, "loss/aux_loss": 0.04806397818028927, "loss/crossentropy": 2.4466098248958588, "loss/logits": 0.7799597263336182, "step": 53410 }, { "epoch": 0.5342, "grad_norm": 14.25, "grad_norm_var": 0.5653483072916666, "learning_rate": 0.0003, "loss": 11.1548, "loss/aux_loss": 0.04807454627007246, "loss/crossentropy": 2.669411617517471, "loss/logits": 0.8267213612794876, "step": 53420 }, { "epoch": 0.5343, "grad_norm": 15.9375, "grad_norm_var": 0.8238932291666666, "learning_rate": 0.0003, "loss": 10.9296, "loss/aux_loss": 0.048065404407680035, "loss/crossentropy": 2.78000248670578, "loss/logits": 0.8359499126672745, "step": 53430 }, { "epoch": 0.5344, "grad_norm": 14.4375, "grad_norm_var": 0.276025390625, "learning_rate": 0.0003, "loss": 11.0951, "loss/aux_loss": 0.04806756917387247, "loss/crossentropy": 2.854048955440521, "loss/logits": 0.8441928833723068, "step": 53440 }, { "epoch": 0.5345, "grad_norm": 13.5, "grad_norm_var": 0.8166015625, "learning_rate": 0.0003, "loss": 11.1276, "loss/aux_loss": 0.04807045813649893, "loss/crossentropy": 2.7718479573726653, "loss/logits": 0.8208920061588287, "step": 53450 }, { "epoch": 0.5346, "grad_norm": 15.0625, "grad_norm_var": 1.2453125, "learning_rate": 0.0003, "loss": 11.1, "loss/aux_loss": 0.048068010993301866, "loss/crossentropy": 2.890912193059921, "loss/logits": 0.812038055062294, "step": 53460 }, { "epoch": 0.5347, "grad_norm": 15.875, "grad_norm_var": 0.943994140625, "learning_rate": 0.0003, "loss": 11.0452, "loss/aux_loss": 0.04808399192988873, "loss/crossentropy": 2.8517406702041628, "loss/logits": 0.8417524635791779, "step": 53470 }, { "epoch": 0.5348, "grad_norm": 14.9375, "grad_norm_var": 0.9677083333333333, "learning_rate": 0.0003, "loss": 10.964, "loss/aux_loss": 0.048053649812936784, "loss/crossentropy": 2.7846075654029847, "loss/logits": 0.8391733020544052, "step": 53480 }, { "epoch": 0.5349, "grad_norm": 15.75, "grad_norm_var": 0.26171875, "learning_rate": 0.0003, "loss": 11.0427, "loss/aux_loss": 0.04807787444442511, "loss/crossentropy": 2.6856286406517027, "loss/logits": 0.8094239175319672, "step": 53490 }, { "epoch": 0.535, "grad_norm": 15.3125, "grad_norm_var": 0.5763020833333333, "learning_rate": 0.0003, "loss": 11.0114, "loss/aux_loss": 0.048055261932313444, "loss/crossentropy": 2.517116981744766, "loss/logits": 0.7816114693880081, "step": 53500 }, { "epoch": 0.5351, "grad_norm": 13.6875, "grad_norm_var": 0.5478515625, "learning_rate": 0.0003, "loss": 11.1496, "loss/aux_loss": 0.0480762280523777, "loss/crossentropy": 2.9342130780220033, "loss/logits": 0.856569093465805, "step": 53510 }, { "epoch": 0.5352, "grad_norm": 14.125, "grad_norm_var": 5.563525390625, "learning_rate": 0.0003, "loss": 11.0715, "loss/aux_loss": 0.04807373471558094, "loss/crossentropy": 2.8552743911743166, "loss/logits": 0.8635714590549469, "step": 53520 }, { "epoch": 0.5353, "grad_norm": 14.4375, "grad_norm_var": 80.24724934895833, "learning_rate": 0.0003, "loss": 11.065, "loss/aux_loss": 0.04806609898805618, "loss/crossentropy": 2.737239396572113, "loss/logits": 0.8101500362157822, "step": 53530 }, { "epoch": 0.5354, "grad_norm": 15.8125, "grad_norm_var": 79.16443684895833, "learning_rate": 0.0003, "loss": 10.8988, "loss/aux_loss": 0.04807798117399216, "loss/crossentropy": 2.601036584377289, "loss/logits": 0.8181641131639481, "step": 53540 }, { "epoch": 0.5355, "grad_norm": 15.1875, "grad_norm_var": 0.45779622395833336, "learning_rate": 0.0003, "loss": 11.0052, "loss/aux_loss": 0.04806607346981764, "loss/crossentropy": 2.7229600071907045, "loss/logits": 0.8561374306678772, "step": 53550 }, { "epoch": 0.5356, "grad_norm": 17.25, "grad_norm_var": 1.2523274739583334, "learning_rate": 0.0003, "loss": 11.011, "loss/aux_loss": 0.04806992299854755, "loss/crossentropy": 2.877775228023529, "loss/logits": 0.8536604076623917, "step": 53560 }, { "epoch": 0.5357, "grad_norm": 14.125, "grad_norm_var": 1.068994140625, "learning_rate": 0.0003, "loss": 10.9918, "loss/aux_loss": 0.04807414263486862, "loss/crossentropy": 2.705450266599655, "loss/logits": 0.8226163148880005, "step": 53570 }, { "epoch": 0.5358, "grad_norm": 15.8125, "grad_norm_var": 0.403369140625, "learning_rate": 0.0003, "loss": 10.838, "loss/aux_loss": 0.0480672538280487, "loss/crossentropy": 2.744877350330353, "loss/logits": 0.793895834684372, "step": 53580 }, { "epoch": 0.5359, "grad_norm": 15.0625, "grad_norm_var": 0.9878743489583334, "learning_rate": 0.0003, "loss": 11.1031, "loss/aux_loss": 0.04807367753237486, "loss/crossentropy": 2.871882838010788, "loss/logits": 0.8179901033639908, "step": 53590 }, { "epoch": 0.536, "grad_norm": 14.4375, "grad_norm_var": 0.9676432291666667, "learning_rate": 0.0003, "loss": 10.9992, "loss/aux_loss": 0.048075889237225056, "loss/crossentropy": 2.8211424231529234, "loss/logits": 0.8252835303544999, "step": 53600 }, { "epoch": 0.5361, "grad_norm": 14.875, "grad_norm_var": 0.7976399739583333, "learning_rate": 0.0003, "loss": 11.194, "loss/aux_loss": 0.048059284873306754, "loss/crossentropy": 2.791327440738678, "loss/logits": 0.8332475572824478, "step": 53610 }, { "epoch": 0.5362, "grad_norm": 13.6875, "grad_norm_var": 1.3942057291666667, "learning_rate": 0.0003, "loss": 11.1379, "loss/aux_loss": 0.04807420019060373, "loss/crossentropy": 2.5513383507728578, "loss/logits": 0.7817022532224656, "step": 53620 }, { "epoch": 0.5363, "grad_norm": 14.625, "grad_norm_var": 6.633317057291666, "learning_rate": 0.0003, "loss": 11.0451, "loss/aux_loss": 0.04806692767888308, "loss/crossentropy": 2.764022743701935, "loss/logits": 0.8246651530265808, "step": 53630 }, { "epoch": 0.5364, "grad_norm": 14.25, "grad_norm_var": 6.286197916666667, "learning_rate": 0.0003, "loss": 10.9723, "loss/aux_loss": 0.0480807863175869, "loss/crossentropy": 2.5619694709777834, "loss/logits": 0.8253705441951752, "step": 53640 }, { "epoch": 0.5365, "grad_norm": 13.625, "grad_norm_var": 0.6011555989583334, "learning_rate": 0.0003, "loss": 10.9507, "loss/aux_loss": 0.04806951284408569, "loss/crossentropy": 2.7298443794250487, "loss/logits": 0.8315545797348023, "step": 53650 }, { "epoch": 0.5366, "grad_norm": 14.0625, "grad_norm_var": 0.804931640625, "learning_rate": 0.0003, "loss": 10.979, "loss/aux_loss": 0.04807220809161663, "loss/crossentropy": 2.6058572232723236, "loss/logits": 0.7958117395639419, "step": 53660 }, { "epoch": 0.5367, "grad_norm": 14.125, "grad_norm_var": 0.602978515625, "learning_rate": 0.0003, "loss": 11.0157, "loss/aux_loss": 0.04807724934071302, "loss/crossentropy": 2.6349853515625, "loss/logits": 0.8204777866601944, "step": 53670 }, { "epoch": 0.5368, "grad_norm": 14.3125, "grad_norm_var": 0.51015625, "learning_rate": 0.0003, "loss": 11.212, "loss/aux_loss": 0.04807522259652615, "loss/crossentropy": 2.769087851047516, "loss/logits": 0.8173623502254486, "step": 53680 }, { "epoch": 0.5369, "grad_norm": 15.375, "grad_norm_var": 0.83671875, "learning_rate": 0.0003, "loss": 11.0955, "loss/aux_loss": 0.04807520546019077, "loss/crossentropy": 2.750330662727356, "loss/logits": 0.8260623097419739, "step": 53690 }, { "epoch": 0.537, "grad_norm": 14.75, "grad_norm_var": 0.9067545572916667, "learning_rate": 0.0003, "loss": 11.0911, "loss/aux_loss": 0.04807036910206079, "loss/crossentropy": 2.819355845451355, "loss/logits": 0.8768564403057099, "step": 53700 }, { "epoch": 0.5371, "grad_norm": 14.5625, "grad_norm_var": 1.2452473958333334, "learning_rate": 0.0003, "loss": 10.955, "loss/aux_loss": 0.04807029124349356, "loss/crossentropy": 2.6186971068382263, "loss/logits": 0.8072131723165512, "step": 53710 }, { "epoch": 0.5372, "grad_norm": 14.75, "grad_norm_var": 3.1669270833333334, "learning_rate": 0.0003, "loss": 11.0975, "loss/aux_loss": 0.04807576686143875, "loss/crossentropy": 2.7736194491386414, "loss/logits": 0.8309052169322968, "step": 53720 }, { "epoch": 0.5373, "grad_norm": 15.625, "grad_norm_var": 0.3016764322916667, "learning_rate": 0.0003, "loss": 11.0322, "loss/aux_loss": 0.04806541334837675, "loss/crossentropy": 2.686142373085022, "loss/logits": 0.8141505211591721, "step": 53730 }, { "epoch": 0.5374, "grad_norm": 15.6875, "grad_norm_var": 0.4722493489583333, "learning_rate": 0.0003, "loss": 11.0185, "loss/aux_loss": 0.04807978682219982, "loss/crossentropy": 2.636608195304871, "loss/logits": 0.8161023885011673, "step": 53740 }, { "epoch": 0.5375, "grad_norm": 13.8125, "grad_norm_var": 0.6322916666666667, "learning_rate": 0.0003, "loss": 10.986, "loss/aux_loss": 0.04806647896766662, "loss/crossentropy": 2.7851890683174134, "loss/logits": 0.8317944526672363, "step": 53750 }, { "epoch": 0.5376, "grad_norm": 15.0, "grad_norm_var": 2.291520182291667, "learning_rate": 0.0003, "loss": 11.1272, "loss/aux_loss": 0.04807963985949755, "loss/crossentropy": 2.6976101815700533, "loss/logits": 0.8801734536886215, "step": 53760 }, { "epoch": 0.5377, "grad_norm": 15.125, "grad_norm_var": 1.95234375, "learning_rate": 0.0003, "loss": 11.0244, "loss/aux_loss": 0.048057077825069426, "loss/crossentropy": 2.6501388132572172, "loss/logits": 0.8309973537921905, "step": 53770 }, { "epoch": 0.5378, "grad_norm": 15.0, "grad_norm_var": 0.369384765625, "learning_rate": 0.0003, "loss": 10.8356, "loss/aux_loss": 0.04807448796927929, "loss/crossentropy": 2.7889013409614565, "loss/logits": 0.8378504902124405, "step": 53780 }, { "epoch": 0.5379, "grad_norm": 14.625, "grad_norm_var": 0.5079264322916667, "learning_rate": 0.0003, "loss": 10.9582, "loss/aux_loss": 0.04807044938206673, "loss/crossentropy": 2.509946370124817, "loss/logits": 0.8080231517553329, "step": 53790 }, { "epoch": 0.538, "grad_norm": 15.3125, "grad_norm_var": 1.011962890625, "learning_rate": 0.0003, "loss": 11.1271, "loss/aux_loss": 0.04807122368365526, "loss/crossentropy": 2.6480683028697967, "loss/logits": 0.7914795011281968, "step": 53800 }, { "epoch": 0.5381, "grad_norm": 14.8125, "grad_norm_var": 0.6346354166666667, "learning_rate": 0.0003, "loss": 10.9294, "loss/aux_loss": 0.048066381551325324, "loss/crossentropy": 2.7862293422222137, "loss/logits": 0.8505240023136139, "step": 53810 }, { "epoch": 0.5382, "grad_norm": 14.9375, "grad_norm_var": 0.29375, "learning_rate": 0.0003, "loss": 11.038, "loss/aux_loss": 0.048070579580962655, "loss/crossentropy": 2.6358683347702025, "loss/logits": 0.8482916533946991, "step": 53820 }, { "epoch": 0.5383, "grad_norm": 16.25, "grad_norm_var": 20.885270182291666, "learning_rate": 0.0003, "loss": 11.0671, "loss/aux_loss": 0.04806550685316324, "loss/crossentropy": 2.6382621049880983, "loss/logits": 0.8401525467634201, "step": 53830 }, { "epoch": 0.5384, "grad_norm": 15.0, "grad_norm_var": 0.3759765625, "learning_rate": 0.0003, "loss": 11.1593, "loss/aux_loss": 0.04807865265756846, "loss/crossentropy": 2.6878114998340608, "loss/logits": 0.846782973408699, "step": 53840 }, { "epoch": 0.5385, "grad_norm": 15.1875, "grad_norm_var": 0.121875, "learning_rate": 0.0003, "loss": 11.0436, "loss/aux_loss": 0.0480678740888834, "loss/crossentropy": 2.6060830295085906, "loss/logits": 0.8473536789417266, "step": 53850 }, { "epoch": 0.5386, "grad_norm": 14.0625, "grad_norm_var": 0.7895670572916667, "learning_rate": 0.0003, "loss": 11.094, "loss/aux_loss": 0.04806717596948147, "loss/crossentropy": 2.6601334273815156, "loss/logits": 0.8278073340654373, "step": 53860 }, { "epoch": 0.5387, "grad_norm": 13.75, "grad_norm_var": 0.8055826822916666, "learning_rate": 0.0003, "loss": 10.9571, "loss/aux_loss": 0.048082117736339566, "loss/crossentropy": 2.7736589670181275, "loss/logits": 0.7962304085493088, "step": 53870 }, { "epoch": 0.5388, "grad_norm": 14.5, "grad_norm_var": 0.513916015625, "learning_rate": 0.0003, "loss": 11.1215, "loss/aux_loss": 0.04806166738271713, "loss/crossentropy": 2.7752721309661865, "loss/logits": 0.8360392391681671, "step": 53880 }, { "epoch": 0.5389, "grad_norm": 14.1875, "grad_norm_var": 0.7431640625, "learning_rate": 0.0003, "loss": 11.2056, "loss/aux_loss": 0.04807320367544889, "loss/crossentropy": 2.800821077823639, "loss/logits": 0.8525734037160874, "step": 53890 }, { "epoch": 0.539, "grad_norm": 13.9375, "grad_norm_var": 0.7877604166666666, "learning_rate": 0.0003, "loss": 11.0453, "loss/aux_loss": 0.048065982013940814, "loss/crossentropy": 2.7617095947265624, "loss/logits": 0.8081191062927247, "step": 53900 }, { "epoch": 0.5391, "grad_norm": 14.5, "grad_norm_var": 1.1628743489583333, "learning_rate": 0.0003, "loss": 10.8716, "loss/aux_loss": 0.04807501696050167, "loss/crossentropy": 2.5991472363471986, "loss/logits": 0.8059735208749771, "step": 53910 }, { "epoch": 0.5392, "grad_norm": 16.875, "grad_norm_var": 1.0609212239583334, "learning_rate": 0.0003, "loss": 10.9493, "loss/aux_loss": 0.048074368946254255, "loss/crossentropy": 2.7167088091373444, "loss/logits": 0.8214272201061249, "step": 53920 }, { "epoch": 0.5393, "grad_norm": 15.75, "grad_norm_var": 0.6758951822916667, "learning_rate": 0.0003, "loss": 11.0663, "loss/aux_loss": 0.048074664548039436, "loss/crossentropy": 2.6797154784202575, "loss/logits": 0.8302604794502259, "step": 53930 }, { "epoch": 0.5394, "grad_norm": 14.5625, "grad_norm_var": 0.3733723958333333, "learning_rate": 0.0003, "loss": 11.1273, "loss/aux_loss": 0.04806742053478956, "loss/crossentropy": 2.7210973858833314, "loss/logits": 0.8412497580051422, "step": 53940 }, { "epoch": 0.5395, "grad_norm": 15.0, "grad_norm_var": 0.466259765625, "learning_rate": 0.0003, "loss": 11.0095, "loss/aux_loss": 0.048062196187675, "loss/crossentropy": 2.655183678865433, "loss/logits": 0.8122419893741608, "step": 53950 }, { "epoch": 0.5396, "grad_norm": 15.3125, "grad_norm_var": 0.3477701822916667, "learning_rate": 0.0003, "loss": 11.1166, "loss/aux_loss": 0.04807857759296894, "loss/crossentropy": 2.614110505580902, "loss/logits": 0.8455834567546845, "step": 53960 }, { "epoch": 0.5397, "grad_norm": 14.125, "grad_norm_var": 0.5222493489583333, "learning_rate": 0.0003, "loss": 11.0279, "loss/aux_loss": 0.04806742183864117, "loss/crossentropy": 2.6582097709178925, "loss/logits": 0.8321360021829605, "step": 53970 }, { "epoch": 0.5398, "grad_norm": 15.0, "grad_norm_var": 0.8941243489583334, "learning_rate": 0.0003, "loss": 11.0669, "loss/aux_loss": 0.04807064067572355, "loss/crossentropy": 2.8422482132911684, "loss/logits": 0.8568818151950837, "step": 53980 }, { "epoch": 0.5399, "grad_norm": 13.5625, "grad_norm_var": 0.267041015625, "learning_rate": 0.0003, "loss": 10.8872, "loss/aux_loss": 0.04806562513113022, "loss/crossentropy": 2.64736921787262, "loss/logits": 0.8495258182287216, "step": 53990 }, { "epoch": 0.54, "grad_norm": 14.6875, "grad_norm_var": 0.8567708333333334, "learning_rate": 0.0003, "loss": 11.0104, "loss/aux_loss": 0.048078795336186884, "loss/crossentropy": 2.7528712272644045, "loss/logits": 0.830004358291626, "step": 54000 }, { "epoch": 0.5401, "grad_norm": 14.25, "grad_norm_var": 0.9296875, "learning_rate": 0.0003, "loss": 11.1183, "loss/aux_loss": 0.04806767832487822, "loss/crossentropy": 2.5062259435653687, "loss/logits": 0.8438413232564926, "step": 54010 }, { "epoch": 0.5402, "grad_norm": 14.9375, "grad_norm_var": 0.5441243489583333, "learning_rate": 0.0003, "loss": 10.9066, "loss/aux_loss": 0.048075572960078715, "loss/crossentropy": 2.6077619075775145, "loss/logits": 0.8247323483228683, "step": 54020 }, { "epoch": 0.5403, "grad_norm": 14.1875, "grad_norm_var": 0.43776041666666665, "learning_rate": 0.0003, "loss": 11.0432, "loss/aux_loss": 0.04806858468800783, "loss/crossentropy": 2.5667442798614504, "loss/logits": 0.8107105433940888, "step": 54030 }, { "epoch": 0.5404, "grad_norm": 16.375, "grad_norm_var": 0.65859375, "learning_rate": 0.0003, "loss": 11.0633, "loss/aux_loss": 0.048066397197544575, "loss/crossentropy": 2.6910421431064604, "loss/logits": 0.8274150729179383, "step": 54040 }, { "epoch": 0.5405, "grad_norm": 20.25, "grad_norm_var": 693.61171875, "learning_rate": 0.0003, "loss": 11.1854, "loss/aux_loss": 0.04809403121471405, "loss/crossentropy": 2.966396164894104, "loss/logits": 0.8811068832874298, "step": 54050 }, { "epoch": 0.5406, "grad_norm": 15.5625, "grad_norm_var": 696.5848307291667, "learning_rate": 0.0003, "loss": 10.9561, "loss/aux_loss": 0.048076951503753663, "loss/crossentropy": 2.7103439450263975, "loss/logits": 0.8318052858114242, "step": 54060 }, { "epoch": 0.5407, "grad_norm": 13.9375, "grad_norm_var": 0.6145670572916667, "learning_rate": 0.0003, "loss": 11.0968, "loss/aux_loss": 0.048073592409491536, "loss/crossentropy": 2.802405446767807, "loss/logits": 0.807966560125351, "step": 54070 }, { "epoch": 0.5408, "grad_norm": 14.625, "grad_norm_var": 1.6135416666666667, "learning_rate": 0.0003, "loss": 10.9053, "loss/aux_loss": 0.048079601302742955, "loss/crossentropy": 2.689373391866684, "loss/logits": 0.8208520948886872, "step": 54080 }, { "epoch": 0.5409, "grad_norm": 14.0625, "grad_norm_var": 1.5844889322916667, "learning_rate": 0.0003, "loss": 10.9493, "loss/aux_loss": 0.04807072449475527, "loss/crossentropy": 2.792709541320801, "loss/logits": 0.8486111849546433, "step": 54090 }, { "epoch": 0.541, "grad_norm": 15.6875, "grad_norm_var": 0.526025390625, "learning_rate": 0.0003, "loss": 11.0094, "loss/aux_loss": 0.0480718944221735, "loss/crossentropy": 2.5616662383079527, "loss/logits": 0.8192130953073502, "step": 54100 }, { "epoch": 0.5411, "grad_norm": 15.25, "grad_norm_var": 0.60546875, "learning_rate": 0.0003, "loss": 10.9774, "loss/aux_loss": 0.04807149339467287, "loss/crossentropy": 2.642424190044403, "loss/logits": 0.8189570486545563, "step": 54110 }, { "epoch": 0.5412, "grad_norm": 14.6875, "grad_norm_var": 0.2639973958333333, "learning_rate": 0.0003, "loss": 10.9475, "loss/aux_loss": 0.04808628931641579, "loss/crossentropy": 2.6260932087898254, "loss/logits": 0.8346313923597336, "step": 54120 }, { "epoch": 0.5413, "grad_norm": 13.9375, "grad_norm_var": 0.23170572916666668, "learning_rate": 0.0003, "loss": 11.0016, "loss/aux_loss": 0.04805894047021866, "loss/crossentropy": 2.718409872055054, "loss/logits": 0.8037611931562424, "step": 54130 }, { "epoch": 0.5414, "grad_norm": 16.0, "grad_norm_var": 0.4911295572916667, "learning_rate": 0.0003, "loss": 11.0083, "loss/aux_loss": 0.04807175807654858, "loss/crossentropy": 2.665117746591568, "loss/logits": 0.8265301376581192, "step": 54140 }, { "epoch": 0.5415, "grad_norm": 16.125, "grad_norm_var": 0.982275390625, "learning_rate": 0.0003, "loss": 10.9641, "loss/aux_loss": 0.0480738976970315, "loss/crossentropy": 2.575757938623428, "loss/logits": 0.8114981263875961, "step": 54150 }, { "epoch": 0.5416, "grad_norm": 14.5, "grad_norm_var": 0.6212890625, "learning_rate": 0.0003, "loss": 11.0063, "loss/aux_loss": 0.04806560389697552, "loss/crossentropy": 2.744628429412842, "loss/logits": 0.8167267292737961, "step": 54160 }, { "epoch": 0.5417, "grad_norm": 14.375, "grad_norm_var": 0.365869140625, "learning_rate": 0.0003, "loss": 10.8746, "loss/aux_loss": 0.04807755947113037, "loss/crossentropy": 2.729469120502472, "loss/logits": 0.8389277517795563, "step": 54170 }, { "epoch": 0.5418, "grad_norm": 15.6875, "grad_norm_var": 0.6606608072916667, "learning_rate": 0.0003, "loss": 11.0025, "loss/aux_loss": 0.04807156063616276, "loss/crossentropy": 2.6684438705444338, "loss/logits": 0.8205473870038986, "step": 54180 }, { "epoch": 0.5419, "grad_norm": 15.875, "grad_norm_var": 0.5477701822916666, "learning_rate": 0.0003, "loss": 10.9314, "loss/aux_loss": 0.04807228222489357, "loss/crossentropy": 2.6462816834449767, "loss/logits": 0.8203870177268981, "step": 54190 }, { "epoch": 0.542, "grad_norm": 13.75, "grad_norm_var": 0.851025390625, "learning_rate": 0.0003, "loss": 11.1092, "loss/aux_loss": 0.048070806078612804, "loss/crossentropy": 2.653973418474197, "loss/logits": 0.8173866599798203, "step": 54200 }, { "epoch": 0.5421, "grad_norm": 15.25, "grad_norm_var": 0.3895182291666667, "learning_rate": 0.0003, "loss": 11.0387, "loss/aux_loss": 0.04807880613952875, "loss/crossentropy": 2.8338425755500793, "loss/logits": 0.8203609675168991, "step": 54210 }, { "epoch": 0.5422, "grad_norm": 14.8125, "grad_norm_var": 0.40167643229166666, "learning_rate": 0.0003, "loss": 11.0663, "loss/aux_loss": 0.04807204119861126, "loss/crossentropy": 2.8691022396087646, "loss/logits": 0.8554022014141083, "step": 54220 }, { "epoch": 0.5423, "grad_norm": 15.25, "grad_norm_var": 41.256510416666664, "learning_rate": 0.0003, "loss": 10.8823, "loss/aux_loss": 0.04807537421584129, "loss/crossentropy": 2.7167785286903383, "loss/logits": 0.8512430638074875, "step": 54230 }, { "epoch": 0.5424, "grad_norm": 14.5625, "grad_norm_var": 38.6275390625, "learning_rate": 0.0003, "loss": 10.9108, "loss/aux_loss": 0.04806912895292044, "loss/crossentropy": 2.7357052505016326, "loss/logits": 0.8582530438899993, "step": 54240 }, { "epoch": 0.5425, "grad_norm": 14.6875, "grad_norm_var": 2.011979166666667, "learning_rate": 0.0003, "loss": 11.1132, "loss/aux_loss": 0.048073516227304935, "loss/crossentropy": 2.8648219108581543, "loss/logits": 0.8151409834623337, "step": 54250 }, { "epoch": 0.5426, "grad_norm": 15.0, "grad_norm_var": 2.1770833333333335, "learning_rate": 0.0003, "loss": 11.1463, "loss/aux_loss": 0.0480679165571928, "loss/crossentropy": 2.6788148045539857, "loss/logits": 0.8590417355298996, "step": 54260 }, { "epoch": 0.5427, "grad_norm": 15.375, "grad_norm_var": 0.44998372395833336, "learning_rate": 0.0003, "loss": 11.0944, "loss/aux_loss": 0.04807999115437269, "loss/crossentropy": 2.7842895090579987, "loss/logits": 0.7997796133160591, "step": 54270 }, { "epoch": 0.5428, "grad_norm": 15.9375, "grad_norm_var": 1.0296223958333333, "learning_rate": 0.0003, "loss": 11.0713, "loss/aux_loss": 0.04806948360055685, "loss/crossentropy": 2.6741649389266966, "loss/logits": 0.8174891114234925, "step": 54280 }, { "epoch": 0.5429, "grad_norm": 14.5625, "grad_norm_var": 1.2658854166666667, "learning_rate": 0.0003, "loss": 10.9228, "loss/aux_loss": 0.04806262943893671, "loss/crossentropy": 2.7555585384368895, "loss/logits": 0.8146154910326004, "step": 54290 }, { "epoch": 0.543, "grad_norm": 15.875, "grad_norm_var": 1.404931640625, "learning_rate": 0.0003, "loss": 11.0579, "loss/aux_loss": 0.048083382099866866, "loss/crossentropy": 2.644556665420532, "loss/logits": 0.8286139577627182, "step": 54300 }, { "epoch": 0.5431, "grad_norm": 14.4375, "grad_norm_var": 1.439306640625, "learning_rate": 0.0003, "loss": 11.0906, "loss/aux_loss": 0.04806300513446331, "loss/crossentropy": 2.77071772813797, "loss/logits": 0.8199887424707413, "step": 54310 }, { "epoch": 0.5432, "grad_norm": 14.625, "grad_norm_var": 0.5684895833333333, "learning_rate": 0.0003, "loss": 10.952, "loss/aux_loss": 0.04807317145168781, "loss/crossentropy": 2.7126712799072266, "loss/logits": 0.8230218112468719, "step": 54320 }, { "epoch": 0.5433, "grad_norm": 15.3125, "grad_norm_var": 0.5093587239583334, "learning_rate": 0.0003, "loss": 11.2114, "loss/aux_loss": 0.048068922385573386, "loss/crossentropy": 2.6962040960788727, "loss/logits": 0.8404717385768891, "step": 54330 }, { "epoch": 0.5434, "grad_norm": 13.625, "grad_norm_var": 0.4540201822916667, "learning_rate": 0.0003, "loss": 11.0177, "loss/aux_loss": 0.0480722289532423, "loss/crossentropy": 2.549234163761139, "loss/logits": 0.8165672957897187, "step": 54340 }, { "epoch": 0.5435, "grad_norm": 13.6875, "grad_norm_var": 1.0739420572916667, "learning_rate": 0.0003, "loss": 11.0096, "loss/aux_loss": 0.04807580169290304, "loss/crossentropy": 2.7629685401916504, "loss/logits": 0.8371683716773987, "step": 54350 }, { "epoch": 0.5436, "grad_norm": 15.0, "grad_norm_var": 0.49073893229166665, "learning_rate": 0.0003, "loss": 11.0909, "loss/aux_loss": 0.04806205108761787, "loss/crossentropy": 2.7718708157539367, "loss/logits": 0.8284541577100754, "step": 54360 }, { "epoch": 0.5437, "grad_norm": 14.125, "grad_norm_var": 0.41451822916666664, "learning_rate": 0.0003, "loss": 11.088, "loss/aux_loss": 0.0480658633634448, "loss/crossentropy": 2.6623750627040863, "loss/logits": 0.7974577456712723, "step": 54370 }, { "epoch": 0.5438, "grad_norm": 14.0, "grad_norm_var": 0.5153483072916667, "learning_rate": 0.0003, "loss": 10.9292, "loss/aux_loss": 0.048075420595705506, "loss/crossentropy": 2.6294266045093537, "loss/logits": 0.8390023171901703, "step": 54380 }, { "epoch": 0.5439, "grad_norm": 15.4375, "grad_norm_var": 0.738134765625, "learning_rate": 0.0003, "loss": 10.9715, "loss/aux_loss": 0.04806606397032738, "loss/crossentropy": 2.7456657886505127, "loss/logits": 0.8316713005304337, "step": 54390 }, { "epoch": 0.544, "grad_norm": 15.8125, "grad_norm_var": 0.39191080729166666, "learning_rate": 0.0003, "loss": 11.0328, "loss/aux_loss": 0.048073101229965685, "loss/crossentropy": 2.708446371555328, "loss/logits": 0.8340208530426025, "step": 54400 }, { "epoch": 0.5441, "grad_norm": 17.375, "grad_norm_var": 1.078369140625, "learning_rate": 0.0003, "loss": 10.9475, "loss/aux_loss": 0.04807380214333534, "loss/crossentropy": 2.5974309384822845, "loss/logits": 0.7814567148685455, "step": 54410 }, { "epoch": 0.5442, "grad_norm": 15.0, "grad_norm_var": 1.2375, "learning_rate": 0.0003, "loss": 11.0752, "loss/aux_loss": 0.04807327631860971, "loss/crossentropy": 2.7958995699882507, "loss/logits": 0.852491220831871, "step": 54420 }, { "epoch": 0.5443, "grad_norm": 14.5625, "grad_norm_var": 1.4841145833333333, "learning_rate": 0.0003, "loss": 11.0555, "loss/aux_loss": 0.04806957729160786, "loss/crossentropy": 2.706936830282211, "loss/logits": 0.8366940230131149, "step": 54430 }, { "epoch": 0.5444, "grad_norm": 13.5, "grad_norm_var": 0.29108072916666666, "learning_rate": 0.0003, "loss": 11.0092, "loss/aux_loss": 0.04806767478585243, "loss/crossentropy": 2.5973580896854402, "loss/logits": 0.8097669005393981, "step": 54440 }, { "epoch": 0.5445, "grad_norm": 15.3125, "grad_norm_var": 0.6947265625, "learning_rate": 0.0003, "loss": 10.9494, "loss/aux_loss": 0.04808604661375284, "loss/crossentropy": 2.7189027309417724, "loss/logits": 0.8361554414033889, "step": 54450 }, { "epoch": 0.5446, "grad_norm": 14.5, "grad_norm_var": 2.285660807291667, "learning_rate": 0.0003, "loss": 10.9697, "loss/aux_loss": 0.048063131235539916, "loss/crossentropy": 2.6419017791748045, "loss/logits": 0.8406848013401031, "step": 54460 }, { "epoch": 0.5447, "grad_norm": 15.8125, "grad_norm_var": 0.2712890625, "learning_rate": 0.0003, "loss": 11.0544, "loss/aux_loss": 0.04808263406157494, "loss/crossentropy": 2.768437546491623, "loss/logits": 0.856353759765625, "step": 54470 }, { "epoch": 0.5448, "grad_norm": 14.625, "grad_norm_var": 0.688134765625, "learning_rate": 0.0003, "loss": 10.9724, "loss/aux_loss": 0.04806215986609459, "loss/crossentropy": 2.8292512774467466, "loss/logits": 0.8433954983949661, "step": 54480 }, { "epoch": 0.5449, "grad_norm": 15.625, "grad_norm_var": 0.8994140625, "learning_rate": 0.0003, "loss": 10.8039, "loss/aux_loss": 0.0480883689597249, "loss/crossentropy": 2.797008693218231, "loss/logits": 0.8281289517879487, "step": 54490 }, { "epoch": 0.545, "grad_norm": 14.75, "grad_norm_var": 0.6885416666666667, "learning_rate": 0.0003, "loss": 11.0977, "loss/aux_loss": 0.048079678975045684, "loss/crossentropy": 2.7917538404464723, "loss/logits": 0.7895294100046157, "step": 54500 }, { "epoch": 0.5451, "grad_norm": 14.0625, "grad_norm_var": 0.8801432291666667, "learning_rate": 0.0003, "loss": 11.0203, "loss/aux_loss": 0.04806761741638184, "loss/crossentropy": 2.5670079469680784, "loss/logits": 0.7952559888362885, "step": 54510 }, { "epoch": 0.5452, "grad_norm": 15.4375, "grad_norm_var": 0.3082682291666667, "learning_rate": 0.0003, "loss": 11.051, "loss/aux_loss": 0.04807319939136505, "loss/crossentropy": 2.767977863550186, "loss/logits": 0.8227100253105164, "step": 54520 }, { "epoch": 0.5453, "grad_norm": 15.1875, "grad_norm_var": 0.7530598958333333, "learning_rate": 0.0003, "loss": 11.0836, "loss/aux_loss": 0.04807401541620493, "loss/crossentropy": 2.7236214160919188, "loss/logits": 0.8225501179695129, "step": 54530 }, { "epoch": 0.5454, "grad_norm": 13.75, "grad_norm_var": 1.500634765625, "learning_rate": 0.0003, "loss": 10.9399, "loss/aux_loss": 0.04807001277804375, "loss/crossentropy": 2.706336867809296, "loss/logits": 0.8362784326076508, "step": 54540 }, { "epoch": 0.5455, "grad_norm": 15.3125, "grad_norm_var": 1.7351399739583333, "learning_rate": 0.0003, "loss": 11.1466, "loss/aux_loss": 0.04807394836097956, "loss/crossentropy": 2.6270270586013793, "loss/logits": 0.8055624902248383, "step": 54550 }, { "epoch": 0.5456, "grad_norm": 14.5625, "grad_norm_var": 1.8981608072916667, "learning_rate": 0.0003, "loss": 11.0043, "loss/aux_loss": 0.04806969799101353, "loss/crossentropy": 2.7887719571590424, "loss/logits": 0.8097354710102082, "step": 54560 }, { "epoch": 0.5457, "grad_norm": 16.25, "grad_norm_var": 0.8872395833333333, "learning_rate": 0.0003, "loss": 10.9318, "loss/aux_loss": 0.04807890877127648, "loss/crossentropy": 2.632401758432388, "loss/logits": 0.8157328695058823, "step": 54570 }, { "epoch": 0.5458, "grad_norm": 16.25, "grad_norm_var": 0.6329264322916667, "learning_rate": 0.0003, "loss": 11.043, "loss/aux_loss": 0.04806562662124634, "loss/crossentropy": 2.796691632270813, "loss/logits": 0.8395285964012146, "step": 54580 }, { "epoch": 0.5459, "grad_norm": 14.5625, "grad_norm_var": 1.1449055989583334, "learning_rate": 0.0003, "loss": 10.998, "loss/aux_loss": 0.04808074235916138, "loss/crossentropy": 2.683507615327835, "loss/logits": 0.8001698046922684, "step": 54590 }, { "epoch": 0.546, "grad_norm": 15.875, "grad_norm_var": 2.4734212239583333, "learning_rate": 0.0003, "loss": 11.0751, "loss/aux_loss": 0.04807427860796452, "loss/crossentropy": 2.66720929145813, "loss/logits": 0.8314949810504914, "step": 54600 }, { "epoch": 0.5461, "grad_norm": 14.9375, "grad_norm_var": 1.858837890625, "learning_rate": 0.0003, "loss": 10.9659, "loss/aux_loss": 0.0480766873806715, "loss/crossentropy": 2.564626210927963, "loss/logits": 0.7770012825727463, "step": 54610 }, { "epoch": 0.5462, "grad_norm": 14.125, "grad_norm_var": 0.8690104166666667, "learning_rate": 0.0003, "loss": 10.8087, "loss/aux_loss": 0.04806546028703451, "loss/crossentropy": 2.6509805858135223, "loss/logits": 0.7985016733407975, "step": 54620 }, { "epoch": 0.5463, "grad_norm": 15.0625, "grad_norm_var": 1.0657389322916666, "learning_rate": 0.0003, "loss": 10.9941, "loss/aux_loss": 0.048076178319752215, "loss/crossentropy": 2.7503870487213136, "loss/logits": 0.8250322550535202, "step": 54630 }, { "epoch": 0.5464, "grad_norm": 14.3125, "grad_norm_var": 0.5979166666666667, "learning_rate": 0.0003, "loss": 11.075, "loss/aux_loss": 0.04807407483458519, "loss/crossentropy": 2.688520979881287, "loss/logits": 0.7950679957866669, "step": 54640 }, { "epoch": 0.5465, "grad_norm": 20.25, "grad_norm_var": 12.39765625, "learning_rate": 0.0003, "loss": 10.8744, "loss/aux_loss": 0.04807685688138008, "loss/crossentropy": 2.671421802043915, "loss/logits": 0.8197872430086136, "step": 54650 }, { "epoch": 0.5466, "grad_norm": 14.6875, "grad_norm_var": 2.2609375, "learning_rate": 0.0003, "loss": 10.8834, "loss/aux_loss": 0.048076017200946806, "loss/crossentropy": 2.5632533609867094, "loss/logits": 0.7861254096031189, "step": 54660 }, { "epoch": 0.5467, "grad_norm": 15.1875, "grad_norm_var": 0.87890625, "learning_rate": 0.0003, "loss": 10.9958, "loss/aux_loss": 0.048075830191373826, "loss/crossentropy": 2.669093906879425, "loss/logits": 0.8262115895748139, "step": 54670 }, { "epoch": 0.5468, "grad_norm": 16.5, "grad_norm_var": 1.33984375, "learning_rate": 0.0003, "loss": 10.9333, "loss/aux_loss": 0.04807015471160412, "loss/crossentropy": 2.7327490568161013, "loss/logits": 0.8197944283485412, "step": 54680 }, { "epoch": 0.5469, "grad_norm": 15.6875, "grad_norm_var": 1.074853515625, "learning_rate": 0.0003, "loss": 11.0295, "loss/aux_loss": 0.04807836562395096, "loss/crossentropy": 2.6087993323802947, "loss/logits": 0.806918916106224, "step": 54690 }, { "epoch": 0.547, "grad_norm": 15.125, "grad_norm_var": 0.35930989583333334, "learning_rate": 0.0003, "loss": 11.0181, "loss/aux_loss": 0.04807017408311367, "loss/crossentropy": 2.839758336544037, "loss/logits": 0.8093813061714172, "step": 54700 }, { "epoch": 0.5471, "grad_norm": 15.6875, "grad_norm_var": 0.43670247395833334, "learning_rate": 0.0003, "loss": 10.9142, "loss/aux_loss": 0.04807600136846304, "loss/crossentropy": 2.644270604848862, "loss/logits": 0.7997170180082321, "step": 54710 }, { "epoch": 0.5472, "grad_norm": 15.25, "grad_norm_var": 0.309375, "learning_rate": 0.0003, "loss": 11.1508, "loss/aux_loss": 0.04805862028151751, "loss/crossentropy": 2.7486122012138368, "loss/logits": 0.8468534052371979, "step": 54720 }, { "epoch": 0.5473, "grad_norm": 13.8125, "grad_norm_var": 0.5001139322916667, "learning_rate": 0.0003, "loss": 11.0571, "loss/aux_loss": 0.04807792566716671, "loss/crossentropy": 2.6369648575782776, "loss/logits": 0.8063502162694931, "step": 54730 }, { "epoch": 0.5474, "grad_norm": 15.25, "grad_norm_var": 0.33984375, "learning_rate": 0.0003, "loss": 10.9084, "loss/aux_loss": 0.048066607862710956, "loss/crossentropy": 2.598222774267197, "loss/logits": 0.8337789565324784, "step": 54740 }, { "epoch": 0.5475, "grad_norm": 14.875, "grad_norm_var": 146.435400390625, "learning_rate": 0.0003, "loss": 10.9427, "loss/aux_loss": 0.04807037971913815, "loss/crossentropy": 2.689501368999481, "loss/logits": 0.8253029197454452, "step": 54750 }, { "epoch": 0.5476, "grad_norm": 14.9375, "grad_norm_var": 145.92447916666666, "learning_rate": 0.0003, "loss": 10.929, "loss/aux_loss": 0.048080086894333365, "loss/crossentropy": 2.7107209861278534, "loss/logits": 0.8063658207654953, "step": 54760 }, { "epoch": 0.5477, "grad_norm": 15.5, "grad_norm_var": 0.3714680989583333, "learning_rate": 0.0003, "loss": 11.0567, "loss/aux_loss": 0.048056581430137156, "loss/crossentropy": 2.7536053538322447, "loss/logits": 0.8437339574098587, "step": 54770 }, { "epoch": 0.5478, "grad_norm": 14.125, "grad_norm_var": 0.221728515625, "learning_rate": 0.0003, "loss": 11.0786, "loss/aux_loss": 0.04807734116911888, "loss/crossentropy": 2.624860906600952, "loss/logits": 0.8355853497982025, "step": 54780 }, { "epoch": 0.5479, "grad_norm": 14.75, "grad_norm_var": 0.372119140625, "learning_rate": 0.0003, "loss": 11.0605, "loss/aux_loss": 0.048068249225616456, "loss/crossentropy": 2.6733208775520323, "loss/logits": 0.8279371082782745, "step": 54790 }, { "epoch": 0.548, "grad_norm": 14.1875, "grad_norm_var": 0.5143229166666666, "learning_rate": 0.0003, "loss": 11.0193, "loss/aux_loss": 0.04806514009833336, "loss/crossentropy": 2.83870667219162, "loss/logits": 0.8224406003952026, "step": 54800 }, { "epoch": 0.5481, "grad_norm": 16.875, "grad_norm_var": 0.6921223958333333, "learning_rate": 0.0003, "loss": 11.2115, "loss/aux_loss": 0.048080642521381375, "loss/crossentropy": 2.7003244876861574, "loss/logits": 0.8368692755699157, "step": 54810 }, { "epoch": 0.5482, "grad_norm": 14.5, "grad_norm_var": 1.157275390625, "learning_rate": 0.0003, "loss": 11.0255, "loss/aux_loss": 0.0480611115694046, "loss/crossentropy": 2.7027824997901915, "loss/logits": 0.8393090069293976, "step": 54820 }, { "epoch": 0.5483, "grad_norm": 15.4375, "grad_norm_var": 4.451285807291667, "learning_rate": 0.0003, "loss": 10.9206, "loss/aux_loss": 0.04807600080966949, "loss/crossentropy": 2.8590354561805724, "loss/logits": 0.8446751832962036, "step": 54830 }, { "epoch": 0.5484, "grad_norm": 15.0625, "grad_norm_var": 3.78046875, "learning_rate": 0.0003, "loss": 11.0024, "loss/aux_loss": 0.048070641607046126, "loss/crossentropy": 2.7838706493377687, "loss/logits": 0.8221488207578659, "step": 54840 }, { "epoch": 0.5485, "grad_norm": 14.875, "grad_norm_var": 23.503645833333334, "learning_rate": 0.0003, "loss": 10.9116, "loss/aux_loss": 0.04807809256017208, "loss/crossentropy": 2.7407828688621523, "loss/logits": 0.8634491443634034, "step": 54850 }, { "epoch": 0.5486, "grad_norm": 17.125, "grad_norm_var": 0.9280598958333334, "learning_rate": 0.0003, "loss": 11.0951, "loss/aux_loss": 0.04807487297803163, "loss/crossentropy": 2.6179397821426393, "loss/logits": 0.7829213112592697, "step": 54860 }, { "epoch": 0.5487, "grad_norm": 16.375, "grad_norm_var": 1.118994140625, "learning_rate": 0.0003, "loss": 11.1043, "loss/aux_loss": 0.04806428123265505, "loss/crossentropy": 2.738786405324936, "loss/logits": 0.8392746210098266, "step": 54870 }, { "epoch": 0.5488, "grad_norm": 16.625, "grad_norm_var": 0.9288899739583333, "learning_rate": 0.0003, "loss": 11.0692, "loss/aux_loss": 0.04807651992887259, "loss/crossentropy": 2.739248037338257, "loss/logits": 0.8410652667284012, "step": 54880 }, { "epoch": 0.5489, "grad_norm": 15.125, "grad_norm_var": 0.8481770833333333, "learning_rate": 0.0003, "loss": 11.0996, "loss/aux_loss": 0.048066032119095325, "loss/crossentropy": 2.841780698299408, "loss/logits": 0.8673689156770706, "step": 54890 }, { "epoch": 0.549, "grad_norm": 14.3125, "grad_norm_var": 1.1091145833333333, "learning_rate": 0.0003, "loss": 10.9194, "loss/aux_loss": 0.04806853048503399, "loss/crossentropy": 2.7467273652553557, "loss/logits": 0.8478365898132324, "step": 54900 }, { "epoch": 0.5491, "grad_norm": 14.875, "grad_norm_var": 0.23854166666666668, "learning_rate": 0.0003, "loss": 10.8423, "loss/aux_loss": 0.04806788172572851, "loss/crossentropy": 2.656290876865387, "loss/logits": 0.8057217448949814, "step": 54910 }, { "epoch": 0.5492, "grad_norm": 15.5625, "grad_norm_var": 0.42421875, "learning_rate": 0.0003, "loss": 10.9517, "loss/aux_loss": 0.04808411467820406, "loss/crossentropy": 2.7929548025131226, "loss/logits": 0.8252448886632919, "step": 54920 }, { "epoch": 0.5493, "grad_norm": 15.5625, "grad_norm_var": 0.4051920572916667, "learning_rate": 0.0003, "loss": 10.9819, "loss/aux_loss": 0.0480615908280015, "loss/crossentropy": 2.7801303029060365, "loss/logits": 0.8024150758981705, "step": 54930 }, { "epoch": 0.5494, "grad_norm": 14.125, "grad_norm_var": 0.4627604166666667, "learning_rate": 0.0003, "loss": 10.9448, "loss/aux_loss": 0.04807640910148621, "loss/crossentropy": 2.7152156591415406, "loss/logits": 0.8221557170152665, "step": 54940 }, { "epoch": 0.5495, "grad_norm": 13.25, "grad_norm_var": 0.5196451822916667, "learning_rate": 0.0003, "loss": 11.0243, "loss/aux_loss": 0.04806815255433321, "loss/crossentropy": 2.622380143404007, "loss/logits": 0.8374263972043992, "step": 54950 }, { "epoch": 0.5496, "grad_norm": 15.6875, "grad_norm_var": 106.15402018229166, "learning_rate": 0.0003, "loss": 10.9103, "loss/aux_loss": 0.04807578641921282, "loss/crossentropy": 2.779756152629852, "loss/logits": 0.8186855703592301, "step": 54960 }, { "epoch": 0.5497, "grad_norm": 14.8125, "grad_norm_var": 0.678125, "learning_rate": 0.0003, "loss": 11.0254, "loss/aux_loss": 0.04808486998081207, "loss/crossentropy": 2.5636366307735443, "loss/logits": 0.8077445298433303, "step": 54970 }, { "epoch": 0.5498, "grad_norm": 13.875, "grad_norm_var": 0.73515625, "learning_rate": 0.0003, "loss": 11.012, "loss/aux_loss": 0.04805983640253544, "loss/crossentropy": 2.850145775079727, "loss/logits": 0.8156041219830513, "step": 54980 }, { "epoch": 0.5499, "grad_norm": 15.125, "grad_norm_var": 68.479541015625, "learning_rate": 0.0003, "loss": 10.9674, "loss/aux_loss": 0.0480882341042161, "loss/crossentropy": 2.763186824321747, "loss/logits": 0.8171787321567535, "step": 54990 }, { "epoch": 0.55, "grad_norm": 14.8125, "grad_norm_var": 1.0983723958333333, "learning_rate": 0.0003, "loss": 10.9881, "loss/aux_loss": 0.048068669810891154, "loss/crossentropy": 2.663837069272995, "loss/logits": 0.8430362701416015, "step": 55000 }, { "epoch": 0.5501, "grad_norm": 15.9375, "grad_norm_var": 0.6478515625, "learning_rate": 0.0003, "loss": 11.0946, "loss/aux_loss": 0.048068431206047535, "loss/crossentropy": 2.65292683839798, "loss/logits": 0.8245347827672959, "step": 55010 }, { "epoch": 0.5502, "grad_norm": 15.1875, "grad_norm_var": 0.368994140625, "learning_rate": 0.0003, "loss": 11.0463, "loss/aux_loss": 0.04806177597492933, "loss/crossentropy": 2.7886301994323732, "loss/logits": 0.8723696410655976, "step": 55020 }, { "epoch": 0.5503, "grad_norm": 14.4375, "grad_norm_var": 1.080322265625, "learning_rate": 0.0003, "loss": 11.0383, "loss/aux_loss": 0.04808465298265219, "loss/crossentropy": 2.846861410140991, "loss/logits": 0.8170645713806153, "step": 55030 }, { "epoch": 0.5504, "grad_norm": 17.875, "grad_norm_var": 1.7322265625, "learning_rate": 0.0003, "loss": 11.0391, "loss/aux_loss": 0.04806484617292881, "loss/crossentropy": 2.611069065332413, "loss/logits": 0.7951992452144623, "step": 55040 }, { "epoch": 0.5505, "grad_norm": 15.3125, "grad_norm_var": 1.0645182291666666, "learning_rate": 0.0003, "loss": 11.0523, "loss/aux_loss": 0.048074701242148875, "loss/crossentropy": 2.811970281600952, "loss/logits": 0.794430273771286, "step": 55050 }, { "epoch": 0.5506, "grad_norm": 14.75, "grad_norm_var": 0.6796875, "learning_rate": 0.0003, "loss": 10.9043, "loss/aux_loss": 0.048068948276340964, "loss/crossentropy": 2.664133369922638, "loss/logits": 0.814795833826065, "step": 55060 }, { "epoch": 0.5507, "grad_norm": 15.3125, "grad_norm_var": 0.6744791666666666, "learning_rate": 0.0003, "loss": 11.0856, "loss/aux_loss": 0.04806470796465874, "loss/crossentropy": 2.8181748032569884, "loss/logits": 0.8605926305055618, "step": 55070 }, { "epoch": 0.5508, "grad_norm": 15.3125, "grad_norm_var": 0.50078125, "learning_rate": 0.0003, "loss": 11.0929, "loss/aux_loss": 0.048071750067174436, "loss/crossentropy": 2.6172981381416323, "loss/logits": 0.8222677648067475, "step": 55080 }, { "epoch": 0.5509, "grad_norm": 18.125, "grad_norm_var": 1.0407389322916667, "learning_rate": 0.0003, "loss": 10.8124, "loss/aux_loss": 0.04807625990360975, "loss/crossentropy": 2.448382931947708, "loss/logits": 0.7904783099889755, "step": 55090 }, { "epoch": 0.551, "grad_norm": 14.75, "grad_norm_var": 3.9869140625, "learning_rate": 0.0003, "loss": 11.0645, "loss/aux_loss": 0.04806983452290296, "loss/crossentropy": 2.7576751828193666, "loss/logits": 0.8664533495903015, "step": 55100 }, { "epoch": 0.5511, "grad_norm": 15.875, "grad_norm_var": 0.45818684895833334, "learning_rate": 0.0003, "loss": 10.8939, "loss/aux_loss": 0.04806837178766728, "loss/crossentropy": 2.7311266005039214, "loss/logits": 0.8202107667922973, "step": 55110 }, { "epoch": 0.5512, "grad_norm": 14.75, "grad_norm_var": 0.746728515625, "learning_rate": 0.0003, "loss": 10.8522, "loss/aux_loss": 0.04809461031109095, "loss/crossentropy": 2.481432467699051, "loss/logits": 0.7854719698429108, "step": 55120 }, { "epoch": 0.5513, "grad_norm": 15.8125, "grad_norm_var": 0.5721354166666667, "learning_rate": 0.0003, "loss": 10.9235, "loss/aux_loss": 0.048076366260647777, "loss/crossentropy": 2.6985863506793977, "loss/logits": 0.7947354167699814, "step": 55130 }, { "epoch": 0.5514, "grad_norm": 16.375, "grad_norm_var": 0.8880208333333334, "learning_rate": 0.0003, "loss": 11.1427, "loss/aux_loss": 0.048077909648418425, "loss/crossentropy": 2.8347915410995483, "loss/logits": 0.8606969892978669, "step": 55140 }, { "epoch": 0.5515, "grad_norm": 14.8125, "grad_norm_var": 0.5734375, "learning_rate": 0.0003, "loss": 11.0213, "loss/aux_loss": 0.04807186853140592, "loss/crossentropy": 2.677201247215271, "loss/logits": 0.8178416341543198, "step": 55150 }, { "epoch": 0.5516, "grad_norm": 14.6875, "grad_norm_var": 0.47578125, "learning_rate": 0.0003, "loss": 11.1246, "loss/aux_loss": 0.04808474499732256, "loss/crossentropy": 2.7420763611793517, "loss/logits": 0.8414195388555527, "step": 55160 }, { "epoch": 0.5517, "grad_norm": 15.3125, "grad_norm_var": 0.49933268229166666, "learning_rate": 0.0003, "loss": 11.0815, "loss/aux_loss": 0.0480824813246727, "loss/crossentropy": 2.7030155539512633, "loss/logits": 0.8236714661121368, "step": 55170 }, { "epoch": 0.5518, "grad_norm": 15.4375, "grad_norm_var": 0.470166015625, "learning_rate": 0.0003, "loss": 11.1841, "loss/aux_loss": 0.048062578216195105, "loss/crossentropy": 2.6597807705402374, "loss/logits": 0.8233522891998291, "step": 55180 }, { "epoch": 0.5519, "grad_norm": 15.875, "grad_norm_var": 1.1374837239583333, "learning_rate": 0.0003, "loss": 10.9552, "loss/aux_loss": 0.048083331808447836, "loss/crossentropy": 2.794076269865036, "loss/logits": 0.8059151649475098, "step": 55190 }, { "epoch": 0.552, "grad_norm": 14.6875, "grad_norm_var": 0.7822265625, "learning_rate": 0.0003, "loss": 11.2436, "loss/aux_loss": 0.048071344010531904, "loss/crossentropy": 2.935925805568695, "loss/logits": 0.8763310700654984, "step": 55200 }, { "epoch": 0.5521, "grad_norm": 14.4375, "grad_norm_var": 0.8796875, "learning_rate": 0.0003, "loss": 10.8792, "loss/aux_loss": 0.04806312434375286, "loss/crossentropy": 2.5801034331321717, "loss/logits": 0.7766230911016464, "step": 55210 }, { "epoch": 0.5522, "grad_norm": 16.0, "grad_norm_var": 1.910009765625, "learning_rate": 0.0003, "loss": 11.0385, "loss/aux_loss": 0.048060713522136214, "loss/crossentropy": 2.7574446558952332, "loss/logits": 0.844352638721466, "step": 55220 }, { "epoch": 0.5523, "grad_norm": 14.1875, "grad_norm_var": 1.689697265625, "learning_rate": 0.0003, "loss": 10.8802, "loss/aux_loss": 0.048071770928800106, "loss/crossentropy": 2.5709005653858186, "loss/logits": 0.7922522544860839, "step": 55230 }, { "epoch": 0.5524, "grad_norm": 15.125, "grad_norm_var": 0.2791666666666667, "learning_rate": 0.0003, "loss": 11.0235, "loss/aux_loss": 0.04807364530861378, "loss/crossentropy": 2.7921212911605835, "loss/logits": 0.8340934455394745, "step": 55240 }, { "epoch": 0.5525, "grad_norm": 14.875, "grad_norm_var": 0.3009765625, "learning_rate": 0.0003, "loss": 11.0513, "loss/aux_loss": 0.04807345513254404, "loss/crossentropy": 2.711283278465271, "loss/logits": 0.8268725454807282, "step": 55250 }, { "epoch": 0.5526, "grad_norm": 16.375, "grad_norm_var": 0.49947916666666664, "learning_rate": 0.0003, "loss": 10.8667, "loss/aux_loss": 0.048064783401787284, "loss/crossentropy": 2.5558693051338195, "loss/logits": 0.8117028713226319, "step": 55260 }, { "epoch": 0.5527, "grad_norm": 15.625, "grad_norm_var": 0.506884765625, "learning_rate": 0.0003, "loss": 11.0831, "loss/aux_loss": 0.0480767959728837, "loss/crossentropy": 2.6161147236824034, "loss/logits": 0.8263348460197448, "step": 55270 }, { "epoch": 0.5528, "grad_norm": 15.9375, "grad_norm_var": 0.44453125, "learning_rate": 0.0003, "loss": 11.0553, "loss/aux_loss": 0.04807081706821918, "loss/crossentropy": 2.8348384737968444, "loss/logits": 0.8327891290187835, "step": 55280 }, { "epoch": 0.5529, "grad_norm": 13.75, "grad_norm_var": 0.7016764322916667, "learning_rate": 0.0003, "loss": 11.0743, "loss/aux_loss": 0.04807010293006897, "loss/crossentropy": 2.8104523420333862, "loss/logits": 0.8793832540512085, "step": 55290 }, { "epoch": 0.553, "grad_norm": 15.125, "grad_norm_var": 0.8228515625, "learning_rate": 0.0003, "loss": 10.9962, "loss/aux_loss": 0.04807137455791235, "loss/crossentropy": 2.6533069372177125, "loss/logits": 0.831883293390274, "step": 55300 }, { "epoch": 0.5531, "grad_norm": 15.125, "grad_norm_var": 0.70390625, "learning_rate": 0.0003, "loss": 11.0324, "loss/aux_loss": 0.04808204546570778, "loss/crossentropy": 2.8105109453201296, "loss/logits": 0.8209470868110657, "step": 55310 }, { "epoch": 0.5532, "grad_norm": 14.0, "grad_norm_var": 0.2744140625, "learning_rate": 0.0003, "loss": 11.1437, "loss/aux_loss": 0.04806657768785953, "loss/crossentropy": 2.852742946147919, "loss/logits": 0.8403396517038345, "step": 55320 }, { "epoch": 0.5533, "grad_norm": 14.4375, "grad_norm_var": 0.42473958333333334, "learning_rate": 0.0003, "loss": 10.9109, "loss/aux_loss": 0.04807582776993513, "loss/crossentropy": 2.6064063906669617, "loss/logits": 0.8204812169075012, "step": 55330 }, { "epoch": 0.5534, "grad_norm": 17.25, "grad_norm_var": 0.7844889322916667, "learning_rate": 0.0003, "loss": 10.8695, "loss/aux_loss": 0.04807104654610157, "loss/crossentropy": 2.8387674689292908, "loss/logits": 0.8554284036159515, "step": 55340 }, { "epoch": 0.5535, "grad_norm": 19.25, "grad_norm_var": 1.8325358072916667, "learning_rate": 0.0003, "loss": 10.8966, "loss/aux_loss": 0.048077484220266344, "loss/crossentropy": 2.6154538214206697, "loss/logits": 0.7829844743013382, "step": 55350 }, { "epoch": 0.5536, "grad_norm": 39.5, "grad_norm_var": 38.35558268229167, "learning_rate": 0.0003, "loss": 10.9473, "loss/aux_loss": 0.04806430507451296, "loss/crossentropy": 2.670952570438385, "loss/logits": 0.8368105083703995, "step": 55360 }, { "epoch": 0.5537, "grad_norm": 14.6875, "grad_norm_var": 37.1072265625, "learning_rate": 0.0003, "loss": 11.1894, "loss/aux_loss": 0.04807990249246359, "loss/crossentropy": 2.566202479600906, "loss/logits": 0.8429438978433609, "step": 55370 }, { "epoch": 0.5538, "grad_norm": 15.0, "grad_norm_var": 0.7909993489583333, "learning_rate": 0.0003, "loss": 11.0169, "loss/aux_loss": 0.04807171169668436, "loss/crossentropy": 2.6199849128723143, "loss/logits": 0.7806309968233108, "step": 55380 }, { "epoch": 0.5539, "grad_norm": 14.6875, "grad_norm_var": 0.7916666666666666, "learning_rate": 0.0003, "loss": 11.1617, "loss/aux_loss": 0.04807775840163231, "loss/crossentropy": 2.8030936300754545, "loss/logits": 0.8265924125909805, "step": 55390 }, { "epoch": 0.554, "grad_norm": 14.875, "grad_norm_var": 1.0254557291666666, "learning_rate": 0.0003, "loss": 10.9235, "loss/aux_loss": 0.04806862715631723, "loss/crossentropy": 2.6641399443149565, "loss/logits": 0.8228778213262558, "step": 55400 }, { "epoch": 0.5541, "grad_norm": 14.5, "grad_norm_var": 0.5369140625, "learning_rate": 0.0003, "loss": 11.1513, "loss/aux_loss": 0.04806708451360464, "loss/crossentropy": 2.73874751329422, "loss/logits": 0.8570866554975509, "step": 55410 }, { "epoch": 0.5542, "grad_norm": 14.5, "grad_norm_var": 0.27545572916666666, "learning_rate": 0.0003, "loss": 11.0742, "loss/aux_loss": 0.048080652765929696, "loss/crossentropy": 2.6296676993370056, "loss/logits": 0.8333312928676605, "step": 55420 }, { "epoch": 0.5543, "grad_norm": 15.0, "grad_norm_var": 4.2884765625, "learning_rate": 0.0003, "loss": 10.7601, "loss/aux_loss": 0.04808066971600056, "loss/crossentropy": 2.6608549892902373, "loss/logits": 0.7777025699615479, "step": 55430 }, { "epoch": 0.5544, "grad_norm": 15.1875, "grad_norm_var": 4.875764973958334, "learning_rate": 0.0003, "loss": 10.8756, "loss/aux_loss": 0.04807064030319452, "loss/crossentropy": 2.6703452289104463, "loss/logits": 0.8353601545095444, "step": 55440 }, { "epoch": 0.5545, "grad_norm": 13.5625, "grad_norm_var": 1.1557291666666667, "learning_rate": 0.0003, "loss": 10.8567, "loss/aux_loss": 0.04806992132216692, "loss/crossentropy": 2.8200165271759032, "loss/logits": 0.8091388493776321, "step": 55450 }, { "epoch": 0.5546, "grad_norm": 15.0, "grad_norm_var": 26.2869140625, "learning_rate": 0.0003, "loss": 10.9664, "loss/aux_loss": 0.04806872811168432, "loss/crossentropy": 2.758486533164978, "loss/logits": 0.814564099907875, "step": 55460 }, { "epoch": 0.5547, "grad_norm": 17.25, "grad_norm_var": 24.257666015625, "learning_rate": 0.0003, "loss": 10.9383, "loss/aux_loss": 0.048077772557735446, "loss/crossentropy": 2.850853431224823, "loss/logits": 0.8375491023063659, "step": 55470 }, { "epoch": 0.5548, "grad_norm": 15.4375, "grad_norm_var": 0.7166666666666667, "learning_rate": 0.0003, "loss": 10.9209, "loss/aux_loss": 0.04806488305330277, "loss/crossentropy": 2.7074069380760193, "loss/logits": 0.809591680765152, "step": 55480 }, { "epoch": 0.5549, "grad_norm": 18.25, "grad_norm_var": 115.28430989583333, "learning_rate": 0.0003, "loss": 11.1249, "loss/aux_loss": 0.048068816773593426, "loss/crossentropy": 2.7336994290351866, "loss/logits": 0.8458652794361115, "step": 55490 }, { "epoch": 0.555, "grad_norm": 15.3125, "grad_norm_var": 115.85260416666667, "learning_rate": 0.0003, "loss": 11.0102, "loss/aux_loss": 0.04807518254965544, "loss/crossentropy": 2.773914611339569, "loss/logits": 0.8233480423688888, "step": 55500 }, { "epoch": 0.5551, "grad_norm": 15.125, "grad_norm_var": 2.074934895833333, "learning_rate": 0.0003, "loss": 11.0417, "loss/aux_loss": 0.048073450662195684, "loss/crossentropy": 2.622314327955246, "loss/logits": 0.8316299766302109, "step": 55510 }, { "epoch": 0.5552, "grad_norm": 15.8125, "grad_norm_var": 2.1219889322916665, "learning_rate": 0.0003, "loss": 10.9537, "loss/aux_loss": 0.04807238392531872, "loss/crossentropy": 2.688526248931885, "loss/logits": 0.8633313030004501, "step": 55520 }, { "epoch": 0.5553, "grad_norm": 14.5, "grad_norm_var": 0.8098795572916667, "learning_rate": 0.0003, "loss": 11.1069, "loss/aux_loss": 0.048073047399520875, "loss/crossentropy": 2.9057937622070313, "loss/logits": 0.8318546801805496, "step": 55530 }, { "epoch": 0.5554, "grad_norm": 15.3125, "grad_norm_var": 0.7106608072916667, "learning_rate": 0.0003, "loss": 10.8562, "loss/aux_loss": 0.04806810189038515, "loss/crossentropy": 2.677476871013641, "loss/logits": 0.786837711930275, "step": 55540 }, { "epoch": 0.5555, "grad_norm": 14.6875, "grad_norm_var": 0.3433430989583333, "learning_rate": 0.0003, "loss": 11.1194, "loss/aux_loss": 0.048065618611872196, "loss/crossentropy": 2.7434459567070006, "loss/logits": 0.813059389591217, "step": 55550 }, { "epoch": 0.5556, "grad_norm": 14.5625, "grad_norm_var": 0.3681640625, "learning_rate": 0.0003, "loss": 10.8911, "loss/aux_loss": 0.04808599632233381, "loss/crossentropy": 2.6911366164684294, "loss/logits": 0.8269895523786545, "step": 55560 }, { "epoch": 0.5557, "grad_norm": 14.25, "grad_norm_var": 0.372509765625, "learning_rate": 0.0003, "loss": 10.9757, "loss/aux_loss": 0.04807158019393683, "loss/crossentropy": 2.719972950220108, "loss/logits": 0.8304482787847519, "step": 55570 }, { "epoch": 0.5558, "grad_norm": 14.8125, "grad_norm_var": 1.05, "learning_rate": 0.0003, "loss": 11.0324, "loss/aux_loss": 0.048060659877955915, "loss/crossentropy": 2.758992946147919, "loss/logits": 0.8127716392278671, "step": 55580 }, { "epoch": 0.5559, "grad_norm": 15.6875, "grad_norm_var": 0.48020833333333335, "learning_rate": 0.0003, "loss": 11.1187, "loss/aux_loss": 0.04808644969016314, "loss/crossentropy": 2.6615478575229643, "loss/logits": 0.8186038672924042, "step": 55590 }, { "epoch": 0.556, "grad_norm": 15.0625, "grad_norm_var": 0.7535807291666666, "learning_rate": 0.0003, "loss": 10.984, "loss/aux_loss": 0.04806722085922956, "loss/crossentropy": 2.8301248073577883, "loss/logits": 0.8224711626768112, "step": 55600 }, { "epoch": 0.5561, "grad_norm": 15.6875, "grad_norm_var": 0.9426432291666667, "learning_rate": 0.0003, "loss": 11.0619, "loss/aux_loss": 0.048068858496844766, "loss/crossentropy": 2.652260237932205, "loss/logits": 0.8152152061462402, "step": 55610 }, { "epoch": 0.5562, "grad_norm": 15.9375, "grad_norm_var": 0.6843098958333333, "learning_rate": 0.0003, "loss": 11.0558, "loss/aux_loss": 0.048064957931637765, "loss/crossentropy": 2.8026095151901247, "loss/logits": 0.8398134261369705, "step": 55620 }, { "epoch": 0.5563, "grad_norm": 14.0, "grad_norm_var": 0.9531087239583333, "learning_rate": 0.0003, "loss": 10.9296, "loss/aux_loss": 0.048068616352975366, "loss/crossentropy": 2.6979903995990755, "loss/logits": 0.8080804139375687, "step": 55630 }, { "epoch": 0.5564, "grad_norm": 14.875, "grad_norm_var": 0.63671875, "learning_rate": 0.0003, "loss": 10.895, "loss/aux_loss": 0.04807308129966259, "loss/crossentropy": 2.7710575222969056, "loss/logits": 0.807817280292511, "step": 55640 }, { "epoch": 0.5565, "grad_norm": 15.1875, "grad_norm_var": 0.8942057291666666, "learning_rate": 0.0003, "loss": 10.9262, "loss/aux_loss": 0.04807392563670874, "loss/crossentropy": 2.736243361234665, "loss/logits": 0.8000924259424209, "step": 55650 }, { "epoch": 0.5566, "grad_norm": 14.875, "grad_norm_var": 0.2556640625, "learning_rate": 0.0003, "loss": 10.9558, "loss/aux_loss": 0.04807540960609913, "loss/crossentropy": 2.8422346234321596, "loss/logits": 0.830447968840599, "step": 55660 }, { "epoch": 0.5567, "grad_norm": 15.3125, "grad_norm_var": 5286.068473307291, "learning_rate": 0.0003, "loss": 11.0703, "loss/aux_loss": 0.04807211048901081, "loss/crossentropy": 2.746444511413574, "loss/logits": 0.8070830225944519, "step": 55670 }, { "epoch": 0.5568, "grad_norm": 13.75, "grad_norm_var": 5261.000244140625, "learning_rate": 0.0003, "loss": 10.9093, "loss/aux_loss": 0.048075992986559866, "loss/crossentropy": 2.802550220489502, "loss/logits": 0.8187012434005737, "step": 55680 }, { "epoch": 0.5569, "grad_norm": 16.125, "grad_norm_var": 2.572249348958333, "learning_rate": 0.0003, "loss": 11.0327, "loss/aux_loss": 0.048064975813031194, "loss/crossentropy": 2.5375086605548858, "loss/logits": 0.8208780288696289, "step": 55690 }, { "epoch": 0.557, "grad_norm": 14.5, "grad_norm_var": 0.4688639322916667, "learning_rate": 0.0003, "loss": 11.0094, "loss/aux_loss": 0.04807396829128265, "loss/crossentropy": 2.7012075066566466, "loss/logits": 0.8320501059293747, "step": 55700 }, { "epoch": 0.5571, "grad_norm": 14.25, "grad_norm_var": 0.24140625, "learning_rate": 0.0003, "loss": 11.1335, "loss/aux_loss": 0.048072258941829205, "loss/crossentropy": 2.685439348220825, "loss/logits": 0.8327443897724152, "step": 55710 }, { "epoch": 0.5572, "grad_norm": 14.9375, "grad_norm_var": 0.24166666666666667, "learning_rate": 0.0003, "loss": 11.0821, "loss/aux_loss": 0.0480724660679698, "loss/crossentropy": 2.8291414260864256, "loss/logits": 0.8656550794839859, "step": 55720 }, { "epoch": 0.5573, "grad_norm": 15.0625, "grad_norm_var": 15.460791015625, "learning_rate": 0.0003, "loss": 10.9168, "loss/aux_loss": 0.04806995950639248, "loss/crossentropy": 2.7315680921077727, "loss/logits": 0.7951398670673371, "step": 55730 }, { "epoch": 0.5574, "grad_norm": 14.5625, "grad_norm_var": 0.6952473958333333, "learning_rate": 0.0003, "loss": 11.2082, "loss/aux_loss": 0.048077480867505074, "loss/crossentropy": 2.734374761581421, "loss/logits": 0.8259120464324952, "step": 55740 }, { "epoch": 0.5575, "grad_norm": 16.0, "grad_norm_var": 1.183837890625, "learning_rate": 0.0003, "loss": 11.066, "loss/aux_loss": 0.048074356466531756, "loss/crossentropy": 2.626655274629593, "loss/logits": 0.8067521005868912, "step": 55750 }, { "epoch": 0.5576, "grad_norm": 15.3125, "grad_norm_var": 0.7752604166666667, "learning_rate": 0.0003, "loss": 11.0163, "loss/aux_loss": 0.048075728304684165, "loss/crossentropy": 2.6281754672527313, "loss/logits": 0.8328516259789467, "step": 55760 }, { "epoch": 0.5577, "grad_norm": 15.0, "grad_norm_var": 1.0020182291666666, "learning_rate": 0.0003, "loss": 10.9977, "loss/aux_loss": 0.04805116355419159, "loss/crossentropy": 2.502528029680252, "loss/logits": 0.7761318385601044, "step": 55770 }, { "epoch": 0.5578, "grad_norm": 13.875, "grad_norm_var": 0.8640462239583333, "learning_rate": 0.0003, "loss": 11.0557, "loss/aux_loss": 0.04808572828769684, "loss/crossentropy": 2.792075717449188, "loss/logits": 0.8123959988355637, "step": 55780 }, { "epoch": 0.5579, "grad_norm": 15.4375, "grad_norm_var": 0.5848307291666667, "learning_rate": 0.0003, "loss": 11.0302, "loss/aux_loss": 0.04807162135839462, "loss/crossentropy": 2.764680355787277, "loss/logits": 0.8557851523160934, "step": 55790 }, { "epoch": 0.558, "grad_norm": 14.1875, "grad_norm_var": 0.627197265625, "learning_rate": 0.0003, "loss": 11.0062, "loss/aux_loss": 0.04807290825992823, "loss/crossentropy": 2.6959391951560976, "loss/logits": 0.8017183929681778, "step": 55800 }, { "epoch": 0.5581, "grad_norm": 14.1875, "grad_norm_var": 0.4681640625, "learning_rate": 0.0003, "loss": 10.9007, "loss/aux_loss": 0.04806526694446802, "loss/crossentropy": 2.712996482849121, "loss/logits": 0.8240525692701339, "step": 55810 }, { "epoch": 0.5582, "grad_norm": 21.375, "grad_norm_var": 3.1531087239583333, "learning_rate": 0.0003, "loss": 11.0358, "loss/aux_loss": 0.048070162907242774, "loss/crossentropy": 2.7307373881340027, "loss/logits": 0.82339708507061, "step": 55820 }, { "epoch": 0.5583, "grad_norm": 13.75, "grad_norm_var": 2.9661458333333335, "learning_rate": 0.0003, "loss": 10.8756, "loss/aux_loss": 0.04807821772992611, "loss/crossentropy": 2.8782392740249634, "loss/logits": 0.8153641313314438, "step": 55830 }, { "epoch": 0.5584, "grad_norm": 14.8125, "grad_norm_var": 0.42823893229166665, "learning_rate": 0.0003, "loss": 11.0125, "loss/aux_loss": 0.04807160831987858, "loss/crossentropy": 2.8019445538520813, "loss/logits": 0.8658664226531982, "step": 55840 }, { "epoch": 0.5585, "grad_norm": 15.1875, "grad_norm_var": 0.6179524739583333, "learning_rate": 0.0003, "loss": 11.0465, "loss/aux_loss": 0.0480682285502553, "loss/crossentropy": 2.633160024881363, "loss/logits": 0.8404253333806991, "step": 55850 }, { "epoch": 0.5586, "grad_norm": 14.25, "grad_norm_var": 1.3625, "learning_rate": 0.0003, "loss": 11.0788, "loss/aux_loss": 0.048069980926811695, "loss/crossentropy": 2.977233016490936, "loss/logits": 0.8548012495040893, "step": 55860 }, { "epoch": 0.5587, "grad_norm": 17.5, "grad_norm_var": 1.658837890625, "learning_rate": 0.0003, "loss": 11.0906, "loss/aux_loss": 0.0480623546987772, "loss/crossentropy": 2.668596589565277, "loss/logits": 0.8072352677583694, "step": 55870 }, { "epoch": 0.5588, "grad_norm": 15.1875, "grad_norm_var": 1.0363932291666667, "learning_rate": 0.0003, "loss": 10.9759, "loss/aux_loss": 0.04808857198804617, "loss/crossentropy": 2.6548173129558563, "loss/logits": 0.832793864607811, "step": 55880 }, { "epoch": 0.5589, "grad_norm": 15.1875, "grad_norm_var": 0.35305989583333336, "learning_rate": 0.0003, "loss": 10.9757, "loss/aux_loss": 0.04806632045656443, "loss/crossentropy": 2.6122241616249084, "loss/logits": 0.8252353370189667, "step": 55890 }, { "epoch": 0.559, "grad_norm": 14.3125, "grad_norm_var": 0.826025390625, "learning_rate": 0.0003, "loss": 10.9732, "loss/aux_loss": 0.0480646962299943, "loss/crossentropy": 2.8674940884113314, "loss/logits": 0.8213084667921067, "step": 55900 }, { "epoch": 0.5591, "grad_norm": 14.25, "grad_norm_var": 0.79921875, "learning_rate": 0.0003, "loss": 11.0364, "loss/aux_loss": 0.04807482287287712, "loss/crossentropy": 2.709700071811676, "loss/logits": 0.8265916168689728, "step": 55910 }, { "epoch": 0.5592, "grad_norm": 15.0, "grad_norm_var": 1.2120930989583334, "learning_rate": 0.0003, "loss": 11.0549, "loss/aux_loss": 0.048078625462949276, "loss/crossentropy": 2.725412595272064, "loss/logits": 0.8201213121414185, "step": 55920 }, { "epoch": 0.5593, "grad_norm": 15.4375, "grad_norm_var": 1.197900390625, "learning_rate": 0.0003, "loss": 11.0721, "loss/aux_loss": 0.04807287231087685, "loss/crossentropy": 2.781103193759918, "loss/logits": 0.8102922707796096, "step": 55930 }, { "epoch": 0.5594, "grad_norm": 14.25, "grad_norm_var": 0.6386555989583333, "learning_rate": 0.0003, "loss": 11.234, "loss/aux_loss": 0.04807069655507803, "loss/crossentropy": 2.869676959514618, "loss/logits": 0.8445936232805252, "step": 55940 }, { "epoch": 0.5595, "grad_norm": 14.3125, "grad_norm_var": 0.4103515625, "learning_rate": 0.0003, "loss": 10.9058, "loss/aux_loss": 0.048074489645659926, "loss/crossentropy": 2.652887338399887, "loss/logits": 0.8485498696565628, "step": 55950 }, { "epoch": 0.5596, "grad_norm": 15.0625, "grad_norm_var": 0.6891764322916667, "learning_rate": 0.0003, "loss": 10.8934, "loss/aux_loss": 0.04807630702853203, "loss/crossentropy": 2.62935094833374, "loss/logits": 0.8135641008615494, "step": 55960 }, { "epoch": 0.5597, "grad_norm": 15.6875, "grad_norm_var": 0.805322265625, "learning_rate": 0.0003, "loss": 11.1182, "loss/aux_loss": 0.04807244669646025, "loss/crossentropy": 2.4817294061183928, "loss/logits": 0.804246386885643, "step": 55970 }, { "epoch": 0.5598, "grad_norm": 15.125, "grad_norm_var": 0.7141764322916667, "learning_rate": 0.0003, "loss": 10.9826, "loss/aux_loss": 0.048072634264826775, "loss/crossentropy": 2.8035527586936952, "loss/logits": 0.8370449364185333, "step": 55980 }, { "epoch": 0.5599, "grad_norm": 15.6875, "grad_norm_var": 0.5895670572916667, "learning_rate": 0.0003, "loss": 10.9758, "loss/aux_loss": 0.048080057837069035, "loss/crossentropy": 2.8196861863136293, "loss/logits": 0.8604692459106446, "step": 55990 }, { "epoch": 0.56, "grad_norm": 14.125, "grad_norm_var": 0.3753743489583333, "learning_rate": 0.0003, "loss": 11.119, "loss/aux_loss": 0.04806259609758854, "loss/crossentropy": 2.770169770717621, "loss/logits": 0.8338570713996887, "step": 56000 }, { "epoch": 0.5601, "grad_norm": 16.375, "grad_norm_var": 0.40078125, "learning_rate": 0.0003, "loss": 11.1225, "loss/aux_loss": 0.04806851968169212, "loss/crossentropy": 2.778761512041092, "loss/logits": 0.8658297926187515, "step": 56010 }, { "epoch": 0.5602, "grad_norm": 15.5625, "grad_norm_var": 0.825, "learning_rate": 0.0003, "loss": 11.145, "loss/aux_loss": 0.04807616528123617, "loss/crossentropy": 2.830496996641159, "loss/logits": 0.8245423913002015, "step": 56020 }, { "epoch": 0.5603, "grad_norm": 15.0625, "grad_norm_var": 0.638916015625, "learning_rate": 0.0003, "loss": 10.9945, "loss/aux_loss": 0.04807240832597017, "loss/crossentropy": 2.755543279647827, "loss/logits": 0.8172949731349946, "step": 56030 }, { "epoch": 0.5604, "grad_norm": 15.4375, "grad_norm_var": 0.6825358072916666, "learning_rate": 0.0003, "loss": 10.9785, "loss/aux_loss": 0.04805968664586544, "loss/crossentropy": 2.717133402824402, "loss/logits": 0.8595670849084854, "step": 56040 }, { "epoch": 0.5605, "grad_norm": 14.25, "grad_norm_var": 0.8062337239583334, "learning_rate": 0.0003, "loss": 10.9878, "loss/aux_loss": 0.0480790127068758, "loss/crossentropy": 2.7759326457977296, "loss/logits": 0.8354659885168075, "step": 56050 }, { "epoch": 0.5606, "grad_norm": 14.5625, "grad_norm_var": 0.37381184895833336, "learning_rate": 0.0003, "loss": 10.8017, "loss/aux_loss": 0.048070153221488, "loss/crossentropy": 2.7013581454753877, "loss/logits": 0.8085485100746155, "step": 56060 }, { "epoch": 0.5607, "grad_norm": 13.9375, "grad_norm_var": 0.7577473958333333, "learning_rate": 0.0003, "loss": 10.8808, "loss/aux_loss": 0.04807856027036905, "loss/crossentropy": 2.527774375677109, "loss/logits": 0.8128434181213379, "step": 56070 }, { "epoch": 0.5608, "grad_norm": 14.0, "grad_norm_var": 0.3275390625, "learning_rate": 0.0003, "loss": 11.0026, "loss/aux_loss": 0.04807322099804878, "loss/crossentropy": 2.6217800080776215, "loss/logits": 0.8302909851074218, "step": 56080 }, { "epoch": 0.5609, "grad_norm": 15.0, "grad_norm_var": 0.44803059895833336, "learning_rate": 0.0003, "loss": 10.8046, "loss/aux_loss": 0.04807568434625864, "loss/crossentropy": 2.5421776592731478, "loss/logits": 0.7773946285247803, "step": 56090 }, { "epoch": 0.561, "grad_norm": 15.0, "grad_norm_var": 0.2843098958333333, "learning_rate": 0.0003, "loss": 11.0388, "loss/aux_loss": 0.04807381071150303, "loss/crossentropy": 2.7090347170829774, "loss/logits": 0.8462309181690216, "step": 56100 }, { "epoch": 0.5611, "grad_norm": 15.625, "grad_norm_var": 0.26764322916666666, "learning_rate": 0.0003, "loss": 11.1147, "loss/aux_loss": 0.04806612860411406, "loss/crossentropy": 2.7417237401008605, "loss/logits": 0.8153161972761154, "step": 56110 }, { "epoch": 0.5612, "grad_norm": 15.9375, "grad_norm_var": 0.493603515625, "learning_rate": 0.0003, "loss": 11.0104, "loss/aux_loss": 0.048076927475631234, "loss/crossentropy": 2.685434067249298, "loss/logits": 0.8215235829353332, "step": 56120 }, { "epoch": 0.5613, "grad_norm": 15.0625, "grad_norm_var": 0.543994140625, "learning_rate": 0.0003, "loss": 10.9183, "loss/aux_loss": 0.048070359975099564, "loss/crossentropy": 2.7782336354255674, "loss/logits": 0.8645920783281327, "step": 56130 }, { "epoch": 0.5614, "grad_norm": 15.9375, "grad_norm_var": 0.56875, "learning_rate": 0.0003, "loss": 10.9654, "loss/aux_loss": 0.04806331600993872, "loss/crossentropy": 2.7166395127773284, "loss/logits": 0.835795333981514, "step": 56140 }, { "epoch": 0.5615, "grad_norm": 14.625, "grad_norm_var": 3.842431640625, "learning_rate": 0.0003, "loss": 11.0505, "loss/aux_loss": 0.04807646721601486, "loss/crossentropy": 2.7557631850242617, "loss/logits": 0.8347720831632615, "step": 56150 }, { "epoch": 0.5616, "grad_norm": 14.75, "grad_norm_var": 0.269384765625, "learning_rate": 0.0003, "loss": 11.1227, "loss/aux_loss": 0.04807073958218098, "loss/crossentropy": 2.834517753124237, "loss/logits": 0.8356727987527848, "step": 56160 }, { "epoch": 0.5617, "grad_norm": 14.9375, "grad_norm_var": 0.3465983072916667, "learning_rate": 0.0003, "loss": 10.8892, "loss/aux_loss": 0.048071736469864845, "loss/crossentropy": 2.6536025047302245, "loss/logits": 0.8201006531715394, "step": 56170 }, { "epoch": 0.5618, "grad_norm": 17.0, "grad_norm_var": 0.6214680989583333, "learning_rate": 0.0003, "loss": 11.0144, "loss/aux_loss": 0.048070326820015906, "loss/crossentropy": 2.6506611943244933, "loss/logits": 0.7980666756629944, "step": 56180 }, { "epoch": 0.5619, "grad_norm": 21.5, "grad_norm_var": 3.3739583333333334, "learning_rate": 0.0003, "loss": 11.0292, "loss/aux_loss": 0.048073834739625454, "loss/crossentropy": 2.757190352678299, "loss/logits": 0.8351425707340241, "step": 56190 }, { "epoch": 0.562, "grad_norm": 14.25, "grad_norm_var": 3.118212890625, "learning_rate": 0.0003, "loss": 11.0093, "loss/aux_loss": 0.048075878620147706, "loss/crossentropy": 2.798985254764557, "loss/logits": 0.8397725850343705, "step": 56200 }, { "epoch": 0.5621, "grad_norm": 14.125, "grad_norm_var": 1.3070149739583334, "learning_rate": 0.0003, "loss": 10.9908, "loss/aux_loss": 0.048066343553364275, "loss/crossentropy": 2.794421637058258, "loss/logits": 0.8280640333890915, "step": 56210 }, { "epoch": 0.5622, "grad_norm": 13.875, "grad_norm_var": 0.6166015625, "learning_rate": 0.0003, "loss": 10.968, "loss/aux_loss": 0.04806768260896206, "loss/crossentropy": 2.646075713634491, "loss/logits": 0.8375303894281387, "step": 56220 }, { "epoch": 0.5623, "grad_norm": 15.125, "grad_norm_var": 0.5046223958333333, "learning_rate": 0.0003, "loss": 11.0031, "loss/aux_loss": 0.04807407818734646, "loss/crossentropy": 2.8416967034339904, "loss/logits": 0.8331740826368332, "step": 56230 }, { "epoch": 0.5624, "grad_norm": 15.875, "grad_norm_var": 0.5088541666666667, "learning_rate": 0.0003, "loss": 11.121, "loss/aux_loss": 0.04808174110949039, "loss/crossentropy": 2.7374016523361204, "loss/logits": 0.8004465430974961, "step": 56240 }, { "epoch": 0.5625, "grad_norm": 14.625, "grad_norm_var": 0.8033854166666666, "learning_rate": 0.0003, "loss": 11.0857, "loss/aux_loss": 0.048068351671099664, "loss/crossentropy": 2.691207242012024, "loss/logits": 0.8244406789541244, "step": 56250 }, { "epoch": 0.5626, "grad_norm": 15.25, "grad_norm_var": 0.5994140625, "learning_rate": 0.0003, "loss": 11.033, "loss/aux_loss": 0.048069524578750134, "loss/crossentropy": 2.655339479446411, "loss/logits": 0.8004418700933457, "step": 56260 }, { "epoch": 0.5627, "grad_norm": 16.375, "grad_norm_var": 1.406103515625, "learning_rate": 0.0003, "loss": 10.9853, "loss/aux_loss": 0.04807061068713665, "loss/crossentropy": 2.562644922733307, "loss/logits": 0.7774939149618149, "step": 56270 }, { "epoch": 0.5628, "grad_norm": 14.5, "grad_norm_var": 110.99542643229167, "learning_rate": 0.0003, "loss": 10.9302, "loss/aux_loss": 0.04808583036065102, "loss/crossentropy": 2.7587235629558564, "loss/logits": 0.8342153191566467, "step": 56280 }, { "epoch": 0.5629, "grad_norm": 14.6875, "grad_norm_var": 0.5614420572916666, "learning_rate": 0.0003, "loss": 10.8492, "loss/aux_loss": 0.048062351532280445, "loss/crossentropy": 2.640738385915756, "loss/logits": 0.7902368202805519, "step": 56290 }, { "epoch": 0.563, "grad_norm": 13.9375, "grad_norm_var": 0.862353515625, "learning_rate": 0.0003, "loss": 10.9259, "loss/aux_loss": 0.04808336030691862, "loss/crossentropy": 2.662439024448395, "loss/logits": 0.8200179070234299, "step": 56300 }, { "epoch": 0.5631, "grad_norm": 13.9375, "grad_norm_var": 1.118212890625, "learning_rate": 0.0003, "loss": 11.1451, "loss/aux_loss": 0.048071098141372205, "loss/crossentropy": 2.7258577704429627, "loss/logits": 0.8454255849123001, "step": 56310 }, { "epoch": 0.5632, "grad_norm": 14.25, "grad_norm_var": 0.7738118489583333, "learning_rate": 0.0003, "loss": 10.8717, "loss/aux_loss": 0.048069667629897596, "loss/crossentropy": 2.721261328458786, "loss/logits": 0.8305182576179504, "step": 56320 }, { "epoch": 0.5633, "grad_norm": 14.3125, "grad_norm_var": 0.325, "learning_rate": 0.0003, "loss": 10.9718, "loss/aux_loss": 0.04807541910558939, "loss/crossentropy": 2.828034371137619, "loss/logits": 0.8110349535942077, "step": 56330 }, { "epoch": 0.5634, "grad_norm": 13.75, "grad_norm_var": 0.1484375, "learning_rate": 0.0003, "loss": 10.8651, "loss/aux_loss": 0.04806641507893801, "loss/crossentropy": 2.6425141513347628, "loss/logits": 0.7947331488132476, "step": 56340 }, { "epoch": 0.5635, "grad_norm": 13.75, "grad_norm_var": 13.531103515625, "learning_rate": 0.0003, "loss": 10.824, "loss/aux_loss": 0.048073524795472625, "loss/crossentropy": 2.704647868871689, "loss/logits": 0.7970446825027466, "step": 56350 }, { "epoch": 0.5636, "grad_norm": 14.4375, "grad_norm_var": 0.30618489583333336, "learning_rate": 0.0003, "loss": 10.8263, "loss/aux_loss": 0.04807068221271038, "loss/crossentropy": 2.7439758598804476, "loss/logits": 0.8404293477535247, "step": 56360 }, { "epoch": 0.5637, "grad_norm": 14.125, "grad_norm_var": 0.9722493489583334, "learning_rate": 0.0003, "loss": 11.0797, "loss/aux_loss": 0.048076880536973475, "loss/crossentropy": 2.848537635803223, "loss/logits": 0.8467506438493728, "step": 56370 }, { "epoch": 0.5638, "grad_norm": 15.75, "grad_norm_var": 0.7387858072916667, "learning_rate": 0.0003, "loss": 11.0897, "loss/aux_loss": 0.04807664547115564, "loss/crossentropy": 2.8234737038612367, "loss/logits": 0.8553803592920304, "step": 56380 }, { "epoch": 0.5639, "grad_norm": 14.5, "grad_norm_var": 0.567431640625, "learning_rate": 0.0003, "loss": 11.0941, "loss/aux_loss": 0.048065589554607865, "loss/crossentropy": 2.7501831650733948, "loss/logits": 0.8351290255784989, "step": 56390 }, { "epoch": 0.564, "grad_norm": 18.125, "grad_norm_var": 1.2236979166666666, "learning_rate": 0.0003, "loss": 11.1896, "loss/aux_loss": 0.04807354472577572, "loss/crossentropy": 2.709191882610321, "loss/logits": 0.848085030913353, "step": 56400 }, { "epoch": 0.5641, "grad_norm": 15.875, "grad_norm_var": 0.8799479166666667, "learning_rate": 0.0003, "loss": 10.8941, "loss/aux_loss": 0.04807153381407261, "loss/crossentropy": 2.7052394211292268, "loss/logits": 0.8169119179248809, "step": 56410 }, { "epoch": 0.5642, "grad_norm": 17.0, "grad_norm_var": 2.4324055989583333, "learning_rate": 0.0003, "loss": 10.9911, "loss/aux_loss": 0.04807809721678495, "loss/crossentropy": 2.7568470358848574, "loss/logits": 0.8337729841470718, "step": 56420 }, { "epoch": 0.5643, "grad_norm": 14.1875, "grad_norm_var": 3.5363118489583334, "learning_rate": 0.0003, "loss": 10.8475, "loss/aux_loss": 0.04806798957288265, "loss/crossentropy": 2.55110359787941, "loss/logits": 0.7923869863152504, "step": 56430 }, { "epoch": 0.5644, "grad_norm": 16.625, "grad_norm_var": 3.0268229166666667, "learning_rate": 0.0003, "loss": 11.0289, "loss/aux_loss": 0.048070752806961534, "loss/crossentropy": 2.7474220752716065, "loss/logits": 0.8456249058246612, "step": 56440 }, { "epoch": 0.5645, "grad_norm": 15.75, "grad_norm_var": 2.623893229166667, "learning_rate": 0.0003, "loss": 11.0403, "loss/aux_loss": 0.04807250145822763, "loss/crossentropy": 2.650127410888672, "loss/logits": 0.8189398646354675, "step": 56450 }, { "epoch": 0.5646, "grad_norm": 14.8125, "grad_norm_var": 0.6281087239583333, "learning_rate": 0.0003, "loss": 10.9889, "loss/aux_loss": 0.048068339750170705, "loss/crossentropy": 2.6610859453678133, "loss/logits": 0.7993488103151322, "step": 56460 }, { "epoch": 0.5647, "grad_norm": 14.625, "grad_norm_var": 0.537353515625, "learning_rate": 0.0003, "loss": 11.0154, "loss/aux_loss": 0.04807689357548952, "loss/crossentropy": 2.7062296152114866, "loss/logits": 0.8192767605185509, "step": 56470 }, { "epoch": 0.5648, "grad_norm": 15.5, "grad_norm_var": 0.7515462239583334, "learning_rate": 0.0003, "loss": 11.0572, "loss/aux_loss": 0.048062050342559816, "loss/crossentropy": 2.8266763508319857, "loss/logits": 0.8351973295211792, "step": 56480 }, { "epoch": 0.5649, "grad_norm": 14.125, "grad_norm_var": 0.8270182291666667, "learning_rate": 0.0003, "loss": 11.0404, "loss/aux_loss": 0.04807958882302046, "loss/crossentropy": 2.7233566522598265, "loss/logits": 0.8333428800106049, "step": 56490 }, { "epoch": 0.565, "grad_norm": 14.1875, "grad_norm_var": 0.41456705729166665, "learning_rate": 0.0003, "loss": 11.015, "loss/aux_loss": 0.04806462060660124, "loss/crossentropy": 2.6568702876567842, "loss/logits": 0.840363684296608, "step": 56500 }, { "epoch": 0.5651, "grad_norm": 15.1875, "grad_norm_var": 1.0598307291666667, "learning_rate": 0.0003, "loss": 11.1722, "loss/aux_loss": 0.048068604059517385, "loss/crossentropy": 2.863558900356293, "loss/logits": 0.8404663354158401, "step": 56510 }, { "epoch": 0.5652, "grad_norm": 14.0625, "grad_norm_var": 1.366650390625, "learning_rate": 0.0003, "loss": 10.9801, "loss/aux_loss": 0.048066033609211446, "loss/crossentropy": 2.7827521324157716, "loss/logits": 0.833331236243248, "step": 56520 }, { "epoch": 0.5653, "grad_norm": 16.25, "grad_norm_var": 0.7650390625, "learning_rate": 0.0003, "loss": 11.0661, "loss/aux_loss": 0.04807133264839649, "loss/crossentropy": 2.691127985715866, "loss/logits": 0.8125263452529907, "step": 56530 }, { "epoch": 0.5654, "grad_norm": 14.3125, "grad_norm_var": 0.5718587239583334, "learning_rate": 0.0003, "loss": 11.1224, "loss/aux_loss": 0.04806945119053126, "loss/crossentropy": 2.7853653192520142, "loss/logits": 0.8257082641124726, "step": 56540 }, { "epoch": 0.5655, "grad_norm": 15.375, "grad_norm_var": 0.29270833333333335, "learning_rate": 0.0003, "loss": 10.965, "loss/aux_loss": 0.048084007762372497, "loss/crossentropy": 2.676611590385437, "loss/logits": 0.8134146988391876, "step": 56550 }, { "epoch": 0.5656, "grad_norm": 14.3125, "grad_norm_var": 2.330322265625, "learning_rate": 0.0003, "loss": 10.794, "loss/aux_loss": 0.048067791387438774, "loss/crossentropy": 2.7615798473358155, "loss/logits": 0.8393901348114013, "step": 56560 }, { "epoch": 0.5657, "grad_norm": 14.3125, "grad_norm_var": 1.1707682291666666, "learning_rate": 0.0003, "loss": 10.9696, "loss/aux_loss": 0.0480693681165576, "loss/crossentropy": 2.7393906354904174, "loss/logits": 0.8108802825212479, "step": 56570 }, { "epoch": 0.5658, "grad_norm": 15.875, "grad_norm_var": 1.6731770833333333, "learning_rate": 0.0003, "loss": 10.9976, "loss/aux_loss": 0.04807717688381672, "loss/crossentropy": 2.7219885349273683, "loss/logits": 0.8048550575971604, "step": 56580 }, { "epoch": 0.5659, "grad_norm": 14.875, "grad_norm_var": 1.264306640625, "learning_rate": 0.0003, "loss": 11.093, "loss/aux_loss": 0.04807864520698786, "loss/crossentropy": 2.777973675727844, "loss/logits": 0.8606278628110886, "step": 56590 }, { "epoch": 0.566, "grad_norm": 14.125, "grad_norm_var": 1.4953125, "learning_rate": 0.0003, "loss": 10.868, "loss/aux_loss": 0.04806201551109553, "loss/crossentropy": 2.5385043144226076, "loss/logits": 0.7936165243387222, "step": 56600 }, { "epoch": 0.5661, "grad_norm": 14.1875, "grad_norm_var": 0.692041015625, "learning_rate": 0.0003, "loss": 10.9311, "loss/aux_loss": 0.04808430373668671, "loss/crossentropy": 2.6752517938613893, "loss/logits": 0.794241589307785, "step": 56610 }, { "epoch": 0.5662, "grad_norm": 14.375, "grad_norm_var": 1.6005208333333334, "learning_rate": 0.0003, "loss": 10.7976, "loss/aux_loss": 0.04808196313679218, "loss/crossentropy": 2.622132194042206, "loss/logits": 0.7990513414144516, "step": 56620 }, { "epoch": 0.5663, "grad_norm": 15.25, "grad_norm_var": 1.3752604166666667, "learning_rate": 0.0003, "loss": 10.9685, "loss/aux_loss": 0.048063672706484796, "loss/crossentropy": 2.659779739379883, "loss/logits": 0.8321389853954315, "step": 56630 }, { "epoch": 0.5664, "grad_norm": 16.0, "grad_norm_var": 0.3322265625, "learning_rate": 0.0003, "loss": 11.0019, "loss/aux_loss": 0.04806230738759041, "loss/crossentropy": 2.7045338630676268, "loss/logits": 0.8264268547296524, "step": 56640 }, { "epoch": 0.5665, "grad_norm": 14.4375, "grad_norm_var": 0.6581868489583333, "learning_rate": 0.0003, "loss": 11.0197, "loss/aux_loss": 0.04807612672448158, "loss/crossentropy": 2.6564504504203796, "loss/logits": 0.7988866597414017, "step": 56650 }, { "epoch": 0.5666, "grad_norm": 14.625, "grad_norm_var": 0.8681640625, "learning_rate": 0.0003, "loss": 10.7875, "loss/aux_loss": 0.04807793591171503, "loss/crossentropy": 2.573316812515259, "loss/logits": 0.7921032071113586, "step": 56660 }, { "epoch": 0.5667, "grad_norm": 15.5625, "grad_norm_var": 0.5376139322916667, "learning_rate": 0.0003, "loss": 10.9263, "loss/aux_loss": 0.048057892732322215, "loss/crossentropy": 2.730033391714096, "loss/logits": 0.8079028069972992, "step": 56670 }, { "epoch": 0.5668, "grad_norm": 15.0, "grad_norm_var": 0.26536458333333335, "learning_rate": 0.0003, "loss": 11.0131, "loss/aux_loss": 0.04807726927101612, "loss/crossentropy": 2.7096143126487733, "loss/logits": 0.8181146889925003, "step": 56680 }, { "epoch": 0.5669, "grad_norm": 15.0625, "grad_norm_var": 0.4228515625, "learning_rate": 0.0003, "loss": 11.098, "loss/aux_loss": 0.04807647932320833, "loss/crossentropy": 2.642156887054443, "loss/logits": 0.8269279479980469, "step": 56690 }, { "epoch": 0.567, "grad_norm": 13.9375, "grad_norm_var": 0.33123372395833334, "learning_rate": 0.0003, "loss": 10.9675, "loss/aux_loss": 0.04807464182376862, "loss/crossentropy": 2.8210769176483153, "loss/logits": 0.83407823741436, "step": 56700 }, { "epoch": 0.5671, "grad_norm": 15.4375, "grad_norm_var": 0.3556640625, "learning_rate": 0.0003, "loss": 11.1748, "loss/aux_loss": 0.04806642550975084, "loss/crossentropy": 2.749396449327469, "loss/logits": 0.8253662884235382, "step": 56710 }, { "epoch": 0.5672, "grad_norm": 14.75, "grad_norm_var": 0.515087890625, "learning_rate": 0.0003, "loss": 10.9303, "loss/aux_loss": 0.04807874038815498, "loss/crossentropy": 2.8592591881752014, "loss/logits": 0.8499416679143905, "step": 56720 }, { "epoch": 0.5673, "grad_norm": 15.5, "grad_norm_var": 0.5817545572916667, "learning_rate": 0.0003, "loss": 11.0301, "loss/aux_loss": 0.048067976161837576, "loss/crossentropy": 2.7235675573349, "loss/logits": 0.8350800782442093, "step": 56730 }, { "epoch": 0.5674, "grad_norm": 15.6875, "grad_norm_var": 0.468212890625, "learning_rate": 0.0003, "loss": 10.9828, "loss/aux_loss": 0.04807393439114094, "loss/crossentropy": 2.7318145632743835, "loss/logits": 0.8563040405511856, "step": 56740 }, { "epoch": 0.5675, "grad_norm": 14.1875, "grad_norm_var": 2.0502604166666667, "learning_rate": 0.0003, "loss": 10.915, "loss/aux_loss": 0.04806574210524559, "loss/crossentropy": 2.855338990688324, "loss/logits": 0.851107832789421, "step": 56750 }, { "epoch": 0.5676, "grad_norm": 16.625, "grad_norm_var": 2.186181640625, "learning_rate": 0.0003, "loss": 11.0244, "loss/aux_loss": 0.04807633981108665, "loss/crossentropy": 2.7718964219093323, "loss/logits": 0.8375044643878937, "step": 56760 }, { "epoch": 0.5677, "grad_norm": 16.0, "grad_norm_var": 2.228645833333333, "learning_rate": 0.0003, "loss": 11.0663, "loss/aux_loss": 0.048062844574451445, "loss/crossentropy": 2.8246702313423158, "loss/logits": 0.8107487201690674, "step": 56770 }, { "epoch": 0.5678, "grad_norm": 14.75, "grad_norm_var": 0.7020670572916666, "learning_rate": 0.0003, "loss": 11.0445, "loss/aux_loss": 0.0480653140693903, "loss/crossentropy": 2.789826810359955, "loss/logits": 0.8652868360280991, "step": 56780 }, { "epoch": 0.5679, "grad_norm": 14.625, "grad_norm_var": 0.5175618489583333, "learning_rate": 0.0003, "loss": 10.8587, "loss/aux_loss": 0.048073895275592804, "loss/crossentropy": 2.729355055093765, "loss/logits": 0.7821523636579514, "step": 56790 }, { "epoch": 0.568, "grad_norm": 14.8125, "grad_norm_var": 0.551416015625, "learning_rate": 0.0003, "loss": 11.0657, "loss/aux_loss": 0.04806792289018631, "loss/crossentropy": 2.722504496574402, "loss/logits": 0.8235153377056121, "step": 56800 }, { "epoch": 0.5681, "grad_norm": 15.0, "grad_norm_var": 0.220166015625, "learning_rate": 0.0003, "loss": 11.0569, "loss/aux_loss": 0.04807557370513678, "loss/crossentropy": 2.571262764930725, "loss/logits": 0.8196977347135543, "step": 56810 }, { "epoch": 0.5682, "grad_norm": 14.8125, "grad_norm_var": 0.18292643229166666, "learning_rate": 0.0003, "loss": 10.9235, "loss/aux_loss": 0.04806826990097761, "loss/crossentropy": 2.7095551788806915, "loss/logits": 0.8231742322444916, "step": 56820 }, { "epoch": 0.5683, "grad_norm": 15.3125, "grad_norm_var": 0.6325358072916667, "learning_rate": 0.0003, "loss": 11.0522, "loss/aux_loss": 0.04808192327618599, "loss/crossentropy": 2.724127823114395, "loss/logits": 0.7891089856624603, "step": 56830 }, { "epoch": 0.5684, "grad_norm": 15.375, "grad_norm_var": 0.35859375, "learning_rate": 0.0003, "loss": 11.1465, "loss/aux_loss": 0.04806146658957004, "loss/crossentropy": 2.758617115020752, "loss/logits": 0.8087145060300827, "step": 56840 }, { "epoch": 0.5685, "grad_norm": 13.5625, "grad_norm_var": 0.4038899739583333, "learning_rate": 0.0003, "loss": 10.8962, "loss/aux_loss": 0.048076235502958295, "loss/crossentropy": 2.6616262257099152, "loss/logits": 0.8008765608072281, "step": 56850 }, { "epoch": 0.5686, "grad_norm": 14.625, "grad_norm_var": 0.36822916666666666, "learning_rate": 0.0003, "loss": 10.8288, "loss/aux_loss": 0.04807582087814808, "loss/crossentropy": 2.8255065202713014, "loss/logits": 0.8107618898153305, "step": 56860 }, { "epoch": 0.5687, "grad_norm": 14.8125, "grad_norm_var": 0.33396809895833335, "learning_rate": 0.0003, "loss": 10.976, "loss/aux_loss": 0.04806512389332056, "loss/crossentropy": 2.8445683240890505, "loss/logits": 0.8330892562866211, "step": 56870 }, { "epoch": 0.5688, "grad_norm": 15.1875, "grad_norm_var": 0.38795572916666665, "learning_rate": 0.0003, "loss": 11.077, "loss/aux_loss": 0.04806285053491592, "loss/crossentropy": 2.6184718787670134, "loss/logits": 0.8130939185619355, "step": 56880 }, { "epoch": 0.5689, "grad_norm": 14.8125, "grad_norm_var": 0.265478515625, "learning_rate": 0.0003, "loss": 11.112, "loss/aux_loss": 0.04806987438350916, "loss/crossentropy": 2.666793406009674, "loss/logits": 0.8218275606632233, "step": 56890 }, { "epoch": 0.569, "grad_norm": 14.9375, "grad_norm_var": 0.2703125, "learning_rate": 0.0003, "loss": 10.9284, "loss/aux_loss": 0.0480709794908762, "loss/crossentropy": 2.6845811307430267, "loss/logits": 0.8159762293100357, "step": 56900 }, { "epoch": 0.5691, "grad_norm": 14.4375, "grad_norm_var": 0.38748372395833336, "learning_rate": 0.0003, "loss": 11.0256, "loss/aux_loss": 0.04806566461920738, "loss/crossentropy": 2.7073962688446045, "loss/logits": 0.815559196472168, "step": 56910 }, { "epoch": 0.5692, "grad_norm": 16.125, "grad_norm_var": 0.6726399739583333, "learning_rate": 0.0003, "loss": 10.9962, "loss/aux_loss": 0.04807677231729031, "loss/crossentropy": 2.6567338943481444, "loss/logits": 0.7852911531925202, "step": 56920 }, { "epoch": 0.5693, "grad_norm": 13.25, "grad_norm_var": 0.807275390625, "learning_rate": 0.0003, "loss": 10.8256, "loss/aux_loss": 0.04806611649692059, "loss/crossentropy": 2.900643491744995, "loss/logits": 0.8667346566915513, "step": 56930 }, { "epoch": 0.5694, "grad_norm": 14.25, "grad_norm_var": 0.8233723958333333, "learning_rate": 0.0003, "loss": 10.9697, "loss/aux_loss": 0.048079632222652435, "loss/crossentropy": 2.892075502872467, "loss/logits": 0.8571143001317978, "step": 56940 }, { "epoch": 0.5695, "grad_norm": 14.625, "grad_norm_var": 0.5994140625, "learning_rate": 0.0003, "loss": 10.9265, "loss/aux_loss": 0.048065055161714554, "loss/crossentropy": 2.6348765909671785, "loss/logits": 0.8346069097518921, "step": 56950 }, { "epoch": 0.5696, "grad_norm": 14.125, "grad_norm_var": 0.25983072916666666, "learning_rate": 0.0003, "loss": 10.9799, "loss/aux_loss": 0.04807092547416687, "loss/crossentropy": 2.802864468097687, "loss/logits": 0.8114332973957061, "step": 56960 }, { "epoch": 0.5697, "grad_norm": 15.75, "grad_norm_var": 0.6822265625, "learning_rate": 0.0003, "loss": 10.8788, "loss/aux_loss": 0.04807887505739927, "loss/crossentropy": 2.7381537735462187, "loss/logits": 0.87372607588768, "step": 56970 }, { "epoch": 0.5698, "grad_norm": 13.9375, "grad_norm_var": 0.7661295572916667, "learning_rate": 0.0003, "loss": 10.9834, "loss/aux_loss": 0.04806207437068224, "loss/crossentropy": 2.7998313903808594, "loss/logits": 0.8182542502880097, "step": 56980 }, { "epoch": 0.5699, "grad_norm": 16.125, "grad_norm_var": 0.699462890625, "learning_rate": 0.0003, "loss": 11.0665, "loss/aux_loss": 0.048081908747553824, "loss/crossentropy": 2.635771578550339, "loss/logits": 0.7996685534715653, "step": 56990 }, { "epoch": 0.57, "grad_norm": 13.9375, "grad_norm_var": 0.766650390625, "learning_rate": 0.0003, "loss": 10.9328, "loss/aux_loss": 0.04807494562119245, "loss/crossentropy": 2.6950223565101625, "loss/logits": 0.8105741649866104, "step": 57000 }, { "epoch": 0.5701, "grad_norm": 15.75, "grad_norm_var": 0.7155598958333333, "learning_rate": 0.0003, "loss": 11.0767, "loss/aux_loss": 0.0480563260614872, "loss/crossentropy": 2.711246186494827, "loss/logits": 0.8292560011148453, "step": 57010 }, { "epoch": 0.5702, "grad_norm": 15.5625, "grad_norm_var": 0.5749348958333333, "learning_rate": 0.0003, "loss": 10.8529, "loss/aux_loss": 0.048080886527895925, "loss/crossentropy": 2.688188964128494, "loss/logits": 0.8574258774518967, "step": 57020 }, { "epoch": 0.5703, "grad_norm": 14.0625, "grad_norm_var": 0.40545247395833334, "learning_rate": 0.0003, "loss": 10.9149, "loss/aux_loss": 0.04806966222822666, "loss/crossentropy": 2.668558394908905, "loss/logits": 0.813389179110527, "step": 57030 }, { "epoch": 0.5704, "grad_norm": 14.875, "grad_norm_var": 0.4353515625, "learning_rate": 0.0003, "loss": 11.0002, "loss/aux_loss": 0.04807461742311716, "loss/crossentropy": 2.6643282949924467, "loss/logits": 0.7867618024349212, "step": 57040 }, { "epoch": 0.5705, "grad_norm": 15.9375, "grad_norm_var": 0.31886393229166665, "learning_rate": 0.0003, "loss": 10.9509, "loss/aux_loss": 0.04807612039148808, "loss/crossentropy": 2.7182164669036863, "loss/logits": 0.778824046254158, "step": 57050 }, { "epoch": 0.5706, "grad_norm": 15.875, "grad_norm_var": 0.5471354166666667, "learning_rate": 0.0003, "loss": 10.9733, "loss/aux_loss": 0.04806810449808836, "loss/crossentropy": 2.8615632176399233, "loss/logits": 0.8570520609617234, "step": 57060 }, { "epoch": 0.5707, "grad_norm": 14.625, "grad_norm_var": 0.9356770833333333, "learning_rate": 0.0003, "loss": 10.8977, "loss/aux_loss": 0.04807794988155365, "loss/crossentropy": 2.7868527293205263, "loss/logits": 0.8298997163772583, "step": 57070 }, { "epoch": 0.5708, "grad_norm": 15.1875, "grad_norm_var": 0.87734375, "learning_rate": 0.0003, "loss": 11.081, "loss/aux_loss": 0.0480685269460082, "loss/crossentropy": 2.646866476535797, "loss/logits": 0.8291843563318253, "step": 57080 }, { "epoch": 0.5709, "grad_norm": 14.6875, "grad_norm_var": 0.625634765625, "learning_rate": 0.0003, "loss": 10.9265, "loss/aux_loss": 0.0480671152472496, "loss/crossentropy": 2.781124436855316, "loss/logits": 0.8275115400552749, "step": 57090 }, { "epoch": 0.571, "grad_norm": 15.1875, "grad_norm_var": 1.0968098958333334, "learning_rate": 0.0003, "loss": 11.0086, "loss/aux_loss": 0.04807842988520861, "loss/crossentropy": 2.722087186574936, "loss/logits": 0.8378143638372422, "step": 57100 }, { "epoch": 0.5711, "grad_norm": 13.9375, "grad_norm_var": 1.2960774739583334, "learning_rate": 0.0003, "loss": 10.8447, "loss/aux_loss": 0.04806256033480168, "loss/crossentropy": 2.69550861120224, "loss/logits": 0.8196415692567826, "step": 57110 }, { "epoch": 0.5712, "grad_norm": 15.125, "grad_norm_var": 0.9403483072916666, "learning_rate": 0.0003, "loss": 10.9062, "loss/aux_loss": 0.0480593366548419, "loss/crossentropy": 2.659187990427017, "loss/logits": 0.7925233572721482, "step": 57120 }, { "epoch": 0.5713, "grad_norm": 15.3125, "grad_norm_var": 0.5572265625, "learning_rate": 0.0003, "loss": 10.9287, "loss/aux_loss": 0.04807809740304947, "loss/crossentropy": 2.5628524363040923, "loss/logits": 0.778043681383133, "step": 57130 }, { "epoch": 0.5714, "grad_norm": 17.875, "grad_norm_var": 0.793212890625, "learning_rate": 0.0003, "loss": 10.9902, "loss/aux_loss": 0.04807734601199627, "loss/crossentropy": 2.6852267503738405, "loss/logits": 0.821107491850853, "step": 57140 }, { "epoch": 0.5715, "grad_norm": 23.625, "grad_norm_var": 5.084879557291667, "learning_rate": 0.0003, "loss": 10.9945, "loss/aux_loss": 0.04806005675345659, "loss/crossentropy": 2.670514500141144, "loss/logits": 0.8100097209215165, "step": 57150 }, { "epoch": 0.5716, "grad_norm": 14.4375, "grad_norm_var": 4.920817057291667, "learning_rate": 0.0003, "loss": 11.2069, "loss/aux_loss": 0.04807403068989515, "loss/crossentropy": 2.765077519416809, "loss/logits": 0.8135815739631653, "step": 57160 }, { "epoch": 0.5717, "grad_norm": 15.0625, "grad_norm_var": 2.030322265625, "learning_rate": 0.0003, "loss": 10.9361, "loss/aux_loss": 0.04807031713426113, "loss/crossentropy": 2.7000171720981596, "loss/logits": 0.7982536077499389, "step": 57170 }, { "epoch": 0.5718, "grad_norm": 14.8125, "grad_norm_var": 8.3837890625, "learning_rate": 0.0003, "loss": 10.9202, "loss/aux_loss": 0.04807793851941824, "loss/crossentropy": 2.6715080082416534, "loss/logits": 0.800389638543129, "step": 57180 }, { "epoch": 0.5719, "grad_norm": 15.0625, "grad_norm_var": 7.883968098958333, "learning_rate": 0.0003, "loss": 11.1442, "loss/aux_loss": 0.04807151965796948, "loss/crossentropy": 2.6992808401584627, "loss/logits": 0.8161318123340606, "step": 57190 }, { "epoch": 0.572, "grad_norm": 14.9375, "grad_norm_var": 0.543603515625, "learning_rate": 0.0003, "loss": 11.0557, "loss/aux_loss": 0.04805929586291313, "loss/crossentropy": 2.7212381601333617, "loss/logits": 0.8454442709684372, "step": 57200 }, { "epoch": 0.5721, "grad_norm": 15.375, "grad_norm_var": 0.40358072916666665, "learning_rate": 0.0003, "loss": 11.0428, "loss/aux_loss": 0.04806900396943092, "loss/crossentropy": 2.807091176509857, "loss/logits": 0.84793541431427, "step": 57210 }, { "epoch": 0.5722, "grad_norm": 15.0625, "grad_norm_var": 0.598681640625, "learning_rate": 0.0003, "loss": 10.9167, "loss/aux_loss": 0.04806927982717753, "loss/crossentropy": 2.9018397092819215, "loss/logits": 0.8026500940322876, "step": 57220 }, { "epoch": 0.5723, "grad_norm": 15.8125, "grad_norm_var": 0.5054524739583334, "learning_rate": 0.0003, "loss": 10.9284, "loss/aux_loss": 0.04806585274636745, "loss/crossentropy": 2.7474361181259157, "loss/logits": 0.7843928277492523, "step": 57230 }, { "epoch": 0.5724, "grad_norm": 14.0625, "grad_norm_var": 0.6785807291666667, "learning_rate": 0.0003, "loss": 11.0105, "loss/aux_loss": 0.04807022046297789, "loss/crossentropy": 2.684603381156921, "loss/logits": 0.8285898119211197, "step": 57240 }, { "epoch": 0.5725, "grad_norm": 15.0625, "grad_norm_var": 0.7791015625, "learning_rate": 0.0003, "loss": 10.8737, "loss/aux_loss": 0.04808129519224167, "loss/crossentropy": 2.704842007160187, "loss/logits": 0.8189653396606446, "step": 57250 }, { "epoch": 0.5726, "grad_norm": 15.625, "grad_norm_var": 0.44264322916666665, "learning_rate": 0.0003, "loss": 11.11, "loss/aux_loss": 0.048068471066653726, "loss/crossentropy": 2.755719757080078, "loss/logits": 0.8350825071334839, "step": 57260 }, { "epoch": 0.5727, "grad_norm": 16.125, "grad_norm_var": 0.5872395833333334, "learning_rate": 0.0003, "loss": 11.0595, "loss/aux_loss": 0.04805658888071775, "loss/crossentropy": 2.747038698196411, "loss/logits": 0.8029045939445496, "step": 57270 }, { "epoch": 0.5728, "grad_norm": 15.625, "grad_norm_var": 0.9292805989583334, "learning_rate": 0.0003, "loss": 11.097, "loss/aux_loss": 0.04807424712926149, "loss/crossentropy": 2.8873910784721373, "loss/logits": 0.8244173586368561, "step": 57280 }, { "epoch": 0.5729, "grad_norm": 13.6875, "grad_norm_var": 1.3277180989583333, "learning_rate": 0.0003, "loss": 10.763, "loss/aux_loss": 0.04808657988905907, "loss/crossentropy": 2.5384365618228912, "loss/logits": 0.8188546657562256, "step": 57290 }, { "epoch": 0.573, "grad_norm": 14.3125, "grad_norm_var": 0.25310872395833334, "learning_rate": 0.0003, "loss": 10.909, "loss/aux_loss": 0.04805942717939615, "loss/crossentropy": 2.7145915269851684, "loss/logits": 0.7983001649379731, "step": 57300 }, { "epoch": 0.5731, "grad_norm": 16.5, "grad_norm_var": 0.5681640625, "learning_rate": 0.0003, "loss": 10.949, "loss/aux_loss": 0.04808266796171665, "loss/crossentropy": 2.680216872692108, "loss/logits": 0.8321155905723572, "step": 57310 }, { "epoch": 0.5732, "grad_norm": 14.75, "grad_norm_var": 0.4596354166666667, "learning_rate": 0.0003, "loss": 11.0807, "loss/aux_loss": 0.0480640958994627, "loss/crossentropy": 2.772093391418457, "loss/logits": 0.8285915166139602, "step": 57320 }, { "epoch": 0.5733, "grad_norm": 15.375, "grad_norm_var": 0.4398274739583333, "learning_rate": 0.0003, "loss": 11.0067, "loss/aux_loss": 0.048067055828869346, "loss/crossentropy": 2.7100286722183227, "loss/logits": 0.8241377651691437, "step": 57330 }, { "epoch": 0.5734, "grad_norm": 14.75, "grad_norm_var": 1.8723307291666667, "learning_rate": 0.0003, "loss": 10.9345, "loss/aux_loss": 0.04808278437703848, "loss/crossentropy": 2.6721641540527346, "loss/logits": 0.8055184870958328, "step": 57340 }, { "epoch": 0.5735, "grad_norm": 18.625, "grad_norm_var": 2.445817057291667, "learning_rate": 0.0003, "loss": 10.886, "loss/aux_loss": 0.04806376602500677, "loss/crossentropy": 2.6517118215560913, "loss/logits": 0.8045364022254944, "step": 57350 }, { "epoch": 0.5736, "grad_norm": 13.875, "grad_norm_var": 1.1486979166666667, "learning_rate": 0.0003, "loss": 10.9331, "loss/aux_loss": 0.048079241439700125, "loss/crossentropy": 2.7825541257858277, "loss/logits": 0.8218899816274643, "step": 57360 }, { "epoch": 0.5737, "grad_norm": 15.6875, "grad_norm_var": 85.743212890625, "learning_rate": 0.0003, "loss": 10.9565, "loss/aux_loss": 0.0480652479454875, "loss/crossentropy": 2.707579892873764, "loss/logits": 0.82884761095047, "step": 57370 }, { "epoch": 0.5738, "grad_norm": 14.75, "grad_norm_var": 91.01847330729167, "learning_rate": 0.0003, "loss": 11.0795, "loss/aux_loss": 0.048074961826205256, "loss/crossentropy": 2.7413926482200623, "loss/logits": 0.7939124822616577, "step": 57380 }, { "epoch": 0.5739, "grad_norm": 14.75, "grad_norm_var": 2.615738932291667, "learning_rate": 0.0003, "loss": 10.9845, "loss/aux_loss": 0.04806408416479826, "loss/crossentropy": 2.707012790441513, "loss/logits": 0.830555847287178, "step": 57390 }, { "epoch": 0.574, "grad_norm": 13.875, "grad_norm_var": 0.516259765625, "learning_rate": 0.0003, "loss": 11.0791, "loss/aux_loss": 0.048079000785946846, "loss/crossentropy": 2.5664061307907104, "loss/logits": 0.7763120234012604, "step": 57400 }, { "epoch": 0.5741, "grad_norm": 14.6875, "grad_norm_var": 0.56484375, "learning_rate": 0.0003, "loss": 11.137, "loss/aux_loss": 0.04807056300342083, "loss/crossentropy": 2.758346974849701, "loss/logits": 0.8242935687303543, "step": 57410 }, { "epoch": 0.5742, "grad_norm": 14.6875, "grad_norm_var": 0.2916666666666667, "learning_rate": 0.0003, "loss": 10.9833, "loss/aux_loss": 0.04807440787553787, "loss/crossentropy": 2.6768109679222105, "loss/logits": 0.811660248041153, "step": 57420 }, { "epoch": 0.5743, "grad_norm": 15.3125, "grad_norm_var": 0.4571451822916667, "learning_rate": 0.0003, "loss": 11.1252, "loss/aux_loss": 0.0480594988912344, "loss/crossentropy": 2.7542240738868715, "loss/logits": 0.8283898085355759, "step": 57430 }, { "epoch": 0.5744, "grad_norm": 15.3125, "grad_norm_var": 0.39576822916666665, "learning_rate": 0.0003, "loss": 10.9843, "loss/aux_loss": 0.04808159470558167, "loss/crossentropy": 2.7614540815353394, "loss/logits": 0.8680014103651047, "step": 57440 }, { "epoch": 0.5745, "grad_norm": 14.6875, "grad_norm_var": 1.1231770833333334, "learning_rate": 0.0003, "loss": 10.8306, "loss/aux_loss": 0.04807989429682493, "loss/crossentropy": 2.7385359168052674, "loss/logits": 0.8156585484743119, "step": 57450 }, { "epoch": 0.5746, "grad_norm": 15.1875, "grad_norm_var": 0.28326822916666666, "learning_rate": 0.0003, "loss": 10.9561, "loss/aux_loss": 0.04805928226560354, "loss/crossentropy": 2.669804847240448, "loss/logits": 0.8256619513034821, "step": 57460 }, { "epoch": 0.5747, "grad_norm": 14.9375, "grad_norm_var": 0.14889322916666667, "learning_rate": 0.0003, "loss": 10.9695, "loss/aux_loss": 0.04807625375688076, "loss/crossentropy": 2.615860992670059, "loss/logits": 0.8644401401281356, "step": 57470 }, { "epoch": 0.5748, "grad_norm": 50.25, "grad_norm_var": 77.43899739583334, "learning_rate": 0.0003, "loss": 11.2276, "loss/aux_loss": 0.0480669941753149, "loss/crossentropy": 2.826398515701294, "loss/logits": 0.8551149964332581, "step": 57480 }, { "epoch": 0.5749, "grad_norm": 14.8125, "grad_norm_var": 76.96354166666667, "learning_rate": 0.0003, "loss": 11.0055, "loss/aux_loss": 0.04806830994784832, "loss/crossentropy": 2.7592093706130982, "loss/logits": 0.8087594985961915, "step": 57490 }, { "epoch": 0.575, "grad_norm": 14.8125, "grad_norm_var": 0.399072265625, "learning_rate": 0.0003, "loss": 11.0602, "loss/aux_loss": 0.04806661587208509, "loss/crossentropy": 2.6397600889205934, "loss/logits": 0.7925887256860733, "step": 57500 }, { "epoch": 0.5751, "grad_norm": 15.25, "grad_norm_var": 0.46990559895833334, "learning_rate": 0.0003, "loss": 11.14, "loss/aux_loss": 0.04806430134922266, "loss/crossentropy": 2.7477360010147094, "loss/logits": 0.8559922903776169, "step": 57510 }, { "epoch": 0.5752, "grad_norm": 14.375, "grad_norm_var": 1.5181640625, "learning_rate": 0.0003, "loss": 11.0349, "loss/aux_loss": 0.04808305911719799, "loss/crossentropy": 2.5889110445976256, "loss/logits": 0.8387425035238266, "step": 57520 }, { "epoch": 0.5753, "grad_norm": 15.875, "grad_norm_var": 1.505322265625, "learning_rate": 0.0003, "loss": 10.9867, "loss/aux_loss": 0.048064771480858325, "loss/crossentropy": 2.609954422712326, "loss/logits": 0.8136496782302857, "step": 57530 }, { "epoch": 0.5754, "grad_norm": 15.8125, "grad_norm_var": 0.9311848958333333, "learning_rate": 0.0003, "loss": 11.0806, "loss/aux_loss": 0.04807123206555843, "loss/crossentropy": 2.7172460675239565, "loss/logits": 0.8387424349784851, "step": 57540 }, { "epoch": 0.5755, "grad_norm": 14.1875, "grad_norm_var": 0.79453125, "learning_rate": 0.0003, "loss": 11.023, "loss/aux_loss": 0.04807838406413793, "loss/crossentropy": 2.702912151813507, "loss/logits": 0.8372643262147903, "step": 57550 }, { "epoch": 0.5756, "grad_norm": 15.8125, "grad_norm_var": 0.3731770833333333, "learning_rate": 0.0003, "loss": 10.9375, "loss/aux_loss": 0.04806549474596977, "loss/crossentropy": 2.909746289253235, "loss/logits": 0.8345662504434586, "step": 57560 }, { "epoch": 0.5757, "grad_norm": 15.125, "grad_norm_var": 0.7208333333333333, "learning_rate": 0.0003, "loss": 10.9115, "loss/aux_loss": 0.04808139894157648, "loss/crossentropy": 2.558688461780548, "loss/logits": 0.8080873370170594, "step": 57570 }, { "epoch": 0.5758, "grad_norm": 15.4375, "grad_norm_var": 0.42355143229166664, "learning_rate": 0.0003, "loss": 10.9426, "loss/aux_loss": 0.04807103350758553, "loss/crossentropy": 2.7293295919895173, "loss/logits": 0.8171383291482925, "step": 57580 }, { "epoch": 0.5759, "grad_norm": 16.0, "grad_norm_var": 1.1202473958333334, "learning_rate": 0.0003, "loss": 11.0332, "loss/aux_loss": 0.04807331319898367, "loss/crossentropy": 2.7175457954406737, "loss/logits": 0.8126596748828888, "step": 57590 }, { "epoch": 0.576, "grad_norm": 16.0, "grad_norm_var": 1.250634765625, "learning_rate": 0.0003, "loss": 11.0771, "loss/aux_loss": 0.04807139728218317, "loss/crossentropy": 2.728803825378418, "loss/logits": 0.8120762914419174, "step": 57600 }, { "epoch": 0.5761, "grad_norm": 14.1875, "grad_norm_var": 0.6979166666666666, "learning_rate": 0.0003, "loss": 10.973, "loss/aux_loss": 0.04807168822735548, "loss/crossentropy": 2.6952412009239195, "loss/logits": 0.8491257846355438, "step": 57610 }, { "epoch": 0.5762, "grad_norm": 15.125, "grad_norm_var": 0.345556640625, "learning_rate": 0.0003, "loss": 10.8365, "loss/aux_loss": 0.048068418726325036, "loss/crossentropy": 2.651057040691376, "loss/logits": 0.7869156956672668, "step": 57620 }, { "epoch": 0.5763, "grad_norm": 22.0, "grad_norm_var": 3.711962890625, "learning_rate": 0.0003, "loss": 11.1586, "loss/aux_loss": 0.04807181134819984, "loss/crossentropy": 2.6818241477012634, "loss/logits": 0.8666929543018341, "step": 57630 }, { "epoch": 0.5764, "grad_norm": 15.1875, "grad_norm_var": 3.421875, "learning_rate": 0.0003, "loss": 11.0666, "loss/aux_loss": 0.0480666371062398, "loss/crossentropy": 2.6690493881702424, "loss/logits": 0.8045336902141571, "step": 57640 }, { "epoch": 0.5765, "grad_norm": 15.5, "grad_norm_var": 0.502978515625, "learning_rate": 0.0003, "loss": 11.1792, "loss/aux_loss": 0.04807822220027447, "loss/crossentropy": 2.7619189381599427, "loss/logits": 0.8294977605342865, "step": 57650 }, { "epoch": 0.5766, "grad_norm": 13.875, "grad_norm_var": 0.6133951822916667, "learning_rate": 0.0003, "loss": 10.9134, "loss/aux_loss": 0.04808087293058634, "loss/crossentropy": 2.7703096151351927, "loss/logits": 0.7943071156740189, "step": 57660 }, { "epoch": 0.5767, "grad_norm": 15.8125, "grad_norm_var": 0.8546223958333333, "learning_rate": 0.0003, "loss": 10.9094, "loss/aux_loss": 0.048060201853513715, "loss/crossentropy": 2.537974363565445, "loss/logits": 0.7927771121263504, "step": 57670 }, { "epoch": 0.5768, "grad_norm": 15.875, "grad_norm_var": 0.7843098958333333, "learning_rate": 0.0003, "loss": 10.9885, "loss/aux_loss": 0.04805659111589193, "loss/crossentropy": 2.7599482774734496, "loss/logits": 0.8290839821100235, "step": 57680 }, { "epoch": 0.5769, "grad_norm": 15.5, "grad_norm_var": 92.83125, "learning_rate": 0.0003, "loss": 11.1651, "loss/aux_loss": 0.04809475895017386, "loss/crossentropy": 2.85399044752121, "loss/logits": 0.8747380167245865, "step": 57690 }, { "epoch": 0.577, "grad_norm": 16.375, "grad_norm_var": 42.48951822916667, "learning_rate": 0.0003, "loss": 11.2927, "loss/aux_loss": 0.048062573187053204, "loss/crossentropy": 2.8536964416503907, "loss/logits": 0.8355174720287323, "step": 57700 }, { "epoch": 0.5771, "grad_norm": 15.4375, "grad_norm_var": 0.37890625, "learning_rate": 0.0003, "loss": 11.0133, "loss/aux_loss": 0.04806618671864271, "loss/crossentropy": 2.658688408136368, "loss/logits": 0.8183623373508453, "step": 57710 }, { "epoch": 0.5772, "grad_norm": 15.0, "grad_norm_var": 0.47076822916666666, "learning_rate": 0.0003, "loss": 10.9248, "loss/aux_loss": 0.04806567393243313, "loss/crossentropy": 2.682706815004349, "loss/logits": 0.8114593774080276, "step": 57720 }, { "epoch": 0.5773, "grad_norm": 14.9375, "grad_norm_var": 0.7301432291666666, "learning_rate": 0.0003, "loss": 10.7614, "loss/aux_loss": 0.04807316064834595, "loss/crossentropy": 2.6830021500587464, "loss/logits": 0.8217417180538178, "step": 57730 }, { "epoch": 0.5774, "grad_norm": 14.375, "grad_norm_var": 0.46848958333333335, "learning_rate": 0.0003, "loss": 10.8117, "loss/aux_loss": 0.048069142177700995, "loss/crossentropy": 2.680797153711319, "loss/logits": 0.7941523939371109, "step": 57740 }, { "epoch": 0.5775, "grad_norm": 14.8125, "grad_norm_var": 0.4583333333333333, "learning_rate": 0.0003, "loss": 10.8973, "loss/aux_loss": 0.048070931993424895, "loss/crossentropy": 2.7513445258140563, "loss/logits": 0.8205919414758682, "step": 57750 }, { "epoch": 0.5776, "grad_norm": 14.5, "grad_norm_var": 0.47537434895833336, "learning_rate": 0.0003, "loss": 11.0942, "loss/aux_loss": 0.04808807913213968, "loss/crossentropy": 2.708397227525711, "loss/logits": 0.7808063089847564, "step": 57760 }, { "epoch": 0.5777, "grad_norm": 15.3125, "grad_norm_var": 0.385791015625, "learning_rate": 0.0003, "loss": 10.9057, "loss/aux_loss": 0.04805845711380243, "loss/crossentropy": 2.7761632323265077, "loss/logits": 0.8206240832805634, "step": 57770 }, { "epoch": 0.5778, "grad_norm": 15.3125, "grad_norm_var": 0.806494140625, "learning_rate": 0.0003, "loss": 10.9635, "loss/aux_loss": 0.048069931007921694, "loss/crossentropy": 2.7127468466758726, "loss/logits": 0.8321121394634247, "step": 57780 }, { "epoch": 0.5779, "grad_norm": 15.3125, "grad_norm_var": 0.6773274739583334, "learning_rate": 0.0003, "loss": 10.8796, "loss/aux_loss": 0.048075957037508485, "loss/crossentropy": 2.5557093918323517, "loss/logits": 0.7811422199010849, "step": 57790 }, { "epoch": 0.578, "grad_norm": 15.4375, "grad_norm_var": 0.46087239583333334, "learning_rate": 0.0003, "loss": 11.0337, "loss/aux_loss": 0.048079724051058294, "loss/crossentropy": 2.7828991770744325, "loss/logits": 0.8361575275659561, "step": 57800 }, { "epoch": 0.5781, "grad_norm": 15.375, "grad_norm_var": 0.5171223958333333, "learning_rate": 0.0003, "loss": 10.7984, "loss/aux_loss": 0.04807369504123926, "loss/crossentropy": 2.767413020133972, "loss/logits": 0.7979224413633347, "step": 57810 }, { "epoch": 0.5782, "grad_norm": 15.5625, "grad_norm_var": 12.863997395833334, "learning_rate": 0.0003, "loss": 11.0843, "loss/aux_loss": 0.04807633981108665, "loss/crossentropy": 2.841790997982025, "loss/logits": 0.8295446068048478, "step": 57820 }, { "epoch": 0.5783, "grad_norm": 14.4375, "grad_norm_var": 11.975374348958333, "learning_rate": 0.0003, "loss": 11.0327, "loss/aux_loss": 0.04806151837110519, "loss/crossentropy": 2.8679856061935425, "loss/logits": 0.863958340883255, "step": 57830 }, { "epoch": 0.5784, "grad_norm": 14.1875, "grad_norm_var": 0.6630208333333333, "learning_rate": 0.0003, "loss": 10.7655, "loss/aux_loss": 0.04806194268167019, "loss/crossentropy": 2.6133798182010652, "loss/logits": 0.8144498199224472, "step": 57840 }, { "epoch": 0.5785, "grad_norm": 14.625, "grad_norm_var": 0.43020833333333336, "learning_rate": 0.0003, "loss": 10.9158, "loss/aux_loss": 0.04808299690485, "loss/crossentropy": 2.5752854347229004, "loss/logits": 0.7725576773285866, "step": 57850 }, { "epoch": 0.5786, "grad_norm": 14.4375, "grad_norm_var": 0.30078125, "learning_rate": 0.0003, "loss": 11.1358, "loss/aux_loss": 0.04805977363139391, "loss/crossentropy": 2.6507094621658327, "loss/logits": 0.8261604458093643, "step": 57860 }, { "epoch": 0.5787, "grad_norm": 14.6875, "grad_norm_var": 0.7885416666666667, "learning_rate": 0.0003, "loss": 10.9286, "loss/aux_loss": 0.04806450437754393, "loss/crossentropy": 2.7155093371868135, "loss/logits": 0.8360859841108322, "step": 57870 }, { "epoch": 0.5788, "grad_norm": 14.75, "grad_norm_var": 2.076546223958333, "learning_rate": 0.0003, "loss": 11.0901, "loss/aux_loss": 0.048075138591229916, "loss/crossentropy": 2.7140918552875517, "loss/logits": 0.8228471457958222, "step": 57880 }, { "epoch": 0.5789, "grad_norm": 14.75, "grad_norm_var": 2.1946451822916666, "learning_rate": 0.0003, "loss": 11.0579, "loss/aux_loss": 0.048060869611799714, "loss/crossentropy": 2.7325907826423643, "loss/logits": 0.8381363540887833, "step": 57890 }, { "epoch": 0.579, "grad_norm": 14.75, "grad_norm_var": 0.9061848958333333, "learning_rate": 0.0003, "loss": 10.9924, "loss/aux_loss": 0.048078179731965064, "loss/crossentropy": 2.738635867834091, "loss/logits": 0.8476099342107772, "step": 57900 }, { "epoch": 0.5791, "grad_norm": 15.0, "grad_norm_var": 1.1124837239583334, "learning_rate": 0.0003, "loss": 10.9735, "loss/aux_loss": 0.048069071024656296, "loss/crossentropy": 2.8026002764701845, "loss/logits": 0.8490731894969941, "step": 57910 }, { "epoch": 0.5792, "grad_norm": 14.3125, "grad_norm_var": 0.7389973958333333, "learning_rate": 0.0003, "loss": 11.034, "loss/aux_loss": 0.04806493632495403, "loss/crossentropy": 2.6233414888381956, "loss/logits": 0.8417493313550949, "step": 57920 }, { "epoch": 0.5793, "grad_norm": 15.9375, "grad_norm_var": 0.7728515625, "learning_rate": 0.0003, "loss": 10.9387, "loss/aux_loss": 0.04806803483515978, "loss/crossentropy": 2.729952883720398, "loss/logits": 0.8270312875509263, "step": 57930 }, { "epoch": 0.5794, "grad_norm": 17.875, "grad_norm_var": 2.8348795572916665, "learning_rate": 0.0003, "loss": 11.1579, "loss/aux_loss": 0.04808615278452635, "loss/crossentropy": 2.733921545743942, "loss/logits": 0.8263924434781075, "step": 57940 }, { "epoch": 0.5795, "grad_norm": 14.5, "grad_norm_var": 0.916650390625, "learning_rate": 0.0003, "loss": 11.0477, "loss/aux_loss": 0.04806936271488667, "loss/crossentropy": 2.6277839660644533, "loss/logits": 0.8033677011728286, "step": 57950 }, { "epoch": 0.5796, "grad_norm": 15.125, "grad_norm_var": 0.39837239583333334, "learning_rate": 0.0003, "loss": 10.9133, "loss/aux_loss": 0.048060805164277555, "loss/crossentropy": 2.694873237609863, "loss/logits": 0.8217334061861038, "step": 57960 }, { "epoch": 0.5797, "grad_norm": 14.125, "grad_norm_var": 0.5139973958333334, "learning_rate": 0.0003, "loss": 10.9741, "loss/aux_loss": 0.04808518867939711, "loss/crossentropy": 2.6820975124835966, "loss/logits": 0.8232692778110504, "step": 57970 }, { "epoch": 0.5798, "grad_norm": 14.0, "grad_norm_var": 2.2020833333333334, "learning_rate": 0.0003, "loss": 11.1821, "loss/aux_loss": 0.04804998859763145, "loss/crossentropy": 2.7280581176280974, "loss/logits": 0.817136037349701, "step": 57980 }, { "epoch": 0.5799, "grad_norm": 16.625, "grad_norm_var": 3.3268229166666665, "learning_rate": 0.0003, "loss": 11.0193, "loss/aux_loss": 0.048069480992853644, "loss/crossentropy": 2.6904157042503356, "loss/logits": 0.8245778560638428, "step": 57990 }, { "epoch": 0.58, "grad_norm": 15.875, "grad_norm_var": 0.616259765625, "learning_rate": 0.0003, "loss": 11.0416, "loss/aux_loss": 0.048078110441565514, "loss/crossentropy": 2.7497189164161684, "loss/logits": 0.8141031920909881, "step": 58000 }, { "epoch": 0.5801, "grad_norm": 14.6875, "grad_norm_var": 0.6343587239583334, "learning_rate": 0.0003, "loss": 10.8494, "loss/aux_loss": 0.048049984686076644, "loss/crossentropy": 2.655291825532913, "loss/logits": 0.7856258243322373, "step": 58010 }, { "epoch": 0.5802, "grad_norm": 14.75, "grad_norm_var": 2.2038899739583333, "learning_rate": 0.0003, "loss": 11.0884, "loss/aux_loss": 0.048076943308115, "loss/crossentropy": 2.606997859477997, "loss/logits": 0.8418209999799728, "step": 58020 }, { "epoch": 0.5803, "grad_norm": 14.875, "grad_norm_var": 2.009228515625, "learning_rate": 0.0003, "loss": 11.0585, "loss/aux_loss": 0.04807098638266325, "loss/crossentropy": 2.7810503602027894, "loss/logits": 0.8308149874210358, "step": 58030 }, { "epoch": 0.5804, "grad_norm": 14.75, "grad_norm_var": 1.2375, "learning_rate": 0.0003, "loss": 11.0071, "loss/aux_loss": 0.048070183396339415, "loss/crossentropy": 2.6167166888713838, "loss/logits": 0.8119976550340653, "step": 58040 }, { "epoch": 0.5805, "grad_norm": 13.4375, "grad_norm_var": 12.139957682291667, "learning_rate": 0.0003, "loss": 10.9504, "loss/aux_loss": 0.04809025507420302, "loss/crossentropy": 2.7787895798683167, "loss/logits": 0.8444702595472335, "step": 58050 }, { "epoch": 0.5806, "grad_norm": 15.625, "grad_norm_var": 0.7994140625, "learning_rate": 0.0003, "loss": 11.1034, "loss/aux_loss": 0.04807179775089025, "loss/crossentropy": 2.671674072742462, "loss/logits": 0.815391731262207, "step": 58060 }, { "epoch": 0.5807, "grad_norm": 15.5, "grad_norm_var": 0.4554524739583333, "learning_rate": 0.0003, "loss": 11.0592, "loss/aux_loss": 0.048074791021645066, "loss/crossentropy": 2.6951618790626526, "loss/logits": 0.856848555803299, "step": 58070 }, { "epoch": 0.5808, "grad_norm": 14.25, "grad_norm_var": 1.3780598958333334, "learning_rate": 0.0003, "loss": 10.9668, "loss/aux_loss": 0.048078453540802, "loss/crossentropy": 2.5765260636806486, "loss/logits": 0.8218820422887803, "step": 58080 }, { "epoch": 0.5809, "grad_norm": 15.9375, "grad_norm_var": 0.4432291666666667, "learning_rate": 0.0003, "loss": 10.936, "loss/aux_loss": 0.04806906320154667, "loss/crossentropy": 2.7392422437667845, "loss/logits": 0.7855724722146988, "step": 58090 }, { "epoch": 0.581, "grad_norm": 14.6875, "grad_norm_var": 0.762744140625, "learning_rate": 0.0003, "loss": 10.7023, "loss/aux_loss": 0.04807261452078819, "loss/crossentropy": 2.508842921257019, "loss/logits": 0.7834379196166992, "step": 58100 }, { "epoch": 0.5811, "grad_norm": 16.125, "grad_norm_var": 0.4356770833333333, "learning_rate": 0.0003, "loss": 11.0333, "loss/aux_loss": 0.0480639960616827, "loss/crossentropy": 2.576325136423111, "loss/logits": 0.7919742912054062, "step": 58110 }, { "epoch": 0.5812, "grad_norm": 16.0, "grad_norm_var": 1.0244140625, "learning_rate": 0.0003, "loss": 11.0291, "loss/aux_loss": 0.04808232747018337, "loss/crossentropy": 2.7864030063152314, "loss/logits": 0.8283389776945114, "step": 58120 }, { "epoch": 0.5813, "grad_norm": 17.75, "grad_norm_var": 1.7234375, "learning_rate": 0.0003, "loss": 11.0488, "loss/aux_loss": 0.04805902913212776, "loss/crossentropy": 2.7792890667915344, "loss/logits": 0.8295496284961701, "step": 58130 }, { "epoch": 0.5814, "grad_norm": 14.8125, "grad_norm_var": 1.323681640625, "learning_rate": 0.0003, "loss": 11.0617, "loss/aux_loss": 0.04808140993118286, "loss/crossentropy": 2.7483465135097505, "loss/logits": 0.8322425484657288, "step": 58140 }, { "epoch": 0.5815, "grad_norm": 14.8125, "grad_norm_var": 0.75703125, "learning_rate": 0.0003, "loss": 11.0982, "loss/aux_loss": 0.048070278204977515, "loss/crossentropy": 2.9067394614219664, "loss/logits": 0.8337443679571152, "step": 58150 }, { "epoch": 0.5816, "grad_norm": 14.75, "grad_norm_var": 0.5151041666666667, "learning_rate": 0.0003, "loss": 10.9214, "loss/aux_loss": 0.0480665884912014, "loss/crossentropy": 2.706578928232193, "loss/logits": 0.8104908049106598, "step": 58160 }, { "epoch": 0.5817, "grad_norm": 16.375, "grad_norm_var": 1.6416015625, "learning_rate": 0.0003, "loss": 11.0962, "loss/aux_loss": 0.048067183792591096, "loss/crossentropy": 2.7630359292030335, "loss/logits": 0.8151145994663238, "step": 58170 }, { "epoch": 0.5818, "grad_norm": 13.625, "grad_norm_var": 2.781233723958333, "learning_rate": 0.0003, "loss": 10.9718, "loss/aux_loss": 0.048072848655283454, "loss/crossentropy": 2.6488034069538116, "loss/logits": 0.8012841731309891, "step": 58180 }, { "epoch": 0.5819, "grad_norm": 14.5625, "grad_norm_var": 1.9555826822916667, "learning_rate": 0.0003, "loss": 10.9685, "loss/aux_loss": 0.0480765612795949, "loss/crossentropy": 2.7052852630615236, "loss/logits": 0.837667453289032, "step": 58190 }, { "epoch": 0.582, "grad_norm": 14.5625, "grad_norm_var": 0.8645182291666667, "learning_rate": 0.0003, "loss": 11.076, "loss/aux_loss": 0.048066786117851736, "loss/crossentropy": 2.683205193281174, "loss/logits": 0.8363191336393356, "step": 58200 }, { "epoch": 0.5821, "grad_norm": 15.0, "grad_norm_var": 0.5692545572916666, "learning_rate": 0.0003, "loss": 10.8792, "loss/aux_loss": 0.04808394853025675, "loss/crossentropy": 2.706932079792023, "loss/logits": 0.8160331755876541, "step": 58210 }, { "epoch": 0.5822, "grad_norm": 15.5625, "grad_norm_var": 0.7785807291666667, "learning_rate": 0.0003, "loss": 11.039, "loss/aux_loss": 0.04806770384311676, "loss/crossentropy": 2.781590723991394, "loss/logits": 0.8191991955041885, "step": 58220 }, { "epoch": 0.5823, "grad_norm": 15.5, "grad_norm_var": 0.8033854166666666, "learning_rate": 0.0003, "loss": 10.9915, "loss/aux_loss": 0.048082100600004195, "loss/crossentropy": 2.827639192342758, "loss/logits": 0.8229591697454453, "step": 58230 }, { "epoch": 0.5824, "grad_norm": 13.3125, "grad_norm_var": 4.969384765625, "learning_rate": 0.0003, "loss": 10.998, "loss/aux_loss": 0.0480662764981389, "loss/crossentropy": 2.6668840289115905, "loss/logits": 0.8254508256912232, "step": 58240 }, { "epoch": 0.5825, "grad_norm": 14.3125, "grad_norm_var": 1.2660807291666667, "learning_rate": 0.0003, "loss": 10.9292, "loss/aux_loss": 0.04806844256818295, "loss/crossentropy": 2.786569392681122, "loss/logits": 0.831071189045906, "step": 58250 }, { "epoch": 0.5826, "grad_norm": 14.375, "grad_norm_var": 1.1525390625, "learning_rate": 0.0003, "loss": 11.0841, "loss/aux_loss": 0.048065231554210185, "loss/crossentropy": 2.6059127330780028, "loss/logits": 0.8094421774148941, "step": 58260 }, { "epoch": 0.5827, "grad_norm": 14.3125, "grad_norm_var": 0.690087890625, "learning_rate": 0.0003, "loss": 10.9705, "loss/aux_loss": 0.048076927289366723, "loss/crossentropy": 2.7309110164642334, "loss/logits": 0.8249422818422317, "step": 58270 }, { "epoch": 0.5828, "grad_norm": 16.5, "grad_norm_var": 2.154150390625, "learning_rate": 0.0003, "loss": 11.0905, "loss/aux_loss": 0.04807541277259588, "loss/crossentropy": 2.6696152329444884, "loss/logits": 0.8134458005428314, "step": 58280 }, { "epoch": 0.5829, "grad_norm": 13.9375, "grad_norm_var": 2.66875, "learning_rate": 0.0003, "loss": 10.864, "loss/aux_loss": 0.048075484298169616, "loss/crossentropy": 2.60534029006958, "loss/logits": 0.8042290031909942, "step": 58290 }, { "epoch": 0.583, "grad_norm": 14.6875, "grad_norm_var": 1.1087890625, "learning_rate": 0.0003, "loss": 11.0752, "loss/aux_loss": 0.04807412791997194, "loss/crossentropy": 2.669729250669479, "loss/logits": 0.8465682655572891, "step": 58300 }, { "epoch": 0.5831, "grad_norm": 14.875, "grad_norm_var": 0.42337239583333336, "learning_rate": 0.0003, "loss": 10.924, "loss/aux_loss": 0.04807706717401743, "loss/crossentropy": 2.641595256328583, "loss/logits": 0.8296791315078735, "step": 58310 }, { "epoch": 0.5832, "grad_norm": 15.25, "grad_norm_var": 0.48828125, "learning_rate": 0.0003, "loss": 11.086, "loss/aux_loss": 0.048067683912813665, "loss/crossentropy": 2.602385413646698, "loss/logits": 0.8127113878726959, "step": 58320 }, { "epoch": 0.5833, "grad_norm": 15.25, "grad_norm_var": 0.3636555989583333, "learning_rate": 0.0003, "loss": 11.0658, "loss/aux_loss": 0.04806897640228271, "loss/crossentropy": 2.675559568405151, "loss/logits": 0.8271180987358093, "step": 58330 }, { "epoch": 0.5834, "grad_norm": 15.0, "grad_norm_var": 0.08854166666666667, "learning_rate": 0.0003, "loss": 11.045, "loss/aux_loss": 0.04805998243391514, "loss/crossentropy": 2.7416910886764527, "loss/logits": 0.8429529070854187, "step": 58340 }, { "epoch": 0.5835, "grad_norm": 14.75, "grad_norm_var": 0.07810872395833333, "learning_rate": 0.0003, "loss": 10.9301, "loss/aux_loss": 0.048085011541843414, "loss/crossentropy": 2.5801856577396394, "loss/logits": 0.7824599385261536, "step": 58350 }, { "epoch": 0.5836, "grad_norm": 13.6875, "grad_norm_var": 0.5702473958333333, "learning_rate": 0.0003, "loss": 10.9614, "loss/aux_loss": 0.04806315153837204, "loss/crossentropy": 2.7932356715202333, "loss/logits": 0.8210479527711868, "step": 58360 }, { "epoch": 0.5837, "grad_norm": 14.8125, "grad_norm_var": 0.73828125, "learning_rate": 0.0003, "loss": 10.8114, "loss/aux_loss": 0.04807087611407042, "loss/crossentropy": 2.8822829246521, "loss/logits": 0.8095810860395432, "step": 58370 }, { "epoch": 0.5838, "grad_norm": 15.8125, "grad_norm_var": 0.8700358072916666, "learning_rate": 0.0003, "loss": 10.9375, "loss/aux_loss": 0.04807416722178459, "loss/crossentropy": 2.850358772277832, "loss/logits": 0.849553844332695, "step": 58380 }, { "epoch": 0.5839, "grad_norm": 15.9375, "grad_norm_var": 0.8473795572916667, "learning_rate": 0.0003, "loss": 10.9638, "loss/aux_loss": 0.04806542359292507, "loss/crossentropy": 2.8250136971473694, "loss/logits": 0.8444699108600616, "step": 58390 }, { "epoch": 0.584, "grad_norm": 16.25, "grad_norm_var": 3.767041015625, "learning_rate": 0.0003, "loss": 10.9255, "loss/aux_loss": 0.0480630787089467, "loss/crossentropy": 2.770486330986023, "loss/logits": 0.8150721251964569, "step": 58400 }, { "epoch": 0.5841, "grad_norm": 13.9375, "grad_norm_var": 3.655143229166667, "learning_rate": 0.0003, "loss": 10.9237, "loss/aux_loss": 0.048080130480229855, "loss/crossentropy": 2.77328075170517, "loss/logits": 0.8458864361047744, "step": 58410 }, { "epoch": 0.5842, "grad_norm": 15.5, "grad_norm_var": 0.5301432291666667, "learning_rate": 0.0003, "loss": 11.0029, "loss/aux_loss": 0.04805634468793869, "loss/crossentropy": 2.744527643918991, "loss/logits": 0.8247867822647095, "step": 58420 }, { "epoch": 0.5843, "grad_norm": 14.0625, "grad_norm_var": 0.5108723958333333, "learning_rate": 0.0003, "loss": 10.9075, "loss/aux_loss": 0.04806449562311173, "loss/crossentropy": 2.7784756422042847, "loss/logits": 0.8371286004781723, "step": 58430 }, { "epoch": 0.5844, "grad_norm": 13.8125, "grad_norm_var": 1.1895833333333334, "learning_rate": 0.0003, "loss": 10.8034, "loss/aux_loss": 0.048080151155591014, "loss/crossentropy": 2.7118197083473206, "loss/logits": 0.8134770125150681, "step": 58440 }, { "epoch": 0.5845, "grad_norm": 14.375, "grad_norm_var": 1.2567057291666666, "learning_rate": 0.0003, "loss": 10.9606, "loss/aux_loss": 0.04806223269551992, "loss/crossentropy": 2.8520585894584656, "loss/logits": 0.8356281250715256, "step": 58450 }, { "epoch": 0.5846, "grad_norm": 16.5, "grad_norm_var": 0.9921712239583333, "learning_rate": 0.0003, "loss": 10.991, "loss/aux_loss": 0.04806670732796192, "loss/crossentropy": 2.7324011504650114, "loss/logits": 0.8266925632953643, "step": 58460 }, { "epoch": 0.5847, "grad_norm": 15.4375, "grad_norm_var": 0.8575520833333333, "learning_rate": 0.0003, "loss": 10.9843, "loss/aux_loss": 0.04807538501918316, "loss/crossentropy": 2.647887235879898, "loss/logits": 0.820940124988556, "step": 58470 }, { "epoch": 0.5848, "grad_norm": 13.3125, "grad_norm_var": 1.0228515625, "learning_rate": 0.0003, "loss": 10.9996, "loss/aux_loss": 0.04807217437773943, "loss/crossentropy": 2.680449867248535, "loss/logits": 0.8432391703128814, "step": 58480 }, { "epoch": 0.5849, "grad_norm": 16.25, "grad_norm_var": 1.27421875, "learning_rate": 0.0003, "loss": 11.031, "loss/aux_loss": 0.04805920589715242, "loss/crossentropy": 2.7809171319007873, "loss/logits": 0.8145634055137634, "step": 58490 }, { "epoch": 0.585, "grad_norm": 15.5625, "grad_norm_var": 0.3885416666666667, "learning_rate": 0.0003, "loss": 10.9707, "loss/aux_loss": 0.048078577220439914, "loss/crossentropy": 2.741816544532776, "loss/logits": 0.8448689103126525, "step": 58500 }, { "epoch": 0.5851, "grad_norm": 15.1875, "grad_norm_var": 0.46295572916666666, "learning_rate": 0.0003, "loss": 10.9568, "loss/aux_loss": 0.048060483485460284, "loss/crossentropy": 2.691603738069534, "loss/logits": 0.8457825213670731, "step": 58510 }, { "epoch": 0.5852, "grad_norm": 14.8125, "grad_norm_var": 1.128759765625, "learning_rate": 0.0003, "loss": 10.9338, "loss/aux_loss": 0.04807424061000347, "loss/crossentropy": 2.6941749453544617, "loss/logits": 0.8350166887044906, "step": 58520 }, { "epoch": 0.5853, "grad_norm": 16.625, "grad_norm_var": 0.451806640625, "learning_rate": 0.0003, "loss": 11.0623, "loss/aux_loss": 0.04807107653468847, "loss/crossentropy": 2.8114802479743957, "loss/logits": 0.8427129536867142, "step": 58530 }, { "epoch": 0.5854, "grad_norm": 14.0625, "grad_norm_var": 0.9030598958333333, "learning_rate": 0.0003, "loss": 11.0062, "loss/aux_loss": 0.04807313997298479, "loss/crossentropy": 2.673390966653824, "loss/logits": 0.8236012995243073, "step": 58540 }, { "epoch": 0.5855, "grad_norm": 14.6875, "grad_norm_var": 1.0176432291666666, "learning_rate": 0.0003, "loss": 10.9709, "loss/aux_loss": 0.048067517951130866, "loss/crossentropy": 2.7362454771995544, "loss/logits": 0.8251888632774353, "step": 58550 }, { "epoch": 0.5856, "grad_norm": 16.625, "grad_norm_var": 1.235791015625, "learning_rate": 0.0003, "loss": 10.9647, "loss/aux_loss": 0.04807721339166164, "loss/crossentropy": 2.751077103614807, "loss/logits": 0.8136387556791306, "step": 58560 }, { "epoch": 0.5857, "grad_norm": 15.875, "grad_norm_var": 0.832666015625, "learning_rate": 0.0003, "loss": 11.0542, "loss/aux_loss": 0.048059957846999166, "loss/crossentropy": 2.695993906259537, "loss/logits": 0.8578321129083634, "step": 58570 }, { "epoch": 0.5858, "grad_norm": 13.6875, "grad_norm_var": 0.3734375, "learning_rate": 0.0003, "loss": 10.9334, "loss/aux_loss": 0.0480723824352026, "loss/crossentropy": 2.636319124698639, "loss/logits": 0.8126176208257675, "step": 58580 }, { "epoch": 0.5859, "grad_norm": 15.6875, "grad_norm_var": 0.6244140625, "learning_rate": 0.0003, "loss": 10.9793, "loss/aux_loss": 0.04807424917817116, "loss/crossentropy": 2.633167880773544, "loss/logits": 0.8149242758750915, "step": 58590 }, { "epoch": 0.586, "grad_norm": 13.4375, "grad_norm_var": 0.7452473958333333, "learning_rate": 0.0003, "loss": 10.7418, "loss/aux_loss": 0.04807692859321833, "loss/crossentropy": 2.4629740476608277, "loss/logits": 0.7668499648571014, "step": 58600 }, { "epoch": 0.5861, "grad_norm": 15.4375, "grad_norm_var": 0.6527180989583333, "learning_rate": 0.0003, "loss": 10.9786, "loss/aux_loss": 0.04806172419339418, "loss/crossentropy": 2.8479265451431273, "loss/logits": 0.8407178670167923, "step": 58610 }, { "epoch": 0.5862, "grad_norm": 15.125, "grad_norm_var": 0.5923014322916667, "learning_rate": 0.0003, "loss": 10.8859, "loss/aux_loss": 0.04807059057056904, "loss/crossentropy": 2.6659990191459655, "loss/logits": 0.814807391166687, "step": 58620 }, { "epoch": 0.5863, "grad_norm": 15.5625, "grad_norm_var": 0.2994791666666667, "learning_rate": 0.0003, "loss": 10.9763, "loss/aux_loss": 0.0480600368231535, "loss/crossentropy": 2.7559759974479676, "loss/logits": 0.8363692253828049, "step": 58630 }, { "epoch": 0.5864, "grad_norm": 14.8125, "grad_norm_var": 0.354541015625, "learning_rate": 0.0003, "loss": 10.9895, "loss/aux_loss": 0.048074362054467204, "loss/crossentropy": 2.8443562030792235, "loss/logits": 0.8306858450174331, "step": 58640 }, { "epoch": 0.5865, "grad_norm": 15.0, "grad_norm_var": 0.24777018229166667, "learning_rate": 0.0003, "loss": 10.9248, "loss/aux_loss": 0.04806084036827087, "loss/crossentropy": 2.7475152254104613, "loss/logits": 0.7995836168527604, "step": 58650 }, { "epoch": 0.5866, "grad_norm": 14.4375, "grad_norm_var": 0.5483723958333333, "learning_rate": 0.0003, "loss": 10.921, "loss/aux_loss": 0.04807322192937136, "loss/crossentropy": 2.705971562862396, "loss/logits": 0.8364428788423538, "step": 58660 }, { "epoch": 0.5867, "grad_norm": 15.0625, "grad_norm_var": 0.5433430989583333, "learning_rate": 0.0003, "loss": 10.9895, "loss/aux_loss": 0.04806621428579092, "loss/crossentropy": 2.723879784345627, "loss/logits": 0.8121023416519165, "step": 58670 }, { "epoch": 0.5868, "grad_norm": 15.5, "grad_norm_var": 0.8768229166666667, "learning_rate": 0.0003, "loss": 10.9435, "loss/aux_loss": 0.048067286051809786, "loss/crossentropy": 2.8236024498939516, "loss/logits": 0.8061568111181259, "step": 58680 }, { "epoch": 0.5869, "grad_norm": 14.9375, "grad_norm_var": 0.5377604166666666, "learning_rate": 0.0003, "loss": 11.0705, "loss/aux_loss": 0.04807589165866375, "loss/crossentropy": 2.6578650951385496, "loss/logits": 0.822320407629013, "step": 58690 }, { "epoch": 0.587, "grad_norm": 15.25, "grad_norm_var": 3.528059895833333, "learning_rate": 0.0003, "loss": 10.9717, "loss/aux_loss": 0.04807302374392748, "loss/crossentropy": 2.7690295398235323, "loss/logits": 0.8380024790763855, "step": 58700 }, { "epoch": 0.5871, "grad_norm": 14.375, "grad_norm_var": 1.2687337239583334, "learning_rate": 0.0003, "loss": 10.9628, "loss/aux_loss": 0.04807158224284649, "loss/crossentropy": 2.642226552963257, "loss/logits": 0.8027304679155349, "step": 58710 }, { "epoch": 0.5872, "grad_norm": 14.1875, "grad_norm_var": 0.4669270833333333, "learning_rate": 0.0003, "loss": 10.9551, "loss/aux_loss": 0.048065906204283235, "loss/crossentropy": 2.667159843444824, "loss/logits": 0.837336790561676, "step": 58720 }, { "epoch": 0.5873, "grad_norm": 15.25, "grad_norm_var": 0.6577473958333333, "learning_rate": 0.0003, "loss": 10.9683, "loss/aux_loss": 0.048073652759194375, "loss/crossentropy": 2.667749172449112, "loss/logits": 0.8197858512401581, "step": 58730 }, { "epoch": 0.5874, "grad_norm": 14.9375, "grad_norm_var": 6.6400390625, "learning_rate": 0.0003, "loss": 11.0102, "loss/aux_loss": 0.04808447286486626, "loss/crossentropy": 2.632328379154205, "loss/logits": 0.8462556928396225, "step": 58740 }, { "epoch": 0.5875, "grad_norm": 16.625, "grad_norm_var": 0.4176432291666667, "learning_rate": 0.0003, "loss": 10.9477, "loss/aux_loss": 0.048075118102133274, "loss/crossentropy": 2.7823431193828583, "loss/logits": 0.8039717346429824, "step": 58750 }, { "epoch": 0.5876, "grad_norm": 13.75, "grad_norm_var": 0.5447265625, "learning_rate": 0.0003, "loss": 10.968, "loss/aux_loss": 0.04806168247014284, "loss/crossentropy": 2.774631363153458, "loss/logits": 0.82295723259449, "step": 58760 }, { "epoch": 0.5877, "grad_norm": 14.0625, "grad_norm_var": 4.605582682291667, "learning_rate": 0.0003, "loss": 10.8307, "loss/aux_loss": 0.04806800838559866, "loss/crossentropy": 2.820412439107895, "loss/logits": 0.8312744557857513, "step": 58770 }, { "epoch": 0.5878, "grad_norm": 14.125, "grad_norm_var": 0.3238118489583333, "learning_rate": 0.0003, "loss": 10.9226, "loss/aux_loss": 0.04807846397161484, "loss/crossentropy": 2.5340620458126066, "loss/logits": 0.8130956321954728, "step": 58780 }, { "epoch": 0.5879, "grad_norm": 14.625, "grad_norm_var": 0.2986979166666667, "learning_rate": 0.0003, "loss": 10.9796, "loss/aux_loss": 0.048070663772523406, "loss/crossentropy": 2.617660069465637, "loss/logits": 0.7849185347557068, "step": 58790 }, { "epoch": 0.588, "grad_norm": 15.4375, "grad_norm_var": 0.9128743489583333, "learning_rate": 0.0003, "loss": 11.0414, "loss/aux_loss": 0.048075301200151445, "loss/crossentropy": 2.7353998363018035, "loss/logits": 0.8277339696884155, "step": 58800 }, { "epoch": 0.5881, "grad_norm": 15.6875, "grad_norm_var": 0.8166015625, "learning_rate": 0.0003, "loss": 10.9593, "loss/aux_loss": 0.04806892182677984, "loss/crossentropy": 2.720021104812622, "loss/logits": 0.8124103635549546, "step": 58810 }, { "epoch": 0.5882, "grad_norm": 16.25, "grad_norm_var": 0.705322265625, "learning_rate": 0.0003, "loss": 10.8535, "loss/aux_loss": 0.048067253082990646, "loss/crossentropy": 2.6595967948436736, "loss/logits": 0.7839356884360313, "step": 58820 }, { "epoch": 0.5883, "grad_norm": 15.0625, "grad_norm_var": 0.604931640625, "learning_rate": 0.0003, "loss": 10.8844, "loss/aux_loss": 0.048070118948817256, "loss/crossentropy": 2.7247247993946075, "loss/logits": 0.7789757996797562, "step": 58830 }, { "epoch": 0.5884, "grad_norm": 15.5625, "grad_norm_var": 1.5989420572916666, "learning_rate": 0.0003, "loss": 10.8944, "loss/aux_loss": 0.04808255434036255, "loss/crossentropy": 2.4786873877048494, "loss/logits": 0.7954070687294006, "step": 58840 }, { "epoch": 0.5885, "grad_norm": 14.0625, "grad_norm_var": 1.150634765625, "learning_rate": 0.0003, "loss": 10.7977, "loss/aux_loss": 0.04806720409542322, "loss/crossentropy": 2.6936080753803253, "loss/logits": 0.8249445348978043, "step": 58850 }, { "epoch": 0.5886, "grad_norm": 14.1875, "grad_norm_var": 0.6049479166666667, "learning_rate": 0.0003, "loss": 10.92, "loss/aux_loss": 0.04806901291012764, "loss/crossentropy": 2.748962438106537, "loss/logits": 0.840557438135147, "step": 58860 }, { "epoch": 0.5887, "grad_norm": 16.0, "grad_norm_var": 0.7219889322916667, "learning_rate": 0.0003, "loss": 10.963, "loss/aux_loss": 0.04807457271963358, "loss/crossentropy": 2.7299853801727294, "loss/logits": 0.8568669199943543, "step": 58870 }, { "epoch": 0.5888, "grad_norm": 14.4375, "grad_norm_var": 1.1009765625, "learning_rate": 0.0003, "loss": 10.9833, "loss/aux_loss": 0.0480708921328187, "loss/crossentropy": 2.6630140364170076, "loss/logits": 0.8391565322875977, "step": 58880 }, { "epoch": 0.5889, "grad_norm": 15.6875, "grad_norm_var": 0.5079264322916667, "learning_rate": 0.0003, "loss": 10.7613, "loss/aux_loss": 0.0480732224881649, "loss/crossentropy": 2.6498291552066804, "loss/logits": 0.8057867288589478, "step": 58890 }, { "epoch": 0.589, "grad_norm": 14.5625, "grad_norm_var": 1.2921712239583334, "learning_rate": 0.0003, "loss": 10.9393, "loss/aux_loss": 0.04806382786482573, "loss/crossentropy": 2.706892067193985, "loss/logits": 0.8239340364933014, "step": 58900 }, { "epoch": 0.5891, "grad_norm": 15.25, "grad_norm_var": 0.4327473958333333, "learning_rate": 0.0003, "loss": 11.2523, "loss/aux_loss": 0.04806530307978392, "loss/crossentropy": 2.5392131090164183, "loss/logits": 0.802097937464714, "step": 58910 }, { "epoch": 0.5892, "grad_norm": 16.0, "grad_norm_var": 103.838525390625, "learning_rate": 0.0003, "loss": 11.1054, "loss/aux_loss": 0.048078567162156104, "loss/crossentropy": 2.7394894659519196, "loss/logits": 0.8429781794548035, "step": 58920 }, { "epoch": 0.5893, "grad_norm": 16.875, "grad_norm_var": 101.1806640625, "learning_rate": 0.0003, "loss": 11.0567, "loss/aux_loss": 0.04806540366262198, "loss/crossentropy": 2.717589294910431, "loss/logits": 0.8201171487569809, "step": 58930 }, { "epoch": 0.5894, "grad_norm": 16.0, "grad_norm_var": 1.0885416666666667, "learning_rate": 0.0003, "loss": 10.9065, "loss/aux_loss": 0.04806985668838024, "loss/crossentropy": 2.721989232301712, "loss/logits": 0.8106504052877426, "step": 58940 }, { "epoch": 0.5895, "grad_norm": 14.375, "grad_norm_var": 0.3611979166666667, "learning_rate": 0.0003, "loss": 11.0415, "loss/aux_loss": 0.04806645512580872, "loss/crossentropy": 2.7016734063625334, "loss/logits": 0.8152611821889877, "step": 58950 }, { "epoch": 0.5896, "grad_norm": 15.8125, "grad_norm_var": 0.5468098958333333, "learning_rate": 0.0003, "loss": 10.9485, "loss/aux_loss": 0.04806891251355409, "loss/crossentropy": 2.62559455037117, "loss/logits": 0.8271364778280258, "step": 58960 }, { "epoch": 0.5897, "grad_norm": 15.3125, "grad_norm_var": 0.3322265625, "learning_rate": 0.0003, "loss": 11.052, "loss/aux_loss": 0.04806600380688906, "loss/crossentropy": 2.737446331977844, "loss/logits": 0.8194819182157517, "step": 58970 }, { "epoch": 0.5898, "grad_norm": 16.375, "grad_norm_var": 0.439697265625, "learning_rate": 0.0003, "loss": 11.0957, "loss/aux_loss": 0.04807234760373831, "loss/crossentropy": 2.760717141628265, "loss/logits": 0.8376249551773072, "step": 58980 }, { "epoch": 0.5899, "grad_norm": 14.875, "grad_norm_var": 1.3390462239583334, "learning_rate": 0.0003, "loss": 11.0902, "loss/aux_loss": 0.04807633645832539, "loss/crossentropy": 2.7784714460372926, "loss/logits": 0.8507242858409881, "step": 58990 }, { "epoch": 0.59, "grad_norm": 15.8125, "grad_norm_var": 1.2333170572916667, "learning_rate": 0.0003, "loss": 10.9738, "loss/aux_loss": 0.048075743950903416, "loss/crossentropy": 2.634059315919876, "loss/logits": 0.8007471144199372, "step": 59000 }, { "epoch": 0.5901, "grad_norm": 15.0625, "grad_norm_var": 0.501416015625, "learning_rate": 0.0003, "loss": 10.8689, "loss/aux_loss": 0.048068515583872796, "loss/crossentropy": 2.646367919445038, "loss/logits": 0.7871147692203522, "step": 59010 }, { "epoch": 0.5902, "grad_norm": 14.3125, "grad_norm_var": 0.6884765625, "learning_rate": 0.0003, "loss": 11.0153, "loss/aux_loss": 0.04807483684271574, "loss/crossentropy": 2.8454954862594604, "loss/logits": 0.8256110936403275, "step": 59020 }, { "epoch": 0.5903, "grad_norm": 13.8125, "grad_norm_var": 0.595947265625, "learning_rate": 0.0003, "loss": 10.7911, "loss/aux_loss": 0.04806120917201042, "loss/crossentropy": 2.7482463240623476, "loss/logits": 0.8132666110992431, "step": 59030 }, { "epoch": 0.5904, "grad_norm": 15.1875, "grad_norm_var": 0.685400390625, "learning_rate": 0.0003, "loss": 10.9159, "loss/aux_loss": 0.04806686472147703, "loss/crossentropy": 2.742703366279602, "loss/logits": 0.8217089116573334, "step": 59040 }, { "epoch": 0.5905, "grad_norm": 14.25, "grad_norm_var": 0.917431640625, "learning_rate": 0.0003, "loss": 10.9354, "loss/aux_loss": 0.048080751299858095, "loss/crossentropy": 2.7598276495933534, "loss/logits": 0.8276279777288437, "step": 59050 }, { "epoch": 0.5906, "grad_norm": 16.0, "grad_norm_var": 0.8372233072916667, "learning_rate": 0.0003, "loss": 11.1481, "loss/aux_loss": 0.048068036511540416, "loss/crossentropy": 2.5435283482074738, "loss/logits": 0.8292164400219917, "step": 59060 }, { "epoch": 0.5907, "grad_norm": 15.5, "grad_norm_var": 0.3714680989583333, "learning_rate": 0.0003, "loss": 11.13, "loss/aux_loss": 0.048066299967467786, "loss/crossentropy": 2.531124544143677, "loss/logits": 0.8264132618904114, "step": 59070 }, { "epoch": 0.5908, "grad_norm": 16.125, "grad_norm_var": 0.4212890625, "learning_rate": 0.0003, "loss": 11.0672, "loss/aux_loss": 0.0480706337839365, "loss/crossentropy": 2.7756115198135376, "loss/logits": 0.8168632984161377, "step": 59080 }, { "epoch": 0.5909, "grad_norm": 15.3125, "grad_norm_var": 1.2400390625, "learning_rate": 0.0003, "loss": 10.966, "loss/aux_loss": 0.048072228021919725, "loss/crossentropy": 2.5596219480037687, "loss/logits": 0.8085451662540436, "step": 59090 }, { "epoch": 0.591, "grad_norm": 14.1875, "grad_norm_var": 0.41795247395833335, "learning_rate": 0.0003, "loss": 11.0411, "loss/aux_loss": 0.04807391669601202, "loss/crossentropy": 2.7747272551059723, "loss/logits": 0.8068033158779144, "step": 59100 }, { "epoch": 0.5911, "grad_norm": 16.625, "grad_norm_var": 0.5026041666666666, "learning_rate": 0.0003, "loss": 11.0439, "loss/aux_loss": 0.048057069256901744, "loss/crossentropy": 2.8529131174087525, "loss/logits": 0.8054678052663803, "step": 59110 }, { "epoch": 0.5912, "grad_norm": 15.9375, "grad_norm_var": 0.3973795572916667, "learning_rate": 0.0003, "loss": 10.8576, "loss/aux_loss": 0.04808417148888111, "loss/crossentropy": 2.5729918599128725, "loss/logits": 0.7877096027135849, "step": 59120 }, { "epoch": 0.5913, "grad_norm": 15.125, "grad_norm_var": 0.5738118489583334, "learning_rate": 0.0003, "loss": 11.054, "loss/aux_loss": 0.048090359196066856, "loss/crossentropy": 2.69580699801445, "loss/logits": 0.8433273226022721, "step": 59130 }, { "epoch": 0.5914, "grad_norm": 15.625, "grad_norm_var": 0.3301920572916667, "learning_rate": 0.0003, "loss": 10.8815, "loss/aux_loss": 0.04807056300342083, "loss/crossentropy": 2.615113401412964, "loss/logits": 0.8006851255893708, "step": 59140 }, { "epoch": 0.5915, "grad_norm": 15.0, "grad_norm_var": 0.3577473958333333, "learning_rate": 0.0003, "loss": 10.8658, "loss/aux_loss": 0.04805748388171196, "loss/crossentropy": 2.775008863210678, "loss/logits": 0.803268751502037, "step": 59150 }, { "epoch": 0.5916, "grad_norm": 15.75, "grad_norm_var": 0.7499837239583333, "learning_rate": 0.0003, "loss": 10.9147, "loss/aux_loss": 0.04807665143162012, "loss/crossentropy": 2.7489787578582763, "loss/logits": 0.8320010215044021, "step": 59160 }, { "epoch": 0.5917, "grad_norm": 14.1875, "grad_norm_var": 1.4718587239583334, "learning_rate": 0.0003, "loss": 11.1376, "loss/aux_loss": 0.048068560846149924, "loss/crossentropy": 2.8063846230506897, "loss/logits": 0.8416935801506042, "step": 59170 }, { "epoch": 0.5918, "grad_norm": 14.6875, "grad_norm_var": 1.0505045572916667, "learning_rate": 0.0003, "loss": 11.1433, "loss/aux_loss": 0.04806650020182133, "loss/crossentropy": 2.7465264439582824, "loss/logits": 0.8265712201595307, "step": 59180 }, { "epoch": 0.5919, "grad_norm": 14.625, "grad_norm_var": 0.20859375, "learning_rate": 0.0003, "loss": 11.0092, "loss/aux_loss": 0.04806997440755367, "loss/crossentropy": 2.7317902624607084, "loss/logits": 0.829322350025177, "step": 59190 }, { "epoch": 0.592, "grad_norm": 15.75, "grad_norm_var": 0.5913899739583334, "learning_rate": 0.0003, "loss": 11.1233, "loss/aux_loss": 0.048072732985019684, "loss/crossentropy": 2.736423373222351, "loss/logits": 0.8223551511764526, "step": 59200 }, { "epoch": 0.5921, "grad_norm": 13.9375, "grad_norm_var": 0.7077473958333333, "learning_rate": 0.0003, "loss": 10.9564, "loss/aux_loss": 0.04806751888245344, "loss/crossentropy": 2.719071865081787, "loss/logits": 0.8144007086753845, "step": 59210 }, { "epoch": 0.5922, "grad_norm": 14.5, "grad_norm_var": 0.599072265625, "learning_rate": 0.0003, "loss": 11.0542, "loss/aux_loss": 0.04806646332144737, "loss/crossentropy": 2.6371989250183105, "loss/logits": 0.8028866291046143, "step": 59220 }, { "epoch": 0.5923, "grad_norm": 14.625, "grad_norm_var": 0.6218587239583333, "learning_rate": 0.0003, "loss": 11.05, "loss/aux_loss": 0.048065362870693205, "loss/crossentropy": 2.739983332157135, "loss/logits": 0.8592475086450577, "step": 59230 }, { "epoch": 0.5924, "grad_norm": 14.3125, "grad_norm_var": 5.140348307291666, "learning_rate": 0.0003, "loss": 11.0626, "loss/aux_loss": 0.04808344319462776, "loss/crossentropy": 2.6779512405395507, "loss/logits": 0.8342467457056045, "step": 59240 }, { "epoch": 0.5925, "grad_norm": 15.625, "grad_norm_var": 0.465087890625, "learning_rate": 0.0003, "loss": 10.6864, "loss/aux_loss": 0.04806926678866148, "loss/crossentropy": 2.6925257742404938, "loss/logits": 0.7975740045309067, "step": 59250 }, { "epoch": 0.5926, "grad_norm": 14.9375, "grad_norm_var": 0.4261555989583333, "learning_rate": 0.0003, "loss": 10.9838, "loss/aux_loss": 0.04807133283466101, "loss/crossentropy": 2.6862433731555937, "loss/logits": 0.830639323592186, "step": 59260 }, { "epoch": 0.5927, "grad_norm": 15.0625, "grad_norm_var": 0.607666015625, "learning_rate": 0.0003, "loss": 10.9802, "loss/aux_loss": 0.048060860484838486, "loss/crossentropy": 2.6332711696624758, "loss/logits": 0.8140772134065628, "step": 59270 }, { "epoch": 0.5928, "grad_norm": 15.5, "grad_norm_var": 0.642431640625, "learning_rate": 0.0003, "loss": 10.9915, "loss/aux_loss": 0.048063439317047596, "loss/crossentropy": 2.647914093732834, "loss/logits": 0.8220590710639953, "step": 59280 }, { "epoch": 0.5929, "grad_norm": 15.375, "grad_norm_var": 2.2627604166666666, "learning_rate": 0.0003, "loss": 11.0782, "loss/aux_loss": 0.048079627007246016, "loss/crossentropy": 2.739602434635162, "loss/logits": 0.846561822295189, "step": 59290 }, { "epoch": 0.593, "grad_norm": 16.25, "grad_norm_var": 2.2085774739583335, "learning_rate": 0.0003, "loss": 10.979, "loss/aux_loss": 0.04806938376277685, "loss/crossentropy": 2.6843549072742463, "loss/logits": 0.7924599975347519, "step": 59300 }, { "epoch": 0.5931, "grad_norm": 33.25, "grad_norm_var": 21.649462890625, "learning_rate": 0.0003, "loss": 10.8709, "loss/aux_loss": 0.04807360116392374, "loss/crossentropy": 2.6454947888851166, "loss/logits": 0.7973768830299377, "step": 59310 }, { "epoch": 0.5932, "grad_norm": 14.8125, "grad_norm_var": 21.169775390625, "learning_rate": 0.0003, "loss": 10.9825, "loss/aux_loss": 0.048061652667820455, "loss/crossentropy": 2.8345079243183138, "loss/logits": 0.8396099478006362, "step": 59320 }, { "epoch": 0.5933, "grad_norm": 13.875, "grad_norm_var": 0.815625, "learning_rate": 0.0003, "loss": 10.9579, "loss/aux_loss": 0.04807075336575508, "loss/crossentropy": 2.7278249740600584, "loss/logits": 0.8078439980745316, "step": 59330 }, { "epoch": 0.5934, "grad_norm": 15.0, "grad_norm_var": 0.859375, "learning_rate": 0.0003, "loss": 10.9258, "loss/aux_loss": 0.04808927923440933, "loss/crossentropy": 2.647115594148636, "loss/logits": 0.8381874442100525, "step": 59340 }, { "epoch": 0.5935, "grad_norm": 15.5625, "grad_norm_var": 1.0649576822916667, "learning_rate": 0.0003, "loss": 10.9276, "loss/aux_loss": 0.04806491620838642, "loss/crossentropy": 2.760968017578125, "loss/logits": 0.813837793469429, "step": 59350 }, { "epoch": 0.5936, "grad_norm": 14.125, "grad_norm_var": 0.940087890625, "learning_rate": 0.0003, "loss": 10.8929, "loss/aux_loss": 0.048059547506272796, "loss/crossentropy": 2.728524845838547, "loss/logits": 0.816395303606987, "step": 59360 }, { "epoch": 0.5937, "grad_norm": 15.75, "grad_norm_var": 0.6927083333333334, "learning_rate": 0.0003, "loss": 11.1686, "loss/aux_loss": 0.04809366017580032, "loss/crossentropy": 2.576079845428467, "loss/logits": 0.8472390443086624, "step": 59370 }, { "epoch": 0.5938, "grad_norm": 14.25, "grad_norm_var": 0.6554524739583333, "learning_rate": 0.0003, "loss": 10.8931, "loss/aux_loss": 0.048058745451271534, "loss/crossentropy": 2.5764957904815673, "loss/logits": 0.7882023543119431, "step": 59380 }, { "epoch": 0.5939, "grad_norm": 14.25, "grad_norm_var": 0.5186848958333333, "learning_rate": 0.0003, "loss": 10.8687, "loss/aux_loss": 0.04806121941655874, "loss/crossentropy": 2.4873786866664886, "loss/logits": 0.8160700887441635, "step": 59390 }, { "epoch": 0.594, "grad_norm": 14.625, "grad_norm_var": 0.5051432291666667, "learning_rate": 0.0003, "loss": 10.8354, "loss/aux_loss": 0.04808267336338758, "loss/crossentropy": 2.715548413991928, "loss/logits": 0.8030152201652527, "step": 59400 }, { "epoch": 0.5941, "grad_norm": 15.0625, "grad_norm_var": 0.8432291666666667, "learning_rate": 0.0003, "loss": 10.7494, "loss/aux_loss": 0.04806729760020971, "loss/crossentropy": 2.5783600986003874, "loss/logits": 0.775664460659027, "step": 59410 }, { "epoch": 0.5942, "grad_norm": 15.375, "grad_norm_var": 0.728125, "learning_rate": 0.0003, "loss": 10.9819, "loss/aux_loss": 0.04807338900864124, "loss/crossentropy": 2.7407156348228456, "loss/logits": 0.8106876760721207, "step": 59420 }, { "epoch": 0.5943, "grad_norm": 16.75, "grad_norm_var": 0.7608723958333333, "learning_rate": 0.0003, "loss": 11.1629, "loss/aux_loss": 0.048071779869496824, "loss/crossentropy": 2.7402497112751005, "loss/logits": 0.8036000728607178, "step": 59430 }, { "epoch": 0.5944, "grad_norm": 15.125, "grad_norm_var": 1.547509765625, "learning_rate": 0.0003, "loss": 10.9178, "loss/aux_loss": 0.04806051217019558, "loss/crossentropy": 2.725063371658325, "loss/logits": 0.8170418709516525, "step": 59440 }, { "epoch": 0.5945, "grad_norm": 16.125, "grad_norm_var": 0.7936848958333333, "learning_rate": 0.0003, "loss": 11.0313, "loss/aux_loss": 0.04807902853935957, "loss/crossentropy": 2.6812859654426573, "loss/logits": 0.8411592811346054, "step": 59450 }, { "epoch": 0.5946, "grad_norm": 15.375, "grad_norm_var": 0.5328125, "learning_rate": 0.0003, "loss": 10.8673, "loss/aux_loss": 0.04806883670389652, "loss/crossentropy": 2.526155251264572, "loss/logits": 0.7790047436952591, "step": 59460 }, { "epoch": 0.5947, "grad_norm": 14.5625, "grad_norm_var": 0.6650390625, "learning_rate": 0.0003, "loss": 11.0563, "loss/aux_loss": 0.04805390052497387, "loss/crossentropy": 2.714101165533066, "loss/logits": 0.8301648050546646, "step": 59470 }, { "epoch": 0.5948, "grad_norm": 15.5625, "grad_norm_var": 0.21555989583333332, "learning_rate": 0.0003, "loss": 11.0729, "loss/aux_loss": 0.04807809926569462, "loss/crossentropy": 2.582643520832062, "loss/logits": 0.8132951408624649, "step": 59480 }, { "epoch": 0.5949, "grad_norm": 15.625, "grad_norm_var": 0.6773274739583334, "learning_rate": 0.0003, "loss": 11.2003, "loss/aux_loss": 0.04807939510792494, "loss/crossentropy": 2.8899078488349916, "loss/logits": 0.8984211206436157, "step": 59490 }, { "epoch": 0.595, "grad_norm": 14.5625, "grad_norm_var": 0.4994140625, "learning_rate": 0.0003, "loss": 10.8003, "loss/aux_loss": 0.048056123591959474, "loss/crossentropy": 2.657677114009857, "loss/logits": 0.8166221141815185, "step": 59500 }, { "epoch": 0.5951, "grad_norm": 15.0, "grad_norm_var": 1.293212890625, "learning_rate": 0.0003, "loss": 10.9127, "loss/aux_loss": 0.04807478673756123, "loss/crossentropy": 2.618022048473358, "loss/logits": 0.829986622929573, "step": 59510 }, { "epoch": 0.5952, "grad_norm": 14.5, "grad_norm_var": 0.6301920572916667, "learning_rate": 0.0003, "loss": 10.9695, "loss/aux_loss": 0.0480778394266963, "loss/crossentropy": 2.6219813764095306, "loss/logits": 0.8168375045061111, "step": 59520 }, { "epoch": 0.5953, "grad_norm": 14.6875, "grad_norm_var": 14.586393229166667, "learning_rate": 0.0003, "loss": 11.0956, "loss/aux_loss": 0.0480570949614048, "loss/crossentropy": 2.6230372488498688, "loss/logits": 0.7977444887161255, "step": 59530 }, { "epoch": 0.5954, "grad_norm": 13.875, "grad_norm_var": 0.9593098958333334, "learning_rate": 0.0003, "loss": 10.9874, "loss/aux_loss": 0.04807024523615837, "loss/crossentropy": 2.7767493963241576, "loss/logits": 0.8549789160490036, "step": 59540 }, { "epoch": 0.5955, "grad_norm": 14.75, "grad_norm_var": 0.9890625, "learning_rate": 0.0003, "loss": 10.9328, "loss/aux_loss": 0.048075405322015284, "loss/crossentropy": 2.8718122959136965, "loss/logits": 0.8272106260061264, "step": 59550 }, { "epoch": 0.5956, "grad_norm": 15.5625, "grad_norm_var": 0.9208333333333333, "learning_rate": 0.0003, "loss": 10.7519, "loss/aux_loss": 0.04805839378386736, "loss/crossentropy": 2.621040326356888, "loss/logits": 0.8079787522554398, "step": 59560 }, { "epoch": 0.5957, "grad_norm": 14.5625, "grad_norm_var": 0.563916015625, "learning_rate": 0.0003, "loss": 10.9413, "loss/aux_loss": 0.048068532906472686, "loss/crossentropy": 2.7320153057575225, "loss/logits": 0.8291085928678512, "step": 59570 }, { "epoch": 0.5958, "grad_norm": 15.3125, "grad_norm_var": 0.435400390625, "learning_rate": 0.0003, "loss": 11.059, "loss/aux_loss": 0.048074721731245516, "loss/crossentropy": 2.596713310480118, "loss/logits": 0.8409205973148346, "step": 59580 }, { "epoch": 0.5959, "grad_norm": 14.4375, "grad_norm_var": 0.33513997395833334, "learning_rate": 0.0003, "loss": 11.0557, "loss/aux_loss": 0.04807321783155203, "loss/crossentropy": 2.7819936752319334, "loss/logits": 0.835452938079834, "step": 59590 }, { "epoch": 0.596, "grad_norm": 15.4375, "grad_norm_var": 2.527604166666667, "learning_rate": 0.0003, "loss": 10.9209, "loss/aux_loss": 0.04806660022586584, "loss/crossentropy": 2.7568194687366487, "loss/logits": 0.8260948032140731, "step": 59600 }, { "epoch": 0.5961, "grad_norm": 14.625, "grad_norm_var": 0.7150390625, "learning_rate": 0.0003, "loss": 11.0103, "loss/aux_loss": 0.04807510152459145, "loss/crossentropy": 2.743026089668274, "loss/logits": 0.8484435856342316, "step": 59610 }, { "epoch": 0.5962, "grad_norm": 15.5, "grad_norm_var": 0.7692057291666666, "learning_rate": 0.0003, "loss": 10.9734, "loss/aux_loss": 0.04806904960423708, "loss/crossentropy": 2.553094118833542, "loss/logits": 0.7921933591365814, "step": 59620 }, { "epoch": 0.5963, "grad_norm": 13.8125, "grad_norm_var": 0.4197265625, "learning_rate": 0.0003, "loss": 10.8507, "loss/aux_loss": 0.048070012219250204, "loss/crossentropy": 2.620549178123474, "loss/logits": 0.8065012693405151, "step": 59630 }, { "epoch": 0.5964, "grad_norm": 14.8125, "grad_norm_var": 0.51875, "learning_rate": 0.0003, "loss": 11.0665, "loss/aux_loss": 0.048063849285244944, "loss/crossentropy": 2.7524186074733734, "loss/logits": 0.8217565357685089, "step": 59640 }, { "epoch": 0.5965, "grad_norm": 16.625, "grad_norm_var": 0.3515625, "learning_rate": 0.0003, "loss": 10.9902, "loss/aux_loss": 0.04806663002818823, "loss/crossentropy": 2.8480568647384645, "loss/logits": 0.8349178761243821, "step": 59650 }, { "epoch": 0.5966, "grad_norm": 15.5625, "grad_norm_var": 0.368603515625, "learning_rate": 0.0003, "loss": 11.0885, "loss/aux_loss": 0.04807380642741919, "loss/crossentropy": 2.7424690067768096, "loss/logits": 0.8472974270582199, "step": 59660 }, { "epoch": 0.5967, "grad_norm": 15.375, "grad_norm_var": 0.24869791666666666, "learning_rate": 0.0003, "loss": 11.0736, "loss/aux_loss": 0.04807511363178492, "loss/crossentropy": 2.7169342398643495, "loss/logits": 0.8267938494682312, "step": 59670 }, { "epoch": 0.5968, "grad_norm": 15.5, "grad_norm_var": 0.6700520833333333, "learning_rate": 0.0003, "loss": 11.1724, "loss/aux_loss": 0.048063908331096174, "loss/crossentropy": 2.7714505553245545, "loss/logits": 0.8535281270742416, "step": 59680 }, { "epoch": 0.5969, "grad_norm": 14.25, "grad_norm_var": 0.5930826822916667, "learning_rate": 0.0003, "loss": 10.8832, "loss/aux_loss": 0.048075766302645206, "loss/crossentropy": 2.607689690589905, "loss/logits": 0.8187483072280883, "step": 59690 }, { "epoch": 0.597, "grad_norm": 16.375, "grad_norm_var": 0.46243489583333336, "learning_rate": 0.0003, "loss": 11.0849, "loss/aux_loss": 0.048065418377518654, "loss/crossentropy": 2.779401385784149, "loss/logits": 0.8177706062793731, "step": 59700 }, { "epoch": 0.5971, "grad_norm": 16.0, "grad_norm_var": 0.4337890625, "learning_rate": 0.0003, "loss": 11.0411, "loss/aux_loss": 0.04806800279766321, "loss/crossentropy": 2.6941158711910247, "loss/logits": 0.8276149153709411, "step": 59710 }, { "epoch": 0.5972, "grad_norm": 14.75, "grad_norm_var": 0.475, "learning_rate": 0.0003, "loss": 11.0057, "loss/aux_loss": 0.04807844683527947, "loss/crossentropy": 2.7109430134296417, "loss/logits": 0.8097800493240357, "step": 59720 }, { "epoch": 0.5973, "grad_norm": 14.6875, "grad_norm_var": 0.7171223958333334, "learning_rate": 0.0003, "loss": 10.9658, "loss/aux_loss": 0.0480660380795598, "loss/crossentropy": 2.6846647441387175, "loss/logits": 0.8089656233787537, "step": 59730 }, { "epoch": 0.5974, "grad_norm": 15.6875, "grad_norm_var": 0.8059895833333334, "learning_rate": 0.0003, "loss": 10.9255, "loss/aux_loss": 0.04807507041841745, "loss/crossentropy": 2.693540346622467, "loss/logits": 0.835108283162117, "step": 59740 }, { "epoch": 0.5975, "grad_norm": 16.5, "grad_norm_var": 0.428125, "learning_rate": 0.0003, "loss": 10.8601, "loss/aux_loss": 0.048072214052081105, "loss/crossentropy": 2.6454875826835633, "loss/logits": 0.8485147625207901, "step": 59750 }, { "epoch": 0.5976, "grad_norm": 15.1875, "grad_norm_var": 0.9978515625, "learning_rate": 0.0003, "loss": 11.0293, "loss/aux_loss": 0.04806381613016129, "loss/crossentropy": 2.784002923965454, "loss/logits": 0.823991322517395, "step": 59760 }, { "epoch": 0.5977, "grad_norm": 15.0625, "grad_norm_var": 0.25857747395833336, "learning_rate": 0.0003, "loss": 10.8259, "loss/aux_loss": 0.04807265438139439, "loss/crossentropy": 2.801326608657837, "loss/logits": 0.8200320184230805, "step": 59770 }, { "epoch": 0.5978, "grad_norm": 16.125, "grad_norm_var": 0.47146809895833336, "learning_rate": 0.0003, "loss": 10.9737, "loss/aux_loss": 0.04808081742376089, "loss/crossentropy": 2.856343114376068, "loss/logits": 0.8455061435699462, "step": 59780 }, { "epoch": 0.5979, "grad_norm": 18.125, "grad_norm_var": 1.2893229166666667, "learning_rate": 0.0003, "loss": 10.998, "loss/aux_loss": 0.048059838637709615, "loss/crossentropy": 2.553582340478897, "loss/logits": 0.7986224472522736, "step": 59790 }, { "epoch": 0.598, "grad_norm": 14.875, "grad_norm_var": 0.927978515625, "learning_rate": 0.0003, "loss": 11.0234, "loss/aux_loss": 0.048071127571165564, "loss/crossentropy": 2.8242238759994507, "loss/logits": 0.8418795853853226, "step": 59800 }, { "epoch": 0.5981, "grad_norm": 13.6875, "grad_norm_var": 0.2869140625, "learning_rate": 0.0003, "loss": 11.054, "loss/aux_loss": 0.04806619361042976, "loss/crossentropy": 2.7367840886116026, "loss/logits": 0.8270297706127167, "step": 59810 }, { "epoch": 0.5982, "grad_norm": 16.0, "grad_norm_var": 0.9035807291666667, "learning_rate": 0.0003, "loss": 10.8816, "loss/aux_loss": 0.048074010014534, "loss/crossentropy": 2.5995913684368133, "loss/logits": 0.843637329339981, "step": 59820 }, { "epoch": 0.5983, "grad_norm": 16.125, "grad_norm_var": 0.842431640625, "learning_rate": 0.0003, "loss": 10.9971, "loss/aux_loss": 0.04806699063628912, "loss/crossentropy": 2.66332545876503, "loss/logits": 0.7995132386684418, "step": 59830 }, { "epoch": 0.5984, "grad_norm": 14.0, "grad_norm_var": 1.2426432291666667, "learning_rate": 0.0003, "loss": 10.9088, "loss/aux_loss": 0.048076143860816954, "loss/crossentropy": 2.6590377569198607, "loss/logits": 0.8228894799947739, "step": 59840 }, { "epoch": 0.5985, "grad_norm": 14.3125, "grad_norm_var": 0.6864420572916666, "learning_rate": 0.0003, "loss": 11.0308, "loss/aux_loss": 0.04806055724620819, "loss/crossentropy": 2.734611225128174, "loss/logits": 0.8257469087839127, "step": 59850 }, { "epoch": 0.5986, "grad_norm": 13.6875, "grad_norm_var": 0.5270670572916667, "learning_rate": 0.0003, "loss": 10.9778, "loss/aux_loss": 0.048079499416053294, "loss/crossentropy": 2.8724292278289796, "loss/logits": 0.8287100523710251, "step": 59860 }, { "epoch": 0.5987, "grad_norm": 14.9375, "grad_norm_var": 2.584619140625, "learning_rate": 0.0003, "loss": 10.8757, "loss/aux_loss": 0.04806960113346577, "loss/crossentropy": 2.723463845252991, "loss/logits": 0.834690722823143, "step": 59870 }, { "epoch": 0.5988, "grad_norm": 16.625, "grad_norm_var": 1.8973307291666666, "learning_rate": 0.0003, "loss": 10.9234, "loss/aux_loss": 0.04806802999228239, "loss/crossentropy": 2.785686802864075, "loss/logits": 0.8314568728208542, "step": 59880 }, { "epoch": 0.5989, "grad_norm": 16.375, "grad_norm_var": 0.5518229166666667, "learning_rate": 0.0003, "loss": 10.9717, "loss/aux_loss": 0.04806967880576849, "loss/crossentropy": 2.7029653549194337, "loss/logits": 0.8201411485671997, "step": 59890 }, { "epoch": 0.599, "grad_norm": 16.125, "grad_norm_var": 0.5994140625, "learning_rate": 0.0003, "loss": 11.0052, "loss/aux_loss": 0.048081204667687415, "loss/crossentropy": 2.783577024936676, "loss/logits": 0.8226085513830185, "step": 59900 }, { "epoch": 0.5991, "grad_norm": 15.25, "grad_norm_var": 0.7863118489583333, "learning_rate": 0.0003, "loss": 11.0765, "loss/aux_loss": 0.0480691323056817, "loss/crossentropy": 2.6739238142967223, "loss/logits": 0.8000189930200576, "step": 59910 }, { "epoch": 0.5992, "grad_norm": 14.625, "grad_norm_var": 0.5018229166666667, "learning_rate": 0.0003, "loss": 10.948, "loss/aux_loss": 0.04806508179754019, "loss/crossentropy": 2.732639729976654, "loss/logits": 0.8182629913091659, "step": 59920 }, { "epoch": 0.5993, "grad_norm": 14.625, "grad_norm_var": 0.42233072916666664, "learning_rate": 0.0003, "loss": 10.9416, "loss/aux_loss": 0.04808191284537315, "loss/crossentropy": 2.773883467912674, "loss/logits": 0.8407058566808701, "step": 59930 }, { "epoch": 0.5994, "grad_norm": 15.625, "grad_norm_var": 0.33904622395833334, "learning_rate": 0.0003, "loss": 10.982, "loss/aux_loss": 0.04807113688439131, "loss/crossentropy": 2.7120142698287966, "loss/logits": 0.8165002167224884, "step": 59940 }, { "epoch": 0.5995, "grad_norm": 15.75, "grad_norm_var": 0.39993489583333336, "learning_rate": 0.0003, "loss": 10.9997, "loss/aux_loss": 0.04807593729346991, "loss/crossentropy": 2.646089047193527, "loss/logits": 0.8073794126510621, "step": 59950 }, { "epoch": 0.5996, "grad_norm": 15.3125, "grad_norm_var": 0.32928059895833334, "learning_rate": 0.0003, "loss": 11.0826, "loss/aux_loss": 0.048062044009566304, "loss/crossentropy": 2.8690964460372923, "loss/logits": 0.8507906019687652, "step": 59960 }, { "epoch": 0.5997, "grad_norm": 15.25, "grad_norm_var": 0.365869140625, "learning_rate": 0.0003, "loss": 10.9892, "loss/aux_loss": 0.048076673224568364, "loss/crossentropy": 2.638966166973114, "loss/logits": 0.8349994659423828, "step": 59970 }, { "epoch": 0.5998, "grad_norm": 14.3125, "grad_norm_var": 0.5264973958333333, "learning_rate": 0.0003, "loss": 10.9127, "loss/aux_loss": 0.0480753380805254, "loss/crossentropy": 2.7292301952838898, "loss/logits": 0.8326119810342789, "step": 59980 }, { "epoch": 0.5999, "grad_norm": 15.0, "grad_norm_var": 0.39479166666666665, "learning_rate": 0.0003, "loss": 11.0337, "loss/aux_loss": 0.04805723261088133, "loss/crossentropy": 2.7440689623355867, "loss/logits": 0.8415668040513993, "step": 59990 }, { "epoch": 0.6, "grad_norm": 14.5625, "grad_norm_var": 0.41847330729166665, "learning_rate": 0.0003, "loss": 11.1506, "loss/aux_loss": 0.04806195814162493, "loss/crossentropy": 2.7931241512298586, "loss/logits": 0.8480271577835083, "step": 60000 }, { "epoch": 0.6001, "grad_norm": 14.6875, "grad_norm_var": 3.018684895833333, "learning_rate": 0.0003, "loss": 10.8942, "loss/aux_loss": 0.04808544497936964, "loss/crossentropy": 2.8031197428703307, "loss/logits": 0.8436632961034775, "step": 60010 }, { "epoch": 0.6002, "grad_norm": 14.375, "grad_norm_var": 0.6575358072916667, "learning_rate": 0.0003, "loss": 11.009, "loss/aux_loss": 0.048070221580564974, "loss/crossentropy": 2.835988187789917, "loss/logits": 0.8101363390684128, "step": 60020 }, { "epoch": 0.6003, "grad_norm": 17.625, "grad_norm_var": 0.9681640625, "learning_rate": 0.0003, "loss": 11.0825, "loss/aux_loss": 0.04806274306029081, "loss/crossentropy": 2.634697949886322, "loss/logits": 0.8300037115812302, "step": 60030 }, { "epoch": 0.6004, "grad_norm": 15.4375, "grad_norm_var": 1.369775390625, "learning_rate": 0.0003, "loss": 10.8961, "loss/aux_loss": 0.04807508382946253, "loss/crossentropy": 2.7980096697807313, "loss/logits": 0.8517778217792511, "step": 60040 }, { "epoch": 0.6005, "grad_norm": 15.6875, "grad_norm_var": 1.1749348958333334, "learning_rate": 0.0003, "loss": 10.8726, "loss/aux_loss": 0.048068697564303875, "loss/crossentropy": 2.5798544883728027, "loss/logits": 0.7742179721593857, "step": 60050 }, { "epoch": 0.6006, "grad_norm": 16.625, "grad_norm_var": 1.3494791666666666, "learning_rate": 0.0003, "loss": 11.0209, "loss/aux_loss": 0.0480728967115283, "loss/crossentropy": 2.7089627504348757, "loss/logits": 0.8144480526447296, "step": 60060 }, { "epoch": 0.6007, "grad_norm": 15.25, "grad_norm_var": 1.4589680989583333, "learning_rate": 0.0003, "loss": 10.7924, "loss/aux_loss": 0.04806405883282423, "loss/crossentropy": 2.63613708615303, "loss/logits": 0.82077776491642, "step": 60070 }, { "epoch": 0.6008, "grad_norm": 14.625, "grad_norm_var": 0.13214518229166666, "learning_rate": 0.0003, "loss": 10.8491, "loss/aux_loss": 0.04806582164019346, "loss/crossentropy": 2.709311383962631, "loss/logits": 0.8013584047555924, "step": 60080 }, { "epoch": 0.6009, "grad_norm": 14.75, "grad_norm_var": 0.383837890625, "learning_rate": 0.0003, "loss": 11.0591, "loss/aux_loss": 0.04806982241570949, "loss/crossentropy": 2.7307440638542175, "loss/logits": 0.8089520663022995, "step": 60090 }, { "epoch": 0.601, "grad_norm": 14.6875, "grad_norm_var": 0.47552083333333334, "learning_rate": 0.0003, "loss": 11.0038, "loss/aux_loss": 0.04807723425328732, "loss/crossentropy": 2.713645851612091, "loss/logits": 0.8089924275875091, "step": 60100 }, { "epoch": 0.6011, "grad_norm": 15.3125, "grad_norm_var": 1.0103515625, "learning_rate": 0.0003, "loss": 10.8574, "loss/aux_loss": 0.0480644965544343, "loss/crossentropy": 2.7398121774196627, "loss/logits": 0.8082275360822677, "step": 60110 }, { "epoch": 0.6012, "grad_norm": 15.375, "grad_norm_var": 0.6893229166666667, "learning_rate": 0.0003, "loss": 10.923, "loss/aux_loss": 0.04806880187243223, "loss/crossentropy": 2.7813449084758757, "loss/logits": 0.8685790807008743, "step": 60120 }, { "epoch": 0.6013, "grad_norm": 14.25, "grad_norm_var": 0.377587890625, "learning_rate": 0.0003, "loss": 11.167, "loss/aux_loss": 0.04805507734417915, "loss/crossentropy": 2.8507793068885805, "loss/logits": 0.8408539682626724, "step": 60130 }, { "epoch": 0.6014, "grad_norm": 16.25, "grad_norm_var": 0.5222493489583333, "learning_rate": 0.0003, "loss": 10.9877, "loss/aux_loss": 0.048066430166363715, "loss/crossentropy": 2.662059265375137, "loss/logits": 0.8354754239320755, "step": 60140 }, { "epoch": 0.6015, "grad_norm": 14.875, "grad_norm_var": 1.5430826822916666, "learning_rate": 0.0003, "loss": 11.0714, "loss/aux_loss": 0.04806972537189722, "loss/crossentropy": 2.728497040271759, "loss/logits": 0.8564503043889999, "step": 60150 }, { "epoch": 0.6016, "grad_norm": 15.0625, "grad_norm_var": 1.3777180989583333, "learning_rate": 0.0003, "loss": 10.8885, "loss/aux_loss": 0.0480747090652585, "loss/crossentropy": 2.6242256700992583, "loss/logits": 0.8221401393413543, "step": 60160 }, { "epoch": 0.6017, "grad_norm": 14.3125, "grad_norm_var": 0.440869140625, "learning_rate": 0.0003, "loss": 10.9851, "loss/aux_loss": 0.04808126259595156, "loss/crossentropy": 2.571116214990616, "loss/logits": 0.8445754140615463, "step": 60170 }, { "epoch": 0.6018, "grad_norm": 15.125, "grad_norm_var": 0.6421875, "learning_rate": 0.0003, "loss": 11.1588, "loss/aux_loss": 0.0480703953653574, "loss/crossentropy": 2.6313143491744997, "loss/logits": 0.8429572701454162, "step": 60180 }, { "epoch": 0.6019, "grad_norm": 15.0625, "grad_norm_var": 0.4825358072916667, "learning_rate": 0.0003, "loss": 10.9319, "loss/aux_loss": 0.048066435009241106, "loss/crossentropy": 2.470182943344116, "loss/logits": 0.7674608916044235, "step": 60190 }, { "epoch": 0.602, "grad_norm": 14.5625, "grad_norm_var": 0.5676920572916667, "learning_rate": 0.0003, "loss": 10.865, "loss/aux_loss": 0.048062361776828766, "loss/crossentropy": 2.652854871749878, "loss/logits": 0.8476852804422379, "step": 60200 }, { "epoch": 0.6021, "grad_norm": 15.75, "grad_norm_var": 0.644775390625, "learning_rate": 0.0003, "loss": 10.9103, "loss/aux_loss": 0.04806744400411844, "loss/crossentropy": 2.886264109611511, "loss/logits": 0.8491258502006531, "step": 60210 }, { "epoch": 0.6022, "grad_norm": 16.25, "grad_norm_var": 15.070947265625, "learning_rate": 0.0003, "loss": 10.9318, "loss/aux_loss": 0.048081173188984395, "loss/crossentropy": 2.7197977185249327, "loss/logits": 0.8118861824274063, "step": 60220 }, { "epoch": 0.6023, "grad_norm": 16.875, "grad_norm_var": 15.35703125, "learning_rate": 0.0003, "loss": 10.8425, "loss/aux_loss": 0.04807243477553129, "loss/crossentropy": 2.675804728269577, "loss/logits": 0.7933743417263031, "step": 60230 }, { "epoch": 0.6024, "grad_norm": 15.6875, "grad_norm_var": 0.8639973958333333, "learning_rate": 0.0003, "loss": 10.9663, "loss/aux_loss": 0.048066473379731176, "loss/crossentropy": 2.5603831708431244, "loss/logits": 0.8213741898536682, "step": 60240 }, { "epoch": 0.6025, "grad_norm": 14.75, "grad_norm_var": 0.6863932291666667, "learning_rate": 0.0003, "loss": 11.0783, "loss/aux_loss": 0.04807442165911198, "loss/crossentropy": 2.759167742729187, "loss/logits": 0.8260885119438172, "step": 60250 }, { "epoch": 0.6026, "grad_norm": 15.625, "grad_norm_var": 0.44724934895833335, "learning_rate": 0.0003, "loss": 10.9391, "loss/aux_loss": 0.048061834275722505, "loss/crossentropy": 2.830324959754944, "loss/logits": 0.8341899156570435, "step": 60260 }, { "epoch": 0.6027, "grad_norm": 14.125, "grad_norm_var": 0.5426432291666666, "learning_rate": 0.0003, "loss": 10.9735, "loss/aux_loss": 0.04807800371199846, "loss/crossentropy": 2.710289627313614, "loss/logits": 0.8235593348741531, "step": 60270 }, { "epoch": 0.6028, "grad_norm": 12.9375, "grad_norm_var": 0.8927083333333333, "learning_rate": 0.0003, "loss": 10.8861, "loss/aux_loss": 0.04806134235113859, "loss/crossentropy": 2.702167409658432, "loss/logits": 0.8340632915496826, "step": 60280 }, { "epoch": 0.6029, "grad_norm": 15.875, "grad_norm_var": 1.0098307291666666, "learning_rate": 0.0003, "loss": 10.9902, "loss/aux_loss": 0.04806969091296196, "loss/crossentropy": 2.7321683406829833, "loss/logits": 0.834402334690094, "step": 60290 }, { "epoch": 0.603, "grad_norm": 15.625, "grad_norm_var": 0.6902180989583333, "learning_rate": 0.0003, "loss": 10.9249, "loss/aux_loss": 0.048069990053772924, "loss/crossentropy": 2.5171724021434785, "loss/logits": 0.7993605226278305, "step": 60300 }, { "epoch": 0.6031, "grad_norm": 14.0, "grad_norm_var": 0.43605143229166665, "learning_rate": 0.0003, "loss": 10.9595, "loss/aux_loss": 0.04806102756410837, "loss/crossentropy": 2.707091200351715, "loss/logits": 0.8481060534715652, "step": 60310 }, { "epoch": 0.6032, "grad_norm": 14.5625, "grad_norm_var": 0.2884765625, "learning_rate": 0.0003, "loss": 10.9526, "loss/aux_loss": 0.04808544833213091, "loss/crossentropy": 2.7442154586315155, "loss/logits": 0.8329938769340515, "step": 60320 }, { "epoch": 0.6033, "grad_norm": 17.125, "grad_norm_var": 0.6363118489583334, "learning_rate": 0.0003, "loss": 10.8661, "loss/aux_loss": 0.04806406293064356, "loss/crossentropy": 2.511446052789688, "loss/logits": 0.8104943811893464, "step": 60330 }, { "epoch": 0.6034, "grad_norm": 15.375, "grad_norm_var": 0.9984375, "learning_rate": 0.0003, "loss": 10.7618, "loss/aux_loss": 0.04807962328195572, "loss/crossentropy": 2.5813129425048826, "loss/logits": 0.8011042684316635, "step": 60340 }, { "epoch": 0.6035, "grad_norm": 16.0, "grad_norm_var": 0.7387858072916667, "learning_rate": 0.0003, "loss": 11.0578, "loss/aux_loss": 0.04808678813278675, "loss/crossentropy": 2.71219407916069, "loss/logits": 0.8353963553905487, "step": 60350 }, { "epoch": 0.6036, "grad_norm": 14.4375, "grad_norm_var": 0.5567708333333333, "learning_rate": 0.0003, "loss": 10.9285, "loss/aux_loss": 0.048061441816389563, "loss/crossentropy": 2.757435607910156, "loss/logits": 0.8533197224140168, "step": 60360 }, { "epoch": 0.6037, "grad_norm": 15.0, "grad_norm_var": 0.30514322916666664, "learning_rate": 0.0003, "loss": 10.9645, "loss/aux_loss": 0.04807481989264488, "loss/crossentropy": 2.685833466053009, "loss/logits": 0.8134330004453659, "step": 60370 }, { "epoch": 0.6038, "grad_norm": 15.25, "grad_norm_var": 0.3848795572916667, "learning_rate": 0.0003, "loss": 11.053, "loss/aux_loss": 0.04806959424167871, "loss/crossentropy": 2.602872520685196, "loss/logits": 0.8216569721698761, "step": 60380 }, { "epoch": 0.6039, "grad_norm": 14.125, "grad_norm_var": 0.6390462239583333, "learning_rate": 0.0003, "loss": 10.9354, "loss/aux_loss": 0.04806573148816824, "loss/crossentropy": 2.7883040606975555, "loss/logits": 0.8433178305625916, "step": 60390 }, { "epoch": 0.604, "grad_norm": 15.625, "grad_norm_var": 0.424853515625, "learning_rate": 0.0003, "loss": 10.9932, "loss/aux_loss": 0.0480732886120677, "loss/crossentropy": 2.6341083645820618, "loss/logits": 0.8201987504959106, "step": 60400 }, { "epoch": 0.6041, "grad_norm": 15.4375, "grad_norm_var": 4.114176432291667, "learning_rate": 0.0003, "loss": 10.8744, "loss/aux_loss": 0.048076138645410535, "loss/crossentropy": 2.8022005796432494, "loss/logits": 0.8076352566480637, "step": 60410 }, { "epoch": 0.6042, "grad_norm": 16.0, "grad_norm_var": 5.117708333333334, "learning_rate": 0.0003, "loss": 10.7988, "loss/aux_loss": 0.048061057738959787, "loss/crossentropy": 2.6338140249252318, "loss/logits": 0.7957394987344741, "step": 60420 }, { "epoch": 0.6043, "grad_norm": 14.8125, "grad_norm_var": 4.590869140625, "learning_rate": 0.0003, "loss": 11.0153, "loss/aux_loss": 0.04807674996554852, "loss/crossentropy": 2.7275028109550474, "loss/logits": 0.8166062444448471, "step": 60430 }, { "epoch": 0.6044, "grad_norm": 14.875, "grad_norm_var": 0.3009765625, "learning_rate": 0.0003, "loss": 10.9633, "loss/aux_loss": 0.04805723633617163, "loss/crossentropy": 2.7013749897480013, "loss/logits": 0.8536836624145507, "step": 60440 }, { "epoch": 0.6045, "grad_norm": 14.3125, "grad_norm_var": 0.5106770833333333, "learning_rate": 0.0003, "loss": 10.8347, "loss/aux_loss": 0.04807833768427372, "loss/crossentropy": 2.5935844779014587, "loss/logits": 0.8088052183389663, "step": 60450 }, { "epoch": 0.6046, "grad_norm": 15.0, "grad_norm_var": 0.478125, "learning_rate": 0.0003, "loss": 10.9532, "loss/aux_loss": 0.048071070946753024, "loss/crossentropy": 2.699798661470413, "loss/logits": 0.824173653125763, "step": 60460 }, { "epoch": 0.6047, "grad_norm": 16.875, "grad_norm_var": 0.451416015625, "learning_rate": 0.0003, "loss": 11.0419, "loss/aux_loss": 0.048078490793704985, "loss/crossentropy": 2.6727247834205627, "loss/logits": 0.8337906152009964, "step": 60470 }, { "epoch": 0.6048, "grad_norm": 14.8125, "grad_norm_var": 0.66875, "learning_rate": 0.0003, "loss": 10.9652, "loss/aux_loss": 0.048067685589194296, "loss/crossentropy": 2.5421866893768312, "loss/logits": 0.8169512122869491, "step": 60480 }, { "epoch": 0.6049, "grad_norm": 14.3125, "grad_norm_var": 0.5202473958333333, "learning_rate": 0.0003, "loss": 10.943, "loss/aux_loss": 0.04807629156857729, "loss/crossentropy": 2.7076221227645876, "loss/logits": 0.8126240253448487, "step": 60490 }, { "epoch": 0.605, "grad_norm": 16.375, "grad_norm_var": 0.4942708333333333, "learning_rate": 0.0003, "loss": 10.9824, "loss/aux_loss": 0.04806436561048031, "loss/crossentropy": 2.716249758005142, "loss/logits": 0.8121423751115799, "step": 60500 }, { "epoch": 0.6051, "grad_norm": 14.6875, "grad_norm_var": 0.33203125, "learning_rate": 0.0003, "loss": 10.8244, "loss/aux_loss": 0.048078736290335655, "loss/crossentropy": 2.5074705123901366, "loss/logits": 0.7894505262374878, "step": 60510 }, { "epoch": 0.6052, "grad_norm": 14.75, "grad_norm_var": 0.6769368489583333, "learning_rate": 0.0003, "loss": 10.8273, "loss/aux_loss": 0.04807204809039831, "loss/crossentropy": 2.6767341911792757, "loss/logits": 0.8195008933544159, "step": 60520 }, { "epoch": 0.6053, "grad_norm": 14.0625, "grad_norm_var": 0.9587076822916667, "learning_rate": 0.0003, "loss": 10.85, "loss/aux_loss": 0.04806831441819668, "loss/crossentropy": 2.8315181374549865, "loss/logits": 0.8047915935516358, "step": 60530 }, { "epoch": 0.6054, "grad_norm": 14.4375, "grad_norm_var": 0.4041015625, "learning_rate": 0.0003, "loss": 10.8398, "loss/aux_loss": 0.048069612868130204, "loss/crossentropy": 2.657101058959961, "loss/logits": 0.8052042782306671, "step": 60540 }, { "epoch": 0.6055, "grad_norm": 14.625, "grad_norm_var": 0.5738932291666666, "learning_rate": 0.0003, "loss": 10.8523, "loss/aux_loss": 0.04806829355657101, "loss/crossentropy": 2.584134030342102, "loss/logits": 0.7640033394098282, "step": 60550 }, { "epoch": 0.6056, "grad_norm": 15.25, "grad_norm_var": 0.375244140625, "learning_rate": 0.0003, "loss": 10.9682, "loss/aux_loss": 0.04807130675762892, "loss/crossentropy": 2.680067926645279, "loss/logits": 0.8243839502334595, "step": 60560 }, { "epoch": 0.6057, "grad_norm": 15.0625, "grad_norm_var": 0.43697916666666664, "learning_rate": 0.0003, "loss": 10.912, "loss/aux_loss": 0.0480777146294713, "loss/crossentropy": 2.6226901173591615, "loss/logits": 0.7878055989742279, "step": 60570 }, { "epoch": 0.6058, "grad_norm": 15.0625, "grad_norm_var": 0.3062337239583333, "learning_rate": 0.0003, "loss": 10.9453, "loss/aux_loss": 0.048065618798136714, "loss/crossentropy": 2.6935440480709074, "loss/logits": 0.8214478433132172, "step": 60580 }, { "epoch": 0.6059, "grad_norm": 15.6875, "grad_norm_var": 1.1348795572916666, "learning_rate": 0.0003, "loss": 10.9734, "loss/aux_loss": 0.0480723200365901, "loss/crossentropy": 2.7571221947669984, "loss/logits": 0.8032363146543503, "step": 60590 }, { "epoch": 0.606, "grad_norm": 14.5625, "grad_norm_var": 1.2327473958333333, "learning_rate": 0.0003, "loss": 10.88, "loss/aux_loss": 0.04806821886450052, "loss/crossentropy": 2.661402940750122, "loss/logits": 0.7886179000139236, "step": 60600 }, { "epoch": 0.6061, "grad_norm": 15.0, "grad_norm_var": 0.5450520833333333, "learning_rate": 0.0003, "loss": 10.8687, "loss/aux_loss": 0.048080092296004295, "loss/crossentropy": 2.4920336484909056, "loss/logits": 0.7764117568731308, "step": 60610 }, { "epoch": 0.6062, "grad_norm": 15.3125, "grad_norm_var": 0.428125, "learning_rate": 0.0003, "loss": 11.0299, "loss/aux_loss": 0.04806080795824528, "loss/crossentropy": 2.8082796573638915, "loss/logits": 0.8537631243467331, "step": 60620 }, { "epoch": 0.6063, "grad_norm": 16.25, "grad_norm_var": 0.29921875, "learning_rate": 0.0003, "loss": 11.0316, "loss/aux_loss": 0.04806738365441561, "loss/crossentropy": 2.7339873909950256, "loss/logits": 0.7961423873901368, "step": 60630 }, { "epoch": 0.6064, "grad_norm": 15.0625, "grad_norm_var": 0.390625, "learning_rate": 0.0003, "loss": 10.8029, "loss/aux_loss": 0.04806647207587957, "loss/crossentropy": 2.7505694150924684, "loss/logits": 0.8106872260570526, "step": 60640 }, { "epoch": 0.6065, "grad_norm": 18.5, "grad_norm_var": 0.990478515625, "learning_rate": 0.0003, "loss": 10.9839, "loss/aux_loss": 0.0480714239180088, "loss/crossentropy": 2.696449559926987, "loss/logits": 0.8294881820678711, "step": 60650 }, { "epoch": 0.6066, "grad_norm": 16.625, "grad_norm_var": 1.271728515625, "learning_rate": 0.0003, "loss": 10.7504, "loss/aux_loss": 0.04806580077856779, "loss/crossentropy": 2.5379061937332152, "loss/logits": 0.7808073431253433, "step": 60660 }, { "epoch": 0.6067, "grad_norm": 15.125, "grad_norm_var": 0.8075358072916666, "learning_rate": 0.0003, "loss": 11.0137, "loss/aux_loss": 0.04806665126234293, "loss/crossentropy": 2.7637165009975435, "loss/logits": 0.8278068244457245, "step": 60670 }, { "epoch": 0.6068, "grad_norm": 14.75, "grad_norm_var": 0.43917643229166664, "learning_rate": 0.0003, "loss": 10.8688, "loss/aux_loss": 0.04807304907590151, "loss/crossentropy": 2.606863057613373, "loss/logits": 0.8165640115737915, "step": 60680 }, { "epoch": 0.6069, "grad_norm": 15.0625, "grad_norm_var": 0.3447265625, "learning_rate": 0.0003, "loss": 10.9712, "loss/aux_loss": 0.04807915091514588, "loss/crossentropy": 2.7368947744369505, "loss/logits": 0.8139879643917084, "step": 60690 }, { "epoch": 0.607, "grad_norm": 13.4375, "grad_norm_var": 0.539697265625, "learning_rate": 0.0003, "loss": 10.8863, "loss/aux_loss": 0.04805984944105148, "loss/crossentropy": 2.561430436372757, "loss/logits": 0.7969212979078293, "step": 60700 }, { "epoch": 0.6071, "grad_norm": 16.125, "grad_norm_var": 0.8063639322916667, "learning_rate": 0.0003, "loss": 11.1143, "loss/aux_loss": 0.048070601746439935, "loss/crossentropy": 2.724953460693359, "loss/logits": 0.8761008381843567, "step": 60710 }, { "epoch": 0.6072, "grad_norm": 15.8125, "grad_norm_var": 0.5405598958333333, "learning_rate": 0.0003, "loss": 11.067, "loss/aux_loss": 0.04806268252432346, "loss/crossentropy": 2.722470408678055, "loss/logits": 0.8286719590425491, "step": 60720 }, { "epoch": 0.6073, "grad_norm": 16.75, "grad_norm_var": 0.48292643229166665, "learning_rate": 0.0003, "loss": 10.9106, "loss/aux_loss": 0.048081399872899055, "loss/crossentropy": 2.7040891528129576, "loss/logits": 0.8151125907897949, "step": 60730 }, { "epoch": 0.6074, "grad_norm": 14.9375, "grad_norm_var": 0.6304524739583334, "learning_rate": 0.0003, "loss": 11.0008, "loss/aux_loss": 0.04806943740695715, "loss/crossentropy": 2.750800085067749, "loss/logits": 0.811496239900589, "step": 60740 }, { "epoch": 0.6075, "grad_norm": 14.875, "grad_norm_var": 0.5072265625, "learning_rate": 0.0003, "loss": 10.9407, "loss/aux_loss": 0.048066843301057816, "loss/crossentropy": 2.5790812611579894, "loss/logits": 0.8062135219573975, "step": 60750 }, { "epoch": 0.6076, "grad_norm": 15.125, "grad_norm_var": 0.4410807291666667, "learning_rate": 0.0003, "loss": 10.938, "loss/aux_loss": 0.04806975163519382, "loss/crossentropy": 2.669712710380554, "loss/logits": 0.8050375521183014, "step": 60760 }, { "epoch": 0.6077, "grad_norm": 16.5, "grad_norm_var": 0.5805826822916667, "learning_rate": 0.0003, "loss": 10.9754, "loss/aux_loss": 0.04807074461132288, "loss/crossentropy": 2.7768321573734283, "loss/logits": 0.8277645260095596, "step": 60770 }, { "epoch": 0.6078, "grad_norm": 14.375, "grad_norm_var": 0.8393229166666667, "learning_rate": 0.0003, "loss": 10.9727, "loss/aux_loss": 0.04806243553757668, "loss/crossentropy": 2.7435842633247374, "loss/logits": 0.8338838994503022, "step": 60780 }, { "epoch": 0.6079, "grad_norm": 14.9375, "grad_norm_var": 0.7704264322916666, "learning_rate": 0.0003, "loss": 10.8381, "loss/aux_loss": 0.048067687265574935, "loss/crossentropy": 2.695614975690842, "loss/logits": 0.7965478479862214, "step": 60790 }, { "epoch": 0.608, "grad_norm": 16.375, "grad_norm_var": 1.0828125, "learning_rate": 0.0003, "loss": 10.9719, "loss/aux_loss": 0.04806322492659092, "loss/crossentropy": 2.747288691997528, "loss/logits": 0.8433381974697113, "step": 60800 }, { "epoch": 0.6081, "grad_norm": 14.9375, "grad_norm_var": 0.7783854166666667, "learning_rate": 0.0003, "loss": 10.9004, "loss/aux_loss": 0.04807988330721855, "loss/crossentropy": 2.7076495826244353, "loss/logits": 0.83790722489357, "step": 60810 }, { "epoch": 0.6082, "grad_norm": 15.9375, "grad_norm_var": 0.40305989583333335, "learning_rate": 0.0003, "loss": 11.0397, "loss/aux_loss": 0.0480671776458621, "loss/crossentropy": 2.6762712955474854, "loss/logits": 0.8186611771583557, "step": 60820 }, { "epoch": 0.6083, "grad_norm": 15.5625, "grad_norm_var": 0.33307291666666666, "learning_rate": 0.0003, "loss": 11.0564, "loss/aux_loss": 0.04807759691029787, "loss/crossentropy": 2.673157799243927, "loss/logits": 0.8050933957099915, "step": 60830 }, { "epoch": 0.6084, "grad_norm": 15.125, "grad_norm_var": 0.8190104166666666, "learning_rate": 0.0003, "loss": 10.8996, "loss/aux_loss": 0.048066492564976214, "loss/crossentropy": 2.7351067125797273, "loss/logits": 0.8209027826786042, "step": 60840 }, { "epoch": 0.6085, "grad_norm": 15.3125, "grad_norm_var": 0.1978515625, "learning_rate": 0.0003, "loss": 11.0685, "loss/aux_loss": 0.04806740805506706, "loss/crossentropy": 2.7408780336380003, "loss/logits": 0.8286194503307343, "step": 60850 }, { "epoch": 0.6086, "grad_norm": 14.25, "grad_norm_var": 0.15636393229166667, "learning_rate": 0.0003, "loss": 10.9519, "loss/aux_loss": 0.048057135008275506, "loss/crossentropy": 2.708062160015106, "loss/logits": 0.8348789572715759, "step": 60860 }, { "epoch": 0.6087, "grad_norm": 15.0625, "grad_norm_var": 0.245556640625, "learning_rate": 0.0003, "loss": 10.9066, "loss/aux_loss": 0.04808010403066874, "loss/crossentropy": 2.7931439101696016, "loss/logits": 0.823428162932396, "step": 60870 }, { "epoch": 0.6088, "grad_norm": 14.0625, "grad_norm_var": 0.3282389322916667, "learning_rate": 0.0003, "loss": 11.0119, "loss/aux_loss": 0.04807362128049135, "loss/crossentropy": 2.803303599357605, "loss/logits": 0.8527332812547683, "step": 60880 }, { "epoch": 0.6089, "grad_norm": 14.625, "grad_norm_var": 0.40208333333333335, "learning_rate": 0.0003, "loss": 10.9771, "loss/aux_loss": 0.04806482549756765, "loss/crossentropy": 2.5410806000232697, "loss/logits": 0.8482417315244675, "step": 60890 }, { "epoch": 0.609, "grad_norm": 15.125, "grad_norm_var": 0.276025390625, "learning_rate": 0.0003, "loss": 11.0426, "loss/aux_loss": 0.04806209746748209, "loss/crossentropy": 2.8440731525421143, "loss/logits": 0.8651130110025406, "step": 60900 }, { "epoch": 0.6091, "grad_norm": 15.0625, "grad_norm_var": 0.162744140625, "learning_rate": 0.0003, "loss": 10.8486, "loss/aux_loss": 0.048080362193286416, "loss/crossentropy": 2.608972841501236, "loss/logits": 0.8105991780757904, "step": 60910 }, { "epoch": 0.6092, "grad_norm": 15.6875, "grad_norm_var": 0.20807291666666666, "learning_rate": 0.0003, "loss": 10.9675, "loss/aux_loss": 0.04806051570922136, "loss/crossentropy": 2.6756951212882996, "loss/logits": 0.7980956196784973, "step": 60920 }, { "epoch": 0.6093, "grad_norm": 16.625, "grad_norm_var": 0.2994140625, "learning_rate": 0.0003, "loss": 11.0392, "loss/aux_loss": 0.048062234185636044, "loss/crossentropy": 2.738618332147598, "loss/logits": 0.8416084438562393, "step": 60930 }, { "epoch": 0.6094, "grad_norm": 16.0, "grad_norm_var": 0.4228515625, "learning_rate": 0.0003, "loss": 11.0543, "loss/aux_loss": 0.04806976187974214, "loss/crossentropy": 2.698890858888626, "loss/logits": 0.8149118602275849, "step": 60940 }, { "epoch": 0.6095, "grad_norm": 14.3125, "grad_norm_var": 0.46144205729166665, "learning_rate": 0.0003, "loss": 10.9536, "loss/aux_loss": 0.0480699822306633, "loss/crossentropy": 2.839012861251831, "loss/logits": 0.8258768379688263, "step": 60950 }, { "epoch": 0.6096, "grad_norm": 15.0, "grad_norm_var": 0.5419108072916666, "learning_rate": 0.0003, "loss": 10.9647, "loss/aux_loss": 0.048070922307670114, "loss/crossentropy": 2.738613134622574, "loss/logits": 0.8129228353500366, "step": 60960 }, { "epoch": 0.6097, "grad_norm": 17.625, "grad_norm_var": 0.9274576822916667, "learning_rate": 0.0003, "loss": 10.9578, "loss/aux_loss": 0.04805801305919886, "loss/crossentropy": 2.5094609320163728, "loss/logits": 0.8016722679138184, "step": 60970 }, { "epoch": 0.6098, "grad_norm": 15.8125, "grad_norm_var": 0.7968098958333333, "learning_rate": 0.0003, "loss": 10.9871, "loss/aux_loss": 0.048079372942447664, "loss/crossentropy": 2.664980614185333, "loss/logits": 0.8202985137701034, "step": 60980 }, { "epoch": 0.6099, "grad_norm": 15.875, "grad_norm_var": 0.5782389322916667, "learning_rate": 0.0003, "loss": 11.0115, "loss/aux_loss": 0.04807392340153456, "loss/crossentropy": 2.7239008784294128, "loss/logits": 0.831571900844574, "step": 60990 }, { "epoch": 0.61, "grad_norm": 13.5, "grad_norm_var": 0.58828125, "learning_rate": 0.0003, "loss": 10.8612, "loss/aux_loss": 0.04806995764374733, "loss/crossentropy": 2.6622074127197264, "loss/logits": 0.7961272418498992, "step": 61000 }, { "epoch": 0.6101, "grad_norm": 16.0, "grad_norm_var": 0.6364420572916667, "learning_rate": 0.0003, "loss": 10.8923, "loss/aux_loss": 0.04806650709360838, "loss/crossentropy": 2.7559936583042144, "loss/logits": 0.8345979481935502, "step": 61010 }, { "epoch": 0.6102, "grad_norm": 15.5625, "grad_norm_var": 2.0155598958333334, "learning_rate": 0.0003, "loss": 10.8705, "loss/aux_loss": 0.04807249642908573, "loss/crossentropy": 2.591252303123474, "loss/logits": 0.7885099232196808, "step": 61020 }, { "epoch": 0.6103, "grad_norm": 14.9375, "grad_norm_var": 2.109114583333333, "learning_rate": 0.0003, "loss": 10.9392, "loss/aux_loss": 0.04807036854326725, "loss/crossentropy": 2.685466194152832, "loss/logits": 0.8492469847202301, "step": 61030 }, { "epoch": 0.6104, "grad_norm": 14.4375, "grad_norm_var": 0.6700520833333333, "learning_rate": 0.0003, "loss": 10.9431, "loss/aux_loss": 0.04806849993765354, "loss/crossentropy": 2.6850741684436796, "loss/logits": 0.8018898099660874, "step": 61040 }, { "epoch": 0.6105, "grad_norm": 14.5, "grad_norm_var": 0.3611979166666667, "learning_rate": 0.0003, "loss": 10.8835, "loss/aux_loss": 0.04807262290269136, "loss/crossentropy": 2.704542863368988, "loss/logits": 0.834966391324997, "step": 61050 }, { "epoch": 0.6106, "grad_norm": 15.5, "grad_norm_var": 0.350244140625, "learning_rate": 0.0003, "loss": 10.9628, "loss/aux_loss": 0.04807685576379299, "loss/crossentropy": 2.6365856409072874, "loss/logits": 0.8094233006238938, "step": 61060 }, { "epoch": 0.6107, "grad_norm": 14.8125, "grad_norm_var": 0.3317057291666667, "learning_rate": 0.0003, "loss": 11.0145, "loss/aux_loss": 0.048073244467377665, "loss/crossentropy": 2.844811725616455, "loss/logits": 0.836044305562973, "step": 61070 }, { "epoch": 0.6108, "grad_norm": 16.5, "grad_norm_var": 1.5063639322916667, "learning_rate": 0.0003, "loss": 10.8371, "loss/aux_loss": 0.04806085731834173, "loss/crossentropy": 2.7075256764888764, "loss/logits": 0.8152056097984314, "step": 61080 }, { "epoch": 0.6109, "grad_norm": 17.0, "grad_norm_var": 0.8766764322916667, "learning_rate": 0.0003, "loss": 10.8974, "loss/aux_loss": 0.04807128459215164, "loss/crossentropy": 2.675181972980499, "loss/logits": 0.8163933247327805, "step": 61090 }, { "epoch": 0.611, "grad_norm": 15.625, "grad_norm_var": 0.8407389322916666, "learning_rate": 0.0003, "loss": 10.8567, "loss/aux_loss": 0.04806245286017656, "loss/crossentropy": 2.7227718472480773, "loss/logits": 0.8572416335344315, "step": 61100 }, { "epoch": 0.6111, "grad_norm": 17.0, "grad_norm_var": 0.7202473958333333, "learning_rate": 0.0003, "loss": 10.8851, "loss/aux_loss": 0.048066180758178235, "loss/crossentropy": 2.744245910644531, "loss/logits": 0.8281572759151459, "step": 61110 }, { "epoch": 0.6112, "grad_norm": 13.875, "grad_norm_var": 0.7058430989583333, "learning_rate": 0.0003, "loss": 10.974, "loss/aux_loss": 0.04806941282004118, "loss/crossentropy": 2.788532388210297, "loss/logits": 0.8086911767721177, "step": 61120 }, { "epoch": 0.6113, "grad_norm": 13.9375, "grad_norm_var": 2.058854166666667, "learning_rate": 0.0003, "loss": 10.9633, "loss/aux_loss": 0.048071177862584594, "loss/crossentropy": 2.5584035396575926, "loss/logits": 0.7963971257209778, "step": 61130 }, { "epoch": 0.6114, "grad_norm": 15.5, "grad_norm_var": 2.2919270833333334, "learning_rate": 0.0003, "loss": 11.0252, "loss/aux_loss": 0.048073113150894645, "loss/crossentropy": 2.816734492778778, "loss/logits": 0.8178564816713333, "step": 61140 }, { "epoch": 0.6115, "grad_norm": 14.9375, "grad_norm_var": 1.1398274739583334, "learning_rate": 0.0003, "loss": 11.1053, "loss/aux_loss": 0.04807866048067808, "loss/crossentropy": 2.7244319319725037, "loss/logits": 0.8512472093105317, "step": 61150 }, { "epoch": 0.6116, "grad_norm": 14.5625, "grad_norm_var": 0.8706868489583334, "learning_rate": 0.0003, "loss": 11.0246, "loss/aux_loss": 0.04805517755448818, "loss/crossentropy": 2.801200783252716, "loss/logits": 0.8401286274194717, "step": 61160 }, { "epoch": 0.6117, "grad_norm": 13.9375, "grad_norm_var": 0.4552083333333333, "learning_rate": 0.0003, "loss": 11.0931, "loss/aux_loss": 0.04807036258280277, "loss/crossentropy": 2.772087001800537, "loss/logits": 0.8375712424516678, "step": 61170 }, { "epoch": 0.6118, "grad_norm": 15.875, "grad_norm_var": 0.84375, "learning_rate": 0.0003, "loss": 10.9609, "loss/aux_loss": 0.04807903449982405, "loss/crossentropy": 2.622360199689865, "loss/logits": 0.8055921524763108, "step": 61180 }, { "epoch": 0.6119, "grad_norm": 15.4375, "grad_norm_var": 0.5645670572916667, "learning_rate": 0.0003, "loss": 11.0062, "loss/aux_loss": 0.04806223157793284, "loss/crossentropy": 2.681737995147705, "loss/logits": 0.813096073269844, "step": 61190 }, { "epoch": 0.612, "grad_norm": 15.9375, "grad_norm_var": 0.445166015625, "learning_rate": 0.0003, "loss": 10.9057, "loss/aux_loss": 0.048072535917162894, "loss/crossentropy": 2.6666407227516173, "loss/logits": 0.8438924968242645, "step": 61200 }, { "epoch": 0.6121, "grad_norm": 16.375, "grad_norm_var": 0.6181640625, "learning_rate": 0.0003, "loss": 10.8865, "loss/aux_loss": 0.04807114116847515, "loss/crossentropy": 2.698245918750763, "loss/logits": 0.8058286488056183, "step": 61210 }, { "epoch": 0.6122, "grad_norm": 16.125, "grad_norm_var": 0.8072265625, "learning_rate": 0.0003, "loss": 10.9027, "loss/aux_loss": 0.048066765256226066, "loss/crossentropy": 2.682483744621277, "loss/logits": 0.7972517877817153, "step": 61220 }, { "epoch": 0.6123, "grad_norm": 14.5, "grad_norm_var": 1.0152180989583333, "learning_rate": 0.0003, "loss": 10.9902, "loss/aux_loss": 0.04806827660650015, "loss/crossentropy": 2.8098564445972443, "loss/logits": 0.8148068457841873, "step": 61230 }, { "epoch": 0.6124, "grad_norm": 15.125, "grad_norm_var": 0.787744140625, "learning_rate": 0.0003, "loss": 10.925, "loss/aux_loss": 0.04808166231960058, "loss/crossentropy": 2.5316755414009093, "loss/logits": 0.8108256548643112, "step": 61240 }, { "epoch": 0.6125, "grad_norm": 13.5, "grad_norm_var": 0.662353515625, "learning_rate": 0.0003, "loss": 10.8142, "loss/aux_loss": 0.048060395009815696, "loss/crossentropy": 2.6689065754413606, "loss/logits": 0.7798559069633484, "step": 61250 }, { "epoch": 0.6126, "grad_norm": 15.0625, "grad_norm_var": 0.5609212239583333, "learning_rate": 0.0003, "loss": 10.9542, "loss/aux_loss": 0.04807155355811119, "loss/crossentropy": 2.671508860588074, "loss/logits": 0.8162847578525543, "step": 61260 }, { "epoch": 0.6127, "grad_norm": 15.125, "grad_norm_var": 0.26848958333333334, "learning_rate": 0.0003, "loss": 10.9077, "loss/aux_loss": 0.04807188101112843, "loss/crossentropy": 2.8237990200519563, "loss/logits": 0.8153227150440217, "step": 61270 }, { "epoch": 0.6128, "grad_norm": 14.5, "grad_norm_var": 22.748372395833332, "learning_rate": 0.0003, "loss": 11.0002, "loss/aux_loss": 0.048073857091367245, "loss/crossentropy": 2.720522928237915, "loss/logits": 0.8327284932136536, "step": 61280 }, { "epoch": 0.6129, "grad_norm": 15.6875, "grad_norm_var": 20.959358723958335, "learning_rate": 0.0003, "loss": 10.8401, "loss/aux_loss": 0.0480800049379468, "loss/crossentropy": 2.8034905910491945, "loss/logits": 0.7961880445480347, "step": 61290 }, { "epoch": 0.613, "grad_norm": 16.125, "grad_norm_var": 0.31378580729166666, "learning_rate": 0.0003, "loss": 10.9138, "loss/aux_loss": 0.04807343352586031, "loss/crossentropy": 2.59561088681221, "loss/logits": 0.8048137962818146, "step": 61300 }, { "epoch": 0.6131, "grad_norm": 14.875, "grad_norm_var": 0.700634765625, "learning_rate": 0.0003, "loss": 10.9678, "loss/aux_loss": 0.04806978832930327, "loss/crossentropy": 2.6666161894798277, "loss/logits": 0.8165017098188401, "step": 61310 }, { "epoch": 0.6132, "grad_norm": 14.5, "grad_norm_var": 0.784228515625, "learning_rate": 0.0003, "loss": 10.8918, "loss/aux_loss": 0.0480765713378787, "loss/crossentropy": 2.6589391052722933, "loss/logits": 0.8012593746185303, "step": 61320 }, { "epoch": 0.6133, "grad_norm": 15.625, "grad_norm_var": 1.0489583333333334, "learning_rate": 0.0003, "loss": 10.9852, "loss/aux_loss": 0.04807199165225029, "loss/crossentropy": 2.744808477163315, "loss/logits": 0.8064806133508682, "step": 61330 }, { "epoch": 0.6134, "grad_norm": 18.0, "grad_norm_var": 1.1325520833333333, "learning_rate": 0.0003, "loss": 10.9129, "loss/aux_loss": 0.04806118700653315, "loss/crossentropy": 2.7831350564956665, "loss/logits": 0.8111140578985214, "step": 61340 }, { "epoch": 0.6135, "grad_norm": 15.1875, "grad_norm_var": 1.3180826822916667, "learning_rate": 0.0003, "loss": 10.9594, "loss/aux_loss": 0.04806741625070572, "loss/crossentropy": 2.66780064702034, "loss/logits": 0.8168701589107513, "step": 61350 }, { "epoch": 0.6136, "grad_norm": 14.0625, "grad_norm_var": 0.9478515625, "learning_rate": 0.0003, "loss": 11.0428, "loss/aux_loss": 0.04806680958718061, "loss/crossentropy": 2.756870436668396, "loss/logits": 0.8362319558858872, "step": 61360 }, { "epoch": 0.6137, "grad_norm": 14.625, "grad_norm_var": 0.949853515625, "learning_rate": 0.0003, "loss": 11.0459, "loss/aux_loss": 0.04806111045181751, "loss/crossentropy": 2.7263152480125425, "loss/logits": 0.8313703805208206, "step": 61370 }, { "epoch": 0.6138, "grad_norm": 14.5625, "grad_norm_var": 0.459375, "learning_rate": 0.0003, "loss": 11.0347, "loss/aux_loss": 0.04806870762258768, "loss/crossentropy": 2.6985792994499205, "loss/logits": 0.8102845966815948, "step": 61380 }, { "epoch": 0.6139, "grad_norm": 15.1875, "grad_norm_var": 0.8634765625, "learning_rate": 0.0003, "loss": 11.1736, "loss/aux_loss": 0.048068526200950146, "loss/crossentropy": 2.7508439660072326, "loss/logits": 0.8183762282133102, "step": 61390 }, { "epoch": 0.614, "grad_norm": 15.125, "grad_norm_var": 0.3478515625, "learning_rate": 0.0003, "loss": 10.8252, "loss/aux_loss": 0.04806302357465029, "loss/crossentropy": 2.672939831018448, "loss/logits": 0.8024695843458176, "step": 61400 }, { "epoch": 0.6141, "grad_norm": 14.3125, "grad_norm_var": 0.3848958333333333, "learning_rate": 0.0003, "loss": 11.0337, "loss/aux_loss": 0.048076591454446316, "loss/crossentropy": 2.83030418753624, "loss/logits": 0.8267215609550476, "step": 61410 }, { "epoch": 0.6142, "grad_norm": 14.4375, "grad_norm_var": 0.6458333333333334, "learning_rate": 0.0003, "loss": 10.9174, "loss/aux_loss": 0.04806251674890518, "loss/crossentropy": 2.7849998474121094, "loss/logits": 0.8149536848068237, "step": 61420 }, { "epoch": 0.6143, "grad_norm": 15.1875, "grad_norm_var": 126.11222330729167, "learning_rate": 0.0003, "loss": 11.0026, "loss/aux_loss": 0.048078781180083754, "loss/crossentropy": 2.8044037342071535, "loss/logits": 0.8208670258522034, "step": 61430 }, { "epoch": 0.6144, "grad_norm": 14.1875, "grad_norm_var": 1.9354166666666666, "learning_rate": 0.0003, "loss": 10.9765, "loss/aux_loss": 0.0480761431157589, "loss/crossentropy": 2.7967730283737184, "loss/logits": 0.8021587640047073, "step": 61440 }, { "epoch": 0.6145, "grad_norm": 15.625, "grad_norm_var": 0.901416015625, "learning_rate": 0.0003, "loss": 11.0474, "loss/aux_loss": 0.048061800003051755, "loss/crossentropy": 2.6379260659217834, "loss/logits": 0.8099302232265473, "step": 61450 }, { "epoch": 0.6146, "grad_norm": 15.0, "grad_norm_var": 0.5874348958333333, "learning_rate": 0.0003, "loss": 11.0572, "loss/aux_loss": 0.048064269311726096, "loss/crossentropy": 2.754233205318451, "loss/logits": 0.8168322265148162, "step": 61460 }, { "epoch": 0.6147, "grad_norm": 15.0, "grad_norm_var": 0.5436848958333333, "learning_rate": 0.0003, "loss": 10.9424, "loss/aux_loss": 0.04808512944728136, "loss/crossentropy": 2.7634010910987854, "loss/logits": 0.8455385863780975, "step": 61470 }, { "epoch": 0.6148, "grad_norm": 15.75, "grad_norm_var": 1.7707682291666667, "learning_rate": 0.0003, "loss": 11.1005, "loss/aux_loss": 0.048066995665431024, "loss/crossentropy": 2.7025927007198334, "loss/logits": 0.8244033396244049, "step": 61480 }, { "epoch": 0.6149, "grad_norm": 15.8125, "grad_norm_var": 0.7311848958333333, "learning_rate": 0.0003, "loss": 10.7938, "loss/aux_loss": 0.048072151467204095, "loss/crossentropy": 2.481117475032806, "loss/logits": 0.8055284798145295, "step": 61490 }, { "epoch": 0.615, "grad_norm": 16.125, "grad_norm_var": 0.6077962239583333, "learning_rate": 0.0003, "loss": 10.9174, "loss/aux_loss": 0.04806630816310644, "loss/crossentropy": 2.6769744515419007, "loss/logits": 0.8111788332462311, "step": 61500 }, { "epoch": 0.6151, "grad_norm": 15.25, "grad_norm_var": 0.5186848958333333, "learning_rate": 0.0003, "loss": 10.9967, "loss/aux_loss": 0.04806443694978953, "loss/crossentropy": 2.683100473880768, "loss/logits": 0.8171298623085022, "step": 61510 }, { "epoch": 0.6152, "grad_norm": 15.4375, "grad_norm_var": 0.744384765625, "learning_rate": 0.0003, "loss": 10.8982, "loss/aux_loss": 0.04807937704026699, "loss/crossentropy": 2.7236180365085603, "loss/logits": 0.821711191534996, "step": 61520 }, { "epoch": 0.6153, "grad_norm": 17.25, "grad_norm_var": 1.3526041666666666, "learning_rate": 0.0003, "loss": 11.0416, "loss/aux_loss": 0.0480710020288825, "loss/crossentropy": 2.7528501987457275, "loss/logits": 0.8094364821910858, "step": 61530 }, { "epoch": 0.6154, "grad_norm": 16.25, "grad_norm_var": 1.1885416666666666, "learning_rate": 0.0003, "loss": 11.0722, "loss/aux_loss": 0.04807053804397583, "loss/crossentropy": 2.767429292201996, "loss/logits": 0.8756510764360428, "step": 61540 }, { "epoch": 0.6155, "grad_norm": 14.5625, "grad_norm_var": 0.620556640625, "learning_rate": 0.0003, "loss": 10.8218, "loss/aux_loss": 0.04807050470262766, "loss/crossentropy": 2.697913628816605, "loss/logits": 0.7820253252983094, "step": 61550 }, { "epoch": 0.6156, "grad_norm": 14.5625, "grad_norm_var": 0.38084309895833335, "learning_rate": 0.0003, "loss": 10.9104, "loss/aux_loss": 0.0480689549818635, "loss/crossentropy": 2.6347146034240723, "loss/logits": 0.7996633857488632, "step": 61560 }, { "epoch": 0.6157, "grad_norm": 14.375, "grad_norm_var": 0.5408854166666667, "learning_rate": 0.0003, "loss": 10.8944, "loss/aux_loss": 0.04806984327733517, "loss/crossentropy": 2.7262151658535005, "loss/logits": 0.8081013143062592, "step": 61570 }, { "epoch": 0.6158, "grad_norm": 16.125, "grad_norm_var": 0.8464680989583333, "learning_rate": 0.0003, "loss": 10.8768, "loss/aux_loss": 0.04807692188769579, "loss/crossentropy": 2.697046458721161, "loss/logits": 0.7776322573423385, "step": 61580 }, { "epoch": 0.6159, "grad_norm": 14.0625, "grad_norm_var": 0.794775390625, "learning_rate": 0.0003, "loss": 10.9726, "loss/aux_loss": 0.04806830957531929, "loss/crossentropy": 2.7502528548240663, "loss/logits": 0.8258104085922241, "step": 61590 }, { "epoch": 0.616, "grad_norm": 15.5625, "grad_norm_var": 0.4962890625, "learning_rate": 0.0003, "loss": 10.886, "loss/aux_loss": 0.04806863311678171, "loss/crossentropy": 2.702814507484436, "loss/logits": 0.8230858445167542, "step": 61600 }, { "epoch": 0.6161, "grad_norm": 15.3125, "grad_norm_var": 0.359228515625, "learning_rate": 0.0003, "loss": 10.9529, "loss/aux_loss": 0.048067998327314856, "loss/crossentropy": 2.7368035674095155, "loss/logits": 0.8276433378458024, "step": 61610 }, { "epoch": 0.6162, "grad_norm": 15.125, "grad_norm_var": 1.8343098958333333, "learning_rate": 0.0003, "loss": 10.9874, "loss/aux_loss": 0.048063565976917744, "loss/crossentropy": 2.6927200853824615, "loss/logits": 0.8120525509119034, "step": 61620 }, { "epoch": 0.6163, "grad_norm": 15.4375, "grad_norm_var": 1.581884765625, "learning_rate": 0.0003, "loss": 10.9596, "loss/aux_loss": 0.04806598611176014, "loss/crossentropy": 2.6300831198692323, "loss/logits": 0.7872573018074036, "step": 61630 }, { "epoch": 0.6164, "grad_norm": 14.5, "grad_norm_var": 0.5002604166666667, "learning_rate": 0.0003, "loss": 10.8337, "loss/aux_loss": 0.04807138796895742, "loss/crossentropy": 2.746791756153107, "loss/logits": 0.796841761469841, "step": 61640 }, { "epoch": 0.6165, "grad_norm": 15.125, "grad_norm_var": 0.915087890625, "learning_rate": 0.0003, "loss": 10.81, "loss/aux_loss": 0.04806817434728146, "loss/crossentropy": 2.5922399282455446, "loss/logits": 0.8125677675008773, "step": 61650 }, { "epoch": 0.6166, "grad_norm": 15.0625, "grad_norm_var": 0.43045247395833336, "learning_rate": 0.0003, "loss": 10.8958, "loss/aux_loss": 0.048080663196742535, "loss/crossentropy": 2.6909705996513367, "loss/logits": 0.8360078364610672, "step": 61660 }, { "epoch": 0.6167, "grad_norm": 15.3125, "grad_norm_var": 3.886962890625, "learning_rate": 0.0003, "loss": 10.7584, "loss/aux_loss": 0.04806402511894703, "loss/crossentropy": 2.5941467702388765, "loss/logits": 0.8068960756063461, "step": 61670 }, { "epoch": 0.6168, "grad_norm": 14.1875, "grad_norm_var": 0.5759765625, "learning_rate": 0.0003, "loss": 10.9424, "loss/aux_loss": 0.048076951317489146, "loss/crossentropy": 2.7264155983924865, "loss/logits": 0.8287631750106812, "step": 61680 }, { "epoch": 0.6169, "grad_norm": 14.25, "grad_norm_var": 0.5332682291666667, "learning_rate": 0.0003, "loss": 10.8441, "loss/aux_loss": 0.04805428683757782, "loss/crossentropy": 2.745866870880127, "loss/logits": 0.7957580178976059, "step": 61690 }, { "epoch": 0.617, "grad_norm": 14.75, "grad_norm_var": 0.39140625, "learning_rate": 0.0003, "loss": 10.9403, "loss/aux_loss": 0.04806794375181198, "loss/crossentropy": 2.6823421716690063, "loss/logits": 0.816996818780899, "step": 61700 }, { "epoch": 0.6171, "grad_norm": 14.625, "grad_norm_var": 0.6180826822916666, "learning_rate": 0.0003, "loss": 10.8513, "loss/aux_loss": 0.048078464530408385, "loss/crossentropy": 2.8029967546463013, "loss/logits": 0.8454837918281555, "step": 61710 }, { "epoch": 0.6172, "grad_norm": 15.4375, "grad_norm_var": 0.3431640625, "learning_rate": 0.0003, "loss": 10.8751, "loss/aux_loss": 0.04807190615683794, "loss/crossentropy": 2.6858488082885743, "loss/logits": 0.8107406437397003, "step": 61720 }, { "epoch": 0.6173, "grad_norm": 13.375, "grad_norm_var": 0.8051432291666667, "learning_rate": 0.0003, "loss": 10.9275, "loss/aux_loss": 0.048067220486700535, "loss/crossentropy": 2.6807423889636994, "loss/logits": 0.7955632448196411, "step": 61730 }, { "epoch": 0.6174, "grad_norm": 14.4375, "grad_norm_var": 1.01875, "learning_rate": 0.0003, "loss": 10.8206, "loss/aux_loss": 0.04806277137249708, "loss/crossentropy": 2.6485226929187773, "loss/logits": 0.7749884635210037, "step": 61740 }, { "epoch": 0.6175, "grad_norm": 14.75, "grad_norm_var": 0.29973958333333334, "learning_rate": 0.0003, "loss": 11.0639, "loss/aux_loss": 0.04807302244007587, "loss/crossentropy": 2.501930046081543, "loss/logits": 0.8085714936256408, "step": 61750 }, { "epoch": 0.6176, "grad_norm": 17.0, "grad_norm_var": 29.426416015625, "learning_rate": 0.0003, "loss": 10.9392, "loss/aux_loss": 0.048070022463798524, "loss/crossentropy": 2.56240091919899, "loss/logits": 0.8246626138687134, "step": 61760 }, { "epoch": 0.6177, "grad_norm": 14.25, "grad_norm_var": 29.176155598958335, "learning_rate": 0.0003, "loss": 10.8047, "loss/aux_loss": 0.048060801811516285, "loss/crossentropy": 2.7177587747573853, "loss/logits": 0.8236128687858582, "step": 61770 }, { "epoch": 0.6178, "grad_norm": 15.0, "grad_norm_var": 0.9817057291666667, "learning_rate": 0.0003, "loss": 11.0162, "loss/aux_loss": 0.048079765401780605, "loss/crossentropy": 2.6734387814998626, "loss/logits": 0.8322317689657212, "step": 61780 }, { "epoch": 0.6179, "grad_norm": 14.1875, "grad_norm_var": 0.7858723958333333, "learning_rate": 0.0003, "loss": 11.0301, "loss/aux_loss": 0.04804958906024694, "loss/crossentropy": 2.667484325170517, "loss/logits": 0.7962237685918808, "step": 61790 }, { "epoch": 0.618, "grad_norm": 17.375, "grad_norm_var": 1.3082682291666667, "learning_rate": 0.0003, "loss": 10.8701, "loss/aux_loss": 0.048083477467298505, "loss/crossentropy": 2.6046105325222015, "loss/logits": 0.8223717421293258, "step": 61800 }, { "epoch": 0.6181, "grad_norm": 15.0, "grad_norm_var": 1.2212890625, "learning_rate": 0.0003, "loss": 10.7544, "loss/aux_loss": 0.04806771744042635, "loss/crossentropy": 2.703940987586975, "loss/logits": 0.8195635229349136, "step": 61810 }, { "epoch": 0.6182, "grad_norm": 15.875, "grad_norm_var": 0.5813639322916667, "learning_rate": 0.0003, "loss": 10.8849, "loss/aux_loss": 0.048071309179067614, "loss/crossentropy": 2.7586312294006348, "loss/logits": 0.8400523275136947, "step": 61820 }, { "epoch": 0.6183, "grad_norm": 14.3125, "grad_norm_var": 1.1192057291666666, "learning_rate": 0.0003, "loss": 10.9316, "loss/aux_loss": 0.04807364828884601, "loss/crossentropy": 2.8496673822402956, "loss/logits": 0.8168164789676666, "step": 61830 }, { "epoch": 0.6184, "grad_norm": 16.375, "grad_norm_var": 4.017822265625, "learning_rate": 0.0003, "loss": 10.9409, "loss/aux_loss": 0.048056223429739475, "loss/crossentropy": 2.573849785327911, "loss/logits": 0.816324171423912, "step": 61840 }, { "epoch": 0.6185, "grad_norm": 14.4375, "grad_norm_var": 3.4400390625, "learning_rate": 0.0003, "loss": 10.9165, "loss/aux_loss": 0.04806674625724554, "loss/crossentropy": 2.757099586725235, "loss/logits": 0.8031830161809921, "step": 61850 }, { "epoch": 0.6186, "grad_norm": 15.25, "grad_norm_var": 0.35618489583333335, "learning_rate": 0.0003, "loss": 11.0356, "loss/aux_loss": 0.04808044787496328, "loss/crossentropy": 2.7227927923202513, "loss/logits": 0.817890202999115, "step": 61860 }, { "epoch": 0.6187, "grad_norm": 14.125, "grad_norm_var": 0.23326822916666667, "learning_rate": 0.0003, "loss": 10.7683, "loss/aux_loss": 0.048065101355314256, "loss/crossentropy": 2.5788680493831633, "loss/logits": 0.7917226999998093, "step": 61870 }, { "epoch": 0.6188, "grad_norm": 15.8125, "grad_norm_var": 0.42578125, "learning_rate": 0.0003, "loss": 10.8498, "loss/aux_loss": 0.04807348400354385, "loss/crossentropy": 2.8568573355674745, "loss/logits": 0.831426665186882, "step": 61880 }, { "epoch": 0.6189, "grad_norm": 14.375, "grad_norm_var": 0.7421223958333333, "learning_rate": 0.0003, "loss": 10.9383, "loss/aux_loss": 0.048065127618610856, "loss/crossentropy": 2.7187650322914125, "loss/logits": 0.7684122264385224, "step": 61890 }, { "epoch": 0.619, "grad_norm": 14.625, "grad_norm_var": 0.6462890625, "learning_rate": 0.0003, "loss": 10.8742, "loss/aux_loss": 0.048072330094873904, "loss/crossentropy": 2.7928129851818086, "loss/logits": 0.840417456626892, "step": 61900 }, { "epoch": 0.6191, "grad_norm": 17.375, "grad_norm_var": 0.6249348958333333, "learning_rate": 0.0003, "loss": 10.95, "loss/aux_loss": 0.048061074875295165, "loss/crossentropy": 2.827212655544281, "loss/logits": 0.851186552643776, "step": 61910 }, { "epoch": 0.6192, "grad_norm": 15.3125, "grad_norm_var": 0.8921223958333333, "learning_rate": 0.0003, "loss": 10.9378, "loss/aux_loss": 0.04807654786854983, "loss/crossentropy": 2.660733711719513, "loss/logits": 0.8380063980817795, "step": 61920 }, { "epoch": 0.6193, "grad_norm": 14.8125, "grad_norm_var": 0.570556640625, "learning_rate": 0.0003, "loss": 10.9107, "loss/aux_loss": 0.04806284811347723, "loss/crossentropy": 2.6565398812294005, "loss/logits": 0.8210236459970475, "step": 61930 }, { "epoch": 0.6194, "grad_norm": 15.125, "grad_norm_var": 0.33787434895833335, "learning_rate": 0.0003, "loss": 10.842, "loss/aux_loss": 0.04807104039937258, "loss/crossentropy": 2.5869544565677645, "loss/logits": 0.8126054167747497, "step": 61940 }, { "epoch": 0.6195, "grad_norm": 14.0625, "grad_norm_var": 0.4328125, "learning_rate": 0.0003, "loss": 10.8834, "loss/aux_loss": 0.048072011955082414, "loss/crossentropy": 2.839719223976135, "loss/logits": 0.823541471362114, "step": 61950 }, { "epoch": 0.6196, "grad_norm": 14.5625, "grad_norm_var": 0.3223795572916667, "learning_rate": 0.0003, "loss": 10.8743, "loss/aux_loss": 0.04806395042687654, "loss/crossentropy": 2.798982226848602, "loss/logits": 0.7807911396026611, "step": 61960 }, { "epoch": 0.6197, "grad_norm": 14.5, "grad_norm_var": 1.5895182291666667, "learning_rate": 0.0003, "loss": 10.7799, "loss/aux_loss": 0.04806208536028862, "loss/crossentropy": 2.6087071001529694, "loss/logits": 0.8195267915725708, "step": 61970 }, { "epoch": 0.6198, "grad_norm": 15.125, "grad_norm_var": 0.6953125, "learning_rate": 0.0003, "loss": 11.043, "loss/aux_loss": 0.048067536950111386, "loss/crossentropy": 2.7437108635902403, "loss/logits": 0.8448336660861969, "step": 61980 }, { "epoch": 0.6199, "grad_norm": 17.625, "grad_norm_var": 0.860791015625, "learning_rate": 0.0003, "loss": 10.9815, "loss/aux_loss": 0.04807477705180645, "loss/crossentropy": 2.70632341504097, "loss/logits": 0.8352989315986633, "step": 61990 }, { "epoch": 0.62, "grad_norm": 14.6875, "grad_norm_var": 1.3466145833333334, "learning_rate": 0.0003, "loss": 10.9892, "loss/aux_loss": 0.04806019198149443, "loss/crossentropy": 2.664289927482605, "loss/logits": 0.828738734126091, "step": 62000 }, { "epoch": 0.6201, "grad_norm": 15.1875, "grad_norm_var": 0.3690104166666667, "learning_rate": 0.0003, "loss": 10.9197, "loss/aux_loss": 0.04807502832263708, "loss/crossentropy": 2.728136438131332, "loss/logits": 0.8290602266788483, "step": 62010 }, { "epoch": 0.6202, "grad_norm": 14.5, "grad_norm_var": 0.5070149739583333, "learning_rate": 0.0003, "loss": 10.8922, "loss/aux_loss": 0.048071499727666375, "loss/crossentropy": 2.74332879781723, "loss/logits": 0.8003027319908143, "step": 62020 }, { "epoch": 0.6203, "grad_norm": 16.875, "grad_norm_var": 0.5157389322916667, "learning_rate": 0.0003, "loss": 11.0849, "loss/aux_loss": 0.048065542615950106, "loss/crossentropy": 2.7377222657203673, "loss/logits": 0.8492092847824096, "step": 62030 }, { "epoch": 0.6204, "grad_norm": 14.8125, "grad_norm_var": 1.1666015625, "learning_rate": 0.0003, "loss": 10.8011, "loss/aux_loss": 0.04808056894689798, "loss/crossentropy": 2.632717180252075, "loss/logits": 0.7819905787706375, "step": 62040 }, { "epoch": 0.6205, "grad_norm": 14.9375, "grad_norm_var": 0.558447265625, "learning_rate": 0.0003, "loss": 10.883, "loss/aux_loss": 0.04806170351803303, "loss/crossentropy": 2.7062652587890623, "loss/logits": 0.7980828583240509, "step": 62050 }, { "epoch": 0.6206, "grad_norm": 14.6875, "grad_norm_var": 1.1997233072916667, "learning_rate": 0.0003, "loss": 10.8167, "loss/aux_loss": 0.04807770270854235, "loss/crossentropy": 2.474736750125885, "loss/logits": 0.7879143923521041, "step": 62060 }, { "epoch": 0.6207, "grad_norm": 14.3125, "grad_norm_var": 0.5335774739583333, "learning_rate": 0.0003, "loss": 11.0472, "loss/aux_loss": 0.04806693401187658, "loss/crossentropy": 2.7242776453495026, "loss/logits": 0.8014493867754936, "step": 62070 }, { "epoch": 0.6208, "grad_norm": 15.9375, "grad_norm_var": 0.8457682291666667, "learning_rate": 0.0003, "loss": 10.9555, "loss/aux_loss": 0.048064196668565276, "loss/crossentropy": 2.8615013003349303, "loss/logits": 0.8332249820232391, "step": 62080 }, { "epoch": 0.6209, "grad_norm": 18.0, "grad_norm_var": 1.15078125, "learning_rate": 0.0003, "loss": 10.8982, "loss/aux_loss": 0.04807161465287209, "loss/crossentropy": 2.7590698480606077, "loss/logits": 0.828876069188118, "step": 62090 }, { "epoch": 0.621, "grad_norm": 16.125, "grad_norm_var": 0.6268229166666667, "learning_rate": 0.0003, "loss": 10.952, "loss/aux_loss": 0.048064458556473254, "loss/crossentropy": 2.8559590697288515, "loss/logits": 0.8306620687246322, "step": 62100 }, { "epoch": 0.6211, "grad_norm": 14.375, "grad_norm_var": 0.65, "learning_rate": 0.0003, "loss": 10.8323, "loss/aux_loss": 0.04808349907398224, "loss/crossentropy": 2.6381156027317045, "loss/logits": 0.829769441485405, "step": 62110 }, { "epoch": 0.6212, "grad_norm": 14.5625, "grad_norm_var": 0.40208333333333335, "learning_rate": 0.0003, "loss": 10.9283, "loss/aux_loss": 0.048080073297023775, "loss/crossentropy": 2.7890799164772035, "loss/logits": 0.8134836733341217, "step": 62120 }, { "epoch": 0.6213, "grad_norm": 14.9375, "grad_norm_var": 35.38430989583333, "learning_rate": 0.0003, "loss": 10.7432, "loss/aux_loss": 0.048065336607396605, "loss/crossentropy": 2.44475519657135, "loss/logits": 0.7698864176869392, "step": 62130 }, { "epoch": 0.6214, "grad_norm": 14.5, "grad_norm_var": 0.5469889322916667, "learning_rate": 0.0003, "loss": 10.9544, "loss/aux_loss": 0.04806721787899733, "loss/crossentropy": 2.5549269795417784, "loss/logits": 0.8181776434183121, "step": 62140 }, { "epoch": 0.6215, "grad_norm": 15.125, "grad_norm_var": 0.447900390625, "learning_rate": 0.0003, "loss": 10.8531, "loss/aux_loss": 0.048075252957642076, "loss/crossentropy": 2.8422864854335783, "loss/logits": 0.8431739717721939, "step": 62150 }, { "epoch": 0.6216, "grad_norm": 16.875, "grad_norm_var": 0.6466145833333333, "learning_rate": 0.0003, "loss": 10.9505, "loss/aux_loss": 0.048076402954757215, "loss/crossentropy": 2.8633032202720643, "loss/logits": 0.8444990605115891, "step": 62160 }, { "epoch": 0.6217, "grad_norm": 14.9375, "grad_norm_var": 1.3199055989583333, "learning_rate": 0.0003, "loss": 10.8851, "loss/aux_loss": 0.04805946424603462, "loss/crossentropy": 2.5738767266273497, "loss/logits": 0.7897825837135315, "step": 62170 }, { "epoch": 0.6218, "grad_norm": 14.4375, "grad_norm_var": 1.0264973958333334, "learning_rate": 0.0003, "loss": 10.9777, "loss/aux_loss": 0.04807272329926491, "loss/crossentropy": 2.559682661294937, "loss/logits": 0.8130290180444717, "step": 62180 }, { "epoch": 0.6219, "grad_norm": 15.75, "grad_norm_var": 1.2301432291666667, "learning_rate": 0.0003, "loss": 10.8184, "loss/aux_loss": 0.04808760862797499, "loss/crossentropy": 2.65628005862236, "loss/logits": 0.7793559074401856, "step": 62190 }, { "epoch": 0.622, "grad_norm": 27.75, "grad_norm_var": 10.504280598958333, "learning_rate": 0.0003, "loss": 10.9073, "loss/aux_loss": 0.04806331116706133, "loss/crossentropy": 2.751385676860809, "loss/logits": 0.8148221343755722, "step": 62200 }, { "epoch": 0.6221, "grad_norm": 14.8125, "grad_norm_var": 10.069205729166667, "learning_rate": 0.0003, "loss": 10.8365, "loss/aux_loss": 0.048077536001801494, "loss/crossentropy": 2.76932435631752, "loss/logits": 0.7825021982192993, "step": 62210 }, { "epoch": 0.6222, "grad_norm": 14.5625, "grad_norm_var": 0.34661458333333334, "learning_rate": 0.0003, "loss": 10.9128, "loss/aux_loss": 0.048069404624402526, "loss/crossentropy": 2.6793047428131103, "loss/logits": 0.7982782870531082, "step": 62220 }, { "epoch": 0.6223, "grad_norm": 15.5625, "grad_norm_var": 51.823893229166664, "learning_rate": 0.0003, "loss": 11.0149, "loss/aux_loss": 0.04806563127785921, "loss/crossentropy": 2.742257535457611, "loss/logits": 0.8206332385540008, "step": 62230 }, { "epoch": 0.6224, "grad_norm": 15.25, "grad_norm_var": 49.48118489583333, "learning_rate": 0.0003, "loss": 10.9483, "loss/aux_loss": 0.04807669762521982, "loss/crossentropy": 2.7733189463615417, "loss/logits": 0.8633380651473999, "step": 62240 }, { "epoch": 0.6225, "grad_norm": 15.625, "grad_norm_var": 1.4867024739583334, "learning_rate": 0.0003, "loss": 10.8326, "loss/aux_loss": 0.04806031119078398, "loss/crossentropy": 2.7665975272655485, "loss/logits": 0.825995746254921, "step": 62250 }, { "epoch": 0.6226, "grad_norm": 17.375, "grad_norm_var": 31.684619140625, "learning_rate": 0.0003, "loss": 10.9359, "loss/aux_loss": 0.048060914315283296, "loss/crossentropy": 2.7299613773822786, "loss/logits": 0.7919081568717956, "step": 62260 }, { "epoch": 0.6227, "grad_norm": 16.0, "grad_norm_var": 32.25402018229167, "learning_rate": 0.0003, "loss": 10.9558, "loss/aux_loss": 0.048073223978281024, "loss/crossentropy": 2.854112696647644, "loss/logits": 0.8261337369680405, "step": 62270 }, { "epoch": 0.6228, "grad_norm": 16.25, "grad_norm_var": 0.37604166666666666, "learning_rate": 0.0003, "loss": 10.9295, "loss/aux_loss": 0.04806328769773245, "loss/crossentropy": 2.6099496364593504, "loss/logits": 0.7905628532171249, "step": 62280 }, { "epoch": 0.6229, "grad_norm": 15.0, "grad_norm_var": 0.8378743489583333, "learning_rate": 0.0003, "loss": 11.1089, "loss/aux_loss": 0.048074782267212866, "loss/crossentropy": 2.856866729259491, "loss/logits": 0.8273001462221146, "step": 62290 }, { "epoch": 0.623, "grad_norm": 15.375, "grad_norm_var": 0.5945149739583333, "learning_rate": 0.0003, "loss": 11.0759, "loss/aux_loss": 0.048062778823077676, "loss/crossentropy": 2.8540167093276976, "loss/logits": 0.8591305077075958, "step": 62300 }, { "epoch": 0.6231, "grad_norm": 14.6875, "grad_norm_var": 1.1102701822916667, "learning_rate": 0.0003, "loss": 11.0353, "loss/aux_loss": 0.048072639107704165, "loss/crossentropy": 2.670693778991699, "loss/logits": 0.8144174665212631, "step": 62310 }, { "epoch": 0.6232, "grad_norm": 14.5625, "grad_norm_var": 0.4649576822916667, "learning_rate": 0.0003, "loss": 10.8635, "loss/aux_loss": 0.04807703364640474, "loss/crossentropy": 2.6906314373016356, "loss/logits": 0.7975012451410294, "step": 62320 }, { "epoch": 0.6233, "grad_norm": 14.1875, "grad_norm_var": 0.6809733072916667, "learning_rate": 0.0003, "loss": 10.9574, "loss/aux_loss": 0.04804598540067673, "loss/crossentropy": 2.7562792539596557, "loss/logits": 0.8315281063318253, "step": 62330 }, { "epoch": 0.6234, "grad_norm": 18.5, "grad_norm_var": 1.3674479166666667, "learning_rate": 0.0003, "loss": 11.0711, "loss/aux_loss": 0.04807227849960327, "loss/crossentropy": 2.699070680141449, "loss/logits": 0.8405324429273605, "step": 62340 }, { "epoch": 0.6235, "grad_norm": 15.375, "grad_norm_var": 48.110660807291666, "learning_rate": 0.0003, "loss": 10.9377, "loss/aux_loss": 0.04808774162083864, "loss/crossentropy": 2.8322594940662382, "loss/logits": 0.8086932510137558, "step": 62350 }, { "epoch": 0.6236, "grad_norm": 15.75, "grad_norm_var": 1.492041015625, "learning_rate": 0.0003, "loss": 10.8616, "loss/aux_loss": 0.04806898422539234, "loss/crossentropy": 2.5754688024520873, "loss/logits": 0.7867444813251495, "step": 62360 }, { "epoch": 0.6237, "grad_norm": 15.3125, "grad_norm_var": 0.5119140625, "learning_rate": 0.0003, "loss": 10.9684, "loss/aux_loss": 0.04807077944278717, "loss/crossentropy": 2.6778059184551237, "loss/logits": 0.8169686466455459, "step": 62370 }, { "epoch": 0.6238, "grad_norm": 16.125, "grad_norm_var": 0.6291015625, "learning_rate": 0.0003, "loss": 10.9766, "loss/aux_loss": 0.04807309564203024, "loss/crossentropy": 2.6437867105007173, "loss/logits": 0.823999360203743, "step": 62380 }, { "epoch": 0.6239, "grad_norm": 16.25, "grad_norm_var": 0.7306640625, "learning_rate": 0.0003, "loss": 10.8911, "loss/aux_loss": 0.04806749243289232, "loss/crossentropy": 2.6562050104141237, "loss/logits": 0.8306028187274933, "step": 62390 }, { "epoch": 0.624, "grad_norm": 15.5, "grad_norm_var": 0.3, "learning_rate": 0.0003, "loss": 11.061, "loss/aux_loss": 0.04807000830769539, "loss/crossentropy": 2.786527621746063, "loss/logits": 0.8158238917589188, "step": 62400 }, { "epoch": 0.6241, "grad_norm": 16.0, "grad_norm_var": 0.3340983072916667, "learning_rate": 0.0003, "loss": 11.0293, "loss/aux_loss": 0.04807388223707676, "loss/crossentropy": 2.7131783425807954, "loss/logits": 0.8209474682807922, "step": 62410 }, { "epoch": 0.6242, "grad_norm": 14.5, "grad_norm_var": 0.22109375, "learning_rate": 0.0003, "loss": 10.9286, "loss/aux_loss": 0.04806325174868107, "loss/crossentropy": 2.6579012751579283, "loss/logits": 0.8379012405872345, "step": 62420 }, { "epoch": 0.6243, "grad_norm": 15.25, "grad_norm_var": 0.3780598958333333, "learning_rate": 0.0003, "loss": 10.8599, "loss/aux_loss": 0.048072734661400315, "loss/crossentropy": 2.668544816970825, "loss/logits": 0.8266326695680618, "step": 62430 }, { "epoch": 0.6244, "grad_norm": 14.5625, "grad_norm_var": 0.6106608072916667, "learning_rate": 0.0003, "loss": 10.8735, "loss/aux_loss": 0.048060843162238596, "loss/crossentropy": 2.692414402961731, "loss/logits": 0.8246606469154358, "step": 62440 }, { "epoch": 0.6245, "grad_norm": 14.9375, "grad_norm_var": 1.0809733072916667, "learning_rate": 0.0003, "loss": 10.9023, "loss/aux_loss": 0.04808099400252104, "loss/crossentropy": 2.865987467765808, "loss/logits": 0.8370956897735595, "step": 62450 }, { "epoch": 0.6246, "grad_norm": 14.625, "grad_norm_var": 0.835791015625, "learning_rate": 0.0003, "loss": 10.8651, "loss/aux_loss": 0.04806222338229418, "loss/crossentropy": 2.5183817207813264, "loss/logits": 0.7817242562770843, "step": 62460 }, { "epoch": 0.6247, "grad_norm": 14.25, "grad_norm_var": 2.8445149739583333, "learning_rate": 0.0003, "loss": 10.8275, "loss/aux_loss": 0.04807907696813345, "loss/crossentropy": 2.5393788814544678, "loss/logits": 0.7698053836822509, "step": 62470 }, { "epoch": 0.6248, "grad_norm": 14.75, "grad_norm_var": 0.21964518229166666, "learning_rate": 0.0003, "loss": 10.9445, "loss/aux_loss": 0.04807716142386198, "loss/crossentropy": 2.879362916946411, "loss/logits": 0.8249101668596268, "step": 62480 }, { "epoch": 0.6249, "grad_norm": 15.5, "grad_norm_var": 0.3416666666666667, "learning_rate": 0.0003, "loss": 10.8909, "loss/aux_loss": 0.04806219730526209, "loss/crossentropy": 2.6007526874542237, "loss/logits": 0.8132065325975418, "step": 62490 }, { "epoch": 0.625, "grad_norm": 13.9375, "grad_norm_var": 0.5270670572916667, "learning_rate": 0.0003, "loss": 11.0119, "loss/aux_loss": 0.04807214587926865, "loss/crossentropy": 2.636463737487793, "loss/logits": 0.7892209351062774, "step": 62500 }, { "epoch": 0.6251, "grad_norm": 14.5, "grad_norm_var": 1.1276041666666667, "learning_rate": 0.0003, "loss": 10.9606, "loss/aux_loss": 0.04807151295244694, "loss/crossentropy": 2.6543030560016634, "loss/logits": 0.8350883662700653, "step": 62510 }, { "epoch": 0.6252, "grad_norm": 14.875, "grad_norm_var": 0.31354166666666666, "learning_rate": 0.0003, "loss": 10.8797, "loss/aux_loss": 0.04806313067674637, "loss/crossentropy": 2.7374988555908204, "loss/logits": 0.8234562575817108, "step": 62520 }, { "epoch": 0.6253, "grad_norm": 14.125, "grad_norm_var": 0.9880208333333333, "learning_rate": 0.0003, "loss": 11.2108, "loss/aux_loss": 0.048078093118965624, "loss/crossentropy": 2.745024061203003, "loss/logits": 0.8560461461544037, "step": 62530 }, { "epoch": 0.6254, "grad_norm": 15.0, "grad_norm_var": 0.7145182291666666, "learning_rate": 0.0003, "loss": 10.987, "loss/aux_loss": 0.04806946255266666, "loss/crossentropy": 2.651322227716446, "loss/logits": 0.8073702841997147, "step": 62540 }, { "epoch": 0.6255, "grad_norm": 15.75, "grad_norm_var": 0.2684733072916667, "learning_rate": 0.0003, "loss": 10.8866, "loss/aux_loss": 0.048070280253887175, "loss/crossentropy": 2.6663502156734467, "loss/logits": 0.823998111486435, "step": 62550 }, { "epoch": 0.6256, "grad_norm": 72.5, "grad_norm_var": 205.42701822916666, "learning_rate": 0.0003, "loss": 10.9477, "loss/aux_loss": 0.04807772878557444, "loss/crossentropy": 2.639860916137695, "loss/logits": 0.8050632417201996, "step": 62560 }, { "epoch": 0.6257, "grad_norm": 18.0, "grad_norm_var": 201.77667643229168, "learning_rate": 0.0003, "loss": 10.9815, "loss/aux_loss": 0.048079793155193326, "loss/crossentropy": 2.8103298008441926, "loss/logits": 0.8344414174556732, "step": 62570 }, { "epoch": 0.6258, "grad_norm": 15.8125, "grad_norm_var": 1.1109212239583333, "learning_rate": 0.0003, "loss": 10.9126, "loss/aux_loss": 0.04806471895426512, "loss/crossentropy": 2.6820645689964295, "loss/logits": 0.8160485446453094, "step": 62580 }, { "epoch": 0.6259, "grad_norm": 17.0, "grad_norm_var": 0.9077473958333333, "learning_rate": 0.0003, "loss": 10.8826, "loss/aux_loss": 0.048061324283480644, "loss/crossentropy": 2.5685730636119843, "loss/logits": 0.7997966796159744, "step": 62590 }, { "epoch": 0.626, "grad_norm": 15.25, "grad_norm_var": 1.097900390625, "learning_rate": 0.0003, "loss": 10.8269, "loss/aux_loss": 0.04806754421442747, "loss/crossentropy": 2.563853049278259, "loss/logits": 0.7696677416563034, "step": 62600 }, { "epoch": 0.6261, "grad_norm": 15.75, "grad_norm_var": 1.073681640625, "learning_rate": 0.0003, "loss": 10.7331, "loss/aux_loss": 0.04807633589953184, "loss/crossentropy": 2.7461453557014464, "loss/logits": 0.8335110425949097, "step": 62610 }, { "epoch": 0.6262, "grad_norm": 15.5, "grad_norm_var": 0.9044108072916667, "learning_rate": 0.0003, "loss": 10.9591, "loss/aux_loss": 0.04808416999876499, "loss/crossentropy": 2.652498370409012, "loss/logits": 0.7771803379058838, "step": 62620 }, { "epoch": 0.6263, "grad_norm": 15.125, "grad_norm_var": 0.30597330729166666, "learning_rate": 0.0003, "loss": 10.9905, "loss/aux_loss": 0.048068183846771716, "loss/crossentropy": 2.7204720437526704, "loss/logits": 0.8556759804487228, "step": 62630 }, { "epoch": 0.6264, "grad_norm": 14.625, "grad_norm_var": 0.813134765625, "learning_rate": 0.0003, "loss": 10.7167, "loss/aux_loss": 0.048065423220396045, "loss/crossentropy": 2.565179693698883, "loss/logits": 0.8034500062465668, "step": 62640 }, { "epoch": 0.6265, "grad_norm": 15.0625, "grad_norm_var": 13.729166666666666, "learning_rate": 0.0003, "loss": 10.8313, "loss/aux_loss": 0.048078210465610026, "loss/crossentropy": 2.7543790459632875, "loss/logits": 0.8006289631128312, "step": 62650 }, { "epoch": 0.6266, "grad_norm": 16.0, "grad_norm_var": 0.9395833333333333, "learning_rate": 0.0003, "loss": 10.9459, "loss/aux_loss": 0.04806745704263449, "loss/crossentropy": 2.822178053855896, "loss/logits": 0.8426857680082321, "step": 62660 }, { "epoch": 0.6267, "grad_norm": 15.0, "grad_norm_var": 0.7831868489583333, "learning_rate": 0.0003, "loss": 10.9596, "loss/aux_loss": 0.04807326514273882, "loss/crossentropy": 2.694602167606354, "loss/logits": 0.8324411004781723, "step": 62670 }, { "epoch": 0.6268, "grad_norm": 15.5625, "grad_norm_var": 86.090869140625, "learning_rate": 0.0003, "loss": 11.0313, "loss/aux_loss": 0.04808346442878246, "loss/crossentropy": 2.6940039336681365, "loss/logits": 0.8582751452922821, "step": 62680 }, { "epoch": 0.6269, "grad_norm": 17.0, "grad_norm_var": 84.3541015625, "learning_rate": 0.0003, "loss": 11.0566, "loss/aux_loss": 0.04807037822902203, "loss/crossentropy": 2.7277477622032165, "loss/logits": 0.8286092817783356, "step": 62690 }, { "epoch": 0.627, "grad_norm": 15.0, "grad_norm_var": 1.0344889322916666, "learning_rate": 0.0003, "loss": 10.897, "loss/aux_loss": 0.048066021874547005, "loss/crossentropy": 2.6495929658412933, "loss/logits": 0.7907640814781189, "step": 62700 }, { "epoch": 0.6271, "grad_norm": 14.8125, "grad_norm_var": 1.0577962239583334, "learning_rate": 0.0003, "loss": 11.0379, "loss/aux_loss": 0.04806892462074756, "loss/crossentropy": 2.6996763944625854, "loss/logits": 0.8464843809604645, "step": 62710 }, { "epoch": 0.6272, "grad_norm": 14.4375, "grad_norm_var": 0.22701822916666667, "learning_rate": 0.0003, "loss": 10.8966, "loss/aux_loss": 0.04808309208601713, "loss/crossentropy": 2.7269538223743437, "loss/logits": 0.797277769446373, "step": 62720 }, { "epoch": 0.6273, "grad_norm": 16.25, "grad_norm_var": 0.9309895833333334, "learning_rate": 0.0003, "loss": 10.9687, "loss/aux_loss": 0.04806727655231953, "loss/crossentropy": 2.64298877120018, "loss/logits": 0.7983599692583084, "step": 62730 }, { "epoch": 0.6274, "grad_norm": 15.5625, "grad_norm_var": 0.6864420572916666, "learning_rate": 0.0003, "loss": 11.0072, "loss/aux_loss": 0.04806558284908533, "loss/crossentropy": 2.675836908817291, "loss/logits": 0.8062641978263855, "step": 62740 }, { "epoch": 0.6275, "grad_norm": 16.25, "grad_norm_var": 0.7261555989583334, "learning_rate": 0.0003, "loss": 10.8478, "loss/aux_loss": 0.04808370973914862, "loss/crossentropy": 2.6298579216003417, "loss/logits": 0.8091606229543686, "step": 62750 }, { "epoch": 0.6276, "grad_norm": 15.6875, "grad_norm_var": 0.6263020833333334, "learning_rate": 0.0003, "loss": 10.9488, "loss/aux_loss": 0.048074055649340156, "loss/crossentropy": 2.684184890985489, "loss/logits": 0.8185036033391953, "step": 62760 }, { "epoch": 0.6277, "grad_norm": 15.0, "grad_norm_var": 0.7436848958333333, "learning_rate": 0.0003, "loss": 10.9509, "loss/aux_loss": 0.04805560186505318, "loss/crossentropy": 2.7323277831077575, "loss/logits": 0.8214883893728256, "step": 62770 }, { "epoch": 0.6278, "grad_norm": 16.375, "grad_norm_var": 0.6624837239583333, "learning_rate": 0.0003, "loss": 10.9952, "loss/aux_loss": 0.04808086436241865, "loss/crossentropy": 2.7757518172264097, "loss/logits": 0.8345901370048523, "step": 62780 }, { "epoch": 0.6279, "grad_norm": 15.8125, "grad_norm_var": 0.7156087239583333, "learning_rate": 0.0003, "loss": 10.8995, "loss/aux_loss": 0.04808343015611172, "loss/crossentropy": 2.809204262495041, "loss/logits": 0.8202072083950043, "step": 62790 }, { "epoch": 0.628, "grad_norm": 14.9375, "grad_norm_var": 0.37701822916666666, "learning_rate": 0.0003, "loss": 10.8546, "loss/aux_loss": 0.048064966686069965, "loss/crossentropy": 2.7755215167999268, "loss/logits": 0.8226945012807846, "step": 62800 }, { "epoch": 0.6281, "grad_norm": 14.5625, "grad_norm_var": 0.3634765625, "learning_rate": 0.0003, "loss": 10.8531, "loss/aux_loss": 0.04806779157370329, "loss/crossentropy": 2.684882569313049, "loss/logits": 0.82384153008461, "step": 62810 }, { "epoch": 0.6282, "grad_norm": 14.25, "grad_norm_var": 0.4567057291666667, "learning_rate": 0.0003, "loss": 10.8806, "loss/aux_loss": 0.048072170466184616, "loss/crossentropy": 2.750021505355835, "loss/logits": 0.7991617172956467, "step": 62820 }, { "epoch": 0.6283, "grad_norm": 14.8125, "grad_norm_var": 0.2503743489583333, "learning_rate": 0.0003, "loss": 10.9362, "loss/aux_loss": 0.04807143583893776, "loss/crossentropy": 2.749982488155365, "loss/logits": 0.8353980958461762, "step": 62830 }, { "epoch": 0.6284, "grad_norm": 14.25, "grad_norm_var": 0.517041015625, "learning_rate": 0.0003, "loss": 10.9349, "loss/aux_loss": 0.048057589866220954, "loss/crossentropy": 2.761917233467102, "loss/logits": 0.8192497193813324, "step": 62840 }, { "epoch": 0.6285, "grad_norm": 14.9375, "grad_norm_var": 0.5915201822916667, "learning_rate": 0.0003, "loss": 10.7967, "loss/aux_loss": 0.04808051139116287, "loss/crossentropy": 2.4791474997997285, "loss/logits": 0.8087964832782746, "step": 62850 }, { "epoch": 0.6286, "grad_norm": 15.6875, "grad_norm_var": 0.5188639322916667, "learning_rate": 0.0003, "loss": 11.022, "loss/aux_loss": 0.048080192692577836, "loss/crossentropy": 2.8895226955413817, "loss/logits": 0.8381287634372712, "step": 62860 }, { "epoch": 0.6287, "grad_norm": 15.5, "grad_norm_var": 0.5929524739583333, "learning_rate": 0.0003, "loss": 10.908, "loss/aux_loss": 0.04806222971528769, "loss/crossentropy": 2.747699362039566, "loss/logits": 0.8261835396289825, "step": 62870 }, { "epoch": 0.6288, "grad_norm": 15.375, "grad_norm_var": 1.2239420572916666, "learning_rate": 0.0003, "loss": 10.9217, "loss/aux_loss": 0.048062012530863285, "loss/crossentropy": 2.6717309832572935, "loss/logits": 0.8120827436447143, "step": 62880 }, { "epoch": 0.6289, "grad_norm": 13.9375, "grad_norm_var": 1.063525390625, "learning_rate": 0.0003, "loss": 10.7873, "loss/aux_loss": 0.048078616708517076, "loss/crossentropy": 2.6327943921089174, "loss/logits": 0.7977903634309769, "step": 62890 }, { "epoch": 0.629, "grad_norm": 16.0, "grad_norm_var": 0.4988932291666667, "learning_rate": 0.0003, "loss": 10.948, "loss/aux_loss": 0.04807599578052759, "loss/crossentropy": 2.727197366952896, "loss/logits": 0.8394771188497543, "step": 62900 }, { "epoch": 0.6291, "grad_norm": 15.375, "grad_norm_var": 0.3973307291666667, "learning_rate": 0.0003, "loss": 10.902, "loss/aux_loss": 0.04805990718305111, "loss/crossentropy": 2.791462790966034, "loss/logits": 0.8735287189483643, "step": 62910 }, { "epoch": 0.6292, "grad_norm": 14.6875, "grad_norm_var": 1.3952473958333333, "learning_rate": 0.0003, "loss": 10.9858, "loss/aux_loss": 0.04808229636400938, "loss/crossentropy": 2.740624117851257, "loss/logits": 0.7988775402307511, "step": 62920 }, { "epoch": 0.6293, "grad_norm": 14.9375, "grad_norm_var": 0.968212890625, "learning_rate": 0.0003, "loss": 10.8832, "loss/aux_loss": 0.0480513833463192, "loss/crossentropy": 2.785011887550354, "loss/logits": 0.8363262772560119, "step": 62930 }, { "epoch": 0.6294, "grad_norm": 16.25, "grad_norm_var": 0.5317708333333333, "learning_rate": 0.0003, "loss": 10.8444, "loss/aux_loss": 0.04807438999414444, "loss/crossentropy": 2.6182597100734712, "loss/logits": 0.7917582601308822, "step": 62940 }, { "epoch": 0.6295, "grad_norm": 14.375, "grad_norm_var": 0.4007649739583333, "learning_rate": 0.0003, "loss": 10.8933, "loss/aux_loss": 0.048065260984003544, "loss/crossentropy": 2.6386733055114746, "loss/logits": 0.8080016434192657, "step": 62950 }, { "epoch": 0.6296, "grad_norm": 14.5625, "grad_norm_var": 4.309358723958334, "learning_rate": 0.0003, "loss": 10.9506, "loss/aux_loss": 0.04806258585304022, "loss/crossentropy": 2.6573799908161164, "loss/logits": 0.797971498966217, "step": 62960 }, { "epoch": 0.6297, "grad_norm": 14.1875, "grad_norm_var": 0.6682291666666667, "learning_rate": 0.0003, "loss": 10.8314, "loss/aux_loss": 0.04807965587824583, "loss/crossentropy": 2.614883852005005, "loss/logits": 0.7928066223859787, "step": 62970 }, { "epoch": 0.6298, "grad_norm": 15.1875, "grad_norm_var": 0.452587890625, "learning_rate": 0.0003, "loss": 11.0064, "loss/aux_loss": 0.048063984513282774, "loss/crossentropy": 2.7016715586185454, "loss/logits": 0.8424327522516251, "step": 62980 }, { "epoch": 0.6299, "grad_norm": 15.0, "grad_norm_var": 0.3759765625, "learning_rate": 0.0003, "loss": 10.818, "loss/aux_loss": 0.04806662555783987, "loss/crossentropy": 2.7241014719009398, "loss/logits": 0.8228190451860428, "step": 62990 }, { "epoch": 0.63, "grad_norm": 15.625, "grad_norm_var": 0.3109212239583333, "learning_rate": 0.0003, "loss": 10.8628, "loss/aux_loss": 0.048064174503087996, "loss/crossentropy": 2.558414030075073, "loss/logits": 0.8088842839002609, "step": 63000 }, { "epoch": 0.6301, "grad_norm": 15.125, "grad_norm_var": 0.5624348958333333, "learning_rate": 0.0003, "loss": 11.0454, "loss/aux_loss": 0.04806617610156536, "loss/crossentropy": 2.8958580434322356, "loss/logits": 0.8480107396841049, "step": 63010 }, { "epoch": 0.6302, "grad_norm": 15.375, "grad_norm_var": 0.42185872395833335, "learning_rate": 0.0003, "loss": 10.8501, "loss/aux_loss": 0.04805904608219862, "loss/crossentropy": 2.912857186794281, "loss/logits": 0.8123132467269898, "step": 63020 }, { "epoch": 0.6303, "grad_norm": 15.1875, "grad_norm_var": 0.6196451822916667, "learning_rate": 0.0003, "loss": 10.6561, "loss/aux_loss": 0.04808768462389708, "loss/crossentropy": 2.677269399166107, "loss/logits": 0.8051055639982223, "step": 63030 }, { "epoch": 0.6304, "grad_norm": 15.0625, "grad_norm_var": 0.6065104166666667, "learning_rate": 0.0003, "loss": 10.784, "loss/aux_loss": 0.04805712196975946, "loss/crossentropy": 2.9472272872924803, "loss/logits": 0.8176089495420455, "step": 63040 }, { "epoch": 0.6305, "grad_norm": 15.25, "grad_norm_var": 0.51953125, "learning_rate": 0.0003, "loss": 10.8028, "loss/aux_loss": 0.04806398153305054, "loss/crossentropy": 2.6117322742938995, "loss/logits": 0.8162994027137757, "step": 63050 }, { "epoch": 0.6306, "grad_norm": 15.0625, "grad_norm_var": 1.9244140625, "learning_rate": 0.0003, "loss": 10.9476, "loss/aux_loss": 0.04808130543678999, "loss/crossentropy": 2.674235236644745, "loss/logits": 0.8312394112348557, "step": 63060 }, { "epoch": 0.6307, "grad_norm": 15.5, "grad_norm_var": 0.9276041666666667, "learning_rate": 0.0003, "loss": 11.0446, "loss/aux_loss": 0.048066967912018296, "loss/crossentropy": 2.751528322696686, "loss/logits": 0.8499272048473359, "step": 63070 }, { "epoch": 0.6308, "grad_norm": 14.4375, "grad_norm_var": 1.1157389322916667, "learning_rate": 0.0003, "loss": 10.9221, "loss/aux_loss": 0.04807424377650023, "loss/crossentropy": 2.756152319908142, "loss/logits": 0.8447676509618759, "step": 63080 }, { "epoch": 0.6309, "grad_norm": 15.3125, "grad_norm_var": 0.7453125, "learning_rate": 0.0003, "loss": 10.8395, "loss/aux_loss": 0.048067429848015306, "loss/crossentropy": 2.895149755477905, "loss/logits": 0.8409383088350296, "step": 63090 }, { "epoch": 0.631, "grad_norm": 14.0, "grad_norm_var": 0.9541666666666667, "learning_rate": 0.0003, "loss": 11.026, "loss/aux_loss": 0.04806844405829906, "loss/crossentropy": 2.747306799888611, "loss/logits": 0.8143503844738007, "step": 63100 }, { "epoch": 0.6311, "grad_norm": 15.125, "grad_norm_var": 4.254671223958334, "learning_rate": 0.0003, "loss": 10.9367, "loss/aux_loss": 0.04806724544614553, "loss/crossentropy": 2.741036427021027, "loss/logits": 0.8391730457544326, "step": 63110 }, { "epoch": 0.6312, "grad_norm": 14.1875, "grad_norm_var": 1.7449055989583333, "learning_rate": 0.0003, "loss": 11.0246, "loss/aux_loss": 0.04807869717478752, "loss/crossentropy": 2.748949956893921, "loss/logits": 0.8150217235088348, "step": 63120 }, { "epoch": 0.6313, "grad_norm": 15.0, "grad_norm_var": 1.1575520833333333, "learning_rate": 0.0003, "loss": 10.8627, "loss/aux_loss": 0.048072864301502705, "loss/crossentropy": 2.5791411340236663, "loss/logits": 0.7991462841629982, "step": 63130 }, { "epoch": 0.6314, "grad_norm": 15.25, "grad_norm_var": 1.3233723958333334, "learning_rate": 0.0003, "loss": 10.8953, "loss/aux_loss": 0.04805458467453718, "loss/crossentropy": 2.609523779153824, "loss/logits": 0.8489516407251358, "step": 63140 }, { "epoch": 0.6315, "grad_norm": 14.125, "grad_norm_var": 0.6304524739583334, "learning_rate": 0.0003, "loss": 10.934, "loss/aux_loss": 0.0480818985030055, "loss/crossentropy": 2.618219095468521, "loss/logits": 0.8005220651626587, "step": 63150 }, { "epoch": 0.6316, "grad_norm": 17.125, "grad_norm_var": 0.9078125, "learning_rate": 0.0003, "loss": 10.9303, "loss/aux_loss": 0.04806983359158039, "loss/crossentropy": 2.686432045698166, "loss/logits": 0.8284903228282928, "step": 63160 }, { "epoch": 0.6317, "grad_norm": 14.3125, "grad_norm_var": 1.4434895833333334, "learning_rate": 0.0003, "loss": 10.9313, "loss/aux_loss": 0.04806499164551496, "loss/crossentropy": 2.7233918964862824, "loss/logits": 0.8174569487571717, "step": 63170 }, { "epoch": 0.6318, "grad_norm": 16.0, "grad_norm_var": 169.39816080729167, "learning_rate": 0.0003, "loss": 10.8758, "loss/aux_loss": 0.048072904162108895, "loss/crossentropy": 2.759899604320526, "loss/logits": 0.8145837306976318, "step": 63180 }, { "epoch": 0.6319, "grad_norm": 15.75, "grad_norm_var": 164.56328125, "learning_rate": 0.0003, "loss": 10.949, "loss/aux_loss": 0.048071921803057194, "loss/crossentropy": 2.555657994747162, "loss/logits": 0.793342587351799, "step": 63190 }, { "epoch": 0.632, "grad_norm": 14.375, "grad_norm_var": 0.3575358072916667, "learning_rate": 0.0003, "loss": 10.9903, "loss/aux_loss": 0.04807258564978838, "loss/crossentropy": 2.7257075905799866, "loss/logits": 0.8129594385623932, "step": 63200 }, { "epoch": 0.6321, "grad_norm": 14.0, "grad_norm_var": 0.46295572916666666, "learning_rate": 0.0003, "loss": 10.8189, "loss/aux_loss": 0.048059084080159666, "loss/crossentropy": 2.704482650756836, "loss/logits": 0.8401682913303375, "step": 63210 }, { "epoch": 0.6322, "grad_norm": 14.75, "grad_norm_var": 0.42337239583333336, "learning_rate": 0.0003, "loss": 10.9708, "loss/aux_loss": 0.04807595741003752, "loss/crossentropy": 2.6612473666667937, "loss/logits": 0.8036254912614822, "step": 63220 }, { "epoch": 0.6323, "grad_norm": 15.1875, "grad_norm_var": 0.4369140625, "learning_rate": 0.0003, "loss": 10.8829, "loss/aux_loss": 0.04806538727134466, "loss/crossentropy": 2.730154258012772, "loss/logits": 0.8400603294372558, "step": 63230 }, { "epoch": 0.6324, "grad_norm": 15.3125, "grad_norm_var": 3.090559895833333, "learning_rate": 0.0003, "loss": 11.0641, "loss/aux_loss": 0.048070489801466464, "loss/crossentropy": 2.8468264818191527, "loss/logits": 0.838485524058342, "step": 63240 }, { "epoch": 0.6325, "grad_norm": 14.6875, "grad_norm_var": 3.206770833333333, "learning_rate": 0.0003, "loss": 11.0373, "loss/aux_loss": 0.04807015266269445, "loss/crossentropy": 2.78941011428833, "loss/logits": 0.8361764669418335, "step": 63250 }, { "epoch": 0.6326, "grad_norm": 14.75, "grad_norm_var": 29.7541015625, "learning_rate": 0.0003, "loss": 11.0234, "loss/aux_loss": 0.04807714801281691, "loss/crossentropy": 2.579346811771393, "loss/logits": 0.8216876536607742, "step": 63260 }, { "epoch": 0.6327, "grad_norm": 16.25, "grad_norm_var": 73.72159830729167, "learning_rate": 0.0003, "loss": 10.9517, "loss/aux_loss": 0.04807807840406895, "loss/crossentropy": 2.7519372761249543, "loss/logits": 0.8636613190174103, "step": 63270 }, { "epoch": 0.6328, "grad_norm": 17.0, "grad_norm_var": 0.8097493489583333, "learning_rate": 0.0003, "loss": 10.835, "loss/aux_loss": 0.04807962123304606, "loss/crossentropy": 2.561204981803894, "loss/logits": 0.7664595246315002, "step": 63280 }, { "epoch": 0.6329, "grad_norm": 14.3125, "grad_norm_var": 0.8628743489583334, "learning_rate": 0.0003, "loss": 10.8773, "loss/aux_loss": 0.04806769024580717, "loss/crossentropy": 2.592850297689438, "loss/logits": 0.8082565724849701, "step": 63290 }, { "epoch": 0.633, "grad_norm": 15.6875, "grad_norm_var": 0.5722493489583333, "learning_rate": 0.0003, "loss": 11.072, "loss/aux_loss": 0.048081412352621554, "loss/crossentropy": 2.8436991333961488, "loss/logits": 0.8220110654830932, "step": 63300 }, { "epoch": 0.6331, "grad_norm": 15.5, "grad_norm_var": 0.4627604166666667, "learning_rate": 0.0003, "loss": 10.9644, "loss/aux_loss": 0.04806417748332024, "loss/crossentropy": 2.746978682279587, "loss/logits": 0.8097629576921463, "step": 63310 }, { "epoch": 0.6332, "grad_norm": 14.125, "grad_norm_var": 1.061572265625, "learning_rate": 0.0003, "loss": 10.7537, "loss/aux_loss": 0.04806242845952511, "loss/crossentropy": 2.6207932353019716, "loss/logits": 0.8020304828882218, "step": 63320 }, { "epoch": 0.6333, "grad_norm": 14.375, "grad_norm_var": 0.3042805989583333, "learning_rate": 0.0003, "loss": 10.9099, "loss/aux_loss": 0.04806431401520968, "loss/crossentropy": 2.7724860310554504, "loss/logits": 0.812814936041832, "step": 63330 }, { "epoch": 0.6334, "grad_norm": 14.8125, "grad_norm_var": 3.206103515625, "learning_rate": 0.0003, "loss": 11.0152, "loss/aux_loss": 0.04808428026735782, "loss/crossentropy": 2.6957937836647035, "loss/logits": 0.8201660066843033, "step": 63340 }, { "epoch": 0.6335, "grad_norm": 18.625, "grad_norm_var": 4.637744140625, "learning_rate": 0.0003, "loss": 11.0486, "loss/aux_loss": 0.04807388950139284, "loss/crossentropy": 2.706111788749695, "loss/logits": 0.8277266383171081, "step": 63350 }, { "epoch": 0.6336, "grad_norm": 13.8125, "grad_norm_var": 1.4661458333333333, "learning_rate": 0.0003, "loss": 10.6886, "loss/aux_loss": 0.04806654676795006, "loss/crossentropy": 2.594613701105118, "loss/logits": 0.787641778588295, "step": 63360 }, { "epoch": 0.6337, "grad_norm": 16.875, "grad_norm_var": 0.5614420572916666, "learning_rate": 0.0003, "loss": 10.9344, "loss/aux_loss": 0.04806236121803522, "loss/crossentropy": 2.695681321620941, "loss/logits": 0.8070782214403153, "step": 63370 }, { "epoch": 0.6338, "grad_norm": 16.125, "grad_norm_var": 0.5291666666666667, "learning_rate": 0.0003, "loss": 10.9202, "loss/aux_loss": 0.04807351212948561, "loss/crossentropy": 2.5633945643901823, "loss/logits": 0.770171768963337, "step": 63380 }, { "epoch": 0.6339, "grad_norm": 15.0, "grad_norm_var": 0.3859375, "learning_rate": 0.0003, "loss": 10.9848, "loss/aux_loss": 0.04807458482682705, "loss/crossentropy": 2.643518990278244, "loss/logits": 0.8119804114103317, "step": 63390 }, { "epoch": 0.634, "grad_norm": 15.625, "grad_norm_var": 0.79609375, "learning_rate": 0.0003, "loss": 10.9034, "loss/aux_loss": 0.04807896073907614, "loss/crossentropy": 2.580906796455383, "loss/logits": 0.7909560561180115, "step": 63400 }, { "epoch": 0.6341, "grad_norm": 15.375, "grad_norm_var": 0.6630045572916666, "learning_rate": 0.0003, "loss": 10.8749, "loss/aux_loss": 0.04806428924202919, "loss/crossentropy": 2.6164624214172365, "loss/logits": 0.8118317008018494, "step": 63410 }, { "epoch": 0.6342, "grad_norm": 16.375, "grad_norm_var": 9.6837890625, "learning_rate": 0.0003, "loss": 10.9738, "loss/aux_loss": 0.0480818934738636, "loss/crossentropy": 2.6591660141944886, "loss/logits": 0.8064444810152054, "step": 63420 }, { "epoch": 0.6343, "grad_norm": 16.5, "grad_norm_var": 10.77578125, "learning_rate": 0.0003, "loss": 11.0038, "loss/aux_loss": 0.04806845411658287, "loss/crossentropy": 2.8173150777816773, "loss/logits": 0.839667072892189, "step": 63430 }, { "epoch": 0.6344, "grad_norm": 15.125, "grad_norm_var": 0.5028483072916666, "learning_rate": 0.0003, "loss": 10.8932, "loss/aux_loss": 0.04805213697254658, "loss/crossentropy": 2.636157047748566, "loss/logits": 0.792957991361618, "step": 63440 }, { "epoch": 0.6345, "grad_norm": 16.5, "grad_norm_var": 0.7291015625, "learning_rate": 0.0003, "loss": 10.8117, "loss/aux_loss": 0.048077466525137426, "loss/crossentropy": 2.6534729659557343, "loss/logits": 0.8073912143707276, "step": 63450 }, { "epoch": 0.6346, "grad_norm": 14.6875, "grad_norm_var": 0.768994140625, "learning_rate": 0.0003, "loss": 10.7352, "loss/aux_loss": 0.04807067047804594, "loss/crossentropy": 2.716173303127289, "loss/logits": 0.8196294963359833, "step": 63460 }, { "epoch": 0.6347, "grad_norm": 14.875, "grad_norm_var": 0.490869140625, "learning_rate": 0.0003, "loss": 10.8034, "loss/aux_loss": 0.04808057863265276, "loss/crossentropy": 2.6360740780830385, "loss/logits": 0.7901035279035569, "step": 63470 }, { "epoch": 0.6348, "grad_norm": 15.5, "grad_norm_var": 0.7166015625, "learning_rate": 0.0003, "loss": 10.8123, "loss/aux_loss": 0.048066640831530096, "loss/crossentropy": 2.6251393437385557, "loss/logits": 0.7680756062269211, "step": 63480 }, { "epoch": 0.6349, "grad_norm": 16.5, "grad_norm_var": 2.012744140625, "learning_rate": 0.0003, "loss": 10.8699, "loss/aux_loss": 0.048078449070453645, "loss/crossentropy": 2.72235426902771, "loss/logits": 0.8374341070652008, "step": 63490 }, { "epoch": 0.635, "grad_norm": 14.9375, "grad_norm_var": 2.2, "learning_rate": 0.0003, "loss": 10.886, "loss/aux_loss": 0.04805882424116135, "loss/crossentropy": 2.775067353248596, "loss/logits": 0.8465858489274979, "step": 63500 }, { "epoch": 0.6351, "grad_norm": 15.25, "grad_norm_var": 0.1453125, "learning_rate": 0.0003, "loss": 10.8996, "loss/aux_loss": 0.048069519177079204, "loss/crossentropy": 2.8206980526447296, "loss/logits": 0.8248533338308335, "step": 63510 }, { "epoch": 0.6352, "grad_norm": 15.125, "grad_norm_var": 0.60625, "learning_rate": 0.0003, "loss": 10.8778, "loss/aux_loss": 0.04807919226586819, "loss/crossentropy": 2.8253234326839447, "loss/logits": 0.8029891848564148, "step": 63520 }, { "epoch": 0.6353, "grad_norm": 15.75, "grad_norm_var": 0.830712890625, "learning_rate": 0.0003, "loss": 10.8775, "loss/aux_loss": 0.048057425394654275, "loss/crossentropy": 2.6366010308265686, "loss/logits": 0.8082585781812668, "step": 63530 }, { "epoch": 0.6354, "grad_norm": 15.9375, "grad_norm_var": 1.255322265625, "learning_rate": 0.0003, "loss": 10.7782, "loss/aux_loss": 0.048088168166577815, "loss/crossentropy": 2.5838097631931305, "loss/logits": 0.7870519459247589, "step": 63540 }, { "epoch": 0.6355, "grad_norm": 14.875, "grad_norm_var": 1.31015625, "learning_rate": 0.0003, "loss": 10.8468, "loss/aux_loss": 0.04806020092219114, "loss/crossentropy": 2.711851143836975, "loss/logits": 0.8394827723503113, "step": 63550 }, { "epoch": 0.6356, "grad_norm": 16.375, "grad_norm_var": 0.5389973958333333, "learning_rate": 0.0003, "loss": 10.8928, "loss/aux_loss": 0.048066692799329756, "loss/crossentropy": 2.7870961904525755, "loss/logits": 0.8038838863372803, "step": 63560 }, { "epoch": 0.6357, "grad_norm": 15.1875, "grad_norm_var": 0.41015625, "learning_rate": 0.0003, "loss": 10.9666, "loss/aux_loss": 0.04806511420756578, "loss/crossentropy": 2.641837865114212, "loss/logits": 0.8012272834777832, "step": 63570 }, { "epoch": 0.6358, "grad_norm": 14.0625, "grad_norm_var": 0.35625, "learning_rate": 0.0003, "loss": 10.8966, "loss/aux_loss": 0.0480894086882472, "loss/crossentropy": 2.6827530384063722, "loss/logits": 0.7748128771781921, "step": 63580 }, { "epoch": 0.6359, "grad_norm": 14.75, "grad_norm_var": 0.496337890625, "learning_rate": 0.0003, "loss": 10.8975, "loss/aux_loss": 0.048070944286882876, "loss/crossentropy": 2.441706246137619, "loss/logits": 0.8006876438856125, "step": 63590 }, { "epoch": 0.636, "grad_norm": 15.9375, "grad_norm_var": 0.2744140625, "learning_rate": 0.0003, "loss": 10.9769, "loss/aux_loss": 0.04805447738617659, "loss/crossentropy": 2.801032680273056, "loss/logits": 0.8238119214773179, "step": 63600 }, { "epoch": 0.6361, "grad_norm": 15.4375, "grad_norm_var": 0.48359375, "learning_rate": 0.0003, "loss": 10.8361, "loss/aux_loss": 0.04808128047734499, "loss/crossentropy": 2.6498410642147063, "loss/logits": 0.7981126606464386, "step": 63610 }, { "epoch": 0.6362, "grad_norm": 14.375, "grad_norm_var": 0.28046875, "learning_rate": 0.0003, "loss": 10.9804, "loss/aux_loss": 0.04807472582906484, "loss/crossentropy": 2.6279158115386965, "loss/logits": 0.8193077623844147, "step": 63620 }, { "epoch": 0.6363, "grad_norm": 15.0625, "grad_norm_var": 0.19568684895833333, "learning_rate": 0.0003, "loss": 10.7969, "loss/aux_loss": 0.048069669492542745, "loss/crossentropy": 2.8434048295021057, "loss/logits": 0.8453941226005555, "step": 63630 }, { "epoch": 0.6364, "grad_norm": 15.0, "grad_norm_var": 1.27265625, "learning_rate": 0.0003, "loss": 10.9003, "loss/aux_loss": 0.0480651993304491, "loss/crossentropy": 2.757810640335083, "loss/logits": 0.8333944648504257, "step": 63640 }, { "epoch": 0.6365, "grad_norm": 15.375, "grad_norm_var": 1.1129557291666667, "learning_rate": 0.0003, "loss": 11.0213, "loss/aux_loss": 0.04808567147701979, "loss/crossentropy": 2.717263233661652, "loss/logits": 0.862332072854042, "step": 63650 }, { "epoch": 0.6366, "grad_norm": 14.625, "grad_norm_var": 0.43743489583333334, "learning_rate": 0.0003, "loss": 11.0693, "loss/aux_loss": 0.04806395098567009, "loss/crossentropy": 2.705856317281723, "loss/logits": 0.7979970872402191, "step": 63660 }, { "epoch": 0.6367, "grad_norm": 16.125, "grad_norm_var": 0.489697265625, "learning_rate": 0.0003, "loss": 10.872, "loss/aux_loss": 0.048058228194713594, "loss/crossentropy": 2.552783203125, "loss/logits": 0.8018041133880616, "step": 63670 }, { "epoch": 0.6368, "grad_norm": 15.125, "grad_norm_var": 0.5059895833333333, "learning_rate": 0.0003, "loss": 10.7682, "loss/aux_loss": 0.048081094212830064, "loss/crossentropy": 2.757079029083252, "loss/logits": 0.8517242342233657, "step": 63680 }, { "epoch": 0.6369, "grad_norm": 15.875, "grad_norm_var": 0.4239420572916667, "learning_rate": 0.0003, "loss": 10.9756, "loss/aux_loss": 0.04806440509855747, "loss/crossentropy": 2.679883936047554, "loss/logits": 0.8242890566587449, "step": 63690 }, { "epoch": 0.637, "grad_norm": 15.0625, "grad_norm_var": 0.792431640625, "learning_rate": 0.0003, "loss": 10.6549, "loss/aux_loss": 0.04806427750736475, "loss/crossentropy": 2.6404387235641478, "loss/logits": 0.8252344757318497, "step": 63700 }, { "epoch": 0.6371, "grad_norm": 15.125, "grad_norm_var": 0.6921223958333333, "learning_rate": 0.0003, "loss": 10.952, "loss/aux_loss": 0.04807978924363852, "loss/crossentropy": 2.7173975467681886, "loss/logits": 0.8082460671663284, "step": 63710 }, { "epoch": 0.6372, "grad_norm": 16.5, "grad_norm_var": 0.566650390625, "learning_rate": 0.0003, "loss": 10.7532, "loss/aux_loss": 0.04806181751191616, "loss/crossentropy": 2.790048438310623, "loss/logits": 0.8141476571559906, "step": 63720 }, { "epoch": 0.6373, "grad_norm": 16.125, "grad_norm_var": 0.7513020833333334, "learning_rate": 0.0003, "loss": 10.9906, "loss/aux_loss": 0.048070454970002174, "loss/crossentropy": 2.7498911917209625, "loss/logits": 0.8331282079219818, "step": 63730 }, { "epoch": 0.6374, "grad_norm": 16.75, "grad_norm_var": 0.954931640625, "learning_rate": 0.0003, "loss": 10.76, "loss/aux_loss": 0.04806820340454578, "loss/crossentropy": 2.685372221469879, "loss/logits": 0.8030792355537415, "step": 63740 }, { "epoch": 0.6375, "grad_norm": 15.25, "grad_norm_var": 0.9230305989583333, "learning_rate": 0.0003, "loss": 10.9593, "loss/aux_loss": 0.048071705549955365, "loss/crossentropy": 2.553539252281189, "loss/logits": 0.8266629427671432, "step": 63750 }, { "epoch": 0.6376, "grad_norm": 14.6875, "grad_norm_var": 0.6065104166666667, "learning_rate": 0.0003, "loss": 10.8365, "loss/aux_loss": 0.04807527456432581, "loss/crossentropy": 2.7262804925441744, "loss/logits": 0.801684433221817, "step": 63760 }, { "epoch": 0.6377, "grad_norm": 16.625, "grad_norm_var": 0.5832682291666667, "learning_rate": 0.0003, "loss": 10.9552, "loss/aux_loss": 0.04806269612163305, "loss/crossentropy": 2.7759795606136324, "loss/logits": 0.8241303324699402, "step": 63770 }, { "epoch": 0.6378, "grad_norm": 14.3125, "grad_norm_var": 1.2555826822916667, "learning_rate": 0.0003, "loss": 10.8722, "loss/aux_loss": 0.04808154441416264, "loss/crossentropy": 2.723454737663269, "loss/logits": 0.7963994681835175, "step": 63780 }, { "epoch": 0.6379, "grad_norm": 15.0625, "grad_norm_var": 1.0734375, "learning_rate": 0.0003, "loss": 10.9994, "loss/aux_loss": 0.04806034788489342, "loss/crossentropy": 2.787865138053894, "loss/logits": 0.8507139623165131, "step": 63790 }, { "epoch": 0.638, "grad_norm": 14.625, "grad_norm_var": 1.4801432291666667, "learning_rate": 0.0003, "loss": 10.9361, "loss/aux_loss": 0.048068071529269216, "loss/crossentropy": 2.6697638273239135, "loss/logits": 0.8172403901815415, "step": 63800 }, { "epoch": 0.6381, "grad_norm": 16.25, "grad_norm_var": 0.7082682291666667, "learning_rate": 0.0003, "loss": 10.862, "loss/aux_loss": 0.048073775880038735, "loss/crossentropy": 2.798969733715057, "loss/logits": 0.8269981414079666, "step": 63810 }, { "epoch": 0.6382, "grad_norm": 15.4375, "grad_norm_var": 0.2596354166666667, "learning_rate": 0.0003, "loss": 10.9564, "loss/aux_loss": 0.04805988427251577, "loss/crossentropy": 2.732244443893433, "loss/logits": 0.8475836634635925, "step": 63820 }, { "epoch": 0.6383, "grad_norm": 14.4375, "grad_norm_var": 0.482666015625, "learning_rate": 0.0003, "loss": 10.9159, "loss/aux_loss": 0.04807668384164572, "loss/crossentropy": 2.782858157157898, "loss/logits": 0.7943806976079941, "step": 63830 }, { "epoch": 0.6384, "grad_norm": 16.25, "grad_norm_var": 0.328369140625, "learning_rate": 0.0003, "loss": 11.0277, "loss/aux_loss": 0.0480587437748909, "loss/crossentropy": 2.840771198272705, "loss/logits": 0.8426368027925492, "step": 63840 }, { "epoch": 0.6385, "grad_norm": 16.0, "grad_norm_var": 0.5332682291666667, "learning_rate": 0.0003, "loss": 10.9838, "loss/aux_loss": 0.04806698095053434, "loss/crossentropy": 2.771585577726364, "loss/logits": 0.8477590322494507, "step": 63850 }, { "epoch": 0.6386, "grad_norm": 15.875, "grad_norm_var": 0.3931640625, "learning_rate": 0.0003, "loss": 10.7073, "loss/aux_loss": 0.04806830566376448, "loss/crossentropy": 2.6541366040706635, "loss/logits": 0.8144404917955399, "step": 63860 }, { "epoch": 0.6387, "grad_norm": 15.875, "grad_norm_var": 0.6769368489583333, "learning_rate": 0.0003, "loss": 10.8338, "loss/aux_loss": 0.048068562522530556, "loss/crossentropy": 2.7276011228561403, "loss/logits": 0.7975292503833771, "step": 63870 }, { "epoch": 0.6388, "grad_norm": 14.0625, "grad_norm_var": 1.0471354166666667, "learning_rate": 0.0003, "loss": 10.9173, "loss/aux_loss": 0.04807192627340555, "loss/crossentropy": 2.703585624694824, "loss/logits": 0.8457301408052444, "step": 63880 }, { "epoch": 0.6389, "grad_norm": 15.8125, "grad_norm_var": 18.350455729166665, "learning_rate": 0.0003, "loss": 10.986, "loss/aux_loss": 0.04805469363927841, "loss/crossentropy": 2.7493717789649965, "loss/logits": 0.7949663013219833, "step": 63890 }, { "epoch": 0.639, "grad_norm": 15.375, "grad_norm_var": 17.873030598958334, "learning_rate": 0.0003, "loss": 10.8859, "loss/aux_loss": 0.048075062409043315, "loss/crossentropy": 2.909009563922882, "loss/logits": 0.8053748130798339, "step": 63900 }, { "epoch": 0.6391, "grad_norm": 15.6875, "grad_norm_var": 0.364306640625, "learning_rate": 0.0003, "loss": 10.9087, "loss/aux_loss": 0.048067734017968176, "loss/crossentropy": 2.9225926876068113, "loss/logits": 0.835890656709671, "step": 63910 }, { "epoch": 0.6392, "grad_norm": 15.75, "grad_norm_var": 3.35234375, "learning_rate": 0.0003, "loss": 10.8436, "loss/aux_loss": 0.048074383102357385, "loss/crossentropy": 2.780510759353638, "loss/logits": 0.8370512515306473, "step": 63920 }, { "epoch": 0.6393, "grad_norm": 14.6875, "grad_norm_var": 3.1134765625, "learning_rate": 0.0003, "loss": 10.8743, "loss/aux_loss": 0.048071058467030525, "loss/crossentropy": 2.6825768053531647, "loss/logits": 0.789939995110035, "step": 63930 }, { "epoch": 0.6394, "grad_norm": 15.4375, "grad_norm_var": 0.42967122395833335, "learning_rate": 0.0003, "loss": 11.0965, "loss/aux_loss": 0.048058373667299745, "loss/crossentropy": 2.78253173828125, "loss/logits": 0.8522822350263596, "step": 63940 }, { "epoch": 0.6395, "grad_norm": 15.0625, "grad_norm_var": 0.4110514322916667, "learning_rate": 0.0003, "loss": 10.8844, "loss/aux_loss": 0.04807241130620241, "loss/crossentropy": 2.608461046218872, "loss/logits": 0.8056067079305649, "step": 63950 }, { "epoch": 0.6396, "grad_norm": 14.375, "grad_norm_var": 0.46432291666666664, "learning_rate": 0.0003, "loss": 10.9174, "loss/aux_loss": 0.048062784038484095, "loss/crossentropy": 2.6751578748226166, "loss/logits": 0.8276819512248039, "step": 63960 }, { "epoch": 0.6397, "grad_norm": 15.625, "grad_norm_var": 0.788134765625, "learning_rate": 0.0003, "loss": 11.0461, "loss/aux_loss": 0.04806781299412251, "loss/crossentropy": 2.871128559112549, "loss/logits": 0.846360245347023, "step": 63970 }, { "epoch": 0.6398, "grad_norm": 15.75, "grad_norm_var": 73.32381184895833, "learning_rate": 0.0003, "loss": 11.1256, "loss/aux_loss": 0.04808484613895416, "loss/crossentropy": 2.7596030294895173, "loss/logits": 0.8853773176670074, "step": 63980 }, { "epoch": 0.6399, "grad_norm": 15.4375, "grad_norm_var": 73.80618489583334, "learning_rate": 0.0003, "loss": 10.8477, "loss/aux_loss": 0.04804834388196468, "loss/crossentropy": 2.660778295993805, "loss/logits": 0.8173895359039307, "step": 63990 }, { "epoch": 0.64, "grad_norm": 15.125, "grad_norm_var": 0.9999348958333333, "learning_rate": 0.0003, "loss": 10.8996, "loss/aux_loss": 0.048082450218498704, "loss/crossentropy": 2.6331078112125397, "loss/logits": 0.8155697345733642, "step": 64000 }, { "epoch": 0.6401, "grad_norm": 15.0625, "grad_norm_var": 0.8478515625, "learning_rate": 0.0003, "loss": 10.8079, "loss/aux_loss": 0.04807614423334598, "loss/crossentropy": 2.6761899530887603, "loss/logits": 0.7967321127653122, "step": 64010 }, { "epoch": 0.6402, "grad_norm": 14.25, "grad_norm_var": 0.3965983072916667, "learning_rate": 0.0003, "loss": 10.8051, "loss/aux_loss": 0.048058840073645116, "loss/crossentropy": 2.688008636236191, "loss/logits": 0.7768597364425659, "step": 64020 }, { "epoch": 0.6403, "grad_norm": 14.375, "grad_norm_var": 0.4049479166666667, "learning_rate": 0.0003, "loss": 10.8649, "loss/aux_loss": 0.048065507970750335, "loss/crossentropy": 2.7258352994918824, "loss/logits": 0.8018498718738556, "step": 64030 }, { "epoch": 0.6404, "grad_norm": 13.875, "grad_norm_var": 0.6686848958333333, "learning_rate": 0.0003, "loss": 10.6553, "loss/aux_loss": 0.04808259606361389, "loss/crossentropy": 2.684278553724289, "loss/logits": 0.7884336978197097, "step": 64040 }, { "epoch": 0.6405, "grad_norm": 14.25, "grad_norm_var": 0.86640625, "learning_rate": 0.0003, "loss": 10.8835, "loss/aux_loss": 0.04807013440877199, "loss/crossentropy": 2.6649328231811524, "loss/logits": 0.8022267431020736, "step": 64050 }, { "epoch": 0.6406, "grad_norm": 16.125, "grad_norm_var": 0.6298014322916666, "learning_rate": 0.0003, "loss": 10.8014, "loss/aux_loss": 0.04807114787399769, "loss/crossentropy": 2.7330439388751984, "loss/logits": 0.8096017986536026, "step": 64060 }, { "epoch": 0.6407, "grad_norm": 14.875, "grad_norm_var": 0.4886555989583333, "learning_rate": 0.0003, "loss": 10.8495, "loss/aux_loss": 0.048064742051064965, "loss/crossentropy": 2.6271802723407744, "loss/logits": 0.8042764306068421, "step": 64070 }, { "epoch": 0.6408, "grad_norm": 14.25, "grad_norm_var": 0.4212890625, "learning_rate": 0.0003, "loss": 10.9443, "loss/aux_loss": 0.048074343241751194, "loss/crossentropy": 2.651158905029297, "loss/logits": 0.8192477524280548, "step": 64080 }, { "epoch": 0.6409, "grad_norm": 16.0, "grad_norm_var": 0.5291015625, "learning_rate": 0.0003, "loss": 10.8708, "loss/aux_loss": 0.04805680923163891, "loss/crossentropy": 2.740699511766434, "loss/logits": 0.8247522652149201, "step": 64090 }, { "epoch": 0.641, "grad_norm": 16.875, "grad_norm_var": 1.893603515625, "learning_rate": 0.0003, "loss": 10.7972, "loss/aux_loss": 0.0480952775105834, "loss/crossentropy": 2.6142990469932554, "loss/logits": 0.7922994047403336, "step": 64100 }, { "epoch": 0.6411, "grad_norm": 14.5, "grad_norm_var": 0.7624837239583333, "learning_rate": 0.0003, "loss": 11.0196, "loss/aux_loss": 0.04806794486939907, "loss/crossentropy": 2.6602042615413666, "loss/logits": 0.8153569340705872, "step": 64110 }, { "epoch": 0.6412, "grad_norm": 14.875, "grad_norm_var": 0.4712890625, "learning_rate": 0.0003, "loss": 10.9019, "loss/aux_loss": 0.04807754717767239, "loss/crossentropy": 2.6273086309432983, "loss/logits": 0.7900595605373383, "step": 64120 }, { "epoch": 0.6413, "grad_norm": 15.125, "grad_norm_var": 1.4556640625, "learning_rate": 0.0003, "loss": 11.0146, "loss/aux_loss": 0.048073111660778524, "loss/crossentropy": 2.8401905834674834, "loss/logits": 0.8362075448036194, "step": 64130 }, { "epoch": 0.6414, "grad_norm": 16.0, "grad_norm_var": 1.1136555989583334, "learning_rate": 0.0003, "loss": 11.0899, "loss/aux_loss": 0.04805966299027205, "loss/crossentropy": 2.7619579434394836, "loss/logits": 0.806191298365593, "step": 64140 }, { "epoch": 0.6415, "grad_norm": 14.375, "grad_norm_var": 0.8770182291666667, "learning_rate": 0.0003, "loss": 10.9026, "loss/aux_loss": 0.0480696702376008, "loss/crossentropy": 2.6707617938518524, "loss/logits": 0.8110317856073379, "step": 64150 }, { "epoch": 0.6416, "grad_norm": 16.125, "grad_norm_var": 0.59609375, "learning_rate": 0.0003, "loss": 10.9761, "loss/aux_loss": 0.04806564971804619, "loss/crossentropy": 2.679551374912262, "loss/logits": 0.8288383305072784, "step": 64160 }, { "epoch": 0.6417, "grad_norm": 14.8125, "grad_norm_var": 0.6447265625, "learning_rate": 0.0003, "loss": 10.7706, "loss/aux_loss": 0.048071674257516864, "loss/crossentropy": 2.720490908622742, "loss/logits": 0.8180992275476455, "step": 64170 }, { "epoch": 0.6418, "grad_norm": 66.5, "grad_norm_var": 244.53170572916667, "learning_rate": 0.0003, "loss": 10.8762, "loss/aux_loss": 0.048073908500373366, "loss/crossentropy": 2.6952185809612272, "loss/logits": 0.7881834089756012, "step": 64180 }, { "epoch": 0.6419, "grad_norm": 18.375, "grad_norm_var": 226.25494791666668, "learning_rate": 0.0003, "loss": 10.8772, "loss/aux_loss": 0.04805737938731909, "loss/crossentropy": 2.7175555408000944, "loss/logits": 0.7974839717149734, "step": 64190 }, { "epoch": 0.642, "grad_norm": 15.8125, "grad_norm_var": 8.923697916666667, "learning_rate": 0.0003, "loss": 10.8089, "loss/aux_loss": 0.048077397607266904, "loss/crossentropy": 2.632685160636902, "loss/logits": 0.8100000500679017, "step": 64200 }, { "epoch": 0.6421, "grad_norm": 16.625, "grad_norm_var": 0.4105305989583333, "learning_rate": 0.0003, "loss": 10.9696, "loss/aux_loss": 0.04807335864752531, "loss/crossentropy": 2.6972862422466277, "loss/logits": 0.7903000891208649, "step": 64210 }, { "epoch": 0.6422, "grad_norm": 15.1875, "grad_norm_var": 0.42576497395833335, "learning_rate": 0.0003, "loss": 10.7641, "loss/aux_loss": 0.048062941245734694, "loss/crossentropy": 2.5591843128204346, "loss/logits": 0.7982923656702041, "step": 64220 }, { "epoch": 0.6423, "grad_norm": 15.25, "grad_norm_var": 0.5587890625, "learning_rate": 0.0003, "loss": 10.882, "loss/aux_loss": 0.04808053784072399, "loss/crossentropy": 2.68142853975296, "loss/logits": 0.8112878233194352, "step": 64230 }, { "epoch": 0.6424, "grad_norm": 14.5625, "grad_norm_var": 0.7109212239583333, "learning_rate": 0.0003, "loss": 10.903, "loss/aux_loss": 0.04807339143007994, "loss/crossentropy": 2.6936437368392943, "loss/logits": 0.7852249950170517, "step": 64240 }, { "epoch": 0.6425, "grad_norm": 16.5, "grad_norm_var": 0.6098307291666667, "learning_rate": 0.0003, "loss": 10.8919, "loss/aux_loss": 0.048065887205302714, "loss/crossentropy": 2.6046033978462217, "loss/logits": 0.8367180943489074, "step": 64250 }, { "epoch": 0.6426, "grad_norm": 16.375, "grad_norm_var": 1.3727701822916667, "learning_rate": 0.0003, "loss": 10.8046, "loss/aux_loss": 0.048077253997325896, "loss/crossentropy": 2.7186995148658752, "loss/logits": 0.8272952169179917, "step": 64260 }, { "epoch": 0.6427, "grad_norm": 15.0, "grad_norm_var": 1.4869140625, "learning_rate": 0.0003, "loss": 11.0264, "loss/aux_loss": 0.04806565903127193, "loss/crossentropy": 2.7013957381248472, "loss/logits": 0.8380502104759217, "step": 64270 }, { "epoch": 0.6428, "grad_norm": 15.3125, "grad_norm_var": 0.6114583333333333, "learning_rate": 0.0003, "loss": 10.8768, "loss/aux_loss": 0.048069261759519574, "loss/crossentropy": 2.6103322327136995, "loss/logits": 0.824969407916069, "step": 64280 }, { "epoch": 0.6429, "grad_norm": 15.125, "grad_norm_var": 0.39791666666666664, "learning_rate": 0.0003, "loss": 10.9467, "loss/aux_loss": 0.04807145707309246, "loss/crossentropy": 2.5910118997097014, "loss/logits": 0.7979099124670028, "step": 64290 }, { "epoch": 0.643, "grad_norm": 14.6875, "grad_norm_var": 0.7320149739583334, "learning_rate": 0.0003, "loss": 10.9267, "loss/aux_loss": 0.048070203140378, "loss/crossentropy": 2.789473479986191, "loss/logits": 0.7883479207754135, "step": 64300 }, { "epoch": 0.6431, "grad_norm": 15.25, "grad_norm_var": 1.0072916666666667, "learning_rate": 0.0003, "loss": 10.8941, "loss/aux_loss": 0.04807113241404295, "loss/crossentropy": 2.706324911117554, "loss/logits": 0.8335605084896087, "step": 64310 }, { "epoch": 0.6432, "grad_norm": 16.5, "grad_norm_var": 517.822509765625, "learning_rate": 0.0003, "loss": 10.9202, "loss/aux_loss": 0.04807917233556509, "loss/crossentropy": 2.5871843814849855, "loss/logits": 0.7983285367488862, "step": 64320 }, { "epoch": 0.6433, "grad_norm": 16.375, "grad_norm_var": 1.3219889322916667, "learning_rate": 0.0003, "loss": 10.8152, "loss/aux_loss": 0.04806725718080997, "loss/crossentropy": 2.714830732345581, "loss/logits": 0.7972691237926484, "step": 64330 }, { "epoch": 0.6434, "grad_norm": 14.9375, "grad_norm_var": 0.44464518229166666, "learning_rate": 0.0003, "loss": 11.0088, "loss/aux_loss": 0.048060521483421326, "loss/crossentropy": 2.863740932941437, "loss/logits": 0.8662774622440338, "step": 64340 }, { "epoch": 0.6435, "grad_norm": 14.375, "grad_norm_var": 0.461572265625, "learning_rate": 0.0003, "loss": 10.8927, "loss/aux_loss": 0.048076084814965725, "loss/crossentropy": 2.6789645075798036, "loss/logits": 0.8414348632097244, "step": 64350 }, { "epoch": 0.6436, "grad_norm": 13.5, "grad_norm_var": 0.2843098958333333, "learning_rate": 0.0003, "loss": 10.7573, "loss/aux_loss": 0.04807751737535, "loss/crossentropy": 2.5758812725543976, "loss/logits": 0.7923395410180092, "step": 64360 }, { "epoch": 0.6437, "grad_norm": 14.8125, "grad_norm_var": 0.5059733072916667, "learning_rate": 0.0003, "loss": 10.9254, "loss/aux_loss": 0.04806223139166832, "loss/crossentropy": 2.824587380886078, "loss/logits": 0.779283007979393, "step": 64370 }, { "epoch": 0.6438, "grad_norm": 15.9375, "grad_norm_var": 0.23162434895833334, "learning_rate": 0.0003, "loss": 11.1465, "loss/aux_loss": 0.04806876201182604, "loss/crossentropy": 2.7761879444122313, "loss/logits": 0.8234198421239853, "step": 64380 }, { "epoch": 0.6439, "grad_norm": 14.25, "grad_norm_var": 23.5265625, "learning_rate": 0.0003, "loss": 10.8994, "loss/aux_loss": 0.048068377934396264, "loss/crossentropy": 2.849505627155304, "loss/logits": 0.7996205180883408, "step": 64390 }, { "epoch": 0.644, "grad_norm": 14.875, "grad_norm_var": 24.1228515625, "learning_rate": 0.0003, "loss": 10.8842, "loss/aux_loss": 0.04806913807988167, "loss/crossentropy": 2.699050772190094, "loss/logits": 0.8141772150993347, "step": 64400 }, { "epoch": 0.6441, "grad_norm": 14.9375, "grad_norm_var": 0.29635416666666664, "learning_rate": 0.0003, "loss": 10.8829, "loss/aux_loss": 0.04808142352849245, "loss/crossentropy": 2.6662961184978484, "loss/logits": 0.8306810945272446, "step": 64410 }, { "epoch": 0.6442, "grad_norm": 15.25, "grad_norm_var": 0.5077473958333333, "learning_rate": 0.0003, "loss": 10.8389, "loss/aux_loss": 0.0480579923838377, "loss/crossentropy": 2.5457189321517943, "loss/logits": 0.7988121956586838, "step": 64420 }, { "epoch": 0.6443, "grad_norm": 14.5, "grad_norm_var": 0.24060872395833333, "learning_rate": 0.0003, "loss": 10.9997, "loss/aux_loss": 0.048068817704916, "loss/crossentropy": 2.6517822682857513, "loss/logits": 0.7966990500688553, "step": 64430 }, { "epoch": 0.6444, "grad_norm": 15.75, "grad_norm_var": 0.5145182291666667, "learning_rate": 0.0003, "loss": 10.9814, "loss/aux_loss": 0.048084283247590065, "loss/crossentropy": 2.663684105873108, "loss/logits": 0.8236712843179703, "step": 64440 }, { "epoch": 0.6445, "grad_norm": 16.0, "grad_norm_var": 0.5385416666666667, "learning_rate": 0.0003, "loss": 11.0452, "loss/aux_loss": 0.04805535394698381, "loss/crossentropy": 2.7321596264839174, "loss/logits": 0.81631198823452, "step": 64450 }, { "epoch": 0.6446, "grad_norm": 16.0, "grad_norm_var": 0.3385416666666667, "learning_rate": 0.0003, "loss": 10.7714, "loss/aux_loss": 0.0480718620121479, "loss/crossentropy": 2.5779692411422728, "loss/logits": 0.7793363690376282, "step": 64460 }, { "epoch": 0.6447, "grad_norm": 13.9375, "grad_norm_var": 0.6223795572916667, "learning_rate": 0.0003, "loss": 10.9604, "loss/aux_loss": 0.048071989230811595, "loss/crossentropy": 2.749948966503143, "loss/logits": 0.8526365518569946, "step": 64470 }, { "epoch": 0.6448, "grad_norm": 15.875, "grad_norm_var": 0.33229166666666665, "learning_rate": 0.0003, "loss": 10.9431, "loss/aux_loss": 0.04805717971175909, "loss/crossentropy": 2.7506559550762177, "loss/logits": 0.8254747807979583, "step": 64480 }, { "epoch": 0.6449, "grad_norm": 15.125, "grad_norm_var": 0.2518229166666667, "learning_rate": 0.0003, "loss": 11.1098, "loss/aux_loss": 0.04806588124483824, "loss/crossentropy": 2.728802466392517, "loss/logits": 0.8510254561901093, "step": 64490 }, { "epoch": 0.645, "grad_norm": 14.1875, "grad_norm_var": 0.5056640625, "learning_rate": 0.0003, "loss": 11.0118, "loss/aux_loss": 0.04808861147612333, "loss/crossentropy": 2.828171968460083, "loss/logits": 0.8330163925886154, "step": 64500 }, { "epoch": 0.6451, "grad_norm": 15.0625, "grad_norm_var": 0.24659830729166668, "learning_rate": 0.0003, "loss": 10.6805, "loss/aux_loss": 0.048062241077423094, "loss/crossentropy": 2.703647696971893, "loss/logits": 0.8099043250083924, "step": 64510 }, { "epoch": 0.6452, "grad_norm": 14.1875, "grad_norm_var": 0.42967122395833335, "learning_rate": 0.0003, "loss": 10.9858, "loss/aux_loss": 0.048076873645186424, "loss/crossentropy": 2.5978757619857786, "loss/logits": 0.807657128572464, "step": 64520 }, { "epoch": 0.6453, "grad_norm": 16.25, "grad_norm_var": 106.27864583333333, "learning_rate": 0.0003, "loss": 10.9333, "loss/aux_loss": 0.04806945752352476, "loss/crossentropy": 2.7974547028541563, "loss/logits": 0.8342884957790375, "step": 64530 }, { "epoch": 0.6454, "grad_norm": 14.3125, "grad_norm_var": 1.3356770833333333, "learning_rate": 0.0003, "loss": 10.9232, "loss/aux_loss": 0.04806990846991539, "loss/crossentropy": 2.650894695520401, "loss/logits": 0.7819722086191178, "step": 64540 }, { "epoch": 0.6455, "grad_norm": 15.375, "grad_norm_var": 0.658447265625, "learning_rate": 0.0003, "loss": 10.9033, "loss/aux_loss": 0.04806603621691465, "loss/crossentropy": 2.7686345756053923, "loss/logits": 0.7982172280550003, "step": 64550 }, { "epoch": 0.6456, "grad_norm": 14.6875, "grad_norm_var": 0.490087890625, "learning_rate": 0.0003, "loss": 10.7961, "loss/aux_loss": 0.048073521070182326, "loss/crossentropy": 2.632987970113754, "loss/logits": 0.8115098506212235, "step": 64560 }, { "epoch": 0.6457, "grad_norm": 15.5, "grad_norm_var": 1.2235514322916667, "learning_rate": 0.0003, "loss": 10.9672, "loss/aux_loss": 0.048069255985319616, "loss/crossentropy": 2.612694835662842, "loss/logits": 0.8157520830631256, "step": 64570 }, { "epoch": 0.6458, "grad_norm": 14.75, "grad_norm_var": 0.765087890625, "learning_rate": 0.0003, "loss": 10.9776, "loss/aux_loss": 0.04807736426591873, "loss/crossentropy": 2.5973219871520996, "loss/logits": 0.7989464849233627, "step": 64580 }, { "epoch": 0.6459, "grad_norm": 17.125, "grad_norm_var": 0.5703125, "learning_rate": 0.0003, "loss": 10.88, "loss/aux_loss": 0.048061249777674675, "loss/crossentropy": 2.6475314140319823, "loss/logits": 0.7981739670038224, "step": 64590 }, { "epoch": 0.646, "grad_norm": 15.1875, "grad_norm_var": 0.57109375, "learning_rate": 0.0003, "loss": 10.8287, "loss/aux_loss": 0.048082700744271276, "loss/crossentropy": 2.8341873228549956, "loss/logits": 0.8232771545648575, "step": 64600 }, { "epoch": 0.6461, "grad_norm": 16.25, "grad_norm_var": 1.3593587239583333, "learning_rate": 0.0003, "loss": 10.7596, "loss/aux_loss": 0.04807372409850359, "loss/crossentropy": 2.4986962258815764, "loss/logits": 0.7847854226827622, "step": 64610 }, { "epoch": 0.6462, "grad_norm": 17.375, "grad_norm_var": 1.5445149739583333, "learning_rate": 0.0003, "loss": 10.8791, "loss/aux_loss": 0.04804991818964481, "loss/crossentropy": 2.841459035873413, "loss/logits": 0.8310028403997421, "step": 64620 }, { "epoch": 0.6463, "grad_norm": 16.375, "grad_norm_var": 0.7822265625, "learning_rate": 0.0003, "loss": 10.9785, "loss/aux_loss": 0.048074251785874364, "loss/crossentropy": 2.6962937235832216, "loss/logits": 0.7954140931367875, "step": 64630 }, { "epoch": 0.6464, "grad_norm": 14.875, "grad_norm_var": 0.6072265625, "learning_rate": 0.0003, "loss": 11.0638, "loss/aux_loss": 0.04808508362621069, "loss/crossentropy": 2.764870321750641, "loss/logits": 0.8515766054391861, "step": 64640 }, { "epoch": 0.6465, "grad_norm": 15.6875, "grad_norm_var": 0.8242024739583333, "learning_rate": 0.0003, "loss": 11.0148, "loss/aux_loss": 0.04806652627885342, "loss/crossentropy": 2.594625836610794, "loss/logits": 0.7873844116926193, "step": 64650 }, { "epoch": 0.6466, "grad_norm": 15.1875, "grad_norm_var": 0.69609375, "learning_rate": 0.0003, "loss": 10.8542, "loss/aux_loss": 0.048073071800172326, "loss/crossentropy": 2.815520566701889, "loss/logits": 0.8115405261516571, "step": 64660 }, { "epoch": 0.6467, "grad_norm": 14.75, "grad_norm_var": 0.25323893229166666, "learning_rate": 0.0003, "loss": 10.866, "loss/aux_loss": 0.048070530965924264, "loss/crossentropy": 2.541173154115677, "loss/logits": 0.8035361468791962, "step": 64670 }, { "epoch": 0.6468, "grad_norm": 14.6875, "grad_norm_var": 3.2783854166666666, "learning_rate": 0.0003, "loss": 10.7697, "loss/aux_loss": 0.04807205218821764, "loss/crossentropy": 2.77914103269577, "loss/logits": 0.8178933262825012, "step": 64680 }, { "epoch": 0.6469, "grad_norm": 14.9375, "grad_norm_var": 1.8332682291666667, "learning_rate": 0.0003, "loss": 10.9381, "loss/aux_loss": 0.048067630268633366, "loss/crossentropy": 2.6256860315799715, "loss/logits": 0.833366334438324, "step": 64690 }, { "epoch": 0.647, "grad_norm": 14.6875, "grad_norm_var": 2.178759765625, "learning_rate": 0.0003, "loss": 11.0279, "loss/aux_loss": 0.04807144869118929, "loss/crossentropy": 2.684089946746826, "loss/logits": 0.8047218829393387, "step": 64700 }, { "epoch": 0.6471, "grad_norm": 15.3125, "grad_norm_var": 0.6446451822916667, "learning_rate": 0.0003, "loss": 10.895, "loss/aux_loss": 0.04806271083652973, "loss/crossentropy": 2.5051504015922545, "loss/logits": 0.814690887928009, "step": 64710 }, { "epoch": 0.6472, "grad_norm": 15.875, "grad_norm_var": 0.5378743489583333, "learning_rate": 0.0003, "loss": 11.0694, "loss/aux_loss": 0.04807141367346048, "loss/crossentropy": 2.6718297123909, "loss/logits": 0.8078697264194489, "step": 64720 }, { "epoch": 0.6473, "grad_norm": 15.1875, "grad_norm_var": 0.5561848958333333, "learning_rate": 0.0003, "loss": 11.1501, "loss/aux_loss": 0.04807430915534496, "loss/crossentropy": 2.7116922199726106, "loss/logits": 0.8617299765348434, "step": 64730 }, { "epoch": 0.6474, "grad_norm": 16.75, "grad_norm_var": 0.32024739583333334, "learning_rate": 0.0003, "loss": 11.0262, "loss/aux_loss": 0.04805788192898035, "loss/crossentropy": 2.7382602095603943, "loss/logits": 0.8403635859489441, "step": 64740 }, { "epoch": 0.6475, "grad_norm": 14.625, "grad_norm_var": 0.27708333333333335, "learning_rate": 0.0003, "loss": 10.9941, "loss/aux_loss": 0.048070405051112174, "loss/crossentropy": 2.7188424825668336, "loss/logits": 0.7995573878288269, "step": 64750 }, { "epoch": 0.6476, "grad_norm": 15.5625, "grad_norm_var": 0.367431640625, "learning_rate": 0.0003, "loss": 10.9026, "loss/aux_loss": 0.04807642940431833, "loss/crossentropy": 2.7417497992515565, "loss/logits": 0.8509037971496582, "step": 64760 }, { "epoch": 0.6477, "grad_norm": 15.5, "grad_norm_var": 1.25546875, "learning_rate": 0.0003, "loss": 10.9626, "loss/aux_loss": 0.0480690760537982, "loss/crossentropy": 2.647187089920044, "loss/logits": 0.8061125695705413, "step": 64770 }, { "epoch": 0.6478, "grad_norm": 16.625, "grad_norm_var": 0.746728515625, "learning_rate": 0.0003, "loss": 10.9044, "loss/aux_loss": 0.048072075471282005, "loss/crossentropy": 2.6679067850112914, "loss/logits": 0.8102442860603333, "step": 64780 }, { "epoch": 0.6479, "grad_norm": 14.3125, "grad_norm_var": 0.6597493489583334, "learning_rate": 0.0003, "loss": 10.8774, "loss/aux_loss": 0.04807139951735735, "loss/crossentropy": 2.679821991920471, "loss/logits": 0.8046330511569977, "step": 64790 }, { "epoch": 0.648, "grad_norm": 15.375, "grad_norm_var": 0.49777018229166664, "learning_rate": 0.0003, "loss": 10.8229, "loss/aux_loss": 0.04807682540267706, "loss/crossentropy": 2.7562019169330596, "loss/logits": 0.8137243837118149, "step": 64800 }, { "epoch": 0.6481, "grad_norm": 14.375, "grad_norm_var": 0.509619140625, "learning_rate": 0.0003, "loss": 10.8284, "loss/aux_loss": 0.048072621785104276, "loss/crossentropy": 2.723770010471344, "loss/logits": 0.8341768980026245, "step": 64810 }, { "epoch": 0.6482, "grad_norm": 16.0, "grad_norm_var": 0.34230143229166665, "learning_rate": 0.0003, "loss": 10.9179, "loss/aux_loss": 0.04807199724018574, "loss/crossentropy": 2.5898614048957826, "loss/logits": 0.8297031134366989, "step": 64820 }, { "epoch": 0.6483, "grad_norm": 15.1875, "grad_norm_var": 0.40089518229166665, "learning_rate": 0.0003, "loss": 10.7685, "loss/aux_loss": 0.04806529227644205, "loss/crossentropy": 2.736210232973099, "loss/logits": 0.8199382722377777, "step": 64830 }, { "epoch": 0.6484, "grad_norm": 15.5, "grad_norm_var": 0.6505208333333333, "learning_rate": 0.0003, "loss": 10.8731, "loss/aux_loss": 0.04806740824133158, "loss/crossentropy": 2.7414426445960998, "loss/logits": 0.829671436548233, "step": 64840 }, { "epoch": 0.6485, "grad_norm": 15.5, "grad_norm_var": 0.7796875, "learning_rate": 0.0003, "loss": 11.1061, "loss/aux_loss": 0.04807157013565302, "loss/crossentropy": 2.6303915977478027, "loss/logits": 0.8207491040229797, "step": 64850 }, { "epoch": 0.6486, "grad_norm": 16.375, "grad_norm_var": 0.650244140625, "learning_rate": 0.0003, "loss": 10.8262, "loss/aux_loss": 0.04807078931480646, "loss/crossentropy": 2.7495046079158785, "loss/logits": 0.8125106036663056, "step": 64860 }, { "epoch": 0.6487, "grad_norm": 14.5625, "grad_norm_var": 0.374072265625, "learning_rate": 0.0003, "loss": 10.8215, "loss/aux_loss": 0.04806679226458073, "loss/crossentropy": 2.52020383477211, "loss/logits": 0.7938949555158615, "step": 64870 }, { "epoch": 0.6488, "grad_norm": 14.875, "grad_norm_var": 0.44680989583333336, "learning_rate": 0.0003, "loss": 10.981, "loss/aux_loss": 0.04807438552379608, "loss/crossentropy": 2.713279777765274, "loss/logits": 0.8202762633562088, "step": 64880 }, { "epoch": 0.6489, "grad_norm": 15.3125, "grad_norm_var": 0.53046875, "learning_rate": 0.0003, "loss": 10.8555, "loss/aux_loss": 0.048073222115635875, "loss/crossentropy": 2.6626059472560883, "loss/logits": 0.8038224250078201, "step": 64890 }, { "epoch": 0.649, "grad_norm": 16.0, "grad_norm_var": 0.2659993489583333, "learning_rate": 0.0003, "loss": 10.8778, "loss/aux_loss": 0.04806961547583342, "loss/crossentropy": 2.7846663117408754, "loss/logits": 0.8211091995239258, "step": 64900 }, { "epoch": 0.6491, "grad_norm": 14.25, "grad_norm_var": 0.40260416666666665, "learning_rate": 0.0003, "loss": 10.8055, "loss/aux_loss": 0.04807712137699127, "loss/crossentropy": 2.6188452005386353, "loss/logits": 0.7841671526432037, "step": 64910 }, { "epoch": 0.6492, "grad_norm": 15.375, "grad_norm_var": 0.5683430989583333, "learning_rate": 0.0003, "loss": 10.8163, "loss/aux_loss": 0.04806164372712374, "loss/crossentropy": 2.6171076774597166, "loss/logits": 0.8012127339839935, "step": 64920 }, { "epoch": 0.6493, "grad_norm": 15.25, "grad_norm_var": 0.430322265625, "learning_rate": 0.0003, "loss": 10.894, "loss/aux_loss": 0.04807732943445444, "loss/crossentropy": 2.6409548163414, "loss/logits": 0.8105780005455017, "step": 64930 }, { "epoch": 0.6494, "grad_norm": 15.6875, "grad_norm_var": 0.6157389322916667, "learning_rate": 0.0003, "loss": 10.8077, "loss/aux_loss": 0.04806876610964537, "loss/crossentropy": 2.5447192013263704, "loss/logits": 0.7904939085245133, "step": 64940 }, { "epoch": 0.6495, "grad_norm": 17.0, "grad_norm_var": 0.6981770833333333, "learning_rate": 0.0003, "loss": 11.0475, "loss/aux_loss": 0.048071413300931454, "loss/crossentropy": 2.706156146526337, "loss/logits": 0.8606253623962402, "step": 64950 }, { "epoch": 0.6496, "grad_norm": 16.25, "grad_norm_var": 0.6577962239583334, "learning_rate": 0.0003, "loss": 10.9165, "loss/aux_loss": 0.04807477165013552, "loss/crossentropy": 2.676371121406555, "loss/logits": 0.8254680544137954, "step": 64960 }, { "epoch": 0.6497, "grad_norm": 16.0, "grad_norm_var": 0.37161458333333336, "learning_rate": 0.0003, "loss": 10.9431, "loss/aux_loss": 0.048062054254114625, "loss/crossentropy": 2.687245047092438, "loss/logits": 0.8192354500293731, "step": 64970 }, { "epoch": 0.6498, "grad_norm": 14.6875, "grad_norm_var": 0.697900390625, "learning_rate": 0.0003, "loss": 10.8736, "loss/aux_loss": 0.04806843213737011, "loss/crossentropy": 2.6569925785064696, "loss/logits": 0.8111906111240387, "step": 64980 }, { "epoch": 0.6499, "grad_norm": 16.25, "grad_norm_var": 0.6346354166666667, "learning_rate": 0.0003, "loss": 10.8889, "loss/aux_loss": 0.04806424044072628, "loss/crossentropy": 2.7027989625930786, "loss/logits": 0.8000789701938629, "step": 64990 }, { "epoch": 0.65, "grad_norm": 75.5, "grad_norm_var": 223.42849934895833, "learning_rate": 0.0003, "loss": 10.9805, "loss/aux_loss": 0.048073144629597664, "loss/crossentropy": 2.808467972278595, "loss/logits": 0.8571766018867493, "step": 65000 }, { "epoch": 0.6501, "grad_norm": 16.0, "grad_norm_var": 220.27537434895834, "learning_rate": 0.0003, "loss": 10.8087, "loss/aux_loss": 0.048067417740821836, "loss/crossentropy": 2.7439105987548826, "loss/logits": 0.824249017238617, "step": 65010 }, { "epoch": 0.6502, "grad_norm": 14.8125, "grad_norm_var": 0.4066243489583333, "learning_rate": 0.0003, "loss": 10.795, "loss/aux_loss": 0.048070468753576276, "loss/crossentropy": 2.763742119073868, "loss/logits": 0.8238922148942948, "step": 65020 }, { "epoch": 0.6503, "grad_norm": 14.75, "grad_norm_var": 0.5936848958333333, "learning_rate": 0.0003, "loss": 10.9753, "loss/aux_loss": 0.04807634837925434, "loss/crossentropy": 2.721911084651947, "loss/logits": 0.8021749824285507, "step": 65030 }, { "epoch": 0.6504, "grad_norm": 14.9375, "grad_norm_var": 7.0462890625, "learning_rate": 0.0003, "loss": 10.7693, "loss/aux_loss": 0.048060786351561545, "loss/crossentropy": 2.63610897064209, "loss/logits": 0.8058023959398269, "step": 65040 }, { "epoch": 0.6505, "grad_norm": 15.8125, "grad_norm_var": 0.7958333333333333, "learning_rate": 0.0003, "loss": 10.9499, "loss/aux_loss": 0.0480716660618782, "loss/crossentropy": 2.7749450325965883, "loss/logits": 0.8092481285333634, "step": 65050 }, { "epoch": 0.6506, "grad_norm": 15.1875, "grad_norm_var": 0.32081705729166665, "learning_rate": 0.0003, "loss": 10.9791, "loss/aux_loss": 0.048079953715205195, "loss/crossentropy": 2.901500105857849, "loss/logits": 0.8558017522096634, "step": 65060 }, { "epoch": 0.6507, "grad_norm": 14.625, "grad_norm_var": 0.49420572916666666, "learning_rate": 0.0003, "loss": 10.886, "loss/aux_loss": 0.04804853610694408, "loss/crossentropy": 2.8315212607383726, "loss/logits": 0.8276244908571243, "step": 65070 }, { "epoch": 0.6508, "grad_norm": 14.9375, "grad_norm_var": 1.0609375, "learning_rate": 0.0003, "loss": 10.9138, "loss/aux_loss": 0.048084153421223165, "loss/crossentropy": 2.681584632396698, "loss/logits": 0.7945889711380005, "step": 65080 }, { "epoch": 0.6509, "grad_norm": 15.625, "grad_norm_var": 0.94609375, "learning_rate": 0.0003, "loss": 10.9791, "loss/aux_loss": 0.04806964471936226, "loss/crossentropy": 2.7207436323165894, "loss/logits": 0.8214473009109498, "step": 65090 }, { "epoch": 0.651, "grad_norm": 16.125, "grad_norm_var": 1.1119791666666667, "learning_rate": 0.0003, "loss": 10.9402, "loss/aux_loss": 0.048057034611701965, "loss/crossentropy": 2.6684858202934265, "loss/logits": 0.7792475908994675, "step": 65100 }, { "epoch": 0.6511, "grad_norm": 15.25, "grad_norm_var": 1.1333333333333333, "learning_rate": 0.0003, "loss": 10.9291, "loss/aux_loss": 0.04806646164506674, "loss/crossentropy": 2.779614543914795, "loss/logits": 0.8046439945697784, "step": 65110 }, { "epoch": 0.6512, "grad_norm": 16.0, "grad_norm_var": 1.5386555989583333, "learning_rate": 0.0003, "loss": 11.059, "loss/aux_loss": 0.048073740862309935, "loss/crossentropy": 2.8252889752388, "loss/logits": 0.7956268131732941, "step": 65120 }, { "epoch": 0.6513, "grad_norm": 14.75, "grad_norm_var": 0.410791015625, "learning_rate": 0.0003, "loss": 11.017, "loss/aux_loss": 0.048064406216144565, "loss/crossentropy": 2.746646058559418, "loss/logits": 0.8085185199975967, "step": 65130 }, { "epoch": 0.6514, "grad_norm": 14.0625, "grad_norm_var": 0.24296875, "learning_rate": 0.0003, "loss": 10.9533, "loss/aux_loss": 0.04806833751499653, "loss/crossentropy": 2.7476025104522703, "loss/logits": 0.7968869656324387, "step": 65140 }, { "epoch": 0.6515, "grad_norm": 15.0625, "grad_norm_var": 0.35857747395833334, "learning_rate": 0.0003, "loss": 10.7864, "loss/aux_loss": 0.04806720819324255, "loss/crossentropy": 2.7668261766433715, "loss/logits": 0.7852804720401764, "step": 65150 }, { "epoch": 0.6516, "grad_norm": 14.6875, "grad_norm_var": 0.259228515625, "learning_rate": 0.0003, "loss": 10.6857, "loss/aux_loss": 0.048074799962341784, "loss/crossentropy": 2.6271615505218504, "loss/logits": 0.7937258869409561, "step": 65160 }, { "epoch": 0.6517, "grad_norm": 14.625, "grad_norm_var": 0.27858072916666665, "learning_rate": 0.0003, "loss": 11.0121, "loss/aux_loss": 0.048050605691969395, "loss/crossentropy": 2.762240695953369, "loss/logits": 0.8204376786947251, "step": 65170 }, { "epoch": 0.6518, "grad_norm": 14.1875, "grad_norm_var": 0.278125, "learning_rate": 0.0003, "loss": 10.9931, "loss/aux_loss": 0.048072893917560575, "loss/crossentropy": 2.6644616603851317, "loss/logits": 0.826135328412056, "step": 65180 }, { "epoch": 0.6519, "grad_norm": 14.75, "grad_norm_var": 0.7619140625, "learning_rate": 0.0003, "loss": 10.9311, "loss/aux_loss": 0.04807901922613382, "loss/crossentropy": 2.5317931294441225, "loss/logits": 0.7690812319517135, "step": 65190 }, { "epoch": 0.652, "grad_norm": 15.125, "grad_norm_var": 0.4796875, "learning_rate": 0.0003, "loss": 10.795, "loss/aux_loss": 0.04806858953088522, "loss/crossentropy": 2.638898861408234, "loss/logits": 0.7892222136259079, "step": 65200 }, { "epoch": 0.6521, "grad_norm": 16.0, "grad_norm_var": 0.9770182291666667, "learning_rate": 0.0003, "loss": 10.888, "loss/aux_loss": 0.048061699606478214, "loss/crossentropy": 2.654205119609833, "loss/logits": 0.8139635503292084, "step": 65210 }, { "epoch": 0.6522, "grad_norm": 15.3125, "grad_norm_var": 4.557535807291667, "learning_rate": 0.0003, "loss": 10.9919, "loss/aux_loss": 0.04807271305471659, "loss/crossentropy": 2.6672019481658937, "loss/logits": 0.7892401427030563, "step": 65220 }, { "epoch": 0.6523, "grad_norm": 15.625, "grad_norm_var": 2.3764973958333333, "learning_rate": 0.0003, "loss": 10.718, "loss/aux_loss": 0.04806781802326441, "loss/crossentropy": 2.7000105381011963, "loss/logits": 0.7944419324398041, "step": 65230 }, { "epoch": 0.6524, "grad_norm": 14.5, "grad_norm_var": 0.5551920572916667, "learning_rate": 0.0003, "loss": 10.8404, "loss/aux_loss": 0.04805983956903219, "loss/crossentropy": 2.5622940182685854, "loss/logits": 0.7859783351421357, "step": 65240 }, { "epoch": 0.6525, "grad_norm": 14.8125, "grad_norm_var": 0.9098307291666666, "learning_rate": 0.0003, "loss": 11.0443, "loss/aux_loss": 0.048075672797858716, "loss/crossentropy": 2.7040555834770204, "loss/logits": 0.8166841179132461, "step": 65250 }, { "epoch": 0.6526, "grad_norm": 15.8125, "grad_norm_var": 0.656103515625, "learning_rate": 0.0003, "loss": 10.8139, "loss/aux_loss": 0.048063617758452894, "loss/crossentropy": 2.736673855781555, "loss/logits": 0.7982501238584518, "step": 65260 }, { "epoch": 0.6527, "grad_norm": 16.125, "grad_norm_var": 0.4697265625, "learning_rate": 0.0003, "loss": 10.9216, "loss/aux_loss": 0.04806849732995033, "loss/crossentropy": 2.7332601666450502, "loss/logits": 0.8461134731769562, "step": 65270 }, { "epoch": 0.6528, "grad_norm": 14.6875, "grad_norm_var": 0.9911458333333333, "learning_rate": 0.0003, "loss": 10.9565, "loss/aux_loss": 0.04807373881340027, "loss/crossentropy": 2.682792294025421, "loss/logits": 0.8356576085090637, "step": 65280 }, { "epoch": 0.6529, "grad_norm": 15.125, "grad_norm_var": 1.2997395833333334, "learning_rate": 0.0003, "loss": 10.8412, "loss/aux_loss": 0.048058380000293256, "loss/crossentropy": 2.598079466819763, "loss/logits": 0.8082356095314026, "step": 65290 }, { "epoch": 0.653, "grad_norm": 14.875, "grad_norm_var": 1.1613118489583334, "learning_rate": 0.0003, "loss": 10.8663, "loss/aux_loss": 0.04806201159954071, "loss/crossentropy": 2.6836532950401306, "loss/logits": 0.7990910440683365, "step": 65300 }, { "epoch": 0.6531, "grad_norm": 14.8125, "grad_norm_var": 0.5763020833333333, "learning_rate": 0.0003, "loss": 10.9157, "loss/aux_loss": 0.04807909373193979, "loss/crossentropy": 2.7008519947528837, "loss/logits": 0.8079722136259079, "step": 65310 }, { "epoch": 0.6532, "grad_norm": 13.75, "grad_norm_var": 0.53671875, "learning_rate": 0.0003, "loss": 10.8207, "loss/aux_loss": 0.04807158559560776, "loss/crossentropy": 2.5901060104370117, "loss/logits": 0.8299726933240891, "step": 65320 }, { "epoch": 0.6533, "grad_norm": 16.0, "grad_norm_var": 0.8528645833333334, "learning_rate": 0.0003, "loss": 10.891, "loss/aux_loss": 0.0480627816170454, "loss/crossentropy": 2.6457729578018188, "loss/logits": 0.8211910486221313, "step": 65330 }, { "epoch": 0.6534, "grad_norm": 14.875, "grad_norm_var": 0.904150390625, "learning_rate": 0.0003, "loss": 10.8124, "loss/aux_loss": 0.04807421285659075, "loss/crossentropy": 2.6744938433170318, "loss/logits": 0.7985322535037994, "step": 65340 }, { "epoch": 0.6535, "grad_norm": 15.4375, "grad_norm_var": 1.1171223958333334, "learning_rate": 0.0003, "loss": 10.6537, "loss/aux_loss": 0.04807833898812532, "loss/crossentropy": 2.623306268453598, "loss/logits": 0.7695679128170013, "step": 65350 }, { "epoch": 0.6536, "grad_norm": 14.75, "grad_norm_var": 0.36248372395833334, "learning_rate": 0.0003, "loss": 10.7607, "loss/aux_loss": 0.048066575266420844, "loss/crossentropy": 2.7030728101730346, "loss/logits": 0.8045616328716279, "step": 65360 }, { "epoch": 0.6537, "grad_norm": 14.4375, "grad_norm_var": 0.7317708333333334, "learning_rate": 0.0003, "loss": 11.0547, "loss/aux_loss": 0.048070757277309896, "loss/crossentropy": 2.8383954405784606, "loss/logits": 0.8456792950630188, "step": 65370 }, { "epoch": 0.6538, "grad_norm": 15.1875, "grad_norm_var": 0.709228515625, "learning_rate": 0.0003, "loss": 10.8044, "loss/aux_loss": 0.04807031229138374, "loss/crossentropy": 2.7254865407943725, "loss/logits": 0.8441527247428894, "step": 65380 }, { "epoch": 0.6539, "grad_norm": 16.75, "grad_norm_var": 0.485400390625, "learning_rate": 0.0003, "loss": 10.8881, "loss/aux_loss": 0.04807039219886065, "loss/crossentropy": 2.7737102448940276, "loss/logits": 0.8173193544149399, "step": 65390 }, { "epoch": 0.654, "grad_norm": 14.625, "grad_norm_var": 0.7863932291666667, "learning_rate": 0.0003, "loss": 10.9235, "loss/aux_loss": 0.04807139802724123, "loss/crossentropy": 2.7199482560157775, "loss/logits": 0.8173371762037277, "step": 65400 }, { "epoch": 0.6541, "grad_norm": 14.75, "grad_norm_var": 1.0648274739583334, "learning_rate": 0.0003, "loss": 10.9632, "loss/aux_loss": 0.048072163760662076, "loss/crossentropy": 2.7658130168914794, "loss/logits": 0.8008842885494232, "step": 65410 }, { "epoch": 0.6542, "grad_norm": 15.875, "grad_norm_var": 0.55390625, "learning_rate": 0.0003, "loss": 10.9842, "loss/aux_loss": 0.04805851969867945, "loss/crossentropy": 2.8067606568336485, "loss/logits": 0.7942769289016723, "step": 65420 }, { "epoch": 0.6543, "grad_norm": 16.0, "grad_norm_var": 0.332275390625, "learning_rate": 0.0003, "loss": 10.9043, "loss/aux_loss": 0.048070518858730794, "loss/crossentropy": 2.683800792694092, "loss/logits": 0.8469008475542068, "step": 65430 }, { "epoch": 0.6544, "grad_norm": 14.6875, "grad_norm_var": 0.6820149739583333, "learning_rate": 0.0003, "loss": 10.9415, "loss/aux_loss": 0.04807311110198498, "loss/crossentropy": 2.7242295682430266, "loss/logits": 0.8172059804201126, "step": 65440 }, { "epoch": 0.6545, "grad_norm": 14.4375, "grad_norm_var": 0.5926432291666667, "learning_rate": 0.0003, "loss": 10.7765, "loss/aux_loss": 0.04806800615042448, "loss/crossentropy": 2.7106892645359038, "loss/logits": 0.8149084568023681, "step": 65450 }, { "epoch": 0.6546, "grad_norm": 15.3125, "grad_norm_var": 0.5994140625, "learning_rate": 0.0003, "loss": 11.0235, "loss/aux_loss": 0.048058449663221835, "loss/crossentropy": 2.6193589746952055, "loss/logits": 0.8009304910898208, "step": 65460 }, { "epoch": 0.6547, "grad_norm": 16.75, "grad_norm_var": 0.7122233072916667, "learning_rate": 0.0003, "loss": 10.9139, "loss/aux_loss": 0.04806790947914123, "loss/crossentropy": 2.677752900123596, "loss/logits": 0.8047443449497222, "step": 65470 }, { "epoch": 0.6548, "grad_norm": 14.1875, "grad_norm_var": 0.9540201822916666, "learning_rate": 0.0003, "loss": 10.966, "loss/aux_loss": 0.04808517023921013, "loss/crossentropy": 2.6315142631530763, "loss/logits": 0.8178540676832199, "step": 65480 }, { "epoch": 0.6549, "grad_norm": 15.875, "grad_norm_var": 0.6160807291666667, "learning_rate": 0.0003, "loss": 10.8793, "loss/aux_loss": 0.04805512484163046, "loss/crossentropy": 2.75492285490036, "loss/logits": 0.7663827478885651, "step": 65490 }, { "epoch": 0.655, "grad_norm": 14.8125, "grad_norm_var": 0.5440104166666667, "learning_rate": 0.0003, "loss": 10.945, "loss/aux_loss": 0.048072614893317225, "loss/crossentropy": 2.5011337757110597, "loss/logits": 0.8080022811889649, "step": 65500 }, { "epoch": 0.6551, "grad_norm": 15.5, "grad_norm_var": 0.43203125, "learning_rate": 0.0003, "loss": 11.0347, "loss/aux_loss": 0.048070020973682404, "loss/crossentropy": 2.806613862514496, "loss/logits": 0.8248602509498596, "step": 65510 }, { "epoch": 0.6552, "grad_norm": 15.5, "grad_norm_var": 0.265869140625, "learning_rate": 0.0003, "loss": 10.8658, "loss/aux_loss": 0.048069593869149684, "loss/crossentropy": 2.7143899381160734, "loss/logits": 0.812701740860939, "step": 65520 }, { "epoch": 0.6553, "grad_norm": 16.125, "grad_norm_var": 0.42355143229166664, "learning_rate": 0.0003, "loss": 10.8261, "loss/aux_loss": 0.0480674784630537, "loss/crossentropy": 2.8173023641109465, "loss/logits": 0.8293744832277298, "step": 65530 }, { "epoch": 0.6554, "grad_norm": 14.8125, "grad_norm_var": 1.043212890625, "learning_rate": 0.0003, "loss": 10.8558, "loss/aux_loss": 0.04806223623454571, "loss/crossentropy": 2.5952585637569427, "loss/logits": 0.7964704751968383, "step": 65540 }, { "epoch": 0.6555, "grad_norm": 15.875, "grad_norm_var": 0.7447916666666666, "learning_rate": 0.0003, "loss": 10.8759, "loss/aux_loss": 0.048076456785202025, "loss/crossentropy": 2.5286275029182432, "loss/logits": 0.8162722438573837, "step": 65550 }, { "epoch": 0.6556, "grad_norm": 15.0625, "grad_norm_var": 0.545556640625, "learning_rate": 0.0003, "loss": 10.8139, "loss/aux_loss": 0.04805875848978758, "loss/crossentropy": 2.9092560052871703, "loss/logits": 0.8326963096857071, "step": 65560 }, { "epoch": 0.6557, "grad_norm": 16.875, "grad_norm_var": 0.45284830729166664, "learning_rate": 0.0003, "loss": 10.6835, "loss/aux_loss": 0.04808024391531944, "loss/crossentropy": 2.513635885715485, "loss/logits": 0.7660587877035141, "step": 65570 }, { "epoch": 0.6558, "grad_norm": 15.375, "grad_norm_var": 0.9692545572916667, "learning_rate": 0.0003, "loss": 10.8846, "loss/aux_loss": 0.04806702360510826, "loss/crossentropy": 2.7251508593559266, "loss/logits": 0.815480324625969, "step": 65580 }, { "epoch": 0.6559, "grad_norm": 14.1875, "grad_norm_var": 0.6831868489583334, "learning_rate": 0.0003, "loss": 10.8619, "loss/aux_loss": 0.04806389529258013, "loss/crossentropy": 2.5533951461315154, "loss/logits": 0.7619587257504463, "step": 65590 }, { "epoch": 0.656, "grad_norm": 15.5, "grad_norm_var": 0.7244791666666667, "learning_rate": 0.0003, "loss": 10.8078, "loss/aux_loss": 0.048077251948416236, "loss/crossentropy": 2.7818902015686033, "loss/logits": 0.8328968584537506, "step": 65600 }, { "epoch": 0.6561, "grad_norm": 14.4375, "grad_norm_var": 0.9035807291666667, "learning_rate": 0.0003, "loss": 10.8577, "loss/aux_loss": 0.048065769299864766, "loss/crossentropy": 2.6951212108135225, "loss/logits": 0.7890205055475235, "step": 65610 }, { "epoch": 0.6562, "grad_norm": 15.4375, "grad_norm_var": 0.708837890625, "learning_rate": 0.0003, "loss": 11.0567, "loss/aux_loss": 0.048072610050439835, "loss/crossentropy": 2.7550642490386963, "loss/logits": 0.8612349301576614, "step": 65620 }, { "epoch": 0.6563, "grad_norm": 14.9375, "grad_norm_var": 0.28880208333333335, "learning_rate": 0.0003, "loss": 11.0369, "loss/aux_loss": 0.04806470815092325, "loss/crossentropy": 2.7217436909675596, "loss/logits": 0.8420159697532654, "step": 65630 }, { "epoch": 0.6564, "grad_norm": 15.0625, "grad_norm_var": 0.35618489583333335, "learning_rate": 0.0003, "loss": 10.9453, "loss/aux_loss": 0.048069029301404956, "loss/crossentropy": 2.7288358211517334, "loss/logits": 0.8264297485351563, "step": 65640 }, { "epoch": 0.6565, "grad_norm": 15.125, "grad_norm_var": 1.4625, "learning_rate": 0.0003, "loss": 10.9144, "loss/aux_loss": 0.048067984730005266, "loss/crossentropy": 2.7028370201587677, "loss/logits": 0.7846741080284119, "step": 65650 }, { "epoch": 0.6566, "grad_norm": 14.9375, "grad_norm_var": 46.948681640625, "learning_rate": 0.0003, "loss": 10.8727, "loss/aux_loss": 0.04807305838912725, "loss/crossentropy": 2.77775114774704, "loss/logits": 0.8294332057237626, "step": 65660 }, { "epoch": 0.6567, "grad_norm": 15.25, "grad_norm_var": 172.750634765625, "learning_rate": 0.0003, "loss": 10.8284, "loss/aux_loss": 0.04807068482041359, "loss/crossentropy": 2.617089319229126, "loss/logits": 0.8040230572223663, "step": 65670 }, { "epoch": 0.6568, "grad_norm": 17.125, "grad_norm_var": 12.295247395833334, "learning_rate": 0.0003, "loss": 10.8871, "loss/aux_loss": 0.048083870112895964, "loss/crossentropy": 2.7740320563316345, "loss/logits": 0.8273166418075562, "step": 65680 }, { "epoch": 0.6569, "grad_norm": 14.25, "grad_norm_var": 0.57421875, "learning_rate": 0.0003, "loss": 10.8403, "loss/aux_loss": 0.048068479262292386, "loss/crossentropy": 2.646625280380249, "loss/logits": 0.7698414534330368, "step": 65690 }, { "epoch": 0.657, "grad_norm": 16.0, "grad_norm_var": 0.8505208333333333, "learning_rate": 0.0003, "loss": 10.7472, "loss/aux_loss": 0.04805990979075432, "loss/crossentropy": 2.5781956732273104, "loss/logits": 0.7773859173059463, "step": 65700 }, { "epoch": 0.6571, "grad_norm": 15.375, "grad_norm_var": 0.9051432291666667, "learning_rate": 0.0003, "loss": 10.9749, "loss/aux_loss": 0.048071971721947195, "loss/crossentropy": 2.6975907385349274, "loss/logits": 0.8216118335723877, "step": 65710 }, { "epoch": 0.6572, "grad_norm": 14.75, "grad_norm_var": 0.89609375, "learning_rate": 0.0003, "loss": 10.9318, "loss/aux_loss": 0.04807693250477314, "loss/crossentropy": 2.7426917433738707, "loss/logits": 0.8241723477840424, "step": 65720 }, { "epoch": 0.6573, "grad_norm": 16.5, "grad_norm_var": 0.9436848958333334, "learning_rate": 0.0003, "loss": 11.0239, "loss/aux_loss": 0.04806841984391212, "loss/crossentropy": 2.820639455318451, "loss/logits": 0.8430281788110733, "step": 65730 }, { "epoch": 0.6574, "grad_norm": 16.375, "grad_norm_var": 0.9322265625, "learning_rate": 0.0003, "loss": 10.9057, "loss/aux_loss": 0.04807513765990734, "loss/crossentropy": 2.6548421382904053, "loss/logits": 0.8545916020870209, "step": 65740 }, { "epoch": 0.6575, "grad_norm": 14.875, "grad_norm_var": 1.0886555989583333, "learning_rate": 0.0003, "loss": 11.003, "loss/aux_loss": 0.048061134107410905, "loss/crossentropy": 2.6990270137786867, "loss/logits": 0.823108297586441, "step": 65750 }, { "epoch": 0.6576, "grad_norm": 15.375, "grad_norm_var": 0.6048014322916667, "learning_rate": 0.0003, "loss": 10.8804, "loss/aux_loss": 0.04807158131152391, "loss/crossentropy": 2.6189096331596375, "loss/logits": 0.7941760838031768, "step": 65760 }, { "epoch": 0.6577, "grad_norm": 15.125, "grad_norm_var": 0.2613118489583333, "learning_rate": 0.0003, "loss": 10.7767, "loss/aux_loss": 0.04806184582412243, "loss/crossentropy": 2.6444436371326447, "loss/logits": 0.8036207824945449, "step": 65770 }, { "epoch": 0.6578, "grad_norm": 15.25, "grad_norm_var": 0.3494140625, "learning_rate": 0.0003, "loss": 10.9201, "loss/aux_loss": 0.04807628132402897, "loss/crossentropy": 2.7261348724365235, "loss/logits": 0.8091208696365356, "step": 65780 }, { "epoch": 0.6579, "grad_norm": 14.25, "grad_norm_var": 0.7093587239583333, "learning_rate": 0.0003, "loss": 10.7387, "loss/aux_loss": 0.04807419925928116, "loss/crossentropy": 2.5443699240684508, "loss/logits": 0.8154137402772903, "step": 65790 }, { "epoch": 0.658, "grad_norm": 15.5625, "grad_norm_var": 0.664697265625, "learning_rate": 0.0003, "loss": 10.9833, "loss/aux_loss": 0.0480806415900588, "loss/crossentropy": 2.5583129703998564, "loss/logits": 0.830548295378685, "step": 65800 }, { "epoch": 0.6581, "grad_norm": 14.375, "grad_norm_var": 0.8320149739583333, "learning_rate": 0.0003, "loss": 10.9038, "loss/aux_loss": 0.04805789217352867, "loss/crossentropy": 2.6957422375679014, "loss/logits": 0.8219772160053254, "step": 65810 }, { "epoch": 0.6582, "grad_norm": 15.0625, "grad_norm_var": 0.801025390625, "learning_rate": 0.0003, "loss": 10.9157, "loss/aux_loss": 0.04807321224361658, "loss/crossentropy": 2.6964453876018526, "loss/logits": 0.8138649493455887, "step": 65820 }, { "epoch": 0.6583, "grad_norm": 15.75, "grad_norm_var": 0.337353515625, "learning_rate": 0.0003, "loss": 10.9783, "loss/aux_loss": 0.048068931140005586, "loss/crossentropy": 2.7101471066474914, "loss/logits": 0.8215384483337402, "step": 65830 }, { "epoch": 0.6584, "grad_norm": 16.0, "grad_norm_var": 0.6258951822916666, "learning_rate": 0.0003, "loss": 10.9277, "loss/aux_loss": 0.048083207570016384, "loss/crossentropy": 2.4620073318481444, "loss/logits": 0.7914715379476547, "step": 65840 }, { "epoch": 0.6585, "grad_norm": 15.875, "grad_norm_var": 0.5516764322916666, "learning_rate": 0.0003, "loss": 10.9253, "loss/aux_loss": 0.048061787895858285, "loss/crossentropy": 2.727776914834976, "loss/logits": 0.7932167321443557, "step": 65850 }, { "epoch": 0.6586, "grad_norm": 14.875, "grad_norm_var": 0.5926920572916666, "learning_rate": 0.0003, "loss": 10.9381, "loss/aux_loss": 0.04806985054165125, "loss/crossentropy": 2.8461714386940002, "loss/logits": 0.7947121143341065, "step": 65860 }, { "epoch": 0.6587, "grad_norm": 14.125, "grad_norm_var": 0.3098958333333333, "learning_rate": 0.0003, "loss": 10.8217, "loss/aux_loss": 0.04806865192949772, "loss/crossentropy": 2.6267981052398683, "loss/logits": 0.7960964858531951, "step": 65870 }, { "epoch": 0.6588, "grad_norm": 15.375, "grad_norm_var": 0.40670572916666664, "learning_rate": 0.0003, "loss": 10.8895, "loss/aux_loss": 0.04806759785860777, "loss/crossentropy": 2.741170364618301, "loss/logits": 0.8563113749027252, "step": 65880 }, { "epoch": 0.6589, "grad_norm": 15.0625, "grad_norm_var": 0.19542643229166667, "learning_rate": 0.0003, "loss": 10.8893, "loss/aux_loss": 0.04807578232139349, "loss/crossentropy": 2.7358233749866487, "loss/logits": 0.838133355975151, "step": 65890 }, { "epoch": 0.659, "grad_norm": 15.5625, "grad_norm_var": 0.13956705729166666, "learning_rate": 0.0003, "loss": 10.9873, "loss/aux_loss": 0.048049908503890036, "loss/crossentropy": 2.681365489959717, "loss/logits": 0.8367834746837616, "step": 65900 }, { "epoch": 0.6591, "grad_norm": 14.9375, "grad_norm_var": 0.347119140625, "learning_rate": 0.0003, "loss": 10.9322, "loss/aux_loss": 0.04807221945375204, "loss/crossentropy": 2.7976817011833193, "loss/logits": 0.8617210656404495, "step": 65910 }, { "epoch": 0.6592, "grad_norm": 14.75, "grad_norm_var": 1.5262858072916667, "learning_rate": 0.0003, "loss": 10.9087, "loss/aux_loss": 0.04806236419826746, "loss/crossentropy": 2.6097477436065675, "loss/logits": 0.8578928947448731, "step": 65920 }, { "epoch": 0.6593, "grad_norm": 17.125, "grad_norm_var": 1.06640625, "learning_rate": 0.0003, "loss": 11.0508, "loss/aux_loss": 0.048072699643671515, "loss/crossentropy": 2.638018161058426, "loss/logits": 0.8202035665512085, "step": 65930 }, { "epoch": 0.6594, "grad_norm": 16.5, "grad_norm_var": 1.5949055989583334, "learning_rate": 0.0003, "loss": 11.0022, "loss/aux_loss": 0.0480684619396925, "loss/crossentropy": 2.7689769506454467, "loss/logits": 0.8445602893829346, "step": 65940 }, { "epoch": 0.6595, "grad_norm": 15.5, "grad_norm_var": 2.2742024739583333, "learning_rate": 0.0003, "loss": 10.9084, "loss/aux_loss": 0.048072042688727376, "loss/crossentropy": 2.7954149782657622, "loss/logits": 0.7957364201545716, "step": 65950 }, { "epoch": 0.6596, "grad_norm": 15.0625, "grad_norm_var": 2.113916015625, "learning_rate": 0.0003, "loss": 10.9942, "loss/aux_loss": 0.04806188233196736, "loss/crossentropy": 2.71195827126503, "loss/logits": 0.7810200721025466, "step": 65960 }, { "epoch": 0.6597, "grad_norm": 15.0625, "grad_norm_var": 0.23932291666666666, "learning_rate": 0.0003, "loss": 10.8403, "loss/aux_loss": 0.04807203095406294, "loss/crossentropy": 2.6094238460063934, "loss/logits": 0.7922606945037842, "step": 65970 }, { "epoch": 0.6598, "grad_norm": 15.4375, "grad_norm_var": 0.396728515625, "learning_rate": 0.0003, "loss": 10.7105, "loss/aux_loss": 0.048070499673485756, "loss/crossentropy": 2.6893329977989198, "loss/logits": 0.7958266377449036, "step": 65980 }, { "epoch": 0.6599, "grad_norm": 14.6875, "grad_norm_var": 0.8907389322916667, "learning_rate": 0.0003, "loss": 11.0927, "loss/aux_loss": 0.048058568872511385, "loss/crossentropy": 2.7194202184677123, "loss/logits": 0.8332067221403122, "step": 65990 }, { "epoch": 0.66, "grad_norm": 15.6875, "grad_norm_var": 0.6409993489583333, "learning_rate": 0.0003, "loss": 10.7974, "loss/aux_loss": 0.04807265195995569, "loss/crossentropy": 2.690684497356415, "loss/logits": 0.8148763328790665, "step": 66000 }, { "epoch": 0.6601, "grad_norm": 15.8125, "grad_norm_var": 0.5574055989583333, "learning_rate": 0.0003, "loss": 10.888, "loss/aux_loss": 0.04805999808013439, "loss/crossentropy": 2.8406422197818757, "loss/logits": 0.8340202659368515, "step": 66010 }, { "epoch": 0.6602, "grad_norm": 16.625, "grad_norm_var": 0.6265462239583334, "learning_rate": 0.0003, "loss": 10.9604, "loss/aux_loss": 0.04808083530515432, "loss/crossentropy": 2.6737845301628114, "loss/logits": 0.7914625614881515, "step": 66020 }, { "epoch": 0.6603, "grad_norm": 14.75, "grad_norm_var": 0.5699055989583334, "learning_rate": 0.0003, "loss": 10.954, "loss/aux_loss": 0.048062770254909994, "loss/crossentropy": 2.8464788436889648, "loss/logits": 0.8319862931966782, "step": 66030 }, { "epoch": 0.6604, "grad_norm": 16.5, "grad_norm_var": 0.5905598958333333, "learning_rate": 0.0003, "loss": 11.004, "loss/aux_loss": 0.04807984437793493, "loss/crossentropy": 2.741937702894211, "loss/logits": 0.8358243376016616, "step": 66040 }, { "epoch": 0.6605, "grad_norm": 15.8125, "grad_norm_var": 1.010009765625, "learning_rate": 0.0003, "loss": 10.8128, "loss/aux_loss": 0.04805782604962587, "loss/crossentropy": 2.4419663667678835, "loss/logits": 0.8032531976699829, "step": 66050 }, { "epoch": 0.6606, "grad_norm": 14.75, "grad_norm_var": 0.459619140625, "learning_rate": 0.0003, "loss": 10.8743, "loss/aux_loss": 0.048064283281564715, "loss/crossentropy": 2.7457400977611544, "loss/logits": 0.8262648940086365, "step": 66060 }, { "epoch": 0.6607, "grad_norm": 14.875, "grad_norm_var": 0.4903645833333333, "learning_rate": 0.0003, "loss": 10.8164, "loss/aux_loss": 0.048074642196297646, "loss/crossentropy": 2.6913636445999147, "loss/logits": 0.8233010709285736, "step": 66070 }, { "epoch": 0.6608, "grad_norm": 15.5, "grad_norm_var": 0.5546223958333333, "learning_rate": 0.0003, "loss": 10.9488, "loss/aux_loss": 0.04807223491370678, "loss/crossentropy": 2.7321924567222595, "loss/logits": 0.8634290426969529, "step": 66080 }, { "epoch": 0.6609, "grad_norm": 14.5, "grad_norm_var": 0.6843098958333333, "learning_rate": 0.0003, "loss": 10.6757, "loss/aux_loss": 0.048049288988113406, "loss/crossentropy": 2.68677796125412, "loss/logits": 0.8587844461202622, "step": 66090 }, { "epoch": 0.661, "grad_norm": 15.125, "grad_norm_var": 0.6515625, "learning_rate": 0.0003, "loss": 10.8042, "loss/aux_loss": 0.048063311353325845, "loss/crossentropy": 2.6101099252700806, "loss/logits": 0.8087275177240372, "step": 66100 }, { "epoch": 0.6611, "grad_norm": 13.875, "grad_norm_var": 0.5549479166666667, "learning_rate": 0.0003, "loss": 10.8571, "loss/aux_loss": 0.04808393083512783, "loss/crossentropy": 2.6760826587677, "loss/logits": 0.780521473288536, "step": 66110 }, { "epoch": 0.6612, "grad_norm": 15.4375, "grad_norm_var": 0.451025390625, "learning_rate": 0.0003, "loss": 10.9433, "loss/aux_loss": 0.04805605374276638, "loss/crossentropy": 2.692962384223938, "loss/logits": 0.8174421191215515, "step": 66120 }, { "epoch": 0.6613, "grad_norm": 14.9375, "grad_norm_var": 0.76171875, "learning_rate": 0.0003, "loss": 10.9211, "loss/aux_loss": 0.04807621203362942, "loss/crossentropy": 2.6863688111305235, "loss/logits": 0.798431122303009, "step": 66130 }, { "epoch": 0.6614, "grad_norm": 13.9375, "grad_norm_var": 0.47630208333333335, "learning_rate": 0.0003, "loss": 10.9411, "loss/aux_loss": 0.04805770944803953, "loss/crossentropy": 2.7181039690971374, "loss/logits": 0.8259652733802796, "step": 66140 }, { "epoch": 0.6615, "grad_norm": 16.5, "grad_norm_var": 3.274853515625, "learning_rate": 0.0003, "loss": 10.8523, "loss/aux_loss": 0.04807611126452684, "loss/crossentropy": 2.8777012705802916, "loss/logits": 0.8209088236093521, "step": 66150 }, { "epoch": 0.6616, "grad_norm": 14.9375, "grad_norm_var": 3.316145833333333, "learning_rate": 0.0003, "loss": 11.0427, "loss/aux_loss": 0.048064004816114905, "loss/crossentropy": 2.9656025767326355, "loss/logits": 0.8535489648580551, "step": 66160 }, { "epoch": 0.6617, "grad_norm": 17.125, "grad_norm_var": 0.8879557291666667, "learning_rate": 0.0003, "loss": 10.7944, "loss/aux_loss": 0.04805393647402525, "loss/crossentropy": 2.6846187472343446, "loss/logits": 0.7934094220399857, "step": 66170 }, { "epoch": 0.6618, "grad_norm": 15.1875, "grad_norm_var": 0.6486979166666667, "learning_rate": 0.0003, "loss": 10.9017, "loss/aux_loss": 0.04807519093155861, "loss/crossentropy": 2.854272258281708, "loss/logits": 0.8095389395952225, "step": 66180 }, { "epoch": 0.6619, "grad_norm": 14.9375, "grad_norm_var": 0.613525390625, "learning_rate": 0.0003, "loss": 10.946, "loss/aux_loss": 0.04806575421243906, "loss/crossentropy": 2.7491804242134092, "loss/logits": 0.8097761183977127, "step": 66190 }, { "epoch": 0.662, "grad_norm": 16.625, "grad_norm_var": 0.4331868489583333, "learning_rate": 0.0003, "loss": 10.9215, "loss/aux_loss": 0.04807676300406456, "loss/crossentropy": 2.6795652329921724, "loss/logits": 0.801378121972084, "step": 66200 }, { "epoch": 0.6621, "grad_norm": 14.625, "grad_norm_var": 0.563134765625, "learning_rate": 0.0003, "loss": 10.8342, "loss/aux_loss": 0.04805709309875965, "loss/crossentropy": 2.752977591753006, "loss/logits": 0.8294487535953522, "step": 66210 }, { "epoch": 0.6622, "grad_norm": 15.375, "grad_norm_var": 0.8949055989583333, "learning_rate": 0.0003, "loss": 10.9269, "loss/aux_loss": 0.048077603057026866, "loss/crossentropy": 2.8537149250507357, "loss/logits": 0.8224076896905899, "step": 66220 }, { "epoch": 0.6623, "grad_norm": 16.75, "grad_norm_var": 0.5244140625, "learning_rate": 0.0003, "loss": 10.9263, "loss/aux_loss": 0.04807521738111973, "loss/crossentropy": 2.651250684261322, "loss/logits": 0.8318710893392562, "step": 66230 }, { "epoch": 0.6624, "grad_norm": 15.9375, "grad_norm_var": 0.4410807291666667, "learning_rate": 0.0003, "loss": 10.9263, "loss/aux_loss": 0.048059741780161855, "loss/crossentropy": 2.849357432126999, "loss/logits": 0.8336068332195282, "step": 66240 }, { "epoch": 0.6625, "grad_norm": 16.25, "grad_norm_var": 0.6070149739583334, "learning_rate": 0.0003, "loss": 10.8793, "loss/aux_loss": 0.04806891325861216, "loss/crossentropy": 2.5584873795509337, "loss/logits": 0.8233877867460251, "step": 66250 }, { "epoch": 0.6626, "grad_norm": 15.625, "grad_norm_var": 0.7613932291666666, "learning_rate": 0.0003, "loss": 10.8628, "loss/aux_loss": 0.04806743785738945, "loss/crossentropy": 2.732908582687378, "loss/logits": 0.8170014798641205, "step": 66260 }, { "epoch": 0.6627, "grad_norm": 14.875, "grad_norm_var": 0.363525390625, "learning_rate": 0.0003, "loss": 10.7629, "loss/aux_loss": 0.04805696085095405, "loss/crossentropy": 2.81008266210556, "loss/logits": 0.8305856496095657, "step": 66270 }, { "epoch": 0.6628, "grad_norm": 15.125, "grad_norm_var": 0.25045572916666664, "learning_rate": 0.0003, "loss": 10.8067, "loss/aux_loss": 0.048075790703296664, "loss/crossentropy": 2.528480714559555, "loss/logits": 0.7946991354227066, "step": 66280 }, { "epoch": 0.6629, "grad_norm": 15.6875, "grad_norm_var": 0.4090983072916667, "learning_rate": 0.0003, "loss": 10.9142, "loss/aux_loss": 0.048072568513453005, "loss/crossentropy": 2.7415711283683777, "loss/logits": 0.8110772639513015, "step": 66290 }, { "epoch": 0.663, "grad_norm": 14.375, "grad_norm_var": 0.7744791666666667, "learning_rate": 0.0003, "loss": 10.8882, "loss/aux_loss": 0.048075555637478826, "loss/crossentropy": 2.724819177389145, "loss/logits": 0.7944720953702926, "step": 66300 }, { "epoch": 0.6631, "grad_norm": 15.125, "grad_norm_var": 0.7395182291666667, "learning_rate": 0.0003, "loss": 10.9813, "loss/aux_loss": 0.04807244967669248, "loss/crossentropy": 2.643247830867767, "loss/logits": 0.8237002640962601, "step": 66310 }, { "epoch": 0.6632, "grad_norm": 14.9375, "grad_norm_var": 0.5169108072916667, "learning_rate": 0.0003, "loss": 10.7386, "loss/aux_loss": 0.048069367185235023, "loss/crossentropy": 2.7121821284294128, "loss/logits": 0.8011160790920258, "step": 66320 }, { "epoch": 0.6633, "grad_norm": 14.625, "grad_norm_var": 0.30402018229166666, "learning_rate": 0.0003, "loss": 10.8735, "loss/aux_loss": 0.04806159436702728, "loss/crossentropy": 2.7342435657978057, "loss/logits": 0.8220883011817932, "step": 66330 }, { "epoch": 0.6634, "grad_norm": 15.625, "grad_norm_var": 0.394775390625, "learning_rate": 0.0003, "loss": 10.8719, "loss/aux_loss": 0.0480760769918561, "loss/crossentropy": 2.803659129142761, "loss/logits": 0.8247465431690216, "step": 66340 }, { "epoch": 0.6635, "grad_norm": 14.625, "grad_norm_var": 0.9119791666666667, "learning_rate": 0.0003, "loss": 10.9, "loss/aux_loss": 0.04806450568139553, "loss/crossentropy": 2.7340852856636046, "loss/logits": 0.810696679353714, "step": 66350 }, { "epoch": 0.6636, "grad_norm": 14.875, "grad_norm_var": 0.8149576822916667, "learning_rate": 0.0003, "loss": 10.9793, "loss/aux_loss": 0.048075934126973155, "loss/crossentropy": 2.481299436092377, "loss/logits": 0.7764072805643082, "step": 66360 }, { "epoch": 0.6637, "grad_norm": 14.75, "grad_norm_var": 0.2515462239583333, "learning_rate": 0.0003, "loss": 10.9342, "loss/aux_loss": 0.04805976003408432, "loss/crossentropy": 2.8292657256126406, "loss/logits": 0.8594965010881424, "step": 66370 }, { "epoch": 0.6638, "grad_norm": 15.5, "grad_norm_var": 0.47784830729166666, "learning_rate": 0.0003, "loss": 10.7546, "loss/aux_loss": 0.048081879131495954, "loss/crossentropy": 2.621725058555603, "loss/logits": 0.778819665312767, "step": 66380 }, { "epoch": 0.6639, "grad_norm": 16.125, "grad_norm_var": 0.46261393229166664, "learning_rate": 0.0003, "loss": 10.8435, "loss/aux_loss": 0.04807425364851951, "loss/crossentropy": 2.6788039445877074, "loss/logits": 0.8278974890708923, "step": 66390 }, { "epoch": 0.664, "grad_norm": 14.5625, "grad_norm_var": 0.8796712239583333, "learning_rate": 0.0003, "loss": 10.9036, "loss/aux_loss": 0.04806184228509665, "loss/crossentropy": 2.79437460899353, "loss/logits": 0.8034818679094314, "step": 66400 }, { "epoch": 0.6641, "grad_norm": 14.75, "grad_norm_var": 1.0442057291666667, "learning_rate": 0.0003, "loss": 10.9, "loss/aux_loss": 0.0480803944170475, "loss/crossentropy": 2.8535486102104186, "loss/logits": 0.8078803330659866, "step": 66410 }, { "epoch": 0.6642, "grad_norm": 15.8125, "grad_norm_var": 0.6542805989583333, "learning_rate": 0.0003, "loss": 10.8832, "loss/aux_loss": 0.04806033242493868, "loss/crossentropy": 2.7427396893501284, "loss/logits": 0.8372073888778686, "step": 66420 }, { "epoch": 0.6643, "grad_norm": 14.6875, "grad_norm_var": 0.7947265625, "learning_rate": 0.0003, "loss": 10.7244, "loss/aux_loss": 0.048065388575196266, "loss/crossentropy": 2.710828936100006, "loss/logits": 0.8116421043872833, "step": 66430 }, { "epoch": 0.6644, "grad_norm": 15.3125, "grad_norm_var": 5.3578125, "learning_rate": 0.0003, "loss": 10.8632, "loss/aux_loss": 0.04807053990662098, "loss/crossentropy": 2.719035828113556, "loss/logits": 0.8238852351903916, "step": 66440 }, { "epoch": 0.6645, "grad_norm": 14.875, "grad_norm_var": 4.597639973958334, "learning_rate": 0.0003, "loss": 10.9155, "loss/aux_loss": 0.04806662444025278, "loss/crossentropy": 2.7444665908813475, "loss/logits": 0.8264550715684891, "step": 66450 }, { "epoch": 0.6646, "grad_norm": 15.125, "grad_norm_var": 0.21243489583333333, "learning_rate": 0.0003, "loss": 10.8299, "loss/aux_loss": 0.04806863609701395, "loss/crossentropy": 2.6724780917167665, "loss/logits": 0.8069694906473159, "step": 66460 }, { "epoch": 0.6647, "grad_norm": 19.0, "grad_norm_var": 1.3813639322916667, "learning_rate": 0.0003, "loss": 10.8276, "loss/aux_loss": 0.04807253777980804, "loss/crossentropy": 2.675402784347534, "loss/logits": 0.7911726206541061, "step": 66470 }, { "epoch": 0.6648, "grad_norm": 15.4375, "grad_norm_var": 1.024853515625, "learning_rate": 0.0003, "loss": 10.9674, "loss/aux_loss": 0.048071261309087274, "loss/crossentropy": 2.833948886394501, "loss/logits": 0.8181155323982239, "step": 66480 }, { "epoch": 0.6649, "grad_norm": 15.5, "grad_norm_var": 0.44503580729166664, "learning_rate": 0.0003, "loss": 10.9112, "loss/aux_loss": 0.048069555498659614, "loss/crossentropy": 2.501497894525528, "loss/logits": 0.7850837290287018, "step": 66490 }, { "epoch": 0.665, "grad_norm": 14.625, "grad_norm_var": 0.45546875, "learning_rate": 0.0003, "loss": 10.9807, "loss/aux_loss": 0.04806936550885439, "loss/crossentropy": 2.6703847885131835, "loss/logits": 0.8007300883531571, "step": 66500 }, { "epoch": 0.6651, "grad_norm": 15.0, "grad_norm_var": 0.30974934895833334, "learning_rate": 0.0003, "loss": 10.9003, "loss/aux_loss": 0.04807521235197783, "loss/crossentropy": 2.5529791355133056, "loss/logits": 0.8032549649477005, "step": 66510 }, { "epoch": 0.6652, "grad_norm": 15.1875, "grad_norm_var": 0.3719889322916667, "learning_rate": 0.0003, "loss": 10.8097, "loss/aux_loss": 0.048067670315504074, "loss/crossentropy": 2.7725186586380004, "loss/logits": 0.7987766593694687, "step": 66520 }, { "epoch": 0.6653, "grad_norm": 14.3125, "grad_norm_var": 0.4192708333333333, "learning_rate": 0.0003, "loss": 11.0081, "loss/aux_loss": 0.04806581847369671, "loss/crossentropy": 2.6651151537895204, "loss/logits": 0.8225375205278397, "step": 66530 }, { "epoch": 0.6654, "grad_norm": 15.375, "grad_norm_var": 1.2058430989583333, "learning_rate": 0.0003, "loss": 10.9034, "loss/aux_loss": 0.04806330688297748, "loss/crossentropy": 2.8271175622940063, "loss/logits": 0.8273738652467728, "step": 66540 }, { "epoch": 0.6655, "grad_norm": 14.5, "grad_norm_var": 0.6911458333333333, "learning_rate": 0.0003, "loss": 10.9021, "loss/aux_loss": 0.0480733385309577, "loss/crossentropy": 2.6847646474838256, "loss/logits": 0.7972731322050095, "step": 66550 }, { "epoch": 0.6656, "grad_norm": 14.0, "grad_norm_var": 0.6874837239583333, "learning_rate": 0.0003, "loss": 10.7575, "loss/aux_loss": 0.048074370436370376, "loss/crossentropy": 2.680130976438522, "loss/logits": 0.7642890572547912, "step": 66560 }, { "epoch": 0.6657, "grad_norm": 14.125, "grad_norm_var": 0.6921712239583333, "learning_rate": 0.0003, "loss": 10.8822, "loss/aux_loss": 0.04806676432490349, "loss/crossentropy": 2.791566550731659, "loss/logits": 0.8233636647462845, "step": 66570 }, { "epoch": 0.6658, "grad_norm": 15.5625, "grad_norm_var": 0.6950358072916667, "learning_rate": 0.0003, "loss": 10.9591, "loss/aux_loss": 0.04806861318647861, "loss/crossentropy": 2.824068772792816, "loss/logits": 0.8359575748443604, "step": 66580 }, { "epoch": 0.6659, "grad_norm": 16.5, "grad_norm_var": 3.5942057291666667, "learning_rate": 0.0003, "loss": 10.8459, "loss/aux_loss": 0.04806522503495216, "loss/crossentropy": 2.7881508350372313, "loss/logits": 0.8030070185661315, "step": 66590 }, { "epoch": 0.666, "grad_norm": 15.125, "grad_norm_var": 0.3900390625, "learning_rate": 0.0003, "loss": 10.9317, "loss/aux_loss": 0.048059667088091375, "loss/crossentropy": 2.747694218158722, "loss/logits": 0.8331306129693985, "step": 66600 }, { "epoch": 0.6661, "grad_norm": 16.0, "grad_norm_var": 0.3963541666666667, "learning_rate": 0.0003, "loss": 10.7401, "loss/aux_loss": 0.04808664340525866, "loss/crossentropy": 2.5784662127494813, "loss/logits": 0.7583020776510239, "step": 66610 }, { "epoch": 0.6662, "grad_norm": 15.75, "grad_norm_var": 0.7514973958333333, "learning_rate": 0.0003, "loss": 10.8734, "loss/aux_loss": 0.04807219747453928, "loss/crossentropy": 2.6055344462394716, "loss/logits": 0.8134197026491166, "step": 66620 }, { "epoch": 0.6663, "grad_norm": 15.0625, "grad_norm_var": 0.31495768229166665, "learning_rate": 0.0003, "loss": 10.8212, "loss/aux_loss": 0.048065138049423695, "loss/crossentropy": 2.6512358248233796, "loss/logits": 0.7796749144792556, "step": 66630 }, { "epoch": 0.6664, "grad_norm": 15.3125, "grad_norm_var": 0.6481770833333333, "learning_rate": 0.0003, "loss": 10.8075, "loss/aux_loss": 0.04805928375571966, "loss/crossentropy": 2.748984879255295, "loss/logits": 0.782018169760704, "step": 66640 }, { "epoch": 0.6665, "grad_norm": 14.6875, "grad_norm_var": 0.526806640625, "learning_rate": 0.0003, "loss": 10.9915, "loss/aux_loss": 0.048084696754813194, "loss/crossentropy": 2.6403255581855776, "loss/logits": 0.8360554903745652, "step": 66650 }, { "epoch": 0.6666, "grad_norm": 14.6875, "grad_norm_var": 0.5416666666666666, "learning_rate": 0.0003, "loss": 10.7651, "loss/aux_loss": 0.0480577452108264, "loss/crossentropy": 2.7464575350284575, "loss/logits": 0.816826593875885, "step": 66660 }, { "epoch": 0.6667, "grad_norm": 15.3125, "grad_norm_var": 0.6425618489583333, "learning_rate": 0.0003, "loss": 10.8847, "loss/aux_loss": 0.04806127417832613, "loss/crossentropy": 2.687554585933685, "loss/logits": 0.8001707077026368, "step": 66670 }, { "epoch": 0.6668, "grad_norm": 15.0625, "grad_norm_var": 0.3346354166666667, "learning_rate": 0.0003, "loss": 10.9473, "loss/aux_loss": 0.048075456917285916, "loss/crossentropy": 2.8568101286888123, "loss/logits": 0.8553566783666611, "step": 66680 }, { "epoch": 0.6669, "grad_norm": 15.25, "grad_norm_var": 0.2999348958333333, "learning_rate": 0.0003, "loss": 10.9538, "loss/aux_loss": 0.04806934054940939, "loss/crossentropy": 2.657517743110657, "loss/logits": 0.8053190678358078, "step": 66690 }, { "epoch": 0.667, "grad_norm": 15.9375, "grad_norm_var": 0.9419108072916667, "learning_rate": 0.0003, "loss": 10.8266, "loss/aux_loss": 0.04807275123894215, "loss/crossentropy": 2.449772423505783, "loss/logits": 0.7906589955091476, "step": 66700 }, { "epoch": 0.6671, "grad_norm": 14.75, "grad_norm_var": 0.5174479166666667, "learning_rate": 0.0003, "loss": 10.9636, "loss/aux_loss": 0.048078746907413004, "loss/crossentropy": 2.7611198365688323, "loss/logits": 0.8044763505458832, "step": 66710 }, { "epoch": 0.6672, "grad_norm": 17.5, "grad_norm_var": 3.1023274739583333, "learning_rate": 0.0003, "loss": 10.9441, "loss/aux_loss": 0.04806906506419182, "loss/crossentropy": 2.8186138391494753, "loss/logits": 0.8578125566244126, "step": 66720 }, { "epoch": 0.6673, "grad_norm": 16.625, "grad_norm_var": 5.800504557291666, "learning_rate": 0.0003, "loss": 10.7651, "loss/aux_loss": 0.04806268475949764, "loss/crossentropy": 2.5886098742485046, "loss/logits": 0.7767158389091492, "step": 66730 }, { "epoch": 0.6674, "grad_norm": 15.5625, "grad_norm_var": 5.258854166666667, "learning_rate": 0.0003, "loss": 10.9085, "loss/aux_loss": 0.04808114189654589, "loss/crossentropy": 2.7260345458984374, "loss/logits": 0.79498670399189, "step": 66740 }, { "epoch": 0.6675, "grad_norm": 14.5625, "grad_norm_var": 0.98125, "learning_rate": 0.0003, "loss": 10.8072, "loss/aux_loss": 0.048059665225446226, "loss/crossentropy": 2.596436160802841, "loss/logits": 0.7788780838251114, "step": 66750 }, { "epoch": 0.6676, "grad_norm": 14.3125, "grad_norm_var": 0.4231770833333333, "learning_rate": 0.0003, "loss": 10.9401, "loss/aux_loss": 0.048060267791152, "loss/crossentropy": 2.6341087102890013, "loss/logits": 0.8312490910291672, "step": 66760 }, { "epoch": 0.6677, "grad_norm": 14.0, "grad_norm_var": 0.336962890625, "learning_rate": 0.0003, "loss": 11.0289, "loss/aux_loss": 0.048079300485551354, "loss/crossentropy": 2.835488021373749, "loss/logits": 0.8537455588579178, "step": 66770 }, { "epoch": 0.6678, "grad_norm": 16.25, "grad_norm_var": 0.43333333333333335, "learning_rate": 0.0003, "loss": 10.8726, "loss/aux_loss": 0.04806447774171829, "loss/crossentropy": 2.640529549121857, "loss/logits": 0.8138740628957748, "step": 66780 }, { "epoch": 0.6679, "grad_norm": 14.4375, "grad_norm_var": 0.384228515625, "learning_rate": 0.0003, "loss": 10.8864, "loss/aux_loss": 0.048069434240460396, "loss/crossentropy": 2.718799889087677, "loss/logits": 0.804791709780693, "step": 66790 }, { "epoch": 0.668, "grad_norm": 15.4375, "grad_norm_var": 1.0369140625, "learning_rate": 0.0003, "loss": 10.8305, "loss/aux_loss": 0.04806906692683697, "loss/crossentropy": 2.598710483312607, "loss/logits": 0.8012362569570541, "step": 66800 }, { "epoch": 0.6681, "grad_norm": 15.8125, "grad_norm_var": 1.2259765625, "learning_rate": 0.0003, "loss": 10.9184, "loss/aux_loss": 0.048060832917690276, "loss/crossentropy": 2.6800104796886446, "loss/logits": 0.8155199468135834, "step": 66810 }, { "epoch": 0.6682, "grad_norm": 16.25, "grad_norm_var": 23.108268229166665, "learning_rate": 0.0003, "loss": 10.7432, "loss/aux_loss": 0.04805918000638485, "loss/crossentropy": 2.729696071147919, "loss/logits": 0.8015040099620819, "step": 66820 }, { "epoch": 0.6683, "grad_norm": 16.875, "grad_norm_var": 23.670035807291665, "learning_rate": 0.0003, "loss": 10.8194, "loss/aux_loss": 0.04807895701378584, "loss/crossentropy": 2.8300251722335816, "loss/logits": 0.8447652935981751, "step": 66830 }, { "epoch": 0.6684, "grad_norm": 17.125, "grad_norm_var": 0.650244140625, "learning_rate": 0.0003, "loss": 10.9207, "loss/aux_loss": 0.048053346760571006, "loss/crossentropy": 2.675478661060333, "loss/logits": 0.8098080486059189, "step": 66840 }, { "epoch": 0.6685, "grad_norm": 14.9375, "grad_norm_var": 0.5291015625, "learning_rate": 0.0003, "loss": 10.8967, "loss/aux_loss": 0.048054653219878674, "loss/crossentropy": 2.858685314655304, "loss/logits": 0.8220966100692749, "step": 66850 }, { "epoch": 0.6686, "grad_norm": 15.25, "grad_norm_var": 1.038134765625, "learning_rate": 0.0003, "loss": 10.8929, "loss/aux_loss": 0.04808123260736465, "loss/crossentropy": 2.7240459442138674, "loss/logits": 0.8174852192401886, "step": 66860 }, { "epoch": 0.6687, "grad_norm": 17.125, "grad_norm_var": 0.8919270833333334, "learning_rate": 0.0003, "loss": 11.0093, "loss/aux_loss": 0.04807883575558662, "loss/crossentropy": 2.665953540802002, "loss/logits": 0.8114261239767074, "step": 66870 }, { "epoch": 0.6688, "grad_norm": 15.4375, "grad_norm_var": 1.1030598958333333, "learning_rate": 0.0003, "loss": 10.9429, "loss/aux_loss": 0.04805787615478039, "loss/crossentropy": 2.685689914226532, "loss/logits": 0.806901153922081, "step": 66880 }, { "epoch": 0.6689, "grad_norm": 16.375, "grad_norm_var": 0.511181640625, "learning_rate": 0.0003, "loss": 10.8641, "loss/aux_loss": 0.0480721453204751, "loss/crossentropy": 2.829719823598862, "loss/logits": 0.8213761389255524, "step": 66890 }, { "epoch": 0.669, "grad_norm": 15.1875, "grad_norm_var": 0.6014973958333333, "learning_rate": 0.0003, "loss": 10.9479, "loss/aux_loss": 0.04806473944336176, "loss/crossentropy": 2.6682372391223907, "loss/logits": 0.8243909746408462, "step": 66900 }, { "epoch": 0.6691, "grad_norm": 15.1875, "grad_norm_var": 0.3916015625, "learning_rate": 0.0003, "loss": 10.8806, "loss/aux_loss": 0.048068069666624066, "loss/crossentropy": 2.672402673959732, "loss/logits": 0.8250930517911911, "step": 66910 }, { "epoch": 0.6692, "grad_norm": 14.5625, "grad_norm_var": 1.8825358072916667, "learning_rate": 0.0003, "loss": 11.0711, "loss/aux_loss": 0.04807632770389318, "loss/crossentropy": 2.706685644388199, "loss/logits": 0.8509224832057953, "step": 66920 }, { "epoch": 0.6693, "grad_norm": 17.25, "grad_norm_var": 0.6629557291666667, "learning_rate": 0.0003, "loss": 10.6838, "loss/aux_loss": 0.048064228519797324, "loss/crossentropy": 2.7776548743247984, "loss/logits": 0.7853770822286605, "step": 66930 }, { "epoch": 0.6694, "grad_norm": 14.625, "grad_norm_var": 0.8407389322916666, "learning_rate": 0.0003, "loss": 10.912, "loss/aux_loss": 0.048056557215750216, "loss/crossentropy": 2.7243767201900484, "loss/logits": 0.7943523436784744, "step": 66940 }, { "epoch": 0.6695, "grad_norm": 15.1875, "grad_norm_var": 1.0590983072916667, "learning_rate": 0.0003, "loss": 10.9122, "loss/aux_loss": 0.048080825619399546, "loss/crossentropy": 2.541512316465378, "loss/logits": 0.7956676542758941, "step": 66950 }, { "epoch": 0.6696, "grad_norm": 14.25, "grad_norm_var": 0.619775390625, "learning_rate": 0.0003, "loss": 10.8168, "loss/aux_loss": 0.04807412121444941, "loss/crossentropy": 2.71076363325119, "loss/logits": 0.8469307273626328, "step": 66960 }, { "epoch": 0.6697, "grad_norm": 15.5625, "grad_norm_var": 1.1841145833333333, "learning_rate": 0.0003, "loss": 10.7509, "loss/aux_loss": 0.0480673236772418, "loss/crossentropy": 2.56371031999588, "loss/logits": 0.7859199553728103, "step": 66970 }, { "epoch": 0.6698, "grad_norm": 17.375, "grad_norm_var": 0.7645670572916666, "learning_rate": 0.0003, "loss": 10.8529, "loss/aux_loss": 0.04807327184826136, "loss/crossentropy": 2.7085660099983215, "loss/logits": 0.8172822952270508, "step": 66980 }, { "epoch": 0.6699, "grad_norm": 15.875, "grad_norm_var": 0.6764973958333333, "learning_rate": 0.0003, "loss": 10.6937, "loss/aux_loss": 0.048074647411704065, "loss/crossentropy": 2.551885908842087, "loss/logits": 0.7823489457368851, "step": 66990 }, { "epoch": 0.67, "grad_norm": 15.1875, "grad_norm_var": 0.36652018229166666, "learning_rate": 0.0003, "loss": 10.999, "loss/aux_loss": 0.04806416109204292, "loss/crossentropy": 2.615692639350891, "loss/logits": 0.8171136409044266, "step": 67000 }, { "epoch": 0.6701, "grad_norm": 15.1875, "grad_norm_var": 0.22615559895833334, "learning_rate": 0.0003, "loss": 10.8361, "loss/aux_loss": 0.04806163609027862, "loss/crossentropy": 2.696385371685028, "loss/logits": 0.7908263862133026, "step": 67010 }, { "epoch": 0.6702, "grad_norm": 16.5, "grad_norm_var": 0.5753743489583333, "learning_rate": 0.0003, "loss": 10.8475, "loss/aux_loss": 0.04808085560798645, "loss/crossentropy": 2.712275046110153, "loss/logits": 0.824543422460556, "step": 67020 }, { "epoch": 0.6703, "grad_norm": 14.25, "grad_norm_var": 0.71640625, "learning_rate": 0.0003, "loss": 10.7361, "loss/aux_loss": 0.04806180745363235, "loss/crossentropy": 2.527262020111084, "loss/logits": 0.7941514313220978, "step": 67030 }, { "epoch": 0.6704, "grad_norm": 14.5625, "grad_norm_var": 0.5059895833333333, "learning_rate": 0.0003, "loss": 10.8482, "loss/aux_loss": 0.04806319680064917, "loss/crossentropy": 2.6808030009269714, "loss/logits": 0.7904377818107605, "step": 67040 }, { "epoch": 0.6705, "grad_norm": 15.125, "grad_norm_var": 0.41380208333333335, "learning_rate": 0.0003, "loss": 10.8096, "loss/aux_loss": 0.048068330809473994, "loss/crossentropy": 2.5817946434020995, "loss/logits": 0.8056021362543107, "step": 67050 }, { "epoch": 0.6706, "grad_norm": 15.0, "grad_norm_var": 0.9958170572916667, "learning_rate": 0.0003, "loss": 10.9695, "loss/aux_loss": 0.048056199215352535, "loss/crossentropy": 2.782631528377533, "loss/logits": 0.8429341733455658, "step": 67060 }, { "epoch": 0.6707, "grad_norm": 14.5625, "grad_norm_var": 0.7150390625, "learning_rate": 0.0003, "loss": 11.0217, "loss/aux_loss": 0.04807750023901462, "loss/crossentropy": 2.7971240878105164, "loss/logits": 0.8311490327119827, "step": 67070 }, { "epoch": 0.6708, "grad_norm": 14.125, "grad_norm_var": 0.28619791666666666, "learning_rate": 0.0003, "loss": 10.9232, "loss/aux_loss": 0.048067571222782136, "loss/crossentropy": 2.7258487045764923, "loss/logits": 0.8153935343027114, "step": 67080 }, { "epoch": 0.6709, "grad_norm": 14.5625, "grad_norm_var": 0.5079264322916667, "learning_rate": 0.0003, "loss": 10.7639, "loss/aux_loss": 0.04806251842528582, "loss/crossentropy": 2.828408050537109, "loss/logits": 0.8246762096881867, "step": 67090 }, { "epoch": 0.671, "grad_norm": 15.25, "grad_norm_var": 0.5353515625, "learning_rate": 0.0003, "loss": 10.8017, "loss/aux_loss": 0.04806637335568666, "loss/crossentropy": 2.6909542202949526, "loss/logits": 0.7874203026294708, "step": 67100 }, { "epoch": 0.6711, "grad_norm": 14.625, "grad_norm_var": 1.8065104166666666, "learning_rate": 0.0003, "loss": 10.7695, "loss/aux_loss": 0.04807747136801481, "loss/crossentropy": 2.7141048312187195, "loss/logits": 0.7738794207572937, "step": 67110 }, { "epoch": 0.6712, "grad_norm": 15.0625, "grad_norm_var": 1.5468098958333334, "learning_rate": 0.0003, "loss": 10.8937, "loss/aux_loss": 0.0480605298653245, "loss/crossentropy": 2.6752750635147096, "loss/logits": 0.7823032259941101, "step": 67120 }, { "epoch": 0.6713, "grad_norm": 14.625, "grad_norm_var": 0.30323893229166665, "learning_rate": 0.0003, "loss": 10.785, "loss/aux_loss": 0.04806859977543354, "loss/crossentropy": 2.962781381607056, "loss/logits": 0.8514289349317551, "step": 67130 }, { "epoch": 0.6714, "grad_norm": 14.9375, "grad_norm_var": 0.32109375, "learning_rate": 0.0003, "loss": 10.9113, "loss/aux_loss": 0.04805906768888235, "loss/crossentropy": 2.6575462460517882, "loss/logits": 0.8165967971086502, "step": 67140 }, { "epoch": 0.6715, "grad_norm": 15.6875, "grad_norm_var": 0.472119140625, "learning_rate": 0.0003, "loss": 10.8406, "loss/aux_loss": 0.04806913398206234, "loss/crossentropy": 2.6859039187431337, "loss/logits": 0.7919700384140015, "step": 67150 }, { "epoch": 0.6716, "grad_norm": 14.6875, "grad_norm_var": 0.8429524739583333, "learning_rate": 0.0003, "loss": 10.7851, "loss/aux_loss": 0.04807139337062836, "loss/crossentropy": 2.7103774309158326, "loss/logits": 0.8116638362407684, "step": 67160 }, { "epoch": 0.6717, "grad_norm": 15.1875, "grad_norm_var": 0.40870768229166665, "learning_rate": 0.0003, "loss": 10.9319, "loss/aux_loss": 0.04806287419050932, "loss/crossentropy": 2.6983575582504273, "loss/logits": 0.7836291432380676, "step": 67170 }, { "epoch": 0.6718, "grad_norm": 16.25, "grad_norm_var": 0.44733072916666666, "learning_rate": 0.0003, "loss": 10.8345, "loss/aux_loss": 0.04806513842195272, "loss/crossentropy": 2.8540413081645966, "loss/logits": 0.8250791281461716, "step": 67180 }, { "epoch": 0.6719, "grad_norm": 14.875, "grad_norm_var": 0.6218587239583333, "learning_rate": 0.0003, "loss": 10.9294, "loss/aux_loss": 0.048067282512784006, "loss/crossentropy": 2.7587247133255004, "loss/logits": 0.8604197174310684, "step": 67190 }, { "epoch": 0.672, "grad_norm": 14.9375, "grad_norm_var": 1.4669108072916666, "learning_rate": 0.0003, "loss": 10.7877, "loss/aux_loss": 0.04807413946837187, "loss/crossentropy": 2.6343702554702757, "loss/logits": 0.8144495546817779, "step": 67200 }, { "epoch": 0.6721, "grad_norm": 17.25, "grad_norm_var": 0.6837076822916667, "learning_rate": 0.0003, "loss": 10.9378, "loss/aux_loss": 0.04806031696498394, "loss/crossentropy": 2.6540800809860228, "loss/logits": 0.8209151834249496, "step": 67210 }, { "epoch": 0.6722, "grad_norm": 17.25, "grad_norm_var": 0.6257649739583333, "learning_rate": 0.0003, "loss": 10.9277, "loss/aux_loss": 0.04808404482901096, "loss/crossentropy": 2.6092947840690615, "loss/logits": 0.7998175516724586, "step": 67220 }, { "epoch": 0.6723, "grad_norm": 15.6875, "grad_norm_var": 0.7684733072916666, "learning_rate": 0.0003, "loss": 10.7706, "loss/aux_loss": 0.04805383253842592, "loss/crossentropy": 2.633415186405182, "loss/logits": 0.8068418264389038, "step": 67230 }, { "epoch": 0.6724, "grad_norm": 16.25, "grad_norm_var": 0.66875, "learning_rate": 0.0003, "loss": 10.8816, "loss/aux_loss": 0.04806835651397705, "loss/crossentropy": 2.6598873853683473, "loss/logits": 0.8227006554603576, "step": 67240 }, { "epoch": 0.6725, "grad_norm": 14.3125, "grad_norm_var": 0.51484375, "learning_rate": 0.0003, "loss": 10.776, "loss/aux_loss": 0.04807889815419912, "loss/crossentropy": 2.6441542148590087, "loss/logits": 0.8324314415454864, "step": 67250 }, { "epoch": 0.6726, "grad_norm": 14.5, "grad_norm_var": 0.522900390625, "learning_rate": 0.0003, "loss": 10.8126, "loss/aux_loss": 0.04807432275265455, "loss/crossentropy": 2.5847804844379425, "loss/logits": 0.766998502612114, "step": 67260 }, { "epoch": 0.6727, "grad_norm": 16.0, "grad_norm_var": 1.5234212239583333, "learning_rate": 0.0003, "loss": 10.9851, "loss/aux_loss": 0.04805461261421442, "loss/crossentropy": 2.6628905653953554, "loss/logits": 0.8119227319955826, "step": 67270 }, { "epoch": 0.6728, "grad_norm": 15.5, "grad_norm_var": 0.8550618489583334, "learning_rate": 0.0003, "loss": 10.8064, "loss/aux_loss": 0.0480805242434144, "loss/crossentropy": 2.709455114603043, "loss/logits": 0.7887743502855301, "step": 67280 }, { "epoch": 0.6729, "grad_norm": 15.6875, "grad_norm_var": 0.3078125, "learning_rate": 0.0003, "loss": 10.8286, "loss/aux_loss": 0.0480688139796257, "loss/crossentropy": 2.5801384925842283, "loss/logits": 0.8218373239040375, "step": 67290 }, { "epoch": 0.673, "grad_norm": 14.1875, "grad_norm_var": 0.9432291666666667, "learning_rate": 0.0003, "loss": 10.885, "loss/aux_loss": 0.04806957859545946, "loss/crossentropy": 2.741364133358002, "loss/logits": 0.8136252701282501, "step": 67300 }, { "epoch": 0.6731, "grad_norm": 14.6875, "grad_norm_var": 0.7747395833333334, "learning_rate": 0.0003, "loss": 10.8628, "loss/aux_loss": 0.04806738197803497, "loss/crossentropy": 2.7313589334487913, "loss/logits": 0.8280026108026505, "step": 67310 }, { "epoch": 0.6732, "grad_norm": 14.625, "grad_norm_var": 0.38483072916666666, "learning_rate": 0.0003, "loss": 10.935, "loss/aux_loss": 0.04806512799113989, "loss/crossentropy": 2.731142336130142, "loss/logits": 0.8196133434772491, "step": 67320 }, { "epoch": 0.6733, "grad_norm": 15.1875, "grad_norm_var": 0.5726399739583333, "learning_rate": 0.0003, "loss": 10.9229, "loss/aux_loss": 0.04806940630078316, "loss/crossentropy": 2.833940917253494, "loss/logits": 0.8169467687606812, "step": 67330 }, { "epoch": 0.6734, "grad_norm": 15.1875, "grad_norm_var": 1.2751139322916667, "learning_rate": 0.0003, "loss": 10.8098, "loss/aux_loss": 0.048075680062174796, "loss/crossentropy": 2.5253068923950197, "loss/logits": 0.7722189128398895, "step": 67340 }, { "epoch": 0.6735, "grad_norm": 16.5, "grad_norm_var": 1.2885416666666667, "learning_rate": 0.0003, "loss": 10.7254, "loss/aux_loss": 0.04805800002068281, "loss/crossentropy": 2.618474489450455, "loss/logits": 0.7805281549692153, "step": 67350 }, { "epoch": 0.6736, "grad_norm": 14.5625, "grad_norm_var": 0.41456705729166665, "learning_rate": 0.0003, "loss": 10.8487, "loss/aux_loss": 0.04805983938276768, "loss/crossentropy": 2.54278524518013, "loss/logits": 0.7987161606550217, "step": 67360 }, { "epoch": 0.6737, "grad_norm": 14.1875, "grad_norm_var": 0.29889322916666666, "learning_rate": 0.0003, "loss": 10.8717, "loss/aux_loss": 0.04807660169899464, "loss/crossentropy": 2.815138578414917, "loss/logits": 0.8121901094913483, "step": 67370 }, { "epoch": 0.6738, "grad_norm": 15.125, "grad_norm_var": 0.50078125, "learning_rate": 0.0003, "loss": 10.96, "loss/aux_loss": 0.04807771537452936, "loss/crossentropy": 2.770961511135101, "loss/logits": 0.8236127972602845, "step": 67380 }, { "epoch": 0.6739, "grad_norm": 14.875, "grad_norm_var": 0.22838541666666667, "learning_rate": 0.0003, "loss": 10.9856, "loss/aux_loss": 0.048058960027992724, "loss/crossentropy": 2.645844268798828, "loss/logits": 0.8427874892950058, "step": 67390 }, { "epoch": 0.674, "grad_norm": 16.0, "grad_norm_var": 3.4936848958333333, "learning_rate": 0.0003, "loss": 10.8695, "loss/aux_loss": 0.04807248618453741, "loss/crossentropy": 2.620198917388916, "loss/logits": 0.8078225284814835, "step": 67400 }, { "epoch": 0.6741, "grad_norm": 16.625, "grad_norm_var": 3.437613932291667, "learning_rate": 0.0003, "loss": 10.7651, "loss/aux_loss": 0.0480686979368329, "loss/crossentropy": 2.5131695568561554, "loss/logits": 0.7861690491437912, "step": 67410 }, { "epoch": 0.6742, "grad_norm": 20.125, "grad_norm_var": 2.122119140625, "learning_rate": 0.0003, "loss": 10.8228, "loss/aux_loss": 0.04807369913905859, "loss/crossentropy": 2.7605391681194305, "loss/logits": 0.7911547005176545, "step": 67420 }, { "epoch": 0.6743, "grad_norm": 16.125, "grad_norm_var": 1.8026041666666666, "learning_rate": 0.0003, "loss": 10.726, "loss/aux_loss": 0.04808209650218487, "loss/crossentropy": 2.621247559785843, "loss/logits": 0.7596588641405105, "step": 67430 }, { "epoch": 0.6744, "grad_norm": 16.5, "grad_norm_var": 0.91328125, "learning_rate": 0.0003, "loss": 10.8972, "loss/aux_loss": 0.04805378243327141, "loss/crossentropy": 2.679558593034744, "loss/logits": 0.8097591936588288, "step": 67440 }, { "epoch": 0.6745, "grad_norm": 15.3125, "grad_norm_var": 0.8770182291666667, "learning_rate": 0.0003, "loss": 10.8668, "loss/aux_loss": 0.04808393493294716, "loss/crossentropy": 2.6547152400016785, "loss/logits": 0.7948015958070755, "step": 67450 }, { "epoch": 0.6746, "grad_norm": 16.75, "grad_norm_var": 0.7478515625, "learning_rate": 0.0003, "loss": 10.943, "loss/aux_loss": 0.048087403364479545, "loss/crossentropy": 2.739590084552765, "loss/logits": 0.8123195976018905, "step": 67460 }, { "epoch": 0.6747, "grad_norm": 14.875, "grad_norm_var": 1.882666015625, "learning_rate": 0.0003, "loss": 10.9271, "loss/aux_loss": 0.04805273432284594, "loss/crossentropy": 2.7345412015914916, "loss/logits": 0.8273571223020554, "step": 67470 }, { "epoch": 0.6748, "grad_norm": 15.8125, "grad_norm_var": 0.30930989583333335, "learning_rate": 0.0003, "loss": 10.8483, "loss/aux_loss": 0.04808681271970272, "loss/crossentropy": 2.637076383829117, "loss/logits": 0.7990337044000626, "step": 67480 }, { "epoch": 0.6749, "grad_norm": 15.0, "grad_norm_var": 0.326806640625, "learning_rate": 0.0003, "loss": 10.8461, "loss/aux_loss": 0.04807149153202772, "loss/crossentropy": 2.694927138090134, "loss/logits": 0.7909066528081894, "step": 67490 }, { "epoch": 0.675, "grad_norm": 15.25, "grad_norm_var": 0.28899739583333334, "learning_rate": 0.0003, "loss": 10.8772, "loss/aux_loss": 0.04805634953081608, "loss/crossentropy": 2.811596691608429, "loss/logits": 0.8140271067619324, "step": 67500 }, { "epoch": 0.6751, "grad_norm": 15.5625, "grad_norm_var": 1.0056640625, "learning_rate": 0.0003, "loss": 10.9332, "loss/aux_loss": 0.04807057473808527, "loss/crossentropy": 2.724851429462433, "loss/logits": 0.8114170014858246, "step": 67510 }, { "epoch": 0.6752, "grad_norm": 16.25, "grad_norm_var": 1.0827962239583333, "learning_rate": 0.0003, "loss": 10.8187, "loss/aux_loss": 0.04808598104864359, "loss/crossentropy": 2.514454412460327, "loss/logits": 0.7901194989681244, "step": 67520 }, { "epoch": 0.6753, "grad_norm": 15.125, "grad_norm_var": 1.7328125, "learning_rate": 0.0003, "loss": 10.7733, "loss/aux_loss": 0.04806674364954233, "loss/crossentropy": 2.6894044280052185, "loss/logits": 0.8146826893091201, "step": 67530 }, { "epoch": 0.6754, "grad_norm": 15.8125, "grad_norm_var": 1.8983723958333334, "learning_rate": 0.0003, "loss": 10.9634, "loss/aux_loss": 0.04807586632668972, "loss/crossentropy": 2.7447893381118775, "loss/logits": 0.7936123460531235, "step": 67540 }, { "epoch": 0.6755, "grad_norm": 14.0625, "grad_norm_var": 1.3093098958333333, "learning_rate": 0.0003, "loss": 10.7428, "loss/aux_loss": 0.04806775413453579, "loss/crossentropy": 2.6494940400123594, "loss/logits": 0.8177571147680283, "step": 67550 }, { "epoch": 0.6756, "grad_norm": 15.1875, "grad_norm_var": 0.5523274739583334, "learning_rate": 0.0003, "loss": 10.9695, "loss/aux_loss": 0.04806860536336899, "loss/crossentropy": 2.6481561064720154, "loss/logits": 0.8320757627487183, "step": 67560 }, { "epoch": 0.6757, "grad_norm": 15.4375, "grad_norm_var": 0.31354166666666666, "learning_rate": 0.0003, "loss": 10.9555, "loss/aux_loss": 0.04807276241481304, "loss/crossentropy": 2.7160579323768617, "loss/logits": 0.8053322076797486, "step": 67570 }, { "epoch": 0.6758, "grad_norm": 14.5625, "grad_norm_var": 0.3108723958333333, "learning_rate": 0.0003, "loss": 10.8702, "loss/aux_loss": 0.04807675499469042, "loss/crossentropy": 2.67366309762001, "loss/logits": 0.7897478014230728, "step": 67580 }, { "epoch": 0.6759, "grad_norm": 15.6875, "grad_norm_var": 1.7702473958333333, "learning_rate": 0.0003, "loss": 10.867, "loss/aux_loss": 0.048066642321646216, "loss/crossentropy": 2.6360503315925596, "loss/logits": 0.8279545217752456, "step": 67590 }, { "epoch": 0.676, "grad_norm": 18.5, "grad_norm_var": 105.2556640625, "learning_rate": 0.0003, "loss": 10.8562, "loss/aux_loss": 0.04806712754070759, "loss/crossentropy": 2.723428654670715, "loss/logits": 0.8046755522489548, "step": 67600 }, { "epoch": 0.6761, "grad_norm": 15.5, "grad_norm_var": 1.7884765625, "learning_rate": 0.0003, "loss": 10.9624, "loss/aux_loss": 0.04807472750544548, "loss/crossentropy": 2.721188187599182, "loss/logits": 0.8297373622655868, "step": 67610 }, { "epoch": 0.6762, "grad_norm": 15.0, "grad_norm_var": 0.8551920572916667, "learning_rate": 0.0003, "loss": 10.8363, "loss/aux_loss": 0.04807953592389822, "loss/crossentropy": 2.59203023314476, "loss/logits": 0.8205517113208771, "step": 67620 }, { "epoch": 0.6763, "grad_norm": 15.6875, "grad_norm_var": 0.448681640625, "learning_rate": 0.0003, "loss": 10.9757, "loss/aux_loss": 0.048056138679385185, "loss/crossentropy": 2.7822072982788084, "loss/logits": 0.8061872452497483, "step": 67630 }, { "epoch": 0.6764, "grad_norm": 16.625, "grad_norm_var": 0.5163899739583333, "learning_rate": 0.0003, "loss": 10.7339, "loss/aux_loss": 0.04806150645017624, "loss/crossentropy": 2.551578390598297, "loss/logits": 0.7957051217555999, "step": 67640 }, { "epoch": 0.6765, "grad_norm": 15.125, "grad_norm_var": 0.308837890625, "learning_rate": 0.0003, "loss": 10.8204, "loss/aux_loss": 0.048080853372812274, "loss/crossentropy": 2.618013346195221, "loss/logits": 0.7895002514123917, "step": 67650 }, { "epoch": 0.6766, "grad_norm": 16.125, "grad_norm_var": 0.353759765625, "learning_rate": 0.0003, "loss": 11.013, "loss/aux_loss": 0.048052550107240674, "loss/crossentropy": 2.812237298488617, "loss/logits": 0.8751404196023941, "step": 67660 }, { "epoch": 0.6767, "grad_norm": 14.6875, "grad_norm_var": 0.385009765625, "learning_rate": 0.0003, "loss": 10.7287, "loss/aux_loss": 0.048068588599562645, "loss/crossentropy": 2.4027431547641753, "loss/logits": 0.7714583456516266, "step": 67670 }, { "epoch": 0.6768, "grad_norm": 15.75, "grad_norm_var": 0.5728515625, "learning_rate": 0.0003, "loss": 10.9295, "loss/aux_loss": 0.048069121316075325, "loss/crossentropy": 2.750480669736862, "loss/logits": 0.8289047926664352, "step": 67680 }, { "epoch": 0.6769, "grad_norm": 15.9375, "grad_norm_var": 1.20859375, "learning_rate": 0.0003, "loss": 10.9895, "loss/aux_loss": 0.04808015916496515, "loss/crossentropy": 2.8340500593185425, "loss/logits": 0.8436507463455201, "step": 67690 }, { "epoch": 0.677, "grad_norm": 15.1875, "grad_norm_var": 1.0079264322916666, "learning_rate": 0.0003, "loss": 10.9163, "loss/aux_loss": 0.048059361055493355, "loss/crossentropy": 2.6917006373405457, "loss/logits": 0.8295173823833466, "step": 67700 }, { "epoch": 0.6771, "grad_norm": 15.9375, "grad_norm_var": 0.6603515625, "learning_rate": 0.0003, "loss": 10.8006, "loss/aux_loss": 0.04806683622300625, "loss/crossentropy": 2.6369792103767393, "loss/logits": 0.7871336251497268, "step": 67710 }, { "epoch": 0.6772, "grad_norm": 16.125, "grad_norm_var": 1.2552083333333333, "learning_rate": 0.0003, "loss": 11.0148, "loss/aux_loss": 0.04807326085865497, "loss/crossentropy": 2.8675316095352175, "loss/logits": 0.8307450711727142, "step": 67720 }, { "epoch": 0.6773, "grad_norm": 16.375, "grad_norm_var": 0.6858723958333334, "learning_rate": 0.0003, "loss": 10.7185, "loss/aux_loss": 0.04807858150452375, "loss/crossentropy": 2.577520215511322, "loss/logits": 0.8100444704294205, "step": 67730 }, { "epoch": 0.6774, "grad_norm": 14.875, "grad_norm_var": 0.5481770833333334, "learning_rate": 0.0003, "loss": 10.849, "loss/aux_loss": 0.04805320855230093, "loss/crossentropy": 2.7695468187332155, "loss/logits": 0.8083236128091812, "step": 67740 }, { "epoch": 0.6775, "grad_norm": 15.0625, "grad_norm_var": 1.6390462239583334, "learning_rate": 0.0003, "loss": 10.8675, "loss/aux_loss": 0.048079329542815685, "loss/crossentropy": 2.576527512073517, "loss/logits": 0.8159837514162064, "step": 67750 }, { "epoch": 0.6776, "grad_norm": 15.4375, "grad_norm_var": 0.375, "learning_rate": 0.0003, "loss": 10.8023, "loss/aux_loss": 0.048062733933329584, "loss/crossentropy": 2.8369166016578675, "loss/logits": 0.8276311069726944, "step": 67760 }, { "epoch": 0.6777, "grad_norm": 14.0, "grad_norm_var": 1.247119140625, "learning_rate": 0.0003, "loss": 10.7408, "loss/aux_loss": 0.04806175995618105, "loss/crossentropy": 2.706052553653717, "loss/logits": 0.8145518273115158, "step": 67770 }, { "epoch": 0.6778, "grad_norm": 15.5625, "grad_norm_var": 0.5853515625, "learning_rate": 0.0003, "loss": 10.8887, "loss/aux_loss": 0.04806300979107618, "loss/crossentropy": 2.667705309391022, "loss/logits": 0.8158227071166039, "step": 67780 }, { "epoch": 0.6779, "grad_norm": 15.125, "grad_norm_var": 0.24635416666666668, "learning_rate": 0.0003, "loss": 10.8023, "loss/aux_loss": 0.04807372950017452, "loss/crossentropy": 2.7875693142414093, "loss/logits": 0.7981827527284622, "step": 67790 }, { "epoch": 0.678, "grad_norm": 16.625, "grad_norm_var": 0.38553059895833336, "learning_rate": 0.0003, "loss": 10.8735, "loss/aux_loss": 0.048066407814621924, "loss/crossentropy": 2.7808514714241026, "loss/logits": 0.8458144783973693, "step": 67800 }, { "epoch": 0.6781, "grad_norm": 15.4375, "grad_norm_var": 0.6945149739583333, "learning_rate": 0.0003, "loss": 10.8798, "loss/aux_loss": 0.048075595125555995, "loss/crossentropy": 2.605326008796692, "loss/logits": 0.7936393201351166, "step": 67810 }, { "epoch": 0.6782, "grad_norm": 14.875, "grad_norm_var": 0.8359375, "learning_rate": 0.0003, "loss": 10.9042, "loss/aux_loss": 0.048067840933799746, "loss/crossentropy": 2.6910835683345793, "loss/logits": 0.8230360358953476, "step": 67820 }, { "epoch": 0.6783, "grad_norm": 16.25, "grad_norm_var": 3.9468587239583335, "learning_rate": 0.0003, "loss": 10.8921, "loss/aux_loss": 0.048061666823923585, "loss/crossentropy": 2.8486799359321595, "loss/logits": 0.8791530191898346, "step": 67830 }, { "epoch": 0.6784, "grad_norm": 15.625, "grad_norm_var": 0.6270182291666667, "learning_rate": 0.0003, "loss": 11.0865, "loss/aux_loss": 0.04808611553162336, "loss/crossentropy": 2.694787919521332, "loss/logits": 0.7927749201655387, "step": 67840 }, { "epoch": 0.6785, "grad_norm": 15.1875, "grad_norm_var": 0.6895670572916667, "learning_rate": 0.0003, "loss": 10.8049, "loss/aux_loss": 0.048073581978678705, "loss/crossentropy": 2.5416905343532563, "loss/logits": 0.7923329859972, "step": 67850 }, { "epoch": 0.6786, "grad_norm": 14.0, "grad_norm_var": 0.9050618489583333, "learning_rate": 0.0003, "loss": 10.9769, "loss/aux_loss": 0.04805105049163103, "loss/crossentropy": 2.754929852485657, "loss/logits": 0.8427582740783691, "step": 67860 }, { "epoch": 0.6787, "grad_norm": 16.375, "grad_norm_var": 0.6212890625, "learning_rate": 0.0003, "loss": 10.9937, "loss/aux_loss": 0.04806809015572071, "loss/crossentropy": 2.8406196355819704, "loss/logits": 0.8406887620687484, "step": 67870 }, { "epoch": 0.6788, "grad_norm": 14.625, "grad_norm_var": 0.38201497395833334, "learning_rate": 0.0003, "loss": 10.818, "loss/aux_loss": 0.04806934855878353, "loss/crossentropy": 2.591457462310791, "loss/logits": 0.8105394840240479, "step": 67880 }, { "epoch": 0.6789, "grad_norm": 15.8125, "grad_norm_var": 0.4227701822916667, "learning_rate": 0.0003, "loss": 10.8923, "loss/aux_loss": 0.048066012747585776, "loss/crossentropy": 2.705242431163788, "loss/logits": 0.8005460679531098, "step": 67890 }, { "epoch": 0.679, "grad_norm": 14.6875, "grad_norm_var": 0.7940104166666667, "learning_rate": 0.0003, "loss": 10.7941, "loss/aux_loss": 0.04805295336991548, "loss/crossentropy": 2.646135312318802, "loss/logits": 0.8061649814248085, "step": 67900 }, { "epoch": 0.6791, "grad_norm": 16.0, "grad_norm_var": 0.6841145833333333, "learning_rate": 0.0003, "loss": 10.8556, "loss/aux_loss": 0.04807949885725975, "loss/crossentropy": 2.5016194105148317, "loss/logits": 0.7795857936143875, "step": 67910 }, { "epoch": 0.6792, "grad_norm": 16.0, "grad_norm_var": 0.5186848958333333, "learning_rate": 0.0003, "loss": 10.8385, "loss/aux_loss": 0.04806459601968527, "loss/crossentropy": 2.701348972320557, "loss/logits": 0.804606556892395, "step": 67920 }, { "epoch": 0.6793, "grad_norm": 15.0625, "grad_norm_var": 0.3329264322916667, "learning_rate": 0.0003, "loss": 10.8824, "loss/aux_loss": 0.04806465972214937, "loss/crossentropy": 2.708846724033356, "loss/logits": 0.7914534270763397, "step": 67930 }, { "epoch": 0.6794, "grad_norm": 16.5, "grad_norm_var": 285.405712890625, "learning_rate": 0.0003, "loss": 10.877, "loss/aux_loss": 0.048062794655561444, "loss/crossentropy": 2.6500784277915956, "loss/logits": 0.791703137755394, "step": 67940 }, { "epoch": 0.6795, "grad_norm": 17.0, "grad_norm_var": 281.64635416666664, "learning_rate": 0.0003, "loss": 11.0174, "loss/aux_loss": 0.04806160032749176, "loss/crossentropy": 2.6734558582305907, "loss/logits": 0.7966185420751571, "step": 67950 }, { "epoch": 0.6796, "grad_norm": 17.375, "grad_norm_var": 2.1645833333333333, "learning_rate": 0.0003, "loss": 10.655, "loss/aux_loss": 0.04807016905397177, "loss/crossentropy": 2.725010406970978, "loss/logits": 0.7850367069244385, "step": 67960 }, { "epoch": 0.6797, "grad_norm": 15.5, "grad_norm_var": 1.8442057291666667, "learning_rate": 0.0003, "loss": 11.0602, "loss/aux_loss": 0.04807140734046698, "loss/crossentropy": 2.7007255434989927, "loss/logits": 0.843214625120163, "step": 67970 }, { "epoch": 0.6798, "grad_norm": 15.6875, "grad_norm_var": 0.37303059895833335, "learning_rate": 0.0003, "loss": 10.7407, "loss/aux_loss": 0.0480677118524909, "loss/crossentropy": 2.6968838930130006, "loss/logits": 0.7861210882663727, "step": 67980 }, { "epoch": 0.6799, "grad_norm": 16.625, "grad_norm_var": 1.053369140625, "learning_rate": 0.0003, "loss": 10.8169, "loss/aux_loss": 0.04806595295667648, "loss/crossentropy": 2.692828023433685, "loss/logits": 0.8357434421777725, "step": 67990 }, { "epoch": 0.68, "grad_norm": 17.625, "grad_norm_var": 1.1062337239583333, "learning_rate": 0.0003, "loss": 10.9263, "loss/aux_loss": 0.04806125350296497, "loss/crossentropy": 2.8140974402427674, "loss/logits": 0.8311895668506623, "step": 68000 }, { "epoch": 0.6801, "grad_norm": 16.75, "grad_norm_var": 2.515738932291667, "learning_rate": 0.0003, "loss": 10.8513, "loss/aux_loss": 0.04805536307394505, "loss/crossentropy": 2.7598934888839723, "loss/logits": 0.817400798201561, "step": 68010 }, { "epoch": 0.6802, "grad_norm": 16.5, "grad_norm_var": 3.612434895833333, "learning_rate": 0.0003, "loss": 11.0643, "loss/aux_loss": 0.04808505550026894, "loss/crossentropy": 2.6912566304206846, "loss/logits": 0.819332605600357, "step": 68020 }, { "epoch": 0.6803, "grad_norm": 15.375, "grad_norm_var": 1.826025390625, "learning_rate": 0.0003, "loss": 10.9309, "loss/aux_loss": 0.04807189963757992, "loss/crossentropy": 2.62255003452301, "loss/logits": 0.8158010393381119, "step": 68030 }, { "epoch": 0.6804, "grad_norm": 16.375, "grad_norm_var": 2.052604166666667, "learning_rate": 0.0003, "loss": 10.9309, "loss/aux_loss": 0.048042737506330015, "loss/crossentropy": 2.7515727818012237, "loss/logits": 0.8346040636301041, "step": 68040 }, { "epoch": 0.6805, "grad_norm": 14.5625, "grad_norm_var": 2.3296223958333333, "learning_rate": 0.0003, "loss": 10.8757, "loss/aux_loss": 0.048085806891322136, "loss/crossentropy": 2.7306901931762697, "loss/logits": 0.7944915473461152, "step": 68050 }, { "epoch": 0.6806, "grad_norm": 16.875, "grad_norm_var": 0.5645833333333333, "learning_rate": 0.0003, "loss": 10.9716, "loss/aux_loss": 0.04805421140044928, "loss/crossentropy": 2.759699082374573, "loss/logits": 0.8278161138296127, "step": 68060 }, { "epoch": 0.6807, "grad_norm": 16.5, "grad_norm_var": 0.5499348958333333, "learning_rate": 0.0003, "loss": 10.984, "loss/aux_loss": 0.04806980360299349, "loss/crossentropy": 2.7111220836639403, "loss/logits": 0.8429874509572983, "step": 68070 }, { "epoch": 0.6808, "grad_norm": 14.625, "grad_norm_var": 0.7937337239583333, "learning_rate": 0.0003, "loss": 10.8038, "loss/aux_loss": 0.04806726835668087, "loss/crossentropy": 2.7184378623962404, "loss/logits": 0.8051327586174011, "step": 68080 }, { "epoch": 0.6809, "grad_norm": 19.625, "grad_norm_var": 1.8660807291666666, "learning_rate": 0.0003, "loss": 10.9857, "loss/aux_loss": 0.048076169565320015, "loss/crossentropy": 2.623359727859497, "loss/logits": 0.8120907008647918, "step": 68090 }, { "epoch": 0.681, "grad_norm": 15.3125, "grad_norm_var": 1.6804524739583333, "learning_rate": 0.0003, "loss": 10.7738, "loss/aux_loss": 0.04806400742381811, "loss/crossentropy": 2.5843277633190156, "loss/logits": 0.7641439378261566, "step": 68100 }, { "epoch": 0.6811, "grad_norm": 14.9375, "grad_norm_var": 0.33671875, "learning_rate": 0.0003, "loss": 11.0272, "loss/aux_loss": 0.048069718293845654, "loss/crossentropy": 2.7510910749435427, "loss/logits": 0.8262420713901519, "step": 68110 }, { "epoch": 0.6812, "grad_norm": 15.1875, "grad_norm_var": 0.5855305989583334, "learning_rate": 0.0003, "loss": 10.9944, "loss/aux_loss": 0.04808128289878368, "loss/crossentropy": 2.745889973640442, "loss/logits": 0.8404574304819107, "step": 68120 }, { "epoch": 0.6813, "grad_norm": 15.625, "grad_norm_var": 0.40792643229166664, "learning_rate": 0.0003, "loss": 10.9824, "loss/aux_loss": 0.04806574918329716, "loss/crossentropy": 2.8554471492767335, "loss/logits": 0.827678182721138, "step": 68130 }, { "epoch": 0.6814, "grad_norm": 18.125, "grad_norm_var": 0.9089680989583333, "learning_rate": 0.0003, "loss": 10.8597, "loss/aux_loss": 0.048068815097212794, "loss/crossentropy": 2.684776157140732, "loss/logits": 0.7816103935241699, "step": 68140 }, { "epoch": 0.6815, "grad_norm": 16.375, "grad_norm_var": 0.9602701822916667, "learning_rate": 0.0003, "loss": 10.7884, "loss/aux_loss": 0.04806648455560207, "loss/crossentropy": 2.6992865085601805, "loss/logits": 0.8477433979511261, "step": 68150 }, { "epoch": 0.6816, "grad_norm": 15.9375, "grad_norm_var": 0.48072916666666665, "learning_rate": 0.0003, "loss": 10.8101, "loss/aux_loss": 0.048081612959504125, "loss/crossentropy": 2.5780653059482574, "loss/logits": 0.7993250101804733, "step": 68160 }, { "epoch": 0.6817, "grad_norm": 14.5, "grad_norm_var": 0.5562337239583334, "learning_rate": 0.0003, "loss": 10.8097, "loss/aux_loss": 0.04806546475738287, "loss/crossentropy": 2.5478107750415804, "loss/logits": 0.8167504072189331, "step": 68170 }, { "epoch": 0.6818, "grad_norm": 16.0, "grad_norm_var": 0.9333170572916667, "learning_rate": 0.0003, "loss": 10.8763, "loss/aux_loss": 0.048067673482000826, "loss/crossentropy": 2.6177058100700377, "loss/logits": 0.8171544075012207, "step": 68180 }, { "epoch": 0.6819, "grad_norm": 13.9375, "grad_norm_var": 0.3876139322916667, "learning_rate": 0.0003, "loss": 10.8284, "loss/aux_loss": 0.04806404709815979, "loss/crossentropy": 2.6563953340053557, "loss/logits": 0.8121503591537476, "step": 68190 }, { "epoch": 0.682, "grad_norm": 20.625, "grad_norm_var": 2.582666015625, "learning_rate": 0.0003, "loss": 10.7687, "loss/aux_loss": 0.048071419820189476, "loss/crossentropy": 2.6710281014442443, "loss/logits": 0.8111292243003845, "step": 68200 }, { "epoch": 0.6821, "grad_norm": 16.25, "grad_norm_var": 2.412483723958333, "learning_rate": 0.0003, "loss": 10.9164, "loss/aux_loss": 0.04807704593986273, "loss/crossentropy": 2.562616801261902, "loss/logits": 0.8096489131450653, "step": 68210 }, { "epoch": 0.6822, "grad_norm": 16.5, "grad_norm_var": 1.0212076822916667, "learning_rate": 0.0003, "loss": 10.8428, "loss/aux_loss": 0.04805521406233311, "loss/crossentropy": 2.7523882746696473, "loss/logits": 0.8298066765069961, "step": 68220 }, { "epoch": 0.6823, "grad_norm": 14.875, "grad_norm_var": 0.9782389322916667, "learning_rate": 0.0003, "loss": 10.9563, "loss/aux_loss": 0.04807618539780378, "loss/crossentropy": 2.7051311850547792, "loss/logits": 0.7973050862550736, "step": 68230 }, { "epoch": 0.6824, "grad_norm": 14.875, "grad_norm_var": 0.420166015625, "learning_rate": 0.0003, "loss": 10.882, "loss/aux_loss": 0.048058620654046535, "loss/crossentropy": 2.6264807403087618, "loss/logits": 0.8164067506790161, "step": 68240 }, { "epoch": 0.6825, "grad_norm": 16.25, "grad_norm_var": 0.4014973958333333, "learning_rate": 0.0003, "loss": 11.0203, "loss/aux_loss": 0.048072101175785066, "loss/crossentropy": 2.7212952256202696, "loss/logits": 0.8090760707855225, "step": 68250 }, { "epoch": 0.6826, "grad_norm": 14.9375, "grad_norm_var": 0.3411458333333333, "learning_rate": 0.0003, "loss": 10.9662, "loss/aux_loss": 0.04806735776364803, "loss/crossentropy": 2.8346715688705446, "loss/logits": 0.8262193471193313, "step": 68260 }, { "epoch": 0.6827, "grad_norm": 15.25, "grad_norm_var": 0.539697265625, "learning_rate": 0.0003, "loss": 10.845, "loss/aux_loss": 0.04806448295712471, "loss/crossentropy": 2.8213131070137023, "loss/logits": 0.8190656453371048, "step": 68270 }, { "epoch": 0.6828, "grad_norm": 16.125, "grad_norm_var": 0.6005045572916666, "learning_rate": 0.0003, "loss": 10.8459, "loss/aux_loss": 0.048075663857162, "loss/crossentropy": 2.743839997053146, "loss/logits": 0.8118878155946732, "step": 68280 }, { "epoch": 0.6829, "grad_norm": 15.625, "grad_norm_var": 0.4852701822916667, "learning_rate": 0.0003, "loss": 11.0045, "loss/aux_loss": 0.048066263645887376, "loss/crossentropy": 2.638486051559448, "loss/logits": 0.8266521632671356, "step": 68290 }, { "epoch": 0.683, "grad_norm": 15.5625, "grad_norm_var": 0.48776041666666664, "learning_rate": 0.0003, "loss": 11.0403, "loss/aux_loss": 0.04806353356689215, "loss/crossentropy": 2.6697838246822356, "loss/logits": 0.8448014736175538, "step": 68300 }, { "epoch": 0.6831, "grad_norm": 14.875, "grad_norm_var": 0.6639973958333333, "learning_rate": 0.0003, "loss": 10.967, "loss/aux_loss": 0.04807850923389197, "loss/crossentropy": 2.746517300605774, "loss/logits": 0.8058427959680557, "step": 68310 }, { "epoch": 0.6832, "grad_norm": 15.125, "grad_norm_var": 0.6393229166666666, "learning_rate": 0.0003, "loss": 10.8865, "loss/aux_loss": 0.0480657272040844, "loss/crossentropy": 2.5843187630176545, "loss/logits": 0.768140897154808, "step": 68320 }, { "epoch": 0.6833, "grad_norm": 16.25, "grad_norm_var": 0.5322265625, "learning_rate": 0.0003, "loss": 11.0019, "loss/aux_loss": 0.04805855434387922, "loss/crossentropy": 2.7240235090255736, "loss/logits": 0.8157852947711944, "step": 68330 }, { "epoch": 0.6834, "grad_norm": 15.9375, "grad_norm_var": 0.8430826822916667, "learning_rate": 0.0003, "loss": 10.8722, "loss/aux_loss": 0.04806504771113396, "loss/crossentropy": 2.6174010276794433, "loss/logits": 0.8211093872785569, "step": 68340 }, { "epoch": 0.6835, "grad_norm": 15.9375, "grad_norm_var": 0.5624348958333333, "learning_rate": 0.0003, "loss": 10.7703, "loss/aux_loss": 0.0480718906968832, "loss/crossentropy": 2.7147361874580382, "loss/logits": 0.8232816010713577, "step": 68350 }, { "epoch": 0.6836, "grad_norm": 17.125, "grad_norm_var": 0.713916015625, "learning_rate": 0.0003, "loss": 10.7898, "loss/aux_loss": 0.048076497949659826, "loss/crossentropy": 2.6525621116161346, "loss/logits": 0.7956002086400986, "step": 68360 }, { "epoch": 0.6837, "grad_norm": 14.8125, "grad_norm_var": 0.9614583333333333, "learning_rate": 0.0003, "loss": 11.029, "loss/aux_loss": 0.0480541817843914, "loss/crossentropy": 2.7248359322547913, "loss/logits": 0.8173481345176696, "step": 68370 }, { "epoch": 0.6838, "grad_norm": 14.125, "grad_norm_var": 0.4525390625, "learning_rate": 0.0003, "loss": 11.0335, "loss/aux_loss": 0.0480697525665164, "loss/crossentropy": 2.772666358947754, "loss/logits": 0.8384395599365234, "step": 68380 }, { "epoch": 0.6839, "grad_norm": 16.5, "grad_norm_var": 0.455322265625, "learning_rate": 0.0003, "loss": 11.01, "loss/aux_loss": 0.04807109702378511, "loss/crossentropy": 2.5988758385181425, "loss/logits": 0.8189985305070877, "step": 68390 }, { "epoch": 0.684, "grad_norm": 15.4375, "grad_norm_var": 0.2900390625, "learning_rate": 0.0003, "loss": 10.8031, "loss/aux_loss": 0.04807253852486611, "loss/crossentropy": 2.8541467905044557, "loss/logits": 0.8169606924057007, "step": 68400 }, { "epoch": 0.6841, "grad_norm": 15.9375, "grad_norm_var": 0.44680989583333336, "learning_rate": 0.0003, "loss": 10.8272, "loss/aux_loss": 0.04806390330195427, "loss/crossentropy": 2.6487698316574098, "loss/logits": 0.83295978307724, "step": 68410 }, { "epoch": 0.6842, "grad_norm": 15.3125, "grad_norm_var": 0.5655598958333333, "learning_rate": 0.0003, "loss": 10.7979, "loss/aux_loss": 0.0480561263859272, "loss/crossentropy": 2.6623626351356506, "loss/logits": 0.8088810354471206, "step": 68420 }, { "epoch": 0.6843, "grad_norm": 16.125, "grad_norm_var": 0.6546223958333334, "learning_rate": 0.0003, "loss": 10.9656, "loss/aux_loss": 0.04806807264685631, "loss/crossentropy": 2.766378217935562, "loss/logits": 0.8358203887939453, "step": 68430 }, { "epoch": 0.6844, "grad_norm": 16.375, "grad_norm_var": 0.37745768229166665, "learning_rate": 0.0003, "loss": 10.9736, "loss/aux_loss": 0.04807545747607946, "loss/crossentropy": 2.5721539914608003, "loss/logits": 0.8290560871362687, "step": 68440 }, { "epoch": 0.6845, "grad_norm": 16.75, "grad_norm_var": 0.4228515625, "learning_rate": 0.0003, "loss": 10.696, "loss/aux_loss": 0.04805667717009783, "loss/crossentropy": 2.80389918088913, "loss/logits": 0.8191724687814712, "step": 68450 }, { "epoch": 0.6846, "grad_norm": 15.1875, "grad_norm_var": 0.42823893229166665, "learning_rate": 0.0003, "loss": 10.7993, "loss/aux_loss": 0.04807515386492014, "loss/crossentropy": 2.597550481557846, "loss/logits": 0.7920944511890411, "step": 68460 }, { "epoch": 0.6847, "grad_norm": 16.75, "grad_norm_var": 0.52421875, "learning_rate": 0.0003, "loss": 10.7448, "loss/aux_loss": 0.048058021068573, "loss/crossentropy": 2.759678292274475, "loss/logits": 0.8082679748535156, "step": 68470 }, { "epoch": 0.6848, "grad_norm": 16.625, "grad_norm_var": 0.5075358072916667, "learning_rate": 0.0003, "loss": 10.8687, "loss/aux_loss": 0.0480703329667449, "loss/crossentropy": 2.637404328584671, "loss/logits": 0.8026473224163055, "step": 68480 }, { "epoch": 0.6849, "grad_norm": 15.375, "grad_norm_var": 1.1485514322916666, "learning_rate": 0.0003, "loss": 10.8992, "loss/aux_loss": 0.04807462096214295, "loss/crossentropy": 2.501510390639305, "loss/logits": 0.7927737981081009, "step": 68490 }, { "epoch": 0.685, "grad_norm": 15.4375, "grad_norm_var": 1.1325520833333333, "learning_rate": 0.0003, "loss": 10.9832, "loss/aux_loss": 0.04806219376623631, "loss/crossentropy": 2.791027194261551, "loss/logits": 0.8303872972726822, "step": 68500 }, { "epoch": 0.6851, "grad_norm": 15.3125, "grad_norm_var": 0.8973307291666667, "learning_rate": 0.0003, "loss": 11.0139, "loss/aux_loss": 0.04807308483868837, "loss/crossentropy": 2.621407997608185, "loss/logits": 0.8331938594579696, "step": 68510 }, { "epoch": 0.6852, "grad_norm": 15.0625, "grad_norm_var": 0.6895670572916667, "learning_rate": 0.0003, "loss": 10.9548, "loss/aux_loss": 0.04806851521134377, "loss/crossentropy": 2.852750539779663, "loss/logits": 0.8136879056692123, "step": 68520 }, { "epoch": 0.6853, "grad_norm": 14.875, "grad_norm_var": 0.3726399739583333, "learning_rate": 0.0003, "loss": 10.7068, "loss/aux_loss": 0.04807034097611904, "loss/crossentropy": 2.626950180530548, "loss/logits": 0.8226811677217484, "step": 68530 }, { "epoch": 0.6854, "grad_norm": 15.25, "grad_norm_var": 0.39609375, "learning_rate": 0.0003, "loss": 10.893, "loss/aux_loss": 0.048067341558635235, "loss/crossentropy": 2.7131328761577604, "loss/logits": 0.8315279483795166, "step": 68540 }, { "epoch": 0.6855, "grad_norm": 14.75, "grad_norm_var": 0.36666666666666664, "learning_rate": 0.0003, "loss": 10.8631, "loss/aux_loss": 0.048068701103329656, "loss/crossentropy": 2.760225808620453, "loss/logits": 0.8215482652187347, "step": 68550 }, { "epoch": 0.6856, "grad_norm": 15.0625, "grad_norm_var": 1.3462890625, "learning_rate": 0.0003, "loss": 10.728, "loss/aux_loss": 0.04806747734546661, "loss/crossentropy": 2.6939137518405913, "loss/logits": 0.7923362493515015, "step": 68560 }, { "epoch": 0.6857, "grad_norm": 15.75, "grad_norm_var": 0.9894368489583333, "learning_rate": 0.0003, "loss": 10.8082, "loss/aux_loss": 0.048071125708520415, "loss/crossentropy": 2.578646457195282, "loss/logits": 0.7892535030841827, "step": 68570 }, { "epoch": 0.6858, "grad_norm": 15.5625, "grad_norm_var": 0.22578125, "learning_rate": 0.0003, "loss": 10.7482, "loss/aux_loss": 0.048062393255531785, "loss/crossentropy": 2.7403541207313538, "loss/logits": 0.7984594285488129, "step": 68580 }, { "epoch": 0.6859, "grad_norm": 15.4375, "grad_norm_var": 1.2327473958333333, "learning_rate": 0.0003, "loss": 10.8886, "loss/aux_loss": 0.04806282836943865, "loss/crossentropy": 2.47440989613533, "loss/logits": 0.7788677424192428, "step": 68590 }, { "epoch": 0.686, "grad_norm": 17.25, "grad_norm_var": 0.651806640625, "learning_rate": 0.0003, "loss": 10.8691, "loss/aux_loss": 0.04807308055460453, "loss/crossentropy": 2.790786528587341, "loss/logits": 0.8180954813957214, "step": 68600 }, { "epoch": 0.6861, "grad_norm": 15.4375, "grad_norm_var": 0.53203125, "learning_rate": 0.0003, "loss": 10.8299, "loss/aux_loss": 0.04806508533656597, "loss/crossentropy": 2.843202555179596, "loss/logits": 0.8343409359455108, "step": 68610 }, { "epoch": 0.6862, "grad_norm": 16.375, "grad_norm_var": 0.434228515625, "learning_rate": 0.0003, "loss": 10.9189, "loss/aux_loss": 0.048060751520097256, "loss/crossentropy": 2.684927535057068, "loss/logits": 0.8208200216293335, "step": 68620 }, { "epoch": 0.6863, "grad_norm": 16.0, "grad_norm_var": 0.5707682291666667, "learning_rate": 0.0003, "loss": 10.9396, "loss/aux_loss": 0.04805750884115696, "loss/crossentropy": 2.725750833749771, "loss/logits": 0.8101972997188568, "step": 68630 }, { "epoch": 0.6864, "grad_norm": 14.8125, "grad_norm_var": 0.48118489583333335, "learning_rate": 0.0003, "loss": 10.8437, "loss/aux_loss": 0.04807729534804821, "loss/crossentropy": 2.7160585641860964, "loss/logits": 0.8321511924266816, "step": 68640 }, { "epoch": 0.6865, "grad_norm": 15.0625, "grad_norm_var": 60.0947265625, "learning_rate": 0.0003, "loss": 10.7697, "loss/aux_loss": 0.048073595762252806, "loss/crossentropy": 2.7934968948364256, "loss/logits": 0.8110801339149475, "step": 68650 }, { "epoch": 0.6866, "grad_norm": 17.625, "grad_norm_var": 59.416666666666664, "learning_rate": 0.0003, "loss": 10.8273, "loss/aux_loss": 0.048069017380476, "loss/crossentropy": 2.611198389530182, "loss/logits": 0.7912805765867233, "step": 68660 }, { "epoch": 0.6867, "grad_norm": 15.875, "grad_norm_var": 2.5278645833333333, "learning_rate": 0.0003, "loss": 10.8545, "loss/aux_loss": 0.0480608643963933, "loss/crossentropy": 2.714128017425537, "loss/logits": 0.8020155668258667, "step": 68670 }, { "epoch": 0.6868, "grad_norm": 15.375, "grad_norm_var": 0.8078125, "learning_rate": 0.0003, "loss": 10.8363, "loss/aux_loss": 0.048069582879543306, "loss/crossentropy": 2.7775703012943267, "loss/logits": 0.8089812129735947, "step": 68680 }, { "epoch": 0.6869, "grad_norm": 17.875, "grad_norm_var": 1.0457682291666666, "learning_rate": 0.0003, "loss": 10.7583, "loss/aux_loss": 0.048077752627432344, "loss/crossentropy": 2.6942376673221586, "loss/logits": 0.7967723488807679, "step": 68690 }, { "epoch": 0.687, "grad_norm": 14.375, "grad_norm_var": 0.8697265625, "learning_rate": 0.0003, "loss": 10.9463, "loss/aux_loss": 0.04806126933544874, "loss/crossentropy": 2.6490222990512846, "loss/logits": 0.8524997681379318, "step": 68700 }, { "epoch": 0.6871, "grad_norm": 15.375, "grad_norm_var": 0.35149739583333334, "learning_rate": 0.0003, "loss": 10.6899, "loss/aux_loss": 0.04805758167058229, "loss/crossentropy": 2.6266492545604705, "loss/logits": 0.7861977398395539, "step": 68710 }, { "epoch": 0.6872, "grad_norm": 16.375, "grad_norm_var": 0.5044270833333333, "learning_rate": 0.0003, "loss": 11.0628, "loss/aux_loss": 0.048070350848138335, "loss/crossentropy": 2.7783553838729858, "loss/logits": 0.8349015235900878, "step": 68720 }, { "epoch": 0.6873, "grad_norm": 14.5625, "grad_norm_var": 0.9837890625, "learning_rate": 0.0003, "loss": 10.8576, "loss/aux_loss": 0.04807405862957239, "loss/crossentropy": 2.7715602993965147, "loss/logits": 0.7893172383308411, "step": 68730 }, { "epoch": 0.6874, "grad_norm": 15.0625, "grad_norm_var": 0.4676920572916667, "learning_rate": 0.0003, "loss": 10.7511, "loss/aux_loss": 0.04806721042841673, "loss/crossentropy": 2.721312952041626, "loss/logits": 0.8222410500049591, "step": 68740 }, { "epoch": 0.6875, "grad_norm": 14.375, "grad_norm_var": 0.4657389322916667, "learning_rate": 0.0003, "loss": 11.0532, "loss/aux_loss": 0.04806132633239031, "loss/crossentropy": 2.6498945474624636, "loss/logits": 0.8259395629167556, "step": 68750 }, { "epoch": 0.6876, "grad_norm": 14.0625, "grad_norm_var": 0.298681640625, "learning_rate": 0.0003, "loss": 10.7399, "loss/aux_loss": 0.04808110278099775, "loss/crossentropy": 2.6188538670539856, "loss/logits": 0.7968878641724586, "step": 68760 }, { "epoch": 0.6877, "grad_norm": 14.6875, "grad_norm_var": 0.4171223958333333, "learning_rate": 0.0003, "loss": 10.9881, "loss/aux_loss": 0.048065911047160625, "loss/crossentropy": 2.623822647333145, "loss/logits": 0.8174702137708664, "step": 68770 }, { "epoch": 0.6878, "grad_norm": 14.5, "grad_norm_var": 1.20625, "learning_rate": 0.0003, "loss": 10.9751, "loss/aux_loss": 0.04807086084038019, "loss/crossentropy": 2.7264267265796662, "loss/logits": 0.8461625635623932, "step": 68780 }, { "epoch": 0.6879, "grad_norm": 15.0625, "grad_norm_var": 0.814697265625, "learning_rate": 0.0003, "loss": 10.7834, "loss/aux_loss": 0.04806190486997366, "loss/crossentropy": 2.8499103784561157, "loss/logits": 0.846915426850319, "step": 68790 }, { "epoch": 0.688, "grad_norm": 14.5, "grad_norm_var": 0.7809895833333333, "learning_rate": 0.0003, "loss": 11.0182, "loss/aux_loss": 0.048072070069611075, "loss/crossentropy": 2.6494312465190886, "loss/logits": 0.8267855823040009, "step": 68800 }, { "epoch": 0.6881, "grad_norm": 17.5, "grad_norm_var": 1.037744140625, "learning_rate": 0.0003, "loss": 10.9319, "loss/aux_loss": 0.04805615525692701, "loss/crossentropy": 2.509679216146469, "loss/logits": 0.79253771007061, "step": 68810 }, { "epoch": 0.6882, "grad_norm": 15.4375, "grad_norm_var": 0.62421875, "learning_rate": 0.0003, "loss": 10.9166, "loss/aux_loss": 0.04807599224150181, "loss/crossentropy": 2.87341451048851, "loss/logits": 0.8474191457033158, "step": 68820 }, { "epoch": 0.6883, "grad_norm": 15.375, "grad_norm_var": 0.2955729166666667, "learning_rate": 0.0003, "loss": 11.0466, "loss/aux_loss": 0.048066175729036334, "loss/crossentropy": 2.767427670955658, "loss/logits": 0.8232954949140548, "step": 68830 }, { "epoch": 0.6884, "grad_norm": 14.875, "grad_norm_var": 0.38409830729166666, "learning_rate": 0.0003, "loss": 10.9533, "loss/aux_loss": 0.04806645177304745, "loss/crossentropy": 2.707893443107605, "loss/logits": 0.8553780347108841, "step": 68840 }, { "epoch": 0.6885, "grad_norm": 15.5625, "grad_norm_var": 0.5254557291666667, "learning_rate": 0.0003, "loss": 11.0061, "loss/aux_loss": 0.04807383120059967, "loss/crossentropy": 2.6774882674217224, "loss/logits": 0.8080006390810013, "step": 68850 }, { "epoch": 0.6886, "grad_norm": 15.4375, "grad_norm_var": 2.9567057291666665, "learning_rate": 0.0003, "loss": 10.856, "loss/aux_loss": 0.048062294721603394, "loss/crossentropy": 2.6642824053764342, "loss/logits": 0.7930444091558456, "step": 68860 }, { "epoch": 0.6887, "grad_norm": 15.8125, "grad_norm_var": 3.0775390625, "learning_rate": 0.0003, "loss": 10.9136, "loss/aux_loss": 0.048069755733013156, "loss/crossentropy": 2.7065019488334654, "loss/logits": 0.8077657282352447, "step": 68870 }, { "epoch": 0.6888, "grad_norm": 16.875, "grad_norm_var": 0.6134765625, "learning_rate": 0.0003, "loss": 11.0391, "loss/aux_loss": 0.048073760420084, "loss/crossentropy": 2.668808138370514, "loss/logits": 0.8384716600179672, "step": 68880 }, { "epoch": 0.6889, "grad_norm": 15.75, "grad_norm_var": 0.561962890625, "learning_rate": 0.0003, "loss": 10.8771, "loss/aux_loss": 0.048064139857888225, "loss/crossentropy": 2.6391066908836365, "loss/logits": 0.8003528326749801, "step": 68890 }, { "epoch": 0.689, "grad_norm": 15.0, "grad_norm_var": 328.2468587239583, "learning_rate": 0.0003, "loss": 10.9101, "loss/aux_loss": 0.048088057711720464, "loss/crossentropy": 2.670439213514328, "loss/logits": 0.7999959751963616, "step": 68900 }, { "epoch": 0.6891, "grad_norm": 16.75, "grad_norm_var": 0.926416015625, "learning_rate": 0.0003, "loss": 10.7369, "loss/aux_loss": 0.048071911372244355, "loss/crossentropy": 2.590096038579941, "loss/logits": 0.7963913947343826, "step": 68910 }, { "epoch": 0.6892, "grad_norm": 16.25, "grad_norm_var": 0.5931640625, "learning_rate": 0.0003, "loss": 10.8433, "loss/aux_loss": 0.04806236661970616, "loss/crossentropy": 2.7095581710338594, "loss/logits": 0.808975538611412, "step": 68920 }, { "epoch": 0.6893, "grad_norm": 15.0, "grad_norm_var": 0.7262858072916667, "learning_rate": 0.0003, "loss": 10.8429, "loss/aux_loss": 0.04807297587394714, "loss/crossentropy": 2.716857922077179, "loss/logits": 0.8298698961734772, "step": 68930 }, { "epoch": 0.6894, "grad_norm": 16.125, "grad_norm_var": 0.5973958333333333, "learning_rate": 0.0003, "loss": 10.8001, "loss/aux_loss": 0.04806691724807024, "loss/crossentropy": 2.7727342784404754, "loss/logits": 0.7861407697200775, "step": 68940 }, { "epoch": 0.6895, "grad_norm": 15.875, "grad_norm_var": 0.4127604166666667, "learning_rate": 0.0003, "loss": 10.6628, "loss/aux_loss": 0.04806840233504772, "loss/crossentropy": 2.7098950922489164, "loss/logits": 0.82764173746109, "step": 68950 }, { "epoch": 0.6896, "grad_norm": 16.5, "grad_norm_var": 0.3653483072916667, "learning_rate": 0.0003, "loss": 10.8558, "loss/aux_loss": 0.048067934811115265, "loss/crossentropy": 2.679134911298752, "loss/logits": 0.8346406280994415, "step": 68960 }, { "epoch": 0.6897, "grad_norm": 15.3125, "grad_norm_var": 0.42083333333333334, "learning_rate": 0.0003, "loss": 10.706, "loss/aux_loss": 0.04807941559702158, "loss/crossentropy": 2.5862072229385378, "loss/logits": 0.784822764992714, "step": 68970 }, { "epoch": 0.6898, "grad_norm": 15.25, "grad_norm_var": 0.38333333333333336, "learning_rate": 0.0003, "loss": 10.969, "loss/aux_loss": 0.04805704411119223, "loss/crossentropy": 2.7672839522361756, "loss/logits": 0.8178101569414139, "step": 68980 }, { "epoch": 0.6899, "grad_norm": 15.25, "grad_norm_var": 3.1681640625, "learning_rate": 0.0003, "loss": 10.7957, "loss/aux_loss": 0.0480627154931426, "loss/crossentropy": 2.7157513022422792, "loss/logits": 0.8189638644456864, "step": 68990 }, { "epoch": 0.69, "grad_norm": 14.625, "grad_norm_var": 0.68828125, "learning_rate": 0.0003, "loss": 10.8248, "loss/aux_loss": 0.048067789524793625, "loss/crossentropy": 2.6678106248378755, "loss/logits": 0.8147345900535583, "step": 69000 }, { "epoch": 0.6901, "grad_norm": 16.5, "grad_norm_var": 0.6473958333333333, "learning_rate": 0.0003, "loss": 10.9064, "loss/aux_loss": 0.04807272534817457, "loss/crossentropy": 2.617716884613037, "loss/logits": 0.8001246243715286, "step": 69010 }, { "epoch": 0.6902, "grad_norm": 16.75, "grad_norm_var": 3.076416015625, "learning_rate": 0.0003, "loss": 10.9515, "loss/aux_loss": 0.0480681125074625, "loss/crossentropy": 2.7977681756019592, "loss/logits": 0.8114293158054352, "step": 69020 }, { "epoch": 0.6903, "grad_norm": 15.875, "grad_norm_var": 2.7058430989583333, "learning_rate": 0.0003, "loss": 10.7312, "loss/aux_loss": 0.048084371723234653, "loss/crossentropy": 2.6386759102344515, "loss/logits": 0.808656194806099, "step": 69030 }, { "epoch": 0.6904, "grad_norm": 16.75, "grad_norm_var": 0.43430989583333335, "learning_rate": 0.0003, "loss": 10.9418, "loss/aux_loss": 0.04805478285998106, "loss/crossentropy": 2.640076959133148, "loss/logits": 0.7902828812599182, "step": 69040 }, { "epoch": 0.6905, "grad_norm": 16.375, "grad_norm_var": 1.1566243489583334, "learning_rate": 0.0003, "loss": 10.8981, "loss/aux_loss": 0.048069654405117034, "loss/crossentropy": 2.6928559839725494, "loss/logits": 0.8163411170244217, "step": 69050 }, { "epoch": 0.6906, "grad_norm": 15.1875, "grad_norm_var": 0.7915201822916667, "learning_rate": 0.0003, "loss": 10.9199, "loss/aux_loss": 0.048076878674328326, "loss/crossentropy": 2.6690307438373564, "loss/logits": 0.8052759945392609, "step": 69060 }, { "epoch": 0.6907, "grad_norm": 14.875, "grad_norm_var": 0.46599934895833334, "learning_rate": 0.0003, "loss": 10.7491, "loss/aux_loss": 0.048068926110863684, "loss/crossentropy": 2.593144977092743, "loss/logits": 0.7935424596071243, "step": 69070 }, { "epoch": 0.6908, "grad_norm": 15.5625, "grad_norm_var": 0.41868489583333335, "learning_rate": 0.0003, "loss": 10.8175, "loss/aux_loss": 0.04806020874530077, "loss/crossentropy": 2.6784588575363157, "loss/logits": 0.8021058231592179, "step": 69080 }, { "epoch": 0.6909, "grad_norm": 15.125, "grad_norm_var": 0.7081868489583333, "learning_rate": 0.0003, "loss": 10.9733, "loss/aux_loss": 0.04807012509554624, "loss/crossentropy": 2.667149120569229, "loss/logits": 0.8256219893693924, "step": 69090 }, { "epoch": 0.691, "grad_norm": 15.5625, "grad_norm_var": 0.5306640625, "learning_rate": 0.0003, "loss": 10.6466, "loss/aux_loss": 0.04807849489152431, "loss/crossentropy": 2.6570124447345735, "loss/logits": 0.7915462791919708, "step": 69100 }, { "epoch": 0.6911, "grad_norm": 14.4375, "grad_norm_var": 0.324853515625, "learning_rate": 0.0003, "loss": 10.8218, "loss/aux_loss": 0.048054102994501594, "loss/crossentropy": 2.5824302971363067, "loss/logits": 0.8005838513374328, "step": 69110 }, { "epoch": 0.6912, "grad_norm": 16.75, "grad_norm_var": 0.48748372395833334, "learning_rate": 0.0003, "loss": 10.9588, "loss/aux_loss": 0.04807182941585779, "loss/crossentropy": 2.7127415359020235, "loss/logits": 0.824643325805664, "step": 69120 }, { "epoch": 0.6913, "grad_norm": 15.5, "grad_norm_var": 0.5541015625, "learning_rate": 0.0003, "loss": 10.8355, "loss/aux_loss": 0.0480794757604599, "loss/crossentropy": 2.832990896701813, "loss/logits": 0.835834476351738, "step": 69130 }, { "epoch": 0.6914, "grad_norm": 15.8125, "grad_norm_var": 0.6528483072916667, "learning_rate": 0.0003, "loss": 10.8559, "loss/aux_loss": 0.04806724786758423, "loss/crossentropy": 2.5886457681655886, "loss/logits": 0.792993089556694, "step": 69140 }, { "epoch": 0.6915, "grad_norm": 14.875, "grad_norm_var": 0.24088541666666666, "learning_rate": 0.0003, "loss": 10.8768, "loss/aux_loss": 0.04806765224784613, "loss/crossentropy": 2.728767251968384, "loss/logits": 0.8263113409280777, "step": 69150 }, { "epoch": 0.6916, "grad_norm": 16.0, "grad_norm_var": 0.5479166666666667, "learning_rate": 0.0003, "loss": 10.8623, "loss/aux_loss": 0.048062493838369844, "loss/crossentropy": 2.649292767047882, "loss/logits": 0.8204698622226715, "step": 69160 }, { "epoch": 0.6917, "grad_norm": 17.375, "grad_norm_var": 0.8387858072916666, "learning_rate": 0.0003, "loss": 11.0247, "loss/aux_loss": 0.04808208290487528, "loss/crossentropy": 2.7280562281608582, "loss/logits": 0.8480396270751953, "step": 69170 }, { "epoch": 0.6918, "grad_norm": 14.6875, "grad_norm_var": 1.9921223958333334, "learning_rate": 0.0003, "loss": 10.8223, "loss/aux_loss": 0.04805909302085638, "loss/crossentropy": 2.467037004232407, "loss/logits": 0.7907186537981034, "step": 69180 }, { "epoch": 0.6919, "grad_norm": 17.75, "grad_norm_var": 0.7739583333333333, "learning_rate": 0.0003, "loss": 10.8994, "loss/aux_loss": 0.04806915447115898, "loss/crossentropy": 2.8982559561729433, "loss/logits": 0.8535742044448853, "step": 69190 }, { "epoch": 0.692, "grad_norm": 15.3125, "grad_norm_var": 0.9844889322916667, "learning_rate": 0.0003, "loss": 10.8113, "loss/aux_loss": 0.04806341417133808, "loss/crossentropy": 2.7215474128723143, "loss/logits": 0.8118506580591202, "step": 69200 }, { "epoch": 0.6921, "grad_norm": 15.4375, "grad_norm_var": 1.2166015625, "learning_rate": 0.0003, "loss": 10.9878, "loss/aux_loss": 0.04807480592280626, "loss/crossentropy": 2.6422359228134153, "loss/logits": 0.8116691440343857, "step": 69210 }, { "epoch": 0.6922, "grad_norm": 16.125, "grad_norm_var": 0.509375, "learning_rate": 0.0003, "loss": 10.9659, "loss/aux_loss": 0.048063729889690876, "loss/crossentropy": 2.697904723882675, "loss/logits": 0.8118081420660019, "step": 69220 }, { "epoch": 0.6923, "grad_norm": 14.8125, "grad_norm_var": 3.744124348958333, "learning_rate": 0.0003, "loss": 10.758, "loss/aux_loss": 0.04806367959827185, "loss/crossentropy": 2.5523222506046297, "loss/logits": 0.7924966961145401, "step": 69230 }, { "epoch": 0.6924, "grad_norm": 15.375, "grad_norm_var": 4.15390625, "learning_rate": 0.0003, "loss": 10.8883, "loss/aux_loss": 0.04806468244642019, "loss/crossentropy": 2.8213607549667357, "loss/logits": 0.8581427276134491, "step": 69240 }, { "epoch": 0.6925, "grad_norm": 15.0625, "grad_norm_var": 0.776806640625, "learning_rate": 0.0003, "loss": 10.9512, "loss/aux_loss": 0.048063874058425424, "loss/crossentropy": 2.7791464805603026, "loss/logits": 0.8555617034435272, "step": 69250 }, { "epoch": 0.6926, "grad_norm": 15.5625, "grad_norm_var": 0.5567057291666667, "learning_rate": 0.0003, "loss": 10.8125, "loss/aux_loss": 0.04807287901639938, "loss/crossentropy": 2.721563369035721, "loss/logits": 0.8256457418203353, "step": 69260 }, { "epoch": 0.6927, "grad_norm": 15.625, "grad_norm_var": 1.0127604166666666, "learning_rate": 0.0003, "loss": 10.7597, "loss/aux_loss": 0.04807109721004963, "loss/crossentropy": 2.6567570507526397, "loss/logits": 0.7937098532915116, "step": 69270 }, { "epoch": 0.6928, "grad_norm": 14.625, "grad_norm_var": 0.8239420572916667, "learning_rate": 0.0003, "loss": 10.922, "loss/aux_loss": 0.04806143771857023, "loss/crossentropy": 2.618191432952881, "loss/logits": 0.7884357571601868, "step": 69280 }, { "epoch": 0.6929, "grad_norm": 15.3125, "grad_norm_var": 1.0782389322916666, "learning_rate": 0.0003, "loss": 10.9623, "loss/aux_loss": 0.04808080028742552, "loss/crossentropy": 2.732917082309723, "loss/logits": 0.8358202904462815, "step": 69290 }, { "epoch": 0.693, "grad_norm": 14.625, "grad_norm_var": 1.6150390625, "learning_rate": 0.0003, "loss": 10.949, "loss/aux_loss": 0.048059186339378356, "loss/crossentropy": 2.7098691940307615, "loss/logits": 0.8036975592374802, "step": 69300 }, { "epoch": 0.6931, "grad_norm": 14.5, "grad_norm_var": 0.6299479166666667, "learning_rate": 0.0003, "loss": 10.8893, "loss/aux_loss": 0.04807203523814678, "loss/crossentropy": 2.675479108095169, "loss/logits": 0.8248216599225998, "step": 69310 }, { "epoch": 0.6932, "grad_norm": 16.375, "grad_norm_var": 0.5020833333333333, "learning_rate": 0.0003, "loss": 10.7723, "loss/aux_loss": 0.04805539548397064, "loss/crossentropy": 2.6765593349933625, "loss/logits": 0.8203548967838288, "step": 69320 }, { "epoch": 0.6933, "grad_norm": 14.125, "grad_norm_var": 1.6524576822916666, "learning_rate": 0.0003, "loss": 10.8687, "loss/aux_loss": 0.04807414021342993, "loss/crossentropy": 2.7173945188522337, "loss/logits": 0.8036992192268372, "step": 69330 }, { "epoch": 0.6934, "grad_norm": 15.25, "grad_norm_var": 1.7469889322916667, "learning_rate": 0.0003, "loss": 10.9554, "loss/aux_loss": 0.04807598683983087, "loss/crossentropy": 2.8862260222434997, "loss/logits": 0.809663537144661, "step": 69340 }, { "epoch": 0.6935, "grad_norm": 15.375, "grad_norm_var": 0.6254557291666667, "learning_rate": 0.0003, "loss": 10.6077, "loss/aux_loss": 0.04807170238345861, "loss/crossentropy": 2.7360516667366026, "loss/logits": 0.8002244532108307, "step": 69350 }, { "epoch": 0.6936, "grad_norm": 15.625, "grad_norm_var": 0.5206868489583333, "learning_rate": 0.0003, "loss": 10.9361, "loss/aux_loss": 0.04806582704186439, "loss/crossentropy": 2.6366516649723053, "loss/logits": 0.7836940854787826, "step": 69360 }, { "epoch": 0.6937, "grad_norm": 15.0625, "grad_norm_var": 1.025244140625, "learning_rate": 0.0003, "loss": 10.8865, "loss/aux_loss": 0.04807285238057375, "loss/crossentropy": 2.700425660610199, "loss/logits": 0.786887913942337, "step": 69370 }, { "epoch": 0.6938, "grad_norm": 15.9375, "grad_norm_var": 1.1244140625, "learning_rate": 0.0003, "loss": 10.7267, "loss/aux_loss": 0.048068417236208916, "loss/crossentropy": 2.751612478494644, "loss/logits": 0.8176582008600235, "step": 69380 }, { "epoch": 0.6939, "grad_norm": 14.5, "grad_norm_var": 0.6707682291666667, "learning_rate": 0.0003, "loss": 10.8515, "loss/aux_loss": 0.04806055538356304, "loss/crossentropy": 2.6871352314949037, "loss/logits": 0.8337746620178222, "step": 69390 }, { "epoch": 0.694, "grad_norm": 14.8125, "grad_norm_var": 0.42239583333333336, "learning_rate": 0.0003, "loss": 10.9541, "loss/aux_loss": 0.04807214047759771, "loss/crossentropy": 2.9056159615516663, "loss/logits": 0.8280203819274903, "step": 69400 }, { "epoch": 0.6941, "grad_norm": 14.25, "grad_norm_var": 0.9585774739583334, "learning_rate": 0.0003, "loss": 10.8548, "loss/aux_loss": 0.048070849664509294, "loss/crossentropy": 2.7352217197418214, "loss/logits": 0.822101253271103, "step": 69410 }, { "epoch": 0.6942, "grad_norm": 17.0, "grad_norm_var": 1.1666015625, "learning_rate": 0.0003, "loss": 10.8417, "loss/aux_loss": 0.04806300960481167, "loss/crossentropy": 2.747141933441162, "loss/logits": 0.8111588656902313, "step": 69420 }, { "epoch": 0.6943, "grad_norm": 15.0625, "grad_norm_var": 0.666259765625, "learning_rate": 0.0003, "loss": 10.9775, "loss/aux_loss": 0.04806242380291224, "loss/crossentropy": 2.7164130806922913, "loss/logits": 0.8515024065971375, "step": 69430 }, { "epoch": 0.6944, "grad_norm": 16.75, "grad_norm_var": 128.33709309895832, "learning_rate": 0.0003, "loss": 11.0138, "loss/aux_loss": 0.048085224255919455, "loss/crossentropy": 2.6374498426914217, "loss/logits": 0.8130934327840805, "step": 69440 }, { "epoch": 0.6945, "grad_norm": 16.25, "grad_norm_var": 2.3645833333333335, "learning_rate": 0.0003, "loss": 10.7526, "loss/aux_loss": 0.04807570818811655, "loss/crossentropy": 2.6251815021038056, "loss/logits": 0.8005797922611236, "step": 69450 }, { "epoch": 0.6946, "grad_norm": 14.875, "grad_norm_var": 0.9205729166666666, "learning_rate": 0.0003, "loss": 10.845, "loss/aux_loss": 0.048060524836182596, "loss/crossentropy": 2.734144937992096, "loss/logits": 0.8050288885831833, "step": 69460 }, { "epoch": 0.6947, "grad_norm": 15.625, "grad_norm_var": 0.60078125, "learning_rate": 0.0003, "loss": 10.8845, "loss/aux_loss": 0.048075083270668985, "loss/crossentropy": 2.705908918380737, "loss/logits": 0.8168601602315902, "step": 69470 }, { "epoch": 0.6948, "grad_norm": 15.5, "grad_norm_var": 0.7356770833333334, "learning_rate": 0.0003, "loss": 10.7593, "loss/aux_loss": 0.048058357648551465, "loss/crossentropy": 2.618184173107147, "loss/logits": 0.8116142481565476, "step": 69480 }, { "epoch": 0.6949, "grad_norm": 16.125, "grad_norm_var": 1.1541015625, "learning_rate": 0.0003, "loss": 10.9445, "loss/aux_loss": 0.0480714937672019, "loss/crossentropy": 2.6427643597126007, "loss/logits": 0.8324632406234741, "step": 69490 }, { "epoch": 0.695, "grad_norm": 15.0625, "grad_norm_var": 0.9432291666666667, "learning_rate": 0.0003, "loss": 10.7763, "loss/aux_loss": 0.048056024312973025, "loss/crossentropy": 2.7632019460201263, "loss/logits": 0.8252893060445785, "step": 69500 }, { "epoch": 0.6951, "grad_norm": 15.25, "grad_norm_var": 0.4384765625, "learning_rate": 0.0003, "loss": 10.904, "loss/aux_loss": 0.04805950913578272, "loss/crossentropy": 2.741923874616623, "loss/logits": 0.8043259769678116, "step": 69510 }, { "epoch": 0.6952, "grad_norm": 14.8125, "grad_norm_var": 0.211962890625, "learning_rate": 0.0003, "loss": 10.8344, "loss/aux_loss": 0.04806842133402824, "loss/crossentropy": 2.824173706769943, "loss/logits": 0.8347906857728958, "step": 69520 }, { "epoch": 0.6953, "grad_norm": 15.75, "grad_norm_var": 0.379541015625, "learning_rate": 0.0003, "loss": 10.9516, "loss/aux_loss": 0.04806260485202074, "loss/crossentropy": 2.81458033323288, "loss/logits": 0.8338780552148819, "step": 69530 }, { "epoch": 0.6954, "grad_norm": 15.4375, "grad_norm_var": 0.35201822916666664, "learning_rate": 0.0003, "loss": 10.7228, "loss/aux_loss": 0.04807497151196003, "loss/crossentropy": 2.718851935863495, "loss/logits": 0.7873619675636292, "step": 69540 }, { "epoch": 0.6955, "grad_norm": 15.25, "grad_norm_var": 0.40358072916666665, "learning_rate": 0.0003, "loss": 11.0079, "loss/aux_loss": 0.04806389175355434, "loss/crossentropy": 2.8200283885002135, "loss/logits": 0.8427982300519943, "step": 69550 }, { "epoch": 0.6956, "grad_norm": 16.125, "grad_norm_var": 0.8676432291666667, "learning_rate": 0.0003, "loss": 10.9712, "loss/aux_loss": 0.04806965868920088, "loss/crossentropy": 2.617030918598175, "loss/logits": 0.7800733983516693, "step": 69560 }, { "epoch": 0.6957, "grad_norm": 15.25, "grad_norm_var": 0.790869140625, "learning_rate": 0.0003, "loss": 10.6778, "loss/aux_loss": 0.0480612862855196, "loss/crossentropy": 2.6069294095039366, "loss/logits": 0.7806821346282959, "step": 69570 }, { "epoch": 0.6958, "grad_norm": 15.3125, "grad_norm_var": 0.493994140625, "learning_rate": 0.0003, "loss": 10.7764, "loss/aux_loss": 0.04807495810091496, "loss/crossentropy": 2.655094450712204, "loss/logits": 0.8132032155990601, "step": 69580 }, { "epoch": 0.6959, "grad_norm": 15.875, "grad_norm_var": 0.35052083333333334, "learning_rate": 0.0003, "loss": 10.6866, "loss/aux_loss": 0.04805904570966959, "loss/crossentropy": 2.5849641382694246, "loss/logits": 0.7947615712881089, "step": 69590 }, { "epoch": 0.696, "grad_norm": 15.625, "grad_norm_var": 0.5973958333333333, "learning_rate": 0.0003, "loss": 10.9278, "loss/aux_loss": 0.048071693442761895, "loss/crossentropy": 2.529132205247879, "loss/logits": 0.7914834886789321, "step": 69600 }, { "epoch": 0.6961, "grad_norm": 15.5, "grad_norm_var": 0.7426432291666667, "learning_rate": 0.0003, "loss": 10.7448, "loss/aux_loss": 0.048061798140406606, "loss/crossentropy": 2.5973862528800966, "loss/logits": 0.762446054816246, "step": 69610 }, { "epoch": 0.6962, "grad_norm": 15.625, "grad_norm_var": 0.7150390625, "learning_rate": 0.0003, "loss": 10.7611, "loss/aux_loss": 0.04808028992265463, "loss/crossentropy": 2.580735170841217, "loss/logits": 0.8113909959793091, "step": 69620 }, { "epoch": 0.6963, "grad_norm": 14.375, "grad_norm_var": 0.9327473958333333, "learning_rate": 0.0003, "loss": 10.7292, "loss/aux_loss": 0.04805351886898279, "loss/crossentropy": 2.6865119695663453, "loss/logits": 0.7755003601312638, "step": 69630 }, { "epoch": 0.6964, "grad_norm": 14.4375, "grad_norm_var": 0.8343098958333334, "learning_rate": 0.0003, "loss": 10.8817, "loss/aux_loss": 0.048071601428091526, "loss/crossentropy": 2.7158267498016357, "loss/logits": 0.813782611489296, "step": 69640 }, { "epoch": 0.6965, "grad_norm": 15.4375, "grad_norm_var": 0.877978515625, "learning_rate": 0.0003, "loss": 10.7957, "loss/aux_loss": 0.048066675662994385, "loss/crossentropy": 2.715426343679428, "loss/logits": 0.8276042312383651, "step": 69650 }, { "epoch": 0.6966, "grad_norm": 15.8125, "grad_norm_var": 0.6184895833333334, "learning_rate": 0.0003, "loss": 10.9359, "loss/aux_loss": 0.048063802719116214, "loss/crossentropy": 2.6631912708282472, "loss/logits": 0.8037597626447678, "step": 69660 }, { "epoch": 0.6967, "grad_norm": 15.5625, "grad_norm_var": 1.0847493489583333, "learning_rate": 0.0003, "loss": 11.0285, "loss/aux_loss": 0.0480733547359705, "loss/crossentropy": 2.6649845838546753, "loss/logits": 0.824079555273056, "step": 69670 }, { "epoch": 0.6968, "grad_norm": 19.75, "grad_norm_var": 2.1988932291666665, "learning_rate": 0.0003, "loss": 10.7858, "loss/aux_loss": 0.04807499777525663, "loss/crossentropy": 2.559821057319641, "loss/logits": 0.8164191097021103, "step": 69680 }, { "epoch": 0.6969, "grad_norm": 16.0, "grad_norm_var": 1.6702473958333333, "learning_rate": 0.0003, "loss": 10.7732, "loss/aux_loss": 0.048064416646957396, "loss/crossentropy": 2.5081125438213348, "loss/logits": 0.7806854665279388, "step": 69690 }, { "epoch": 0.697, "grad_norm": 17.125, "grad_norm_var": 0.537744140625, "learning_rate": 0.0003, "loss": 10.911, "loss/aux_loss": 0.048071997612714766, "loss/crossentropy": 2.798080360889435, "loss/logits": 0.849156191945076, "step": 69700 }, { "epoch": 0.6971, "grad_norm": 15.5, "grad_norm_var": 0.8555826822916667, "learning_rate": 0.0003, "loss": 10.7062, "loss/aux_loss": 0.048063857667148116, "loss/crossentropy": 2.721605783700943, "loss/logits": 0.7785751849412919, "step": 69710 }, { "epoch": 0.6972, "grad_norm": 15.5625, "grad_norm_var": 0.411572265625, "learning_rate": 0.0003, "loss": 10.8278, "loss/aux_loss": 0.048065428622066975, "loss/crossentropy": 2.7682719230651855, "loss/logits": 0.8334241211414337, "step": 69720 }, { "epoch": 0.6973, "grad_norm": 15.25, "grad_norm_var": 0.6947265625, "learning_rate": 0.0003, "loss": 10.7517, "loss/aux_loss": 0.04808158706873655, "loss/crossentropy": 2.7136940717697144, "loss/logits": 0.799360203742981, "step": 69730 }, { "epoch": 0.6974, "grad_norm": 15.8125, "grad_norm_var": 0.6452473958333333, "learning_rate": 0.0003, "loss": 10.871, "loss/aux_loss": 0.04807530529797077, "loss/crossentropy": 2.723317527770996, "loss/logits": 0.7894266813993454, "step": 69740 }, { "epoch": 0.6975, "grad_norm": 15.8125, "grad_norm_var": 140.85598958333333, "learning_rate": 0.0003, "loss": 10.8967, "loss/aux_loss": 0.048066935315728185, "loss/crossentropy": 2.7271577537059786, "loss/logits": 0.8186393707990647, "step": 69750 }, { "epoch": 0.6976, "grad_norm": 17.125, "grad_norm_var": 3.801806640625, "learning_rate": 0.0003, "loss": 10.8478, "loss/aux_loss": 0.04806602392345667, "loss/crossentropy": 2.499061381816864, "loss/logits": 0.7904832571744919, "step": 69760 }, { "epoch": 0.6977, "grad_norm": 15.1875, "grad_norm_var": 0.8098307291666667, "learning_rate": 0.0003, "loss": 10.9179, "loss/aux_loss": 0.04806220382452011, "loss/crossentropy": 2.8025425612926482, "loss/logits": 0.8429517328739167, "step": 69770 }, { "epoch": 0.6978, "grad_norm": 15.4375, "grad_norm_var": 0.5463541666666667, "learning_rate": 0.0003, "loss": 10.8747, "loss/aux_loss": 0.0480809373781085, "loss/crossentropy": 2.549819737672806, "loss/logits": 0.7724178716540336, "step": 69780 }, { "epoch": 0.6979, "grad_norm": 14.4375, "grad_norm_var": 0.5058430989583333, "learning_rate": 0.0003, "loss": 10.7673, "loss/aux_loss": 0.048068621568381785, "loss/crossentropy": 2.629793846607208, "loss/logits": 0.8028360933065415, "step": 69790 }, { "epoch": 0.698, "grad_norm": 14.9375, "grad_norm_var": 0.8476399739583333, "learning_rate": 0.0003, "loss": 10.6601, "loss/aux_loss": 0.048068128526210785, "loss/crossentropy": 2.6547737777233125, "loss/logits": 0.8297581821680069, "step": 69800 }, { "epoch": 0.6981, "grad_norm": 15.8125, "grad_norm_var": 0.863916015625, "learning_rate": 0.0003, "loss": 10.9559, "loss/aux_loss": 0.04806609004735947, "loss/crossentropy": 2.636369228363037, "loss/logits": 0.8080786511301994, "step": 69810 }, { "epoch": 0.6982, "grad_norm": 16.375, "grad_norm_var": 16.676416015625, "learning_rate": 0.0003, "loss": 10.9601, "loss/aux_loss": 0.04808267876505852, "loss/crossentropy": 2.6405935764312742, "loss/logits": 0.7962011188268662, "step": 69820 }, { "epoch": 0.6983, "grad_norm": 15.4375, "grad_norm_var": 1.4374348958333334, "learning_rate": 0.0003, "loss": 10.7869, "loss/aux_loss": 0.04806174710392952, "loss/crossentropy": 2.7505713582038878, "loss/logits": 0.7872932314872741, "step": 69830 }, { "epoch": 0.6984, "grad_norm": 14.9375, "grad_norm_var": 0.9222493489583333, "learning_rate": 0.0003, "loss": 10.9906, "loss/aux_loss": 0.04806131403893232, "loss/crossentropy": 2.8466971039772035, "loss/logits": 0.8570821315050126, "step": 69840 }, { "epoch": 0.6985, "grad_norm": 15.25, "grad_norm_var": 0.22389322916666668, "learning_rate": 0.0003, "loss": 10.897, "loss/aux_loss": 0.048060120269656184, "loss/crossentropy": 2.6072149515151977, "loss/logits": 0.8024741411209106, "step": 69850 }, { "epoch": 0.6986, "grad_norm": 14.9375, "grad_norm_var": 0.25201822916666666, "learning_rate": 0.0003, "loss": 10.7511, "loss/aux_loss": 0.048070829920470716, "loss/crossentropy": 2.805588722229004, "loss/logits": 0.8278827935457229, "step": 69860 }, { "epoch": 0.6987, "grad_norm": 16.125, "grad_norm_var": 0.35154622395833335, "learning_rate": 0.0003, "loss": 10.905, "loss/aux_loss": 0.04806218836456537, "loss/crossentropy": 2.727338945865631, "loss/logits": 0.7851577132940293, "step": 69870 }, { "epoch": 0.6988, "grad_norm": 15.375, "grad_norm_var": 0.5751139322916666, "learning_rate": 0.0003, "loss": 10.75, "loss/aux_loss": 0.04807434901595116, "loss/crossentropy": 2.7194652020931245, "loss/logits": 0.8074722796678543, "step": 69880 }, { "epoch": 0.6989, "grad_norm": 15.6875, "grad_norm_var": 0.5465983072916667, "learning_rate": 0.0003, "loss": 10.7433, "loss/aux_loss": 0.048061872646212575, "loss/crossentropy": 2.685518753528595, "loss/logits": 0.8617990851402283, "step": 69890 }, { "epoch": 0.699, "grad_norm": 14.9375, "grad_norm_var": 0.4423014322916667, "learning_rate": 0.0003, "loss": 10.7309, "loss/aux_loss": 0.04807621408253908, "loss/crossentropy": 2.622563087940216, "loss/logits": 0.809404906630516, "step": 69900 }, { "epoch": 0.6991, "grad_norm": 14.75, "grad_norm_var": 0.31417643229166664, "learning_rate": 0.0003, "loss": 10.9159, "loss/aux_loss": 0.04805949460715055, "loss/crossentropy": 2.748386710882187, "loss/logits": 0.8233764231204986, "step": 69910 }, { "epoch": 0.6992, "grad_norm": 15.25, "grad_norm_var": 0.4359212239583333, "learning_rate": 0.0003, "loss": 10.7166, "loss/aux_loss": 0.04807121455669403, "loss/crossentropy": 2.661421650648117, "loss/logits": 0.7985509872436524, "step": 69920 }, { "epoch": 0.6993, "grad_norm": 15.9375, "grad_norm_var": 1.0502604166666667, "learning_rate": 0.0003, "loss": 10.8889, "loss/aux_loss": 0.048076589964330195, "loss/crossentropy": 2.886363685131073, "loss/logits": 0.8310510069131851, "step": 69930 }, { "epoch": 0.6994, "grad_norm": 14.875, "grad_norm_var": 1.1340983072916666, "learning_rate": 0.0003, "loss": 10.9695, "loss/aux_loss": 0.04806211348623037, "loss/crossentropy": 2.730562311410904, "loss/logits": 0.8359936803579331, "step": 69940 }, { "epoch": 0.6995, "grad_norm": 15.0, "grad_norm_var": 0.21261393229166667, "learning_rate": 0.0003, "loss": 10.8209, "loss/aux_loss": 0.04806340225040913, "loss/crossentropy": 2.6769157886505126, "loss/logits": 0.7736663967370987, "step": 69950 }, { "epoch": 0.6996, "grad_norm": 17.25, "grad_norm_var": 0.4559733072916667, "learning_rate": 0.0003, "loss": 10.8065, "loss/aux_loss": 0.048078119195997714, "loss/crossentropy": 2.6120175421237946, "loss/logits": 0.8157159000635147, "step": 69960 }, { "epoch": 0.6997, "grad_norm": 15.9375, "grad_norm_var": 0.9344889322916666, "learning_rate": 0.0003, "loss": 10.9741, "loss/aux_loss": 0.04807462692260742, "loss/crossentropy": 2.7217097640037538, "loss/logits": 0.806912750005722, "step": 69970 }, { "epoch": 0.6998, "grad_norm": 14.4375, "grad_norm_var": 0.9408854166666667, "learning_rate": 0.0003, "loss": 10.7853, "loss/aux_loss": 0.04805513937026262, "loss/crossentropy": 2.603360629081726, "loss/logits": 0.8143667846918106, "step": 69980 }, { "epoch": 0.6999, "grad_norm": 15.6875, "grad_norm_var": 0.5075358072916667, "learning_rate": 0.0003, "loss": 10.9126, "loss/aux_loss": 0.04806795883923769, "loss/crossentropy": 2.860744071006775, "loss/logits": 0.8322966694831848, "step": 69990 }, { "epoch": 0.7, "grad_norm": 16.125, "grad_norm_var": 0.4610514322916667, "learning_rate": 0.0003, "loss": 10.8683, "loss/aux_loss": 0.04808022417128086, "loss/crossentropy": 2.708540141582489, "loss/logits": 0.8456598520278931, "step": 70000 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9787190713817498e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }