diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,48018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.04, + "eval_steps": 1000, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1e-05, + "grad_norm": 0.37634041905403137, + "learning_rate": 5e-06, + "loss": 0.169, + "loss/crossentropy": 2.8720462918281555, + "loss/hidden": 0.0, + "loss/logits": 0.16897856071591377, + "loss/reg": 4.4040703773498535, + "step": 1 + }, + { + "epoch": 2e-05, + "grad_norm": 0.35649582743644714, + "learning_rate": 1e-05, + "loss": 0.1696, + "loss/crossentropy": 2.715533673763275, + "loss/hidden": 0.0, + "loss/logits": 0.1695844642817974, + "loss/reg": 4.399058818817139, + "step": 2 + }, + { + "epoch": 3e-05, + "grad_norm": 0.3591013252735138, + "learning_rate": 1.5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.6291310787200928, + "loss/hidden": 0.0, + "loss/logits": 0.1782267540693283, + "loss/reg": 4.394084930419922, + "step": 3 + }, + { + "epoch": 4e-05, + "grad_norm": 0.36401960253715515, + "learning_rate": 2e-05, + "loss": 0.1843, + "loss/crossentropy": 2.7142109274864197, + "loss/hidden": 0.0, + "loss/logits": 0.1843317598104477, + "loss/reg": 4.389005661010742, + "step": 4 + }, + { + "epoch": 5e-05, + "grad_norm": 0.3119131922721863, + "learning_rate": 2.5e-05, + "loss": 0.1625, + "loss/crossentropy": 2.7586326003074646, + "loss/hidden": 0.0, + "loss/logits": 0.1625315584242344, + "loss/reg": 4.3841166496276855, + "step": 5 + }, + { + "epoch": 6e-05, + "grad_norm": 0.3388400673866272, + "learning_rate": 3e-05, + "loss": 0.1844, + "loss/crossentropy": 2.8104345202445984, + "loss/hidden": 0.0, + "loss/logits": 0.1844346523284912, + "loss/reg": 4.3792877197265625, + "step": 6 + }, + { + "epoch": 7e-05, + "grad_norm": 0.4783320426940918, + "learning_rate": 3.5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.8321655988693237, + "loss/hidden": 0.0, + "loss/logits": 0.18431555479764938, + "loss/reg": 4.37478494644165, + "step": 7 + }, + { + "epoch": 8e-05, + "grad_norm": 0.29636114835739136, + "learning_rate": 4e-05, + "loss": 0.1589, + "loss/crossentropy": 2.6809526681900024, + "loss/hidden": 0.0, + "loss/logits": 0.15894119441509247, + "loss/reg": 4.370139122009277, + "step": 8 + }, + { + "epoch": 9e-05, + "grad_norm": 0.30071625113487244, + "learning_rate": 4.5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.6759764552116394, + "loss/hidden": 0.0, + "loss/logits": 0.16574353352189064, + "loss/reg": 4.365106105804443, + "step": 9 + }, + { + "epoch": 0.0001, + "grad_norm": 0.28883349895477295, + "learning_rate": 5e-05, + "loss": 0.1572, + "loss/crossentropy": 2.808637499809265, + "loss/hidden": 0.0, + "loss/logits": 0.15719739720225334, + "loss/reg": 4.360220909118652, + "step": 10 + }, + { + "epoch": 0.00011, + "grad_norm": 0.28243017196655273, + "learning_rate": 5e-05, + "loss": 0.1426, + "loss/crossentropy": 2.72423392534256, + "loss/hidden": 0.0, + "loss/logits": 0.14257685840129852, + "loss/reg": 4.355813503265381, + "step": 11 + }, + { + "epoch": 0.00012, + "grad_norm": 0.31152331829071045, + "learning_rate": 5e-05, + "loss": 0.147, + "loss/crossentropy": 2.710044264793396, + "loss/hidden": 0.0, + "loss/logits": 0.14701137319207191, + "loss/reg": 4.351265907287598, + "step": 12 + }, + { + "epoch": 0.00013, + "grad_norm": 0.2739678919315338, + "learning_rate": 5e-05, + "loss": 0.1499, + "loss/crossentropy": 2.7644649744033813, + "loss/hidden": 0.0, + "loss/logits": 0.149860430508852, + "loss/reg": 4.346287727355957, + "step": 13 + }, + { + "epoch": 0.00014, + "grad_norm": 0.2712353467941284, + "learning_rate": 5e-05, + "loss": 0.1454, + "loss/crossentropy": 2.7370432019233704, + "loss/hidden": 0.0, + "loss/logits": 0.14539287611842155, + "loss/reg": 4.340969085693359, + "step": 14 + }, + { + "epoch": 0.00015, + "grad_norm": 0.2667863667011261, + "learning_rate": 5e-05, + "loss": 0.1403, + "loss/crossentropy": 2.5638718008995056, + "loss/hidden": 0.0, + "loss/logits": 0.14029696956276894, + "loss/reg": 4.336019515991211, + "step": 15 + }, + { + "epoch": 0.00016, + "grad_norm": 0.30467212200164795, + "grad_norm_var": 0.0029449483710212204, + "learning_rate": 5e-05, + "loss": 0.1361, + "loss/crossentropy": 2.797445595264435, + "loss/hidden": 0.0, + "loss/logits": 0.13607431203126907, + "loss/reg": 4.330692291259766, + "step": 16 + }, + { + "epoch": 0.00017, + "grad_norm": 0.2617621421813965, + "grad_norm_var": 0.0029635281595075556, + "learning_rate": 5e-05, + "loss": 0.1443, + "loss/crossentropy": 2.7542406916618347, + "loss/hidden": 0.0, + "loss/logits": 0.14427556470036507, + "loss/reg": 4.325323581695557, + "step": 17 + }, + { + "epoch": 0.00018, + "grad_norm": 0.28648674488067627, + "grad_norm_var": 0.0028982593896559215, + "learning_rate": 5e-05, + "loss": 0.1396, + "loss/crossentropy": 2.674492835998535, + "loss/hidden": 0.0, + "loss/logits": 0.13961521908640862, + "loss/reg": 4.31995153427124, + "step": 18 + }, + { + "epoch": 0.00019, + "grad_norm": 0.26269060373306274, + "grad_norm_var": 0.002877724259904054, + "learning_rate": 5e-05, + "loss": 0.141, + "loss/crossentropy": 2.8323662281036377, + "loss/hidden": 0.0, + "loss/logits": 0.14103225618600845, + "loss/reg": 4.315446853637695, + "step": 19 + }, + { + "epoch": 0.0002, + "grad_norm": 0.2718074321746826, + "grad_norm_var": 0.0026993307095730186, + "learning_rate": 5e-05, + "loss": 0.1314, + "loss/crossentropy": 2.63212913274765, + "loss/hidden": 0.0, + "loss/logits": 0.1313977725803852, + "loss/reg": 4.310704708099365, + "step": 20 + }, + { + "epoch": 0.00021, + "grad_norm": 0.2430431842803955, + "grad_norm_var": 0.0028911751903802204, + "learning_rate": 5e-05, + "loss": 0.1324, + "loss/crossentropy": 2.664808928966522, + "loss/hidden": 0.0, + "loss/logits": 0.1324238833039999, + "loss/reg": 4.305792808532715, + "step": 21 + }, + { + "epoch": 0.00022, + "grad_norm": 0.24898661673069, + "grad_norm_var": 0.00288514612507397, + "learning_rate": 5e-05, + "loss": 0.1242, + "loss/crossentropy": 2.7142711877822876, + "loss/hidden": 0.0, + "loss/logits": 0.12423932552337646, + "loss/reg": 4.300712585449219, + "step": 22 + }, + { + "epoch": 0.00023, + "grad_norm": 0.3123313784599304, + "grad_norm_var": 0.0004523056580034851, + "learning_rate": 5e-05, + "loss": 0.1321, + "loss/crossentropy": 2.7829225063323975, + "loss/hidden": 0.0, + "loss/logits": 0.13212688639760017, + "loss/reg": 4.295501232147217, + "step": 23 + }, + { + "epoch": 0.00024, + "grad_norm": 0.25187963247299194, + "grad_norm_var": 0.00048027979198491945, + "learning_rate": 5e-05, + "loss": 0.1248, + "loss/crossentropy": 2.692659854888916, + "loss/hidden": 0.0, + "loss/logits": 0.12482420355081558, + "loss/reg": 4.2908830642700195, + "step": 24 + }, + { + "epoch": 0.00025, + "grad_norm": 0.2151177078485489, + "grad_norm_var": 0.0006726495064564575, + "learning_rate": 5e-05, + "loss": 0.1232, + "loss/crossentropy": 2.738182246685028, + "loss/hidden": 0.0, + "loss/logits": 0.1231868714094162, + "loss/reg": 4.285846710205078, + "step": 25 + }, + { + "epoch": 0.00026, + "grad_norm": 0.23308518528938293, + "grad_norm_var": 0.0007424884519799501, + "learning_rate": 5e-05, + "loss": 0.1174, + "loss/crossentropy": 2.555102586746216, + "loss/hidden": 0.0, + "loss/logits": 0.11737299524247646, + "loss/reg": 4.281113147735596, + "step": 26 + }, + { + "epoch": 0.00027, + "grad_norm": 0.24523235857486725, + "grad_norm_var": 0.0007604384721796281, + "learning_rate": 5e-05, + "loss": 0.1201, + "loss/crossentropy": 2.6816893815994263, + "loss/hidden": 0.0, + "loss/logits": 0.12014555744826794, + "loss/reg": 4.2765069007873535, + "step": 27 + }, + { + "epoch": 0.00028, + "grad_norm": 0.25897473096847534, + "grad_norm_var": 0.0006160828367585275, + "learning_rate": 5e-05, + "loss": 0.1227, + "loss/crossentropy": 2.7505548000335693, + "loss/hidden": 0.0, + "loss/logits": 0.12271320074796677, + "loss/reg": 4.27158260345459, + "step": 28 + }, + { + "epoch": 0.00029, + "grad_norm": 0.23087331652641296, + "grad_norm_var": 0.0006691547004593392, + "learning_rate": 5e-05, + "loss": 0.1181, + "loss/crossentropy": 2.8483291268348694, + "loss/hidden": 0.0, + "loss/logits": 0.11810225620865822, + "loss/reg": 4.267061233520508, + "step": 29 + }, + { + "epoch": 0.0003, + "grad_norm": 1.2210192680358887, + "grad_norm_var": 0.05843327221954173, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.8535077571868896, + "loss/hidden": 0.0, + "loss/logits": 0.17234252952039242, + "loss/reg": 4.262645244598389, + "step": 30 + }, + { + "epoch": 0.00031, + "grad_norm": 0.2712586224079132, + "grad_norm_var": 0.058402986662709634, + "learning_rate": 5e-05, + "loss": 0.1156, + "loss/crossentropy": 2.6525614261627197, + "loss/hidden": 0.0, + "loss/logits": 0.11560441367328167, + "loss/reg": 4.258092403411865, + "step": 31 + }, + { + "epoch": 0.00032, + "grad_norm": 0.5226843953132629, + "grad_norm_var": 0.06092943089461011, + "learning_rate": 5e-05, + "loss": 0.1537, + "loss/crossentropy": 2.6228127479553223, + "loss/hidden": 0.0, + "loss/logits": 0.15369537472724915, + "loss/reg": 4.253781318664551, + "step": 32 + }, + { + "epoch": 0.00033, + "grad_norm": 0.35246461629867554, + "grad_norm_var": 0.06057510886832484, + "learning_rate": 5e-05, + "loss": 0.1216, + "loss/crossentropy": 2.6986429691314697, + "loss/hidden": 0.0, + "loss/logits": 0.12163393199443817, + "loss/reg": 4.249208450317383, + "step": 33 + }, + { + "epoch": 0.00034, + "grad_norm": 0.2868311405181885, + "grad_norm_var": 0.060572693607631393, + "learning_rate": 5e-05, + "loss": 0.1215, + "loss/crossentropy": 2.7423174381256104, + "loss/hidden": 0.0, + "loss/logits": 0.12151895463466644, + "loss/reg": 4.244677543640137, + "step": 34 + }, + { + "epoch": 0.00035, + "grad_norm": 0.2556142210960388, + "grad_norm_var": 0.06064807497415105, + "learning_rate": 5e-05, + "loss": 0.1137, + "loss/crossentropy": 2.7171207070350647, + "loss/hidden": 0.0, + "loss/logits": 0.1137176975607872, + "loss/reg": 4.2399797439575195, + "step": 35 + }, + { + "epoch": 0.00036, + "grad_norm": 0.2783287763595581, + "grad_norm_var": 0.060592460146055585, + "learning_rate": 5e-05, + "loss": 0.1138, + "loss/crossentropy": 2.7394094467163086, + "loss/hidden": 0.0, + "loss/logits": 0.11381806619465351, + "loss/reg": 4.235424041748047, + "step": 36 + }, + { + "epoch": 0.00037, + "grad_norm": 0.3065175712108612, + "grad_norm_var": 0.06003019540430902, + "learning_rate": 5e-05, + "loss": 0.1235, + "loss/crossentropy": 2.755502223968506, + "loss/hidden": 0.0, + "loss/logits": 0.12348765879869461, + "loss/reg": 4.2310051918029785, + "step": 37 + }, + { + "epoch": 0.00038, + "grad_norm": 0.26492562890052795, + "grad_norm_var": 0.059845851287469956, + "learning_rate": 5e-05, + "loss": 0.1119, + "loss/crossentropy": 2.8106552362442017, + "loss/hidden": 0.0, + "loss/logits": 0.11191634088754654, + "loss/reg": 4.226707935333252, + "step": 38 + }, + { + "epoch": 0.00039, + "grad_norm": 0.24673967063426971, + "grad_norm_var": 0.06039341868271975, + "learning_rate": 5e-05, + "loss": 0.1161, + "loss/crossentropy": 2.7490118741989136, + "loss/hidden": 0.0, + "loss/logits": 0.11609707958996296, + "loss/reg": 4.222842216491699, + "step": 39 + }, + { + "epoch": 0.0004, + "grad_norm": 0.2973298132419586, + "grad_norm_var": 0.05998792869591778, + "learning_rate": 5e-05, + "loss": 0.1124, + "loss/crossentropy": 2.7798808813095093, + "loss/hidden": 0.0, + "loss/logits": 0.11244922317564487, + "loss/reg": 4.218531131744385, + "step": 40 + }, + { + "epoch": 0.00041, + "grad_norm": 0.7517657279968262, + "grad_norm_var": 0.06884148715130983, + "learning_rate": 5e-05, + "loss": 0.1545, + "loss/crossentropy": 2.749855697154999, + "loss/hidden": 0.0, + "loss/logits": 0.15445118583738804, + "loss/reg": 4.214253902435303, + "step": 41 + }, + { + "epoch": 0.00042, + "grad_norm": 0.2417730987071991, + "grad_norm_var": 0.06868010027414732, + "learning_rate": 5e-05, + "loss": 0.1099, + "loss/crossentropy": 2.751042366027832, + "loss/hidden": 0.0, + "loss/logits": 0.1099155992269516, + "loss/reg": 4.2101359367370605, + "step": 42 + }, + { + "epoch": 0.00043, + "grad_norm": 0.2631951570510864, + "grad_norm_var": 0.06838462807177058, + "learning_rate": 5e-05, + "loss": 0.1165, + "loss/crossentropy": 2.7250843048095703, + "loss/hidden": 0.0, + "loss/logits": 0.11648696288466454, + "loss/reg": 4.206397533416748, + "step": 43 + }, + { + "epoch": 0.00044, + "grad_norm": 0.2518296241760254, + "grad_norm_var": 0.06850134865244813, + "learning_rate": 5e-05, + "loss": 0.1111, + "loss/crossentropy": 2.7153283953666687, + "loss/hidden": 0.0, + "loss/logits": 0.11108221486210823, + "loss/reg": 4.201878547668457, + "step": 44 + }, + { + "epoch": 0.00045, + "grad_norm": 0.24082158505916595, + "grad_norm_var": 0.06831278207672915, + "learning_rate": 5e-05, + "loss": 0.1177, + "loss/crossentropy": 2.6632660627365112, + "loss/hidden": 0.0, + "loss/logits": 0.11769118346273899, + "loss/reg": 4.19778299331665, + "step": 45 + }, + { + "epoch": 0.00046, + "grad_norm": 0.260890394449234, + "grad_norm_var": 0.018048092726357542, + "learning_rate": 5e-05, + "loss": 0.1227, + "loss/crossentropy": 2.7315176129341125, + "loss/hidden": 0.0, + "loss/logits": 0.12269957736134529, + "loss/reg": 4.193592071533203, + "step": 46 + }, + { + "epoch": 0.00047, + "grad_norm": 0.25268790125846863, + "grad_norm_var": 0.018186152495949234, + "learning_rate": 5e-05, + "loss": 0.1178, + "loss/crossentropy": 2.774504065513611, + "loss/hidden": 0.0, + "loss/logits": 0.11776839196681976, + "loss/reg": 4.189169406890869, + "step": 47 + }, + { + "epoch": 0.00048, + "grad_norm": 0.2759403884410858, + "grad_norm_var": 0.015229396543742831, + "learning_rate": 5e-05, + "loss": 0.1289, + "loss/crossentropy": 2.8515073657035828, + "loss/hidden": 0.0, + "loss/logits": 0.12885254248976707, + "loss/reg": 4.185054779052734, + "step": 48 + }, + { + "epoch": 0.00049, + "grad_norm": 0.24765782058238983, + "grad_norm_var": 0.015206926335741973, + "learning_rate": 5e-05, + "loss": 0.1256, + "loss/crossentropy": 2.7131593823432922, + "loss/hidden": 0.0, + "loss/logits": 0.1256290916353464, + "loss/reg": 4.1810526847839355, + "step": 49 + }, + { + "epoch": 0.0005, + "grad_norm": 0.3096969425678253, + "grad_norm_var": 0.015214156358291781, + "learning_rate": 5e-05, + "loss": 0.1401, + "loss/crossentropy": 2.7528311014175415, + "loss/hidden": 0.0, + "loss/logits": 0.14005928859114647, + "loss/reg": 4.176880359649658, + "step": 50 + }, + { + "epoch": 0.00051, + "grad_norm": 0.33225017786026, + "grad_norm_var": 0.015162352298149247, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.73341304063797, + "loss/hidden": 0.0, + "loss/logits": 0.1618291698396206, + "loss/reg": 4.173260688781738, + "step": 51 + }, + { + "epoch": 0.00052, + "grad_norm": 0.33166685700416565, + "grad_norm_var": 0.015176107188209845, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.824883460998535, + "loss/hidden": 0.0, + "loss/logits": 0.1703827939927578, + "loss/reg": 4.168625354766846, + "step": 52 + }, + { + "epoch": 0.00053, + "grad_norm": 0.4255874752998352, + "grad_norm_var": 0.01609058098027729, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.8565452694892883, + "loss/hidden": 0.0, + "loss/logits": 0.18561138212680817, + "loss/reg": 4.164296627044678, + "step": 53 + }, + { + "epoch": 0.00054, + "grad_norm": 0.33207008242607117, + "grad_norm_var": 0.015949373509081675, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.7211243510246277, + "loss/hidden": 0.0, + "loss/logits": 0.1762254200875759, + "loss/reg": 4.16010856628418, + "step": 54 + }, + { + "epoch": 0.00055, + "grad_norm": 0.3105420470237732, + "grad_norm_var": 0.01561146008609899, + "learning_rate": 5e-05, + "loss": 0.172, + "loss/crossentropy": 2.7821205854415894, + "loss/hidden": 0.0, + "loss/logits": 0.17203472182154655, + "loss/reg": 4.155950546264648, + "step": 55 + }, + { + "epoch": 0.00056, + "grad_norm": 0.3342844247817993, + "grad_norm_var": 0.015583353488029018, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.783965766429901, + "loss/hidden": 0.0, + "loss/logits": 0.1675088219344616, + "loss/reg": 4.151437759399414, + "step": 56 + }, + { + "epoch": 0.00057, + "grad_norm": 0.3392151892185211, + "grad_norm_var": 0.0026173613848745727, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.782883048057556, + "loss/hidden": 0.0, + "loss/logits": 0.16754426062107086, + "loss/reg": 4.1469950675964355, + "step": 57 + }, + { + "epoch": 0.00058, + "grad_norm": 0.46169230341911316, + "grad_norm_var": 0.004024211017059094, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.6869139075279236, + "loss/hidden": 0.0, + "loss/logits": 0.18278859555721283, + "loss/reg": 4.142712116241455, + "step": 58 + }, + { + "epoch": 0.00059, + "grad_norm": 0.35874953866004944, + "grad_norm_var": 0.00399056950783742, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.683705747127533, + "loss/hidden": 0.0, + "loss/logits": 0.17896704375743866, + "loss/reg": 4.138728141784668, + "step": 59 + }, + { + "epoch": 0.0006, + "grad_norm": 0.3390788435935974, + "grad_norm_var": 0.0037128700604173097, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.6724974513053894, + "loss/hidden": 0.0, + "loss/logits": 0.18236950412392616, + "loss/reg": 4.1345534324646, + "step": 60 + }, + { + "epoch": 0.00061, + "grad_norm": 0.3341596722602844, + "grad_norm_var": 0.003246451116369023, + "learning_rate": 5e-05, + "loss": 0.1694, + "loss/crossentropy": 2.956072986125946, + "loss/hidden": 0.0, + "loss/logits": 0.16935936734080315, + "loss/reg": 4.130521774291992, + "step": 61 + }, + { + "epoch": 0.00062, + "grad_norm": 0.33658263087272644, + "grad_norm_var": 0.0029283974011622186, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.8409587144851685, + "loss/hidden": 0.0, + "loss/logits": 0.16678539663553238, + "loss/reg": 4.126163005828857, + "step": 62 + }, + { + "epoch": 0.00063, + "grad_norm": 0.33723217248916626, + "grad_norm_var": 0.0024741312804299983, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.7388935685157776, + "loss/hidden": 0.0, + "loss/logits": 0.18555545806884766, + "loss/reg": 4.121931552886963, + "step": 63 + }, + { + "epoch": 0.00064, + "grad_norm": 0.34580445289611816, + "grad_norm_var": 0.0022020224702210757, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.6729788780212402, + "loss/hidden": 0.0, + "loss/logits": 0.16578427329659462, + "loss/reg": 4.117753982543945, + "step": 64 + }, + { + "epoch": 0.00065, + "grad_norm": 0.33867374062538147, + "grad_norm_var": 0.0015716415803633144, + "learning_rate": 5e-05, + "loss": 0.1643, + "loss/crossentropy": 2.8432253003120422, + "loss/hidden": 0.0, + "loss/logits": 0.16425132378935814, + "loss/reg": 4.113894939422607, + "step": 65 + }, + { + "epoch": 0.00066, + "grad_norm": 0.42098623514175415, + "grad_norm_var": 0.001778022217079652, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.6712504625320435, + "loss/hidden": 0.0, + "loss/logits": 0.21550852805376053, + "loss/reg": 4.10945463180542, + "step": 66 + }, + { + "epoch": 0.00067, + "grad_norm": 0.35403043031692505, + "grad_norm_var": 0.0017418631675115888, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.7415149211883545, + "loss/hidden": 0.0, + "loss/logits": 0.1797672137618065, + "loss/reg": 4.1049418449401855, + "step": 67 + }, + { + "epoch": 0.00068, + "grad_norm": 0.34834232926368713, + "grad_norm_var": 0.0017045350753313, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.6858341097831726, + "loss/hidden": 0.0, + "loss/logits": 0.17833665013313293, + "loss/reg": 4.100775718688965, + "step": 68 + }, + { + "epoch": 0.00069, + "grad_norm": 0.3541049063205719, + "grad_norm_var": 0.0013731843169029498, + "learning_rate": 5e-05, + "loss": 0.1744, + "loss/crossentropy": 2.8710713982582092, + "loss/hidden": 0.0, + "loss/logits": 0.1744227409362793, + "loss/reg": 4.096506595611572, + "step": 69 + }, + { + "epoch": 0.0007, + "grad_norm": 0.3736323118209839, + "grad_norm_var": 0.0013660110363047928, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.858128011226654, + "loss/hidden": 0.0, + "loss/logits": 0.19940509647130966, + "loss/reg": 4.091678142547607, + "step": 70 + }, + { + "epoch": 0.00071, + "grad_norm": 0.33025625348091125, + "grad_norm_var": 0.001272272953577754, + "learning_rate": 5e-05, + "loss": 0.1646, + "loss/crossentropy": 2.692229390144348, + "loss/hidden": 0.0, + "loss/logits": 0.16458340734243393, + "loss/reg": 4.087361812591553, + "step": 71 + }, + { + "epoch": 0.00072, + "grad_norm": 0.6907688975334167, + "grad_norm_var": 0.00815051878013667, + "learning_rate": 5e-05, + "loss": 0.1757, + "loss/crossentropy": 2.886055052280426, + "loss/hidden": 0.0, + "loss/logits": 0.1757429726421833, + "loss/reg": 4.08318567276001, + "step": 72 + }, + { + "epoch": 0.00073, + "grad_norm": 0.3311053514480591, + "grad_norm_var": 0.008197602515626375, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.704796850681305, + "loss/hidden": 0.0, + "loss/logits": 0.1681583784520626, + "loss/reg": 4.079033374786377, + "step": 73 + }, + { + "epoch": 0.00074, + "grad_norm": 0.3336332142353058, + "grad_norm_var": 0.0078012237613196535, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.6181225776672363, + "loss/hidden": 0.0, + "loss/logits": 0.16892167925834656, + "loss/reg": 4.074740409851074, + "step": 74 + }, + { + "epoch": 0.00075, + "grad_norm": 0.33766406774520874, + "grad_norm_var": 0.007861895340318493, + "learning_rate": 5e-05, + "loss": 0.1712, + "loss/crossentropy": 2.756729245185852, + "loss/hidden": 0.0, + "loss/logits": 0.17122048512101173, + "loss/reg": 4.070303916931152, + "step": 75 + }, + { + "epoch": 0.00076, + "grad_norm": 0.34048837423324585, + "grad_norm_var": 0.007856372064757134, + "learning_rate": 5e-05, + "loss": 0.1763, + "loss/crossentropy": 2.62674218416214, + "loss/hidden": 0.0, + "loss/logits": 0.17628077790141106, + "loss/reg": 4.065893650054932, + "step": 76 + }, + { + "epoch": 0.00077, + "grad_norm": 0.3368911147117615, + "grad_norm_var": 0.007844070912018693, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.838981509208679, + "loss/hidden": 0.0, + "loss/logits": 0.17892110347747803, + "loss/reg": 4.061193943023682, + "step": 77 + }, + { + "epoch": 0.00078, + "grad_norm": 0.2983826696872711, + "grad_norm_var": 0.008102358070792626, + "learning_rate": 5e-05, + "loss": 0.151, + "loss/crossentropy": 2.8157095909118652, + "loss/hidden": 0.0, + "loss/logits": 0.15098581835627556, + "loss/reg": 4.05631685256958, + "step": 78 + }, + { + "epoch": 0.00079, + "grad_norm": 0.34036847949028015, + "grad_norm_var": 0.008090524798600873, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.742383122444153, + "loss/hidden": 0.0, + "loss/logits": 0.17718595638871193, + "loss/reg": 4.051788330078125, + "step": 79 + }, + { + "epoch": 0.0008, + "grad_norm": 0.3196929097175598, + "grad_norm_var": 0.008207612908988405, + "learning_rate": 5e-05, + "loss": 0.1574, + "loss/crossentropy": 2.64748877286911, + "loss/hidden": 0.0, + "loss/logits": 0.15740340948104858, + "loss/reg": 4.046438694000244, + "step": 80 + }, + { + "epoch": 0.00081, + "grad_norm": 0.3145473897457123, + "grad_norm_var": 0.008330494258097032, + "learning_rate": 5e-05, + "loss": 0.1591, + "loss/crossentropy": 2.7640033960342407, + "loss/hidden": 0.0, + "loss/logits": 0.15912048518657684, + "loss/reg": 4.041863441467285, + "step": 81 + }, + { + "epoch": 0.00082, + "grad_norm": 0.37658828496932983, + "grad_norm_var": 0.008116681055328008, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.8226330876350403, + "loss/hidden": 0.0, + "loss/logits": 0.17833809927105904, + "loss/reg": 4.0372796058654785, + "step": 82 + }, + { + "epoch": 0.00083, + "grad_norm": 0.36421865224838257, + "grad_norm_var": 0.00811331907494814, + "learning_rate": 5e-05, + "loss": 0.1636, + "loss/crossentropy": 2.762717604637146, + "loss/hidden": 0.0, + "loss/logits": 0.16359057649970055, + "loss/reg": 4.032177925109863, + "step": 83 + }, + { + "epoch": 0.00084, + "grad_norm": 0.3138120174407959, + "grad_norm_var": 0.00825034262581384, + "learning_rate": 5e-05, + "loss": 0.1606, + "loss/crossentropy": 2.625426709651947, + "loss/hidden": 0.0, + "loss/logits": 0.16061001271009445, + "loss/reg": 4.027446269989014, + "step": 84 + }, + { + "epoch": 0.00085, + "grad_norm": 0.34441590309143066, + "grad_norm_var": 0.00826351514204321, + "learning_rate": 5e-05, + "loss": 0.1667, + "loss/crossentropy": 2.8294222950935364, + "loss/hidden": 0.0, + "loss/logits": 0.16673466563224792, + "loss/reg": 4.022748947143555, + "step": 85 + }, + { + "epoch": 0.00086, + "grad_norm": 0.316683828830719, + "grad_norm_var": 0.00835627592765974, + "learning_rate": 5e-05, + "loss": 0.1564, + "loss/crossentropy": 2.8250383734703064, + "loss/hidden": 0.0, + "loss/logits": 0.1564498096704483, + "loss/reg": 4.017378330230713, + "step": 86 + }, + { + "epoch": 0.00087, + "grad_norm": 0.3178180456161499, + "grad_norm_var": 0.008407967451986308, + "learning_rate": 5e-05, + "loss": 0.1589, + "loss/crossentropy": 2.831330358982086, + "loss/hidden": 0.0, + "loss/logits": 0.15890633687376976, + "loss/reg": 4.012408256530762, + "step": 87 + }, + { + "epoch": 0.00088, + "grad_norm": 0.33865824341773987, + "grad_norm_var": 0.00038455914158520567, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.8202422857284546, + "loss/hidden": 0.0, + "loss/logits": 0.16647625714540482, + "loss/reg": 4.00655460357666, + "step": 88 + }, + { + "epoch": 0.00089, + "grad_norm": 0.33375900983810425, + "grad_norm_var": 0.00038439593085719167, + "learning_rate": 5e-05, + "loss": 0.1655, + "loss/crossentropy": 2.748092472553253, + "loss/hidden": 0.0, + "loss/logits": 0.1655096672475338, + "loss/reg": 4.000852584838867, + "step": 89 + }, + { + "epoch": 0.0009, + "grad_norm": 0.41060250997543335, + "grad_norm_var": 0.000761403690223957, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.8519994616508484, + "loss/hidden": 0.0, + "loss/logits": 0.1679377369582653, + "loss/reg": 3.9966533184051514, + "step": 90 + }, + { + "epoch": 0.00091, + "grad_norm": 0.3349744379520416, + "grad_norm_var": 0.0007618998964447029, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.8302014470100403, + "loss/hidden": 0.0, + "loss/logits": 0.16629018262028694, + "loss/reg": 3.9916272163391113, + "step": 91 + }, + { + "epoch": 0.00092, + "grad_norm": 0.40859073400497437, + "grad_norm_var": 0.0010778266384652254, + "learning_rate": 5e-05, + "loss": 0.1631, + "loss/crossentropy": 2.831357002258301, + "loss/hidden": 0.0, + "loss/logits": 0.16314184293150902, + "loss/reg": 3.9862587451934814, + "step": 92 + }, + { + "epoch": 0.00093, + "grad_norm": 0.3679395616054535, + "grad_norm_var": 0.0011174436691973562, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.653463125228882, + "loss/hidden": 0.0, + "loss/logits": 0.17491210997104645, + "loss/reg": 3.9809703826904297, + "step": 93 + }, + { + "epoch": 0.00094, + "grad_norm": 0.33192068338394165, + "grad_norm_var": 0.000984578674839117, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.9128816723823547, + "loss/hidden": 0.0, + "loss/logits": 0.16890091821551323, + "loss/reg": 3.9768238067626953, + "step": 94 + }, + { + "epoch": 0.00095, + "grad_norm": 0.33981162309646606, + "grad_norm_var": 0.000985009641976816, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.8998738527297974, + "loss/hidden": 0.0, + "loss/logits": 0.1651129573583603, + "loss/reg": 3.9723405838012695, + "step": 95 + }, + { + "epoch": 0.00096, + "grad_norm": 0.31845277547836304, + "grad_norm_var": 0.0009894353533322537, + "learning_rate": 5e-05, + "loss": 0.1566, + "loss/crossentropy": 2.738618314266205, + "loss/hidden": 0.0, + "loss/logits": 0.15662826597690582, + "loss/reg": 3.9680373668670654, + "step": 96 + }, + { + "epoch": 0.00097, + "grad_norm": 0.3521839678287506, + "grad_norm_var": 0.0009211371554959176, + "learning_rate": 5e-05, + "loss": 0.1571, + "loss/crossentropy": 2.896687388420105, + "loss/hidden": 0.0, + "loss/logits": 0.15710216015577316, + "loss/reg": 3.964097499847412, + "step": 97 + }, + { + "epoch": 0.00098, + "grad_norm": 0.41529935598373413, + "grad_norm_var": 0.0011615701056859014, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.6711183190345764, + "loss/hidden": 0.0, + "loss/logits": 0.176058791577816, + "loss/reg": 3.959585428237915, + "step": 98 + }, + { + "epoch": 0.00099, + "grad_norm": 0.3406970202922821, + "grad_norm_var": 0.0011533483453351997, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.762200713157654, + "loss/hidden": 0.0, + "loss/logits": 0.17553818225860596, + "loss/reg": 3.9551267623901367, + "step": 99 + }, + { + "epoch": 0.001, + "grad_norm": 0.3295409083366394, + "grad_norm_var": 0.0010948026927074712, + "learning_rate": 5e-05, + "loss": 0.1791, + "loss/crossentropy": 2.666721522808075, + "loss/hidden": 0.0, + "loss/logits": 0.17914289608597755, + "loss/reg": 3.9509167671203613, + "step": 100 + }, + { + "epoch": 0.00101, + "grad_norm": 0.3429720401763916, + "grad_norm_var": 0.001096024238407974, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.82060843706131, + "loss/hidden": 0.0, + "loss/logits": 0.1792576014995575, + "loss/reg": 3.9469358921051025, + "step": 101 + }, + { + "epoch": 0.00102, + "grad_norm": 0.3215195834636688, + "grad_norm_var": 0.0010760084324249537, + "learning_rate": 5e-05, + "loss": 0.1632, + "loss/crossentropy": 2.808405876159668, + "loss/hidden": 0.0, + "loss/logits": 0.16316882148385048, + "loss/reg": 3.943436622619629, + "step": 102 + }, + { + "epoch": 0.00103, + "grad_norm": 0.33158427476882935, + "grad_norm_var": 0.0010282390377130302, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.8497248888015747, + "loss/hidden": 0.0, + "loss/logits": 0.1783306896686554, + "loss/reg": 3.9394803047180176, + "step": 103 + }, + { + "epoch": 0.00104, + "grad_norm": 0.3384368121623993, + "grad_norm_var": 0.001028611107856688, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.8479551672935486, + "loss/hidden": 0.0, + "loss/logits": 0.17731818184256554, + "loss/reg": 3.935678243637085, + "step": 104 + }, + { + "epoch": 0.00105, + "grad_norm": 0.3275454342365265, + "grad_norm_var": 0.0010454262321925218, + "learning_rate": 5e-05, + "loss": 0.172, + "loss/crossentropy": 2.7240310311317444, + "loss/hidden": 0.0, + "loss/logits": 0.17204875499010086, + "loss/reg": 3.932224750518799, + "step": 105 + }, + { + "epoch": 0.00106, + "grad_norm": 0.3352244198322296, + "grad_norm_var": 0.0007990449288615142, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.657980978488922, + "loss/hidden": 0.0, + "loss/logits": 0.16869833320379257, + "loss/reg": 3.92889142036438, + "step": 106 + }, + { + "epoch": 0.00107, + "grad_norm": 0.3195781409740448, + "grad_norm_var": 0.00083658300653268, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.7351735830307007, + "loss/hidden": 0.0, + "loss/logits": 0.16421591117978096, + "loss/reg": 3.9260904788970947, + "step": 107 + }, + { + "epoch": 0.00108, + "grad_norm": 0.3216703534126282, + "grad_norm_var": 0.0005727423089818255, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.835266649723053, + "loss/hidden": 0.0, + "loss/logits": 0.1611352562904358, + "loss/reg": 3.923356533050537, + "step": 108 + }, + { + "epoch": 0.00109, + "grad_norm": 0.3534785807132721, + "grad_norm_var": 0.0005312635552543169, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.8821677565574646, + "loss/hidden": 0.0, + "loss/logits": 0.1689467802643776, + "loss/reg": 3.9208316802978516, + "step": 109 + }, + { + "epoch": 0.0011, + "grad_norm": 0.33851271867752075, + "grad_norm_var": 0.0005279815580263729, + "learning_rate": 5e-05, + "loss": 0.171, + "loss/crossentropy": 2.7201637029647827, + "loss/hidden": 0.0, + "loss/logits": 0.17095838487148285, + "loss/reg": 3.918743133544922, + "step": 110 + }, + { + "epoch": 0.00111, + "grad_norm": 0.32998839020729065, + "grad_norm_var": 0.0005331548233647158, + "learning_rate": 5e-05, + "loss": 0.166, + "loss/crossentropy": 2.6836928725242615, + "loss/hidden": 0.0, + "loss/logits": 0.16604754701256752, + "loss/reg": 3.914886951446533, + "step": 111 + }, + { + "epoch": 0.00112, + "grad_norm": 0.420744925737381, + "grad_norm_var": 0.0009131281860373264, + "learning_rate": 5e-05, + "loss": 0.1738, + "loss/crossentropy": 2.568650722503662, + "loss/hidden": 0.0, + "loss/logits": 0.1737859919667244, + "loss/reg": 3.9106812477111816, + "step": 112 + }, + { + "epoch": 0.00113, + "grad_norm": 0.3349835276603699, + "grad_norm_var": 0.000914996833659265, + "learning_rate": 5e-05, + "loss": 0.1522, + "loss/crossentropy": 2.7411792278289795, + "loss/hidden": 0.0, + "loss/logits": 0.15223057195544243, + "loss/reg": 3.9069032669067383, + "step": 113 + }, + { + "epoch": 0.00114, + "grad_norm": 0.34276068210601807, + "grad_norm_var": 0.0005529241807124034, + "learning_rate": 5e-05, + "loss": 0.1567, + "loss/crossentropy": 2.80877947807312, + "loss/hidden": 0.0, + "loss/logits": 0.1567244492471218, + "loss/reg": 3.90332293510437, + "step": 114 + }, + { + "epoch": 0.00115, + "grad_norm": 0.35375383496284485, + "grad_norm_var": 0.0005659636539689298, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.698065936565399, + "loss/hidden": 0.0, + "loss/logits": 0.16574329882860184, + "loss/reg": 3.8998756408691406, + "step": 115 + }, + { + "epoch": 0.00116, + "grad_norm": 0.33278602361679077, + "grad_norm_var": 0.0005620343134485931, + "learning_rate": 5e-05, + "loss": 0.1739, + "loss/crossentropy": 2.7814364433288574, + "loss/hidden": 0.0, + "loss/logits": 0.17385346069931984, + "loss/reg": 3.8964290618896484, + "step": 116 + }, + { + "epoch": 0.00117, + "grad_norm": 0.35139891505241394, + "grad_norm_var": 0.0005694228893132684, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.7721198201179504, + "loss/hidden": 0.0, + "loss/logits": 0.1701316274702549, + "loss/reg": 3.8925936222076416, + "step": 117 + }, + { + "epoch": 0.00118, + "grad_norm": 0.3708522915840149, + "grad_norm_var": 0.0005942298534055627, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.8753750920295715, + "loss/hidden": 0.0, + "loss/logits": 0.17226434499025345, + "loss/reg": 3.888739824295044, + "step": 118 + }, + { + "epoch": 0.00119, + "grad_norm": 0.32619452476501465, + "grad_norm_var": 0.0006049363247454272, + "learning_rate": 5e-05, + "loss": 0.1559, + "loss/crossentropy": 2.792622923851013, + "loss/hidden": 0.0, + "loss/logits": 0.15585486218333244, + "loss/reg": 3.8849120140075684, + "step": 119 + }, + { + "epoch": 0.0012, + "grad_norm": 0.3160404562950134, + "grad_norm_var": 0.0006517621123632485, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.833389937877655, + "loss/hidden": 0.0, + "loss/logits": 0.16574294120073318, + "loss/reg": 3.8814921379089355, + "step": 120 + }, + { + "epoch": 0.00121, + "grad_norm": 2.6332755088806152, + "grad_norm_var": 0.328414929277446, + "learning_rate": 5e-05, + "loss": 0.2807, + "loss/crossentropy": 2.960978329181671, + "loss/hidden": 0.0, + "loss/logits": 0.280683059245348, + "loss/reg": 3.8778162002563477, + "step": 121 + }, + { + "epoch": 0.00122, + "grad_norm": 0.39280807971954346, + "grad_norm_var": 0.32746202761424736, + "learning_rate": 5e-05, + "loss": 0.1791, + "loss/crossentropy": 2.8656354546546936, + "loss/hidden": 0.0, + "loss/logits": 0.17905254289507866, + "loss/reg": 3.8742706775665283, + "step": 122 + }, + { + "epoch": 0.00123, + "grad_norm": 0.36644095182418823, + "grad_norm_var": 0.3265348837601918, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.776346266269684, + "loss/hidden": 0.0, + "loss/logits": 0.1764557734131813, + "loss/reg": 3.8701822757720947, + "step": 123 + }, + { + "epoch": 0.00124, + "grad_norm": 0.39717525243759155, + "grad_norm_var": 0.3251678188828664, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.9204375743865967, + "loss/hidden": 0.0, + "loss/logits": 0.1796155981719494, + "loss/reg": 3.866316556930542, + "step": 124 + }, + { + "epoch": 0.00125, + "grad_norm": 0.366623193025589, + "grad_norm_var": 0.3249260727271075, + "learning_rate": 5e-05, + "loss": 0.1654, + "loss/crossentropy": 2.42034849524498, + "loss/hidden": 0.0, + "loss/logits": 0.165392205119133, + "loss/reg": 3.8625807762145996, + "step": 125 + }, + { + "epoch": 0.00126, + "grad_norm": 0.3638598918914795, + "grad_norm_var": 0.32442588175429127, + "learning_rate": 5e-05, + "loss": 0.1601, + "loss/crossentropy": 2.936553716659546, + "loss/hidden": 0.0, + "loss/logits": 0.16014225035905838, + "loss/reg": 3.8585283756256104, + "step": 126 + }, + { + "epoch": 0.00127, + "grad_norm": 0.3437521159648895, + "grad_norm_var": 0.3241257586372512, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.8428520560264587, + "loss/hidden": 0.0, + "loss/logits": 0.16030794754624367, + "loss/reg": 3.854602813720703, + "step": 127 + }, + { + "epoch": 0.00128, + "grad_norm": 0.3604683578014374, + "grad_norm_var": 0.3249965569466151, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.717309355735779, + "loss/hidden": 0.0, + "loss/logits": 0.1687549129128456, + "loss/reg": 3.85067081451416, + "step": 128 + }, + { + "epoch": 0.00129, + "grad_norm": 0.3499651849269867, + "grad_norm_var": 0.32468680185211135, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.819560468196869, + "loss/hidden": 0.0, + "loss/logits": 0.17475899681448936, + "loss/reg": 3.8467037677764893, + "step": 129 + }, + { + "epoch": 0.0013, + "grad_norm": 0.3231496512889862, + "grad_norm_var": 0.32511678466571453, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.5843223929405212, + "loss/hidden": 0.0, + "loss/logits": 0.16951489821076393, + "loss/reg": 3.843282699584961, + "step": 130 + }, + { + "epoch": 0.00131, + "grad_norm": 0.3588982820510864, + "grad_norm_var": 0.325020330590364, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.725651264190674, + "loss/hidden": 0.0, + "loss/logits": 0.16896183416247368, + "loss/reg": 3.839895725250244, + "step": 131 + }, + { + "epoch": 0.00132, + "grad_norm": 0.37743306159973145, + "grad_norm_var": 0.32416673149153025, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 3.0410608053207397, + "loss/hidden": 0.0, + "loss/logits": 0.1833292953670025, + "loss/reg": 3.8355963230133057, + "step": 132 + }, + { + "epoch": 0.00133, + "grad_norm": 0.32988330721855164, + "grad_norm_var": 0.32462166470000664, + "learning_rate": 5e-05, + "loss": 0.1654, + "loss/crossentropy": 2.7005507349967957, + "loss/hidden": 0.0, + "loss/logits": 0.1653790920972824, + "loss/reg": 3.831345558166504, + "step": 133 + }, + { + "epoch": 0.00134, + "grad_norm": 0.35988613963127136, + "grad_norm_var": 0.32481589623167567, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.7048683762550354, + "loss/hidden": 0.0, + "loss/logits": 0.17917973920702934, + "loss/reg": 3.8267781734466553, + "step": 134 + }, + { + "epoch": 0.00135, + "grad_norm": 0.32649827003479004, + "grad_norm_var": 0.324808949416691, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.791461765766144, + "loss/hidden": 0.0, + "loss/logits": 0.16420895606279373, + "loss/reg": 3.8223133087158203, + "step": 135 + }, + { + "epoch": 0.00136, + "grad_norm": 0.6779212355613708, + "grad_norm_var": 0.32421967313153754, + "learning_rate": 5e-05, + "loss": 0.2361, + "loss/crossentropy": 3.063343107700348, + "loss/hidden": 0.0, + "loss/logits": 0.2360655590891838, + "loss/reg": 3.818582057952881, + "step": 136 + }, + { + "epoch": 0.00137, + "grad_norm": 0.4217770993709564, + "grad_norm_var": 0.0069040846383882, + "learning_rate": 5e-05, + "loss": 0.1936, + "loss/crossentropy": 2.8291149735450745, + "loss/hidden": 0.0, + "loss/logits": 0.19361505657434464, + "loss/reg": 3.814713716506958, + "step": 137 + }, + { + "epoch": 0.00138, + "grad_norm": 0.3183574378490448, + "grad_norm_var": 0.0071460434004817905, + "learning_rate": 5e-05, + "loss": 0.1596, + "loss/crossentropy": 2.733646512031555, + "loss/hidden": 0.0, + "loss/logits": 0.15959006920456886, + "loss/reg": 3.8112361431121826, + "step": 138 + }, + { + "epoch": 0.00139, + "grad_norm": 0.35119444131851196, + "grad_norm_var": 0.007183318962822194, + "learning_rate": 5e-05, + "loss": 0.1706, + "loss/crossentropy": 2.777931809425354, + "loss/hidden": 0.0, + "loss/logits": 0.17056189104914665, + "loss/reg": 3.807130813598633, + "step": 139 + }, + { + "epoch": 0.0014, + "grad_norm": 0.3381962478160858, + "grad_norm_var": 0.007239536480815012, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.865752935409546, + "loss/hidden": 0.0, + "loss/logits": 0.16511252894997597, + "loss/reg": 3.8030734062194824, + "step": 140 + }, + { + "epoch": 0.00141, + "grad_norm": 0.35082533955574036, + "grad_norm_var": 0.007268548808216302, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.734546184539795, + "loss/hidden": 0.0, + "loss/logits": 0.16080284118652344, + "loss/reg": 3.7996251583099365, + "step": 141 + }, + { + "epoch": 0.00142, + "grad_norm": 0.4269000291824341, + "grad_norm_var": 0.007448472313405929, + "learning_rate": 5e-05, + "loss": 0.1806, + "loss/crossentropy": 2.9227113127708435, + "loss/hidden": 0.0, + "loss/logits": 0.1805506870150566, + "loss/reg": 3.7955057621002197, + "step": 142 + }, + { + "epoch": 0.00143, + "grad_norm": 0.3532395660877228, + "grad_norm_var": 0.0074133753520221855, + "learning_rate": 5e-05, + "loss": 0.1588, + "loss/crossentropy": 2.9407125115394592, + "loss/hidden": 0.0, + "loss/logits": 0.15880529955029488, + "loss/reg": 3.791508197784424, + "step": 143 + }, + { + "epoch": 0.00144, + "grad_norm": 0.3449239134788513, + "grad_norm_var": 0.007461781173789813, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.8305121660232544, + "loss/hidden": 0.0, + "loss/logits": 0.1652398444712162, + "loss/reg": 3.7871744632720947, + "step": 144 + }, + { + "epoch": 0.00145, + "grad_norm": 0.3272966742515564, + "grad_norm_var": 0.007571273873210712, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.876939594745636, + "loss/hidden": 0.0, + "loss/logits": 0.17143940553069115, + "loss/reg": 3.7832887172698975, + "step": 145 + }, + { + "epoch": 0.00146, + "grad_norm": 0.31960922479629517, + "grad_norm_var": 0.007596131782178968, + "learning_rate": 5e-05, + "loss": 0.1558, + "loss/crossentropy": 2.7597694993019104, + "loss/hidden": 0.0, + "loss/logits": 0.15579523891210556, + "loss/reg": 3.77976393699646, + "step": 146 + }, + { + "epoch": 0.00147, + "grad_norm": 0.3329758048057556, + "grad_norm_var": 0.007690076208493398, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.823091506958008, + "loss/hidden": 0.0, + "loss/logits": 0.16016652062535286, + "loss/reg": 3.776364326477051, + "step": 147 + }, + { + "epoch": 0.00148, + "grad_norm": 0.3245135545730591, + "grad_norm_var": 0.007828939248271782, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.622242748737335, + "loss/hidden": 0.0, + "loss/logits": 0.16081608831882477, + "loss/reg": 3.7724997997283936, + "step": 148 + }, + { + "epoch": 0.00149, + "grad_norm": 0.3239537179470062, + "grad_norm_var": 0.007862062788276463, + "learning_rate": 5e-05, + "loss": 0.1559, + "loss/crossentropy": 2.826173484325409, + "loss/hidden": 0.0, + "loss/logits": 0.15591008588671684, + "loss/reg": 3.7680001258850098, + "step": 149 + }, + { + "epoch": 0.0015, + "grad_norm": 0.3199516534805298, + "grad_norm_var": 0.00800828926831548, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.73406845331192, + "loss/hidden": 0.0, + "loss/logits": 0.17048393934965134, + "loss/reg": 3.7640268802642822, + "step": 150 + }, + { + "epoch": 0.00151, + "grad_norm": 0.3810157775878906, + "grad_norm_var": 0.00790594146931481, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.746786952018738, + "loss/hidden": 0.0, + "loss/logits": 0.17715823650360107, + "loss/reg": 3.760627508163452, + "step": 151 + }, + { + "epoch": 0.00152, + "grad_norm": 0.33840498328208923, + "grad_norm_var": 0.0011503711202599262, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.7671576738357544, + "loss/hidden": 0.0, + "loss/logits": 0.1679898537695408, + "loss/reg": 3.7571685314178467, + "step": 152 + }, + { + "epoch": 0.00153, + "grad_norm": 0.35103219747543335, + "grad_norm_var": 0.0007702874374444798, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.8394588828086853, + "loss/hidden": 0.0, + "loss/logits": 0.1682782731950283, + "loss/reg": 3.7533957958221436, + "step": 153 + }, + { + "epoch": 0.00154, + "grad_norm": 0.34948527812957764, + "grad_norm_var": 0.000724837481797543, + "learning_rate": 5e-05, + "loss": 0.1551, + "loss/crossentropy": 2.637475073337555, + "loss/hidden": 0.0, + "loss/logits": 0.1551469974219799, + "loss/reg": 3.7496984004974365, + "step": 154 + }, + { + "epoch": 0.00155, + "grad_norm": 0.32411250472068787, + "grad_norm_var": 0.000751360146424022, + "learning_rate": 5e-05, + "loss": 0.1655, + "loss/crossentropy": 2.65782767534256, + "loss/hidden": 0.0, + "loss/logits": 0.16551653295755386, + "loss/reg": 3.7462174892425537, + "step": 155 + }, + { + "epoch": 0.00156, + "grad_norm": 0.3659244775772095, + "grad_norm_var": 0.0007773935392291246, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.8054139614105225, + "loss/hidden": 0.0, + "loss/logits": 0.16182733327150345, + "loss/reg": 3.7422730922698975, + "step": 156 + }, + { + "epoch": 0.00157, + "grad_norm": 0.3639696538448334, + "grad_norm_var": 0.0007968496539047743, + "learning_rate": 5e-05, + "loss": 0.172, + "loss/crossentropy": 2.643721103668213, + "loss/hidden": 0.0, + "loss/logits": 0.17196981981396675, + "loss/reg": 3.7390189170837402, + "step": 157 + }, + { + "epoch": 0.00158, + "grad_norm": 0.372111439704895, + "grad_norm_var": 0.0003986384080602812, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.6860750317573547, + "loss/hidden": 0.0, + "loss/logits": 0.17522458359599113, + "loss/reg": 3.7351748943328857, + "step": 158 + }, + { + "epoch": 0.00159, + "grad_norm": 0.3412966728210449, + "grad_norm_var": 0.0003916975034196302, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.7506829500198364, + "loss/hidden": 0.0, + "loss/logits": 0.17320549115538597, + "loss/reg": 3.731645345687866, + "step": 159 + }, + { + "epoch": 0.0016, + "grad_norm": 0.31508323550224304, + "grad_norm_var": 0.0004378510847698321, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.672293782234192, + "loss/hidden": 0.0, + "loss/logits": 0.16758090257644653, + "loss/reg": 3.727598190307617, + "step": 160 + }, + { + "epoch": 0.00161, + "grad_norm": 0.39773106575012207, + "grad_norm_var": 0.0006223116385708161, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 2.975751519203186, + "loss/hidden": 0.0, + "loss/logits": 0.18666821345686913, + "loss/reg": 3.7237842082977295, + "step": 161 + }, + { + "epoch": 0.00162, + "grad_norm": 0.3057797849178314, + "grad_norm_var": 0.0006812186499233134, + "learning_rate": 5e-05, + "loss": 0.1511, + "loss/crossentropy": 2.768982172012329, + "loss/hidden": 0.0, + "loss/logits": 0.15112394466996193, + "loss/reg": 3.7201473712921143, + "step": 162 + }, + { + "epoch": 0.00163, + "grad_norm": 0.39109617471694946, + "grad_norm_var": 0.0008052929738533592, + "learning_rate": 5e-05, + "loss": 0.1692, + "loss/crossentropy": 2.7556854486465454, + "loss/hidden": 0.0, + "loss/logits": 0.1692204400897026, + "loss/reg": 3.715847969055176, + "step": 163 + }, + { + "epoch": 0.00164, + "grad_norm": 0.3230038285255432, + "grad_norm_var": 0.0008101312463145642, + "learning_rate": 5e-05, + "loss": 0.158, + "loss/crossentropy": 2.663906216621399, + "loss/hidden": 0.0, + "loss/logits": 0.1579984687268734, + "loss/reg": 3.712200403213501, + "step": 164 + }, + { + "epoch": 0.00165, + "grad_norm": 0.32820436358451843, + "grad_norm_var": 0.0007977755717131292, + "learning_rate": 5e-05, + "loss": 0.1535, + "loss/crossentropy": 2.7556238174438477, + "loss/hidden": 0.0, + "loss/logits": 0.15348907560110092, + "loss/reg": 3.7093729972839355, + "step": 165 + }, + { + "epoch": 0.00166, + "grad_norm": 0.37247714400291443, + "grad_norm_var": 0.0007736858685811421, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.623964309692383, + "loss/hidden": 0.0, + "loss/logits": 0.16797634214162827, + "loss/reg": 3.7055835723876953, + "step": 166 + }, + { + "epoch": 0.00167, + "grad_norm": 0.31921809911727905, + "grad_norm_var": 0.0007674848471050747, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.6233983039855957, + "loss/hidden": 0.0, + "loss/logits": 0.16180693730711937, + "loss/reg": 3.7018704414367676, + "step": 167 + }, + { + "epoch": 0.00168, + "grad_norm": 0.41518375277519226, + "grad_norm_var": 0.0010434978692974088, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.794585347175598, + "loss/hidden": 0.0, + "loss/logits": 0.18423354998230934, + "loss/reg": 3.6984987258911133, + "step": 168 + }, + { + "epoch": 0.00169, + "grad_norm": 0.3530808985233307, + "grad_norm_var": 0.0010434324942960296, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.725895941257477, + "loss/hidden": 0.0, + "loss/logits": 0.18175217881798744, + "loss/reg": 3.6949737071990967, + "step": 169 + }, + { + "epoch": 0.0017, + "grad_norm": 0.35729339718818665, + "grad_norm_var": 0.001044250197534243, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.8144423365592957, + "loss/hidden": 0.0, + "loss/logits": 0.1758369542658329, + "loss/reg": 3.6909079551696777, + "step": 170 + }, + { + "epoch": 0.00171, + "grad_norm": 0.3258056044578552, + "grad_norm_var": 0.0010379424391956011, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.6860609650611877, + "loss/hidden": 0.0, + "loss/logits": 0.1615053378045559, + "loss/reg": 3.6872336864471436, + "step": 171 + }, + { + "epoch": 0.00172, + "grad_norm": 0.3320024907588959, + "grad_norm_var": 0.0010511954351829684, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.7618680596351624, + "loss/hidden": 0.0, + "loss/logits": 0.16686224937438965, + "loss/reg": 3.684033155441284, + "step": 172 + }, + { + "epoch": 0.00173, + "grad_norm": 0.32370057702064514, + "grad_norm_var": 0.001082015111668518, + "learning_rate": 5e-05, + "loss": 0.1568, + "loss/crossentropy": 2.8911356329917908, + "loss/hidden": 0.0, + "loss/logits": 0.15675026923418045, + "loss/reg": 3.6797101497650146, + "step": 173 + }, + { + "epoch": 0.00174, + "grad_norm": 0.3590388298034668, + "grad_norm_var": 0.0010512214971074684, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.894763946533203, + "loss/hidden": 0.0, + "loss/logits": 0.16077794507145882, + "loss/reg": 3.676694393157959, + "step": 174 + }, + { + "epoch": 0.00175, + "grad_norm": 0.362693190574646, + "grad_norm_var": 0.0010621381304175893, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.9355967044830322, + "loss/hidden": 0.0, + "loss/logits": 0.18095535412430763, + "loss/reg": 3.6728715896606445, + "step": 175 + }, + { + "epoch": 0.00176, + "grad_norm": 0.3421201705932617, + "grad_norm_var": 0.0009861454944628978, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.771928548812866, + "loss/hidden": 0.0, + "loss/logits": 0.17522436380386353, + "loss/reg": 3.6698157787323, + "step": 176 + }, + { + "epoch": 0.00177, + "grad_norm": 0.3921768069267273, + "grad_norm_var": 0.0009531156716223066, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.9020140171051025, + "loss/hidden": 0.0, + "loss/logits": 0.16816864535212517, + "loss/reg": 3.6669130325317383, + "step": 177 + }, + { + "epoch": 0.00178, + "grad_norm": 0.414460688829422, + "grad_norm_var": 0.0010479472090343092, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.871070384979248, + "loss/hidden": 0.0, + "loss/logits": 0.1651761755347252, + "loss/reg": 3.6637353897094727, + "step": 178 + }, + { + "epoch": 0.00179, + "grad_norm": 0.37821123003959656, + "grad_norm_var": 0.0009996989224075473, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.8318552374839783, + "loss/hidden": 0.0, + "loss/logits": 0.16177014261484146, + "loss/reg": 3.6601526737213135, + "step": 179 + }, + { + "epoch": 0.0018, + "grad_norm": 0.33756861090660095, + "grad_norm_var": 0.0009485554235717804, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.7179840803146362, + "loss/hidden": 0.0, + "loss/logits": 0.16402991488575935, + "loss/reg": 3.655977725982666, + "step": 180 + }, + { + "epoch": 0.00181, + "grad_norm": 0.3508152663707733, + "grad_norm_var": 0.0008934631549546879, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.655538856983185, + "loss/hidden": 0.0, + "loss/logits": 0.18240001425147057, + "loss/reg": 3.6522390842437744, + "step": 181 + }, + { + "epoch": 0.00182, + "grad_norm": 0.4800889194011688, + "grad_norm_var": 0.0018179163356779901, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.9170504808425903, + "loss/hidden": 0.0, + "loss/logits": 0.17730093747377396, + "loss/reg": 3.648420810699463, + "step": 182 + }, + { + "epoch": 0.00183, + "grad_norm": 0.32715606689453125, + "grad_norm_var": 0.0017731703957083382, + "learning_rate": 5e-05, + "loss": 0.1599, + "loss/crossentropy": 2.6978230476379395, + "loss/hidden": 0.0, + "loss/logits": 0.15988203510642052, + "loss/reg": 3.644439458847046, + "step": 183 + }, + { + "epoch": 0.00184, + "grad_norm": 0.3219493329524994, + "grad_norm_var": 0.0017014689354580615, + "learning_rate": 5e-05, + "loss": 0.1588, + "loss/crossentropy": 2.772395610809326, + "loss/hidden": 0.0, + "loss/logits": 0.158803328871727, + "loss/reg": 3.6410605907440186, + "step": 184 + }, + { + "epoch": 0.00185, + "grad_norm": 0.3204100728034973, + "grad_norm_var": 0.0017978203455529696, + "learning_rate": 5e-05, + "loss": 0.1595, + "loss/crossentropy": 2.7290788292884827, + "loss/hidden": 0.0, + "loss/logits": 0.15948805212974548, + "loss/reg": 3.637272596359253, + "step": 185 + }, + { + "epoch": 0.00186, + "grad_norm": 0.34646865725517273, + "grad_norm_var": 0.0018059373173852718, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.68435937166214, + "loss/hidden": 0.0, + "loss/logits": 0.17226089164614677, + "loss/reg": 3.6334545612335205, + "step": 186 + }, + { + "epoch": 0.00187, + "grad_norm": 0.35515356063842773, + "grad_norm_var": 0.001737051018651666, + "learning_rate": 5e-05, + "loss": 0.1656, + "loss/crossentropy": 2.8159299492836, + "loss/hidden": 0.0, + "loss/logits": 0.16557194665074348, + "loss/reg": 3.629215717315674, + "step": 187 + }, + { + "epoch": 0.00188, + "grad_norm": 0.31605055928230286, + "grad_norm_var": 0.0018103786054489293, + "learning_rate": 5e-05, + "loss": 0.1587, + "loss/crossentropy": 2.737620174884796, + "loss/hidden": 0.0, + "loss/logits": 0.15867746248841286, + "loss/reg": 3.6252663135528564, + "step": 188 + }, + { + "epoch": 0.00189, + "grad_norm": 0.3383916914463043, + "grad_norm_var": 0.0017566740185558556, + "learning_rate": 5e-05, + "loss": 0.1621, + "loss/crossentropy": 2.7829577326774597, + "loss/hidden": 0.0, + "loss/logits": 0.162098228931427, + "loss/reg": 3.621067523956299, + "step": 189 + }, + { + "epoch": 0.0019, + "grad_norm": 0.4556836783885956, + "grad_norm_var": 0.0023419423247556044, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.9624626636505127, + "loss/hidden": 0.0, + "loss/logits": 0.1686898171901703, + "loss/reg": 3.6163265705108643, + "step": 190 + }, + { + "epoch": 0.00191, + "grad_norm": 0.3975931406021118, + "grad_norm_var": 0.0024075083289669527, + "learning_rate": 5e-05, + "loss": 0.155, + "loss/crossentropy": 2.731001079082489, + "loss/hidden": 0.0, + "loss/logits": 0.15501929074525833, + "loss/reg": 3.6121585369110107, + "step": 191 + }, + { + "epoch": 0.00192, + "grad_norm": 0.37328633666038513, + "grad_norm_var": 0.002364231645687964, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.754942238330841, + "loss/hidden": 0.0, + "loss/logits": 0.16829831898212433, + "loss/reg": 3.6073873043060303, + "step": 192 + }, + { + "epoch": 0.00193, + "grad_norm": 0.3342723250389099, + "grad_norm_var": 0.0023955576435807737, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.7424720525741577, + "loss/hidden": 0.0, + "loss/logits": 0.16633369401097298, + "loss/reg": 3.6036906242370605, + "step": 193 + }, + { + "epoch": 0.00194, + "grad_norm": 0.38286155462265015, + "grad_norm_var": 0.002251566346172081, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.9778133630752563, + "loss/hidden": 0.0, + "loss/logits": 0.16522743180394173, + "loss/reg": 3.600316286087036, + "step": 194 + }, + { + "epoch": 0.00195, + "grad_norm": 0.36051952838897705, + "grad_norm_var": 0.0022364206403587715, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.6842609643936157, + "loss/hidden": 0.0, + "loss/logits": 0.1771794743835926, + "loss/reg": 3.596491813659668, + "step": 195 + }, + { + "epoch": 0.00196, + "grad_norm": 0.3526027202606201, + "grad_norm_var": 0.0022007878333510996, + "learning_rate": 5e-05, + "loss": 0.1561, + "loss/crossentropy": 2.7837477922439575, + "loss/hidden": 0.0, + "loss/logits": 0.15605639293789864, + "loss/reg": 3.5926032066345215, + "step": 196 + }, + { + "epoch": 0.00197, + "grad_norm": 0.35895583033561707, + "grad_norm_var": 0.002191344445433652, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.85478812456131, + "loss/hidden": 0.0, + "loss/logits": 0.1800978109240532, + "loss/reg": 3.589280843734741, + "step": 197 + }, + { + "epoch": 0.00198, + "grad_norm": 0.3372839689254761, + "grad_norm_var": 0.0012524713862786308, + "learning_rate": 5e-05, + "loss": 0.1571, + "loss/crossentropy": 2.805725872516632, + "loss/hidden": 0.0, + "loss/logits": 0.15710647776722908, + "loss/reg": 3.5851662158966064, + "step": 198 + }, + { + "epoch": 0.00199, + "grad_norm": 0.33652499318122864, + "grad_norm_var": 0.0012232813247675149, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.657254457473755, + "loss/hidden": 0.0, + "loss/logits": 0.1651643067598343, + "loss/reg": 3.581798553466797, + "step": 199 + }, + { + "epoch": 0.002, + "grad_norm": 0.36757001280784607, + "grad_norm_var": 0.001149275638629573, + "learning_rate": 5e-05, + "loss": 0.1756, + "loss/crossentropy": 2.7496553659439087, + "loss/hidden": 0.0, + "loss/logits": 0.17555997148156166, + "loss/reg": 3.577878475189209, + "step": 200 + }, + { + "epoch": 0.00201, + "grad_norm": 0.4317435324192047, + "grad_norm_var": 0.0013607474972908151, + "learning_rate": 5e-05, + "loss": 0.1643, + "loss/crossentropy": 3.168861448764801, + "loss/hidden": 0.0, + "loss/logits": 0.164311021566391, + "loss/reg": 3.5741024017333984, + "step": 201 + }, + { + "epoch": 0.00202, + "grad_norm": 0.3569833040237427, + "grad_norm_var": 0.0013412425012825579, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.7941558957099915, + "loss/hidden": 0.0, + "loss/logits": 0.17778108268976212, + "loss/reg": 3.5706787109375, + "step": 202 + }, + { + "epoch": 0.00203, + "grad_norm": 0.31648150086402893, + "grad_norm_var": 0.0014904716039333447, + "learning_rate": 5e-05, + "loss": 0.156, + "loss/crossentropy": 2.872058689594269, + "loss/hidden": 0.0, + "loss/logits": 0.1559964008629322, + "loss/reg": 3.5671305656433105, + "step": 203 + }, + { + "epoch": 0.00204, + "grad_norm": 0.32686129212379456, + "grad_norm_var": 0.0014293085106024154, + "learning_rate": 5e-05, + "loss": 0.1593, + "loss/crossentropy": 2.7316592931747437, + "loss/hidden": 0.0, + "loss/logits": 0.15925980731844902, + "loss/reg": 3.5632758140563965, + "step": 204 + }, + { + "epoch": 0.00205, + "grad_norm": 0.3191937506198883, + "grad_norm_var": 0.001518472211395964, + "learning_rate": 5e-05, + "loss": 0.1527, + "loss/crossentropy": 2.7802085876464844, + "loss/hidden": 0.0, + "loss/logits": 0.15268265083432198, + "loss/reg": 3.559633493423462, + "step": 205 + }, + { + "epoch": 0.00206, + "grad_norm": 0.34924882650375366, + "grad_norm_var": 0.0009115629505157467, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.792604923248291, + "loss/hidden": 0.0, + "loss/logits": 0.17729893326759338, + "loss/reg": 3.555882453918457, + "step": 206 + }, + { + "epoch": 0.00207, + "grad_norm": 0.38204553723335266, + "grad_norm_var": 0.0008412229229646054, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.729912281036377, + "loss/hidden": 0.0, + "loss/logits": 0.17347190529108047, + "loss/reg": 3.551867723464966, + "step": 207 + }, + { + "epoch": 0.00208, + "grad_norm": 0.316631555557251, + "grad_norm_var": 0.0009067368526577339, + "learning_rate": 5e-05, + "loss": 0.1521, + "loss/crossentropy": 2.6910020112991333, + "loss/hidden": 0.0, + "loss/logits": 0.15211007744073868, + "loss/reg": 3.547140598297119, + "step": 208 + }, + { + "epoch": 0.00209, + "grad_norm": 0.3024788200855255, + "grad_norm_var": 0.0010444754089082963, + "learning_rate": 5e-05, + "loss": 0.1534, + "loss/crossentropy": 2.6174367666244507, + "loss/hidden": 0.0, + "loss/logits": 0.15340904891490936, + "loss/reg": 3.5430798530578613, + "step": 209 + }, + { + "epoch": 0.0021, + "grad_norm": 0.31879743933677673, + "grad_norm_var": 0.0010192142441715734, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.6434658765792847, + "loss/hidden": 0.0, + "loss/logits": 0.164449330419302, + "loss/reg": 3.539293050765991, + "step": 210 + }, + { + "epoch": 0.00211, + "grad_norm": 0.37038934230804443, + "grad_norm_var": 0.0010445807718520773, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.7187950015068054, + "loss/hidden": 0.0, + "loss/logits": 0.16179471090435982, + "loss/reg": 3.5359723567962646, + "step": 211 + }, + { + "epoch": 0.00212, + "grad_norm": 0.3256055414676666, + "grad_norm_var": 0.0010681195543044476, + "learning_rate": 5e-05, + "loss": 0.1634, + "loss/crossentropy": 2.6802476048469543, + "loss/hidden": 0.0, + "loss/logits": 0.16339639574289322, + "loss/reg": 3.5320651531219482, + "step": 212 + }, + { + "epoch": 0.00213, + "grad_norm": 0.363210529088974, + "grad_norm_var": 0.0010772816324646883, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.925456941127777, + "loss/hidden": 0.0, + "loss/logits": 0.16816257312893867, + "loss/reg": 3.527592420578003, + "step": 213 + }, + { + "epoch": 0.00214, + "grad_norm": 0.3341169059276581, + "grad_norm_var": 0.0010811945233913268, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.8775156140327454, + "loss/hidden": 0.0, + "loss/logits": 0.1689641959965229, + "loss/reg": 3.5241305828094482, + "step": 214 + }, + { + "epoch": 0.00215, + "grad_norm": 0.7971848249435425, + "grad_norm_var": 0.013831743286372744, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.769020676612854, + "loss/hidden": 0.0, + "loss/logits": 0.18977811932563782, + "loss/reg": 3.5197558403015137, + "step": 215 + }, + { + "epoch": 0.00216, + "grad_norm": 0.3044687807559967, + "grad_norm_var": 0.014131832632900828, + "learning_rate": 5e-05, + "loss": 0.1467, + "loss/crossentropy": 2.792181670665741, + "loss/hidden": 0.0, + "loss/logits": 0.1466773971915245, + "loss/reg": 3.516072988510132, + "step": 216 + }, + { + "epoch": 0.00217, + "grad_norm": 0.3434732258319855, + "grad_norm_var": 0.013888774653188173, + "learning_rate": 5e-05, + "loss": 0.1698, + "loss/crossentropy": 2.577077627182007, + "loss/hidden": 0.0, + "loss/logits": 0.16977669671177864, + "loss/reg": 3.512517213821411, + "step": 217 + }, + { + "epoch": 0.00218, + "grad_norm": 0.37019920349121094, + "grad_norm_var": 0.013886977393692842, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.722847878932953, + "loss/hidden": 0.0, + "loss/logits": 0.19428952783346176, + "loss/reg": 3.5091969966888428, + "step": 218 + }, + { + "epoch": 0.00219, + "grad_norm": 0.31637635827064514, + "grad_norm_var": 0.013887658605223226, + "learning_rate": 5e-05, + "loss": 0.1547, + "loss/crossentropy": 2.787532150745392, + "loss/hidden": 0.0, + "loss/logits": 0.1546883024275303, + "loss/reg": 3.5059781074523926, + "step": 219 + }, + { + "epoch": 0.0022, + "grad_norm": 0.368344783782959, + "grad_norm_var": 0.013784165910995568, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.7095659971237183, + "loss/hidden": 0.0, + "loss/logits": 0.1773015893995762, + "loss/reg": 3.502683162689209, + "step": 220 + }, + { + "epoch": 0.00221, + "grad_norm": 0.3447912037372589, + "grad_norm_var": 0.013659872247631084, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.7072474360466003, + "loss/hidden": 0.0, + "loss/logits": 0.1687602400779724, + "loss/reg": 3.4986109733581543, + "step": 221 + }, + { + "epoch": 0.00222, + "grad_norm": 0.3812227249145508, + "grad_norm_var": 0.013638668912457892, + "learning_rate": 5e-05, + "loss": 0.1811, + "loss/crossentropy": 2.8128660917282104, + "loss/hidden": 0.0, + "loss/logits": 0.18113631010055542, + "loss/reg": 3.4947147369384766, + "step": 222 + }, + { + "epoch": 0.00223, + "grad_norm": 0.339374303817749, + "grad_norm_var": 0.013690814024359154, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.885101020336151, + "loss/hidden": 0.0, + "loss/logits": 0.17156245186924934, + "loss/reg": 3.4905753135681152, + "step": 223 + }, + { + "epoch": 0.00224, + "grad_norm": 0.3169143497943878, + "grad_norm_var": 0.013688861707923295, + "learning_rate": 5e-05, + "loss": 0.1589, + "loss/crossentropy": 2.6434147357940674, + "loss/hidden": 0.0, + "loss/logits": 0.15887855738401413, + "loss/reg": 3.486919641494751, + "step": 224 + }, + { + "epoch": 0.00225, + "grad_norm": 0.4436502456665039, + "grad_norm_var": 0.013690624557478688, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.9042821526527405, + "loss/hidden": 0.0, + "loss/logits": 0.20374128222465515, + "loss/reg": 3.483499765396118, + "step": 225 + }, + { + "epoch": 0.00226, + "grad_norm": 0.44937804341316223, + "grad_norm_var": 0.01373632101878638, + "learning_rate": 5e-05, + "loss": 0.1588, + "loss/crossentropy": 2.79194039106369, + "loss/hidden": 0.0, + "loss/logits": 0.1587841510772705, + "loss/reg": 3.480142593383789, + "step": 226 + }, + { + "epoch": 0.00227, + "grad_norm": 0.3453376889228821, + "grad_norm_var": 0.013826164241530992, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.7488330006599426, + "loss/hidden": 0.0, + "loss/logits": 0.16589100658893585, + "loss/reg": 3.476062297821045, + "step": 227 + }, + { + "epoch": 0.00228, + "grad_norm": 0.3845584988594055, + "grad_norm_var": 0.013584549048495138, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.6935607194900513, + "loss/hidden": 0.0, + "loss/logits": 0.18416164070367813, + "loss/reg": 3.4726946353912354, + "step": 228 + }, + { + "epoch": 0.00229, + "grad_norm": 0.3347846567630768, + "grad_norm_var": 0.013727727146734722, + "learning_rate": 5e-05, + "loss": 0.1767, + "loss/crossentropy": 2.6182947754859924, + "loss/hidden": 0.0, + "loss/logits": 0.1767422929406166, + "loss/reg": 3.469238519668579, + "step": 229 + }, + { + "epoch": 0.0023, + "grad_norm": 0.35126739740371704, + "grad_norm_var": 0.01362772883112919, + "learning_rate": 5e-05, + "loss": 0.1694, + "loss/crossentropy": 2.8005401492118835, + "loss/hidden": 0.0, + "loss/logits": 0.1694028675556183, + "loss/reg": 3.4662599563598633, + "step": 230 + }, + { + "epoch": 0.00231, + "grad_norm": 0.37644773721694946, + "grad_norm_var": 0.0016784352808341082, + "learning_rate": 5e-05, + "loss": 0.1677, + "loss/crossentropy": 2.7537949085235596, + "loss/hidden": 0.0, + "loss/logits": 0.16772692278027534, + "loss/reg": 3.4623701572418213, + "step": 231 + }, + { + "epoch": 0.00232, + "grad_norm": 0.33086928725242615, + "grad_norm_var": 0.0015241936410912834, + "learning_rate": 5e-05, + "loss": 0.1624, + "loss/crossentropy": 2.7844293117523193, + "loss/hidden": 0.0, + "loss/logits": 0.1624348722398281, + "loss/reg": 3.459073066711426, + "step": 232 + }, + { + "epoch": 0.00233, + "grad_norm": 0.3152429461479187, + "grad_norm_var": 0.0016449122438399724, + "learning_rate": 5e-05, + "loss": 0.1607, + "loss/crossentropy": 2.5863555669784546, + "loss/hidden": 0.0, + "loss/logits": 0.16065017879009247, + "loss/reg": 3.456038475036621, + "step": 233 + }, + { + "epoch": 0.00234, + "grad_norm": 0.34679386019706726, + "grad_norm_var": 0.001649030072333372, + "learning_rate": 5e-05, + "loss": 0.1656, + "loss/crossentropy": 2.9068891406059265, + "loss/hidden": 0.0, + "loss/logits": 0.16555847227573395, + "loss/reg": 3.452618360519409, + "step": 234 + }, + { + "epoch": 0.00235, + "grad_norm": 0.36684513092041016, + "grad_norm_var": 0.001520832425550959, + "learning_rate": 5e-05, + "loss": 0.1878, + "loss/crossentropy": 2.6781840920448303, + "loss/hidden": 0.0, + "loss/logits": 0.18775511160492897, + "loss/reg": 3.4493637084960938, + "step": 235 + }, + { + "epoch": 0.00236, + "grad_norm": 0.39043235778808594, + "grad_norm_var": 0.0015693055369300879, + "learning_rate": 5e-05, + "loss": 0.1559, + "loss/crossentropy": 2.9237093925476074, + "loss/hidden": 0.0, + "loss/logits": 0.15592358261346817, + "loss/reg": 3.446392059326172, + "step": 236 + }, + { + "epoch": 0.00237, + "grad_norm": 0.3486286997795105, + "grad_norm_var": 0.0015605921838873513, + "learning_rate": 5e-05, + "loss": 0.1524, + "loss/crossentropy": 2.8276549577713013, + "loss/hidden": 0.0, + "loss/logits": 0.1523873247206211, + "loss/reg": 3.443490505218506, + "step": 237 + }, + { + "epoch": 0.00238, + "grad_norm": 0.4030380845069885, + "grad_norm_var": 0.0016408419596595262, + "learning_rate": 5e-05, + "loss": 0.1839, + "loss/crossentropy": 2.7374503016471863, + "loss/hidden": 0.0, + "loss/logits": 0.1839219257235527, + "loss/reg": 3.440230131149292, + "step": 238 + }, + { + "epoch": 0.00239, + "grad_norm": 0.3677695095539093, + "grad_norm_var": 0.0015933721835237928, + "learning_rate": 5e-05, + "loss": 0.1725, + "loss/crossentropy": 2.637487053871155, + "loss/hidden": 0.0, + "loss/logits": 0.1725292131304741, + "loss/reg": 3.4369444847106934, + "step": 239 + }, + { + "epoch": 0.0024, + "grad_norm": 0.3092736303806305, + "grad_norm_var": 0.001648043714460871, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.785566747188568, + "loss/hidden": 0.0, + "loss/logits": 0.16075557842850685, + "loss/reg": 3.4329705238342285, + "step": 240 + }, + { + "epoch": 0.00241, + "grad_norm": 0.3242727518081665, + "grad_norm_var": 0.001311046071157899, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.7823829650878906, + "loss/hidden": 0.0, + "loss/logits": 0.16410458087921143, + "loss/reg": 3.429222345352173, + "step": 241 + }, + { + "epoch": 0.00242, + "grad_norm": 0.3544396758079529, + "grad_norm_var": 0.0007310749754719385, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.7899482250213623, + "loss/hidden": 0.0, + "loss/logits": 0.1741911694407463, + "loss/reg": 3.4251747131347656, + "step": 242 + }, + { + "epoch": 0.00243, + "grad_norm": 0.3156209886074066, + "grad_norm_var": 0.0008171231835736463, + "learning_rate": 5e-05, + "loss": 0.159, + "loss/crossentropy": 2.7414376735687256, + "loss/hidden": 0.0, + "loss/logits": 0.15898872911930084, + "loss/reg": 3.421576976776123, + "step": 243 + }, + { + "epoch": 0.00244, + "grad_norm": 0.3353999853134155, + "grad_norm_var": 0.000749955482525048, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.707472503185272, + "loss/hidden": 0.0, + "loss/logits": 0.16693224385380745, + "loss/reg": 3.417820930480957, + "step": 244 + }, + { + "epoch": 0.00245, + "grad_norm": 0.32766133546829224, + "grad_norm_var": 0.0007658640613261528, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.6950490474700928, + "loss/hidden": 0.0, + "loss/logits": 0.17608999833464622, + "loss/reg": 3.414095640182495, + "step": 245 + }, + { + "epoch": 0.00246, + "grad_norm": 0.31360548734664917, + "grad_norm_var": 0.0008368534177580581, + "learning_rate": 5e-05, + "loss": 0.1578, + "loss/crossentropy": 2.6977627873420715, + "loss/hidden": 0.0, + "loss/logits": 0.15783175826072693, + "loss/reg": 3.409533739089966, + "step": 246 + }, + { + "epoch": 0.00247, + "grad_norm": 0.35324403643608093, + "grad_norm_var": 0.0007744365123312817, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.8509859442710876, + "loss/hidden": 0.0, + "loss/logits": 0.16875524446368217, + "loss/reg": 3.4052798748016357, + "step": 247 + }, + { + "epoch": 0.00248, + "grad_norm": 0.41796907782554626, + "grad_norm_var": 0.0010967197155327421, + "learning_rate": 5e-05, + "loss": 0.18, + "loss/crossentropy": 2.701251804828644, + "loss/hidden": 0.0, + "loss/logits": 0.1800428181886673, + "loss/reg": 3.4006803035736084, + "step": 248 + }, + { + "epoch": 0.00249, + "grad_norm": 0.33844876289367676, + "grad_norm_var": 0.0010247223552569313, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.7646324038505554, + "loss/hidden": 0.0, + "loss/logits": 0.17367269843816757, + "loss/reg": 3.3968873023986816, + "step": 249 + }, + { + "epoch": 0.0025, + "grad_norm": 0.31011876463890076, + "grad_norm_var": 0.0011285754764581786, + "learning_rate": 5e-05, + "loss": 0.1591, + "loss/crossentropy": 2.7303661704063416, + "loss/hidden": 0.0, + "loss/logits": 0.15912269055843353, + "loss/reg": 3.3925790786743164, + "step": 250 + }, + { + "epoch": 0.00251, + "grad_norm": 0.4837491512298584, + "grad_norm_var": 0.0022679356659945546, + "learning_rate": 5e-05, + "loss": 0.1845, + "loss/crossentropy": 2.718783438205719, + "loss/hidden": 0.0, + "loss/logits": 0.18454211205244064, + "loss/reg": 3.389193296432495, + "step": 251 + }, + { + "epoch": 0.00252, + "grad_norm": 0.30302709341049194, + "grad_norm_var": 0.002342444325573334, + "learning_rate": 5e-05, + "loss": 0.1527, + "loss/crossentropy": 2.7513213753700256, + "loss/hidden": 0.0, + "loss/logits": 0.15272299572825432, + "loss/reg": 3.384976863861084, + "step": 252 + }, + { + "epoch": 0.00253, + "grad_norm": 0.3376671075820923, + "grad_norm_var": 0.002352530797232196, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.7082377672195435, + "loss/hidden": 0.0, + "loss/logits": 0.1717442087829113, + "loss/reg": 3.381958246231079, + "step": 253 + }, + { + "epoch": 0.00254, + "grad_norm": 0.3470434546470642, + "grad_norm_var": 0.00215032290339258, + "learning_rate": 5e-05, + "loss": 0.1751, + "loss/crossentropy": 2.7747623324394226, + "loss/hidden": 0.0, + "loss/logits": 0.17506984621286392, + "loss/reg": 3.3780975341796875, + "step": 254 + }, + { + "epoch": 0.00255, + "grad_norm": 0.35893791913986206, + "grad_norm_var": 0.002129806794166807, + "learning_rate": 5e-05, + "loss": 0.176, + "loss/crossentropy": 2.6670790910720825, + "loss/hidden": 0.0, + "loss/logits": 0.17602670192718506, + "loss/reg": 3.374431848526001, + "step": 255 + }, + { + "epoch": 0.00256, + "grad_norm": 0.33274415135383606, + "grad_norm_var": 0.002050384071076557, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.930284321308136, + "loss/hidden": 0.0, + "loss/logits": 0.16833152994513512, + "loss/reg": 3.3709969520568848, + "step": 256 + }, + { + "epoch": 0.00257, + "grad_norm": 0.3107251822948456, + "grad_norm_var": 0.0021031284267367073, + "learning_rate": 5e-05, + "loss": 0.154, + "loss/crossentropy": 2.7738651037216187, + "loss/hidden": 0.0, + "loss/logits": 0.15401111543178558, + "loss/reg": 3.36681866645813, + "step": 257 + }, + { + "epoch": 0.00258, + "grad_norm": 0.3238702118396759, + "grad_norm_var": 0.00212825610345269, + "learning_rate": 5e-05, + "loss": 0.1485, + "loss/crossentropy": 2.7926384806632996, + "loss/hidden": 0.0, + "loss/logits": 0.14850713685154915, + "loss/reg": 3.363298177719116, + "step": 258 + }, + { + "epoch": 0.00259, + "grad_norm": 0.3937188982963562, + "grad_norm_var": 0.0022101531057158843, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.732594311237335, + "loss/hidden": 0.0, + "loss/logits": 0.17957409471273422, + "loss/reg": 3.3592464923858643, + "step": 259 + }, + { + "epoch": 0.0026, + "grad_norm": 0.35869738459587097, + "grad_norm_var": 0.002201067050087302, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.6412158608436584, + "loss/hidden": 0.0, + "loss/logits": 0.1607726439833641, + "loss/reg": 3.355531692504883, + "step": 260 + }, + { + "epoch": 0.00261, + "grad_norm": 0.342753529548645, + "grad_norm_var": 0.002168938888458849, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.8253002762794495, + "loss/hidden": 0.0, + "loss/logits": 0.16785955801606178, + "loss/reg": 3.3510327339172363, + "step": 261 + }, + { + "epoch": 0.00262, + "grad_norm": 0.3396557867527008, + "grad_norm_var": 0.0020792270475482005, + "learning_rate": 5e-05, + "loss": 0.1719, + "loss/crossentropy": 2.5446697473526, + "loss/hidden": 0.0, + "loss/logits": 0.1719457022845745, + "loss/reg": 3.3462460041046143, + "step": 262 + }, + { + "epoch": 0.00263, + "grad_norm": 0.326615571975708, + "grad_norm_var": 0.002123647634079288, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.7185133695602417, + "loss/hidden": 0.0, + "loss/logits": 0.16621045768260956, + "loss/reg": 3.342698097229004, + "step": 263 + }, + { + "epoch": 0.00264, + "grad_norm": 0.372024804353714, + "grad_norm_var": 0.0018490612448516057, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.90339195728302, + "loss/hidden": 0.0, + "loss/logits": 0.17848360165953636, + "loss/reg": 3.3390297889709473, + "step": 264 + }, + { + "epoch": 0.00265, + "grad_norm": 0.336412638425827, + "grad_norm_var": 0.0018521135396843155, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.6998194456100464, + "loss/hidden": 0.0, + "loss/logits": 0.1684669330716133, + "loss/reg": 3.3360044956207275, + "step": 265 + }, + { + "epoch": 0.00266, + "grad_norm": 0.3179170787334442, + "grad_norm_var": 0.0018158920564407192, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.713620126247406, + "loss/hidden": 0.0, + "loss/logits": 0.16402245312929153, + "loss/reg": 3.33297061920166, + "step": 266 + }, + { + "epoch": 0.00267, + "grad_norm": 0.32180216908454895, + "grad_norm_var": 0.0005475447645484169, + "learning_rate": 5e-05, + "loss": 0.1561, + "loss/crossentropy": 2.8285736441612244, + "loss/hidden": 0.0, + "loss/logits": 0.15614933148026466, + "loss/reg": 3.330070972442627, + "step": 267 + }, + { + "epoch": 0.00268, + "grad_norm": 0.34155359864234924, + "grad_norm_var": 0.00045564919377512797, + "learning_rate": 5e-05, + "loss": 0.1666, + "loss/crossentropy": 2.698326587677002, + "loss/hidden": 0.0, + "loss/logits": 0.16663997247815132, + "loss/reg": 3.326782464981079, + "step": 268 + }, + { + "epoch": 0.00269, + "grad_norm": 0.3281239867210388, + "grad_norm_var": 0.0004660702159405468, + "learning_rate": 5e-05, + "loss": 0.1547, + "loss/crossentropy": 2.7132135033607483, + "loss/hidden": 0.0, + "loss/logits": 0.15473050251603127, + "loss/reg": 3.3232901096343994, + "step": 269 + }, + { + "epoch": 0.0027, + "grad_norm": 0.3694444000720978, + "grad_norm_var": 0.0005161187813034911, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.943029820919037, + "loss/hidden": 0.0, + "loss/logits": 0.16579603031277657, + "loss/reg": 3.319425344467163, + "step": 270 + }, + { + "epoch": 0.00271, + "grad_norm": 0.3521305024623871, + "grad_norm_var": 0.0005038113254072218, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.813421130180359, + "loss/hidden": 0.0, + "loss/logits": 0.17624986171722412, + "loss/reg": 3.31587553024292, + "step": 271 + }, + { + "epoch": 0.00272, + "grad_norm": 0.3419167995452881, + "grad_norm_var": 0.0004980410714001496, + "learning_rate": 5e-05, + "loss": 0.1579, + "loss/crossentropy": 2.6725985407829285, + "loss/hidden": 0.0, + "loss/logits": 0.1579086296260357, + "loss/reg": 3.313774347305298, + "step": 272 + }, + { + "epoch": 0.00273, + "grad_norm": 0.45973771810531616, + "grad_norm_var": 0.001257799356739812, + "learning_rate": 5e-05, + "loss": 0.1806, + "loss/crossentropy": 2.7593576908111572, + "loss/hidden": 0.0, + "loss/logits": 0.1806396320462227, + "loss/reg": 3.311671257019043, + "step": 273 + }, + { + "epoch": 0.00274, + "grad_norm": 0.327812522649765, + "grad_norm_var": 0.00124416933097297, + "learning_rate": 5e-05, + "loss": 0.1544, + "loss/crossentropy": 2.7368595004081726, + "loss/hidden": 0.0, + "loss/logits": 0.15438436716794968, + "loss/reg": 3.308312177658081, + "step": 274 + }, + { + "epoch": 0.00275, + "grad_norm": 0.43593037128448486, + "grad_norm_var": 0.001590926391083336, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.8178694248199463, + "loss/hidden": 0.0, + "loss/logits": 0.17214355245232582, + "loss/reg": 3.30526065826416, + "step": 275 + }, + { + "epoch": 0.00276, + "grad_norm": 0.361247181892395, + "grad_norm_var": 0.0015927484925991053, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.734869599342346, + "loss/hidden": 0.0, + "loss/logits": 0.17345493659377098, + "loss/reg": 3.3015174865722656, + "step": 276 + }, + { + "epoch": 0.00277, + "grad_norm": 0.3708873689174652, + "grad_norm_var": 0.0015974331537993436, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.67022705078125, + "loss/hidden": 0.0, + "loss/logits": 0.16277828440070152, + "loss/reg": 3.2979342937469482, + "step": 277 + }, + { + "epoch": 0.00278, + "grad_norm": 0.3481086790561676, + "grad_norm_var": 0.0015829700282981919, + "learning_rate": 5e-05, + "loss": 0.1577, + "loss/crossentropy": 2.700168251991272, + "loss/hidden": 0.0, + "loss/logits": 0.15773406997323036, + "loss/reg": 3.294177532196045, + "step": 278 + }, + { + "epoch": 0.00279, + "grad_norm": 0.5134589076042175, + "grad_norm_var": 0.003008442642246208, + "learning_rate": 5e-05, + "loss": 0.1766, + "loss/crossentropy": 2.9033528566360474, + "loss/hidden": 0.0, + "loss/logits": 0.17655130848288536, + "loss/reg": 3.2909915447235107, + "step": 279 + }, + { + "epoch": 0.0028, + "grad_norm": 0.38205716013908386, + "grad_norm_var": 0.0030192383608610503, + "learning_rate": 5e-05, + "loss": 0.1934, + "loss/crossentropy": 2.7160211205482483, + "loss/hidden": 0.0, + "loss/logits": 0.19341961666941643, + "loss/reg": 3.2878634929656982, + "step": 280 + }, + { + "epoch": 0.00281, + "grad_norm": 0.3628558814525604, + "grad_norm_var": 0.002947045102075485, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.6912715435028076, + "loss/hidden": 0.0, + "loss/logits": 0.17094064503908157, + "loss/reg": 3.284353256225586, + "step": 281 + }, + { + "epoch": 0.00282, + "grad_norm": 0.32757696509361267, + "grad_norm_var": 0.002884588952342071, + "learning_rate": 5e-05, + "loss": 0.1702, + "loss/crossentropy": 2.7818912267684937, + "loss/hidden": 0.0, + "loss/logits": 0.17016061395406723, + "loss/reg": 3.281097412109375, + "step": 282 + }, + { + "epoch": 0.00283, + "grad_norm": 0.5035110712051392, + "grad_norm_var": 0.003743174506031214, + "learning_rate": 5e-05, + "loss": 0.1891, + "loss/crossentropy": 2.8356027603149414, + "loss/hidden": 0.0, + "loss/logits": 0.1890563629567623, + "loss/reg": 3.277557611465454, + "step": 283 + }, + { + "epoch": 0.00284, + "grad_norm": 0.4021988809108734, + "grad_norm_var": 0.003638735284583853, + "learning_rate": 5e-05, + "loss": 0.1574, + "loss/crossentropy": 2.768595337867737, + "loss/hidden": 0.0, + "loss/logits": 0.15735788643360138, + "loss/reg": 3.274083375930786, + "step": 284 + }, + { + "epoch": 0.00285, + "grad_norm": 0.3557356297969818, + "grad_norm_var": 0.0034707811870306284, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.7620763182640076, + "loss/hidden": 0.0, + "loss/logits": 0.16183337569236755, + "loss/reg": 3.2709619998931885, + "step": 285 + }, + { + "epoch": 0.00286, + "grad_norm": 0.35383257269859314, + "grad_norm_var": 0.0035254991255895857, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.7777530550956726, + "loss/hidden": 0.0, + "loss/logits": 0.16404272243380547, + "loss/reg": 3.2678396701812744, + "step": 286 + }, + { + "epoch": 0.00287, + "grad_norm": 0.3291935324668884, + "grad_norm_var": 0.0036663583934404683, + "learning_rate": 5e-05, + "loss": 0.1545, + "loss/crossentropy": 2.7828534841537476, + "loss/hidden": 0.0, + "loss/logits": 0.15450828149914742, + "loss/reg": 3.2639055252075195, + "step": 287 + }, + { + "epoch": 0.00288, + "grad_norm": 0.3174595534801483, + "grad_norm_var": 0.003847509504795695, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.7973127365112305, + "loss/hidden": 0.0, + "loss/logits": 0.16296324506402016, + "loss/reg": 3.2608911991119385, + "step": 288 + }, + { + "epoch": 0.00289, + "grad_norm": 0.3723791539669037, + "grad_norm_var": 0.0034478366033269445, + "learning_rate": 5e-05, + "loss": 0.1757, + "loss/crossentropy": 2.630415976047516, + "loss/hidden": 0.0, + "loss/logits": 0.17573364078998566, + "loss/reg": 3.257523775100708, + "step": 289 + }, + { + "epoch": 0.0029, + "grad_norm": 0.38034215569496155, + "grad_norm_var": 0.003261674725795945, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.704579532146454, + "loss/hidden": 0.0, + "loss/logits": 0.16892588511109352, + "loss/reg": 3.254065752029419, + "step": 290 + }, + { + "epoch": 0.00291, + "grad_norm": 1.2464065551757812, + "grad_norm_var": 0.05011180607704591, + "learning_rate": 5e-05, + "loss": 0.1921, + "loss/crossentropy": 2.8595897555351257, + "loss/hidden": 0.0, + "loss/logits": 0.19211571291089058, + "loss/reg": 3.2510294914245605, + "step": 291 + }, + { + "epoch": 0.00292, + "grad_norm": 0.3307066559791565, + "grad_norm_var": 0.05046209325623486, + "learning_rate": 5e-05, + "loss": 0.1681, + "loss/crossentropy": 2.8195151686668396, + "loss/hidden": 0.0, + "loss/logits": 0.16810721158981323, + "loss/reg": 3.2478625774383545, + "step": 292 + }, + { + "epoch": 0.00293, + "grad_norm": 0.33664193749427795, + "grad_norm_var": 0.05081007066127065, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.912789523601532, + "loss/hidden": 0.0, + "loss/logits": 0.16015203669667244, + "loss/reg": 3.2451751232147217, + "step": 293 + }, + { + "epoch": 0.00294, + "grad_norm": 0.42365437746047974, + "grad_norm_var": 0.05035293502217161, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.7459517121315002, + "loss/hidden": 0.0, + "loss/logits": 0.18333038315176964, + "loss/reg": 3.242023229598999, + "step": 294 + }, + { + "epoch": 0.00295, + "grad_norm": 0.40840578079223633, + "grad_norm_var": 0.049924464393714924, + "learning_rate": 5e-05, + "loss": 0.176, + "loss/crossentropy": 2.778249144554138, + "loss/hidden": 0.0, + "loss/logits": 0.17595936357975006, + "loss/reg": 3.2388389110565186, + "step": 295 + }, + { + "epoch": 0.00296, + "grad_norm": 0.3591618835926056, + "grad_norm_var": 0.05009460642018338, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7121748328208923, + "loss/hidden": 0.0, + "loss/logits": 0.17217102646827698, + "loss/reg": 3.2364017963409424, + "step": 296 + }, + { + "epoch": 0.00297, + "grad_norm": 0.31615006923675537, + "grad_norm_var": 0.05062186135785553, + "learning_rate": 5e-05, + "loss": 0.1488, + "loss/crossentropy": 2.7933038473129272, + "loss/hidden": 0.0, + "loss/logits": 0.14879318699240685, + "loss/reg": 3.2344272136688232, + "step": 297 + }, + { + "epoch": 0.00298, + "grad_norm": 0.3586377799510956, + "grad_norm_var": 0.050288172636787816, + "learning_rate": 5e-05, + "loss": 0.1492, + "loss/crossentropy": 2.869333803653717, + "loss/hidden": 0.0, + "loss/logits": 0.14924299344420433, + "loss/reg": 3.232086181640625, + "step": 298 + }, + { + "epoch": 0.00299, + "grad_norm": 0.37798771262168884, + "grad_norm_var": 0.04995309393064352, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.913083255290985, + "loss/hidden": 0.0, + "loss/logits": 0.1651969812810421, + "loss/reg": 3.2296016216278076, + "step": 299 + }, + { + "epoch": 0.003, + "grad_norm": 0.5914519429206848, + "grad_norm_var": 0.05182304954391634, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.8007007241249084, + "loss/hidden": 0.0, + "loss/logits": 0.18929021432995796, + "loss/reg": 3.2266576290130615, + "step": 300 + }, + { + "epoch": 0.00301, + "grad_norm": 0.3292617201805115, + "grad_norm_var": 0.05212417516215169, + "learning_rate": 5e-05, + "loss": 0.16, + "loss/crossentropy": 2.7326099276542664, + "loss/hidden": 0.0, + "loss/logits": 0.15998771041631699, + "loss/reg": 3.2236239910125732, + "step": 301 + }, + { + "epoch": 0.00302, + "grad_norm": 0.3807355761528015, + "grad_norm_var": 0.05190702763823275, + "learning_rate": 5e-05, + "loss": 0.1831, + "loss/crossentropy": 2.7199636101722717, + "loss/hidden": 0.0, + "loss/logits": 0.1831417679786682, + "loss/reg": 3.220425844192505, + "step": 302 + }, + { + "epoch": 0.00303, + "grad_norm": 0.4008902907371521, + "grad_norm_var": 0.05127743798183474, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.7570589184761047, + "loss/hidden": 0.0, + "loss/logits": 0.177694384008646, + "loss/reg": 3.2171859741210938, + "step": 303 + }, + { + "epoch": 0.00304, + "grad_norm": 0.35962697863578796, + "grad_norm_var": 0.05073816419262332, + "learning_rate": 5e-05, + "loss": 0.1574, + "loss/crossentropy": 2.809377670288086, + "loss/hidden": 0.0, + "loss/logits": 0.15741629898548126, + "loss/reg": 3.2137656211853027, + "step": 304 + }, + { + "epoch": 0.00305, + "grad_norm": 0.404453843832016, + "grad_norm_var": 0.05053133217663517, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.6374824047088623, + "loss/hidden": 0.0, + "loss/logits": 0.19603004679083824, + "loss/reg": 3.2106897830963135, + "step": 305 + }, + { + "epoch": 0.00306, + "grad_norm": 0.3411775231361389, + "grad_norm_var": 0.05092714807133293, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.7098072171211243, + "loss/hidden": 0.0, + "loss/logits": 0.17005891352891922, + "loss/reg": 3.2072975635528564, + "step": 306 + }, + { + "epoch": 0.00307, + "grad_norm": 0.48913073539733887, + "grad_norm_var": 0.004874772049479148, + "learning_rate": 5e-05, + "loss": 0.2396, + "loss/crossentropy": 2.8529589772224426, + "loss/hidden": 0.0, + "loss/logits": 0.23958228901028633, + "loss/reg": 3.2032482624053955, + "step": 307 + }, + { + "epoch": 0.00308, + "grad_norm": 0.3359135389328003, + "grad_norm_var": 0.004836687315538634, + "learning_rate": 5e-05, + "loss": 0.154, + "loss/crossentropy": 2.825865149497986, + "loss/hidden": 0.0, + "loss/logits": 0.15401727706193924, + "loss/reg": 3.2001187801361084, + "step": 308 + }, + { + "epoch": 0.00309, + "grad_norm": 0.3673790693283081, + "grad_norm_var": 0.004683902714770716, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.757752239704132, + "loss/hidden": 0.0, + "loss/logits": 0.1700810343027115, + "loss/reg": 3.197371244430542, + "step": 309 + }, + { + "epoch": 0.0031, + "grad_norm": 0.3675878643989563, + "grad_norm_var": 0.004630661781797581, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.9055893421173096, + "loss/hidden": 0.0, + "loss/logits": 0.17424792051315308, + "loss/reg": 3.193922758102417, + "step": 310 + }, + { + "epoch": 0.00311, + "grad_norm": 0.3216918110847473, + "grad_norm_var": 0.004850203191397428, + "learning_rate": 5e-05, + "loss": 0.1529, + "loss/crossentropy": 2.7421942353248596, + "loss/hidden": 0.0, + "loss/logits": 0.15289029106497765, + "loss/reg": 3.1913256645202637, + "step": 311 + }, + { + "epoch": 0.00312, + "grad_norm": 0.30283358693122864, + "grad_norm_var": 0.005214980747276743, + "learning_rate": 5e-05, + "loss": 0.1501, + "loss/crossentropy": 2.68456107378006, + "loss/hidden": 0.0, + "loss/logits": 0.15012749284505844, + "loss/reg": 3.188302755355835, + "step": 312 + }, + { + "epoch": 0.00313, + "grad_norm": 0.3840731978416443, + "grad_norm_var": 0.004944937932171186, + "learning_rate": 5e-05, + "loss": 0.1565, + "loss/crossentropy": 2.7386457920074463, + "loss/hidden": 0.0, + "loss/logits": 0.15645165741443634, + "loss/reg": 3.1856327056884766, + "step": 313 + }, + { + "epoch": 0.00314, + "grad_norm": 0.3471456468105316, + "grad_norm_var": 0.004989069609234416, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.8941837549209595, + "loss/hidden": 0.0, + "loss/logits": 0.16613885760307312, + "loss/reg": 3.182285785675049, + "step": 314 + }, + { + "epoch": 0.00315, + "grad_norm": 0.40083590149879456, + "grad_norm_var": 0.005011503442318803, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.830922782421112, + "loss/hidden": 0.0, + "loss/logits": 0.18162691593170166, + "loss/reg": 3.1795294284820557, + "step": 315 + }, + { + "epoch": 0.00316, + "grad_norm": 0.40112313628196716, + "grad_norm_var": 0.001979603921073251, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.5529216527938843, + "loss/hidden": 0.0, + "loss/logits": 0.1629548817873001, + "loss/reg": 3.1763863563537598, + "step": 316 + }, + { + "epoch": 0.00317, + "grad_norm": 0.3936459720134735, + "grad_norm_var": 0.001881530067811854, + "learning_rate": 5e-05, + "loss": 0.1851, + "loss/crossentropy": 2.780943751335144, + "loss/hidden": 0.0, + "loss/logits": 0.18506472185254097, + "loss/reg": 3.173980474472046, + "step": 317 + }, + { + "epoch": 0.00318, + "grad_norm": 0.3727342486381531, + "grad_norm_var": 0.0018792953911145364, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.76874041557312, + "loss/hidden": 0.0, + "loss/logits": 0.1826922371983528, + "loss/reg": 3.1704628467559814, + "step": 318 + }, + { + "epoch": 0.00319, + "grad_norm": 0.3470066785812378, + "grad_norm_var": 0.0018703712702832478, + "learning_rate": 5e-05, + "loss": 0.1598, + "loss/crossentropy": 2.740228831768036, + "loss/hidden": 0.0, + "loss/logits": 0.1597808077931404, + "loss/reg": 3.1674654483795166, + "step": 319 + }, + { + "epoch": 0.0032, + "grad_norm": 0.3653993010520935, + "grad_norm_var": 0.0018636832816178708, + "learning_rate": 5e-05, + "loss": 0.1549, + "loss/crossentropy": 2.883521616458893, + "loss/hidden": 0.0, + "loss/logits": 0.15490083023905754, + "loss/reg": 3.1641883850097656, + "step": 320 + }, + { + "epoch": 0.00321, + "grad_norm": 0.3510638475418091, + "grad_norm_var": 0.0018064205203171017, + "learning_rate": 5e-05, + "loss": 0.1577, + "loss/crossentropy": 2.9007150530815125, + "loss/hidden": 0.0, + "loss/logits": 0.15773681923747063, + "loss/reg": 3.1610915660858154, + "step": 321 + }, + { + "epoch": 0.00322, + "grad_norm": 0.5068875551223755, + "grad_norm_var": 0.0029290004167608335, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.716179847717285, + "loss/hidden": 0.0, + "loss/logits": 0.2013130635023117, + "loss/reg": 3.1571173667907715, + "step": 322 + }, + { + "epoch": 0.00323, + "grad_norm": 0.40178200602531433, + "grad_norm_var": 0.0021162756618779235, + "learning_rate": 5e-05, + "loss": 0.1809, + "loss/crossentropy": 2.9381837844848633, + "loss/hidden": 0.0, + "loss/logits": 0.1809130534529686, + "loss/reg": 3.1540093421936035, + "step": 323 + }, + { + "epoch": 0.00324, + "grad_norm": 0.35252845287323, + "grad_norm_var": 0.0020514948206895294, + "learning_rate": 5e-05, + "loss": 0.1617, + "loss/crossentropy": 2.743869721889496, + "loss/hidden": 0.0, + "loss/logits": 0.16171807795763016, + "loss/reg": 3.1503779888153076, + "step": 324 + }, + { + "epoch": 0.00325, + "grad_norm": 0.36802011728286743, + "grad_norm_var": 0.0020509560983741053, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.934039294719696, + "loss/hidden": 0.0, + "loss/logits": 0.17217914387583733, + "loss/reg": 3.1467440128326416, + "step": 325 + }, + { + "epoch": 0.00326, + "grad_norm": 0.4267319142818451, + "grad_norm_var": 0.0022188398751517274, + "learning_rate": 5e-05, + "loss": 0.1924, + "loss/crossentropy": 2.802468180656433, + "loss/hidden": 0.0, + "loss/logits": 0.19241869449615479, + "loss/reg": 3.1427693367004395, + "step": 326 + }, + { + "epoch": 0.00327, + "grad_norm": 0.34044548869132996, + "grad_norm_var": 0.0021007258044081806, + "learning_rate": 5e-05, + "loss": 0.1522, + "loss/crossentropy": 2.8443135619163513, + "loss/hidden": 0.0, + "loss/logits": 0.1522187888622284, + "loss/reg": 3.1388471126556396, + "step": 327 + }, + { + "epoch": 0.00328, + "grad_norm": 0.4276120066642761, + "grad_norm_var": 0.001808451579785623, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.8915366530418396, + "loss/hidden": 0.0, + "loss/logits": 0.17084889113903046, + "loss/reg": 3.134800434112549, + "step": 328 + }, + { + "epoch": 0.00329, + "grad_norm": 0.3486219346523285, + "grad_norm_var": 0.0018993689379833108, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.575106978416443, + "loss/hidden": 0.0, + "loss/logits": 0.16874929517507553, + "loss/reg": 3.1303272247314453, + "step": 329 + }, + { + "epoch": 0.0033, + "grad_norm": 0.365105539560318, + "grad_norm_var": 0.0018301403367672127, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.6813217401504517, + "loss/hidden": 0.0, + "loss/logits": 0.18416454643011093, + "loss/reg": 3.1269419193267822, + "step": 330 + }, + { + "epoch": 0.00331, + "grad_norm": 0.5757820010185242, + "grad_norm_var": 0.004098500311938921, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.9679067730903625, + "loss/hidden": 0.0, + "loss/logits": 0.19354857876896858, + "loss/reg": 3.1228113174438477, + "step": 331 + }, + { + "epoch": 0.00332, + "grad_norm": 0.405617356300354, + "grad_norm_var": 0.004102514647771457, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.843691408634186, + "loss/hidden": 0.0, + "loss/logits": 0.1716487891972065, + "loss/reg": 3.120131015777588, + "step": 332 + }, + { + "epoch": 0.00333, + "grad_norm": 0.3825243413448334, + "grad_norm_var": 0.004114939464605513, + "learning_rate": 5e-05, + "loss": 0.1692, + "loss/crossentropy": 2.722069561481476, + "loss/hidden": 0.0, + "loss/logits": 0.16915880143642426, + "loss/reg": 3.117812395095825, + "step": 333 + }, + { + "epoch": 0.00334, + "grad_norm": 0.38414397835731506, + "grad_norm_var": 0.004087504594686679, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.727014124393463, + "loss/hidden": 0.0, + "loss/logits": 0.1677936352789402, + "loss/reg": 3.115847587585449, + "step": 334 + }, + { + "epoch": 0.00335, + "grad_norm": 0.4531976580619812, + "grad_norm_var": 0.0040868556651997325, + "learning_rate": 5e-05, + "loss": 0.1726, + "loss/crossentropy": 2.5817691683769226, + "loss/hidden": 0.0, + "loss/logits": 0.17255331575870514, + "loss/reg": 3.1140594482421875, + "step": 335 + }, + { + "epoch": 0.00336, + "grad_norm": 0.33963721990585327, + "grad_norm_var": 0.004259094561609115, + "learning_rate": 5e-05, + "loss": 0.1605, + "loss/crossentropy": 2.7009602189064026, + "loss/hidden": 0.0, + "loss/logits": 0.16050074249505997, + "loss/reg": 3.1128299236297607, + "step": 336 + }, + { + "epoch": 0.00337, + "grad_norm": 0.36085280776023865, + "grad_norm_var": 0.00419878945557195, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.7012510299682617, + "loss/hidden": 0.0, + "loss/logits": 0.16410352289676666, + "loss/reg": 3.109898805618286, + "step": 337 + }, + { + "epoch": 0.00338, + "grad_norm": 0.3331363797187805, + "grad_norm_var": 0.003666565441549352, + "learning_rate": 5e-05, + "loss": 0.1619, + "loss/crossentropy": 2.802642047405243, + "loss/hidden": 0.0, + "loss/logits": 0.1618719846010208, + "loss/reg": 3.1067681312561035, + "step": 338 + }, + { + "epoch": 0.00339, + "grad_norm": 0.41531723737716675, + "grad_norm_var": 0.003696375336840474, + "learning_rate": 5e-05, + "loss": 0.1877, + "loss/crossentropy": 2.622368335723877, + "loss/hidden": 0.0, + "loss/logits": 0.18770882859826088, + "loss/reg": 3.103231191635132, + "step": 339 + }, + { + "epoch": 0.0034, + "grad_norm": 0.3483443260192871, + "grad_norm_var": 0.0037197436901762657, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.7523834109306335, + "loss/hidden": 0.0, + "loss/logits": 0.16034872457385063, + "loss/reg": 3.1008808612823486, + "step": 340 + }, + { + "epoch": 0.00341, + "grad_norm": 0.46117284893989563, + "grad_norm_var": 0.003961845355148208, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.820544958114624, + "loss/hidden": 0.0, + "loss/logits": 0.18007208034396172, + "loss/reg": 3.098954439163208, + "step": 341 + }, + { + "epoch": 0.00342, + "grad_norm": 0.38020917773246765, + "grad_norm_var": 0.003918987421685794, + "learning_rate": 5e-05, + "loss": 0.1711, + "loss/crossentropy": 2.864526093006134, + "loss/hidden": 0.0, + "loss/logits": 0.1710633859038353, + "loss/reg": 3.0957016944885254, + "step": 342 + }, + { + "epoch": 0.00343, + "grad_norm": 0.3978392779827118, + "grad_norm_var": 0.0037065638898653073, + "learning_rate": 5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.859494388103485, + "loss/hidden": 0.0, + "loss/logits": 0.1782137230038643, + "loss/reg": 3.09333872795105, + "step": 343 + }, + { + "epoch": 0.00344, + "grad_norm": 0.36975786089897156, + "grad_norm_var": 0.0036926924317912187, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.7827839255332947, + "loss/hidden": 0.0, + "loss/logits": 0.1657763496041298, + "loss/reg": 3.090590715408325, + "step": 344 + }, + { + "epoch": 0.00345, + "grad_norm": 0.333897203207016, + "grad_norm_var": 0.0037974520830182084, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.7804853320121765, + "loss/hidden": 0.0, + "loss/logits": 0.1611798331141472, + "loss/reg": 3.0877137184143066, + "step": 345 + }, + { + "epoch": 0.00346, + "grad_norm": 0.43794891238212585, + "grad_norm_var": 0.0038469119530984567, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.7229984402656555, + "loss/hidden": 0.0, + "loss/logits": 0.1677897423505783, + "loss/reg": 3.085155725479126, + "step": 346 + }, + { + "epoch": 0.00347, + "grad_norm": 0.33257824182510376, + "grad_norm_var": 0.0018017603976316725, + "learning_rate": 5e-05, + "loss": 0.1745, + "loss/crossentropy": 2.7936434745788574, + "loss/hidden": 0.0, + "loss/logits": 0.17448442056775093, + "loss/reg": 3.0829925537109375, + "step": 347 + }, + { + "epoch": 0.00348, + "grad_norm": 0.393646240234375, + "grad_norm_var": 0.001775431972661142, + "learning_rate": 5e-05, + "loss": 0.1647, + "loss/crossentropy": 2.8590177297592163, + "loss/hidden": 0.0, + "loss/logits": 0.16474304348230362, + "loss/reg": 3.080383062362671, + "step": 348 + }, + { + "epoch": 0.00349, + "grad_norm": 0.34549105167388916, + "grad_norm_var": 0.0018623256252658482, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.7182729840278625, + "loss/hidden": 0.0, + "loss/logits": 0.16776488721370697, + "loss/reg": 3.0792622566223145, + "step": 349 + }, + { + "epoch": 0.0035, + "grad_norm": 0.9833559393882751, + "grad_norm_var": 0.024598539346201563, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.720784068107605, + "loss/hidden": 0.0, + "loss/logits": 0.19106518849730492, + "loss/reg": 3.0780370235443115, + "step": 350 + }, + { + "epoch": 0.00351, + "grad_norm": 0.3550430238246918, + "grad_norm_var": 0.02473872020472854, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.7388776540756226, + "loss/hidden": 0.0, + "loss/logits": 0.16827991232275963, + "loss/reg": 3.07700777053833, + "step": 351 + }, + { + "epoch": 0.00352, + "grad_norm": 0.32236865162849426, + "grad_norm_var": 0.024923428623413246, + "learning_rate": 5e-05, + "loss": 0.1555, + "loss/crossentropy": 2.787019371986389, + "loss/hidden": 0.0, + "loss/logits": 0.1554781049489975, + "loss/reg": 3.075169324874878, + "step": 352 + }, + { + "epoch": 0.00353, + "grad_norm": 0.34089383482933044, + "grad_norm_var": 0.025080939274787682, + "learning_rate": 5e-05, + "loss": 0.1625, + "loss/crossentropy": 2.763973832130432, + "loss/hidden": 0.0, + "loss/logits": 0.1625489443540573, + "loss/reg": 3.0732734203338623, + "step": 353 + }, + { + "epoch": 0.00354, + "grad_norm": 0.35467466711997986, + "grad_norm_var": 0.02489081345717287, + "learning_rate": 5e-05, + "loss": 0.161, + "loss/crossentropy": 2.6696255207061768, + "loss/hidden": 0.0, + "loss/logits": 0.1610397771000862, + "loss/reg": 3.071441411972046, + "step": 354 + }, + { + "epoch": 0.00355, + "grad_norm": 0.3465348184108734, + "grad_norm_var": 0.02514492485323772, + "learning_rate": 5e-05, + "loss": 0.1692, + "loss/crossentropy": 2.8481903076171875, + "loss/hidden": 0.0, + "loss/logits": 0.16923030093312263, + "loss/reg": 3.068796157836914, + "step": 355 + }, + { + "epoch": 0.00356, + "grad_norm": 0.3337138891220093, + "grad_norm_var": 0.025271718941650815, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.734935760498047, + "loss/hidden": 0.0, + "loss/logits": 0.1674855425953865, + "loss/reg": 3.066350221633911, + "step": 356 + }, + { + "epoch": 0.00357, + "grad_norm": 0.3486330509185791, + "grad_norm_var": 0.02522896182872459, + "learning_rate": 5e-05, + "loss": 0.1751, + "loss/crossentropy": 2.8282878398895264, + "loss/hidden": 0.0, + "loss/logits": 0.17508375644683838, + "loss/reg": 3.0633251667022705, + "step": 357 + }, + { + "epoch": 0.00358, + "grad_norm": 0.3714129626750946, + "grad_norm_var": 0.025255292610223575, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.723433256149292, + "loss/hidden": 0.0, + "loss/logits": 0.17982058972120285, + "loss/reg": 3.060797929763794, + "step": 358 + }, + { + "epoch": 0.00359, + "grad_norm": 0.38819992542266846, + "grad_norm_var": 0.025261289598676597, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.5971017479896545, + "loss/hidden": 0.0, + "loss/logits": 0.17904112860560417, + "loss/reg": 3.057884693145752, + "step": 359 + }, + { + "epoch": 0.0036, + "grad_norm": 0.3948271870613098, + "grad_norm_var": 0.02520822524046924, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.631825864315033, + "loss/hidden": 0.0, + "loss/logits": 0.1826026625931263, + "loss/reg": 3.0550246238708496, + "step": 360 + }, + { + "epoch": 0.00361, + "grad_norm": 0.46747469902038574, + "grad_norm_var": 0.025164775676019657, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.628718376159668, + "loss/hidden": 0.0, + "loss/logits": 0.1848563477396965, + "loss/reg": 3.052072525024414, + "step": 361 + }, + { + "epoch": 0.00362, + "grad_norm": 0.9672635197639465, + "grad_norm_var": 0.044838716189940266, + "learning_rate": 5e-05, + "loss": 0.2236, + "loss/crossentropy": 2.7798518538475037, + "loss/hidden": 0.0, + "loss/logits": 0.22359847277402878, + "loss/reg": 3.048891305923462, + "step": 362 + }, + { + "epoch": 0.00363, + "grad_norm": 0.3722783029079437, + "grad_norm_var": 0.04436658011174813, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.7808294892311096, + "loss/hidden": 0.0, + "loss/logits": 0.17498808726668358, + "loss/reg": 3.0457394123077393, + "step": 363 + }, + { + "epoch": 0.00364, + "grad_norm": 0.3547132611274719, + "grad_norm_var": 0.04471680473078544, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.7656018137931824, + "loss/hidden": 0.0, + "loss/logits": 0.16020696610212326, + "loss/reg": 3.043097734451294, + "step": 364 + }, + { + "epoch": 0.00365, + "grad_norm": 0.4774816632270813, + "grad_norm_var": 0.04413484451680517, + "learning_rate": 5e-05, + "loss": 0.1831, + "loss/crossentropy": 2.9051772356033325, + "loss/hidden": 0.0, + "loss/logits": 0.18311960250139236, + "loss/reg": 3.0403990745544434, + "step": 365 + }, + { + "epoch": 0.00366, + "grad_norm": 0.41332709789276123, + "grad_norm_var": 0.0238056716485936, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.8529672026634216, + "loss/hidden": 0.0, + "loss/logits": 0.17049206793308258, + "loss/reg": 3.037370204925537, + "step": 366 + }, + { + "epoch": 0.00367, + "grad_norm": 0.39109355211257935, + "grad_norm_var": 0.023608062717162412, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.796768307685852, + "loss/hidden": 0.0, + "loss/logits": 0.17983367666602135, + "loss/reg": 3.0343563556671143, + "step": 367 + }, + { + "epoch": 0.00368, + "grad_norm": 0.36531057953834534, + "grad_norm_var": 0.023191193861390014, + "learning_rate": 5e-05, + "loss": 0.1715, + "loss/crossentropy": 2.8276050686836243, + "loss/hidden": 0.0, + "loss/logits": 0.1715383380651474, + "loss/reg": 3.031611919403076, + "step": 368 + }, + { + "epoch": 0.00369, + "grad_norm": 0.33283501863479614, + "grad_norm_var": 0.023278092934366657, + "learning_rate": 5e-05, + "loss": 0.1499, + "loss/crossentropy": 2.6641258597373962, + "loss/hidden": 0.0, + "loss/logits": 0.14990831911563873, + "loss/reg": 3.0287797451019287, + "step": 369 + }, + { + "epoch": 0.0037, + "grad_norm": 0.4542810618877411, + "grad_norm_var": 0.023063995994232415, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7453941702842712, + "loss/hidden": 0.0, + "loss/logits": 0.1721569411456585, + "loss/reg": 3.0258800983428955, + "step": 370 + }, + { + "epoch": 0.00371, + "grad_norm": 0.3705763816833496, + "grad_norm_var": 0.022852728398777448, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.635721504688263, + "loss/hidden": 0.0, + "loss/logits": 0.18491211906075478, + "loss/reg": 3.0235960483551025, + "step": 371 + }, + { + "epoch": 0.00372, + "grad_norm": 0.4085729420185089, + "grad_norm_var": 0.022289690361487074, + "learning_rate": 5e-05, + "loss": 0.1871, + "loss/crossentropy": 2.732766628265381, + "loss/hidden": 0.0, + "loss/logits": 0.18710973486304283, + "loss/reg": 3.0203185081481934, + "step": 372 + }, + { + "epoch": 0.00373, + "grad_norm": 0.3334612250328064, + "grad_norm_var": 0.022468457594482887, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.783429443836212, + "loss/hidden": 0.0, + "loss/logits": 0.1662071831524372, + "loss/reg": 3.0171730518341064, + "step": 373 + }, + { + "epoch": 0.00374, + "grad_norm": 0.35536903142929077, + "grad_norm_var": 0.022607616164545874, + "learning_rate": 5e-05, + "loss": 0.1654, + "loss/crossentropy": 2.818749785423279, + "loss/hidden": 0.0, + "loss/logits": 0.16541225090622902, + "loss/reg": 3.0140058994293213, + "step": 374 + }, + { + "epoch": 0.00375, + "grad_norm": 0.348376989364624, + "grad_norm_var": 0.022917750109528078, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.9099320769309998, + "loss/hidden": 0.0, + "loss/logits": 0.1627991460263729, + "loss/reg": 3.010875701904297, + "step": 375 + }, + { + "epoch": 0.00376, + "grad_norm": 0.3394787311553955, + "grad_norm_var": 0.023335225496050292, + "learning_rate": 5e-05, + "loss": 0.1605, + "loss/crossentropy": 2.811407744884491, + "loss/hidden": 0.0, + "loss/logits": 0.16051743179559708, + "loss/reg": 3.0073623657226562, + "step": 376 + }, + { + "epoch": 0.00377, + "grad_norm": 0.42454567551612854, + "grad_norm_var": 0.02319007765550817, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.773725748062134, + "loss/hidden": 0.0, + "loss/logits": 0.16450630128383636, + "loss/reg": 3.004185676574707, + "step": 377 + }, + { + "epoch": 0.00378, + "grad_norm": 0.3412385582923889, + "grad_norm_var": 0.001946629707866813, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.785709500312805, + "loss/hidden": 0.0, + "loss/logits": 0.17420916631817818, + "loss/reg": 3.0007028579711914, + "step": 378 + }, + { + "epoch": 0.00379, + "grad_norm": 0.3544103503227234, + "grad_norm_var": 0.001985417588834304, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.670408546924591, + "loss/hidden": 0.0, + "loss/logits": 0.17155754566192627, + "loss/reg": 2.9972825050354004, + "step": 379 + }, + { + "epoch": 0.0038, + "grad_norm": 0.36286091804504395, + "grad_norm_var": 0.001963109812450625, + "learning_rate": 5e-05, + "loss": 0.1786, + "loss/crossentropy": 2.822770357131958, + "loss/hidden": 0.0, + "loss/logits": 0.17864727228879929, + "loss/reg": 2.993536949157715, + "step": 380 + }, + { + "epoch": 0.00381, + "grad_norm": 0.5003440976142883, + "grad_norm_var": 0.002294225514870618, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.790800392627716, + "loss/hidden": 0.0, + "loss/logits": 0.18961890786886215, + "loss/reg": 2.990283489227295, + "step": 381 + }, + { + "epoch": 0.00382, + "grad_norm": 0.3698294758796692, + "grad_norm_var": 0.0022250210916228584, + "learning_rate": 5e-05, + "loss": 0.1648, + "loss/crossentropy": 2.8308547139167786, + "loss/hidden": 0.0, + "loss/logits": 0.16477400809526443, + "loss/reg": 2.986691474914551, + "step": 382 + }, + { + "epoch": 0.00383, + "grad_norm": 0.36506953835487366, + "grad_norm_var": 0.0022229105132923347, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.744426727294922, + "loss/hidden": 0.0, + "loss/logits": 0.16819821670651436, + "loss/reg": 2.983008861541748, + "step": 383 + }, + { + "epoch": 0.00384, + "grad_norm": 0.3243113160133362, + "grad_norm_var": 0.002390011819316588, + "learning_rate": 5e-05, + "loss": 0.1596, + "loss/crossentropy": 2.89188152551651, + "loss/hidden": 0.0, + "loss/logits": 0.1595698669552803, + "loss/reg": 2.979191541671753, + "step": 384 + }, + { + "epoch": 0.00385, + "grad_norm": 0.36836785078048706, + "grad_norm_var": 0.002273433106164295, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.902570128440857, + "loss/hidden": 0.0, + "loss/logits": 0.1746898777782917, + "loss/reg": 2.975698947906494, + "step": 385 + }, + { + "epoch": 0.00386, + "grad_norm": 0.3365325927734375, + "grad_norm_var": 0.001915978849994604, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.766001045703888, + "loss/hidden": 0.0, + "loss/logits": 0.16081618145108223, + "loss/reg": 2.972667694091797, + "step": 386 + }, + { + "epoch": 0.00387, + "grad_norm": 0.35417604446411133, + "grad_norm_var": 0.0019292530227877358, + "learning_rate": 5e-05, + "loss": 0.1605, + "loss/crossentropy": 2.7814798951148987, + "loss/hidden": 0.0, + "loss/logits": 0.16046970710158348, + "loss/reg": 2.969967842102051, + "step": 387 + }, + { + "epoch": 0.00388, + "grad_norm": 1.9537514448165894, + "grad_norm_var": 0.15952536292831518, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.782427728176117, + "loss/hidden": 0.0, + "loss/logits": 0.1925731934607029, + "loss/reg": 2.9671437740325928, + "step": 388 + }, + { + "epoch": 0.00389, + "grad_norm": 0.3620118498802185, + "grad_norm_var": 0.15907744774636304, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.716952919960022, + "loss/hidden": 0.0, + "loss/logits": 0.17136194929480553, + "loss/reg": 2.9639716148376465, + "step": 389 + }, + { + "epoch": 0.0039, + "grad_norm": 0.3765539526939392, + "grad_norm_var": 0.1587921781193889, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.8395472168922424, + "loss/hidden": 0.0, + "loss/logits": 0.16893662884831429, + "loss/reg": 2.9617481231689453, + "step": 390 + }, + { + "epoch": 0.00391, + "grad_norm": 0.39779096841812134, + "grad_norm_var": 0.15815917569478716, + "learning_rate": 5e-05, + "loss": 0.1677, + "loss/crossentropy": 2.813215434551239, + "loss/hidden": 0.0, + "loss/logits": 0.1676802597939968, + "loss/reg": 2.958872079849243, + "step": 391 + }, + { + "epoch": 0.00392, + "grad_norm": 2.267273187637329, + "grad_norm_var": 0.35670344578828533, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.710484206676483, + "loss/hidden": 0.0, + "loss/logits": 0.18981827050447464, + "loss/reg": 2.9562907218933105, + "step": 392 + }, + { + "epoch": 0.00393, + "grad_norm": 0.4732659161090851, + "grad_norm_var": 0.35576926148027355, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.7463297247886658, + "loss/hidden": 0.0, + "loss/logits": 0.17890166491270065, + "loss/reg": 2.953505277633667, + "step": 393 + }, + { + "epoch": 0.00394, + "grad_norm": 0.46487849950790405, + "grad_norm_var": 0.3525539310677323, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.781617820262909, + "loss/hidden": 0.0, + "loss/logits": 0.17365656793117523, + "loss/reg": 2.951185464859009, + "step": 394 + }, + { + "epoch": 0.00395, + "grad_norm": 0.36613309383392334, + "grad_norm_var": 0.352175585204308, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.9521047472953796, + "loss/hidden": 0.0, + "loss/logits": 0.1749158501625061, + "loss/reg": 2.949521780014038, + "step": 395 + }, + { + "epoch": 0.00396, + "grad_norm": 0.33889761567115784, + "grad_norm_var": 0.35297777688562104, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.709399461746216, + "loss/hidden": 0.0, + "loss/logits": 0.161102045327425, + "loss/reg": 2.94758677482605, + "step": 396 + }, + { + "epoch": 0.00397, + "grad_norm": 0.37460586428642273, + "grad_norm_var": 0.3556567542520952, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.8318939208984375, + "loss/hidden": 0.0, + "loss/logits": 0.1709057316184044, + "loss/reg": 2.9449052810668945, + "step": 397 + }, + { + "epoch": 0.00398, + "grad_norm": 0.36912715435028076, + "grad_norm_var": 0.3556777153015602, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.7699413895606995, + "loss/hidden": 0.0, + "loss/logits": 0.16991987824440002, + "loss/reg": 2.9416375160217285, + "step": 398 + }, + { + "epoch": 0.00399, + "grad_norm": 0.4335935711860657, + "grad_norm_var": 0.35388598085202433, + "learning_rate": 5e-05, + "loss": 0.1621, + "loss/crossentropy": 2.6929262280464172, + "loss/hidden": 0.0, + "loss/logits": 0.16208457946777344, + "loss/reg": 2.9385571479797363, + "step": 399 + }, + { + "epoch": 0.004, + "grad_norm": 0.36466526985168457, + "grad_norm_var": 0.35251743192284957, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.9028329849243164, + "loss/hidden": 0.0, + "loss/logits": 0.16026458516716957, + "loss/reg": 2.935270071029663, + "step": 400 + }, + { + "epoch": 0.00401, + "grad_norm": 0.31859657168388367, + "grad_norm_var": 0.3542100800677372, + "learning_rate": 5e-05, + "loss": 0.155, + "loss/crossentropy": 2.7707905769348145, + "loss/hidden": 0.0, + "loss/logits": 0.15500668808817863, + "loss/reg": 2.9319961071014404, + "step": 401 + }, + { + "epoch": 0.00402, + "grad_norm": 0.39714375138282776, + "grad_norm_var": 0.35233479687143326, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.5947351455688477, + "loss/hidden": 0.0, + "loss/logits": 0.17224042490124702, + "loss/reg": 2.929413318634033, + "step": 402 + }, + { + "epoch": 0.00403, + "grad_norm": 2.267681121826172, + "grad_norm_var": 0.518261838886723, + "learning_rate": 5e-05, + "loss": 0.2313, + "loss/crossentropy": 2.703626811504364, + "loss/hidden": 0.0, + "loss/logits": 0.23128759488463402, + "loss/reg": 2.925968647003174, + "step": 403 + }, + { + "epoch": 0.00404, + "grad_norm": 0.38179340958595276, + "grad_norm_var": 0.414193396025083, + "learning_rate": 5e-05, + "loss": 0.1852, + "loss/crossentropy": 2.820598065853119, + "loss/hidden": 0.0, + "loss/logits": 0.18524771928787231, + "loss/reg": 2.9223124980926514, + "step": 404 + }, + { + "epoch": 0.00405, + "grad_norm": 0.35225149989128113, + "grad_norm_var": 0.4145378570625937, + "learning_rate": 5e-05, + "loss": 0.1672, + "loss/crossentropy": 2.761472165584564, + "loss/hidden": 0.0, + "loss/logits": 0.1672290712594986, + "loss/reg": 2.9187073707580566, + "step": 405 + }, + { + "epoch": 0.00406, + "grad_norm": 0.35987603664398193, + "grad_norm_var": 0.4150999685132215, + "learning_rate": 5e-05, + "loss": 0.1731, + "loss/crossentropy": 2.7906153202056885, + "loss/hidden": 0.0, + "loss/logits": 0.17313192784786224, + "loss/reg": 2.915867805480957, + "step": 406 + }, + { + "epoch": 0.00407, + "grad_norm": 0.36714085936546326, + "grad_norm_var": 0.416068714723823, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.7815486192703247, + "loss/hidden": 0.0, + "loss/logits": 0.17289156094193459, + "loss/reg": 2.9124350547790527, + "step": 407 + }, + { + "epoch": 0.00408, + "grad_norm": 0.43249061703681946, + "grad_norm_var": 0.22313248530400895, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.7781424522399902, + "loss/hidden": 0.0, + "loss/logits": 0.18864024803042412, + "loss/reg": 2.9094510078430176, + "step": 408 + }, + { + "epoch": 0.00409, + "grad_norm": 0.34418484568595886, + "grad_norm_var": 0.22470081409508588, + "learning_rate": 5e-05, + "loss": 0.1713, + "loss/crossentropy": 2.7818892002105713, + "loss/hidden": 0.0, + "loss/logits": 0.17131192237138748, + "loss/reg": 2.9064505100250244, + "step": 409 + }, + { + "epoch": 0.0041, + "grad_norm": 0.5792982578277588, + "grad_norm_var": 0.2250470715469535, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 2.8059155344963074, + "loss/hidden": 0.0, + "loss/logits": 0.1925133354961872, + "loss/reg": 2.9032788276672363, + "step": 410 + }, + { + "epoch": 0.00411, + "grad_norm": 0.35917767882347107, + "grad_norm_var": 0.22517699381034958, + "learning_rate": 5e-05, + "loss": 0.1597, + "loss/crossentropy": 2.9948400259017944, + "loss/hidden": 0.0, + "loss/logits": 0.1597156822681427, + "loss/reg": 2.900125741958618, + "step": 411 + }, + { + "epoch": 0.00412, + "grad_norm": 0.32936394214630127, + "grad_norm_var": 0.2253906803631866, + "learning_rate": 5e-05, + "loss": 0.16, + "loss/crossentropy": 2.8464353680610657, + "loss/hidden": 0.0, + "loss/logits": 0.15996254980564117, + "loss/reg": 2.897120952606201, + "step": 412 + }, + { + "epoch": 0.00413, + "grad_norm": 0.3636591136455536, + "grad_norm_var": 0.22558401797348096, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.5940242409706116, + "loss/hidden": 0.0, + "loss/logits": 0.18961919099092484, + "loss/reg": 2.8938040733337402, + "step": 413 + }, + { + "epoch": 0.00414, + "grad_norm": 0.3614409565925598, + "grad_norm_var": 0.2257231161008428, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.9938586950302124, + "loss/hidden": 0.0, + "loss/logits": 0.16688520461320877, + "loss/reg": 2.891237497329712, + "step": 414 + }, + { + "epoch": 0.00415, + "grad_norm": 0.33793795108795166, + "grad_norm_var": 0.2271517945765009, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.7566803693771362, + "loss/hidden": 0.0, + "loss/logits": 0.16617370769381523, + "loss/reg": 2.888444185256958, + "step": 415 + }, + { + "epoch": 0.00416, + "grad_norm": 0.33697640895843506, + "grad_norm_var": 0.22768012665927795, + "learning_rate": 5e-05, + "loss": 0.1609, + "loss/crossentropy": 2.7538956999778748, + "loss/hidden": 0.0, + "loss/logits": 0.1609276346862316, + "loss/reg": 2.8854165077209473, + "step": 416 + }, + { + "epoch": 0.00417, + "grad_norm": 0.33163169026374817, + "grad_norm_var": 0.22738752034767185, + "learning_rate": 5e-05, + "loss": 0.1686, + "loss/crossentropy": 2.716135025024414, + "loss/hidden": 0.0, + "loss/logits": 0.16858552396297455, + "loss/reg": 2.883502244949341, + "step": 417 + }, + { + "epoch": 0.00418, + "grad_norm": 0.3197973072528839, + "grad_norm_var": 0.22875903165210631, + "learning_rate": 5e-05, + "loss": 0.1622, + "loss/crossentropy": 2.8385114669799805, + "loss/hidden": 0.0, + "loss/logits": 0.16216163337230682, + "loss/reg": 2.8822600841522217, + "step": 418 + }, + { + "epoch": 0.00419, + "grad_norm": 0.3929068446159363, + "grad_norm_var": 0.0038269076397950408, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.7871418595314026, + "loss/hidden": 0.0, + "loss/logits": 0.17471741139888763, + "loss/reg": 2.8793692588806152, + "step": 419 + }, + { + "epoch": 0.0042, + "grad_norm": 0.3870161473751068, + "grad_norm_var": 0.0038355224442556198, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.7993595600128174, + "loss/hidden": 0.0, + "loss/logits": 0.1703585907816887, + "loss/reg": 2.8776159286499023, + "step": 420 + }, + { + "epoch": 0.00421, + "grad_norm": 0.35682201385498047, + "grad_norm_var": 0.0038246732894099337, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.655856966972351, + "loss/hidden": 0.0, + "loss/logits": 0.18859218060970306, + "loss/reg": 2.87612247467041, + "step": 421 + }, + { + "epoch": 0.00422, + "grad_norm": 0.33115604519844055, + "grad_norm_var": 0.003924500155300174, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.8695462942123413, + "loss/hidden": 0.0, + "loss/logits": 0.1611364483833313, + "loss/reg": 2.873897075653076, + "step": 422 + }, + { + "epoch": 0.00423, + "grad_norm": 0.4912989139556885, + "grad_norm_var": 0.004829238325957341, + "learning_rate": 5e-05, + "loss": 0.1784, + "loss/crossentropy": 2.7167177200317383, + "loss/hidden": 0.0, + "loss/logits": 0.17839327454566956, + "loss/reg": 2.8718714714050293, + "step": 423 + }, + { + "epoch": 0.00424, + "grad_norm": 0.3349898159503937, + "grad_norm_var": 0.004720821391959795, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.7473002076148987, + "loss/hidden": 0.0, + "loss/logits": 0.1615452691912651, + "loss/reg": 2.8705270290374756, + "step": 424 + }, + { + "epoch": 0.00425, + "grad_norm": 0.3930635154247284, + "grad_norm_var": 0.004686561363231038, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.785146713256836, + "loss/hidden": 0.0, + "loss/logits": 0.17748162522912025, + "loss/reg": 2.868584394454956, + "step": 425 + }, + { + "epoch": 0.00426, + "grad_norm": 0.3448260426521301, + "grad_norm_var": 0.0017484410160554464, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.726165235042572, + "loss/hidden": 0.0, + "loss/logits": 0.17576807364821434, + "loss/reg": 2.8673741817474365, + "step": 426 + }, + { + "epoch": 0.00427, + "grad_norm": 0.3846610188484192, + "grad_norm_var": 0.0017836724819081718, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.7086002230644226, + "loss/hidden": 0.0, + "loss/logits": 0.1762431263923645, + "loss/reg": 2.8651485443115234, + "step": 427 + }, + { + "epoch": 0.00428, + "grad_norm": 0.3494791090488434, + "grad_norm_var": 0.0017205006490997802, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.7305288314819336, + "loss/hidden": 0.0, + "loss/logits": 0.18181117624044418, + "loss/reg": 2.8629541397094727, + "step": 428 + }, + { + "epoch": 0.00429, + "grad_norm": 0.3337409794330597, + "grad_norm_var": 0.0017762239427149495, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.840167284011841, + "loss/hidden": 0.0, + "loss/logits": 0.1660769209265709, + "loss/reg": 2.8615217208862305, + "step": 429 + }, + { + "epoch": 0.0043, + "grad_norm": 0.4685284495353699, + "grad_norm_var": 0.0024887722894077887, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.7593295574188232, + "loss/hidden": 0.0, + "loss/logits": 0.1740834154188633, + "loss/reg": 2.860365629196167, + "step": 430 + }, + { + "epoch": 0.00431, + "grad_norm": 0.35651838779449463, + "grad_norm_var": 0.002434815976952542, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.777701735496521, + "loss/hidden": 0.0, + "loss/logits": 0.16725125908851624, + "loss/reg": 2.858222007751465, + "step": 431 + }, + { + "epoch": 0.00432, + "grad_norm": 0.34670454263687134, + "grad_norm_var": 0.0023984303943363817, + "learning_rate": 5e-05, + "loss": 0.165, + "loss/crossentropy": 2.749099850654602, + "loss/hidden": 0.0, + "loss/logits": 0.16504037007689476, + "loss/reg": 2.855973958969116, + "step": 432 + }, + { + "epoch": 0.00433, + "grad_norm": 0.3284713923931122, + "grad_norm_var": 0.0024153046998328874, + "learning_rate": 5e-05, + "loss": 0.1521, + "loss/crossentropy": 2.7869237661361694, + "loss/hidden": 0.0, + "loss/logits": 0.15208067372441292, + "loss/reg": 2.853942632675171, + "step": 433 + }, + { + "epoch": 0.00434, + "grad_norm": 0.48883649706840515, + "grad_norm_var": 0.0030697262784900037, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.9403671622276306, + "loss/hidden": 0.0, + "loss/logits": 0.17589127644896507, + "loss/reg": 2.851728916168213, + "step": 434 + }, + { + "epoch": 0.00435, + "grad_norm": 0.36951273679733276, + "grad_norm_var": 0.0030654307324534447, + "learning_rate": 5e-05, + "loss": 0.185, + "loss/crossentropy": 2.7797312140464783, + "loss/hidden": 0.0, + "loss/logits": 0.1850355602800846, + "loss/reg": 2.8488640785217285, + "step": 435 + }, + { + "epoch": 0.00436, + "grad_norm": 0.4184967577457428, + "grad_norm_var": 0.0031605906698184564, + "learning_rate": 5e-05, + "loss": 0.1855, + "loss/crossentropy": 2.812410533428192, + "loss/hidden": 0.0, + "loss/logits": 0.18553681299090385, + "loss/reg": 2.8465514183044434, + "step": 436 + }, + { + "epoch": 0.00437, + "grad_norm": 0.4329560101032257, + "grad_norm_var": 0.0032767273553133062, + "learning_rate": 5e-05, + "loss": 0.1784, + "loss/crossentropy": 2.840768814086914, + "loss/hidden": 0.0, + "loss/logits": 0.17843929678201675, + "loss/reg": 2.844315767288208, + "step": 437 + }, + { + "epoch": 0.00438, + "grad_norm": 0.6038658022880554, + "grad_norm_var": 0.005936964872234999, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.7183879017829895, + "loss/hidden": 0.0, + "loss/logits": 0.19491342082619667, + "loss/reg": 2.842548131942749, + "step": 438 + }, + { + "epoch": 0.00439, + "grad_norm": 0.4069391191005707, + "grad_norm_var": 0.005387125873613382, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.7780433297157288, + "loss/hidden": 0.0, + "loss/logits": 0.18749799579381943, + "loss/reg": 2.840106964111328, + "step": 439 + }, + { + "epoch": 0.0044, + "grad_norm": 0.35941290855407715, + "grad_norm_var": 0.005220523762257064, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.7595601081848145, + "loss/hidden": 0.0, + "loss/logits": 0.1638675332069397, + "loss/reg": 2.8378958702087402, + "step": 440 + }, + { + "epoch": 0.00441, + "grad_norm": 0.3669149875640869, + "grad_norm_var": 0.005284393934492052, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.673116147518158, + "loss/hidden": 0.0, + "loss/logits": 0.1713937260210514, + "loss/reg": 2.8351917266845703, + "step": 441 + }, + { + "epoch": 0.00442, + "grad_norm": 0.3643859922885895, + "grad_norm_var": 0.0051709546313713755, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.8026832342147827, + "loss/hidden": 0.0, + "loss/logits": 0.17951133847236633, + "loss/reg": 2.8323209285736084, + "step": 442 + }, + { + "epoch": 0.00443, + "grad_norm": 0.34250232577323914, + "grad_norm_var": 0.005361033629373261, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.7829501032829285, + "loss/hidden": 0.0, + "loss/logits": 0.17286691814661026, + "loss/reg": 2.8302509784698486, + "step": 443 + }, + { + "epoch": 0.00444, + "grad_norm": 0.3323063552379608, + "grad_norm_var": 0.005486165176397177, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.7025471329689026, + "loss/hidden": 0.0, + "loss/logits": 0.1684984639286995, + "loss/reg": 2.827455997467041, + "step": 444 + }, + { + "epoch": 0.00445, + "grad_norm": 0.35889074206352234, + "grad_norm_var": 0.005320257567319386, + "learning_rate": 5e-05, + "loss": 0.1872, + "loss/crossentropy": 2.6426368355751038, + "loss/hidden": 0.0, + "loss/logits": 0.18716050684452057, + "loss/reg": 2.8253841400146484, + "step": 445 + }, + { + "epoch": 0.00446, + "grad_norm": 0.39696604013442993, + "grad_norm_var": 0.004953801905317123, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.6918662786483765, + "loss/hidden": 0.0, + "loss/logits": 0.18442435935139656, + "loss/reg": 2.8228838443756104, + "step": 446 + }, + { + "epoch": 0.00447, + "grad_norm": 0.3320043981075287, + "grad_norm_var": 0.005107676487313561, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.7309769988059998, + "loss/hidden": 0.0, + "loss/logits": 0.16887113079428673, + "loss/reg": 2.8211417198181152, + "step": 447 + }, + { + "epoch": 0.00448, + "grad_norm": 0.3350951373577118, + "grad_norm_var": 0.0051840048892141, + "learning_rate": 5e-05, + "loss": 0.1736, + "loss/crossentropy": 2.7696202397346497, + "loss/hidden": 0.0, + "loss/logits": 0.17358100041747093, + "loss/reg": 2.818211793899536, + "step": 448 + }, + { + "epoch": 0.00449, + "grad_norm": 0.35995370149612427, + "grad_norm_var": 0.004988316730949372, + "learning_rate": 5e-05, + "loss": 0.185, + "loss/crossentropy": 2.7628800868988037, + "loss/hidden": 0.0, + "loss/logits": 0.1849852055311203, + "loss/reg": 2.8163435459136963, + "step": 449 + }, + { + "epoch": 0.0045, + "grad_norm": 0.3433259427547455, + "grad_norm_var": 0.004429295151525636, + "learning_rate": 5e-05, + "loss": 0.1776, + "loss/crossentropy": 2.92121821641922, + "loss/hidden": 0.0, + "loss/logits": 0.1775917150080204, + "loss/reg": 2.8140530586242676, + "step": 450 + }, + { + "epoch": 0.00451, + "grad_norm": 0.3525676131248474, + "grad_norm_var": 0.004477082320186199, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.8223352432250977, + "loss/hidden": 0.0, + "loss/logits": 0.1765166036784649, + "loss/reg": 2.8114964962005615, + "step": 451 + }, + { + "epoch": 0.00452, + "grad_norm": 0.3349536955356598, + "grad_norm_var": 0.004502986709870172, + "learning_rate": 5e-05, + "loss": 0.1617, + "loss/crossentropy": 2.58266818523407, + "loss/hidden": 0.0, + "loss/logits": 0.16168920323252678, + "loss/reg": 2.8082778453826904, + "step": 452 + }, + { + "epoch": 0.00453, + "grad_norm": 0.3272739350795746, + "grad_norm_var": 0.004404667304666754, + "learning_rate": 5e-05, + "loss": 0.1606, + "loss/crossentropy": 2.791221022605896, + "loss/hidden": 0.0, + "loss/logits": 0.16057174652814865, + "loss/reg": 2.8055834770202637, + "step": 453 + }, + { + "epoch": 0.00454, + "grad_norm": 0.35802412033081055, + "grad_norm_var": 0.0005107777789474354, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.856186628341675, + "loss/hidden": 0.0, + "loss/logits": 0.17580854520201683, + "loss/reg": 2.8030307292938232, + "step": 454 + }, + { + "epoch": 0.00455, + "grad_norm": 0.34605538845062256, + "grad_norm_var": 0.0003165176266941239, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.734806716442108, + "loss/hidden": 0.0, + "loss/logits": 0.16513444855809212, + "loss/reg": 2.7998156547546387, + "step": 455 + }, + { + "epoch": 0.00456, + "grad_norm": 0.35396888852119446, + "grad_norm_var": 0.0003120198180476634, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.8904529213905334, + "loss/hidden": 0.0, + "loss/logits": 0.17014532163739204, + "loss/reg": 2.796231508255005, + "step": 456 + }, + { + "epoch": 0.00457, + "grad_norm": 0.3613145649433136, + "grad_norm_var": 0.00030159148728288546, + "learning_rate": 5e-05, + "loss": 0.1835, + "loss/crossentropy": 2.813112735748291, + "loss/hidden": 0.0, + "loss/logits": 0.1834680140018463, + "loss/reg": 2.7929091453552246, + "step": 457 + }, + { + "epoch": 0.00458, + "grad_norm": 0.36372610926628113, + "grad_norm_var": 0.00030035069871777733, + "learning_rate": 5e-05, + "loss": 0.167, + "loss/crossentropy": 2.795239508152008, + "loss/hidden": 0.0, + "loss/logits": 0.1669648103415966, + "loss/reg": 2.789707899093628, + "step": 458 + }, + { + "epoch": 0.00459, + "grad_norm": 0.3581913113594055, + "grad_norm_var": 0.00030019062479412403, + "learning_rate": 5e-05, + "loss": 0.165, + "loss/crossentropy": 2.797567903995514, + "loss/hidden": 0.0, + "loss/logits": 0.16498373076319695, + "loss/reg": 2.786154270172119, + "step": 459 + }, + { + "epoch": 0.0046, + "grad_norm": 0.3571149408817291, + "grad_norm_var": 0.00027710791712463786, + "learning_rate": 5e-05, + "loss": 0.161, + "loss/crossentropy": 2.743883192539215, + "loss/hidden": 0.0, + "loss/logits": 0.16101711615920067, + "loss/reg": 2.7824792861938477, + "step": 460 + }, + { + "epoch": 0.00461, + "grad_norm": 0.356715589761734, + "grad_norm_var": 0.0002755397827386948, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.724743604660034, + "loss/hidden": 0.0, + "loss/logits": 0.1731928214430809, + "loss/reg": 2.778890371322632, + "step": 461 + }, + { + "epoch": 0.00462, + "grad_norm": 0.3243059813976288, + "grad_norm_var": 0.00017305590364662023, + "learning_rate": 5e-05, + "loss": 0.1592, + "loss/crossentropy": 2.7731017470359802, + "loss/hidden": 0.0, + "loss/logits": 0.15923070535063744, + "loss/reg": 2.775613784790039, + "step": 462 + }, + { + "epoch": 0.00463, + "grad_norm": 0.3843972980976105, + "grad_norm_var": 0.00023436686166613171, + "learning_rate": 5e-05, + "loss": 0.1776, + "loss/crossentropy": 2.6426811814308167, + "loss/hidden": 0.0, + "loss/logits": 0.17761223763227463, + "loss/reg": 2.772561550140381, + "step": 463 + }, + { + "epoch": 0.00464, + "grad_norm": 0.3468632102012634, + "grad_norm_var": 0.00021796986892265539, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.882816791534424, + "loss/hidden": 0.0, + "loss/logits": 0.17419364303350449, + "loss/reg": 2.769129991531372, + "step": 464 + }, + { + "epoch": 0.00465, + "grad_norm": 0.3967267870903015, + "grad_norm_var": 0.0003424789629975648, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.9460648894309998, + "loss/hidden": 0.0, + "loss/logits": 0.1784828118979931, + "loss/reg": 2.767021656036377, + "step": 465 + }, + { + "epoch": 0.00466, + "grad_norm": 0.36927327513694763, + "grad_norm_var": 0.0003472996962895862, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.7243736386299133, + "loss/hidden": 0.0, + "loss/logits": 0.17225344851613045, + "loss/reg": 2.7637596130371094, + "step": 466 + }, + { + "epoch": 0.00467, + "grad_norm": 0.6855224967002869, + "grad_norm_var": 0.007136168552583877, + "learning_rate": 5e-05, + "loss": 0.1725, + "loss/crossentropy": 2.9400131702423096, + "loss/hidden": 0.0, + "loss/logits": 0.17254997044801712, + "loss/reg": 2.7604963779449463, + "step": 467 + }, + { + "epoch": 0.00468, + "grad_norm": 0.41388872265815735, + "grad_norm_var": 0.007088047286249225, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.7375746369361877, + "loss/hidden": 0.0, + "loss/logits": 0.16082635894417763, + "loss/reg": 2.7572667598724365, + "step": 468 + }, + { + "epoch": 0.00469, + "grad_norm": 0.3997427225112915, + "grad_norm_var": 0.006892705403346755, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.8761582374572754, + "loss/hidden": 0.0, + "loss/logits": 0.17043552175164223, + "loss/reg": 2.754149913787842, + "step": 469 + }, + { + "epoch": 0.0047, + "grad_norm": 0.33424052596092224, + "grad_norm_var": 0.007016741295476203, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.8255309462547302, + "loss/hidden": 0.0, + "loss/logits": 0.16519855335354805, + "loss/reg": 2.751276731491089, + "step": 470 + }, + { + "epoch": 0.00471, + "grad_norm": 0.41415977478027344, + "grad_norm_var": 0.006957502567752493, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.676911950111389, + "loss/hidden": 0.0, + "loss/logits": 0.18318749964237213, + "loss/reg": 2.7486090660095215, + "step": 471 + }, + { + "epoch": 0.00472, + "grad_norm": 0.3688299357891083, + "grad_norm_var": 0.006902369057221236, + "learning_rate": 5e-05, + "loss": 0.162, + "loss/crossentropy": 2.6736281514167786, + "loss/hidden": 0.0, + "loss/logits": 0.16196409612894058, + "loss/reg": 2.745225667953491, + "step": 472 + }, + { + "epoch": 0.00473, + "grad_norm": 0.42543119192123413, + "grad_norm_var": 0.006916738926360373, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.7871673703193665, + "loss/hidden": 0.0, + "loss/logits": 0.16850638762116432, + "loss/reg": 2.7421023845672607, + "step": 473 + }, + { + "epoch": 0.00474, + "grad_norm": 0.3542870581150055, + "grad_norm_var": 0.006960025235757868, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.727401077747345, + "loss/hidden": 0.0, + "loss/logits": 0.1695132479071617, + "loss/reg": 2.738083839416504, + "step": 474 + }, + { + "epoch": 0.00475, + "grad_norm": 0.42508408427238464, + "grad_norm_var": 0.006928287935252916, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.756391167640686, + "loss/hidden": 0.0, + "loss/logits": 0.19667796790599823, + "loss/reg": 2.734565496444702, + "step": 475 + }, + { + "epoch": 0.00476, + "grad_norm": 0.3327108919620514, + "grad_norm_var": 0.007096223362361916, + "learning_rate": 5e-05, + "loss": 0.1698, + "loss/crossentropy": 2.662286937236786, + "loss/hidden": 0.0, + "loss/logits": 0.16976173967123032, + "loss/reg": 2.730898141860962, + "step": 476 + }, + { + "epoch": 0.00477, + "grad_norm": 0.3538263142108917, + "grad_norm_var": 0.007111786918880665, + "learning_rate": 5e-05, + "loss": 0.1728, + "loss/crossentropy": 2.6600981950759888, + "loss/hidden": 0.0, + "loss/logits": 0.17283405736088753, + "loss/reg": 2.7276227474212646, + "step": 477 + }, + { + "epoch": 0.00478, + "grad_norm": 0.6719810962677002, + "grad_norm_var": 0.011362604241119423, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.9192944169044495, + "loss/hidden": 0.0, + "loss/logits": 0.1955549158155918, + "loss/reg": 2.7246785163879395, + "step": 478 + }, + { + "epoch": 0.00479, + "grad_norm": 0.40175002813339233, + "grad_norm_var": 0.011305273259017534, + "learning_rate": 5e-05, + "loss": 0.1707, + "loss/crossentropy": 2.8099315762519836, + "loss/hidden": 0.0, + "loss/logits": 0.1706707924604416, + "loss/reg": 2.721214771270752, + "step": 479 + }, + { + "epoch": 0.0048, + "grad_norm": 0.5700014233589172, + "grad_norm_var": 0.012288996380569357, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.764845371246338, + "loss/hidden": 0.0, + "loss/logits": 0.17503220587968826, + "loss/reg": 2.71852970123291, + "step": 480 + }, + { + "epoch": 0.00481, + "grad_norm": 0.3602856993675232, + "grad_norm_var": 0.012545036289332723, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.817295730113983, + "loss/hidden": 0.0, + "loss/logits": 0.16745564714074135, + "loss/reg": 2.7155778408050537, + "step": 481 + }, + { + "epoch": 0.00482, + "grad_norm": 0.37470605969429016, + "grad_norm_var": 0.012502846327791594, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7710434794425964, + "loss/hidden": 0.0, + "loss/logits": 0.172159094363451, + "loss/reg": 2.7133727073669434, + "step": 482 + }, + { + "epoch": 0.00483, + "grad_norm": 0.319488525390625, + "grad_norm_var": 0.008425663660975724, + "learning_rate": 5e-05, + "loss": 0.1524, + "loss/crossentropy": 2.783412456512451, + "loss/hidden": 0.0, + "loss/logits": 0.15241163223981857, + "loss/reg": 2.710848331451416, + "step": 483 + }, + { + "epoch": 0.00484, + "grad_norm": 0.3474343419075012, + "grad_norm_var": 0.008645296689366684, + "learning_rate": 5e-05, + "loss": 0.1582, + "loss/crossentropy": 2.8712441325187683, + "loss/hidden": 0.0, + "loss/logits": 0.1582440249621868, + "loss/reg": 2.708759307861328, + "step": 484 + }, + { + "epoch": 0.00485, + "grad_norm": 0.3881974518299103, + "grad_norm_var": 0.008659215056144554, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.7801290154457092, + "loss/hidden": 0.0, + "loss/logits": 0.16804108768701553, + "loss/reg": 2.707090139389038, + "step": 485 + }, + { + "epoch": 0.00486, + "grad_norm": 0.3865320086479187, + "grad_norm_var": 0.008353144350497502, + "learning_rate": 5e-05, + "loss": 0.1622, + "loss/crossentropy": 2.762259840965271, + "loss/hidden": 0.0, + "loss/logits": 0.1622123382985592, + "loss/reg": 2.704341173171997, + "step": 486 + }, + { + "epoch": 0.00487, + "grad_norm": 0.3601287007331848, + "grad_norm_var": 0.008476237288047564, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.7640222311019897, + "loss/hidden": 0.0, + "loss/logits": 0.1716899275779724, + "loss/reg": 2.702449321746826, + "step": 487 + }, + { + "epoch": 0.00488, + "grad_norm": 0.3476349711418152, + "grad_norm_var": 0.008599584577097493, + "learning_rate": 5e-05, + "loss": 0.1771, + "loss/crossentropy": 2.8057321906089783, + "loss/hidden": 0.0, + "loss/logits": 0.1770944595336914, + "loss/reg": 2.6996164321899414, + "step": 488 + }, + { + "epoch": 0.00489, + "grad_norm": 0.35880380868911743, + "grad_norm_var": 0.008661929013581293, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.7546836137771606, + "loss/hidden": 0.0, + "loss/logits": 0.17589908093214035, + "loss/reg": 2.69681978225708, + "step": 489 + }, + { + "epoch": 0.0049, + "grad_norm": 0.3216891586780548, + "grad_norm_var": 0.008914221483014847, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.755846858024597, + "loss/hidden": 0.0, + "loss/logits": 0.16277150437235832, + "loss/reg": 2.6948180198669434, + "step": 490 + }, + { + "epoch": 0.00491, + "grad_norm": 0.3739294409751892, + "grad_norm_var": 0.008872687766587003, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.758453607559204, + "loss/hidden": 0.0, + "loss/logits": 0.173715490847826, + "loss/reg": 2.692713737487793, + "step": 491 + }, + { + "epoch": 0.00492, + "grad_norm": 0.3633546531200409, + "grad_norm_var": 0.008689872848313067, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.756626844406128, + "loss/hidden": 0.0, + "loss/logits": 0.17549088224768639, + "loss/reg": 2.6911702156066895, + "step": 492 + }, + { + "epoch": 0.00493, + "grad_norm": 0.4165309965610504, + "grad_norm_var": 0.00860196217059566, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.69485205411911, + "loss/hidden": 0.0, + "loss/logits": 0.18822569772601128, + "loss/reg": 2.6890034675598145, + "step": 493 + }, + { + "epoch": 0.00494, + "grad_norm": 0.34585461020469666, + "grad_norm_var": 0.0033206140596304815, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.73829984664917, + "loss/hidden": 0.0, + "loss/logits": 0.17416464537382126, + "loss/reg": 2.687713623046875, + "step": 494 + }, + { + "epoch": 0.00495, + "grad_norm": 0.3443280756473541, + "grad_norm_var": 0.003339269529387142, + "learning_rate": 5e-05, + "loss": 0.1613, + "loss/crossentropy": 2.745963931083679, + "loss/hidden": 0.0, + "loss/logits": 0.161319550126791, + "loss/reg": 2.685638189315796, + "step": 495 + }, + { + "epoch": 0.00496, + "grad_norm": 0.3689098656177521, + "grad_norm_var": 0.000602855553897171, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.6744813919067383, + "loss/hidden": 0.0, + "loss/logits": 0.17346932739019394, + "loss/reg": 2.6839358806610107, + "step": 496 + }, + { + "epoch": 0.00497, + "grad_norm": 0.36457374691963196, + "grad_norm_var": 0.0006035317496342747, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.799642562866211, + "loss/hidden": 0.0, + "loss/logits": 0.17352834343910217, + "loss/reg": 2.683485507965088, + "step": 497 + }, + { + "epoch": 0.00498, + "grad_norm": 0.35222312808036804, + "grad_norm_var": 0.0005951796117876367, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.739416480064392, + "loss/hidden": 0.0, + "loss/logits": 0.1742345169186592, + "loss/reg": 2.6830482482910156, + "step": 498 + }, + { + "epoch": 0.00499, + "grad_norm": 0.3427422344684601, + "grad_norm_var": 0.0005034448418147264, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.772252082824707, + "loss/hidden": 0.0, + "loss/logits": 0.16728588938713074, + "loss/reg": 2.682189464569092, + "step": 499 + }, + { + "epoch": 0.005, + "grad_norm": 0.39299577474594116, + "grad_norm_var": 0.0005481683329227494, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.7702752351760864, + "loss/hidden": 0.0, + "loss/logits": 0.1926349699497223, + "loss/reg": 2.6808254718780518, + "step": 500 + }, + { + "epoch": 0.00501, + "grad_norm": 0.3431949019432068, + "grad_norm_var": 0.0005312130675710792, + "learning_rate": 5e-05, + "loss": 0.1634, + "loss/crossentropy": 2.7881234288215637, + "loss/hidden": 0.0, + "loss/logits": 0.1633942425251007, + "loss/reg": 2.6798629760742188, + "step": 501 + }, + { + "epoch": 0.00502, + "grad_norm": 0.36641839146614075, + "grad_norm_var": 0.0004892704880637311, + "learning_rate": 5e-05, + "loss": 0.1787, + "loss/crossentropy": 2.848407030105591, + "loss/hidden": 0.0, + "loss/logits": 0.17872987315058708, + "loss/reg": 2.677311658859253, + "step": 502 + }, + { + "epoch": 0.00503, + "grad_norm": 0.3278079330921173, + "grad_norm_var": 0.000554897538649816, + "learning_rate": 5e-05, + "loss": 0.1587, + "loss/crossentropy": 2.75662362575531, + "loss/hidden": 0.0, + "loss/logits": 0.15868044644594193, + "loss/reg": 2.675185203552246, + "step": 503 + }, + { + "epoch": 0.00504, + "grad_norm": 0.3251039683818817, + "grad_norm_var": 0.0006183250665441046, + "learning_rate": 5e-05, + "loss": 0.1551, + "loss/crossentropy": 2.731416165828705, + "loss/hidden": 0.0, + "loss/logits": 0.15506618097424507, + "loss/reg": 2.673948287963867, + "step": 504 + }, + { + "epoch": 0.00505, + "grad_norm": 0.35344070196151733, + "grad_norm_var": 0.0006186746986458047, + "learning_rate": 5e-05, + "loss": 0.167, + "loss/crossentropy": 2.740668296813965, + "loss/hidden": 0.0, + "loss/logits": 0.16695522889494896, + "loss/reg": 2.6712498664855957, + "step": 505 + }, + { + "epoch": 0.00506, + "grad_norm": 0.36658284068107605, + "grad_norm_var": 0.0005366058949143918, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.802608013153076, + "loss/hidden": 0.0, + "loss/logits": 0.16682763025164604, + "loss/reg": 2.669286012649536, + "step": 506 + }, + { + "epoch": 0.00507, + "grad_norm": 0.4423954486846924, + "grad_norm_var": 0.000963591213409624, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.811932861804962, + "loss/hidden": 0.0, + "loss/logits": 0.19632378965616226, + "loss/reg": 2.666898012161255, + "step": 507 + }, + { + "epoch": 0.00508, + "grad_norm": 0.3770610988140106, + "grad_norm_var": 0.000975015024308116, + "learning_rate": 5e-05, + "loss": 0.1753, + "loss/crossentropy": 2.7279282808303833, + "loss/hidden": 0.0, + "loss/logits": 0.17527905479073524, + "loss/reg": 2.664764881134033, + "step": 508 + }, + { + "epoch": 0.00509, + "grad_norm": 0.35589146614074707, + "grad_norm_var": 0.0007832244440521922, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.8977367281913757, + "loss/hidden": 0.0, + "loss/logits": 0.16798892244696617, + "loss/reg": 2.6622438430786133, + "step": 509 + }, + { + "epoch": 0.0051, + "grad_norm": 0.3419097661972046, + "grad_norm_var": 0.0007919503322765919, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.6906025409698486, + "loss/hidden": 0.0, + "loss/logits": 0.172856405377388, + "loss/reg": 2.6595211029052734, + "step": 510 + }, + { + "epoch": 0.00511, + "grad_norm": 0.3972381353378296, + "grad_norm_var": 0.0008538971282195384, + "learning_rate": 5e-05, + "loss": 0.1825, + "loss/crossentropy": 2.762513279914856, + "loss/hidden": 0.0, + "loss/logits": 0.18245646730065346, + "loss/reg": 2.6572320461273193, + "step": 511 + }, + { + "epoch": 0.00512, + "grad_norm": 0.3489353358745575, + "grad_norm_var": 0.0008648399289393501, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.7472071647644043, + "loss/hidden": 0.0, + "loss/logits": 0.1789936050772667, + "loss/reg": 2.6544342041015625, + "step": 512 + }, + { + "epoch": 0.00513, + "grad_norm": 0.3673308491706848, + "grad_norm_var": 0.0008661114894439326, + "learning_rate": 5e-05, + "loss": 0.1745, + "loss/crossentropy": 2.69700163602829, + "loss/hidden": 0.0, + "loss/logits": 0.1745261810719967, + "loss/reg": 2.6520497798919678, + "step": 513 + }, + { + "epoch": 0.00514, + "grad_norm": 0.33870744705200195, + "grad_norm_var": 0.0008961917113334199, + "learning_rate": 5e-05, + "loss": 0.1649, + "loss/crossentropy": 2.762860357761383, + "loss/hidden": 0.0, + "loss/logits": 0.16494135558605194, + "loss/reg": 2.6500847339630127, + "step": 514 + }, + { + "epoch": 0.00515, + "grad_norm": 0.40411266684532166, + "grad_norm_var": 0.0009761766654230563, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 3.01085501909256, + "loss/hidden": 0.0, + "loss/logits": 0.16284478455781937, + "loss/reg": 2.648311138153076, + "step": 515 + }, + { + "epoch": 0.00516, + "grad_norm": 0.37194308638572693, + "grad_norm_var": 0.0009268939874421604, + "learning_rate": 5e-05, + "loss": 0.182, + "loss/crossentropy": 2.721080005168915, + "loss/hidden": 0.0, + "loss/logits": 0.18202906847000122, + "loss/reg": 2.6469640731811523, + "step": 516 + }, + { + "epoch": 0.00517, + "grad_norm": 0.3380615711212158, + "grad_norm_var": 0.0009429551352979477, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.6788495779037476, + "loss/hidden": 0.0, + "loss/logits": 0.16385124996304512, + "loss/reg": 2.6445441246032715, + "step": 517 + }, + { + "epoch": 0.00518, + "grad_norm": 0.37696361541748047, + "grad_norm_var": 0.0009533986625055632, + "learning_rate": 5e-05, + "loss": 0.1587, + "loss/crossentropy": 2.6845511198043823, + "loss/hidden": 0.0, + "loss/logits": 0.1586880125105381, + "loss/reg": 2.6424736976623535, + "step": 518 + }, + { + "epoch": 0.00519, + "grad_norm": 0.32983675599098206, + "grad_norm_var": 0.0009437052369864992, + "learning_rate": 5e-05, + "loss": 0.1585, + "loss/crossentropy": 2.5984672904014587, + "loss/hidden": 0.0, + "loss/logits": 0.15849433466792107, + "loss/reg": 2.639796257019043, + "step": 519 + }, + { + "epoch": 0.0052, + "grad_norm": 0.3439983129501343, + "grad_norm_var": 0.000866215802107521, + "learning_rate": 5e-05, + "loss": 0.1578, + "loss/crossentropy": 2.7057528495788574, + "loss/hidden": 0.0, + "loss/logits": 0.15775253251194954, + "loss/reg": 2.636976480484009, + "step": 520 + }, + { + "epoch": 0.00521, + "grad_norm": 0.4739494323730469, + "grad_norm_var": 0.0015736599047053415, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.6239394545555115, + "loss/hidden": 0.0, + "loss/logits": 0.17801255360245705, + "loss/reg": 2.6342852115631104, + "step": 521 + }, + { + "epoch": 0.00522, + "grad_norm": 0.5270029306411743, + "grad_norm_var": 0.003035565907296726, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.890467643737793, + "loss/hidden": 0.0, + "loss/logits": 0.18007512018084526, + "loss/reg": 2.631289005279541, + "step": 522 + }, + { + "epoch": 0.00523, + "grad_norm": 0.42719003558158875, + "grad_norm_var": 0.002930528350278516, + "learning_rate": 5e-05, + "loss": 0.1781, + "loss/crossentropy": 2.9749565720558167, + "loss/hidden": 0.0, + "loss/logits": 0.17812742665410042, + "loss/reg": 2.6284420490264893, + "step": 523 + }, + { + "epoch": 0.00524, + "grad_norm": 0.37133005261421204, + "grad_norm_var": 0.0029367435634455913, + "learning_rate": 5e-05, + "loss": 0.1597, + "loss/crossentropy": 2.692670702934265, + "loss/hidden": 0.0, + "loss/logits": 0.1597190946340561, + "loss/reg": 2.6251702308654785, + "step": 524 + }, + { + "epoch": 0.00525, + "grad_norm": 0.3646347224712372, + "grad_norm_var": 0.0029109098946428253, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.8696910738945007, + "loss/hidden": 0.0, + "loss/logits": 0.16764900088310242, + "loss/reg": 2.621973991394043, + "step": 525 + }, + { + "epoch": 0.00526, + "grad_norm": 0.3347557485103607, + "grad_norm_var": 0.002953013887398237, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.844240427017212, + "loss/hidden": 0.0, + "loss/logits": 0.16587505862116814, + "loss/reg": 2.6180617809295654, + "step": 526 + }, + { + "epoch": 0.00527, + "grad_norm": 0.3301764726638794, + "grad_norm_var": 0.003100070614909223, + "learning_rate": 5e-05, + "loss": 0.1554, + "loss/crossentropy": 2.822225272655487, + "loss/hidden": 0.0, + "loss/logits": 0.15541274286806583, + "loss/reg": 2.6156363487243652, + "step": 527 + }, + { + "epoch": 0.00528, + "grad_norm": 0.3668423593044281, + "grad_norm_var": 0.003050578439524883, + "learning_rate": 5e-05, + "loss": 0.1725, + "loss/crossentropy": 2.873881459236145, + "loss/hidden": 0.0, + "loss/logits": 0.17249644920229912, + "loss/reg": 2.6129000186920166, + "step": 528 + }, + { + "epoch": 0.00529, + "grad_norm": 0.33062636852264404, + "grad_norm_var": 0.0031927551041592986, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.7202290296554565, + "loss/hidden": 0.0, + "loss/logits": 0.16901781037449837, + "loss/reg": 2.6098320484161377, + "step": 529 + }, + { + "epoch": 0.0053, + "grad_norm": 0.33170488476753235, + "grad_norm_var": 0.003231463613689256, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.7543463706970215, + "loss/hidden": 0.0, + "loss/logits": 0.17077547311782837, + "loss/reg": 2.606674909591675, + "step": 530 + }, + { + "epoch": 0.00531, + "grad_norm": 0.3436318337917328, + "grad_norm_var": 0.0032369737172315838, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.6231788992881775, + "loss/hidden": 0.0, + "loss/logits": 0.1821577101945877, + "loss/reg": 2.603997230529785, + "step": 531 + }, + { + "epoch": 0.00532, + "grad_norm": 0.33105242252349854, + "grad_norm_var": 0.0033454153420392264, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.819184124469757, + "loss/hidden": 0.0, + "loss/logits": 0.16607840731739998, + "loss/reg": 2.6012203693389893, + "step": 532 + }, + { + "epoch": 0.00533, + "grad_norm": 0.3485148847103119, + "grad_norm_var": 0.0033075767398377588, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.8594303727149963, + "loss/hidden": 0.0, + "loss/logits": 0.1676221825182438, + "loss/reg": 2.5986573696136475, + "step": 533 + }, + { + "epoch": 0.00534, + "grad_norm": 0.3541623651981354, + "grad_norm_var": 0.003321219936842216, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.7382256984710693, + "loss/hidden": 0.0, + "loss/logits": 0.17420669272542, + "loss/reg": 2.5956368446350098, + "step": 534 + }, + { + "epoch": 0.00535, + "grad_norm": 0.362183541059494, + "grad_norm_var": 0.003216249066768325, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.820302128791809, + "loss/hidden": 0.0, + "loss/logits": 0.1731831431388855, + "loss/reg": 2.591860294342041, + "step": 535 + }, + { + "epoch": 0.00536, + "grad_norm": 0.340348482131958, + "grad_norm_var": 0.0032303969391706505, + "learning_rate": 5e-05, + "loss": 0.1677, + "loss/crossentropy": 2.978896915912628, + "loss/hidden": 0.0, + "loss/logits": 0.16773569583892822, + "loss/reg": 2.5879762172698975, + "step": 536 + }, + { + "epoch": 0.00537, + "grad_norm": 0.359326034784317, + "grad_norm_var": 0.002480178301492671, + "learning_rate": 5e-05, + "loss": 0.1767, + "loss/crossentropy": 2.7645240426063538, + "loss/hidden": 0.0, + "loss/logits": 0.17668773606419563, + "loss/reg": 2.5847809314727783, + "step": 537 + }, + { + "epoch": 0.00538, + "grad_norm": 0.3420480489730835, + "grad_norm_var": 0.0005976425682412671, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.7278724908828735, + "loss/hidden": 0.0, + "loss/logits": 0.17576001212000847, + "loss/reg": 2.581143379211426, + "step": 538 + }, + { + "epoch": 0.00539, + "grad_norm": 0.33362701535224915, + "grad_norm_var": 0.00021185911019383125, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.6828721165657043, + "loss/hidden": 0.0, + "loss/logits": 0.1707863062620163, + "loss/reg": 2.5769548416137695, + "step": 539 + }, + { + "epoch": 0.0054, + "grad_norm": 0.6795082092285156, + "grad_norm_var": 0.007165518560383773, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.7977577447891235, + "loss/hidden": 0.0, + "loss/logits": 0.20013980567455292, + "loss/reg": 2.5742313861846924, + "step": 540 + }, + { + "epoch": 0.00541, + "grad_norm": 0.35366278886795044, + "grad_norm_var": 0.007174778628811747, + "learning_rate": 5e-05, + "loss": 0.1703, + "loss/crossentropy": 2.859143853187561, + "loss/hidden": 0.0, + "loss/logits": 0.17025134339928627, + "loss/reg": 2.57037353515625, + "step": 541 + }, + { + "epoch": 0.00542, + "grad_norm": 0.3655552566051483, + "grad_norm_var": 0.007109308326582827, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.8502614498138428, + "loss/hidden": 0.0, + "loss/logits": 0.1775321438908577, + "loss/reg": 2.566716432571411, + "step": 542 + }, + { + "epoch": 0.00543, + "grad_norm": 0.3574732542037964, + "grad_norm_var": 0.007021635262450318, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.892129361629486, + "loss/hidden": 0.0, + "loss/logits": 0.17345865443348885, + "loss/reg": 2.563842296600342, + "step": 543 + }, + { + "epoch": 0.00544, + "grad_norm": 0.36598220467567444, + "grad_norm_var": 0.007021902205424502, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.7138225436210632, + "loss/hidden": 0.0, + "loss/logits": 0.17211398482322693, + "loss/reg": 2.5608513355255127, + "step": 544 + }, + { + "epoch": 0.00545, + "grad_norm": 0.35922348499298096, + "grad_norm_var": 0.0069277921155704155, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.8138818740844727, + "loss/hidden": 0.0, + "loss/logits": 0.16949571669101715, + "loss/reg": 2.557931423187256, + "step": 545 + }, + { + "epoch": 0.00546, + "grad_norm": 0.3538724184036255, + "grad_norm_var": 0.006843838113958241, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7698569893836975, + "loss/hidden": 0.0, + "loss/logits": 0.17218982055783272, + "loss/reg": 2.5551669597625732, + "step": 546 + }, + { + "epoch": 0.00547, + "grad_norm": 0.38070008158683777, + "grad_norm_var": 0.006790073386325786, + "learning_rate": 5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.631078600883484, + "loss/hidden": 0.0, + "loss/logits": 0.1842627413570881, + "loss/reg": 2.5517876148223877, + "step": 547 + }, + { + "epoch": 0.00548, + "grad_norm": 0.35319533944129944, + "grad_norm_var": 0.006693321782661003, + "learning_rate": 5e-05, + "loss": 0.16, + "loss/crossentropy": 2.850399076938629, + "loss/hidden": 0.0, + "loss/logits": 0.15995023399591446, + "loss/reg": 2.548754930496216, + "step": 548 + }, + { + "epoch": 0.00549, + "grad_norm": 0.4596186578273773, + "grad_norm_var": 0.0070637908733671, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.868459641933441, + "loss/hidden": 0.0, + "loss/logits": 0.16395244374871254, + "loss/reg": 2.5462560653686523, + "step": 549 + }, + { + "epoch": 0.0055, + "grad_norm": 0.3474785387516022, + "grad_norm_var": 0.007091863949161529, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.6114882230758667, + "loss/hidden": 0.0, + "loss/logits": 0.1641043722629547, + "loss/reg": 2.5431270599365234, + "step": 550 + }, + { + "epoch": 0.00551, + "grad_norm": 0.3570033013820648, + "grad_norm_var": 0.007107306178779664, + "learning_rate": 5e-05, + "loss": 0.1579, + "loss/crossentropy": 2.8220438957214355, + "loss/hidden": 0.0, + "loss/logits": 0.15791887789964676, + "loss/reg": 2.539910078048706, + "step": 551 + }, + { + "epoch": 0.00552, + "grad_norm": 0.32915255427360535, + "grad_norm_var": 0.0071770024029156184, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.878856658935547, + "loss/hidden": 0.0, + "loss/logits": 0.16122740507125854, + "loss/reg": 2.5368034839630127, + "step": 552 + }, + { + "epoch": 0.00553, + "grad_norm": 0.3565903604030609, + "grad_norm_var": 0.007185408405122592, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.705552637577057, + "loss/hidden": 0.0, + "loss/logits": 0.16892266646027565, + "loss/reg": 2.5333409309387207, + "step": 553 + }, + { + "epoch": 0.00554, + "grad_norm": 0.31767770648002625, + "grad_norm_var": 0.0073488319211029345, + "learning_rate": 5e-05, + "loss": 0.1579, + "loss/crossentropy": 2.685009717941284, + "loss/hidden": 0.0, + "loss/logits": 0.15788856148719788, + "loss/reg": 2.5296521186828613, + "step": 554 + }, + { + "epoch": 0.00555, + "grad_norm": 0.35047340393066406, + "grad_norm_var": 0.0072637659398345844, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.5745012760162354, + "loss/hidden": 0.0, + "loss/logits": 0.17458590865135193, + "loss/reg": 2.5270395278930664, + "step": 555 + }, + { + "epoch": 0.00556, + "grad_norm": 0.3832140266895294, + "grad_norm_var": 0.0009360149891549837, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.8551809787750244, + "loss/hidden": 0.0, + "loss/logits": 0.16756092011928558, + "loss/reg": 2.523982048034668, + "step": 556 + }, + { + "epoch": 0.00557, + "grad_norm": 0.4020329713821411, + "grad_norm_var": 0.0010289291164416311, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.787672698497772, + "loss/hidden": 0.0, + "loss/logits": 0.17547398060560226, + "loss/reg": 2.520615816116333, + "step": 557 + }, + { + "epoch": 0.00558, + "grad_norm": 0.38412049412727356, + "grad_norm_var": 0.0010519623608851428, + "learning_rate": 5e-05, + "loss": 0.1815, + "loss/crossentropy": 2.846573293209076, + "loss/hidden": 0.0, + "loss/logits": 0.1815263032913208, + "loss/reg": 2.518004894256592, + "step": 558 + }, + { + "epoch": 0.00559, + "grad_norm": 0.3456071615219116, + "grad_norm_var": 0.0010744320361522322, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.922893524169922, + "loss/hidden": 0.0, + "loss/logits": 0.17459525167942047, + "loss/reg": 2.5153868198394775, + "step": 559 + }, + { + "epoch": 0.0056, + "grad_norm": 0.36563733220100403, + "grad_norm_var": 0.0010744113839659304, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.7154372334480286, + "loss/hidden": 0.0, + "loss/logits": 0.16903281211853027, + "loss/reg": 2.5126099586486816, + "step": 560 + }, + { + "epoch": 0.00561, + "grad_norm": 0.3387238085269928, + "grad_norm_var": 0.00111742135319511, + "learning_rate": 5e-05, + "loss": 0.1666, + "loss/crossentropy": 2.5947054624557495, + "loss/hidden": 0.0, + "loss/logits": 0.16662058234214783, + "loss/reg": 2.509439706802368, + "step": 561 + }, + { + "epoch": 0.00562, + "grad_norm": 0.45574790239334106, + "grad_norm_var": 0.0016275854789366514, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.961915969848633, + "loss/hidden": 0.0, + "loss/logits": 0.1813669353723526, + "loss/reg": 2.5074241161346436, + "step": 562 + }, + { + "epoch": 0.00563, + "grad_norm": 0.39113175868988037, + "grad_norm_var": 0.0016486631382784092, + "learning_rate": 5e-05, + "loss": 0.1643, + "loss/crossentropy": 2.7337673902511597, + "loss/hidden": 0.0, + "loss/logits": 0.164301548153162, + "loss/reg": 2.5046684741973877, + "step": 563 + }, + { + "epoch": 0.00564, + "grad_norm": 0.36300358176231384, + "grad_norm_var": 0.0016312765518430934, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.713749051094055, + "loss/hidden": 0.0, + "loss/logits": 0.16021040827035904, + "loss/reg": 2.5020864009857178, + "step": 564 + }, + { + "epoch": 0.00565, + "grad_norm": 0.3250221312046051, + "grad_norm_var": 0.001185749693630452, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.739534556865692, + "loss/hidden": 0.0, + "loss/logits": 0.166114691644907, + "loss/reg": 2.500089645385742, + "step": 565 + }, + { + "epoch": 0.00566, + "grad_norm": 0.3059675395488739, + "grad_norm_var": 0.0013809527139825861, + "learning_rate": 5e-05, + "loss": 0.1528, + "loss/crossentropy": 2.7676697373390198, + "loss/hidden": 0.0, + "loss/logits": 0.15275665000081062, + "loss/reg": 2.497802257537842, + "step": 566 + }, + { + "epoch": 0.00567, + "grad_norm": 0.41637444496154785, + "grad_norm_var": 0.0015720438674995396, + "learning_rate": 5e-05, + "loss": 0.1899, + "loss/crossentropy": 2.7852693796157837, + "loss/hidden": 0.0, + "loss/logits": 0.18987080082297325, + "loss/reg": 2.4960269927978516, + "step": 567 + }, + { + "epoch": 0.00568, + "grad_norm": 0.48216167092323303, + "grad_norm_var": 0.002316091582714641, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.919625759124756, + "loss/hidden": 0.0, + "loss/logits": 0.17901213094592094, + "loss/reg": 2.4943020343780518, + "step": 568 + }, + { + "epoch": 0.00569, + "grad_norm": 0.34773337841033936, + "grad_norm_var": 0.0023415161321106623, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.829575002193451, + "loss/hidden": 0.0, + "loss/logits": 0.1688704527914524, + "loss/reg": 2.4922549724578857, + "step": 569 + }, + { + "epoch": 0.0057, + "grad_norm": 0.42466020584106445, + "grad_norm_var": 0.0022617987789910494, + "learning_rate": 5e-05, + "loss": 0.2065, + "loss/crossentropy": 2.847673773765564, + "loss/hidden": 0.0, + "loss/logits": 0.20647098124027252, + "loss/reg": 2.4896240234375, + "step": 570 + }, + { + "epoch": 0.00571, + "grad_norm": 0.39025840163230896, + "grad_norm_var": 0.0022035635328787567, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.9154597520828247, + "loss/hidden": 0.0, + "loss/logits": 0.1810290329158306, + "loss/reg": 2.487513542175293, + "step": 571 + }, + { + "epoch": 0.00572, + "grad_norm": 0.3611275851726532, + "grad_norm_var": 0.002232206094215346, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.813008964061737, + "loss/hidden": 0.0, + "loss/logits": 0.16871189698576927, + "loss/reg": 2.4849367141723633, + "step": 572 + }, + { + "epoch": 0.00573, + "grad_norm": 0.37163245677948, + "grad_norm_var": 0.002205551603401897, + "learning_rate": 5e-05, + "loss": 0.1736, + "loss/crossentropy": 2.829798102378845, + "loss/hidden": 0.0, + "loss/logits": 0.173641849309206, + "loss/reg": 2.481811046600342, + "step": 573 + }, + { + "epoch": 0.00574, + "grad_norm": 0.37662971019744873, + "grad_norm_var": 0.002204250880404842, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.786403477191925, + "loss/hidden": 0.0, + "loss/logits": 0.16413037478923798, + "loss/reg": 2.479344606399536, + "step": 574 + }, + { + "epoch": 0.00575, + "grad_norm": 0.4090428948402405, + "grad_norm_var": 0.002174681113915019, + "learning_rate": 5e-05, + "loss": 0.1684, + "loss/crossentropy": 2.685749888420105, + "loss/hidden": 0.0, + "loss/logits": 0.16840650886297226, + "loss/reg": 2.476419448852539, + "step": 575 + }, + { + "epoch": 0.00576, + "grad_norm": 0.35688483715057373, + "grad_norm_var": 0.0021995018187083346, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.809792697429657, + "loss/hidden": 0.0, + "loss/logits": 0.1611488163471222, + "loss/reg": 2.4737355709075928, + "step": 576 + }, + { + "epoch": 0.00577, + "grad_norm": 0.38194504380226135, + "grad_norm_var": 0.002065385566742454, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.850769340991974, + "loss/hidden": 0.0, + "loss/logits": 0.1615295149385929, + "loss/reg": 2.4713802337646484, + "step": 577 + }, + { + "epoch": 0.00578, + "grad_norm": 0.3567502200603485, + "grad_norm_var": 0.001743510873329986, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.7103776335716248, + "loss/hidden": 0.0, + "loss/logits": 0.16894375160336494, + "loss/reg": 2.4685709476470947, + "step": 578 + }, + { + "epoch": 0.00579, + "grad_norm": 0.3396901786327362, + "grad_norm_var": 0.001824115359163836, + "learning_rate": 5e-05, + "loss": 0.166, + "loss/crossentropy": 2.7079854607582092, + "loss/hidden": 0.0, + "loss/logits": 0.1659584417939186, + "loss/reg": 2.465658664703369, + "step": 579 + }, + { + "epoch": 0.0058, + "grad_norm": 0.358395516872406, + "grad_norm_var": 0.0018331543648902808, + "learning_rate": 5e-05, + "loss": 0.1813, + "loss/crossentropy": 2.8853692412376404, + "loss/hidden": 0.0, + "loss/logits": 0.18133477121591568, + "loss/reg": 2.4633235931396484, + "step": 580 + }, + { + "epoch": 0.00581, + "grad_norm": 0.3434228301048279, + "grad_norm_var": 0.0017310432323107805, + "learning_rate": 5e-05, + "loss": 0.1739, + "loss/crossentropy": 2.666011691093445, + "loss/hidden": 0.0, + "loss/logits": 0.1739240102469921, + "loss/reg": 2.460447072982788, + "step": 581 + }, + { + "epoch": 0.00582, + "grad_norm": 0.3482820689678192, + "grad_norm_var": 0.0014454775261250163, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.6244596242904663, + "loss/hidden": 0.0, + "loss/logits": 0.1845875158905983, + "loss/reg": 2.457307815551758, + "step": 582 + }, + { + "epoch": 0.00583, + "grad_norm": 0.36450985074043274, + "grad_norm_var": 0.0013555723186838029, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.6909091472625732, + "loss/hidden": 0.0, + "loss/logits": 0.174148079007864, + "loss/reg": 2.4545810222625732, + "step": 583 + }, + { + "epoch": 0.00584, + "grad_norm": 0.34841907024383545, + "grad_norm_var": 0.0005772011049318792, + "learning_rate": 5e-05, + "loss": 0.1621, + "loss/crossentropy": 2.7992460131645203, + "loss/hidden": 0.0, + "loss/logits": 0.162076648324728, + "loss/reg": 2.4516048431396484, + "step": 584 + }, + { + "epoch": 0.00585, + "grad_norm": 0.36560627818107605, + "grad_norm_var": 0.0005501529366056079, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.7556354999542236, + "loss/hidden": 0.0, + "loss/logits": 0.16119826585054398, + "loss/reg": 2.4482715129852295, + "step": 585 + }, + { + "epoch": 0.00586, + "grad_norm": 0.37393423914909363, + "grad_norm_var": 0.00033166715444868193, + "learning_rate": 5e-05, + "loss": 0.1779, + "loss/crossentropy": 2.6222774982452393, + "loss/hidden": 0.0, + "loss/logits": 0.1779084950685501, + "loss/reg": 2.4452362060546875, + "step": 586 + }, + { + "epoch": 0.00587, + "grad_norm": 0.3511587679386139, + "grad_norm_var": 0.0002976648126369145, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.7342361211776733, + "loss/hidden": 0.0, + "loss/logits": 0.17614838480949402, + "loss/reg": 2.441678524017334, + "step": 587 + }, + { + "epoch": 0.00588, + "grad_norm": 0.33847615122795105, + "grad_norm_var": 0.0003352805276915209, + "learning_rate": 5e-05, + "loss": 0.173, + "loss/crossentropy": 2.7935328483581543, + "loss/hidden": 0.0, + "loss/logits": 0.17295999452471733, + "loss/reg": 2.437959671020508, + "step": 588 + }, + { + "epoch": 0.00589, + "grad_norm": 0.351034015417099, + "grad_norm_var": 0.00033410454836428903, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.7590489387512207, + "loss/hidden": 0.0, + "loss/logits": 0.1775294505059719, + "loss/reg": 2.4346938133239746, + "step": 589 + }, + { + "epoch": 0.0059, + "grad_norm": 0.37800535559654236, + "grad_norm_var": 0.0003372250971240794, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.75826096534729, + "loss/hidden": 0.0, + "loss/logits": 0.16445999220013618, + "loss/reg": 2.4319543838500977, + "step": 590 + }, + { + "epoch": 0.00591, + "grad_norm": 0.3323316276073456, + "grad_norm_var": 0.0002069473145354402, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.963920295238495, + "loss/hidden": 0.0, + "loss/logits": 0.164412472397089, + "loss/reg": 2.4295387268066406, + "step": 591 + }, + { + "epoch": 0.00592, + "grad_norm": 0.8281128406524658, + "grad_norm_var": 0.014169124282143371, + "learning_rate": 5e-05, + "loss": 0.2256, + "loss/crossentropy": 2.9319988489151, + "loss/hidden": 0.0, + "loss/logits": 0.22560855001211166, + "loss/reg": 2.427125930786133, + "step": 592 + }, + { + "epoch": 0.00593, + "grad_norm": 0.37988972663879395, + "grad_norm_var": 0.014170226758262046, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.92197585105896, + "loss/hidden": 0.0, + "loss/logits": 0.1794501654803753, + "loss/reg": 2.4247610569000244, + "step": 593 + }, + { + "epoch": 0.00594, + "grad_norm": 0.37449878454208374, + "grad_norm_var": 0.014123355612102569, + "learning_rate": 5e-05, + "loss": 0.1756, + "loss/crossentropy": 2.734030842781067, + "loss/hidden": 0.0, + "loss/logits": 0.17557094618678093, + "loss/reg": 2.4224398136138916, + "step": 594 + }, + { + "epoch": 0.00595, + "grad_norm": 0.3890518248081207, + "grad_norm_var": 0.013970946553029018, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.6910988688468933, + "loss/hidden": 0.0, + "loss/logits": 0.17206770926713943, + "loss/reg": 2.4206151962280273, + "step": 595 + }, + { + "epoch": 0.00596, + "grad_norm": 0.45764538645744324, + "grad_norm_var": 0.014180672563351104, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.706140458583832, + "loss/hidden": 0.0, + "loss/logits": 0.1886041909456253, + "loss/reg": 2.418341636657715, + "step": 596 + }, + { + "epoch": 0.00597, + "grad_norm": 0.3294787108898163, + "grad_norm_var": 0.014289226884282809, + "learning_rate": 5e-05, + "loss": 0.1693, + "loss/crossentropy": 2.772903263568878, + "loss/hidden": 0.0, + "loss/logits": 0.16925981268286705, + "loss/reg": 2.415613889694214, + "step": 597 + }, + { + "epoch": 0.00598, + "grad_norm": 0.3425086438655853, + "grad_norm_var": 0.014326812953815705, + "learning_rate": 5e-05, + "loss": 0.17, + "loss/crossentropy": 2.7024609446525574, + "loss/hidden": 0.0, + "loss/logits": 0.16995511576533318, + "loss/reg": 2.4122676849365234, + "step": 598 + }, + { + "epoch": 0.00599, + "grad_norm": 0.37222734093666077, + "grad_norm_var": 0.014300147579082, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.8698896765708923, + "loss/hidden": 0.0, + "loss/logits": 0.17886632308363914, + "loss/reg": 2.4099037647247314, + "step": 599 + }, + { + "epoch": 0.006, + "grad_norm": 0.39135316014289856, + "grad_norm_var": 0.014151428197242365, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.700629711151123, + "loss/hidden": 0.0, + "loss/logits": 0.17468373104929924, + "loss/reg": 2.40794038772583, + "step": 600 + }, + { + "epoch": 0.00601, + "grad_norm": 0.3728218376636505, + "grad_norm_var": 0.014124279912823712, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.794102430343628, + "loss/hidden": 0.0, + "loss/logits": 0.16897983103990555, + "loss/reg": 2.405545711517334, + "step": 601 + }, + { + "epoch": 0.00602, + "grad_norm": 0.37317147850990295, + "grad_norm_var": 0.014126729018321404, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.6252577900886536, + "loss/hidden": 0.0, + "loss/logits": 0.17474820092320442, + "loss/reg": 2.402970790863037, + "step": 602 + }, + { + "epoch": 0.00603, + "grad_norm": 0.35492607951164246, + "grad_norm_var": 0.01410428304541661, + "learning_rate": 5e-05, + "loss": 0.1809, + "loss/crossentropy": 2.6527358889579773, + "loss/hidden": 0.0, + "loss/logits": 0.18093448877334595, + "loss/reg": 2.4007880687713623, + "step": 603 + }, + { + "epoch": 0.00604, + "grad_norm": 0.408010870218277, + "grad_norm_var": 0.013856041692491945, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.874286651611328, + "loss/hidden": 0.0, + "loss/logits": 0.20177744701504707, + "loss/reg": 2.3988449573516846, + "step": 604 + }, + { + "epoch": 0.00605, + "grad_norm": 0.3291812837123871, + "grad_norm_var": 0.014034946168994126, + "learning_rate": 5e-05, + "loss": 0.1614, + "loss/crossentropy": 2.7926167249679565, + "loss/hidden": 0.0, + "loss/logits": 0.16140995919704437, + "loss/reg": 2.3966500759124756, + "step": 605 + }, + { + "epoch": 0.00606, + "grad_norm": 0.34659212827682495, + "grad_norm_var": 0.014192203001449558, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.8195464611053467, + "loss/hidden": 0.0, + "loss/logits": 0.1709096021950245, + "loss/reg": 2.394033670425415, + "step": 606 + }, + { + "epoch": 0.00607, + "grad_norm": 0.32253992557525635, + "grad_norm_var": 0.014285055545239086, + "learning_rate": 5e-05, + "loss": 0.1649, + "loss/crossentropy": 2.7236337065696716, + "loss/hidden": 0.0, + "loss/logits": 0.16493552178144455, + "loss/reg": 2.3918449878692627, + "step": 607 + }, + { + "epoch": 0.00608, + "grad_norm": 0.350931316614151, + "grad_norm_var": 0.0011668026056699994, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.800759196281433, + "loss/hidden": 0.0, + "loss/logits": 0.17175282910466194, + "loss/reg": 2.38920521736145, + "step": 608 + }, + { + "epoch": 0.00609, + "grad_norm": 0.40333986282348633, + "grad_norm_var": 0.001237012928824995, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.7574119567871094, + "loss/hidden": 0.0, + "loss/logits": 0.2046247273683548, + "loss/reg": 2.3865227699279785, + "step": 609 + }, + { + "epoch": 0.0061, + "grad_norm": 0.3773089349269867, + "grad_norm_var": 0.0012392324335123346, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.6313101649284363, + "loss/hidden": 0.0, + "loss/logits": 0.16410250216722488, + "loss/reg": 2.3836612701416016, + "step": 610 + }, + { + "epoch": 0.00611, + "grad_norm": 0.438357949256897, + "grad_norm_var": 0.0015159779907225465, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.780138611793518, + "loss/hidden": 0.0, + "loss/logits": 0.20979087427258492, + "loss/reg": 2.380405902862549, + "step": 611 + }, + { + "epoch": 0.00612, + "grad_norm": 0.34121251106262207, + "grad_norm_var": 0.0010515226822608785, + "learning_rate": 5e-05, + "loss": 0.161, + "loss/crossentropy": 2.669090151786804, + "loss/hidden": 0.0, + "loss/logits": 0.1609921157360077, + "loss/reg": 2.3776426315307617, + "step": 612 + }, + { + "epoch": 0.00613, + "grad_norm": 0.36169829964637756, + "grad_norm_var": 0.0009600577824135296, + "learning_rate": 5e-05, + "loss": 0.1734, + "loss/crossentropy": 2.7925440073013306, + "loss/hidden": 0.0, + "loss/logits": 0.17342102900147438, + "loss/reg": 2.3744184970855713, + "step": 613 + }, + { + "epoch": 0.00614, + "grad_norm": 0.522160530090332, + "grad_norm_var": 0.002369345725690275, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.698939800262451, + "loss/hidden": 0.0, + "loss/logits": 0.16627153754234314, + "loss/reg": 2.370917320251465, + "step": 614 + }, + { + "epoch": 0.00615, + "grad_norm": 0.4562234580516815, + "grad_norm_var": 0.002733171284208069, + "learning_rate": 5e-05, + "loss": 0.1686, + "loss/crossentropy": 2.8971627950668335, + "loss/hidden": 0.0, + "loss/logits": 0.16856613755226135, + "loss/reg": 2.368067979812622, + "step": 615 + }, + { + "epoch": 0.00616, + "grad_norm": 0.5767056345939636, + "grad_norm_var": 0.0050531115809510415, + "learning_rate": 5e-05, + "loss": 0.171, + "loss/crossentropy": 2.822002112865448, + "loss/hidden": 0.0, + "loss/logits": 0.17102698609232903, + "loss/reg": 2.3653640747070312, + "step": 616 + }, + { + "epoch": 0.00617, + "grad_norm": 0.3703908324241638, + "grad_norm_var": 0.005060977204500819, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.761395037174225, + "loss/hidden": 0.0, + "loss/logits": 0.17990661412477493, + "loss/reg": 2.362797975540161, + "step": 617 + }, + { + "epoch": 0.00618, + "grad_norm": 0.44375622272491455, + "grad_norm_var": 0.005159430065949637, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.8019450306892395, + "loss/hidden": 0.0, + "loss/logits": 0.1661831997334957, + "loss/reg": 2.3597350120544434, + "step": 618 + }, + { + "epoch": 0.00619, + "grad_norm": 0.41226035356521606, + "grad_norm_var": 0.005018716701479123, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.837542712688446, + "loss/hidden": 0.0, + "loss/logits": 0.1737065464258194, + "loss/reg": 2.356935977935791, + "step": 619 + }, + { + "epoch": 0.0062, + "grad_norm": 0.36850520968437195, + "grad_norm_var": 0.005094037089036248, + "learning_rate": 5e-05, + "loss": 0.1691, + "loss/crossentropy": 2.872538685798645, + "loss/hidden": 0.0, + "loss/logits": 0.16909406706690788, + "loss/reg": 2.354841709136963, + "step": 620 + }, + { + "epoch": 0.00621, + "grad_norm": 0.3547448217868805, + "grad_norm_var": 0.004888988248098869, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.727312922477722, + "loss/hidden": 0.0, + "loss/logits": 0.17773358151316643, + "loss/reg": 2.3518083095550537, + "step": 621 + }, + { + "epoch": 0.00622, + "grad_norm": 0.3340252637863159, + "grad_norm_var": 0.0049932414292845895, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.7399535179138184, + "loss/hidden": 0.0, + "loss/logits": 0.16725115478038788, + "loss/reg": 2.349299907684326, + "step": 622 + }, + { + "epoch": 0.00623, + "grad_norm": 0.328477680683136, + "grad_norm_var": 0.004932429457390519, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.7973376512527466, + "loss/hidden": 0.0, + "loss/logits": 0.1658070906996727, + "loss/reg": 2.346407175064087, + "step": 623 + }, + { + "epoch": 0.00624, + "grad_norm": 0.3988572061061859, + "grad_norm_var": 0.004746415643168555, + "learning_rate": 5e-05, + "loss": 0.1808, + "loss/crossentropy": 2.886197090148926, + "loss/hidden": 0.0, + "loss/logits": 0.18076446652412415, + "loss/reg": 2.343637228012085, + "step": 624 + }, + { + "epoch": 0.00625, + "grad_norm": 0.3653312921524048, + "grad_norm_var": 0.0048476613679717525, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.6095593571662903, + "loss/hidden": 0.0, + "loss/logits": 0.17522242665290833, + "loss/reg": 2.34155011177063, + "step": 625 + }, + { + "epoch": 0.00626, + "grad_norm": 0.3519672751426697, + "grad_norm_var": 0.004975031863489395, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.724495232105255, + "loss/hidden": 0.0, + "loss/logits": 0.16851425543427467, + "loss/reg": 2.339081048965454, + "step": 626 + }, + { + "epoch": 0.00627, + "grad_norm": 0.3507337272167206, + "grad_norm_var": 0.005024779798457324, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.775688886642456, + "loss/hidden": 0.0, + "loss/logits": 0.1661130003631115, + "loss/reg": 2.3362221717834473, + "step": 627 + }, + { + "epoch": 0.00628, + "grad_norm": 0.35331088304519653, + "grad_norm_var": 0.004945443478871292, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.6876689195632935, + "loss/hidden": 0.0, + "loss/logits": 0.18032584339380264, + "loss/reg": 2.3339033126831055, + "step": 628 + }, + { + "epoch": 0.00629, + "grad_norm": 0.3569658696651459, + "grad_norm_var": 0.004969005818722216, + "learning_rate": 5e-05, + "loss": 0.1646, + "loss/crossentropy": 2.87895804643631, + "loss/hidden": 0.0, + "loss/logits": 0.16462786123156548, + "loss/reg": 2.3317511081695557, + "step": 629 + }, + { + "epoch": 0.0063, + "grad_norm": 0.37102508544921875, + "grad_norm_var": 0.0038649155689368443, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.8995742201805115, + "loss/hidden": 0.0, + "loss/logits": 0.1807471290230751, + "loss/reg": 2.3286077976226807, + "step": 630 + }, + { + "epoch": 0.00631, + "grad_norm": 0.37091144919395447, + "grad_norm_var": 0.0035332975201383715, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.755174398422241, + "loss/hidden": 0.0, + "loss/logits": 0.16785116121172905, + "loss/reg": 2.3263347148895264, + "step": 631 + }, + { + "epoch": 0.00632, + "grad_norm": 0.3764369487762451, + "grad_norm_var": 0.000834165955391919, + "learning_rate": 5e-05, + "loss": 0.1597, + "loss/crossentropy": 2.7826634645462036, + "loss/hidden": 0.0, + "loss/logits": 0.1596829891204834, + "loss/reg": 2.3239433765411377, + "step": 632 + }, + { + "epoch": 0.00633, + "grad_norm": 0.34151408076286316, + "grad_norm_var": 0.0008818179956038841, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.805456221103668, + "loss/hidden": 0.0, + "loss/logits": 0.16277796775102615, + "loss/reg": 2.3215837478637695, + "step": 633 + }, + { + "epoch": 0.00634, + "grad_norm": 0.7558053731918335, + "grad_norm_var": 0.010143553337954326, + "learning_rate": 5e-05, + "loss": 0.1858, + "loss/crossentropy": 2.7678999304771423, + "loss/hidden": 0.0, + "loss/logits": 0.18575545772910118, + "loss/reg": 2.3192989826202393, + "step": 634 + }, + { + "epoch": 0.00635, + "grad_norm": 0.3809748589992523, + "grad_norm_var": 0.010099062255010161, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.86500483751297, + "loss/hidden": 0.0, + "loss/logits": 0.1791505441069603, + "loss/reg": 2.317030906677246, + "step": 635 + }, + { + "epoch": 0.00636, + "grad_norm": 0.40578746795654297, + "grad_norm_var": 0.010104068412990375, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.8707818388938904, + "loss/hidden": 0.0, + "loss/logits": 0.188164584338665, + "loss/reg": 2.3146989345550537, + "step": 636 + }, + { + "epoch": 0.00637, + "grad_norm": 0.415227472782135, + "grad_norm_var": 0.010070131470069877, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.831197440624237, + "loss/hidden": 0.0, + "loss/logits": 0.17476488277316093, + "loss/reg": 2.311936616897583, + "step": 637 + }, + { + "epoch": 0.00638, + "grad_norm": 0.4119730293750763, + "grad_norm_var": 0.00985685373482662, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.6559138894081116, + "loss/hidden": 0.0, + "loss/logits": 0.16989587992429733, + "loss/reg": 2.3090522289276123, + "step": 638 + }, + { + "epoch": 0.00639, + "grad_norm": 0.3662709593772888, + "grad_norm_var": 0.009606093056996168, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.767539858818054, + "loss/hidden": 0.0, + "loss/logits": 0.17754964902997017, + "loss/reg": 2.3056743144989014, + "step": 639 + }, + { + "epoch": 0.0064, + "grad_norm": 0.38491374254226685, + "grad_norm_var": 0.009617242443139995, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.6669586896896362, + "loss/hidden": 0.0, + "loss/logits": 0.18266603723168373, + "loss/reg": 2.303258180618286, + "step": 640 + }, + { + "epoch": 0.00641, + "grad_norm": 0.4197373390197754, + "grad_norm_var": 0.009569272862985524, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.7964502573013306, + "loss/hidden": 0.0, + "loss/logits": 0.17782465368509293, + "loss/reg": 2.300361156463623, + "step": 641 + }, + { + "epoch": 0.00642, + "grad_norm": 0.4097757339477539, + "grad_norm_var": 0.00940137989136159, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.735614001750946, + "loss/hidden": 0.0, + "loss/logits": 0.1855894774198532, + "loss/reg": 2.2972419261932373, + "step": 642 + }, + { + "epoch": 0.00643, + "grad_norm": 0.35904356837272644, + "grad_norm_var": 0.00934616788177974, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.774403393268585, + "loss/hidden": 0.0, + "loss/logits": 0.1832551322877407, + "loss/reg": 2.293954610824585, + "step": 643 + }, + { + "epoch": 0.00644, + "grad_norm": 0.34157049655914307, + "grad_norm_var": 0.009435664127140328, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.8616234064102173, + "loss/hidden": 0.0, + "loss/logits": 0.16181085631251335, + "loss/reg": 2.2906179428100586, + "step": 644 + }, + { + "epoch": 0.00645, + "grad_norm": 0.4255986213684082, + "grad_norm_var": 0.009297406924193945, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.654071033000946, + "loss/hidden": 0.0, + "loss/logits": 0.18530349805951118, + "loss/reg": 2.287349224090576, + "step": 645 + }, + { + "epoch": 0.00646, + "grad_norm": 0.3393001854419708, + "grad_norm_var": 0.009518979339113423, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.73319810628891, + "loss/hidden": 0.0, + "loss/logits": 0.16647282242774963, + "loss/reg": 2.2834725379943848, + "step": 646 + }, + { + "epoch": 0.00647, + "grad_norm": 0.34969252347946167, + "grad_norm_var": 0.00964795505733251, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.7993005514144897, + "loss/hidden": 0.0, + "loss/logits": 0.1833389550447464, + "loss/reg": 2.2799694538116455, + "step": 647 + }, + { + "epoch": 0.00648, + "grad_norm": 0.35388484597206116, + "grad_norm_var": 0.009766310746661707, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.766145169734955, + "loss/hidden": 0.0, + "loss/logits": 0.17488964274525642, + "loss/reg": 2.277585029602051, + "step": 648 + }, + { + "epoch": 0.00649, + "grad_norm": 0.5462765097618103, + "grad_norm_var": 0.010685818975949597, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.7475533485412598, + "loss/hidden": 0.0, + "loss/logits": 0.18824508786201477, + "loss/reg": 2.2750473022460938, + "step": 649 + }, + { + "epoch": 0.0065, + "grad_norm": 0.3537692725658417, + "grad_norm_var": 0.002605622083243005, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.756273865699768, + "loss/hidden": 0.0, + "loss/logits": 0.17612234875559807, + "loss/reg": 2.2722957134246826, + "step": 650 + }, + { + "epoch": 0.00651, + "grad_norm": 0.3770252466201782, + "grad_norm_var": 0.0026121330513858157, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.7889973521232605, + "loss/hidden": 0.0, + "loss/logits": 0.1897362545132637, + "loss/reg": 2.2701008319854736, + "step": 651 + }, + { + "epoch": 0.00652, + "grad_norm": 0.4475138187408447, + "grad_norm_var": 0.0028018836674080227, + "learning_rate": 5e-05, + "loss": 0.1951, + "loss/crossentropy": 2.531024992465973, + "loss/hidden": 0.0, + "loss/logits": 0.1951226033270359, + "loss/reg": 2.267695665359497, + "step": 652 + }, + { + "epoch": 0.00653, + "grad_norm": 0.3947466313838959, + "grad_norm_var": 0.002769718525090366, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.886034905910492, + "loss/hidden": 0.0, + "loss/logits": 0.19575949385762215, + "loss/reg": 2.2648732662200928, + "step": 653 + }, + { + "epoch": 0.00654, + "grad_norm": 0.3775857090950012, + "grad_norm_var": 0.0027546537142078996, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.7190786600112915, + "loss/hidden": 0.0, + "loss/logits": 0.17461128905415535, + "loss/reg": 2.2620925903320312, + "step": 654 + }, + { + "epoch": 0.00655, + "grad_norm": 0.34534481167793274, + "grad_norm_var": 0.002849399631435645, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.8847506046295166, + "loss/hidden": 0.0, + "loss/logits": 0.1791832633316517, + "loss/reg": 2.2593743801116943, + "step": 655 + }, + { + "epoch": 0.00656, + "grad_norm": 0.3607633411884308, + "grad_norm_var": 0.0028993682580486144, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.815674066543579, + "loss/hidden": 0.0, + "loss/logits": 0.179163109511137, + "loss/reg": 2.2569730281829834, + "step": 656 + }, + { + "epoch": 0.00657, + "grad_norm": 0.38781270384788513, + "grad_norm_var": 0.002826278400635814, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.6106160283088684, + "loss/hidden": 0.0, + "loss/logits": 0.1617795117199421, + "loss/reg": 2.254523277282715, + "step": 657 + }, + { + "epoch": 0.00658, + "grad_norm": 0.40386784076690674, + "grad_norm_var": 0.0028094212847462165, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.8471227884292603, + "loss/hidden": 0.0, + "loss/logits": 0.17474086582660675, + "loss/reg": 2.252164125442505, + "step": 658 + }, + { + "epoch": 0.00659, + "grad_norm": 0.36319243907928467, + "grad_norm_var": 0.002796007207749466, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.7625906467437744, + "loss/hidden": 0.0, + "loss/logits": 0.16296877712011337, + "loss/reg": 2.249460458755493, + "step": 659 + }, + { + "epoch": 0.0066, + "grad_norm": 0.3657222092151642, + "grad_norm_var": 0.00269101182172804, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.781547486782074, + "loss/hidden": 0.0, + "loss/logits": 0.1777149885892868, + "loss/reg": 2.246467113494873, + "step": 660 + }, + { + "epoch": 0.00661, + "grad_norm": 0.38363057374954224, + "grad_norm_var": 0.0025851401210759276, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.82689893245697, + "loss/hidden": 0.0, + "loss/logits": 0.1844283789396286, + "loss/reg": 2.2436530590057373, + "step": 661 + }, + { + "epoch": 0.00662, + "grad_norm": 0.4096749424934387, + "grad_norm_var": 0.0024716520181473594, + "learning_rate": 5e-05, + "loss": 0.1745, + "loss/crossentropy": 2.8063756823539734, + "loss/hidden": 0.0, + "loss/logits": 0.17451731115579605, + "loss/reg": 2.241178035736084, + "step": 662 + }, + { + "epoch": 0.00663, + "grad_norm": 0.42931249737739563, + "grad_norm_var": 0.0024528927297352344, + "learning_rate": 5e-05, + "loss": 0.186, + "loss/crossentropy": 2.8724401593208313, + "loss/hidden": 0.0, + "loss/logits": 0.1859952136874199, + "loss/reg": 2.2383124828338623, + "step": 663 + }, + { + "epoch": 0.00664, + "grad_norm": 0.3530314862728119, + "grad_norm_var": 0.0024574750299312478, + "learning_rate": 5e-05, + "loss": 0.1696, + "loss/crossentropy": 2.9292226433753967, + "loss/hidden": 0.0, + "loss/logits": 0.16955319419503212, + "loss/reg": 2.2356791496276855, + "step": 664 + }, + { + "epoch": 0.00665, + "grad_norm": 0.4304611384868622, + "grad_norm_var": 0.0009397736187397402, + "learning_rate": 5e-05, + "loss": 0.1902, + "loss/crossentropy": 2.7114855647087097, + "loss/hidden": 0.0, + "loss/logits": 0.19023016840219498, + "loss/reg": 2.2330329418182373, + "step": 665 + }, + { + "epoch": 0.00666, + "grad_norm": 0.32996541261672974, + "grad_norm_var": 0.0010789617804694747, + "learning_rate": 5e-05, + "loss": 0.158, + "loss/crossentropy": 2.8920618891716003, + "loss/hidden": 0.0, + "loss/logits": 0.1580132134258747, + "loss/reg": 2.2296440601348877, + "step": 666 + }, + { + "epoch": 0.00667, + "grad_norm": 0.3874596953392029, + "grad_norm_var": 0.0010747020479673205, + "learning_rate": 5e-05, + "loss": 0.1819, + "loss/crossentropy": 2.7297377586364746, + "loss/hidden": 0.0, + "loss/logits": 0.18189727514982224, + "loss/reg": 2.226966619491577, + "step": 667 + }, + { + "epoch": 0.00668, + "grad_norm": 0.36097773909568787, + "grad_norm_var": 0.000828712243429038, + "learning_rate": 5e-05, + "loss": 0.1726, + "loss/crossentropy": 2.6433697938919067, + "loss/hidden": 0.0, + "loss/logits": 0.17256683483719826, + "loss/reg": 2.2244138717651367, + "step": 668 + }, + { + "epoch": 0.00669, + "grad_norm": 0.3509676158428192, + "grad_norm_var": 0.0008637156407869958, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.8315157890319824, + "loss/hidden": 0.0, + "loss/logits": 0.17234884947538376, + "loss/reg": 2.220712661743164, + "step": 669 + }, + { + "epoch": 0.0067, + "grad_norm": 0.3578469157218933, + "grad_norm_var": 0.0008878035089742793, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.788190722465515, + "loss/hidden": 0.0, + "loss/logits": 0.16572094336152077, + "loss/reg": 2.217878818511963, + "step": 670 + }, + { + "epoch": 0.00671, + "grad_norm": 0.4930081069469452, + "grad_norm_var": 0.0016420680378558003, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 3.013857901096344, + "loss/hidden": 0.0, + "loss/logits": 0.18175816163420677, + "loss/reg": 2.2148284912109375, + "step": 671 + }, + { + "epoch": 0.00672, + "grad_norm": 0.36925604939460754, + "grad_norm_var": 0.0016185866984450236, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.8940696716308594, + "loss/hidden": 0.0, + "loss/logits": 0.1642276532948017, + "loss/reg": 2.2119295597076416, + "step": 672 + }, + { + "epoch": 0.00673, + "grad_norm": 0.4327005445957184, + "grad_norm_var": 0.0017552981165500747, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.87309467792511, + "loss/hidden": 0.0, + "loss/logits": 0.17423933744430542, + "loss/reg": 2.209021806716919, + "step": 673 + }, + { + "epoch": 0.00674, + "grad_norm": 0.738524854183197, + "grad_norm_var": 0.009426579051544037, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.8040258288383484, + "loss/hidden": 0.0, + "loss/logits": 0.1867678351700306, + "loss/reg": 2.2066619396209717, + "step": 674 + }, + { + "epoch": 0.00675, + "grad_norm": 0.4364205002784729, + "grad_norm_var": 0.009307313279513674, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.718536138534546, + "loss/hidden": 0.0, + "loss/logits": 0.17956989258527756, + "loss/reg": 2.203990936279297, + "step": 675 + }, + { + "epoch": 0.00676, + "grad_norm": 0.41067376732826233, + "grad_norm_var": 0.009142390414932911, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.72940456867218, + "loss/hidden": 0.0, + "loss/logits": 0.16421709582209587, + "loss/reg": 2.2013046741485596, + "step": 676 + }, + { + "epoch": 0.00677, + "grad_norm": 0.4327182173728943, + "grad_norm_var": 0.009073804614162174, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.7371246814727783, + "loss/hidden": 0.0, + "loss/logits": 0.19054419547319412, + "loss/reg": 2.1991653442382812, + "step": 677 + }, + { + "epoch": 0.00678, + "grad_norm": 0.3779783844947815, + "grad_norm_var": 0.009181024716334075, + "learning_rate": 5e-05, + "loss": 0.1589, + "loss/crossentropy": 2.7500953674316406, + "loss/hidden": 0.0, + "loss/logits": 0.15891055390238762, + "loss/reg": 2.197261333465576, + "step": 678 + }, + { + "epoch": 0.00679, + "grad_norm": 0.3585035502910614, + "grad_norm_var": 0.009389539404841711, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.869826376438141, + "loss/hidden": 0.0, + "loss/logits": 0.18359991908073425, + "loss/reg": 2.195239782333374, + "step": 679 + }, + { + "epoch": 0.0068, + "grad_norm": 0.3534944951534271, + "grad_norm_var": 0.009385802469305704, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.8476794362068176, + "loss/hidden": 0.0, + "loss/logits": 0.1689487136900425, + "loss/reg": 2.1929402351379395, + "step": 680 + }, + { + "epoch": 0.00681, + "grad_norm": 0.3718988001346588, + "grad_norm_var": 0.009470130435250168, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.7930009365081787, + "loss/hidden": 0.0, + "loss/logits": 0.17038631066679955, + "loss/reg": 2.19075608253479, + "step": 681 + }, + { + "epoch": 0.00682, + "grad_norm": 0.4854961037635803, + "grad_norm_var": 0.009319177707927173, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.8028470277786255, + "loss/hidden": 0.0, + "loss/logits": 0.17054682224988937, + "loss/reg": 2.1886627674102783, + "step": 682 + }, + { + "epoch": 0.00683, + "grad_norm": 0.3880312144756317, + "grad_norm_var": 0.00931672834921676, + "learning_rate": 5e-05, + "loss": 0.1766, + "loss/crossentropy": 2.963544547557831, + "loss/hidden": 0.0, + "loss/logits": 0.17662956938147545, + "loss/reg": 2.1860859394073486, + "step": 683 + }, + { + "epoch": 0.00684, + "grad_norm": 0.3488878309726715, + "grad_norm_var": 0.009420855437860176, + "learning_rate": 5e-05, + "loss": 0.1667, + "loss/crossentropy": 2.959736704826355, + "loss/hidden": 0.0, + "loss/logits": 0.16670886427164078, + "loss/reg": 2.183668375015259, + "step": 684 + }, + { + "epoch": 0.00685, + "grad_norm": 0.8154363632202148, + "grad_norm_var": 0.018681551405985854, + "learning_rate": 5e-05, + "loss": 0.2213, + "loss/crossentropy": 2.911233067512512, + "loss/hidden": 0.0, + "loss/logits": 0.2212524674832821, + "loss/reg": 2.1813154220581055, + "step": 685 + }, + { + "epoch": 0.00686, + "grad_norm": 0.4155946969985962, + "grad_norm_var": 0.018194440840509217, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.7186298966407776, + "loss/hidden": 0.0, + "loss/logits": 0.19640850275754929, + "loss/reg": 2.1795501708984375, + "step": 686 + }, + { + "epoch": 0.00687, + "grad_norm": 0.38160914182662964, + "grad_norm_var": 0.01835781299917098, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.7301290035247803, + "loss/hidden": 0.0, + "loss/logits": 0.17081937566399574, + "loss/reg": 2.177623748779297, + "step": 687 + }, + { + "epoch": 0.00688, + "grad_norm": 0.41628003120422363, + "grad_norm_var": 0.01802219976069038, + "learning_rate": 5e-05, + "loss": 0.1771, + "loss/crossentropy": 2.7876546382904053, + "loss/hidden": 0.0, + "loss/logits": 0.17706667259335518, + "loss/reg": 2.175504446029663, + "step": 688 + }, + { + "epoch": 0.00689, + "grad_norm": 0.4177417755126953, + "grad_norm_var": 0.018066232212721724, + "learning_rate": 5e-05, + "loss": 0.1786, + "loss/crossentropy": 2.763257145881653, + "loss/hidden": 0.0, + "loss/logits": 0.1785966381430626, + "loss/reg": 2.173213481903076, + "step": 689 + }, + { + "epoch": 0.0069, + "grad_norm": 0.3603265583515167, + "grad_norm_var": 0.012296751904473697, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.774847447872162, + "loss/hidden": 0.0, + "loss/logits": 0.16783085092902184, + "loss/reg": 2.1712427139282227, + "step": 690 + }, + { + "epoch": 0.00691, + "grad_norm": 0.4307333827018738, + "grad_norm_var": 0.01228874334383105, + "learning_rate": 5e-05, + "loss": 0.2034, + "loss/crossentropy": 2.6488924622535706, + "loss/hidden": 0.0, + "loss/logits": 0.20339511707425117, + "loss/reg": 2.170015573501587, + "step": 691 + }, + { + "epoch": 0.00692, + "grad_norm": 0.3678703010082245, + "grad_norm_var": 0.012472673417673882, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.8285900950431824, + "loss/hidden": 0.0, + "loss/logits": 0.179282795637846, + "loss/reg": 2.1680784225463867, + "step": 692 + }, + { + "epoch": 0.00693, + "grad_norm": 0.3516632914543152, + "grad_norm_var": 0.012747599104723136, + "learning_rate": 5e-05, + "loss": 0.1638, + "loss/crossentropy": 2.72187340259552, + "loss/hidden": 0.0, + "loss/logits": 0.16377655416727066, + "loss/reg": 2.166708469390869, + "step": 693 + }, + { + "epoch": 0.00694, + "grad_norm": 0.37773895263671875, + "grad_norm_var": 0.012748787659448176, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.5079989433288574, + "loss/hidden": 0.0, + "loss/logits": 0.19995050877332687, + "loss/reg": 2.1644845008850098, + "step": 694 + }, + { + "epoch": 0.00695, + "grad_norm": 0.33557403087615967, + "grad_norm_var": 0.012954622340141124, + "learning_rate": 5e-05, + "loss": 0.173, + "loss/crossentropy": 2.733457326889038, + "loss/hidden": 0.0, + "loss/logits": 0.1730196811258793, + "loss/reg": 2.162649631500244, + "step": 695 + }, + { + "epoch": 0.00696, + "grad_norm": 0.3414340615272522, + "grad_norm_var": 0.01306044443406886, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.770694136619568, + "loss/hidden": 0.0, + "loss/logits": 0.1698729656636715, + "loss/reg": 2.1605873107910156, + "step": 696 + }, + { + "epoch": 0.00697, + "grad_norm": 0.39742037653923035, + "grad_norm_var": 0.012961649579914787, + "learning_rate": 5e-05, + "loss": 0.1753, + "loss/crossentropy": 2.747798502445221, + "loss/hidden": 0.0, + "loss/logits": 0.17528066039085388, + "loss/reg": 2.158661127090454, + "step": 697 + }, + { + "epoch": 0.00698, + "grad_norm": 0.4672209620475769, + "grad_norm_var": 0.012809503544980934, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.764335811138153, + "loss/hidden": 0.0, + "loss/logits": 0.1961456499993801, + "loss/reg": 2.157139539718628, + "step": 698 + }, + { + "epoch": 0.00699, + "grad_norm": 0.40900057554244995, + "grad_norm_var": 0.012766202979620484, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.9526583552360535, + "loss/hidden": 0.0, + "loss/logits": 0.1826096773147583, + "loss/reg": 2.1556172370910645, + "step": 699 + }, + { + "epoch": 0.007, + "grad_norm": 0.45763787627220154, + "grad_norm_var": 0.01255169197725956, + "learning_rate": 5e-05, + "loss": 0.1845, + "loss/crossentropy": 2.9059385657310486, + "loss/hidden": 0.0, + "loss/logits": 0.18454358726739883, + "loss/reg": 2.1542842388153076, + "step": 700 + }, + { + "epoch": 0.00701, + "grad_norm": 0.568651020526886, + "grad_norm_var": 0.0033942912710514268, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.771495759487152, + "loss/hidden": 0.0, + "loss/logits": 0.18007055297493935, + "loss/reg": 2.1522164344787598, + "step": 701 + }, + { + "epoch": 0.00702, + "grad_norm": 0.3590672016143799, + "grad_norm_var": 0.00352192003862181, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.750881016254425, + "loss/hidden": 0.0, + "loss/logits": 0.1650897115468979, + "loss/reg": 2.1509296894073486, + "step": 702 + }, + { + "epoch": 0.00703, + "grad_norm": 0.36948493123054504, + "grad_norm_var": 0.0035648755964216056, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.8027891516685486, + "loss/hidden": 0.0, + "loss/logits": 0.17848948016762733, + "loss/reg": 2.149231195449829, + "step": 703 + }, + { + "epoch": 0.00704, + "grad_norm": 0.3613908588886261, + "grad_norm_var": 0.0036467673242235915, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.763719141483307, + "loss/hidden": 0.0, + "loss/logits": 0.16823140904307365, + "loss/reg": 2.1478073596954346, + "step": 704 + }, + { + "epoch": 0.00705, + "grad_norm": 0.38240060210227966, + "grad_norm_var": 0.003633263034560896, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.705716133117676, + "loss/hidden": 0.0, + "loss/logits": 0.1780022643506527, + "loss/reg": 2.1462342739105225, + "step": 705 + }, + { + "epoch": 0.00706, + "grad_norm": 0.3587467074394226, + "grad_norm_var": 0.0036409547879681387, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.7772558331489563, + "loss/hidden": 0.0, + "loss/logits": 0.16575098782777786, + "loss/reg": 2.144062042236328, + "step": 706 + }, + { + "epoch": 0.00707, + "grad_norm": 0.36025822162628174, + "grad_norm_var": 0.0036250184261099458, + "learning_rate": 5e-05, + "loss": 0.1724, + "loss/crossentropy": 2.634014904499054, + "loss/hidden": 0.0, + "loss/logits": 0.17235567048192024, + "loss/reg": 2.1429154872894287, + "step": 707 + }, + { + "epoch": 0.00708, + "grad_norm": 0.35575759410858154, + "grad_norm_var": 0.0036725083584184842, + "learning_rate": 5e-05, + "loss": 0.1794, + "loss/crossentropy": 2.6474004983901978, + "loss/hidden": 0.0, + "loss/logits": 0.17943605780601501, + "loss/reg": 2.1421751976013184, + "step": 708 + }, + { + "epoch": 0.00709, + "grad_norm": 0.3865105211734772, + "grad_norm_var": 0.003566375202589933, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.9082140922546387, + "loss/hidden": 0.0, + "loss/logits": 0.18172414600849152, + "loss/reg": 2.1403071880340576, + "step": 709 + }, + { + "epoch": 0.0071, + "grad_norm": 0.368362694978714, + "grad_norm_var": 0.003590971719305887, + "learning_rate": 5e-05, + "loss": 0.1711, + "loss/crossentropy": 2.6766469478607178, + "loss/hidden": 0.0, + "loss/logits": 0.17112310975790024, + "loss/reg": 2.138167142868042, + "step": 710 + }, + { + "epoch": 0.00711, + "grad_norm": 0.34797176718711853, + "grad_norm_var": 0.003506589552138199, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.8545928597450256, + "loss/hidden": 0.0, + "loss/logits": 0.16299721226096153, + "loss/reg": 2.1356112957000732, + "step": 711 + }, + { + "epoch": 0.00712, + "grad_norm": 0.3511999547481537, + "grad_norm_var": 0.0034451354888741254, + "learning_rate": 5e-05, + "loss": 0.1706, + "loss/crossentropy": 2.7516467571258545, + "loss/hidden": 0.0, + "loss/logits": 0.1705768182873726, + "loss/reg": 2.13396954536438, + "step": 712 + }, + { + "epoch": 0.00713, + "grad_norm": 0.4692562520503998, + "grad_norm_var": 0.0038021677070381584, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.7622230648994446, + "loss/hidden": 0.0, + "loss/logits": 0.1771526113152504, + "loss/reg": 2.1316707134246826, + "step": 713 + }, + { + "epoch": 0.00714, + "grad_norm": 0.3500974774360657, + "grad_norm_var": 0.003583350276630167, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.7385149598121643, + "loss/hidden": 0.0, + "loss/logits": 0.16512250155210495, + "loss/reg": 2.1301488876342773, + "step": 714 + }, + { + "epoch": 0.00715, + "grad_norm": 0.33279696106910706, + "grad_norm_var": 0.0037632620297312364, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.6760587096214294, + "loss/hidden": 0.0, + "loss/logits": 0.16387901455163956, + "loss/reg": 2.128563165664673, + "step": 715 + }, + { + "epoch": 0.00716, + "grad_norm": 0.36436334252357483, + "grad_norm_var": 0.003418879723208453, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.7055559158325195, + "loss/hidden": 0.0, + "loss/logits": 0.16751762479543686, + "loss/reg": 2.1272294521331787, + "step": 716 + }, + { + "epoch": 0.00717, + "grad_norm": 0.35308849811553955, + "grad_norm_var": 0.0009122804473129371, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.827264368534088, + "loss/hidden": 0.0, + "loss/logits": 0.16845671087503433, + "loss/reg": 2.125559091567993, + "step": 717 + }, + { + "epoch": 0.00718, + "grad_norm": 0.36609259247779846, + "grad_norm_var": 0.0009080073745675876, + "learning_rate": 5e-05, + "loss": 0.1878, + "loss/crossentropy": 2.7995529770851135, + "loss/hidden": 0.0, + "loss/logits": 0.18778567016124725, + "loss/reg": 2.12422513961792, + "step": 718 + }, + { + "epoch": 0.00719, + "grad_norm": 0.3564467430114746, + "grad_norm_var": 0.0009149400496893722, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.848701000213623, + "loss/hidden": 0.0, + "loss/logits": 0.16652807220816612, + "loss/reg": 2.1227705478668213, + "step": 719 + }, + { + "epoch": 0.0072, + "grad_norm": 0.3523035943508148, + "grad_norm_var": 0.0009263477116920882, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.7714666724205017, + "loss/hidden": 0.0, + "loss/logits": 0.16896242648363113, + "loss/reg": 2.1214191913604736, + "step": 720 + }, + { + "epoch": 0.00721, + "grad_norm": 0.39885270595550537, + "grad_norm_var": 0.0009792887842439849, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.835131287574768, + "loss/hidden": 0.0, + "loss/logits": 0.1700747236609459, + "loss/reg": 2.1200404167175293, + "step": 721 + }, + { + "epoch": 0.00722, + "grad_norm": 0.40293964743614197, + "grad_norm_var": 0.0010526817455953927, + "learning_rate": 5e-05, + "loss": 0.1819, + "loss/crossentropy": 2.744925618171692, + "loss/hidden": 0.0, + "loss/logits": 0.1818903423845768, + "loss/reg": 2.1181347370147705, + "step": 722 + }, + { + "epoch": 0.00723, + "grad_norm": 0.5598530769348145, + "grad_norm_var": 0.0032894654306610577, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.983691990375519, + "loss/hidden": 0.0, + "loss/logits": 0.16615596786141396, + "loss/reg": 2.1172993183135986, + "step": 723 + }, + { + "epoch": 0.00724, + "grad_norm": 0.39669546484947205, + "grad_norm_var": 0.003249640426166478, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.709549069404602, + "loss/hidden": 0.0, + "loss/logits": 0.17345992848277092, + "loss/reg": 2.1152801513671875, + "step": 724 + }, + { + "epoch": 0.00725, + "grad_norm": 0.35726040601730347, + "grad_norm_var": 0.0032964500726321067, + "learning_rate": 5e-05, + "loss": 0.1624, + "loss/crossentropy": 2.7148231267929077, + "loss/hidden": 0.0, + "loss/logits": 0.16241873800754547, + "loss/reg": 2.1130924224853516, + "step": 725 + }, + { + "epoch": 0.00726, + "grad_norm": 0.3927571177482605, + "grad_norm_var": 0.0032861190572127997, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.7939482927322388, + "loss/hidden": 0.0, + "loss/logits": 0.1794990859925747, + "loss/reg": 2.111480712890625, + "step": 726 + }, + { + "epoch": 0.00727, + "grad_norm": 0.3941044807434082, + "grad_norm_var": 0.0031944564404073005, + "learning_rate": 5e-05, + "loss": 0.1712, + "loss/crossentropy": 2.834249794483185, + "loss/hidden": 0.0, + "loss/logits": 0.1711888276040554, + "loss/reg": 2.109204053878784, + "step": 727 + }, + { + "epoch": 0.00728, + "grad_norm": 0.4828793704509735, + "grad_norm_var": 0.0036429198556795937, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.899094045162201, + "loss/hidden": 0.0, + "loss/logits": 0.20823358744382858, + "loss/reg": 2.1061551570892334, + "step": 728 + }, + { + "epoch": 0.00729, + "grad_norm": 0.3574215769767761, + "grad_norm_var": 0.003326472236741973, + "learning_rate": 5e-05, + "loss": 0.1596, + "loss/crossentropy": 2.7636680603027344, + "loss/hidden": 0.0, + "loss/logits": 0.15961402654647827, + "loss/reg": 2.104001760482788, + "step": 729 + }, + { + "epoch": 0.0073, + "grad_norm": 0.40163764357566833, + "grad_norm_var": 0.003227754706050412, + "learning_rate": 5e-05, + "loss": 0.1797, + "loss/crossentropy": 2.8588566184043884, + "loss/hidden": 0.0, + "loss/logits": 0.17974677309393883, + "loss/reg": 2.102442741394043, + "step": 730 + }, + { + "epoch": 0.00731, + "grad_norm": 0.37189754843711853, + "grad_norm_var": 0.003015475193035148, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.826458215713501, + "loss/hidden": 0.0, + "loss/logits": 0.16728204488754272, + "loss/reg": 2.1002049446105957, + "step": 731 + }, + { + "epoch": 0.00732, + "grad_norm": 0.3587784171104431, + "grad_norm_var": 0.003039707591927121, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.731923222541809, + "loss/hidden": 0.0, + "loss/logits": 0.16448039561510086, + "loss/reg": 2.098315954208374, + "step": 732 + }, + { + "epoch": 0.00733, + "grad_norm": 0.37631648778915405, + "grad_norm_var": 0.002946915065401934, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.880859136581421, + "loss/hidden": 0.0, + "loss/logits": 0.17137856781482697, + "loss/reg": 2.0962650775909424, + "step": 733 + }, + { + "epoch": 0.00734, + "grad_norm": 0.3563605844974518, + "grad_norm_var": 0.002990850657754888, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.6355279088020325, + "loss/hidden": 0.0, + "loss/logits": 0.16278789564967155, + "loss/reg": 2.094142436981201, + "step": 734 + }, + { + "epoch": 0.00735, + "grad_norm": 0.37199047207832336, + "grad_norm_var": 0.0029265023383142925, + "learning_rate": 5e-05, + "loss": 0.1802, + "loss/crossentropy": 2.769617021083832, + "loss/hidden": 0.0, + "loss/logits": 0.18019907549023628, + "loss/reg": 2.0921125411987305, + "step": 735 + }, + { + "epoch": 0.00736, + "grad_norm": 0.489103764295578, + "grad_norm_var": 0.0033036264225515164, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.7491883039474487, + "loss/hidden": 0.0, + "loss/logits": 0.1831774264574051, + "loss/reg": 2.0904338359832764, + "step": 736 + }, + { + "epoch": 0.00737, + "grad_norm": 0.5059826970100403, + "grad_norm_var": 0.003943075932518525, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.8231146931648254, + "loss/hidden": 0.0, + "loss/logits": 0.18174266442656517, + "loss/reg": 2.0879714488983154, + "step": 737 + }, + { + "epoch": 0.00738, + "grad_norm": 0.6662333011627197, + "grad_norm_var": 0.007992879009924207, + "learning_rate": 5e-05, + "loss": 0.1861, + "loss/crossentropy": 2.7952335476875305, + "loss/hidden": 0.0, + "loss/logits": 0.18612126260995865, + "loss/reg": 2.0855941772460938, + "step": 738 + }, + { + "epoch": 0.00739, + "grad_norm": 0.43555790185928345, + "grad_norm_var": 0.006764259520141217, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.7390406727790833, + "loss/hidden": 0.0, + "loss/logits": 0.1822943352162838, + "loss/reg": 2.083002805709839, + "step": 739 + }, + { + "epoch": 0.0074, + "grad_norm": 0.36206063628196716, + "grad_norm_var": 0.0069454028516603905, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.6245489716529846, + "loss/hidden": 0.0, + "loss/logits": 0.17474905773997307, + "loss/reg": 2.0807597637176514, + "step": 740 + }, + { + "epoch": 0.00741, + "grad_norm": 0.4077146649360657, + "grad_norm_var": 0.006699115025232619, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.734727144241333, + "loss/hidden": 0.0, + "loss/logits": 0.20996900647878647, + "loss/reg": 2.0777878761291504, + "step": 741 + }, + { + "epoch": 0.00742, + "grad_norm": 0.4748740792274475, + "grad_norm_var": 0.0068148961293998615, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.757317006587982, + "loss/hidden": 0.0, + "loss/logits": 0.19259492680430412, + "loss/reg": 2.0748398303985596, + "step": 742 + }, + { + "epoch": 0.00743, + "grad_norm": 0.3738694190979004, + "grad_norm_var": 0.006926021168675212, + "learning_rate": 5e-05, + "loss": 0.1671, + "loss/crossentropy": 2.7804144620895386, + "loss/hidden": 0.0, + "loss/logits": 0.1671152375638485, + "loss/reg": 2.072981119155884, + "step": 743 + }, + { + "epoch": 0.00744, + "grad_norm": 0.4373812675476074, + "grad_norm_var": 0.006701504868688938, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.8251866698265076, + "loss/hidden": 0.0, + "loss/logits": 0.183602724224329, + "loss/reg": 2.069178581237793, + "step": 744 + }, + { + "epoch": 0.00745, + "grad_norm": 0.41339626908302307, + "grad_norm_var": 0.006417608208757027, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.7784698605537415, + "loss/hidden": 0.0, + "loss/logits": 0.16648468375205994, + "loss/reg": 2.066897392272949, + "step": 745 + }, + { + "epoch": 0.00746, + "grad_norm": 0.36906108260154724, + "grad_norm_var": 0.0065862671267569286, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.7134994864463806, + "loss/hidden": 0.0, + "loss/logits": 0.17522963881492615, + "loss/reg": 2.063711404800415, + "step": 746 + }, + { + "epoch": 0.00747, + "grad_norm": 0.3699776232242584, + "grad_norm_var": 0.006599620482715507, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.6448380947113037, + "loss/hidden": 0.0, + "loss/logits": 0.1815556287765503, + "loss/reg": 2.0618152618408203, + "step": 747 + }, + { + "epoch": 0.00748, + "grad_norm": 0.35848432779312134, + "grad_norm_var": 0.006602145753337363, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.577029287815094, + "loss/hidden": 0.0, + "loss/logits": 0.1758727729320526, + "loss/reg": 2.0592784881591797, + "step": 748 + }, + { + "epoch": 0.00749, + "grad_norm": 0.40015411376953125, + "grad_norm_var": 0.006489211309593653, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.7719894647598267, + "loss/hidden": 0.0, + "loss/logits": 0.20066174119710922, + "loss/reg": 2.056396484375, + "step": 749 + }, + { + "epoch": 0.0075, + "grad_norm": 0.34235846996307373, + "grad_norm_var": 0.006628701391081989, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.8526532649993896, + "loss/hidden": 0.0, + "loss/logits": 0.1660567931830883, + "loss/reg": 2.053225040435791, + "step": 750 + }, + { + "epoch": 0.00751, + "grad_norm": 0.37578198313713074, + "grad_norm_var": 0.006603490490161393, + "learning_rate": 5e-05, + "loss": 0.192, + "loss/crossentropy": 2.803673267364502, + "loss/hidden": 0.0, + "loss/logits": 0.19199685007333755, + "loss/reg": 2.0500073432922363, + "step": 751 + }, + { + "epoch": 0.00752, + "grad_norm": 0.3724234700202942, + "grad_norm_var": 0.006439587327084632, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.8497246503829956, + "loss/hidden": 0.0, + "loss/logits": 0.16111965849995613, + "loss/reg": 2.0481040477752686, + "step": 752 + }, + { + "epoch": 0.00753, + "grad_norm": 0.37283533811569214, + "grad_norm_var": 0.005960471364599432, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.580562174320221, + "loss/hidden": 0.0, + "loss/logits": 0.18162427470088005, + "loss/reg": 2.045363187789917, + "step": 753 + }, + { + "epoch": 0.00754, + "grad_norm": 0.42849722504615784, + "grad_norm_var": 0.0013156070888824681, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.7384997606277466, + "loss/hidden": 0.0, + "loss/logits": 0.1885378062725067, + "loss/reg": 2.0420894622802734, + "step": 754 + }, + { + "epoch": 0.00755, + "grad_norm": 0.3246319890022278, + "grad_norm_var": 0.0014611472372319412, + "learning_rate": 5e-05, + "loss": 0.152, + "loss/crossentropy": 2.827781558036804, + "loss/hidden": 0.0, + "loss/logits": 0.15203238278627396, + "loss/reg": 2.0399134159088135, + "step": 755 + }, + { + "epoch": 0.00756, + "grad_norm": 0.3523566722869873, + "grad_norm_var": 0.0014986135555234647, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.7049853801727295, + "loss/hidden": 0.0, + "loss/logits": 0.17991740256547928, + "loss/reg": 2.036095142364502, + "step": 756 + }, + { + "epoch": 0.00757, + "grad_norm": 0.3352646827697754, + "grad_norm_var": 0.0016155829783374783, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.715296685695648, + "loss/hidden": 0.0, + "loss/logits": 0.16122159361839294, + "loss/reg": 2.0335283279418945, + "step": 757 + }, + { + "epoch": 0.00758, + "grad_norm": 0.36173179745674133, + "grad_norm_var": 0.001004548523534495, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.8797001242637634, + "loss/hidden": 0.0, + "loss/logits": 0.17676853761076927, + "loss/reg": 2.0315983295440674, + "step": 758 + }, + { + "epoch": 0.00759, + "grad_norm": 0.43379032611846924, + "grad_norm_var": 0.0012258123535982288, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.7367305159568787, + "loss/hidden": 0.0, + "loss/logits": 0.20076703280210495, + "loss/reg": 2.028825521469116, + "step": 759 + }, + { + "epoch": 0.0076, + "grad_norm": 0.7135851979255676, + "grad_norm_var": 0.008180404333396396, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.7777557373046875, + "loss/hidden": 0.0, + "loss/logits": 0.19884883239865303, + "loss/reg": 2.0258262157440186, + "step": 760 + }, + { + "epoch": 0.00761, + "grad_norm": 0.36141064763069153, + "grad_norm_var": 0.008223674749041798, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.755949318408966, + "loss/hidden": 0.0, + "loss/logits": 0.17225057631731033, + "loss/reg": 2.022468090057373, + "step": 761 + }, + { + "epoch": 0.00762, + "grad_norm": 0.47610145807266235, + "grad_norm_var": 0.008612084301677063, + "learning_rate": 5e-05, + "loss": 0.1786, + "loss/crossentropy": 2.701655924320221, + "loss/hidden": 0.0, + "loss/logits": 0.17863870784640312, + "loss/reg": 2.0194478034973145, + "step": 762 + }, + { + "epoch": 0.00763, + "grad_norm": 0.35960420966148376, + "grad_norm_var": 0.0086585523494028, + "learning_rate": 5e-05, + "loss": 0.1719, + "loss/crossentropy": 2.6496411561965942, + "loss/hidden": 0.0, + "loss/logits": 0.17191722244024277, + "loss/reg": 2.0173516273498535, + "step": 763 + }, + { + "epoch": 0.00764, + "grad_norm": 0.3759422302246094, + "grad_norm_var": 0.008585472348376118, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.7261382937431335, + "loss/hidden": 0.0, + "loss/logits": 0.1664697714149952, + "loss/reg": 2.014254331588745, + "step": 764 + }, + { + "epoch": 0.00765, + "grad_norm": 0.3791477680206299, + "grad_norm_var": 0.008610251361000461, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.779210090637207, + "loss/hidden": 0.0, + "loss/logits": 0.1715676300227642, + "loss/reg": 2.0109994411468506, + "step": 765 + }, + { + "epoch": 0.00766, + "grad_norm": 0.37698858976364136, + "grad_norm_var": 0.00842901980982322, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.5693264603614807, + "loss/hidden": 0.0, + "loss/logits": 0.17796850576996803, + "loss/reg": 2.007894277572632, + "step": 766 + }, + { + "epoch": 0.00767, + "grad_norm": 0.324692040681839, + "grad_norm_var": 0.008757168987509056, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.790699005126953, + "loss/hidden": 0.0, + "loss/logits": 0.16297711431980133, + "loss/reg": 2.0054080486297607, + "step": 767 + }, + { + "epoch": 0.00768, + "grad_norm": 0.37725430727005005, + "grad_norm_var": 0.008742918144709544, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.90560781955719, + "loss/hidden": 0.0, + "loss/logits": 0.17007537558674812, + "loss/reg": 2.0026705265045166, + "step": 768 + }, + { + "epoch": 0.00769, + "grad_norm": 0.3565872013568878, + "grad_norm_var": 0.008812017421293783, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.8573551774024963, + "loss/hidden": 0.0, + "loss/logits": 0.1747995764017105, + "loss/reg": 1.9999767541885376, + "step": 769 + }, + { + "epoch": 0.0077, + "grad_norm": 0.32768821716308594, + "grad_norm_var": 0.009011701837686615, + "learning_rate": 5e-05, + "loss": 0.1647, + "loss/crossentropy": 2.7850446105003357, + "loss/hidden": 0.0, + "loss/logits": 0.16465429961681366, + "loss/reg": 1.9974277019500732, + "step": 770 + }, + { + "epoch": 0.00771, + "grad_norm": 0.34194430708885193, + "grad_norm_var": 0.008880009468442519, + "learning_rate": 5e-05, + "loss": 0.1681, + "loss/crossentropy": 2.89225697517395, + "loss/hidden": 0.0, + "loss/logits": 0.16806093603372574, + "loss/reg": 1.9941447973251343, + "step": 771 + }, + { + "epoch": 0.00772, + "grad_norm": 0.36788639426231384, + "grad_norm_var": 0.008815313943155234, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.8710330724716187, + "loss/hidden": 0.0, + "loss/logits": 0.1823921650648117, + "loss/reg": 1.991845965385437, + "step": 772 + }, + { + "epoch": 0.00773, + "grad_norm": 0.33500465750694275, + "grad_norm_var": 0.008817280025891942, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.7821491956710815, + "loss/hidden": 0.0, + "loss/logits": 0.16685106977820396, + "loss/reg": 1.9901355504989624, + "step": 773 + }, + { + "epoch": 0.00774, + "grad_norm": 0.33815550804138184, + "grad_norm_var": 0.008946649562538052, + "learning_rate": 5e-05, + "loss": 0.162, + "loss/crossentropy": 2.7051143050193787, + "loss/hidden": 0.0, + "loss/logits": 0.16199326515197754, + "loss/reg": 1.9880555868148804, + "step": 774 + }, + { + "epoch": 0.00775, + "grad_norm": 0.32524728775024414, + "grad_norm_var": 0.009054478596347363, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.72264701128006, + "loss/hidden": 0.0, + "loss/logits": 0.16902651265263557, + "loss/reg": 1.986093521118164, + "step": 775 + }, + { + "epoch": 0.00776, + "grad_norm": 0.34697458148002625, + "grad_norm_var": 0.0013234442614042051, + "learning_rate": 5e-05, + "loss": 0.1691, + "loss/crossentropy": 2.780848979949951, + "loss/hidden": 0.0, + "loss/logits": 0.1690516211092472, + "loss/reg": 1.9844779968261719, + "step": 776 + }, + { + "epoch": 0.00777, + "grad_norm": 0.33995282649993896, + "grad_norm_var": 0.0013500864177136548, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.772739827632904, + "loss/hidden": 0.0, + "loss/logits": 0.16119593381881714, + "loss/reg": 1.9825077056884766, + "step": 777 + }, + { + "epoch": 0.00778, + "grad_norm": 0.35139432549476624, + "grad_norm_var": 0.0003803343966673219, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.7008825540542603, + "loss/hidden": 0.0, + "loss/logits": 0.16681700944900513, + "loss/reg": 1.9806305170059204, + "step": 778 + }, + { + "epoch": 0.00779, + "grad_norm": 0.4588527977466583, + "grad_norm_var": 0.00110283708340256, + "learning_rate": 5e-05, + "loss": 0.1907, + "loss/crossentropy": 2.632855713367462, + "loss/hidden": 0.0, + "loss/logits": 0.19068260118365288, + "loss/reg": 1.9793086051940918, + "step": 779 + }, + { + "epoch": 0.0078, + "grad_norm": 0.3829444646835327, + "grad_norm_var": 0.0011229031183707624, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.9350045323371887, + "loss/hidden": 0.0, + "loss/logits": 0.18754199519753456, + "loss/reg": 1.9774302244186401, + "step": 780 + }, + { + "epoch": 0.00781, + "grad_norm": 0.46253493428230286, + "grad_norm_var": 0.0017907320044085833, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.7478776574134827, + "loss/hidden": 0.0, + "loss/logits": 0.19565920531749725, + "loss/reg": 1.9757329225540161, + "step": 781 + }, + { + "epoch": 0.00782, + "grad_norm": 0.35229969024658203, + "grad_norm_var": 0.0017840355007145352, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.805756628513336, + "loss/hidden": 0.0, + "loss/logits": 0.1610955037176609, + "loss/reg": 1.973933458328247, + "step": 782 + }, + { + "epoch": 0.00783, + "grad_norm": 0.3324076533317566, + "grad_norm_var": 0.0017495419673394963, + "learning_rate": 5e-05, + "loss": 0.1706, + "loss/crossentropy": 2.8343148827552795, + "loss/hidden": 0.0, + "loss/logits": 0.1706329919397831, + "loss/reg": 1.9718844890594482, + "step": 783 + }, + { + "epoch": 0.00784, + "grad_norm": 0.3563413619995117, + "grad_norm_var": 0.0017352353042652258, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.825278103351593, + "loss/hidden": 0.0, + "loss/logits": 0.17681827396154404, + "loss/reg": 1.969612956047058, + "step": 784 + }, + { + "epoch": 0.00785, + "grad_norm": 0.33560603857040405, + "grad_norm_var": 0.0017751309342711038, + "learning_rate": 5e-05, + "loss": 0.1558, + "loss/crossentropy": 2.8132280111312866, + "loss/hidden": 0.0, + "loss/logits": 0.1558120921254158, + "loss/reg": 1.9675753116607666, + "step": 785 + }, + { + "epoch": 0.00786, + "grad_norm": 0.39733415842056274, + "grad_norm_var": 0.0017810049820061401, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.8960456252098083, + "loss/hidden": 0.0, + "loss/logits": 0.18493180349469185, + "loss/reg": 1.965217113494873, + "step": 786 + }, + { + "epoch": 0.00787, + "grad_norm": 0.561698317527771, + "grad_norm_var": 0.004151387117344507, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.66669100522995, + "loss/hidden": 0.0, + "loss/logits": 0.19869648292660713, + "loss/reg": 1.9629205465316772, + "step": 787 + }, + { + "epoch": 0.00788, + "grad_norm": 0.35911333560943604, + "grad_norm_var": 0.004167781816727311, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.723667323589325, + "loss/hidden": 0.0, + "loss/logits": 0.17424843832850456, + "loss/reg": 1.9608547687530518, + "step": 788 + }, + { + "epoch": 0.00789, + "grad_norm": 0.3422897160053253, + "grad_norm_var": 0.004130072564222831, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.853653848171234, + "loss/hidden": 0.0, + "loss/logits": 0.16829833760857582, + "loss/reg": 1.959040880203247, + "step": 789 + }, + { + "epoch": 0.0079, + "grad_norm": 0.373519629240036, + "grad_norm_var": 0.0040217911733014585, + "learning_rate": 5e-05, + "loss": 0.1697, + "loss/crossentropy": 2.7469093799591064, + "loss/hidden": 0.0, + "loss/logits": 0.1697460599243641, + "loss/reg": 1.9578640460968018, + "step": 790 + }, + { + "epoch": 0.00791, + "grad_norm": 0.42586550116539, + "grad_norm_var": 0.003921241787547673, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.9876235127449036, + "loss/hidden": 0.0, + "loss/logits": 0.18961479887366295, + "loss/reg": 1.9558684825897217, + "step": 791 + }, + { + "epoch": 0.00792, + "grad_norm": 0.34371063113212585, + "grad_norm_var": 0.00393897634361432, + "learning_rate": 5e-05, + "loss": 0.1624, + "loss/crossentropy": 2.922863006591797, + "loss/hidden": 0.0, + "loss/logits": 0.16239817067980766, + "loss/reg": 1.9541829824447632, + "step": 792 + }, + { + "epoch": 0.00793, + "grad_norm": 0.3611912727355957, + "grad_norm_var": 0.0038367960883469387, + "learning_rate": 5e-05, + "loss": 0.1767, + "loss/crossentropy": 2.751186192035675, + "loss/hidden": 0.0, + "loss/logits": 0.17671825364232063, + "loss/reg": 1.9524297714233398, + "step": 793 + }, + { + "epoch": 0.00794, + "grad_norm": 0.3787733018398285, + "grad_norm_var": 0.0037525025406884736, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.6539193391799927, + "loss/hidden": 0.0, + "loss/logits": 0.16952653229236603, + "loss/reg": 1.951439619064331, + "step": 794 + }, + { + "epoch": 0.00795, + "grad_norm": 0.37621310353279114, + "grad_norm_var": 0.003409985625982037, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.672878086566925, + "loss/hidden": 0.0, + "loss/logits": 0.1826501600444317, + "loss/reg": 1.9504698514938354, + "step": 795 + }, + { + "epoch": 0.00796, + "grad_norm": 0.3580264747142792, + "grad_norm_var": 0.0034518512961513536, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.8564891815185547, + "loss/hidden": 0.0, + "loss/logits": 0.17409207299351692, + "loss/reg": 1.9492477178573608, + "step": 796 + }, + { + "epoch": 0.00797, + "grad_norm": 0.3552623689174652, + "grad_norm_var": 0.0030235748866805395, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.9642611145973206, + "loss/hidden": 0.0, + "loss/logits": 0.16786304488778114, + "loss/reg": 1.9484763145446777, + "step": 797 + }, + { + "epoch": 0.00798, + "grad_norm": 0.37029561400413513, + "grad_norm_var": 0.0029878997549970957, + "learning_rate": 5e-05, + "loss": 0.1837, + "loss/crossentropy": 2.7581509947776794, + "loss/hidden": 0.0, + "loss/logits": 0.18365685641765594, + "loss/reg": 1.946696400642395, + "step": 798 + }, + { + "epoch": 0.00799, + "grad_norm": 0.37257152795791626, + "grad_norm_var": 0.00285137706672662, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.766197443008423, + "loss/hidden": 0.0, + "loss/logits": 0.17676663026213646, + "loss/reg": 1.9451103210449219, + "step": 799 + }, + { + "epoch": 0.008, + "grad_norm": 0.3937225043773651, + "grad_norm_var": 0.0028245897421089812, + "learning_rate": 5e-05, + "loss": 0.1593, + "loss/crossentropy": 2.968823492527008, + "loss/hidden": 0.0, + "loss/logits": 0.15929469466209412, + "loss/reg": 1.943403959274292, + "step": 800 + }, + { + "epoch": 0.00801, + "grad_norm": 0.5229995846748352, + "grad_norm_var": 0.003870799660257873, + "learning_rate": 5e-05, + "loss": 0.1904, + "loss/crossentropy": 2.5733524560928345, + "loss/hidden": 0.0, + "loss/logits": 0.19040565192699432, + "loss/reg": 1.9423651695251465, + "step": 801 + }, + { + "epoch": 0.00802, + "grad_norm": 0.4087795913219452, + "grad_norm_var": 0.003885163701405114, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.643693685531616, + "loss/hidden": 0.0, + "loss/logits": 0.20110392943024635, + "loss/reg": 1.941137433052063, + "step": 802 + }, + { + "epoch": 0.00803, + "grad_norm": 0.369555801153183, + "grad_norm_var": 0.0018963737991296507, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.702915072441101, + "loss/hidden": 0.0, + "loss/logits": 0.17682579904794693, + "loss/reg": 1.9396127462387085, + "step": 803 + }, + { + "epoch": 0.00804, + "grad_norm": 0.3822772204875946, + "grad_norm_var": 0.001859244513831604, + "learning_rate": 5e-05, + "loss": 0.1674, + "loss/crossentropy": 2.8051819801330566, + "loss/hidden": 0.0, + "loss/logits": 0.1674252152442932, + "loss/reg": 1.9382424354553223, + "step": 804 + }, + { + "epoch": 0.00805, + "grad_norm": 0.42195388674736023, + "grad_norm_var": 0.0018187903132861672, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.7962412238121033, + "loss/hidden": 0.0, + "loss/logits": 0.19819854572415352, + "loss/reg": 1.9367636442184448, + "step": 805 + }, + { + "epoch": 0.00806, + "grad_norm": 0.39215224981307983, + "grad_norm_var": 0.001803471303692028, + "learning_rate": 5e-05, + "loss": 0.1862, + "loss/crossentropy": 2.838093400001526, + "loss/hidden": 0.0, + "loss/logits": 0.1862441450357437, + "loss/reg": 1.9356895685195923, + "step": 806 + }, + { + "epoch": 0.00807, + "grad_norm": 0.36561474204063416, + "grad_norm_var": 0.0017388941933010808, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.8486820459365845, + "loss/hidden": 0.0, + "loss/logits": 0.16586757823824883, + "loss/reg": 1.9336965084075928, + "step": 807 + }, + { + "epoch": 0.00808, + "grad_norm": 0.3940856456756592, + "grad_norm_var": 0.0016146705961780842, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.7590547800064087, + "loss/hidden": 0.0, + "loss/logits": 0.16953302919864655, + "loss/reg": 1.9318287372589111, + "step": 808 + }, + { + "epoch": 0.00809, + "grad_norm": 0.37031009793281555, + "grad_norm_var": 0.0015860965038246484, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.803991198539734, + "loss/hidden": 0.0, + "loss/logits": 0.16627426072955132, + "loss/reg": 1.9298937320709229, + "step": 809 + }, + { + "epoch": 0.0081, + "grad_norm": 0.36467787623405457, + "grad_norm_var": 0.001618743456786816, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.7522680163383484, + "loss/hidden": 0.0, + "loss/logits": 0.17983945459127426, + "loss/reg": 1.928220272064209, + "step": 810 + }, + { + "epoch": 0.00811, + "grad_norm": 0.644191324710846, + "grad_norm_var": 0.005662418748027209, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.9207261204719543, + "loss/hidden": 0.0, + "loss/logits": 0.18232716247439384, + "loss/reg": 1.9264674186706543, + "step": 811 + }, + { + "epoch": 0.00812, + "grad_norm": 0.4135313034057617, + "grad_norm_var": 0.00550433789682554, + "learning_rate": 5e-05, + "loss": 0.1754, + "loss/crossentropy": 2.78128319978714, + "loss/hidden": 0.0, + "loss/logits": 0.17538663744926453, + "loss/reg": 1.9246680736541748, + "step": 812 + }, + { + "epoch": 0.00813, + "grad_norm": 0.44594907760620117, + "grad_norm_var": 0.005370096537218135, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.7069836258888245, + "loss/hidden": 0.0, + "loss/logits": 0.18982965499162674, + "loss/reg": 1.9229440689086914, + "step": 813 + }, + { + "epoch": 0.00814, + "grad_norm": 0.41460415720939636, + "grad_norm_var": 0.005231401879877403, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.658607244491577, + "loss/hidden": 0.0, + "loss/logits": 0.1964995227754116, + "loss/reg": 1.9211541414260864, + "step": 814 + }, + { + "epoch": 0.00815, + "grad_norm": 0.40847840905189514, + "grad_norm_var": 0.0050977892227572616, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.789508819580078, + "loss/hidden": 0.0, + "loss/logits": 0.18652214854955673, + "loss/reg": 1.91935396194458, + "step": 815 + }, + { + "epoch": 0.00816, + "grad_norm": 0.39475539326667786, + "grad_norm_var": 0.005094298258556392, + "learning_rate": 5e-05, + "loss": 0.1876, + "loss/crossentropy": 2.771743655204773, + "loss/hidden": 0.0, + "loss/logits": 0.18758049979805946, + "loss/reg": 1.9176177978515625, + "step": 816 + }, + { + "epoch": 0.00817, + "grad_norm": 0.3788897395133972, + "grad_norm_var": 0.004405869730473085, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.807315766811371, + "loss/hidden": 0.0, + "loss/logits": 0.17645375058054924, + "loss/reg": 1.9157112836837769, + "step": 817 + }, + { + "epoch": 0.00818, + "grad_norm": 0.3527612090110779, + "grad_norm_var": 0.004615691680188668, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.8354954719543457, + "loss/hidden": 0.0, + "loss/logits": 0.16440149024128914, + "loss/reg": 1.914186716079712, + "step": 818 + }, + { + "epoch": 0.00819, + "grad_norm": 0.45434367656707764, + "grad_norm_var": 0.004640431192599914, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.7823927998542786, + "loss/hidden": 0.0, + "loss/logits": 0.20038331300020218, + "loss/reg": 1.9128350019454956, + "step": 819 + }, + { + "epoch": 0.0082, + "grad_norm": 0.4440554082393646, + "grad_norm_var": 0.004630750512825665, + "learning_rate": 5e-05, + "loss": 0.198, + "loss/crossentropy": 2.7826399207115173, + "loss/hidden": 0.0, + "loss/logits": 0.19803617522120476, + "loss/reg": 1.9112329483032227, + "step": 820 + }, + { + "epoch": 0.00821, + "grad_norm": 0.8357638120651245, + "grad_norm_var": 0.015646654980531947, + "learning_rate": 5e-05, + "loss": 0.2236, + "loss/crossentropy": 2.855618476867676, + "loss/hidden": 0.0, + "loss/logits": 0.22355607897043228, + "loss/reg": 1.9094618558883667, + "step": 821 + }, + { + "epoch": 0.00822, + "grad_norm": 0.369484543800354, + "grad_norm_var": 0.01582983572015086, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.7591440081596375, + "loss/hidden": 0.0, + "loss/logits": 0.16877064853906631, + "loss/reg": 1.9078381061553955, + "step": 822 + }, + { + "epoch": 0.00823, + "grad_norm": 0.3682078421115875, + "grad_norm_var": 0.015804289096973827, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.6714991331100464, + "loss/hidden": 0.0, + "loss/logits": 0.16609660163521767, + "loss/reg": 1.9066871404647827, + "step": 823 + }, + { + "epoch": 0.00824, + "grad_norm": 0.4925972521305084, + "grad_norm_var": 0.015796176553567597, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.690047025680542, + "loss/hidden": 0.0, + "loss/logits": 0.19832593947649002, + "loss/reg": 1.9051308631896973, + "step": 824 + }, + { + "epoch": 0.00825, + "grad_norm": 0.4004671275615692, + "grad_norm_var": 0.01554450060197241, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 3.0430662631988525, + "loss/hidden": 0.0, + "loss/logits": 0.18668686598539352, + "loss/reg": 1.9037506580352783, + "step": 825 + }, + { + "epoch": 0.00826, + "grad_norm": 0.40644222497940063, + "grad_norm_var": 0.01518439463368978, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.856709599494934, + "loss/hidden": 0.0, + "loss/logits": 0.19924000278115273, + "loss/reg": 1.9035279750823975, + "step": 826 + }, + { + "epoch": 0.00827, + "grad_norm": 0.387662410736084, + "grad_norm_var": 0.012707668169475368, + "learning_rate": 5e-05, + "loss": 0.171, + "loss/crossentropy": 2.8205041885375977, + "loss/hidden": 0.0, + "loss/logits": 0.17101648449897766, + "loss/reg": 1.9019997119903564, + "step": 827 + }, + { + "epoch": 0.00828, + "grad_norm": 0.4229760766029358, + "grad_norm_var": 0.012685578660426963, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.8298428058624268, + "loss/hidden": 0.0, + "loss/logits": 0.1704978421330452, + "loss/reg": 1.9013198614120483, + "step": 828 + }, + { + "epoch": 0.00829, + "grad_norm": 0.4192207455635071, + "grad_norm_var": 0.012695092968616347, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.8572763800621033, + "loss/hidden": 0.0, + "loss/logits": 0.16986168175935745, + "loss/reg": 1.9007604122161865, + "step": 829 + }, + { + "epoch": 0.0083, + "grad_norm": 0.3887600898742676, + "grad_norm_var": 0.012805118489640084, + "learning_rate": 5e-05, + "loss": 0.202, + "loss/crossentropy": 2.786255419254303, + "loss/hidden": 0.0, + "loss/logits": 0.20195355266332626, + "loss/reg": 1.9000667333602905, + "step": 830 + }, + { + "epoch": 0.00831, + "grad_norm": 0.48604434728622437, + "grad_norm_var": 0.012929568590754843, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.845152735710144, + "loss/hidden": 0.0, + "loss/logits": 0.19609695672988892, + "loss/reg": 1.899495005607605, + "step": 831 + }, + { + "epoch": 0.00832, + "grad_norm": 0.38712021708488464, + "grad_norm_var": 0.012976881832390848, + "learning_rate": 5e-05, + "loss": 0.1889, + "loss/crossentropy": 2.7007412910461426, + "loss/hidden": 0.0, + "loss/logits": 0.1888689175248146, + "loss/reg": 1.8983198404312134, + "step": 832 + }, + { + "epoch": 0.00833, + "grad_norm": 0.3749590814113617, + "grad_norm_var": 0.013008393945473115, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.607687532901764, + "loss/hidden": 0.0, + "loss/logits": 0.1784559190273285, + "loss/reg": 1.897759199142456, + "step": 833 + }, + { + "epoch": 0.00834, + "grad_norm": 0.35202544927597046, + "grad_norm_var": 0.013016684761580717, + "learning_rate": 5e-05, + "loss": 0.1707, + "loss/crossentropy": 2.7777926325798035, + "loss/hidden": 0.0, + "loss/logits": 0.17071311548352242, + "loss/reg": 1.8961632251739502, + "step": 834 + }, + { + "epoch": 0.00835, + "grad_norm": 0.3441760540008545, + "grad_norm_var": 0.013518763280912912, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.835566759109497, + "loss/hidden": 0.0, + "loss/logits": 0.1615000143647194, + "loss/reg": 1.8950566053390503, + "step": 835 + }, + { + "epoch": 0.00836, + "grad_norm": 0.362005352973938, + "grad_norm_var": 0.013785734718565416, + "learning_rate": 5e-05, + "loss": 0.1631, + "loss/crossentropy": 2.818268299102783, + "loss/hidden": 0.0, + "loss/logits": 0.16310900822281837, + "loss/reg": 1.8935774564743042, + "step": 836 + }, + { + "epoch": 0.00837, + "grad_norm": 0.3725143074989319, + "grad_norm_var": 0.0018186987426010584, + "learning_rate": 5e-05, + "loss": 0.1929, + "loss/crossentropy": 2.763745427131653, + "loss/hidden": 0.0, + "loss/logits": 0.192863829433918, + "loss/reg": 1.8919721841812134, + "step": 837 + }, + { + "epoch": 0.00838, + "grad_norm": 0.39604651927948, + "grad_norm_var": 0.0017691837659245461, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.729005455970764, + "loss/hidden": 0.0, + "loss/logits": 0.18469301983714104, + "loss/reg": 1.8908017873764038, + "step": 838 + }, + { + "epoch": 0.00839, + "grad_norm": 0.39175912737846375, + "grad_norm_var": 0.001711627371570279, + "learning_rate": 5e-05, + "loss": 0.1841, + "loss/crossentropy": 2.8563897609710693, + "loss/hidden": 0.0, + "loss/logits": 0.18407713249325752, + "loss/reg": 1.8892946243286133, + "step": 839 + }, + { + "epoch": 0.0084, + "grad_norm": 0.3497207462787628, + "grad_norm_var": 0.0012053613127933737, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.9635773301124573, + "loss/hidden": 0.0, + "loss/logits": 0.1656595915555954, + "loss/reg": 1.8886394500732422, + "step": 840 + }, + { + "epoch": 0.00841, + "grad_norm": 0.36070436239242554, + "grad_norm_var": 0.0012493146014174172, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.905772030353546, + "loss/hidden": 0.0, + "loss/logits": 0.16585366800427437, + "loss/reg": 1.8883072137832642, + "step": 841 + }, + { + "epoch": 0.00842, + "grad_norm": 0.5194427371025085, + "grad_norm_var": 0.002330769361460483, + "learning_rate": 5e-05, + "loss": 0.1903, + "loss/crossentropy": 2.914414703845978, + "loss/hidden": 0.0, + "loss/logits": 0.1903173327445984, + "loss/reg": 1.8872804641723633, + "step": 842 + }, + { + "epoch": 0.00843, + "grad_norm": 0.3658028841018677, + "grad_norm_var": 0.0023811347132161485, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.8683270812034607, + "loss/hidden": 0.0, + "loss/logits": 0.17216329649090767, + "loss/reg": 1.8860183954238892, + "step": 843 + }, + { + "epoch": 0.00844, + "grad_norm": 0.3355120122432709, + "grad_norm_var": 0.0025135271396979795, + "learning_rate": 5e-05, + "loss": 0.166, + "loss/crossentropy": 2.7672330141067505, + "loss/hidden": 0.0, + "loss/logits": 0.16596197709441185, + "loss/reg": 1.8848904371261597, + "step": 844 + }, + { + "epoch": 0.00845, + "grad_norm": 0.45907968282699585, + "grad_norm_var": 0.002779472587279837, + "learning_rate": 5e-05, + "loss": 0.1881, + "loss/crossentropy": 2.79125440120697, + "loss/hidden": 0.0, + "loss/logits": 0.1881270818412304, + "loss/reg": 1.8831804990768433, + "step": 845 + }, + { + "epoch": 0.00846, + "grad_norm": 0.3753393888473511, + "grad_norm_var": 0.0027935829770950843, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.753562033176422, + "loss/hidden": 0.0, + "loss/logits": 0.18013736233115196, + "loss/reg": 1.8817567825317383, + "step": 846 + }, + { + "epoch": 0.00847, + "grad_norm": 0.41996800899505615, + "grad_norm_var": 0.002216029114339835, + "learning_rate": 5e-05, + "loss": 0.1871, + "loss/crossentropy": 2.8630106449127197, + "loss/hidden": 0.0, + "loss/logits": 0.18711163103580475, + "loss/reg": 1.8805315494537354, + "step": 847 + }, + { + "epoch": 0.00848, + "grad_norm": 0.40139615535736084, + "grad_norm_var": 0.0022320677834542836, + "learning_rate": 5e-05, + "loss": 0.182, + "loss/crossentropy": 2.8663435578346252, + "loss/hidden": 0.0, + "loss/logits": 0.18201814219355583, + "loss/reg": 1.8795160055160522, + "step": 848 + }, + { + "epoch": 0.00849, + "grad_norm": 0.44251078367233276, + "grad_norm_var": 0.0024153193390402117, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.8493316173553467, + "loss/hidden": 0.0, + "loss/logits": 0.1721041165292263, + "loss/reg": 1.8777539730072021, + "step": 849 + }, + { + "epoch": 0.0085, + "grad_norm": 0.39363330602645874, + "grad_norm_var": 0.00231007314672006, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.8225064277648926, + "loss/hidden": 0.0, + "loss/logits": 0.19327203929424286, + "loss/reg": 1.8762940168380737, + "step": 850 + }, + { + "epoch": 0.00851, + "grad_norm": 0.3834942579269409, + "grad_norm_var": 0.0021502092497398652, + "learning_rate": 5e-05, + "loss": 0.1919, + "loss/crossentropy": 2.7274433970451355, + "loss/hidden": 0.0, + "loss/logits": 0.19194044917821884, + "loss/reg": 1.8752055168151855, + "step": 851 + }, + { + "epoch": 0.00852, + "grad_norm": 0.36249086260795593, + "grad_norm_var": 0.0021480519578248518, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.725217640399933, + "loss/hidden": 0.0, + "loss/logits": 0.17594841867685318, + "loss/reg": 1.8736830949783325, + "step": 852 + }, + { + "epoch": 0.00853, + "grad_norm": 0.3869001269340515, + "grad_norm_var": 0.002116727725912885, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.738001227378845, + "loss/hidden": 0.0, + "loss/logits": 0.1822870336472988, + "loss/reg": 1.8728076219558716, + "step": 853 + }, + { + "epoch": 0.00854, + "grad_norm": 0.3708244860172272, + "grad_norm_var": 0.002157970353941248, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.788993239402771, + "loss/hidden": 0.0, + "loss/logits": 0.16682880371809006, + "loss/reg": 1.8720353841781616, + "step": 854 + }, + { + "epoch": 0.00855, + "grad_norm": 0.372335821390152, + "grad_norm_var": 0.002189712517136307, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.813368082046509, + "loss/hidden": 0.0, + "loss/logits": 0.17184938862919807, + "loss/reg": 1.8711519241333008, + "step": 855 + }, + { + "epoch": 0.00856, + "grad_norm": 0.35767483711242676, + "grad_norm_var": 0.0021470276840197164, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.779674708843231, + "loss/hidden": 0.0, + "loss/logits": 0.17495984584093094, + "loss/reg": 1.869391679763794, + "step": 856 + }, + { + "epoch": 0.00857, + "grad_norm": 0.3517981767654419, + "grad_norm_var": 0.002191754274186038, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.7776423692703247, + "loss/hidden": 0.0, + "loss/logits": 0.16876182705163956, + "loss/reg": 1.868484616279602, + "step": 857 + }, + { + "epoch": 0.00858, + "grad_norm": 0.8127824664115906, + "grad_norm_var": 0.012490247842596114, + "learning_rate": 5e-05, + "loss": 0.2224, + "loss/crossentropy": 2.9876713156700134, + "loss/hidden": 0.0, + "loss/logits": 0.2223958522081375, + "loss/reg": 1.8681334257125854, + "step": 858 + }, + { + "epoch": 0.00859, + "grad_norm": 0.4339921474456787, + "grad_norm_var": 0.012361098720851383, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.8306267857551575, + "loss/hidden": 0.0, + "loss/logits": 0.17517483979463577, + "loss/reg": 1.8669747114181519, + "step": 859 + }, + { + "epoch": 0.0086, + "grad_norm": 0.5807726383209229, + "grad_norm_var": 0.013480947234538975, + "learning_rate": 5e-05, + "loss": 0.1787, + "loss/crossentropy": 2.731403112411499, + "loss/hidden": 0.0, + "loss/logits": 0.17869474738836288, + "loss/reg": 1.865167498588562, + "step": 860 + }, + { + "epoch": 0.00861, + "grad_norm": 0.377247154712677, + "grad_norm_var": 0.013599237642109623, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.8269473910331726, + "loss/hidden": 0.0, + "loss/logits": 0.1713988333940506, + "loss/reg": 1.8640793561935425, + "step": 861 + }, + { + "epoch": 0.00862, + "grad_norm": 0.37849879264831543, + "grad_norm_var": 0.013578332002263205, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.8433790802955627, + "loss/hidden": 0.0, + "loss/logits": 0.16895778477191925, + "loss/reg": 1.8629158735275269, + "step": 862 + }, + { + "epoch": 0.00863, + "grad_norm": 0.4124751091003418, + "grad_norm_var": 0.013588511645486826, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.7584823966026306, + "loss/hidden": 0.0, + "loss/logits": 0.18026690557599068, + "loss/reg": 1.8609492778778076, + "step": 863 + }, + { + "epoch": 0.00864, + "grad_norm": 0.37336814403533936, + "grad_norm_var": 0.013730216300815038, + "learning_rate": 5e-05, + "loss": 0.1707, + "loss/crossentropy": 2.7589592933654785, + "loss/hidden": 0.0, + "loss/logits": 0.17070752009749413, + "loss/reg": 1.8597759008407593, + "step": 864 + }, + { + "epoch": 0.00865, + "grad_norm": 0.8337989449501038, + "grad_norm_var": 0.02424293784323857, + "learning_rate": 5e-05, + "loss": 0.1781, + "loss/crossentropy": 2.8036633133888245, + "loss/hidden": 0.0, + "loss/logits": 0.17813356593251228, + "loss/reg": 1.8581993579864502, + "step": 865 + }, + { + "epoch": 0.00866, + "grad_norm": 0.35601744055747986, + "grad_norm_var": 0.02460846166740538, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.715932846069336, + "loss/hidden": 0.0, + "loss/logits": 0.17593519389629364, + "loss/reg": 1.8565593957901, + "step": 866 + }, + { + "epoch": 0.00867, + "grad_norm": 0.425502747297287, + "grad_norm_var": 0.02436568774250706, + "learning_rate": 5e-05, + "loss": 0.1821, + "loss/crossentropy": 2.7275202870368958, + "loss/hidden": 0.0, + "loss/logits": 0.18210354447364807, + "loss/reg": 1.8554532527923584, + "step": 867 + }, + { + "epoch": 0.00868, + "grad_norm": 0.3844553232192993, + "grad_norm_var": 0.024142035969486622, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.8464134335517883, + "loss/hidden": 0.0, + "loss/logits": 0.1795903705060482, + "loss/reg": 1.855208396911621, + "step": 868 + }, + { + "epoch": 0.00869, + "grad_norm": 0.35618311166763306, + "grad_norm_var": 0.02446160042257303, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.87707781791687, + "loss/hidden": 0.0, + "loss/logits": 0.17583919316530228, + "loss/reg": 1.8549201488494873, + "step": 869 + }, + { + "epoch": 0.0087, + "grad_norm": 0.41672077775001526, + "grad_norm_var": 0.024117258377413187, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.71548855304718, + "loss/hidden": 0.0, + "loss/logits": 0.1758672632277012, + "loss/reg": 1.8541263341903687, + "step": 870 + }, + { + "epoch": 0.00871, + "grad_norm": 0.3646162450313568, + "grad_norm_var": 0.024202440513241764, + "learning_rate": 5e-05, + "loss": 0.186, + "loss/crossentropy": 2.786548674106598, + "loss/hidden": 0.0, + "loss/logits": 0.18596061319112778, + "loss/reg": 1.8524823188781738, + "step": 871 + }, + { + "epoch": 0.00872, + "grad_norm": 0.37939974665641785, + "grad_norm_var": 0.023961625350842352, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.835165321826935, + "loss/hidden": 0.0, + "loss/logits": 0.17476912215352058, + "loss/reg": 1.8506335020065308, + "step": 872 + }, + { + "epoch": 0.00873, + "grad_norm": 0.3887036442756653, + "grad_norm_var": 0.023551954015331347, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.732525408267975, + "loss/hidden": 0.0, + "loss/logits": 0.17892278358340263, + "loss/reg": 1.849176287651062, + "step": 873 + }, + { + "epoch": 0.00874, + "grad_norm": 0.387320876121521, + "grad_norm_var": 0.014549813961372993, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.792769968509674, + "loss/hidden": 0.0, + "loss/logits": 0.19409611076116562, + "loss/reg": 1.848083734512329, + "step": 874 + }, + { + "epoch": 0.00875, + "grad_norm": 0.3818178176879883, + "grad_norm_var": 0.014678730624869341, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.762765884399414, + "loss/hidden": 0.0, + "loss/logits": 0.18646146729588509, + "loss/reg": 1.8465189933776855, + "step": 875 + }, + { + "epoch": 0.00876, + "grad_norm": 0.36014363169670105, + "grad_norm_var": 0.013132955726787365, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.8113619089126587, + "loss/hidden": 0.0, + "loss/logits": 0.1847200095653534, + "loss/reg": 1.8454153537750244, + "step": 876 + }, + { + "epoch": 0.00877, + "grad_norm": 0.3916279077529907, + "grad_norm_var": 0.013081129963419124, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.767706513404846, + "loss/hidden": 0.0, + "loss/logits": 0.18097594752907753, + "loss/reg": 1.8445773124694824, + "step": 877 + }, + { + "epoch": 0.00878, + "grad_norm": 0.38396528363227844, + "grad_norm_var": 0.013058641234249413, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.750400483608246, + "loss/hidden": 0.0, + "loss/logits": 0.179927259683609, + "loss/reg": 1.8441030979156494, + "step": 878 + }, + { + "epoch": 0.00879, + "grad_norm": 0.3784838020801544, + "grad_norm_var": 0.013129867131250705, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.6576608419418335, + "loss/hidden": 0.0, + "loss/logits": 0.1783200539648533, + "loss/reg": 1.842759370803833, + "step": 879 + }, + { + "epoch": 0.0088, + "grad_norm": 0.3373940885066986, + "grad_norm_var": 0.013387093786405535, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.7494055032730103, + "loss/hidden": 0.0, + "loss/logits": 0.16646360978484154, + "loss/reg": 1.8418775796890259, + "step": 880 + }, + { + "epoch": 0.00881, + "grad_norm": 0.3743676543235779, + "grad_norm_var": 0.000488954453456858, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.7373632192611694, + "loss/hidden": 0.0, + "loss/logits": 0.17177791520953178, + "loss/reg": 1.8405792713165283, + "step": 881 + }, + { + "epoch": 0.00882, + "grad_norm": 0.36398422718048096, + "grad_norm_var": 0.0004683277690547882, + "learning_rate": 5e-05, + "loss": 0.1713, + "loss/crossentropy": 2.689941644668579, + "loss/hidden": 0.0, + "loss/logits": 0.1712586209177971, + "loss/reg": 1.8396137952804565, + "step": 882 + }, + { + "epoch": 0.00883, + "grad_norm": 0.36932700872421265, + "grad_norm_var": 0.0003222525763987627, + "learning_rate": 5e-05, + "loss": 0.1738, + "loss/crossentropy": 3.0094715356826782, + "loss/hidden": 0.0, + "loss/logits": 0.17375321686267853, + "loss/reg": 1.8387012481689453, + "step": 883 + }, + { + "epoch": 0.00884, + "grad_norm": 0.37647050619125366, + "grad_norm_var": 0.0003174026053568609, + "learning_rate": 5e-05, + "loss": 0.1734, + "loss/crossentropy": 2.675420820713043, + "loss/hidden": 0.0, + "loss/logits": 0.17337032034993172, + "loss/reg": 1.8372994661331177, + "step": 884 + }, + { + "epoch": 0.00885, + "grad_norm": 0.3657122850418091, + "grad_norm_var": 0.0002983341146215642, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.785289704799652, + "loss/hidden": 0.0, + "loss/logits": 0.1750231385231018, + "loss/reg": 1.836666464805603, + "step": 885 + }, + { + "epoch": 0.00886, + "grad_norm": 0.3565351963043213, + "grad_norm_var": 0.00019998832643003023, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.67407763004303, + "loss/hidden": 0.0, + "loss/logits": 0.16111686453223228, + "loss/reg": 1.8361395597457886, + "step": 886 + }, + { + "epoch": 0.00887, + "grad_norm": 0.38317063450813293, + "grad_norm_var": 0.00020202125672466782, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.7103012204170227, + "loss/hidden": 0.0, + "loss/logits": 0.17928585410118103, + "loss/reg": 1.8344452381134033, + "step": 887 + }, + { + "epoch": 0.00888, + "grad_norm": 0.39307963848114014, + "grad_norm_var": 0.00022420215532017082, + "learning_rate": 5e-05, + "loss": 0.1899, + "loss/crossentropy": 2.7159000635147095, + "loss/hidden": 0.0, + "loss/logits": 0.18990719318389893, + "loss/reg": 1.8328790664672852, + "step": 888 + }, + { + "epoch": 0.00889, + "grad_norm": 0.35862287878990173, + "grad_norm_var": 0.00022381402201028245, + "learning_rate": 5e-05, + "loss": 0.1834, + "loss/crossentropy": 2.847196877002716, + "loss/hidden": 0.0, + "loss/logits": 0.18342823907732964, + "loss/reg": 1.8318103551864624, + "step": 889 + }, + { + "epoch": 0.0089, + "grad_norm": 0.3539126515388489, + "grad_norm_var": 0.0002281156381275314, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.853213608264923, + "loss/hidden": 0.0, + "loss/logits": 0.1777319796383381, + "loss/reg": 1.829829454421997, + "step": 890 + }, + { + "epoch": 0.00891, + "grad_norm": 0.41561535000801086, + "grad_norm_var": 0.000350336348254295, + "learning_rate": 5e-05, + "loss": 0.1895, + "loss/crossentropy": 2.6837574243545532, + "loss/hidden": 0.0, + "loss/logits": 0.18954132869839668, + "loss/reg": 1.8281813859939575, + "step": 891 + }, + { + "epoch": 0.00892, + "grad_norm": 0.3593007028102875, + "grad_norm_var": 0.00035178644306217054, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.7398064136505127, + "loss/hidden": 0.0, + "loss/logits": 0.163859985768795, + "loss/reg": 1.8272664546966553, + "step": 892 + }, + { + "epoch": 0.00893, + "grad_norm": 0.3928586542606354, + "grad_norm_var": 0.00035500389449958367, + "learning_rate": 5e-05, + "loss": 0.187, + "loss/crossentropy": 2.7214816212654114, + "loss/hidden": 0.0, + "loss/logits": 0.18696707114577293, + "loss/reg": 1.8260765075683594, + "step": 893 + }, + { + "epoch": 0.00894, + "grad_norm": 0.38060957193374634, + "grad_norm_var": 0.00035065611870696014, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.8307188153266907, + "loss/hidden": 0.0, + "loss/logits": 0.18159236386418343, + "loss/reg": 1.8246755599975586, + "step": 894 + }, + { + "epoch": 0.00895, + "grad_norm": 0.34957438707351685, + "grad_norm_var": 0.0003796919232549693, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.7387137413024902, + "loss/hidden": 0.0, + "loss/logits": 0.18012140691280365, + "loss/reg": 1.8235573768615723, + "step": 895 + }, + { + "epoch": 0.00896, + "grad_norm": 0.3666534721851349, + "grad_norm_var": 0.00030342620785123544, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.758805215358734, + "loss/hidden": 0.0, + "loss/logits": 0.20357270538806915, + "loss/reg": 1.8219776153564453, + "step": 896 + }, + { + "epoch": 0.00897, + "grad_norm": 0.5404136180877686, + "grad_norm_var": 0.0020682628614343557, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.970340132713318, + "loss/hidden": 0.0, + "loss/logits": 0.19332898035645485, + "loss/reg": 1.8206608295440674, + "step": 897 + }, + { + "epoch": 0.00898, + "grad_norm": 0.3982648551464081, + "grad_norm_var": 0.0020554109287465, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.6868785619735718, + "loss/hidden": 0.0, + "loss/logits": 0.1768476814031601, + "loss/reg": 1.8192402124404907, + "step": 898 + }, + { + "epoch": 0.00899, + "grad_norm": 0.38741791248321533, + "grad_norm_var": 0.002038042531465663, + "learning_rate": 5e-05, + "loss": 0.1808, + "loss/crossentropy": 2.816374719142914, + "loss/hidden": 0.0, + "loss/logits": 0.18077386170625687, + "loss/reg": 1.8179749250411987, + "step": 899 + }, + { + "epoch": 0.009, + "grad_norm": 0.3847026526927948, + "grad_norm_var": 0.0020316665201686695, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.8102923035621643, + "loss/hidden": 0.0, + "loss/logits": 0.19821672514081, + "loss/reg": 1.8163453340530396, + "step": 900 + }, + { + "epoch": 0.00901, + "grad_norm": 0.3916863203048706, + "grad_norm_var": 0.0020013109603508254, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.883694589138031, + "loss/hidden": 0.0, + "loss/logits": 0.17775952070951462, + "loss/reg": 1.8147519826889038, + "step": 901 + }, + { + "epoch": 0.00902, + "grad_norm": 0.36217865347862244, + "grad_norm_var": 0.001979417665481912, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.8100743293762207, + "loss/hidden": 0.0, + "loss/logits": 0.1668529324233532, + "loss/reg": 1.8133151531219482, + "step": 902 + }, + { + "epoch": 0.00903, + "grad_norm": 0.42102572321891785, + "grad_norm_var": 0.00204143104015622, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.8006924986839294, + "loss/hidden": 0.0, + "loss/logits": 0.18102795258164406, + "loss/reg": 1.8111281394958496, + "step": 903 + }, + { + "epoch": 0.00904, + "grad_norm": 0.388874351978302, + "grad_norm_var": 0.002041367346731493, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.8004772067070007, + "loss/hidden": 0.0, + "loss/logits": 0.18177441507577896, + "loss/reg": 1.8097094297409058, + "step": 904 + }, + { + "epoch": 0.00905, + "grad_norm": 0.3800460696220398, + "grad_norm_var": 0.0019783346485648203, + "learning_rate": 5e-05, + "loss": 0.176, + "loss/crossentropy": 2.693452537059784, + "loss/hidden": 0.0, + "loss/logits": 0.1760004386305809, + "loss/reg": 1.8086622953414917, + "step": 905 + }, + { + "epoch": 0.00906, + "grad_norm": 0.32960033416748047, + "grad_norm_var": 0.0021389732007735863, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.7406028509140015, + "loss/hidden": 0.0, + "loss/logits": 0.1641414910554886, + "loss/reg": 1.8070775270462036, + "step": 906 + }, + { + "epoch": 0.00907, + "grad_norm": 0.41262900829315186, + "grad_norm_var": 0.002129550660359725, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.815558433532715, + "loss/hidden": 0.0, + "loss/logits": 0.1639426201581955, + "loss/reg": 1.80557119846344, + "step": 907 + }, + { + "epoch": 0.00908, + "grad_norm": 0.34168651700019836, + "grad_norm_var": 0.0022218976438497353, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.7727773189544678, + "loss/hidden": 0.0, + "loss/logits": 0.16886158660054207, + "loss/reg": 1.8040578365325928, + "step": 908 + }, + { + "epoch": 0.00909, + "grad_norm": 0.3481311500072479, + "grad_norm_var": 0.0023254939668475396, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.7662405967712402, + "loss/hidden": 0.0, + "loss/logits": 0.17232706770300865, + "loss/reg": 1.802317500114441, + "step": 909 + }, + { + "epoch": 0.0091, + "grad_norm": 0.34673434495925903, + "grad_norm_var": 0.0024236772610501315, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.7547109723091125, + "loss/hidden": 0.0, + "loss/logits": 0.17211218550801277, + "loss/reg": 1.8002270460128784, + "step": 910 + }, + { + "epoch": 0.00911, + "grad_norm": 0.358116090297699, + "grad_norm_var": 0.002388630196925971, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.8174885511398315, + "loss/hidden": 0.0, + "loss/logits": 0.17777465283870697, + "loss/reg": 1.7986141443252563, + "step": 911 + }, + { + "epoch": 0.00912, + "grad_norm": 0.37328216433525085, + "grad_norm_var": 0.0023752628687049343, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.7423484921455383, + "loss/hidden": 0.0, + "loss/logits": 0.18462468683719635, + "loss/reg": 1.7969509363174438, + "step": 912 + }, + { + "epoch": 0.00913, + "grad_norm": 0.4073435366153717, + "grad_norm_var": 0.000729848525378903, + "learning_rate": 5e-05, + "loss": 0.1656, + "loss/crossentropy": 2.6962223649024963, + "loss/hidden": 0.0, + "loss/logits": 0.16563431546092033, + "loss/reg": 1.7949342727661133, + "step": 913 + }, + { + "epoch": 0.00914, + "grad_norm": 0.37720003724098206, + "grad_norm_var": 0.0006978068548327905, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.710608184337616, + "loss/hidden": 0.0, + "loss/logits": 0.18529681861400604, + "loss/reg": 1.7920844554901123, + "step": 914 + }, + { + "epoch": 0.00915, + "grad_norm": 0.36010050773620605, + "grad_norm_var": 0.0007016424011595597, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.8862733840942383, + "loss/hidden": 0.0, + "loss/logits": 0.1833462007343769, + "loss/reg": 1.7895822525024414, + "step": 915 + }, + { + "epoch": 0.00916, + "grad_norm": 0.35757148265838623, + "grad_norm_var": 0.0007087821480995478, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.736583173274994, + "loss/hidden": 0.0, + "loss/logits": 0.1740923710167408, + "loss/reg": 1.7870407104492188, + "step": 916 + }, + { + "epoch": 0.00917, + "grad_norm": 0.38085147738456726, + "grad_norm_var": 0.0006880592910958772, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.722678780555725, + "loss/hidden": 0.0, + "loss/logits": 0.17577889189124107, + "loss/reg": 1.78484308719635, + "step": 917 + }, + { + "epoch": 0.00918, + "grad_norm": 0.3618144989013672, + "grad_norm_var": 0.000688524329092799, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.7107303738594055, + "loss/hidden": 0.0, + "loss/logits": 0.17288268730044365, + "loss/reg": 1.781936764717102, + "step": 918 + }, + { + "epoch": 0.00919, + "grad_norm": 0.4562152326107025, + "grad_norm_var": 0.000997994245971834, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.6911511421203613, + "loss/hidden": 0.0, + "loss/logits": 0.20904593169689178, + "loss/reg": 1.7793291807174683, + "step": 919 + }, + { + "epoch": 0.0092, + "grad_norm": 0.7892553806304932, + "grad_norm_var": 0.011823798595284762, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.8024474382400513, + "loss/hidden": 0.0, + "loss/logits": 0.20984026044607162, + "loss/reg": 1.7767542600631714, + "step": 920 + }, + { + "epoch": 0.00921, + "grad_norm": 0.4952187240123749, + "grad_norm_var": 0.012365066103201613, + "learning_rate": 5e-05, + "loss": 0.2215, + "loss/crossentropy": 2.836692988872528, + "loss/hidden": 0.0, + "loss/logits": 0.22146976366639137, + "loss/reg": 1.77475905418396, + "step": 921 + }, + { + "epoch": 0.00922, + "grad_norm": 0.4500190317630768, + "grad_norm_var": 0.01204494814009713, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.7740437984466553, + "loss/hidden": 0.0, + "loss/logits": 0.1957194283604622, + "loss/reg": 1.7725263833999634, + "step": 922 + }, + { + "epoch": 0.00923, + "grad_norm": 0.4018624424934387, + "grad_norm_var": 0.012053458598524087, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.783832609653473, + "loss/hidden": 0.0, + "loss/logits": 0.19849245250225067, + "loss/reg": 1.7706068754196167, + "step": 923 + }, + { + "epoch": 0.00924, + "grad_norm": 0.4053579866886139, + "grad_norm_var": 0.01170279735803306, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.8584959506988525, + "loss/hidden": 0.0, + "loss/logits": 0.1846720688045025, + "loss/reg": 1.7687995433807373, + "step": 924 + }, + { + "epoch": 0.00925, + "grad_norm": 0.4355542063713074, + "grad_norm_var": 0.011379840002585932, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.8072018027305603, + "loss/hidden": 0.0, + "loss/logits": 0.19105902686715126, + "loss/reg": 1.766356348991394, + "step": 925 + }, + { + "epoch": 0.00926, + "grad_norm": 0.41985246539115906, + "grad_norm_var": 0.010977469936536095, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.825522303581238, + "loss/hidden": 0.0, + "loss/logits": 0.17719140276312828, + "loss/reg": 1.764008641242981, + "step": 926 + }, + { + "epoch": 0.00927, + "grad_norm": 0.4020366370677948, + "grad_norm_var": 0.010695516965112247, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.946666181087494, + "loss/hidden": 0.0, + "loss/logits": 0.16798604279756546, + "loss/reg": 1.7619134187698364, + "step": 927 + }, + { + "epoch": 0.00928, + "grad_norm": 0.4333237111568451, + "grad_norm_var": 0.01047000612816995, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.904057264328003, + "loss/hidden": 0.0, + "loss/logits": 0.1731964722275734, + "loss/reg": 1.7594822645187378, + "step": 928 + }, + { + "epoch": 0.00929, + "grad_norm": 0.44914835691452026, + "grad_norm_var": 0.010434282299518182, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.8881112933158875, + "loss/hidden": 0.0, + "loss/logits": 0.1821521669626236, + "loss/reg": 1.757930874824524, + "step": 929 + }, + { + "epoch": 0.0093, + "grad_norm": 0.8063428401947021, + "grad_norm_var": 0.018582235883409348, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 3.000428855419159, + "loss/hidden": 0.0, + "loss/logits": 0.21083774790167809, + "loss/reg": 1.756495714187622, + "step": 930 + }, + { + "epoch": 0.00931, + "grad_norm": 0.37262633442878723, + "grad_norm_var": 0.018420551139004416, + "learning_rate": 5e-05, + "loss": 0.1727, + "loss/crossentropy": 2.899652659893036, + "loss/hidden": 0.0, + "loss/logits": 0.1727372407913208, + "loss/reg": 1.755653977394104, + "step": 931 + }, + { + "epoch": 0.00932, + "grad_norm": 0.44574448466300964, + "grad_norm_var": 0.017660345874116586, + "learning_rate": 5e-05, + "loss": 0.1771, + "loss/crossentropy": 2.7930703163146973, + "loss/hidden": 0.0, + "loss/logits": 0.17705539613962173, + "loss/reg": 1.7541528940200806, + "step": 932 + }, + { + "epoch": 0.00933, + "grad_norm": 0.4381559193134308, + "grad_norm_var": 0.017191491981390843, + "learning_rate": 5e-05, + "loss": 0.1928, + "loss/crossentropy": 3.009516716003418, + "loss/hidden": 0.0, + "loss/logits": 0.19276633486151695, + "loss/reg": 1.7523828744888306, + "step": 933 + }, + { + "epoch": 0.00934, + "grad_norm": 0.37119948863983154, + "grad_norm_var": 0.01705829482260827, + "learning_rate": 5e-05, + "loss": 0.1784, + "loss/crossentropy": 2.7011741995811462, + "loss/hidden": 0.0, + "loss/logits": 0.17841476574540138, + "loss/reg": 1.750117540359497, + "step": 934 + }, + { + "epoch": 0.00935, + "grad_norm": 0.38776126503944397, + "grad_norm_var": 0.017506596591750172, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.755903959274292, + "loss/hidden": 0.0, + "loss/logits": 0.17649077624082565, + "loss/reg": 1.7485483884811401, + "step": 935 + }, + { + "epoch": 0.00936, + "grad_norm": 0.3909890949726105, + "grad_norm_var": 0.010412048008983836, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.7981409430503845, + "loss/hidden": 0.0, + "loss/logits": 0.1718210205435753, + "loss/reg": 1.7470717430114746, + "step": 936 + }, + { + "epoch": 0.00937, + "grad_norm": 0.4047463536262512, + "grad_norm_var": 0.010306674977021387, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.653374135494232, + "loss/hidden": 0.0, + "loss/logits": 0.1944383941590786, + "loss/reg": 1.7457630634307861, + "step": 937 + }, + { + "epoch": 0.00938, + "grad_norm": 0.4641042947769165, + "grad_norm_var": 0.010340857957444612, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 3.054188370704651, + "loss/hidden": 0.0, + "loss/logits": 0.19333792477846146, + "loss/reg": 1.7441859245300293, + "step": 938 + }, + { + "epoch": 0.00939, + "grad_norm": 0.463642418384552, + "grad_norm_var": 0.01027101724812616, + "learning_rate": 5e-05, + "loss": 0.1906, + "loss/crossentropy": 2.7980846166610718, + "loss/hidden": 0.0, + "loss/logits": 0.19064636901021004, + "loss/reg": 1.7424683570861816, + "step": 939 + }, + { + "epoch": 0.0094, + "grad_norm": 0.3763858675956726, + "grad_norm_var": 0.010469512228889604, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.8554503321647644, + "loss/hidden": 0.0, + "loss/logits": 0.1817203275859356, + "loss/reg": 1.7405589818954468, + "step": 940 + }, + { + "epoch": 0.00941, + "grad_norm": 0.41792160272598267, + "grad_norm_var": 0.010502572032980106, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.7464479207992554, + "loss/hidden": 0.0, + "loss/logits": 0.20806986466050148, + "loss/reg": 1.7390387058258057, + "step": 941 + }, + { + "epoch": 0.00942, + "grad_norm": 0.405609130859375, + "grad_norm_var": 0.010553986517832181, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.9190812706947327, + "loss/hidden": 0.0, + "loss/logits": 0.1922554075717926, + "loss/reg": 1.737121343612671, + "step": 942 + }, + { + "epoch": 0.00943, + "grad_norm": 0.5186859369277954, + "grad_norm_var": 0.010823950074372254, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.846211016178131, + "loss/hidden": 0.0, + "loss/logits": 0.1892014741897583, + "loss/reg": 1.7357187271118164, + "step": 943 + }, + { + "epoch": 0.00944, + "grad_norm": 0.3441806137561798, + "grad_norm_var": 0.011478989118617007, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.8220054507255554, + "loss/hidden": 0.0, + "loss/logits": 0.1700589321553707, + "loss/reg": 1.733402967453003, + "step": 944 + }, + { + "epoch": 0.00945, + "grad_norm": 0.37400367856025696, + "grad_norm_var": 0.011751047533430632, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.7692030668258667, + "loss/hidden": 0.0, + "loss/logits": 0.17900892347097397, + "loss/reg": 1.7316250801086426, + "step": 945 + }, + { + "epoch": 0.00946, + "grad_norm": 0.4089336395263672, + "grad_norm_var": 0.0020184395330867097, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.891884744167328, + "loss/hidden": 0.0, + "loss/logits": 0.19816706702113152, + "loss/reg": 1.7298880815505981, + "step": 946 + }, + { + "epoch": 0.00947, + "grad_norm": 0.36752966046333313, + "grad_norm_var": 0.002046509202798789, + "learning_rate": 5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.858055591583252, + "loss/hidden": 0.0, + "loss/logits": 0.18428384885191917, + "loss/reg": 1.7284300327301025, + "step": 947 + }, + { + "epoch": 0.00948, + "grad_norm": 0.36644455790519714, + "grad_norm_var": 0.002074549092487384, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.848255932331085, + "loss/hidden": 0.0, + "loss/logits": 0.17144014686346054, + "loss/reg": 1.7270002365112305, + "step": 948 + }, + { + "epoch": 0.00949, + "grad_norm": 0.5910805463790894, + "grad_norm_var": 0.004186356490923812, + "learning_rate": 5e-05, + "loss": 0.23, + "loss/crossentropy": 2.7590108513832092, + "loss/hidden": 0.0, + "loss/logits": 0.22996008396148682, + "loss/reg": 1.7256983518600464, + "step": 949 + }, + { + "epoch": 0.0095, + "grad_norm": 0.35803112387657166, + "grad_norm_var": 0.00427554901524122, + "learning_rate": 5e-05, + "loss": 0.1684, + "loss/crossentropy": 2.7760064005851746, + "loss/hidden": 0.0, + "loss/logits": 0.16840620338916779, + "loss/reg": 1.723679542541504, + "step": 950 + }, + { + "epoch": 0.00951, + "grad_norm": 0.412681519985199, + "grad_norm_var": 0.004223846207916952, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.9045740365982056, + "loss/hidden": 0.0, + "loss/logits": 0.20182525366544724, + "loss/reg": 1.72231125831604, + "step": 951 + }, + { + "epoch": 0.00952, + "grad_norm": 0.4021626114845276, + "grad_norm_var": 0.004193552649382352, + "learning_rate": 5e-05, + "loss": 0.1848, + "loss/crossentropy": 2.6521793007850647, + "loss/hidden": 0.0, + "loss/logits": 0.1848319098353386, + "loss/reg": 1.720641851425171, + "step": 952 + }, + { + "epoch": 0.00953, + "grad_norm": 0.3750251233577728, + "grad_norm_var": 0.00429834750938114, + "learning_rate": 5e-05, + "loss": 0.1791, + "loss/crossentropy": 2.7560397386550903, + "loss/hidden": 0.0, + "loss/logits": 0.17908834293484688, + "loss/reg": 1.7182477712631226, + "step": 953 + }, + { + "epoch": 0.00954, + "grad_norm": 0.5893900990486145, + "grad_norm_var": 0.006092951728716223, + "learning_rate": 5e-05, + "loss": 0.2129, + "loss/crossentropy": 2.835801601409912, + "loss/hidden": 0.0, + "loss/logits": 0.21293479949235916, + "loss/reg": 1.716722011566162, + "step": 954 + }, + { + "epoch": 0.00955, + "grad_norm": 0.40877264738082886, + "grad_norm_var": 0.005985476808116985, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.689119517803192, + "loss/hidden": 0.0, + "loss/logits": 0.19383220747113228, + "loss/reg": 1.714568853378296, + "step": 955 + }, + { + "epoch": 0.00956, + "grad_norm": 0.38810843229293823, + "grad_norm_var": 0.005926205055061354, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.948507070541382, + "loss/hidden": 0.0, + "loss/logits": 0.1705201156437397, + "loss/reg": 1.7119203805923462, + "step": 956 + }, + { + "epoch": 0.00957, + "grad_norm": 0.4206679165363312, + "grad_norm_var": 0.005925719462670757, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.78257417678833, + "loss/hidden": 0.0, + "loss/logits": 0.18074193224310875, + "loss/reg": 1.7095727920532227, + "step": 957 + }, + { + "epoch": 0.00958, + "grad_norm": 0.3933105766773224, + "grad_norm_var": 0.005959929971731507, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.7225964665412903, + "loss/hidden": 0.0, + "loss/logits": 0.2044883407652378, + "loss/reg": 1.707101583480835, + "step": 958 + }, + { + "epoch": 0.00959, + "grad_norm": 0.3582659661769867, + "grad_norm_var": 0.005456189170996354, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.7268422842025757, + "loss/hidden": 0.0, + "loss/logits": 0.1602596789598465, + "loss/reg": 1.7055177688598633, + "step": 959 + }, + { + "epoch": 0.0096, + "grad_norm": 0.397594153881073, + "grad_norm_var": 0.0051663773874797295, + "learning_rate": 5e-05, + "loss": 0.1733, + "loss/crossentropy": 2.877332389354706, + "loss/hidden": 0.0, + "loss/logits": 0.1733493208885193, + "loss/reg": 1.7030569314956665, + "step": 960 + }, + { + "epoch": 0.00961, + "grad_norm": 0.4981625974178314, + "grad_norm_var": 0.005480135764708397, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.656112492084503, + "loss/hidden": 0.0, + "loss/logits": 0.18259770050644875, + "loss/reg": 1.7019816637039185, + "step": 961 + }, + { + "epoch": 0.00962, + "grad_norm": 0.937751054763794, + "grad_norm_var": 0.022106629800203694, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.907736301422119, + "loss/hidden": 0.0, + "loss/logits": 0.20831404626369476, + "loss/reg": 1.700728416442871, + "step": 962 + }, + { + "epoch": 0.00963, + "grad_norm": 0.3895174264907837, + "grad_norm_var": 0.021883161578962466, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.845858633518219, + "loss/hidden": 0.0, + "loss/logits": 0.18031802773475647, + "loss/reg": 1.6990742683410645, + "step": 963 + }, + { + "epoch": 0.00964, + "grad_norm": 0.34548690915107727, + "grad_norm_var": 0.02215928485241057, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.8244311213493347, + "loss/hidden": 0.0, + "loss/logits": 0.1645219847559929, + "loss/reg": 1.697798252105713, + "step": 964 + }, + { + "epoch": 0.00965, + "grad_norm": 0.36824318766593933, + "grad_norm_var": 0.021193656582446117, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.884181797504425, + "loss/hidden": 0.0, + "loss/logits": 0.17166699841618538, + "loss/reg": 1.6957753896713257, + "step": 965 + }, + { + "epoch": 0.00966, + "grad_norm": 0.37774839997291565, + "grad_norm_var": 0.021001939954339834, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 3.025804340839386, + "loss/hidden": 0.0, + "loss/logits": 0.18241329863667488, + "loss/reg": 1.6944239139556885, + "step": 966 + }, + { + "epoch": 0.00967, + "grad_norm": 0.36408743262290955, + "grad_norm_var": 0.02133579751543382, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.6597015261650085, + "loss/hidden": 0.0, + "loss/logits": 0.1815933845937252, + "loss/reg": 1.692252516746521, + "step": 967 + }, + { + "epoch": 0.00968, + "grad_norm": 0.34311729669570923, + "grad_norm_var": 0.02183892884845392, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.845684826374054, + "loss/hidden": 0.0, + "loss/logits": 0.1661607250571251, + "loss/reg": 1.6907639503479004, + "step": 968 + }, + { + "epoch": 0.00969, + "grad_norm": 0.38303908705711365, + "grad_norm_var": 0.021779175231247044, + "learning_rate": 5e-05, + "loss": 0.1693, + "loss/crossentropy": 2.558404505252838, + "loss/hidden": 0.0, + "loss/logits": 0.16925161331892014, + "loss/reg": 1.6893569231033325, + "step": 969 + }, + { + "epoch": 0.0097, + "grad_norm": 0.3850949704647064, + "grad_norm_var": 0.02018777587365078, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.890751600265503, + "loss/hidden": 0.0, + "loss/logits": 0.1684512346982956, + "loss/reg": 1.688266634941101, + "step": 970 + }, + { + "epoch": 0.00971, + "grad_norm": 0.4068422317504883, + "grad_norm_var": 0.020191525445637973, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.707846701145172, + "loss/hidden": 0.0, + "loss/logits": 0.18569114059209824, + "loss/reg": 1.6867531538009644, + "step": 971 + }, + { + "epoch": 0.00972, + "grad_norm": 0.3924512565135956, + "grad_norm_var": 0.020172897207266394, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.848098576068878, + "loss/hidden": 0.0, + "loss/logits": 0.1828712299466133, + "loss/reg": 1.6852294206619263, + "step": 972 + }, + { + "epoch": 0.00973, + "grad_norm": 0.3714575469493866, + "grad_norm_var": 0.020336838096992275, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.8703532814979553, + "loss/hidden": 0.0, + "loss/logits": 0.17731666564941406, + "loss/reg": 1.6838881969451904, + "step": 973 + }, + { + "epoch": 0.00974, + "grad_norm": 0.35195186734199524, + "grad_norm_var": 0.020588227081264298, + "learning_rate": 5e-05, + "loss": 0.1862, + "loss/crossentropy": 2.863659620285034, + "loss/hidden": 0.0, + "loss/logits": 0.18621815741062164, + "loss/reg": 1.6821165084838867, + "step": 974 + }, + { + "epoch": 0.00975, + "grad_norm": 0.441755086183548, + "grad_norm_var": 0.02037088575085001, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.810901939868927, + "loss/hidden": 0.0, + "loss/logits": 0.19334488362073898, + "loss/reg": 1.6802574396133423, + "step": 975 + }, + { + "epoch": 0.00976, + "grad_norm": 0.40233367681503296, + "grad_norm_var": 0.02035677589008348, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.681654691696167, + "loss/hidden": 0.0, + "loss/logits": 0.19833911955356598, + "loss/reg": 1.6781818866729736, + "step": 976 + }, + { + "epoch": 0.00977, + "grad_norm": 0.6531580686569214, + "grad_norm_var": 0.023423138566671976, + "learning_rate": 5e-05, + "loss": 0.2212, + "loss/crossentropy": 2.82851505279541, + "loss/hidden": 0.0, + "loss/logits": 0.2212192267179489, + "loss/reg": 1.6763266324996948, + "step": 977 + }, + { + "epoch": 0.00978, + "grad_norm": 0.3646674156188965, + "grad_norm_var": 0.005314392422501734, + "learning_rate": 5e-05, + "loss": 0.1715, + "loss/crossentropy": 2.7788134813308716, + "loss/hidden": 0.0, + "loss/logits": 0.17147252708673477, + "loss/reg": 1.674770712852478, + "step": 978 + }, + { + "epoch": 0.00979, + "grad_norm": 0.40374529361724854, + "grad_norm_var": 0.005314159555871933, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.7746172547340393, + "loss/hidden": 0.0, + "loss/logits": 0.2092289738357067, + "loss/reg": 1.6723932027816772, + "step": 979 + }, + { + "epoch": 0.0098, + "grad_norm": 0.3737621009349823, + "grad_norm_var": 0.005169172562247167, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.871635138988495, + "loss/hidden": 0.0, + "loss/logits": 0.18473126366734505, + "loss/reg": 1.6707124710083008, + "step": 980 + }, + { + "epoch": 0.00981, + "grad_norm": 0.37383797764778137, + "grad_norm_var": 0.005148210609649088, + "learning_rate": 5e-05, + "loss": 0.1733, + "loss/crossentropy": 2.760922133922577, + "loss/hidden": 0.0, + "loss/logits": 0.1733292043209076, + "loss/reg": 1.6693061590194702, + "step": 981 + }, + { + "epoch": 0.00982, + "grad_norm": 0.38922467827796936, + "grad_norm_var": 0.00512344066750369, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.8191832304000854, + "loss/hidden": 0.0, + "loss/logits": 0.16395087912678719, + "loss/reg": 1.6671026945114136, + "step": 982 + }, + { + "epoch": 0.00983, + "grad_norm": 0.40670332312583923, + "grad_norm_var": 0.0050327015332547465, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.847275197505951, + "loss/hidden": 0.0, + "loss/logits": 0.18915896490216255, + "loss/reg": 1.6655491590499878, + "step": 983 + }, + { + "epoch": 0.00984, + "grad_norm": 0.3739645183086395, + "grad_norm_var": 0.004847126969690196, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.817361056804657, + "loss/hidden": 0.0, + "loss/logits": 0.1940479725599289, + "loss/reg": 1.664434552192688, + "step": 984 + }, + { + "epoch": 0.00985, + "grad_norm": 0.36827903985977173, + "grad_norm_var": 0.00490322302848593, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.9278652667999268, + "loss/hidden": 0.0, + "loss/logits": 0.17926159501075745, + "loss/reg": 1.6631444692611694, + "step": 985 + }, + { + "epoch": 0.00986, + "grad_norm": 0.36838048696517944, + "grad_norm_var": 0.0049621510753778035, + "learning_rate": 5e-05, + "loss": 0.182, + "loss/crossentropy": 2.625900149345398, + "loss/hidden": 0.0, + "loss/logits": 0.1820085123181343, + "loss/reg": 1.6615486145019531, + "step": 986 + }, + { + "epoch": 0.00987, + "grad_norm": 0.406107097864151, + "grad_norm_var": 0.00496177464005331, + "learning_rate": 5e-05, + "loss": 0.165, + "loss/crossentropy": 2.6364856362342834, + "loss/hidden": 0.0, + "loss/logits": 0.16496483236551285, + "loss/reg": 1.6604608297348022, + "step": 987 + }, + { + "epoch": 0.00988, + "grad_norm": 0.3886563777923584, + "grad_norm_var": 0.004967815483619401, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.796413004398346, + "loss/hidden": 0.0, + "loss/logits": 0.1803182028234005, + "loss/reg": 1.6593427658081055, + "step": 988 + }, + { + "epoch": 0.00989, + "grad_norm": 0.35161423683166504, + "grad_norm_var": 0.005074223354079936, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.8155667185783386, + "loss/hidden": 0.0, + "loss/logits": 0.16441339999437332, + "loss/reg": 1.6583104133605957, + "step": 989 + }, + { + "epoch": 0.0099, + "grad_norm": 0.39407941699028015, + "grad_norm_var": 0.0049088886087087355, + "learning_rate": 5e-05, + "loss": 0.1928, + "loss/crossentropy": 2.8455575704574585, + "loss/hidden": 0.0, + "loss/logits": 0.19284628704190254, + "loss/reg": 1.658367395401001, + "step": 990 + }, + { + "epoch": 0.00991, + "grad_norm": 0.3695094883441925, + "grad_norm_var": 0.004869171230278472, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.80289888381958, + "loss/hidden": 0.0, + "loss/logits": 0.18142832443118095, + "loss/reg": 1.6575336456298828, + "step": 991 + }, + { + "epoch": 0.00992, + "grad_norm": 0.3505973815917969, + "grad_norm_var": 0.005015199761620141, + "learning_rate": 5e-05, + "loss": 0.1769, + "loss/crossentropy": 2.731611430644989, + "loss/hidden": 0.0, + "loss/logits": 0.17688723653554916, + "loss/reg": 1.6567095518112183, + "step": 992 + }, + { + "epoch": 0.00993, + "grad_norm": 0.38008397817611694, + "grad_norm_var": 0.0003133497280889556, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.7869952917099, + "loss/hidden": 0.0, + "loss/logits": 0.18361864984035492, + "loss/reg": 1.6552574634552002, + "step": 993 + }, + { + "epoch": 0.00994, + "grad_norm": 0.38647469878196716, + "grad_norm_var": 0.00030154116815576856, + "learning_rate": 5e-05, + "loss": 0.1794, + "loss/crossentropy": 2.744201898574829, + "loss/hidden": 0.0, + "loss/logits": 0.17939525097608566, + "loss/reg": 1.6545374393463135, + "step": 994 + }, + { + "epoch": 0.00995, + "grad_norm": 0.3995093107223511, + "grad_norm_var": 0.0002894285610608412, + "learning_rate": 5e-05, + "loss": 0.1916, + "loss/crossentropy": 2.694726526737213, + "loss/hidden": 0.0, + "loss/logits": 0.1916041001677513, + "loss/reg": 1.6533586978912354, + "step": 995 + }, + { + "epoch": 0.00996, + "grad_norm": 0.34584900736808777, + "grad_norm_var": 0.0003615231269390488, + "learning_rate": 5e-05, + "loss": 0.1739, + "loss/crossentropy": 2.8016315698623657, + "loss/hidden": 0.0, + "loss/logits": 0.1739257462322712, + "loss/reg": 1.651719570159912, + "step": 996 + }, + { + "epoch": 0.00997, + "grad_norm": 0.3925288915634155, + "grad_norm_var": 0.0003722265532580001, + "learning_rate": 5e-05, + "loss": 0.177, + "loss/crossentropy": 2.897447168827057, + "loss/hidden": 0.0, + "loss/logits": 0.17699377238750458, + "loss/reg": 1.6504392623901367, + "step": 997 + }, + { + "epoch": 0.00998, + "grad_norm": 0.39880403876304626, + "grad_norm_var": 0.0003904176090236522, + "learning_rate": 5e-05, + "loss": 0.1919, + "loss/crossentropy": 2.8275578022003174, + "loss/hidden": 0.0, + "loss/logits": 0.19189641624689102, + "loss/reg": 1.6491367816925049, + "step": 998 + }, + { + "epoch": 0.00999, + "grad_norm": 0.4310808479785919, + "grad_norm_var": 0.0005141220319849537, + "learning_rate": 5e-05, + "loss": 0.2298, + "loss/crossentropy": 2.7063609957695007, + "loss/hidden": 0.0, + "loss/logits": 0.22976921498775482, + "loss/reg": 1.6471576690673828, + "step": 999 + }, + { + "epoch": 0.01, + "grad_norm": 0.3714313507080078, + "grad_norm_var": 0.0005171003041950173, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.7168938517570496, + "loss/hidden": 0.0, + "loss/logits": 0.19012651592493057, + "loss/reg": 1.6457953453063965, + "step": 1000 + }, + { + "epoch": 0.01001, + "grad_norm": 0.3641223907470703, + "grad_norm_var": 0.0005254723456020552, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.753402054309845, + "loss/hidden": 0.0, + "loss/logits": 0.1885070614516735, + "loss/reg": 1.6438895463943481, + "step": 1001 + }, + { + "epoch": 0.01002, + "grad_norm": 0.39603546261787415, + "grad_norm_var": 0.0005260879240772306, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.837618350982666, + "loss/hidden": 0.0, + "loss/logits": 0.18864833936095238, + "loss/reg": 1.642066240310669, + "step": 1002 + }, + { + "epoch": 0.01003, + "grad_norm": 0.48641237616539, + "grad_norm_var": 0.0011775773272432843, + "learning_rate": 5e-05, + "loss": 0.2391, + "loss/crossentropy": 2.831344962120056, + "loss/hidden": 0.0, + "loss/logits": 0.23907097056508064, + "loss/reg": 1.6403310298919678, + "step": 1003 + }, + { + "epoch": 0.01004, + "grad_norm": 0.3798205256462097, + "grad_norm_var": 0.0011815944076354482, + "learning_rate": 5e-05, + "loss": 0.1869, + "loss/crossentropy": 2.7991966605186462, + "loss/hidden": 0.0, + "loss/logits": 0.18689077720046043, + "loss/reg": 1.6383352279663086, + "step": 1004 + }, + { + "epoch": 0.01005, + "grad_norm": 0.3784593641757965, + "grad_norm_var": 0.0010986458368820136, + "learning_rate": 5e-05, + "loss": 0.1839, + "loss/crossentropy": 2.8769296407699585, + "loss/hidden": 0.0, + "loss/logits": 0.1838611364364624, + "loss/reg": 1.6359633207321167, + "step": 1005 + }, + { + "epoch": 0.01006, + "grad_norm": 0.3823866844177246, + "grad_norm_var": 0.001099349676319091, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.7005032896995544, + "loss/hidden": 0.0, + "loss/logits": 0.18838313221931458, + "loss/reg": 1.6344705820083618, + "step": 1006 + }, + { + "epoch": 0.01007, + "grad_norm": 0.3799179494380951, + "grad_norm_var": 0.0010800167815802877, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.6673877239227295, + "loss/hidden": 0.0, + "loss/logits": 0.1907733455300331, + "loss/reg": 1.6336002349853516, + "step": 1007 + }, + { + "epoch": 0.01008, + "grad_norm": 0.37894824147224426, + "grad_norm_var": 0.0009852009444313561, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.7754920721054077, + "loss/hidden": 0.0, + "loss/logits": 0.18283528462052345, + "loss/reg": 1.6325920820236206, + "step": 1008 + }, + { + "epoch": 0.01009, + "grad_norm": 0.39319896697998047, + "grad_norm_var": 0.0009773145681171713, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.8970988988876343, + "loss/hidden": 0.0, + "loss/logits": 0.18647681921720505, + "loss/reg": 1.631390929222107, + "step": 1009 + }, + { + "epoch": 0.0101, + "grad_norm": 0.38153842091560364, + "grad_norm_var": 0.0009821853173486716, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.763012409210205, + "loss/hidden": 0.0, + "loss/logits": 0.18243329226970673, + "loss/reg": 1.6309009790420532, + "step": 1010 + }, + { + "epoch": 0.01011, + "grad_norm": 0.3883878290653229, + "grad_norm_var": 0.0009776724027208417, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.765687584877014, + "loss/hidden": 0.0, + "loss/logits": 0.19011493027210236, + "loss/reg": 1.6300084590911865, + "step": 1011 + }, + { + "epoch": 0.01012, + "grad_norm": 0.35583510994911194, + "grad_norm_var": 0.0009243763684662879, + "learning_rate": 5e-05, + "loss": 0.1764, + "loss/crossentropy": 2.749864339828491, + "loss/hidden": 0.0, + "loss/logits": 0.1763710305094719, + "loss/reg": 1.6288458108901978, + "step": 1012 + }, + { + "epoch": 0.01013, + "grad_norm": 0.38475698232650757, + "grad_norm_var": 0.0009267555768795195, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.7161130905151367, + "loss/hidden": 0.0, + "loss/logits": 0.18835009634494781, + "loss/reg": 1.6285719871520996, + "step": 1013 + }, + { + "epoch": 0.01014, + "grad_norm": 0.37661120295524597, + "grad_norm_var": 0.000933546249842306, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.6909996271133423, + "loss/hidden": 0.0, + "loss/logits": 0.16574236378073692, + "loss/reg": 1.6282925605773926, + "step": 1014 + }, + { + "epoch": 0.01015, + "grad_norm": 0.3704264163970947, + "grad_norm_var": 0.0008256614127205612, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.783431828022003, + "loss/hidden": 0.0, + "loss/logits": 0.1764850728213787, + "loss/reg": 1.627402663230896, + "step": 1015 + }, + { + "epoch": 0.01016, + "grad_norm": 0.37588080763816833, + "grad_norm_var": 0.0008185416610935044, + "learning_rate": 5e-05, + "loss": 0.1946, + "loss/crossentropy": 2.8895158171653748, + "loss/hidden": 0.0, + "loss/logits": 0.19460226222872734, + "loss/reg": 1.6271944046020508, + "step": 1016 + }, + { + "epoch": 0.01017, + "grad_norm": 0.4015043377876282, + "grad_norm_var": 0.0007978520380527006, + "learning_rate": 5e-05, + "loss": 0.1888, + "loss/crossentropy": 2.9011647701263428, + "loss/hidden": 0.0, + "loss/logits": 0.18881165981292725, + "loss/reg": 1.62646484375, + "step": 1017 + }, + { + "epoch": 0.01018, + "grad_norm": 0.3869231641292572, + "grad_norm_var": 0.0007934398262748814, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.685133635997772, + "loss/hidden": 0.0, + "loss/logits": 0.17748012766242027, + "loss/reg": 1.62509286403656, + "step": 1018 + }, + { + "epoch": 0.01019, + "grad_norm": 0.43097683787345886, + "grad_norm_var": 0.00025487289950493577, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.7974401712417603, + "loss/hidden": 0.0, + "loss/logits": 0.19396745413541794, + "loss/reg": 1.624069094657898, + "step": 1019 + }, + { + "epoch": 0.0102, + "grad_norm": 0.3837282061576843, + "grad_norm_var": 0.0002535984477039547, + "learning_rate": 5e-05, + "loss": 0.18, + "loss/crossentropy": 2.7660019397735596, + "loss/hidden": 0.0, + "loss/logits": 0.17998040840029716, + "loss/reg": 1.6229311227798462, + "step": 1020 + }, + { + "epoch": 0.01021, + "grad_norm": 0.36976560950279236, + "grad_norm_var": 0.00026514185975165343, + "learning_rate": 5e-05, + "loss": 0.1684, + "loss/crossentropy": 2.773696482181549, + "loss/hidden": 0.0, + "loss/logits": 0.16836534813046455, + "loss/reg": 1.6225528717041016, + "step": 1021 + }, + { + "epoch": 0.01022, + "grad_norm": 0.3894343674182892, + "grad_norm_var": 0.00026691892163717516, + "learning_rate": 5e-05, + "loss": 0.2622, + "loss/crossentropy": 2.7716140151023865, + "loss/hidden": 0.0, + "loss/logits": 0.2621819078922272, + "loss/reg": 1.6223206520080566, + "step": 1022 + }, + { + "epoch": 0.01023, + "grad_norm": 0.38731515407562256, + "grad_norm_var": 0.0002660763662075608, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.721261143684387, + "loss/hidden": 0.0, + "loss/logits": 0.18962380290031433, + "loss/reg": 1.6217230558395386, + "step": 1023 + }, + { + "epoch": 0.01024, + "grad_norm": 0.3601418733596802, + "grad_norm_var": 0.00030260891980304536, + "learning_rate": 5e-05, + "loss": 0.1769, + "loss/crossentropy": 2.8548576831817627, + "loss/hidden": 0.0, + "loss/logits": 0.17688385397195816, + "loss/reg": 1.621361494064331, + "step": 1024 + }, + { + "epoch": 0.01025, + "grad_norm": 0.37833818793296814, + "grad_norm_var": 0.00029724636529409785, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.7824737429618835, + "loss/hidden": 0.0, + "loss/logits": 0.17165284976363182, + "loss/reg": 1.620347499847412, + "step": 1025 + }, + { + "epoch": 0.01026, + "grad_norm": 0.40456339716911316, + "grad_norm_var": 0.000327128476702739, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.7456098794937134, + "loss/hidden": 0.0, + "loss/logits": 0.19577478617429733, + "loss/reg": 1.6197152137756348, + "step": 1026 + }, + { + "epoch": 0.01027, + "grad_norm": 0.36613523960113525, + "grad_norm_var": 0.0003451676569719416, + "learning_rate": 5e-05, + "loss": 0.1794, + "loss/crossentropy": 2.7014692425727844, + "loss/hidden": 0.0, + "loss/logits": 0.1794487200677395, + "loss/reg": 1.6191424131393433, + "step": 1027 + }, + { + "epoch": 0.01028, + "grad_norm": 0.36154961585998535, + "grad_norm_var": 0.00032678045604246365, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.8211179971694946, + "loss/hidden": 0.0, + "loss/logits": 0.17734427005052567, + "loss/reg": 1.6176239252090454, + "step": 1028 + }, + { + "epoch": 0.01029, + "grad_norm": 0.40533965826034546, + "grad_norm_var": 0.00035807133543167187, + "learning_rate": 5e-05, + "loss": 0.1997, + "loss/crossentropy": 2.5644471049308777, + "loss/hidden": 0.0, + "loss/logits": 0.19972549006342888, + "loss/reg": 1.6164883375167847, + "step": 1029 + }, + { + "epoch": 0.0103, + "grad_norm": 0.3332688510417938, + "grad_norm_var": 0.0005198547791486281, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.6996293663978577, + "loss/hidden": 0.0, + "loss/logits": 0.1703544519841671, + "loss/reg": 1.6148288249969482, + "step": 1030 + }, + { + "epoch": 0.01031, + "grad_norm": 0.42555445432662964, + "grad_norm_var": 0.0006278098210961592, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.763964354991913, + "loss/hidden": 0.0, + "loss/logits": 0.19628288969397545, + "loss/reg": 1.6132444143295288, + "step": 1031 + }, + { + "epoch": 0.01032, + "grad_norm": 0.38525158166885376, + "grad_norm_var": 0.0006218714027041216, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.6709354519844055, + "loss/hidden": 0.0, + "loss/logits": 0.18258166313171387, + "loss/reg": 1.612375259399414, + "step": 1032 + }, + { + "epoch": 0.01033, + "grad_norm": 0.37334632873535156, + "grad_norm_var": 0.0006117594391554903, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.780134439468384, + "loss/hidden": 0.0, + "loss/logits": 0.1798795834183693, + "loss/reg": 1.6112585067749023, + "step": 1033 + }, + { + "epoch": 0.01034, + "grad_norm": 0.3481215536594391, + "grad_norm_var": 0.0006899686053054983, + "learning_rate": 5e-05, + "loss": 0.1677, + "loss/crossentropy": 2.7880823612213135, + "loss/hidden": 0.0, + "loss/logits": 0.16774233058094978, + "loss/reg": 1.6096283197402954, + "step": 1034 + }, + { + "epoch": 0.01035, + "grad_norm": 0.401376873254776, + "grad_norm_var": 0.0005491717474448839, + "learning_rate": 5e-05, + "loss": 0.1946, + "loss/crossentropy": 2.7819212675094604, + "loss/hidden": 0.0, + "loss/logits": 0.19464891031384468, + "loss/reg": 1.6077492237091064, + "step": 1035 + }, + { + "epoch": 0.01036, + "grad_norm": 0.4044601619243622, + "grad_norm_var": 0.0005875103191944693, + "learning_rate": 5e-05, + "loss": 0.193, + "loss/crossentropy": 2.76057767868042, + "loss/hidden": 0.0, + "loss/logits": 0.1929696425795555, + "loss/reg": 1.6054394245147705, + "step": 1036 + }, + { + "epoch": 0.01037, + "grad_norm": 0.3640334904193878, + "grad_norm_var": 0.0005980528349471677, + "learning_rate": 5e-05, + "loss": 0.173, + "loss/crossentropy": 2.805960953235626, + "loss/hidden": 0.0, + "loss/logits": 0.17302041500806808, + "loss/reg": 1.603266716003418, + "step": 1037 + }, + { + "epoch": 0.01038, + "grad_norm": 0.34832075238227844, + "grad_norm_var": 0.0006548009377475091, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.9117356538772583, + "loss/hidden": 0.0, + "loss/logits": 0.1658683866262436, + "loss/reg": 1.6007298231124878, + "step": 1038 + }, + { + "epoch": 0.01039, + "grad_norm": 0.35593798756599426, + "grad_norm_var": 0.0006771319252449855, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.8520063757896423, + "loss/hidden": 0.0, + "loss/logits": 0.17831408604979515, + "loss/reg": 1.5984032154083252, + "step": 1039 + }, + { + "epoch": 0.0104, + "grad_norm": 0.44101372361183167, + "grad_norm_var": 0.0009150763472360879, + "learning_rate": 5e-05, + "loss": 0.1763, + "loss/crossentropy": 2.9714547991752625, + "loss/hidden": 0.0, + "loss/logits": 0.1763400174677372, + "loss/reg": 1.596476435661316, + "step": 1040 + }, + { + "epoch": 0.01041, + "grad_norm": 0.42450451850891113, + "grad_norm_var": 0.001031664270957929, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.8061989545822144, + "loss/hidden": 0.0, + "loss/logits": 0.18570463731884956, + "loss/reg": 1.5949974060058594, + "step": 1041 + }, + { + "epoch": 0.01042, + "grad_norm": 1.413458228111267, + "grad_norm_var": 0.06742490936139979, + "learning_rate": 5e-05, + "loss": 0.2472, + "loss/crossentropy": 2.941382944583893, + "loss/hidden": 0.0, + "loss/logits": 0.24719487875699997, + "loss/reg": 1.5933181047439575, + "step": 1042 + }, + { + "epoch": 0.01043, + "grad_norm": 0.4077235460281372, + "grad_norm_var": 0.0670847180936582, + "learning_rate": 5e-05, + "loss": 0.1743, + "loss/crossentropy": 2.744387209415436, + "loss/hidden": 0.0, + "loss/logits": 0.17426170781254768, + "loss/reg": 1.5918529033660889, + "step": 1043 + }, + { + "epoch": 0.01044, + "grad_norm": 0.3899991810321808, + "grad_norm_var": 0.06680138514417872, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.9295618534088135, + "loss/hidden": 0.0, + "loss/logits": 0.169007558375597, + "loss/reg": 1.5900723934173584, + "step": 1044 + }, + { + "epoch": 0.01045, + "grad_norm": 0.4866078794002533, + "grad_norm_var": 0.06671553563036674, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.7827881574630737, + "loss/hidden": 0.0, + "loss/logits": 0.2113601267337799, + "loss/reg": 1.5889488458633423, + "step": 1045 + }, + { + "epoch": 0.01046, + "grad_norm": 0.39279693365097046, + "grad_norm_var": 0.0659594213964073, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.780573546886444, + "loss/hidden": 0.0, + "loss/logits": 0.1807180792093277, + "loss/reg": 1.5870665311813354, + "step": 1046 + }, + { + "epoch": 0.01047, + "grad_norm": 0.3671790659427643, + "grad_norm_var": 0.06644172437070375, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.8943958282470703, + "loss/hidden": 0.0, + "loss/logits": 0.17946722730994225, + "loss/reg": 1.5854285955429077, + "step": 1047 + }, + { + "epoch": 0.01048, + "grad_norm": 0.37834271788597107, + "grad_norm_var": 0.06651034798313456, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.7288926243782043, + "loss/hidden": 0.0, + "loss/logits": 0.1715567633509636, + "loss/reg": 1.584210991859436, + "step": 1048 + }, + { + "epoch": 0.01049, + "grad_norm": 0.37263625860214233, + "grad_norm_var": 0.06651821205251345, + "learning_rate": 5e-05, + "loss": 0.1738, + "loss/crossentropy": 2.5842645168304443, + "loss/hidden": 0.0, + "loss/logits": 0.17380548641085625, + "loss/reg": 1.5832701921463013, + "step": 1049 + }, + { + "epoch": 0.0105, + "grad_norm": 0.357949823141098, + "grad_norm_var": 0.06638283943495621, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.8539857864379883, + "loss/hidden": 0.0, + "loss/logits": 0.17616372555494308, + "loss/reg": 1.5817757844924927, + "step": 1050 + }, + { + "epoch": 0.01051, + "grad_norm": 0.40757089853286743, + "grad_norm_var": 0.06633959192563718, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.877175509929657, + "loss/hidden": 0.0, + "loss/logits": 0.19005050137639046, + "loss/reg": 1.580580472946167, + "step": 1051 + }, + { + "epoch": 0.01052, + "grad_norm": 0.43088629841804504, + "grad_norm_var": 0.06619799704120433, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.6631829738616943, + "loss/hidden": 0.0, + "loss/logits": 0.20722678676247597, + "loss/reg": 1.5795587301254272, + "step": 1052 + }, + { + "epoch": 0.01053, + "grad_norm": 2.6514923572540283, + "grad_norm_var": 0.3643590351017664, + "learning_rate": 5e-05, + "loss": 0.2386, + "loss/crossentropy": 2.8652138113975525, + "loss/hidden": 0.0, + "loss/logits": 0.23861064016819, + "loss/reg": 1.578200101852417, + "step": 1053 + }, + { + "epoch": 0.01054, + "grad_norm": 0.4709310531616211, + "grad_norm_var": 0.3611571581594747, + "learning_rate": 5e-05, + "loss": 0.184, + "loss/crossentropy": 2.751032590866089, + "loss/hidden": 0.0, + "loss/logits": 0.18403061106801033, + "loss/reg": 1.5764774084091187, + "step": 1054 + }, + { + "epoch": 0.01055, + "grad_norm": 0.542866051197052, + "grad_norm_var": 0.3570259510737727, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.771138370037079, + "loss/hidden": 0.0, + "loss/logits": 0.20715508237481117, + "loss/reg": 1.5751893520355225, + "step": 1055 + }, + { + "epoch": 0.01056, + "grad_norm": 0.4095610976219177, + "grad_norm_var": 0.35784257490682114, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.6514610052108765, + "loss/hidden": 0.0, + "loss/logits": 0.18285758048295975, + "loss/reg": 1.5734416246414185, + "step": 1056 + }, + { + "epoch": 0.01057, + "grad_norm": 0.45324698090553284, + "grad_norm_var": 0.35714871626115285, + "learning_rate": 5e-05, + "loss": 0.2123, + "loss/crossentropy": 2.73183411359787, + "loss/hidden": 0.0, + "loss/logits": 0.21226082369685173, + "loss/reg": 1.5717664957046509, + "step": 1057 + }, + { + "epoch": 0.01058, + "grad_norm": 0.42001500725746155, + "grad_norm_var": 0.3138407253297512, + "learning_rate": 5e-05, + "loss": 0.1883, + "loss/crossentropy": 2.7152724266052246, + "loss/hidden": 0.0, + "loss/logits": 0.1883089914917946, + "loss/reg": 1.5704617500305176, + "step": 1058 + }, + { + "epoch": 0.01059, + "grad_norm": 0.3464891314506531, + "grad_norm_var": 0.31530804811188473, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.7491345405578613, + "loss/hidden": 0.0, + "loss/logits": 0.16423293575644493, + "loss/reg": 1.569446325302124, + "step": 1059 + }, + { + "epoch": 0.0106, + "grad_norm": 0.3669905960559845, + "grad_norm_var": 0.3158470526355899, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.836626708507538, + "loss/hidden": 0.0, + "loss/logits": 0.16727466136217117, + "loss/reg": 1.5682600736618042, + "step": 1060 + }, + { + "epoch": 0.01061, + "grad_norm": 0.39248570799827576, + "grad_norm_var": 0.31723986653993524, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.951793611049652, + "loss/hidden": 0.0, + "loss/logits": 0.17091352492570877, + "loss/reg": 1.5677317380905151, + "step": 1061 + }, + { + "epoch": 0.01062, + "grad_norm": 0.42078810930252075, + "grad_norm_var": 0.31671112367648746, + "learning_rate": 5e-05, + "loss": 0.1945, + "loss/crossentropy": 2.748759090900421, + "loss/hidden": 0.0, + "loss/logits": 0.19453415274620056, + "loss/reg": 1.5668708086013794, + "step": 1062 + }, + { + "epoch": 0.01063, + "grad_norm": 0.37992992997169495, + "grad_norm_var": 0.3164115915761645, + "learning_rate": 5e-05, + "loss": 0.1878, + "loss/crossentropy": 2.6824185848236084, + "loss/hidden": 0.0, + "loss/logits": 0.18784139305353165, + "loss/reg": 1.567291259765625, + "step": 1063 + }, + { + "epoch": 0.01064, + "grad_norm": 0.3837074935436249, + "grad_norm_var": 0.3162905057857987, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.7311676144599915, + "loss/hidden": 0.0, + "loss/logits": 0.176094900816679, + "loss/reg": 1.566008448600769, + "step": 1064 + }, + { + "epoch": 0.01065, + "grad_norm": 0.3652944266796112, + "grad_norm_var": 0.3164679597230645, + "learning_rate": 5e-05, + "loss": 0.1734, + "loss/crossentropy": 2.7913513779640198, + "loss/hidden": 0.0, + "loss/logits": 0.17336497828364372, + "loss/reg": 1.5644934177398682, + "step": 1065 + }, + { + "epoch": 0.01066, + "grad_norm": 0.5269585847854614, + "grad_norm_var": 0.3139251636385244, + "learning_rate": 5e-05, + "loss": 0.1845, + "loss/crossentropy": 2.6032761335372925, + "loss/hidden": 0.0, + "loss/logits": 0.18454618379473686, + "loss/reg": 1.5634944438934326, + "step": 1066 + }, + { + "epoch": 0.01067, + "grad_norm": 0.4014894664287567, + "grad_norm_var": 0.3140515403632015, + "learning_rate": 5e-05, + "loss": 0.188, + "loss/crossentropy": 2.719841182231903, + "loss/hidden": 0.0, + "loss/logits": 0.18803178519010544, + "loss/reg": 1.5624383687973022, + "step": 1067 + }, + { + "epoch": 0.01068, + "grad_norm": 0.345795601606369, + "grad_norm_var": 0.31597113808328114, + "learning_rate": 5e-05, + "loss": 0.1734, + "loss/crossentropy": 2.8608756065368652, + "loss/hidden": 0.0, + "loss/logits": 0.17335866764187813, + "loss/reg": 1.5619043111801147, + "step": 1068 + }, + { + "epoch": 0.01069, + "grad_norm": 0.403914213180542, + "grad_norm_var": 0.0033892069415853284, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.7109196186065674, + "loss/hidden": 0.0, + "loss/logits": 0.1907677985727787, + "loss/reg": 1.5616743564605713, + "step": 1069 + }, + { + "epoch": 0.0107, + "grad_norm": 0.408088743686676, + "grad_norm_var": 0.0031623901529525987, + "learning_rate": 5e-05, + "loss": 0.2084, + "loss/crossentropy": 2.7172587513923645, + "loss/hidden": 0.0, + "loss/logits": 0.20841724425554276, + "loss/reg": 1.560802936553955, + "step": 1070 + }, + { + "epoch": 0.01071, + "grad_norm": 0.36463382840156555, + "grad_norm_var": 0.002001661703932278, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.778719425201416, + "loss/hidden": 0.0, + "loss/logits": 0.1717045158147812, + "loss/reg": 1.560097336769104, + "step": 1071 + }, + { + "epoch": 0.01072, + "grad_norm": 0.37026646733283997, + "grad_norm_var": 0.0020445979916204338, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.7089297771453857, + "loss/hidden": 0.0, + "loss/logits": 0.17478087916970253, + "loss/reg": 1.5588798522949219, + "step": 1072 + }, + { + "epoch": 0.01073, + "grad_norm": 0.4527875781059265, + "grad_norm_var": 0.0020411585504943965, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.9299193620681763, + "loss/hidden": 0.0, + "loss/logits": 0.19047147780656815, + "loss/reg": 1.5580799579620361, + "step": 1073 + }, + { + "epoch": 0.01074, + "grad_norm": 0.4295300841331482, + "grad_norm_var": 0.0020762032373007446, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.6749280095100403, + "loss/hidden": 0.0, + "loss/logits": 0.19354819506406784, + "loss/reg": 1.5568370819091797, + "step": 1074 + }, + { + "epoch": 0.01075, + "grad_norm": 0.37927335500717163, + "grad_norm_var": 0.0019206305721111705, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.7454851865768433, + "loss/hidden": 0.0, + "loss/logits": 0.1866455115377903, + "loss/reg": 1.5555474758148193, + "step": 1075 + }, + { + "epoch": 0.01076, + "grad_norm": 0.37798190116882324, + "grad_norm_var": 0.0018805443791561534, + "learning_rate": 5e-05, + "loss": 0.1654, + "loss/crossentropy": 2.839569091796875, + "loss/hidden": 0.0, + "loss/logits": 0.1654018685221672, + "loss/reg": 1.5542937517166138, + "step": 1076 + }, + { + "epoch": 0.01077, + "grad_norm": 0.40603601932525635, + "grad_norm_var": 0.0018781135855993687, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.769132375717163, + "loss/hidden": 0.0, + "loss/logits": 0.18842901661992073, + "loss/reg": 1.5535392761230469, + "step": 1077 + }, + { + "epoch": 0.01078, + "grad_norm": 0.38742178678512573, + "grad_norm_var": 0.001859793659604044, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.616896152496338, + "loss/hidden": 0.0, + "loss/logits": 0.18272512406110764, + "loss/reg": 1.5521759986877441, + "step": 1078 + }, + { + "epoch": 0.01079, + "grad_norm": 3.135983467102051, + "grad_norm_var": 0.4696119388561969, + "learning_rate": 5e-05, + "loss": 0.3609, + "loss/crossentropy": 2.867876887321472, + "loss/hidden": 0.0, + "loss/logits": 0.36090877279639244, + "loss/reg": 1.5513793230056763, + "step": 1079 + }, + { + "epoch": 0.0108, + "grad_norm": 0.44463542103767395, + "grad_norm_var": 0.46832083359345933, + "learning_rate": 5e-05, + "loss": 0.1889, + "loss/crossentropy": 2.739173710346222, + "loss/hidden": 0.0, + "loss/logits": 0.18892118334770203, + "loss/reg": 1.5504870414733887, + "step": 1080 + }, + { + "epoch": 0.01081, + "grad_norm": 0.45169124007225037, + "grad_norm_var": 0.4663715745961762, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.576621174812317, + "loss/hidden": 0.0, + "loss/logits": 0.18683869391679764, + "loss/reg": 1.5502347946166992, + "step": 1081 + }, + { + "epoch": 0.01082, + "grad_norm": 0.4132075905799866, + "grad_norm_var": 0.46799089854198317, + "learning_rate": 5e-05, + "loss": 0.1763, + "loss/crossentropy": 2.822961449623108, + "loss/hidden": 0.0, + "loss/logits": 0.17634045705199242, + "loss/reg": 1.5503305196762085, + "step": 1082 + }, + { + "epoch": 0.01083, + "grad_norm": 0.4182119369506836, + "grad_norm_var": 0.4676253053735462, + "learning_rate": 5e-05, + "loss": 0.1947, + "loss/crossentropy": 2.9488189220428467, + "loss/hidden": 0.0, + "loss/logits": 0.19468581303954124, + "loss/reg": 1.5501155853271484, + "step": 1083 + }, + { + "epoch": 0.01084, + "grad_norm": 0.41536945104599, + "grad_norm_var": 0.4658077316127261, + "learning_rate": 5e-05, + "loss": 0.1691, + "loss/crossentropy": 2.7678072452545166, + "loss/hidden": 0.0, + "loss/logits": 0.16905947774648666, + "loss/reg": 1.549827218055725, + "step": 1084 + }, + { + "epoch": 0.01085, + "grad_norm": 0.42519059777259827, + "grad_norm_var": 0.4653402127084352, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.7813703417778015, + "loss/hidden": 0.0, + "loss/logits": 0.17720388248562813, + "loss/reg": 1.550193428993225, + "step": 1085 + }, + { + "epoch": 0.01086, + "grad_norm": 0.4168826937675476, + "grad_norm_var": 0.4651434528428754, + "learning_rate": 5e-05, + "loss": 0.2069, + "loss/crossentropy": 2.845678389072418, + "loss/hidden": 0.0, + "loss/logits": 0.20686748251318932, + "loss/reg": 1.54852294921875, + "step": 1086 + }, + { + "epoch": 0.01087, + "grad_norm": 0.3667008578777313, + "grad_norm_var": 0.4650842073091175, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.8423004746437073, + "loss/hidden": 0.0, + "loss/logits": 0.16627563163638115, + "loss/reg": 1.5476669073104858, + "step": 1087 + }, + { + "epoch": 0.01088, + "grad_norm": 0.3929131329059601, + "grad_norm_var": 0.46448085164061514, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.74026757478714, + "loss/hidden": 0.0, + "loss/logits": 0.1841907612979412, + "loss/reg": 1.5473967790603638, + "step": 1088 + }, + { + "epoch": 0.01089, + "grad_norm": 0.39987713098526, + "grad_norm_var": 0.4655681808252753, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.771194040775299, + "loss/hidden": 0.0, + "loss/logits": 0.17620433494448662, + "loss/reg": 1.5467705726623535, + "step": 1089 + }, + { + "epoch": 0.0109, + "grad_norm": 0.3864990472793579, + "grad_norm_var": 0.46654038035843726, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.73794162273407, + "loss/hidden": 0.0, + "loss/logits": 0.1848638951778412, + "loss/reg": 1.5455354452133179, + "step": 1090 + }, + { + "epoch": 0.01091, + "grad_norm": 0.36454349756240845, + "grad_norm_var": 0.46694053852503253, + "learning_rate": 5e-05, + "loss": 0.1934, + "loss/crossentropy": 2.7609499096870422, + "loss/hidden": 0.0, + "loss/logits": 0.19338013604283333, + "loss/reg": 1.5440956354141235, + "step": 1091 + }, + { + "epoch": 0.01092, + "grad_norm": 0.44408488273620605, + "grad_norm_var": 0.46547544141069996, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.9235315918922424, + "loss/hidden": 0.0, + "loss/logits": 0.20981372147798538, + "loss/reg": 1.5432744026184082, + "step": 1092 + }, + { + "epoch": 0.01093, + "grad_norm": 0.5225197672843933, + "grad_norm_var": 0.4636320430634635, + "learning_rate": 5e-05, + "loss": 0.2165, + "loss/crossentropy": 2.971177875995636, + "loss/hidden": 0.0, + "loss/logits": 0.21645646914839745, + "loss/reg": 1.542582392692566, + "step": 1093 + }, + { + "epoch": 0.01094, + "grad_norm": 0.4295833706855774, + "grad_norm_var": 0.4626234072637335, + "learning_rate": 5e-05, + "loss": 0.1917, + "loss/crossentropy": 2.743496000766754, + "loss/hidden": 0.0, + "loss/logits": 0.19165712594985962, + "loss/reg": 1.541102409362793, + "step": 1094 + }, + { + "epoch": 0.01095, + "grad_norm": 0.3936365842819214, + "grad_norm_var": 0.0014465937708503795, + "learning_rate": 5e-05, + "loss": 0.182, + "loss/crossentropy": 2.81677907705307, + "loss/hidden": 0.0, + "loss/logits": 0.18201437965035439, + "loss/reg": 1.5404126644134521, + "step": 1095 + }, + { + "epoch": 0.01096, + "grad_norm": 0.5806572437286377, + "grad_norm_var": 0.0030888115382388857, + "learning_rate": 5e-05, + "loss": 0.1851, + "loss/crossentropy": 2.8792348504066467, + "loss/hidden": 0.0, + "loss/logits": 0.18511545285582542, + "loss/reg": 1.5391901731491089, + "step": 1096 + }, + { + "epoch": 0.01097, + "grad_norm": 0.40794965624809265, + "grad_norm_var": 0.003060587668769073, + "learning_rate": 5e-05, + "loss": 0.1907, + "loss/crossentropy": 2.9066116213798523, + "loss/hidden": 0.0, + "loss/logits": 0.19067668542265892, + "loss/reg": 1.5390186309814453, + "step": 1097 + }, + { + "epoch": 0.01098, + "grad_norm": 0.3634330630302429, + "grad_norm_var": 0.003284496285873558, + "learning_rate": 5e-05, + "loss": 0.161, + "loss/crossentropy": 2.7170445919036865, + "loss/hidden": 0.0, + "loss/logits": 0.1610131524503231, + "loss/reg": 1.5397425889968872, + "step": 1098 + }, + { + "epoch": 0.01099, + "grad_norm": 1.1740427017211914, + "grad_norm_var": 0.03875858693632671, + "learning_rate": 5e-05, + "loss": 0.2375, + "loss/crossentropy": 2.8482487201690674, + "loss/hidden": 0.0, + "loss/logits": 0.23750562220811844, + "loss/reg": 1.539298415184021, + "step": 1099 + }, + { + "epoch": 0.011, + "grad_norm": 0.41955143213272095, + "grad_norm_var": 0.038730476788456265, + "learning_rate": 5e-05, + "loss": 0.1936, + "loss/crossentropy": 2.771632492542267, + "loss/hidden": 0.0, + "loss/logits": 0.19357017427682877, + "loss/reg": 1.5384517908096313, + "step": 1100 + }, + { + "epoch": 0.01101, + "grad_norm": 0.44815731048583984, + "grad_norm_var": 0.03863233892448896, + "learning_rate": 5e-05, + "loss": 0.1825, + "loss/crossentropy": 2.8682021498680115, + "loss/hidden": 0.0, + "loss/logits": 0.18247045576572418, + "loss/reg": 1.5383305549621582, + "step": 1101 + }, + { + "epoch": 0.01102, + "grad_norm": 0.8878780007362366, + "grad_norm_var": 0.049196589116129716, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.742558717727661, + "loss/hidden": 0.0, + "loss/logits": 0.21081242337822914, + "loss/reg": 1.5376973152160645, + "step": 1102 + }, + { + "epoch": 0.01103, + "grad_norm": 0.45139309763908386, + "grad_norm_var": 0.04815231816326545, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.6859241724014282, + "loss/hidden": 0.0, + "loss/logits": 0.18528878688812256, + "loss/reg": 1.5371124744415283, + "step": 1103 + }, + { + "epoch": 0.01104, + "grad_norm": 0.542457103729248, + "grad_norm_var": 0.04733165822270831, + "learning_rate": 5e-05, + "loss": 0.2534, + "loss/crossentropy": 3.040016829967499, + "loss/hidden": 0.0, + "loss/logits": 0.2534272372722626, + "loss/reg": 1.53706693649292, + "step": 1104 + }, + { + "epoch": 0.01105, + "grad_norm": 0.45765799283981323, + "grad_norm_var": 0.04666483176769951, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.705473482608795, + "loss/hidden": 0.0, + "loss/logits": 0.17168359830975533, + "loss/reg": 1.5363619327545166, + "step": 1105 + }, + { + "epoch": 0.01106, + "grad_norm": 0.47550585865974426, + "grad_norm_var": 0.04560972358215081, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.8135305643081665, + "loss/hidden": 0.0, + "loss/logits": 0.18824224174022675, + "loss/reg": 1.5352301597595215, + "step": 1106 + }, + { + "epoch": 0.01107, + "grad_norm": 0.45044830441474915, + "grad_norm_var": 0.044259536577999074, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.8964192271232605, + "loss/hidden": 0.0, + "loss/logits": 0.19954011589288712, + "loss/reg": 1.535237431526184, + "step": 1107 + }, + { + "epoch": 0.01108, + "grad_norm": 0.38536059856414795, + "grad_norm_var": 0.045132585802010065, + "learning_rate": 5e-05, + "loss": 0.1859, + "loss/crossentropy": 2.7900161743164062, + "loss/hidden": 0.0, + "loss/logits": 0.18588732928037643, + "loss/reg": 1.5349178314208984, + "step": 1108 + }, + { + "epoch": 0.01109, + "grad_norm": 0.3790653944015503, + "grad_norm_var": 0.04645454606829357, + "learning_rate": 5e-05, + "loss": 0.1858, + "loss/crossentropy": 2.7974360585212708, + "loss/hidden": 0.0, + "loss/logits": 0.1857653297483921, + "loss/reg": 1.535470962524414, + "step": 1109 + }, + { + "epoch": 0.0111, + "grad_norm": 0.4460708796977997, + "grad_norm_var": 0.046282830384225, + "learning_rate": 5e-05, + "loss": 0.1936, + "loss/crossentropy": 2.8131036162376404, + "loss/hidden": 0.0, + "loss/logits": 0.19361238926649094, + "loss/reg": 1.5345796346664429, + "step": 1110 + }, + { + "epoch": 0.01111, + "grad_norm": 0.39801692962646484, + "grad_norm_var": 0.046212298527668116, + "learning_rate": 5e-05, + "loss": 0.173, + "loss/crossentropy": 2.8013834953308105, + "loss/hidden": 0.0, + "loss/logits": 0.1730150803923607, + "loss/reg": 1.5339577198028564, + "step": 1111 + }, + { + "epoch": 0.01112, + "grad_norm": 0.37219998240470886, + "grad_norm_var": 0.047151327489262096, + "learning_rate": 5e-05, + "loss": 0.173, + "loss/crossentropy": 2.745340883731842, + "loss/hidden": 0.0, + "loss/logits": 0.17297760024666786, + "loss/reg": 1.532616376876831, + "step": 1112 + }, + { + "epoch": 0.01113, + "grad_norm": 0.37448444962501526, + "grad_norm_var": 0.04764855990328396, + "learning_rate": 5e-05, + "loss": 0.1745, + "loss/crossentropy": 2.7970882058143616, + "loss/hidden": 0.0, + "loss/logits": 0.1744941510260105, + "loss/reg": 1.5317273139953613, + "step": 1113 + }, + { + "epoch": 0.01114, + "grad_norm": 0.44670793414115906, + "grad_norm_var": 0.04654778230486344, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.8545820713043213, + "loss/hidden": 0.0, + "loss/logits": 0.18966153636574745, + "loss/reg": 1.5309053659439087, + "step": 1114 + }, + { + "epoch": 0.01115, + "grad_norm": 0.46504896879196167, + "grad_norm_var": 0.014889839873678161, + "learning_rate": 5e-05, + "loss": 0.2429, + "loss/crossentropy": 2.6743342876434326, + "loss/hidden": 0.0, + "loss/logits": 0.24294951558113098, + "loss/reg": 1.5288500785827637, + "step": 1115 + }, + { + "epoch": 0.01116, + "grad_norm": 0.38851630687713623, + "grad_norm_var": 0.015127761548291678, + "learning_rate": 5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.8051797747612, + "loss/hidden": 0.0, + "loss/logits": 0.1781875565648079, + "loss/reg": 1.5276798009872437, + "step": 1116 + }, + { + "epoch": 0.01117, + "grad_norm": 0.4078308045864105, + "grad_norm_var": 0.015296091420591058, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.6146674156188965, + "loss/hidden": 0.0, + "loss/logits": 0.20513415709137917, + "loss/reg": 1.5253890752792358, + "step": 1117 + }, + { + "epoch": 0.01118, + "grad_norm": 0.40175458788871765, + "grad_norm_var": 0.0022052748110299113, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.786243498325348, + "loss/hidden": 0.0, + "loss/logits": 0.18161213025450706, + "loss/reg": 1.5237782001495361, + "step": 1118 + }, + { + "epoch": 0.01119, + "grad_norm": 0.38154929876327515, + "grad_norm_var": 0.00228912119924131, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.8448697328567505, + "loss/hidden": 0.0, + "loss/logits": 0.18143780902028084, + "loss/reg": 1.522615671157837, + "step": 1119 + }, + { + "epoch": 0.0112, + "grad_norm": 0.38408970832824707, + "grad_norm_var": 0.0013403912284101576, + "learning_rate": 5e-05, + "loss": 0.1835, + "loss/crossentropy": 2.674954891204834, + "loss/hidden": 0.0, + "loss/logits": 0.18346881866455078, + "loss/reg": 1.5216854810714722, + "step": 1120 + }, + { + "epoch": 0.01121, + "grad_norm": 0.3568238317966461, + "grad_norm_var": 0.0013807554136454217, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.753901481628418, + "loss/hidden": 0.0, + "loss/logits": 0.18456534296274185, + "loss/reg": 1.5205482244491577, + "step": 1121 + }, + { + "epoch": 0.01122, + "grad_norm": 0.4012138545513153, + "grad_norm_var": 0.0010480325632392688, + "learning_rate": 5e-05, + "loss": 0.1946, + "loss/crossentropy": 2.6712945103645325, + "loss/hidden": 0.0, + "loss/logits": 0.19459592550992966, + "loss/reg": 1.5195826292037964, + "step": 1122 + }, + { + "epoch": 0.01123, + "grad_norm": 0.3645152449607849, + "grad_norm_var": 0.0009595980710018252, + "learning_rate": 5e-05, + "loss": 0.1879, + "loss/crossentropy": 2.7605273723602295, + "loss/hidden": 0.0, + "loss/logits": 0.18793095648288727, + "loss/reg": 1.5193746089935303, + "step": 1123 + }, + { + "epoch": 0.01124, + "grad_norm": 0.35984060168266296, + "grad_norm_var": 0.0010401730322851521, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.870726466178894, + "loss/hidden": 0.0, + "loss/logits": 0.1740589663386345, + "loss/reg": 1.5188905000686646, + "step": 1124 + }, + { + "epoch": 0.01125, + "grad_norm": 0.3941318690776825, + "grad_norm_var": 0.0010213796255876299, + "learning_rate": 5e-05, + "loss": 0.1802, + "loss/crossentropy": 2.7815005779266357, + "loss/hidden": 0.0, + "loss/logits": 0.1801726222038269, + "loss/reg": 1.5177239179611206, + "step": 1125 + }, + { + "epoch": 0.01126, + "grad_norm": 0.4047865569591522, + "grad_norm_var": 0.0008546231628673814, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.715599477291107, + "loss/hidden": 0.0, + "loss/logits": 0.20628444477915764, + "loss/reg": 1.5171293020248413, + "step": 1126 + }, + { + "epoch": 0.01127, + "grad_norm": 0.41338419914245605, + "grad_norm_var": 0.0008779320407387677, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.9712833166122437, + "loss/hidden": 0.0, + "loss/logits": 0.2111729383468628, + "loss/reg": 1.5161170959472656, + "step": 1127 + }, + { + "epoch": 0.01128, + "grad_norm": 0.3533566892147064, + "grad_norm_var": 0.0009569173440450385, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.844532549381256, + "loss/hidden": 0.0, + "loss/logits": 0.1821518912911415, + "loss/reg": 1.5148218870162964, + "step": 1128 + }, + { + "epoch": 0.01129, + "grad_norm": 0.36923155188560486, + "grad_norm_var": 0.000972049210964802, + "learning_rate": 5e-05, + "loss": 0.184, + "loss/crossentropy": 2.8003381490707397, + "loss/hidden": 0.0, + "loss/logits": 0.18395673483610153, + "loss/reg": 1.5138037204742432, + "step": 1129 + }, + { + "epoch": 0.0113, + "grad_norm": 0.38235801458358765, + "grad_norm_var": 0.0007726070702099741, + "learning_rate": 5e-05, + "loss": 0.188, + "loss/crossentropy": 2.837286412715912, + "loss/hidden": 0.0, + "loss/logits": 0.1879820078611374, + "loss/reg": 1.512952208518982, + "step": 1130 + }, + { + "epoch": 0.01131, + "grad_norm": 0.37186941504478455, + "grad_norm_var": 0.00037387253486032, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.737375319004059, + "loss/hidden": 0.0, + "loss/logits": 0.1758098341524601, + "loss/reg": 1.5121604204177856, + "step": 1131 + }, + { + "epoch": 0.01132, + "grad_norm": 0.36942365765571594, + "grad_norm_var": 0.00038376674257047566, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.8107948303222656, + "loss/hidden": 0.0, + "loss/logits": 0.19633925706148148, + "loss/reg": 1.5109307765960693, + "step": 1132 + }, + { + "epoch": 0.01133, + "grad_norm": 0.36502739787101746, + "grad_norm_var": 0.00035233925543644756, + "learning_rate": 5e-05, + "loss": 0.192, + "loss/crossentropy": 2.739868938922882, + "loss/hidden": 0.0, + "loss/logits": 0.19196948036551476, + "loss/reg": 1.5099393129348755, + "step": 1133 + }, + { + "epoch": 0.01134, + "grad_norm": 0.35947197675704956, + "grad_norm_var": 0.0003390916400412743, + "learning_rate": 5e-05, + "loss": 0.1733, + "loss/crossentropy": 2.9228984713554382, + "loss/hidden": 0.0, + "loss/logits": 0.17327600717544556, + "loss/reg": 1.5089576244354248, + "step": 1134 + }, + { + "epoch": 0.01135, + "grad_norm": 0.4201592803001404, + "grad_norm_var": 0.00045598006875781453, + "learning_rate": 5e-05, + "loss": 0.2055, + "loss/crossentropy": 2.709896743297577, + "loss/hidden": 0.0, + "loss/logits": 0.2055364064872265, + "loss/reg": 1.5074372291564941, + "step": 1135 + }, + { + "epoch": 0.01136, + "grad_norm": 0.374734103679657, + "grad_norm_var": 0.0004555446863156713, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.7715643644332886, + "loss/hidden": 0.0, + "loss/logits": 0.17949137836694717, + "loss/reg": 1.5065659284591675, + "step": 1136 + }, + { + "epoch": 0.01137, + "grad_norm": 0.3729066550731659, + "grad_norm_var": 0.00042464881057899067, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.7452287673950195, + "loss/hidden": 0.0, + "loss/logits": 0.17615839838981628, + "loss/reg": 1.5052706003189087, + "step": 1137 + }, + { + "epoch": 0.01138, + "grad_norm": 0.38408100605010986, + "grad_norm_var": 0.0003940218106961842, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.8566765189170837, + "loss/hidden": 0.0, + "loss/logits": 0.18010596185922623, + "loss/reg": 1.5037782192230225, + "step": 1138 + }, + { + "epoch": 0.01139, + "grad_norm": 0.3781294524669647, + "grad_norm_var": 0.00037984854208149933, + "learning_rate": 5e-05, + "loss": 0.1634, + "loss/crossentropy": 2.7707727551460266, + "loss/hidden": 0.0, + "loss/logits": 0.16339509561657906, + "loss/reg": 1.5025368928909302, + "step": 1139 + }, + { + "epoch": 0.0114, + "grad_norm": 0.4109313189983368, + "grad_norm_var": 0.00040868822139816053, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.883831739425659, + "loss/hidden": 0.0, + "loss/logits": 0.18474670499563217, + "loss/reg": 1.50146484375, + "step": 1140 + }, + { + "epoch": 0.01141, + "grad_norm": 0.3971245288848877, + "grad_norm_var": 0.0004137900008258755, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.7795395851135254, + "loss/hidden": 0.0, + "loss/logits": 0.19261505082249641, + "loss/reg": 1.5002492666244507, + "step": 1141 + }, + { + "epoch": 0.01142, + "grad_norm": 0.4321688115596771, + "grad_norm_var": 0.0005404274556179349, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.7451062202453613, + "loss/hidden": 0.0, + "loss/logits": 0.18978586420416832, + "loss/reg": 1.4995501041412354, + "step": 1142 + }, + { + "epoch": 0.01143, + "grad_norm": 0.4023013710975647, + "grad_norm_var": 0.0005056395743542516, + "learning_rate": 5e-05, + "loss": 0.1771, + "loss/crossentropy": 2.6546586751937866, + "loss/hidden": 0.0, + "loss/logits": 0.1770654208958149, + "loss/reg": 1.4985374212265015, + "step": 1143 + }, + { + "epoch": 0.01144, + "grad_norm": 0.4330555200576782, + "grad_norm_var": 0.0005774834396144494, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.648827910423279, + "loss/hidden": 0.0, + "loss/logits": 0.1907619796693325, + "loss/reg": 1.4983007907867432, + "step": 1144 + }, + { + "epoch": 0.01145, + "grad_norm": 0.38340097665786743, + "grad_norm_var": 0.0005528051964884484, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.761549949645996, + "loss/hidden": 0.0, + "loss/logits": 0.18321385234594345, + "loss/reg": 1.4974865913391113, + "step": 1145 + }, + { + "epoch": 0.01146, + "grad_norm": 0.3774440586566925, + "grad_norm_var": 0.0005592043924350865, + "learning_rate": 5e-05, + "loss": 0.1877, + "loss/crossentropy": 2.8717733025550842, + "loss/hidden": 0.0, + "loss/logits": 0.1877019703388214, + "loss/reg": 1.4972758293151855, + "step": 1146 + }, + { + "epoch": 0.01147, + "grad_norm": 0.3859359323978424, + "grad_norm_var": 0.0005384773779493794, + "learning_rate": 5e-05, + "loss": 0.1838, + "loss/crossentropy": 2.9210824966430664, + "loss/hidden": 0.0, + "loss/logits": 0.18384063616394997, + "loss/reg": 1.4961239099502563, + "step": 1147 + }, + { + "epoch": 0.01148, + "grad_norm": 0.3788585066795349, + "grad_norm_var": 0.0005176612581260268, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.8554866313934326, + "loss/hidden": 0.0, + "loss/logits": 0.17496761679649353, + "loss/reg": 1.4948080778121948, + "step": 1148 + }, + { + "epoch": 0.01149, + "grad_norm": 0.381953626871109, + "grad_norm_var": 0.0004769895308362693, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.760767877101898, + "loss/hidden": 0.0, + "loss/logits": 0.182776290923357, + "loss/reg": 1.49321448802948, + "step": 1149 + }, + { + "epoch": 0.0115, + "grad_norm": 0.41392236948013306, + "grad_norm_var": 0.0004258390348976608, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.617997944355011, + "loss/hidden": 0.0, + "loss/logits": 0.1842118427157402, + "loss/reg": 1.4924064874649048, + "step": 1150 + }, + { + "epoch": 0.01151, + "grad_norm": 0.40094637870788574, + "grad_norm_var": 0.0003855969394288672, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.6930890679359436, + "loss/hidden": 0.0, + "loss/logits": 0.1934753768146038, + "loss/reg": 1.4911280870437622, + "step": 1151 + }, + { + "epoch": 0.01152, + "grad_norm": 0.4077228009700775, + "grad_norm_var": 0.00036780126123923116, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.696079134941101, + "loss/hidden": 0.0, + "loss/logits": 0.18851438537240028, + "loss/reg": 1.4894925355911255, + "step": 1152 + }, + { + "epoch": 0.01153, + "grad_norm": 0.37663567066192627, + "grad_norm_var": 0.0003570365498350214, + "learning_rate": 5e-05, + "loss": 0.186, + "loss/crossentropy": 2.7406232357025146, + "loss/hidden": 0.0, + "loss/logits": 0.18599893897771835, + "loss/reg": 1.4883166551589966, + "step": 1153 + }, + { + "epoch": 0.01154, + "grad_norm": 0.39306846261024475, + "grad_norm_var": 0.00034715706505394905, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.8033538460731506, + "loss/hidden": 0.0, + "loss/logits": 0.17927935346961021, + "loss/reg": 1.4874869585037231, + "step": 1154 + }, + { + "epoch": 0.01155, + "grad_norm": 0.5115792155265808, + "grad_norm_var": 0.0011226610795350292, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.8153198957443237, + "loss/hidden": 0.0, + "loss/logits": 0.18982412666082382, + "loss/reg": 1.4867078065872192, + "step": 1155 + }, + { + "epoch": 0.01156, + "grad_norm": 0.3997136354446411, + "grad_norm_var": 0.0011223134316026655, + "learning_rate": 5e-05, + "loss": 0.1839, + "loss/crossentropy": 2.676534593105316, + "loss/hidden": 0.0, + "loss/logits": 0.18394503742456436, + "loss/reg": 1.4859753847122192, + "step": 1156 + }, + { + "epoch": 0.01157, + "grad_norm": 0.40472909808158875, + "grad_norm_var": 0.0011182066388159624, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.7133376002311707, + "loss/hidden": 0.0, + "loss/logits": 0.20424646511673927, + "loss/reg": 1.4849092960357666, + "step": 1157 + }, + { + "epoch": 0.01158, + "grad_norm": 0.3610299527645111, + "grad_norm_var": 0.0011788388166517098, + "learning_rate": 5e-05, + "loss": 0.1769, + "loss/crossentropy": 2.7634962797164917, + "loss/hidden": 0.0, + "loss/logits": 0.17686418071389198, + "loss/reg": 1.484142541885376, + "step": 1158 + }, + { + "epoch": 0.01159, + "grad_norm": 0.4668003022670746, + "grad_norm_var": 0.001452027449821891, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.6544492840766907, + "loss/hidden": 0.0, + "loss/logits": 0.17144346237182617, + "loss/reg": 1.4833974838256836, + "step": 1159 + }, + { + "epoch": 0.0116, + "grad_norm": 0.38263368606567383, + "grad_norm_var": 0.0014209642141947163, + "learning_rate": 5e-05, + "loss": 0.1681, + "loss/crossentropy": 2.801379144191742, + "loss/hidden": 0.0, + "loss/logits": 0.1680724062025547, + "loss/reg": 1.4821968078613281, + "step": 1160 + }, + { + "epoch": 0.01161, + "grad_norm": 0.46218374371528625, + "grad_norm_var": 0.0016172066414785117, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.8082536458969116, + "loss/hidden": 0.0, + "loss/logits": 0.18662720918655396, + "loss/reg": 1.4810516834259033, + "step": 1161 + }, + { + "epoch": 0.01162, + "grad_norm": 0.40168115496635437, + "grad_norm_var": 0.001559790115608121, + "learning_rate": 5e-05, + "loss": 0.1734, + "loss/crossentropy": 2.7967772483825684, + "loss/hidden": 0.0, + "loss/logits": 0.17344212904572487, + "loss/reg": 1.4801340103149414, + "step": 1162 + }, + { + "epoch": 0.01163, + "grad_norm": 0.4014493227005005, + "grad_norm_var": 0.0015290129465419374, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.7243821024894714, + "loss/hidden": 0.0, + "loss/logits": 0.19749024882912636, + "loss/reg": 1.479145884513855, + "step": 1163 + }, + { + "epoch": 0.01164, + "grad_norm": 0.3888718783855438, + "grad_norm_var": 0.001494961513700081, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.7566738724708557, + "loss/hidden": 0.0, + "loss/logits": 0.18155549839138985, + "loss/reg": 1.4781523942947388, + "step": 1164 + }, + { + "epoch": 0.01165, + "grad_norm": 0.3523938059806824, + "grad_norm_var": 0.0016588613416890366, + "learning_rate": 5e-05, + "loss": 0.174, + "loss/crossentropy": 2.7065756916999817, + "loss/hidden": 0.0, + "loss/logits": 0.17404457181692123, + "loss/reg": 1.4770623445510864, + "step": 1165 + }, + { + "epoch": 0.01166, + "grad_norm": 0.462964802980423, + "grad_norm_var": 0.0018489885102984402, + "learning_rate": 5e-05, + "loss": 0.1859, + "loss/crossentropy": 2.975113093852997, + "loss/hidden": 0.0, + "loss/logits": 0.18585091456770897, + "loss/reg": 1.4751616716384888, + "step": 1166 + }, + { + "epoch": 0.01167, + "grad_norm": 0.5501847863197327, + "grad_norm_var": 0.0030429283606156414, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.8122783303260803, + "loss/hidden": 0.0, + "loss/logits": 0.20325875282287598, + "loss/reg": 1.4739700555801392, + "step": 1167 + }, + { + "epoch": 0.01168, + "grad_norm": 0.43727031350135803, + "grad_norm_var": 0.0030482293912123463, + "learning_rate": 5e-05, + "loss": 0.1871, + "loss/crossentropy": 2.6004568338394165, + "loss/hidden": 0.0, + "loss/logits": 0.18711896240711212, + "loss/reg": 1.472339153289795, + "step": 1168 + }, + { + "epoch": 0.01169, + "grad_norm": 0.5093014240264893, + "grad_norm_var": 0.003344487550155887, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.8668264746665955, + "loss/hidden": 0.0, + "loss/logits": 0.2039630264043808, + "loss/reg": 1.471176028251648, + "step": 1169 + }, + { + "epoch": 0.0117, + "grad_norm": 0.3894696831703186, + "grad_norm_var": 0.003363193736657033, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.8249135613441467, + "loss/hidden": 0.0, + "loss/logits": 0.18029743060469627, + "loss/reg": 1.46927011013031, + "step": 1170 + }, + { + "epoch": 0.01171, + "grad_norm": 0.4055262506008148, + "grad_norm_var": 0.0029145778475038226, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.7669249773025513, + "loss/hidden": 0.0, + "loss/logits": 0.1910816915333271, + "loss/reg": 1.4679526090621948, + "step": 1171 + }, + { + "epoch": 0.01172, + "grad_norm": 0.36875951290130615, + "grad_norm_var": 0.0030726867573331244, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.7718464732170105, + "loss/hidden": 0.0, + "loss/logits": 0.17493540793657303, + "loss/reg": 1.4658503532409668, + "step": 1172 + }, + { + "epoch": 0.01173, + "grad_norm": 0.42354193329811096, + "grad_norm_var": 0.003052543245601517, + "learning_rate": 5e-05, + "loss": 0.1855, + "loss/crossentropy": 2.719308376312256, + "loss/hidden": 0.0, + "loss/logits": 0.1855236478149891, + "loss/reg": 1.464747667312622, + "step": 1173 + }, + { + "epoch": 0.01174, + "grad_norm": 0.39868998527526855, + "grad_norm_var": 0.0028312487941498285, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.8142080307006836, + "loss/hidden": 0.0, + "loss/logits": 0.16827983036637306, + "loss/reg": 1.4631402492523193, + "step": 1174 + }, + { + "epoch": 0.01175, + "grad_norm": 0.3635683059692383, + "grad_norm_var": 0.002923433007255775, + "learning_rate": 5e-05, + "loss": 0.1691, + "loss/crossentropy": 2.8140381574630737, + "loss/hidden": 0.0, + "loss/logits": 0.16913769394159317, + "loss/reg": 1.461613655090332, + "step": 1175 + }, + { + "epoch": 0.01176, + "grad_norm": 0.4067941904067993, + "grad_norm_var": 0.0028438749166883564, + "learning_rate": 5e-05, + "loss": 0.1805, + "loss/crossentropy": 2.74730122089386, + "loss/hidden": 0.0, + "loss/logits": 0.1805243194103241, + "loss/reg": 1.4598890542984009, + "step": 1176 + }, + { + "epoch": 0.01177, + "grad_norm": 0.4573166072368622, + "grad_norm_var": 0.002818087802214993, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.6282488107681274, + "loss/hidden": 0.0, + "loss/logits": 0.19583966955542564, + "loss/reg": 1.458559513092041, + "step": 1177 + }, + { + "epoch": 0.01178, + "grad_norm": 0.41611266136169434, + "grad_norm_var": 0.0027961219454357995, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.8419344425201416, + "loss/hidden": 0.0, + "loss/logits": 0.1892707683146, + "loss/reg": 1.4577357769012451, + "step": 1178 + }, + { + "epoch": 0.01179, + "grad_norm": 0.4139103293418884, + "grad_norm_var": 0.002773736915110033, + "learning_rate": 5e-05, + "loss": 0.1878, + "loss/crossentropy": 2.8046358227729797, + "loss/hidden": 0.0, + "loss/logits": 0.18783502280712128, + "loss/reg": 1.457064151763916, + "step": 1179 + }, + { + "epoch": 0.0118, + "grad_norm": 0.39083123207092285, + "grad_norm_var": 0.002765441807365839, + "learning_rate": 5e-05, + "loss": 0.1937, + "loss/crossentropy": 2.7833199501037598, + "loss/hidden": 0.0, + "loss/logits": 0.1936798058450222, + "loss/reg": 1.4564727544784546, + "step": 1180 + }, + { + "epoch": 0.01181, + "grad_norm": 0.37335360050201416, + "grad_norm_var": 0.0025993115992857365, + "learning_rate": 5e-05, + "loss": 0.177, + "loss/crossentropy": 2.9156389832496643, + "loss/hidden": 0.0, + "loss/logits": 0.17697978019714355, + "loss/reg": 1.4559135437011719, + "step": 1181 + }, + { + "epoch": 0.01182, + "grad_norm": 0.3955034911632538, + "grad_norm_var": 0.0025240464809215823, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.8472283482551575, + "loss/hidden": 0.0, + "loss/logits": 0.19867418706417084, + "loss/reg": 1.4550074338912964, + "step": 1182 + }, + { + "epoch": 0.01183, + "grad_norm": 0.4038008451461792, + "grad_norm_var": 0.0012981508534220532, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.806343376636505, + "loss/hidden": 0.0, + "loss/logits": 0.18930381163954735, + "loss/reg": 1.454715609550476, + "step": 1183 + }, + { + "epoch": 0.01184, + "grad_norm": 0.5491846203804016, + "grad_norm_var": 0.0024937052353730376, + "learning_rate": 5e-05, + "loss": 0.1811, + "loss/crossentropy": 2.7931687235832214, + "loss/hidden": 0.0, + "loss/logits": 0.18108559772372246, + "loss/reg": 1.454012155532837, + "step": 1184 + }, + { + "epoch": 0.01185, + "grad_norm": 0.3679744601249695, + "grad_norm_var": 0.0019952852149375476, + "learning_rate": 5e-05, + "loss": 0.1894, + "loss/crossentropy": 2.670843780040741, + "loss/hidden": 0.0, + "loss/logits": 0.18943195790052414, + "loss/reg": 1.4539992809295654, + "step": 1185 + }, + { + "epoch": 0.01186, + "grad_norm": 0.5347173810005188, + "grad_norm_var": 0.0029594091193365646, + "learning_rate": 5e-05, + "loss": 0.1931, + "loss/crossentropy": 2.81303608417511, + "loss/hidden": 0.0, + "loss/logits": 0.19309942051768303, + "loss/reg": 1.4531627893447876, + "step": 1186 + }, + { + "epoch": 0.01187, + "grad_norm": 0.35835427045822144, + "grad_norm_var": 0.003169699938894862, + "learning_rate": 5e-05, + "loss": 0.1811, + "loss/crossentropy": 2.8339531421661377, + "loss/hidden": 0.0, + "loss/logits": 0.1810522824525833, + "loss/reg": 1.452269196510315, + "step": 1187 + }, + { + "epoch": 0.01188, + "grad_norm": 0.3567518889904022, + "grad_norm_var": 0.0032509833875422464, + "learning_rate": 5e-05, + "loss": 0.1743, + "loss/crossentropy": 2.765186607837677, + "loss/hidden": 0.0, + "loss/logits": 0.17427153512835503, + "loss/reg": 1.450973391532898, + "step": 1188 + }, + { + "epoch": 0.01189, + "grad_norm": 0.3796020746231079, + "grad_norm_var": 0.0033107722836770177, + "learning_rate": 5e-05, + "loss": 0.177, + "loss/crossentropy": 2.9006309509277344, + "loss/hidden": 0.0, + "loss/logits": 0.1769685558974743, + "loss/reg": 1.4492751359939575, + "step": 1189 + }, + { + "epoch": 0.0119, + "grad_norm": 0.44529664516448975, + "grad_norm_var": 0.0033737393452274926, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.7190520763397217, + "loss/hidden": 0.0, + "loss/logits": 0.1789361834526062, + "loss/reg": 1.4478099346160889, + "step": 1190 + }, + { + "epoch": 0.01191, + "grad_norm": 0.3916754424571991, + "grad_norm_var": 0.003236675787769731, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.8190470933914185, + "loss/hidden": 0.0, + "loss/logits": 0.18166551738977432, + "loss/reg": 1.4454478025436401, + "step": 1191 + }, + { + "epoch": 0.01192, + "grad_norm": 0.41403406858444214, + "grad_norm_var": 0.0032319593928060424, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.867457687854767, + "loss/hidden": 0.0, + "loss/logits": 0.18155980482697487, + "loss/reg": 1.4430103302001953, + "step": 1192 + }, + { + "epoch": 0.01193, + "grad_norm": 0.38379180431365967, + "grad_norm_var": 0.0031601439954389837, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.715328276157379, + "loss/hidden": 0.0, + "loss/logits": 0.18566227331757545, + "loss/reg": 1.441053867340088, + "step": 1193 + }, + { + "epoch": 0.01194, + "grad_norm": 0.37299588322639465, + "grad_norm_var": 0.0032465457322637874, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.7711251378059387, + "loss/hidden": 0.0, + "loss/logits": 0.17077474668622017, + "loss/reg": 1.4382448196411133, + "step": 1194 + }, + { + "epoch": 0.01195, + "grad_norm": 0.40252241492271423, + "grad_norm_var": 0.00324603537587758, + "learning_rate": 5e-05, + "loss": 0.1955, + "loss/crossentropy": 2.7185396552085876, + "loss/hidden": 0.0, + "loss/logits": 0.19552023708820343, + "loss/reg": 1.43511164188385, + "step": 1195 + }, + { + "epoch": 0.01196, + "grad_norm": 0.37733253836631775, + "grad_norm_var": 0.0032874685602436522, + "learning_rate": 5e-05, + "loss": 0.1874, + "loss/crossentropy": 2.8184019327163696, + "loss/hidden": 0.0, + "loss/logits": 0.18738915398716927, + "loss/reg": 1.4331696033477783, + "step": 1196 + }, + { + "epoch": 0.01197, + "grad_norm": 0.39589473605155945, + "grad_norm_var": 0.003219060852671751, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.7010214924812317, + "loss/hidden": 0.0, + "loss/logits": 0.186626598238945, + "loss/reg": 1.4316699504852295, + "step": 1197 + }, + { + "epoch": 0.01198, + "grad_norm": 0.4191206693649292, + "grad_norm_var": 0.0032142886338114205, + "learning_rate": 5e-05, + "loss": 0.1919, + "loss/crossentropy": 2.909880578517914, + "loss/hidden": 0.0, + "loss/logits": 0.19187119230628014, + "loss/reg": 1.4303897619247437, + "step": 1198 + }, + { + "epoch": 0.01199, + "grad_norm": 0.4035413861274719, + "grad_norm_var": 0.0032144922705757157, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.6426594257354736, + "loss/hidden": 0.0, + "loss/logits": 0.1845902055501938, + "loss/reg": 1.4290152788162231, + "step": 1199 + }, + { + "epoch": 0.012, + "grad_norm": 0.36857399344444275, + "grad_norm_var": 0.0018906405469929315, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.787937879562378, + "loss/hidden": 0.0, + "loss/logits": 0.16886601597070694, + "loss/reg": 1.4279104471206665, + "step": 1200 + }, + { + "epoch": 0.01201, + "grad_norm": 0.3827061653137207, + "grad_norm_var": 0.0018447143939107808, + "learning_rate": 5e-05, + "loss": 0.184, + "loss/crossentropy": 2.6812090277671814, + "loss/hidden": 0.0, + "loss/logits": 0.18400828912854195, + "loss/reg": 1.4265459775924683, + "step": 1201 + }, + { + "epoch": 0.01202, + "grad_norm": 0.3506307005882263, + "grad_norm_var": 0.0006360064193142537, + "learning_rate": 5e-05, + "loss": 0.1719, + "loss/crossentropy": 2.71059513092041, + "loss/hidden": 0.0, + "loss/logits": 0.17186394706368446, + "loss/reg": 1.4260975122451782, + "step": 1202 + }, + { + "epoch": 0.01203, + "grad_norm": 0.3920105993747711, + "grad_norm_var": 0.0005752191941902721, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.7090110182762146, + "loss/hidden": 0.0, + "loss/logits": 0.1797771342098713, + "loss/reg": 1.4251253604888916, + "step": 1203 + }, + { + "epoch": 0.01204, + "grad_norm": 0.3974657356739044, + "grad_norm_var": 0.000499526406805432, + "learning_rate": 5e-05, + "loss": 0.1874, + "loss/crossentropy": 2.701002776622772, + "loss/hidden": 0.0, + "loss/logits": 0.18736432120203972, + "loss/reg": 1.4243783950805664, + "step": 1204 + }, + { + "epoch": 0.01205, + "grad_norm": 0.42775383591651917, + "grad_norm_var": 0.0005627563087383103, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.734471619129181, + "loss/hidden": 0.0, + "loss/logits": 0.2097911797463894, + "loss/reg": 1.4237735271453857, + "step": 1205 + }, + { + "epoch": 0.01206, + "grad_norm": 0.40280991792678833, + "grad_norm_var": 0.0003925441234761018, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.649215042591095, + "loss/hidden": 0.0, + "loss/logits": 0.2039887085556984, + "loss/reg": 1.4232732057571411, + "step": 1206 + }, + { + "epoch": 0.01207, + "grad_norm": 0.4008637070655823, + "grad_norm_var": 0.00039659149065429666, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.7435959577560425, + "loss/hidden": 0.0, + "loss/logits": 0.1816614270210266, + "loss/reg": 1.42289137840271, + "step": 1207 + }, + { + "epoch": 0.01208, + "grad_norm": 0.38267508149147034, + "grad_norm_var": 0.0003711633927555173, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.8640989661216736, + "loss/hidden": 0.0, + "loss/logits": 0.17420271039009094, + "loss/reg": 1.422437071800232, + "step": 1208 + }, + { + "epoch": 0.01209, + "grad_norm": 0.3714866638183594, + "grad_norm_var": 0.00039293414504885845, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.746426820755005, + "loss/hidden": 0.0, + "loss/logits": 0.16898731514811516, + "loss/reg": 1.42264723777771, + "step": 1209 + }, + { + "epoch": 0.0121, + "grad_norm": 0.3865947723388672, + "grad_norm_var": 0.00037271053118719997, + "learning_rate": 5e-05, + "loss": 0.1848, + "loss/crossentropy": 2.849126398563385, + "loss/hidden": 0.0, + "loss/logits": 0.18478216230869293, + "loss/reg": 1.4227102994918823, + "step": 1210 + }, + { + "epoch": 0.01211, + "grad_norm": 0.46436965465545654, + "grad_norm_var": 0.0007037118140789371, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.891853094100952, + "loss/hidden": 0.0, + "loss/logits": 0.1963624656200409, + "loss/reg": 1.421623945236206, + "step": 1211 + }, + { + "epoch": 0.01212, + "grad_norm": 0.3868556618690491, + "grad_norm_var": 0.0006866427169156234, + "learning_rate": 5e-05, + "loss": 0.185, + "loss/crossentropy": 2.7030688524246216, + "loss/hidden": 0.0, + "loss/logits": 0.18502762913703918, + "loss/reg": 1.4205013513565063, + "step": 1212 + }, + { + "epoch": 0.01213, + "grad_norm": 0.5466204285621643, + "grad_norm_var": 0.002107741306228805, + "learning_rate": 5e-05, + "loss": 0.2321, + "loss/crossentropy": 2.8207810521125793, + "loss/hidden": 0.0, + "loss/logits": 0.23205070197582245, + "loss/reg": 1.4198737144470215, + "step": 1213 + }, + { + "epoch": 0.01214, + "grad_norm": 0.36891767382621765, + "grad_norm_var": 0.0021724490893943573, + "learning_rate": 5e-05, + "loss": 0.1751, + "loss/crossentropy": 2.765327572822571, + "loss/hidden": 0.0, + "loss/logits": 0.17510851845145226, + "loss/reg": 1.4195406436920166, + "step": 1214 + }, + { + "epoch": 0.01215, + "grad_norm": 0.38599929213523865, + "grad_norm_var": 0.0021883509252218604, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.7295534014701843, + "loss/hidden": 0.0, + "loss/logits": 0.1777309849858284, + "loss/reg": 1.4191190004348755, + "step": 1215 + }, + { + "epoch": 0.01216, + "grad_norm": 0.3923039436340332, + "grad_norm_var": 0.002120883638911231, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.775855779647827, + "loss/hidden": 0.0, + "loss/logits": 0.17348914593458176, + "loss/reg": 1.4182158708572388, + "step": 1216 + }, + { + "epoch": 0.01217, + "grad_norm": 0.38923102617263794, + "grad_norm_var": 0.0021063207621189258, + "learning_rate": 5e-05, + "loss": 0.1852, + "loss/crossentropy": 2.6477224826812744, + "loss/hidden": 0.0, + "loss/logits": 0.18519454449415207, + "loss/reg": 1.4171086549758911, + "step": 1217 + }, + { + "epoch": 0.01218, + "grad_norm": 0.38138842582702637, + "grad_norm_var": 0.0019510417841006008, + "learning_rate": 5e-05, + "loss": 0.189, + "loss/crossentropy": 2.6555696725845337, + "loss/hidden": 0.0, + "loss/logits": 0.1890222169458866, + "loss/reg": 1.415981650352478, + "step": 1218 + }, + { + "epoch": 0.01219, + "grad_norm": 0.4084736108779907, + "grad_norm_var": 0.001939832634857904, + "learning_rate": 5e-05, + "loss": 0.1835, + "loss/crossentropy": 2.818666934967041, + "loss/hidden": 0.0, + "loss/logits": 0.18353594094514847, + "loss/reg": 1.4153097867965698, + "step": 1219 + }, + { + "epoch": 0.0122, + "grad_norm": 0.4247463345527649, + "grad_norm_var": 0.0019558024315882013, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.7127944231033325, + "loss/hidden": 0.0, + "loss/logits": 0.17549164965748787, + "loss/reg": 1.4142556190490723, + "step": 1220 + }, + { + "epoch": 0.01221, + "grad_norm": 0.40716880559921265, + "grad_norm_var": 0.0019268832744298062, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.643829822540283, + "loss/hidden": 0.0, + "loss/logits": 0.19382373616099358, + "loss/reg": 1.4136812686920166, + "step": 1221 + }, + { + "epoch": 0.01222, + "grad_norm": 0.35272741317749023, + "grad_norm_var": 0.0021068318421432236, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.6937798261642456, + "loss/hidden": 0.0, + "loss/logits": 0.17175516858696938, + "loss/reg": 1.4124233722686768, + "step": 1222 + }, + { + "epoch": 0.01223, + "grad_norm": 0.3460560441017151, + "grad_norm_var": 0.002311292127889427, + "learning_rate": 5e-05, + "loss": 0.1711, + "loss/crossentropy": 2.764497935771942, + "loss/hidden": 0.0, + "loss/logits": 0.17111336812376976, + "loss/reg": 1.4124194383621216, + "step": 1223 + }, + { + "epoch": 0.01224, + "grad_norm": 0.38824766874313354, + "grad_norm_var": 0.00230056400932727, + "learning_rate": 5e-05, + "loss": 0.1841, + "loss/crossentropy": 2.713408052921295, + "loss/hidden": 0.0, + "loss/logits": 0.18410339578986168, + "loss/reg": 1.412917971611023, + "step": 1224 + }, + { + "epoch": 0.01225, + "grad_norm": 0.3764731287956238, + "grad_norm_var": 0.002283111285856387, + "learning_rate": 5e-05, + "loss": 0.1815, + "loss/crossentropy": 2.8800466656684875, + "loss/hidden": 0.0, + "loss/logits": 0.1814701072871685, + "loss/reg": 1.4119223356246948, + "step": 1225 + }, + { + "epoch": 0.01226, + "grad_norm": 0.35486990213394165, + "grad_norm_var": 0.0024043515928512467, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.864288032054901, + "loss/hidden": 0.0, + "loss/logits": 0.1720508709549904, + "loss/reg": 1.410658359527588, + "step": 1226 + }, + { + "epoch": 0.01227, + "grad_norm": 0.3798040449619293, + "grad_norm_var": 0.0021075098216082667, + "learning_rate": 5e-05, + "loss": 0.186, + "loss/crossentropy": 2.800125002861023, + "loss/hidden": 0.0, + "loss/logits": 0.18600793182849884, + "loss/reg": 1.4104071855545044, + "step": 1227 + }, + { + "epoch": 0.01228, + "grad_norm": 0.38487929105758667, + "grad_norm_var": 0.002109404100500737, + "learning_rate": 5e-05, + "loss": 0.187, + "loss/crossentropy": 2.7868515253067017, + "loss/hidden": 0.0, + "loss/logits": 0.18701008334755898, + "loss/reg": 1.4095227718353271, + "step": 1228 + }, + { + "epoch": 0.01229, + "grad_norm": 0.45511749386787415, + "grad_norm_var": 0.0007584030638864074, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.6571977138519287, + "loss/hidden": 0.0, + "loss/logits": 0.19848217070102692, + "loss/reg": 1.4084324836730957, + "step": 1229 + }, + { + "epoch": 0.0123, + "grad_norm": 0.6594117879867554, + "grad_norm_var": 0.005321544010225898, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.8001416325569153, + "loss/hidden": 0.0, + "loss/logits": 0.20106521993875504, + "loss/reg": 1.4072535037994385, + "step": 1230 + }, + { + "epoch": 0.01231, + "grad_norm": 0.43426990509033203, + "grad_norm_var": 0.005342107314555718, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.7976555228233337, + "loss/hidden": 0.0, + "loss/logits": 0.20270870998501778, + "loss/reg": 1.4066628217697144, + "step": 1231 + }, + { + "epoch": 0.01232, + "grad_norm": 0.7469918131828308, + "grad_norm_var": 0.01244134254394691, + "learning_rate": 5e-05, + "loss": 0.2375, + "loss/crossentropy": 2.800569176673889, + "loss/hidden": 0.0, + "loss/logits": 0.23753388598561287, + "loss/reg": 1.4050111770629883, + "step": 1232 + }, + { + "epoch": 0.01233, + "grad_norm": 0.36666056513786316, + "grad_norm_var": 0.012597725507063546, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.8041948676109314, + "loss/hidden": 0.0, + "loss/logits": 0.17551551386713982, + "loss/reg": 1.4047677516937256, + "step": 1233 + }, + { + "epoch": 0.01234, + "grad_norm": 0.3527641296386719, + "grad_norm_var": 0.01283143182770274, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.6635568737983704, + "loss/hidden": 0.0, + "loss/logits": 0.17143995314836502, + "loss/reg": 1.4030920267105103, + "step": 1234 + }, + { + "epoch": 0.01235, + "grad_norm": 0.40449002385139465, + "grad_norm_var": 0.012842484989278431, + "learning_rate": 5e-05, + "loss": 0.1751, + "loss/crossentropy": 2.6857893466949463, + "loss/hidden": 0.0, + "loss/logits": 0.17512645572423935, + "loss/reg": 1.4026522636413574, + "step": 1235 + }, + { + "epoch": 0.01236, + "grad_norm": 0.3718612492084503, + "grad_norm_var": 0.013034358750853499, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.8389678597450256, + "loss/hidden": 0.0, + "loss/logits": 0.16729874536395073, + "loss/reg": 1.401383876800537, + "step": 1236 + }, + { + "epoch": 0.01237, + "grad_norm": 0.4983327388763428, + "grad_norm_var": 0.013350877741256121, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.7713358998298645, + "loss/hidden": 0.0, + "loss/logits": 0.20256582275032997, + "loss/reg": 1.4005975723266602, + "step": 1237 + }, + { + "epoch": 0.01238, + "grad_norm": 0.441445916891098, + "grad_norm_var": 0.012933952665905564, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.8286274671554565, + "loss/hidden": 0.0, + "loss/logits": 0.18565155193209648, + "loss/reg": 1.3995530605316162, + "step": 1238 + }, + { + "epoch": 0.01239, + "grad_norm": 0.3901333808898926, + "grad_norm_var": 0.012532041194226077, + "learning_rate": 5e-05, + "loss": 0.183, + "loss/crossentropy": 2.911182940006256, + "loss/hidden": 0.0, + "loss/logits": 0.18296772241592407, + "loss/reg": 1.3976836204528809, + "step": 1239 + }, + { + "epoch": 0.0124, + "grad_norm": 0.39856791496276855, + "grad_norm_var": 0.012470430313853701, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.6836057305336, + "loss/hidden": 0.0, + "loss/logits": 0.19625534117221832, + "loss/reg": 1.396618366241455, + "step": 1240 + }, + { + "epoch": 0.01241, + "grad_norm": 0.41100946068763733, + "grad_norm_var": 0.012259332529219746, + "learning_rate": 5e-05, + "loss": 0.1932, + "loss/crossentropy": 2.6196849942207336, + "loss/hidden": 0.0, + "loss/logits": 0.19323748722672462, + "loss/reg": 1.395609736442566, + "step": 1241 + }, + { + "epoch": 0.01242, + "grad_norm": 0.348229318857193, + "grad_norm_var": 0.012338050864381422, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.775640070438385, + "loss/hidden": 0.0, + "loss/logits": 0.17175709828734398, + "loss/reg": 1.3951023817062378, + "step": 1242 + }, + { + "epoch": 0.01243, + "grad_norm": 1.2390751838684082, + "grad_norm_var": 0.05155969127554597, + "learning_rate": 5e-05, + "loss": 0.2159, + "loss/crossentropy": 2.780874252319336, + "loss/hidden": 0.0, + "loss/logits": 0.21593540906906128, + "loss/reg": 1.3937615156173706, + "step": 1243 + }, + { + "epoch": 0.01244, + "grad_norm": 0.4731004238128662, + "grad_norm_var": 0.050763118391303465, + "learning_rate": 5e-05, + "loss": 0.1825, + "loss/crossentropy": 2.669394373893738, + "loss/hidden": 0.0, + "loss/logits": 0.1825188212096691, + "loss/reg": 1.392451524734497, + "step": 1244 + }, + { + "epoch": 0.01245, + "grad_norm": 0.43558576703071594, + "grad_norm_var": 0.050902455998128635, + "learning_rate": 5e-05, + "loss": 0.1864, + "loss/crossentropy": 2.6458783745765686, + "loss/hidden": 0.0, + "loss/logits": 0.1863899528980255, + "loss/reg": 1.3913257122039795, + "step": 1245 + }, + { + "epoch": 0.01246, + "grad_norm": 0.43400633335113525, + "grad_norm_var": 0.049234233763389666, + "learning_rate": 5e-05, + "loss": 0.183, + "loss/crossentropy": 2.8432045578956604, + "loss/hidden": 0.0, + "loss/logits": 0.18295767530798912, + "loss/reg": 1.3903818130493164, + "step": 1246 + }, + { + "epoch": 0.01247, + "grad_norm": 0.43098875880241394, + "grad_norm_var": 0.049256731879161936, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.7787997722625732, + "loss/hidden": 0.0, + "loss/logits": 0.19347602501511574, + "loss/reg": 1.3895725011825562, + "step": 1247 + }, + { + "epoch": 0.01248, + "grad_norm": 1.1000005006790161, + "grad_norm_var": 0.06942585731693722, + "learning_rate": 5e-05, + "loss": 0.2263, + "loss/crossentropy": 2.7730624675750732, + "loss/hidden": 0.0, + "loss/logits": 0.22629190236330032, + "loss/reg": 1.3885997533798218, + "step": 1248 + }, + { + "epoch": 0.01249, + "grad_norm": 0.449535071849823, + "grad_norm_var": 0.06831525341155571, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.7426183223724365, + "loss/hidden": 0.0, + "loss/logits": 0.1994258612394333, + "loss/reg": 1.3879303932189941, + "step": 1249 + }, + { + "epoch": 0.0125, + "grad_norm": 0.3891923129558563, + "grad_norm_var": 0.0676286766494714, + "learning_rate": 5e-05, + "loss": 0.1764, + "loss/crossentropy": 2.7025153636932373, + "loss/hidden": 0.0, + "loss/logits": 0.1763673946261406, + "loss/reg": 1.3863074779510498, + "step": 1250 + }, + { + "epoch": 0.01251, + "grad_norm": 0.39398977160453796, + "grad_norm_var": 0.06778814624374416, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.850652813911438, + "loss/hidden": 0.0, + "loss/logits": 0.18415121361613274, + "loss/reg": 1.3853391408920288, + "step": 1251 + }, + { + "epoch": 0.01252, + "grad_norm": 0.495724618434906, + "grad_norm_var": 0.06641914754466918, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.679881453514099, + "loss/hidden": 0.0, + "loss/logits": 0.19643474370241165, + "loss/reg": 1.3849084377288818, + "step": 1252 + }, + { + "epoch": 0.01253, + "grad_norm": 0.4119850695133209, + "grad_norm_var": 0.06714101490369723, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.750667691230774, + "loss/hidden": 0.0, + "loss/logits": 0.1896573342382908, + "loss/reg": 1.3841875791549683, + "step": 1253 + }, + { + "epoch": 0.01254, + "grad_norm": 0.41677817702293396, + "grad_norm_var": 0.06742149598287875, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.7512380480766296, + "loss/hidden": 0.0, + "loss/logits": 0.1948574222624302, + "loss/reg": 1.3836725950241089, + "step": 1254 + }, + { + "epoch": 0.01255, + "grad_norm": 0.3882921040058136, + "grad_norm_var": 0.06745202400909417, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.830279588699341, + "loss/hidden": 0.0, + "loss/logits": 0.18280091881752014, + "loss/reg": 1.3836078643798828, + "step": 1255 + }, + { + "epoch": 0.01256, + "grad_norm": 0.41673871874809265, + "grad_norm_var": 0.06719419648756339, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.7475191354751587, + "loss/hidden": 0.0, + "loss/logits": 0.20259103178977966, + "loss/reg": 1.3826485872268677, + "step": 1256 + }, + { + "epoch": 0.01257, + "grad_norm": 0.585350751876831, + "grad_norm_var": 0.06668494479683436, + "learning_rate": 5e-05, + "loss": 0.2359, + "loss/crossentropy": 2.9428182244300842, + "loss/hidden": 0.0, + "loss/logits": 0.23591554537415504, + "loss/reg": 1.381587266921997, + "step": 1257 + }, + { + "epoch": 0.01258, + "grad_norm": 0.42391830682754517, + "grad_norm_var": 0.06525364309366323, + "learning_rate": 5e-05, + "loss": 0.203, + "loss/crossentropy": 2.6863314509391785, + "loss/hidden": 0.0, + "loss/logits": 0.20304660126566887, + "loss/reg": 1.3804073333740234, + "step": 1258 + }, + { + "epoch": 0.01259, + "grad_norm": 0.3636891841888428, + "grad_norm_var": 0.030416591644257748, + "learning_rate": 5e-05, + "loss": 0.1812, + "loss/crossentropy": 2.770876467227936, + "loss/hidden": 0.0, + "loss/logits": 0.18117986992001534, + "loss/reg": 1.3794111013412476, + "step": 1259 + }, + { + "epoch": 0.0126, + "grad_norm": 0.4120618999004364, + "grad_norm_var": 0.030669422375767422, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.6826838850975037, + "loss/hidden": 0.0, + "loss/logits": 0.18561375886201859, + "loss/reg": 1.378712773323059, + "step": 1260 + }, + { + "epoch": 0.01261, + "grad_norm": 0.38935086131095886, + "grad_norm_var": 0.0310259038505535, + "learning_rate": 5e-05, + "loss": 0.1873, + "loss/crossentropy": 2.8777846097946167, + "loss/hidden": 0.0, + "loss/logits": 0.18731402978301048, + "loss/reg": 1.3772296905517578, + "step": 1261 + }, + { + "epoch": 0.01262, + "grad_norm": 0.3571130335330963, + "grad_norm_var": 0.031752674237896614, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.8309271931648254, + "loss/hidden": 0.0, + "loss/logits": 0.17464854568243027, + "loss/reg": 1.3755420446395874, + "step": 1262 + }, + { + "epoch": 0.01263, + "grad_norm": 0.4016406536102295, + "grad_norm_var": 0.0319358552762881, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.8556240797042847, + "loss/hidden": 0.0, + "loss/logits": 0.19051608070731163, + "loss/reg": 1.3745484352111816, + "step": 1263 + }, + { + "epoch": 0.01264, + "grad_norm": 0.4311268627643585, + "grad_norm_var": 0.003017690530940461, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.760189950466156, + "loss/hidden": 0.0, + "loss/logits": 0.20933211222290993, + "loss/reg": 1.3732630014419556, + "step": 1264 + }, + { + "epoch": 0.01265, + "grad_norm": 0.38528862595558167, + "grad_norm_var": 0.0030261360436078005, + "learning_rate": 5e-05, + "loss": 0.1821, + "loss/crossentropy": 2.698637902736664, + "loss/hidden": 0.0, + "loss/logits": 0.18210280314087868, + "loss/reg": 1.371398687362671, + "step": 1265 + }, + { + "epoch": 0.01266, + "grad_norm": 0.35614868998527527, + "grad_norm_var": 0.0032142068850269786, + "learning_rate": 5e-05, + "loss": 0.1788, + "loss/crossentropy": 2.8088836073875427, + "loss/hidden": 0.0, + "loss/logits": 0.17882408946752548, + "loss/reg": 1.3697328567504883, + "step": 1266 + }, + { + "epoch": 0.01267, + "grad_norm": 0.36676737666130066, + "grad_norm_var": 0.0033343322691377925, + "learning_rate": 5e-05, + "loss": 0.1819, + "loss/crossentropy": 2.6373648643493652, + "loss/hidden": 0.0, + "loss/logits": 0.18188165128231049, + "loss/reg": 1.3673834800720215, + "step": 1267 + }, + { + "epoch": 0.01268, + "grad_norm": 0.39788973331451416, + "grad_norm_var": 0.0028485353302448763, + "learning_rate": 5e-05, + "loss": 0.1929, + "loss/crossentropy": 2.72048681974411, + "loss/hidden": 0.0, + "loss/logits": 0.19285759702324867, + "loss/reg": 1.3652880191802979, + "step": 1268 + }, + { + "epoch": 0.01269, + "grad_norm": 0.39289984107017517, + "grad_norm_var": 0.0028573651350930524, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.7955026030540466, + "loss/hidden": 0.0, + "loss/logits": 0.19650669395923615, + "loss/reg": 1.3635308742523193, + "step": 1269 + }, + { + "epoch": 0.0127, + "grad_norm": 0.3635425567626953, + "grad_norm_var": 0.002953132085538002, + "learning_rate": 5e-05, + "loss": 0.1756, + "loss/crossentropy": 2.7407087683677673, + "loss/hidden": 0.0, + "loss/logits": 0.17556580528616905, + "loss/reg": 1.3616644144058228, + "step": 1270 + }, + { + "epoch": 0.01271, + "grad_norm": 0.4003201723098755, + "grad_norm_var": 0.0029402084248949262, + "learning_rate": 5e-05, + "loss": 0.1871, + "loss/crossentropy": 2.6960673928260803, + "loss/hidden": 0.0, + "loss/logits": 0.1870543658733368, + "loss/reg": 1.3603110313415527, + "step": 1271 + }, + { + "epoch": 0.01272, + "grad_norm": 0.3728436529636383, + "grad_norm_var": 0.002978704676407059, + "learning_rate": 5e-05, + "loss": 0.1764, + "loss/crossentropy": 2.7935059666633606, + "loss/hidden": 0.0, + "loss/logits": 0.1764380931854248, + "loss/reg": 1.3583228588104248, + "step": 1272 + }, + { + "epoch": 0.01273, + "grad_norm": 0.3922203779220581, + "grad_norm_var": 0.0005369219153182336, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.9126545190811157, + "loss/hidden": 0.0, + "loss/logits": 0.1932995654642582, + "loss/reg": 1.3570079803466797, + "step": 1273 + }, + { + "epoch": 0.01274, + "grad_norm": 0.3662412762641907, + "grad_norm_var": 0.0004680491238065465, + "learning_rate": 5e-05, + "loss": 0.1763, + "loss/crossentropy": 2.8062674403190613, + "loss/hidden": 0.0, + "loss/logits": 0.17627891525626183, + "loss/reg": 1.3558125495910645, + "step": 1274 + }, + { + "epoch": 0.01275, + "grad_norm": 0.389067679643631, + "grad_norm_var": 0.00043848758916393096, + "learning_rate": 5e-05, + "loss": 0.1952, + "loss/crossentropy": 2.7338613271713257, + "loss/hidden": 0.0, + "loss/logits": 0.19517629593610764, + "loss/reg": 1.3549078702926636, + "step": 1275 + }, + { + "epoch": 0.01276, + "grad_norm": 0.45303645730018616, + "grad_norm_var": 0.0006863072728836646, + "learning_rate": 5e-05, + "loss": 0.2122, + "loss/crossentropy": 2.838892161846161, + "loss/hidden": 0.0, + "loss/logits": 0.21219918876886368, + "loss/reg": 1.3540998697280884, + "step": 1276 + }, + { + "epoch": 0.01277, + "grad_norm": 0.36253467202186584, + "grad_norm_var": 0.0007280970613809353, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.8312386870384216, + "loss/hidden": 0.0, + "loss/logits": 0.1846158690750599, + "loss/reg": 1.353420615196228, + "step": 1277 + }, + { + "epoch": 0.01278, + "grad_norm": 0.37399822473526, + "grad_norm_var": 0.0006790970538206727, + "learning_rate": 5e-05, + "loss": 0.1879, + "loss/crossentropy": 2.8536208868026733, + "loss/hidden": 0.0, + "loss/logits": 0.18789278343319893, + "loss/reg": 1.352396845817566, + "step": 1278 + }, + { + "epoch": 0.01279, + "grad_norm": 0.3941795825958252, + "grad_norm_var": 0.0006688551439072026, + "learning_rate": 5e-05, + "loss": 0.193, + "loss/crossentropy": 2.7190252542495728, + "loss/hidden": 0.0, + "loss/logits": 0.1929876208305359, + "loss/reg": 1.3508316278457642, + "step": 1279 + }, + { + "epoch": 0.0128, + "grad_norm": 0.38698315620422363, + "grad_norm_var": 0.0005331698153800672, + "learning_rate": 5e-05, + "loss": 0.187, + "loss/crossentropy": 2.773389518260956, + "loss/hidden": 0.0, + "loss/logits": 0.18697837367653847, + "loss/reg": 1.34941565990448, + "step": 1280 + }, + { + "epoch": 0.01281, + "grad_norm": 0.43258407711982727, + "grad_norm_var": 0.0006771733589316486, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.7498987913131714, + "loss/hidden": 0.0, + "loss/logits": 0.19635442271828651, + "loss/reg": 1.348366379737854, + "step": 1281 + }, + { + "epoch": 0.01282, + "grad_norm": 0.4235626459121704, + "grad_norm_var": 0.0006787048817335125, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.7566969990730286, + "loss/hidden": 0.0, + "loss/logits": 0.1991618350148201, + "loss/reg": 1.3475583791732788, + "step": 1282 + }, + { + "epoch": 0.01283, + "grad_norm": 0.3940984606742859, + "grad_norm_var": 0.0006341984147311871, + "learning_rate": 5e-05, + "loss": 0.2041, + "loss/crossentropy": 2.8078003525733948, + "loss/hidden": 0.0, + "loss/logits": 0.20406979322433472, + "loss/reg": 1.3463932275772095, + "step": 1283 + }, + { + "epoch": 0.01284, + "grad_norm": 0.384031742811203, + "grad_norm_var": 0.0006380904039433551, + "learning_rate": 5e-05, + "loss": 0.1879, + "loss/crossentropy": 2.9492968916893005, + "loss/hidden": 0.0, + "loss/logits": 0.18790540099143982, + "loss/reg": 1.3449573516845703, + "step": 1284 + }, + { + "epoch": 0.01285, + "grad_norm": 0.3790985345840454, + "grad_norm_var": 0.0006495060301425178, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.795994758605957, + "loss/hidden": 0.0, + "loss/logits": 0.1865733079612255, + "loss/reg": 1.343949317932129, + "step": 1285 + }, + { + "epoch": 0.01286, + "grad_norm": 0.3845146894454956, + "grad_norm_var": 0.0005980594021735749, + "learning_rate": 5e-05, + "loss": 0.188, + "loss/crossentropy": 2.7280075550079346, + "loss/hidden": 0.0, + "loss/logits": 0.18804579600691795, + "loss/reg": 1.3426213264465332, + "step": 1286 + }, + { + "epoch": 0.01287, + "grad_norm": 0.5705528259277344, + "grad_norm_var": 0.002573541618339078, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.4836789965629578, + "loss/hidden": 0.0, + "loss/logits": 0.19577286019921303, + "loss/reg": 1.3411113023757935, + "step": 1287 + }, + { + "epoch": 0.01288, + "grad_norm": 0.39455586671829224, + "grad_norm_var": 0.002513614459891574, + "learning_rate": 5e-05, + "loss": 0.1889, + "loss/crossentropy": 2.7349933981895447, + "loss/hidden": 0.0, + "loss/logits": 0.1888500116765499, + "loss/reg": 1.3391989469528198, + "step": 1288 + }, + { + "epoch": 0.01289, + "grad_norm": 0.3759537637233734, + "grad_norm_var": 0.0025580404579436185, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.812786042690277, + "loss/hidden": 0.0, + "loss/logits": 0.17579347640275955, + "loss/reg": 1.3369227647781372, + "step": 1289 + }, + { + "epoch": 0.0129, + "grad_norm": 0.40224215388298035, + "grad_norm_var": 0.0024575000110041286, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 2.8621543049812317, + "loss/hidden": 0.0, + "loss/logits": 0.19386817887425423, + "loss/reg": 1.3354421854019165, + "step": 1290 + }, + { + "epoch": 0.01291, + "grad_norm": 0.3668369650840759, + "grad_norm_var": 0.002539502080659517, + "learning_rate": 5e-05, + "loss": 0.1786, + "loss/crossentropy": 2.894817292690277, + "loss/hidden": 0.0, + "loss/logits": 0.17856686189770699, + "loss/reg": 1.333516240119934, + "step": 1291 + }, + { + "epoch": 0.01292, + "grad_norm": 0.3802977502346039, + "grad_norm_var": 0.0024035539250668444, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.795164704322815, + "loss/hidden": 0.0, + "loss/logits": 0.1828628107905388, + "loss/reg": 1.3324875831604004, + "step": 1292 + }, + { + "epoch": 0.01293, + "grad_norm": 1.061220407485962, + "grad_norm_var": 0.02938838453965478, + "learning_rate": 5e-05, + "loss": 0.2468, + "loss/crossentropy": 2.8882861137390137, + "loss/hidden": 0.0, + "loss/logits": 0.24676746502518654, + "loss/reg": 1.331701636314392, + "step": 1293 + }, + { + "epoch": 0.01294, + "grad_norm": 0.39137813448905945, + "grad_norm_var": 0.029244943809875072, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.9083763360977173, + "loss/hidden": 0.0, + "loss/logits": 0.18574338406324387, + "loss/reg": 1.3301653861999512, + "step": 1294 + }, + { + "epoch": 0.01295, + "grad_norm": 0.3713628649711609, + "grad_norm_var": 0.02943248635611706, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.8202906250953674, + "loss/hidden": 0.0, + "loss/logits": 0.17676304280757904, + "loss/reg": 1.3285754919052124, + "step": 1295 + }, + { + "epoch": 0.01296, + "grad_norm": 0.39226874709129333, + "grad_norm_var": 0.029394258249184086, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.8032939434051514, + "loss/hidden": 0.0, + "loss/logits": 0.17801367118954659, + "loss/reg": 1.3272022008895874, + "step": 1296 + }, + { + "epoch": 0.01297, + "grad_norm": 0.40169134736061096, + "grad_norm_var": 0.02950107240310749, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.6777111291885376, + "loss/hidden": 0.0, + "loss/logits": 0.18530849367380142, + "loss/reg": 1.3254737854003906, + "step": 1297 + }, + { + "epoch": 0.01298, + "grad_norm": 0.408547967672348, + "grad_norm_var": 0.029552281796611728, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.762950897216797, + "loss/hidden": 0.0, + "loss/logits": 0.19006695970892906, + "loss/reg": 1.3238102197647095, + "step": 1298 + }, + { + "epoch": 0.01299, + "grad_norm": 0.40359973907470703, + "grad_norm_var": 0.029498297332966376, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.7999081015586853, + "loss/hidden": 0.0, + "loss/logits": 0.17994213849306107, + "loss/reg": 1.3228790760040283, + "step": 1299 + }, + { + "epoch": 0.013, + "grad_norm": 0.42354899644851685, + "grad_norm_var": 0.029291732015891307, + "learning_rate": 5e-05, + "loss": 0.1883, + "loss/crossentropy": 2.711639881134033, + "loss/hidden": 0.0, + "loss/logits": 0.18831219896674156, + "loss/reg": 1.321997880935669, + "step": 1300 + }, + { + "epoch": 0.01301, + "grad_norm": 0.37877607345581055, + "grad_norm_var": 0.029294538805312784, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.7869099378585815, + "loss/hidden": 0.0, + "loss/logits": 0.19013599306344986, + "loss/reg": 1.3209624290466309, + "step": 1301 + }, + { + "epoch": 0.01302, + "grad_norm": 0.8780012726783752, + "grad_norm_var": 0.04058730529278964, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.9132827520370483, + "loss/hidden": 0.0, + "loss/logits": 0.2086055651307106, + "loss/reg": 1.320249319076538, + "step": 1302 + }, + { + "epoch": 0.01303, + "grad_norm": 0.4919426441192627, + "grad_norm_var": 0.03997255141455191, + "learning_rate": 5e-05, + "loss": 0.1929, + "loss/crossentropy": 2.873347520828247, + "loss/hidden": 0.0, + "loss/logits": 0.19293329864740372, + "loss/reg": 1.3193020820617676, + "step": 1303 + }, + { + "epoch": 0.01304, + "grad_norm": 0.4724264442920685, + "grad_norm_var": 0.03956677984298077, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.660923659801483, + "loss/hidden": 0.0, + "loss/logits": 0.17619645223021507, + "loss/reg": 1.3196465969085693, + "step": 1304 + }, + { + "epoch": 0.01305, + "grad_norm": 0.42146193981170654, + "grad_norm_var": 0.03909519236833046, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.8432238698005676, + "loss/hidden": 0.0, + "loss/logits": 0.19225074350833893, + "loss/reg": 1.3197365999221802, + "step": 1305 + }, + { + "epoch": 0.01306, + "grad_norm": 0.44016242027282715, + "grad_norm_var": 0.03880278698594292, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.9219031929969788, + "loss/hidden": 0.0, + "loss/logits": 0.1823579967021942, + "loss/reg": 1.3191851377487183, + "step": 1306 + }, + { + "epoch": 0.01307, + "grad_norm": 0.40534767508506775, + "grad_norm_var": 0.0383132831443539, + "learning_rate": 5e-05, + "loss": 0.1821, + "loss/crossentropy": 2.8344457149505615, + "loss/hidden": 0.0, + "loss/logits": 0.18211128562688828, + "loss/reg": 1.3193784952163696, + "step": 1307 + }, + { + "epoch": 0.01308, + "grad_norm": 0.4818474054336548, + "grad_norm_var": 0.037572268534637604, + "learning_rate": 5e-05, + "loss": 0.1931, + "loss/crossentropy": 3.001499652862549, + "loss/hidden": 0.0, + "loss/logits": 0.19311653822660446, + "loss/reg": 1.319129467010498, + "step": 1308 + }, + { + "epoch": 0.01309, + "grad_norm": 0.41809335350990295, + "grad_norm_var": 0.014352758274943181, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.908874988555908, + "loss/hidden": 0.0, + "loss/logits": 0.18662643805146217, + "loss/reg": 1.3184683322906494, + "step": 1309 + }, + { + "epoch": 0.0131, + "grad_norm": 0.4113363027572632, + "grad_norm_var": 0.01422490614724207, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.778001129627228, + "loss/hidden": 0.0, + "loss/logits": 0.19404159486293793, + "loss/reg": 1.3183917999267578, + "step": 1310 + }, + { + "epoch": 0.01311, + "grad_norm": 0.4028130769729614, + "grad_norm_var": 0.013956863128374053, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.892741084098816, + "loss/hidden": 0.0, + "loss/logits": 0.18560755625367165, + "loss/reg": 1.318432092666626, + "step": 1311 + }, + { + "epoch": 0.01312, + "grad_norm": 0.5507920384407043, + "grad_norm_var": 0.014265137075122303, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.7685837745666504, + "loss/hidden": 0.0, + "loss/logits": 0.19431418180465698, + "loss/reg": 1.3180075883865356, + "step": 1312 + }, + { + "epoch": 0.01313, + "grad_norm": 0.4126599431037903, + "grad_norm_var": 0.014184603572884372, + "learning_rate": 5e-05, + "loss": 0.1876, + "loss/crossentropy": 2.769340455532074, + "loss/hidden": 0.0, + "loss/logits": 0.18763459101319313, + "loss/reg": 1.316887378692627, + "step": 1313 + }, + { + "epoch": 0.01314, + "grad_norm": 0.4310034215450287, + "grad_norm_var": 0.014054329397543756, + "learning_rate": 5e-05, + "loss": 0.2148, + "loss/crossentropy": 2.7395371794700623, + "loss/hidden": 0.0, + "loss/logits": 0.21481387317180634, + "loss/reg": 1.316412091255188, + "step": 1314 + }, + { + "epoch": 0.01315, + "grad_norm": 0.3769264221191406, + "grad_norm_var": 0.01431356443074178, + "learning_rate": 5e-05, + "loss": 0.1931, + "loss/crossentropy": 2.707428455352783, + "loss/hidden": 0.0, + "loss/logits": 0.19313662126660347, + "loss/reg": 1.315134048461914, + "step": 1315 + }, + { + "epoch": 0.01316, + "grad_norm": 0.43392711877822876, + "grad_norm_var": 0.014266644976929577, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.646990716457367, + "loss/hidden": 0.0, + "loss/logits": 0.18968894705176353, + "loss/reg": 1.3136744499206543, + "step": 1316 + }, + { + "epoch": 0.01317, + "grad_norm": 0.42039671540260315, + "grad_norm_var": 0.013907685821175575, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.647752821445465, + "loss/hidden": 0.0, + "loss/logits": 0.20992901176214218, + "loss/reg": 1.3125907182693481, + "step": 1317 + }, + { + "epoch": 0.01318, + "grad_norm": 0.36365634202957153, + "grad_norm_var": 0.002157925123658914, + "learning_rate": 5e-05, + "loss": 0.1811, + "loss/crossentropy": 2.752347230911255, + "loss/hidden": 0.0, + "loss/logits": 0.18107367679476738, + "loss/reg": 1.3118373155593872, + "step": 1318 + }, + { + "epoch": 0.01319, + "grad_norm": 0.4149968922138214, + "grad_norm_var": 0.0019276034667143301, + "learning_rate": 5e-05, + "loss": 0.1831, + "loss/crossentropy": 2.833215117454529, + "loss/hidden": 0.0, + "loss/logits": 0.18308308348059654, + "loss/reg": 1.3113209009170532, + "step": 1319 + }, + { + "epoch": 0.0132, + "grad_norm": 0.4174114465713501, + "grad_norm_var": 0.0017954009995525433, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.8773661851882935, + "loss/hidden": 0.0, + "loss/logits": 0.17799117416143417, + "loss/reg": 1.3101435899734497, + "step": 1320 + }, + { + "epoch": 0.01321, + "grad_norm": 0.37340742349624634, + "grad_norm_var": 0.0019635318784405127, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.759882092475891, + "loss/hidden": 0.0, + "loss/logits": 0.17917311936616898, + "loss/reg": 1.3096390962600708, + "step": 1321 + }, + { + "epoch": 0.01322, + "grad_norm": 0.36545029282569885, + "grad_norm_var": 0.0021332032625857996, + "learning_rate": 5e-05, + "loss": 0.1815, + "loss/crossentropy": 2.7518441677093506, + "loss/hidden": 0.0, + "loss/logits": 0.18147645145654678, + "loss/reg": 1.3092446327209473, + "step": 1322 + }, + { + "epoch": 0.01323, + "grad_norm": 0.38353613018989563, + "grad_norm_var": 0.0021982906675894606, + "learning_rate": 5e-05, + "loss": 0.1787, + "loss/crossentropy": 2.895625650882721, + "loss/hidden": 0.0, + "loss/logits": 0.17865781486034393, + "loss/reg": 1.3078157901763916, + "step": 1323 + }, + { + "epoch": 0.01324, + "grad_norm": 0.3809302747249603, + "grad_norm_var": 0.0019506857096142267, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.7681140899658203, + "loss/hidden": 0.0, + "loss/logits": 0.1827395148575306, + "loss/reg": 1.3067607879638672, + "step": 1324 + }, + { + "epoch": 0.01325, + "grad_norm": 0.45117852091789246, + "grad_norm_var": 0.0020555368006153394, + "learning_rate": 5e-05, + "loss": 0.2006, + "loss/crossentropy": 2.8935614228248596, + "loss/hidden": 0.0, + "loss/logits": 0.20061300694942474, + "loss/reg": 1.3059515953063965, + "step": 1325 + }, + { + "epoch": 0.01326, + "grad_norm": 0.3950449228286743, + "grad_norm_var": 0.0020733523569008445, + "learning_rate": 5e-05, + "loss": 0.1918, + "loss/crossentropy": 2.716562330722809, + "loss/hidden": 0.0, + "loss/logits": 0.19175706058740616, + "loss/reg": 1.3049736022949219, + "step": 1326 + }, + { + "epoch": 0.01327, + "grad_norm": 0.39624467492103577, + "grad_norm_var": 0.002083116547425116, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.9077075123786926, + "loss/hidden": 0.0, + "loss/logits": 0.18224596232175827, + "loss/reg": 1.3039295673370361, + "step": 1327 + }, + { + "epoch": 0.01328, + "grad_norm": 0.3917173743247986, + "grad_norm_var": 0.0006884956392181489, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.8372305631637573, + "loss/hidden": 0.0, + "loss/logits": 0.1963268630206585, + "loss/reg": 1.3027836084365845, + "step": 1328 + }, + { + "epoch": 0.01329, + "grad_norm": 0.39020252227783203, + "grad_norm_var": 0.0006836971401257201, + "learning_rate": 5e-05, + "loss": 0.1848, + "loss/crossentropy": 2.6796911358833313, + "loss/hidden": 0.0, + "loss/logits": 0.18476086854934692, + "loss/reg": 1.3020461797714233, + "step": 1329 + }, + { + "epoch": 0.0133, + "grad_norm": 0.38087961077690125, + "grad_norm_var": 0.0006276855907307866, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.7201030254364014, + "loss/hidden": 0.0, + "loss/logits": 0.1847059205174446, + "loss/reg": 1.300846815109253, + "step": 1330 + }, + { + "epoch": 0.01331, + "grad_norm": 0.3877928555011749, + "grad_norm_var": 0.0006074390999117995, + "learning_rate": 5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.839367628097534, + "loss/hidden": 0.0, + "loss/logits": 0.18426093831658363, + "loss/reg": 1.3001853227615356, + "step": 1331 + }, + { + "epoch": 0.01332, + "grad_norm": 0.3711041212081909, + "grad_norm_var": 0.0005420569547143316, + "learning_rate": 5e-05, + "loss": 0.1696, + "loss/crossentropy": 2.7172279357910156, + "loss/hidden": 0.0, + "loss/logits": 0.16955699026584625, + "loss/reg": 1.2990162372589111, + "step": 1332 + }, + { + "epoch": 0.01333, + "grad_norm": 0.6289080381393433, + "grad_norm_var": 0.00402807478378359, + "learning_rate": 5e-05, + "loss": 0.1972, + "loss/crossentropy": 2.678100347518921, + "loss/hidden": 0.0, + "loss/logits": 0.19720054045319557, + "loss/reg": 1.2978439331054688, + "step": 1333 + }, + { + "epoch": 0.01334, + "grad_norm": 0.4056476652622223, + "grad_norm_var": 0.003902441977409969, + "learning_rate": 5e-05, + "loss": 0.1864, + "loss/crossentropy": 2.7936259508132935, + "loss/hidden": 0.0, + "loss/logits": 0.1864231936633587, + "loss/reg": 1.2967522144317627, + "step": 1334 + }, + { + "epoch": 0.01335, + "grad_norm": 0.3485678732395172, + "grad_norm_var": 0.004119842087168672, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.693207561969757, + "loss/hidden": 0.0, + "loss/logits": 0.1709160953760147, + "loss/reg": 1.2951072454452515, + "step": 1335 + }, + { + "epoch": 0.01336, + "grad_norm": 0.40611714124679565, + "grad_norm_var": 0.0041079969860560875, + "learning_rate": 5e-05, + "loss": 0.183, + "loss/crossentropy": 2.7255959510803223, + "loss/hidden": 0.0, + "loss/logits": 0.18296395614743233, + "loss/reg": 1.294013500213623, + "step": 1336 + }, + { + "epoch": 0.01337, + "grad_norm": 0.3974861204624176, + "grad_norm_var": 0.004047475093204926, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.8238306641578674, + "loss/hidden": 0.0, + "loss/logits": 0.18465031683444977, + "loss/reg": 1.2925174236297607, + "step": 1337 + }, + { + "epoch": 0.01338, + "grad_norm": 0.4148518443107605, + "grad_norm_var": 0.003939165560142958, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.8640605807304382, + "loss/hidden": 0.0, + "loss/logits": 0.20043480768799782, + "loss/reg": 1.2910975217819214, + "step": 1338 + }, + { + "epoch": 0.01339, + "grad_norm": 0.4171488881111145, + "grad_norm_var": 0.0038995204542005596, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.6302874088287354, + "loss/hidden": 0.0, + "loss/logits": 0.19046159461140633, + "loss/reg": 1.2900209426879883, + "step": 1339 + }, + { + "epoch": 0.0134, + "grad_norm": 0.39216965436935425, + "grad_norm_var": 0.0038634942425959514, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 2.7624495029449463, + "loss/hidden": 0.0, + "loss/logits": 0.18672016263008118, + "loss/reg": 1.2881990671157837, + "step": 1340 + }, + { + "epoch": 0.01341, + "grad_norm": 0.41747212409973145, + "grad_norm_var": 0.003753668540790267, + "learning_rate": 5e-05, + "loss": 0.1779, + "loss/crossentropy": 2.8837625980377197, + "loss/hidden": 0.0, + "loss/logits": 0.1778956986963749, + "loss/reg": 1.2869876623153687, + "step": 1341 + }, + { + "epoch": 0.01342, + "grad_norm": 0.3829328715801239, + "grad_norm_var": 0.0037851070907451876, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.7400670051574707, + "loss/hidden": 0.0, + "loss/logits": 0.19326373934745789, + "loss/reg": 1.2858026027679443, + "step": 1342 + }, + { + "epoch": 0.01343, + "grad_norm": 0.4228571355342865, + "grad_norm_var": 0.0037873835369272063, + "learning_rate": 5e-05, + "loss": 0.2003, + "loss/crossentropy": 2.9916569590568542, + "loss/hidden": 0.0, + "loss/logits": 0.20030486211180687, + "loss/reg": 1.2851616144180298, + "step": 1343 + }, + { + "epoch": 0.01344, + "grad_norm": 0.42717915773391724, + "grad_norm_var": 0.0037807597262457747, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.8114970922470093, + "loss/hidden": 0.0, + "loss/logits": 0.19114911556243896, + "loss/reg": 1.2845793962478638, + "step": 1344 + }, + { + "epoch": 0.01345, + "grad_norm": 0.3848547339439392, + "grad_norm_var": 0.0037980591833326137, + "learning_rate": 5e-05, + "loss": 0.1756, + "loss/crossentropy": 2.7906049489974976, + "loss/hidden": 0.0, + "loss/logits": 0.17564281448721886, + "loss/reg": 1.2834604978561401, + "step": 1345 + }, + { + "epoch": 0.01346, + "grad_norm": 0.5220564603805542, + "grad_norm_var": 0.004465037808005083, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.8656685948371887, + "loss/hidden": 0.0, + "loss/logits": 0.19348178058862686, + "loss/reg": 1.2817208766937256, + "step": 1346 + }, + { + "epoch": 0.01347, + "grad_norm": 0.4464890658855438, + "grad_norm_var": 0.004424811622567286, + "learning_rate": 5e-05, + "loss": 0.1937, + "loss/crossentropy": 2.7914677262306213, + "loss/hidden": 0.0, + "loss/logits": 0.19365255907177925, + "loss/reg": 1.2804713249206543, + "step": 1347 + }, + { + "epoch": 0.01348, + "grad_norm": 0.3959408700466156, + "grad_norm_var": 0.004287815978112041, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.793258845806122, + "loss/hidden": 0.0, + "loss/logits": 0.19866468757390976, + "loss/reg": 1.2791686058044434, + "step": 1348 + }, + { + "epoch": 0.01349, + "grad_norm": 0.42396214604377747, + "grad_norm_var": 0.001359216418765108, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.9762020111083984, + "loss/hidden": 0.0, + "loss/logits": 0.17010829970240593, + "loss/reg": 1.2777467966079712, + "step": 1349 + }, + { + "epoch": 0.0135, + "grad_norm": 0.42238879203796387, + "grad_norm_var": 0.0013606376487376281, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.819011688232422, + "loss/hidden": 0.0, + "loss/logits": 0.1885228455066681, + "loss/reg": 1.276718258857727, + "step": 1350 + }, + { + "epoch": 0.01351, + "grad_norm": 0.4367927610874176, + "grad_norm_var": 0.0010785369168515246, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.7756308913230896, + "loss/hidden": 0.0, + "loss/logits": 0.2036239691078663, + "loss/reg": 1.2753915786743164, + "step": 1351 + }, + { + "epoch": 0.01352, + "grad_norm": 0.39300453662872314, + "grad_norm_var": 0.0011125389978846934, + "learning_rate": 5e-05, + "loss": 0.1899, + "loss/crossentropy": 2.837214469909668, + "loss/hidden": 0.0, + "loss/logits": 0.18992070853710175, + "loss/reg": 1.2738478183746338, + "step": 1352 + }, + { + "epoch": 0.01353, + "grad_norm": 0.3900686204433441, + "grad_norm_var": 0.0011368585379263293, + "learning_rate": 5e-05, + "loss": 0.1934, + "loss/crossentropy": 2.6612390875816345, + "loss/hidden": 0.0, + "loss/logits": 0.1934347301721573, + "loss/reg": 1.2721461057662964, + "step": 1353 + }, + { + "epoch": 0.01354, + "grad_norm": 0.37273284792900085, + "grad_norm_var": 0.0012661753083164603, + "learning_rate": 5e-05, + "loss": 0.1694, + "loss/crossentropy": 2.7896453142166138, + "loss/hidden": 0.0, + "loss/logits": 0.16944236680865288, + "loss/reg": 1.2709778547286987, + "step": 1354 + }, + { + "epoch": 0.01355, + "grad_norm": 0.37389302253723145, + "grad_norm_var": 0.001373625563106732, + "learning_rate": 5e-05, + "loss": 0.1727, + "loss/crossentropy": 2.920479476451874, + "loss/hidden": 0.0, + "loss/logits": 0.1727452278137207, + "loss/reg": 1.2689437866210938, + "step": 1355 + }, + { + "epoch": 0.01356, + "grad_norm": 0.42584228515625, + "grad_norm_var": 0.0013518686663250771, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.99138343334198, + "loss/hidden": 0.0, + "loss/logits": 0.19352904707193375, + "loss/reg": 1.2677642107009888, + "step": 1356 + }, + { + "epoch": 0.01357, + "grad_norm": 0.4118824601173401, + "grad_norm_var": 0.0013519076041731489, + "learning_rate": 5e-05, + "loss": 0.184, + "loss/crossentropy": 2.8795509934425354, + "loss/hidden": 0.0, + "loss/logits": 0.18404123187065125, + "loss/reg": 1.2662688493728638, + "step": 1357 + }, + { + "epoch": 0.01358, + "grad_norm": 0.4201810657978058, + "grad_norm_var": 0.0012815735880918075, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.915390193462372, + "loss/hidden": 0.0, + "loss/logits": 0.2087855376303196, + "loss/reg": 1.2648541927337646, + "step": 1358 + }, + { + "epoch": 0.01359, + "grad_norm": 0.48833900690078735, + "grad_norm_var": 0.0016017265945368526, + "learning_rate": 5e-05, + "loss": 0.185, + "loss/crossentropy": 2.7243436574935913, + "loss/hidden": 0.0, + "loss/logits": 0.18500912189483643, + "loss/reg": 1.2632609605789185, + "step": 1359 + }, + { + "epoch": 0.0136, + "grad_norm": 0.41109177470207214, + "grad_norm_var": 0.0016045950663166645, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.761650860309601, + "loss/hidden": 0.0, + "loss/logits": 0.18839628621935844, + "loss/reg": 1.261253833770752, + "step": 1360 + }, + { + "epoch": 0.01361, + "grad_norm": 0.38177841901779175, + "grad_norm_var": 0.0016195899755527302, + "learning_rate": 5e-05, + "loss": 0.1966, + "loss/crossentropy": 2.728949189186096, + "loss/hidden": 0.0, + "loss/logits": 0.19656601920723915, + "loss/reg": 1.2589702606201172, + "step": 1361 + }, + { + "epoch": 0.01362, + "grad_norm": 0.4108126759529114, + "grad_norm_var": 0.0008759893825818062, + "learning_rate": 5e-05, + "loss": 0.1738, + "loss/crossentropy": 2.734529137611389, + "loss/hidden": 0.0, + "loss/logits": 0.1738160066306591, + "loss/reg": 1.2575434446334839, + "step": 1362 + }, + { + "epoch": 0.01363, + "grad_norm": 0.4372514486312866, + "grad_norm_var": 0.0008398593237139059, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.65754634141922, + "loss/hidden": 0.0, + "loss/logits": 0.1968196965754032, + "loss/reg": 1.2563024759292603, + "step": 1363 + }, + { + "epoch": 0.01364, + "grad_norm": 0.720674991607666, + "grad_norm_var": 0.006724574980634479, + "learning_rate": 5e-05, + "loss": 0.2195, + "loss/crossentropy": 2.953807055950165, + "loss/hidden": 0.0, + "loss/logits": 0.2195480540394783, + "loss/reg": 1.255125641822815, + "step": 1364 + }, + { + "epoch": 0.01365, + "grad_norm": 0.39298033714294434, + "grad_norm_var": 0.0068200160138982965, + "learning_rate": 5e-05, + "loss": 0.1835, + "loss/crossentropy": 2.741532266139984, + "loss/hidden": 0.0, + "loss/logits": 0.18346120789647102, + "loss/reg": 1.253417730331421, + "step": 1365 + }, + { + "epoch": 0.01366, + "grad_norm": 0.42354610562324524, + "grad_norm_var": 0.006818831556282904, + "learning_rate": 5e-05, + "loss": 0.1736, + "loss/crossentropy": 2.758584976196289, + "loss/hidden": 0.0, + "loss/logits": 0.1735590063035488, + "loss/reg": 1.251852035522461, + "step": 1366 + }, + { + "epoch": 0.01367, + "grad_norm": 0.4350874722003937, + "grad_norm_var": 0.006817623328532463, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.854390501976013, + "loss/hidden": 0.0, + "loss/logits": 0.19326837360858917, + "loss/reg": 1.2506519556045532, + "step": 1367 + }, + { + "epoch": 0.01368, + "grad_norm": 0.38499191403388977, + "grad_norm_var": 0.006861772154808465, + "learning_rate": 5e-05, + "loss": 0.1821, + "loss/crossentropy": 2.8086124658584595, + "loss/hidden": 0.0, + "loss/logits": 0.18208570405840874, + "loss/reg": 1.2490259408950806, + "step": 1368 + }, + { + "epoch": 0.01369, + "grad_norm": 0.5313814878463745, + "grad_norm_var": 0.007356119875327716, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.9096702933311462, + "loss/hidden": 0.0, + "loss/logits": 0.2031627707183361, + "loss/reg": 1.2478126287460327, + "step": 1369 + }, + { + "epoch": 0.0137, + "grad_norm": 0.4552428722381592, + "grad_norm_var": 0.007053640487362378, + "learning_rate": 5e-05, + "loss": 0.18, + "loss/crossentropy": 2.741479814052582, + "loss/hidden": 0.0, + "loss/logits": 0.1799832098186016, + "loss/reg": 1.2461893558502197, + "step": 1370 + }, + { + "epoch": 0.01371, + "grad_norm": 0.4122316539287567, + "grad_norm_var": 0.006786819829008536, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 3.0401941537857056, + "loss/hidden": 0.0, + "loss/logits": 0.18025413900613785, + "loss/reg": 1.24508535861969, + "step": 1371 + }, + { + "epoch": 0.01372, + "grad_norm": 0.3969644606113434, + "grad_norm_var": 0.006918315747275635, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.7754027247428894, + "loss/hidden": 0.0, + "loss/logits": 0.18864772096276283, + "loss/reg": 1.2440794706344604, + "step": 1372 + }, + { + "epoch": 0.01373, + "grad_norm": 0.3795020282268524, + "grad_norm_var": 0.00712532709277743, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.8709006309509277, + "loss/hidden": 0.0, + "loss/logits": 0.17753225937485695, + "loss/reg": 1.2425460815429688, + "step": 1373 + }, + { + "epoch": 0.01374, + "grad_norm": 0.34354275465011597, + "grad_norm_var": 0.00772179540161035, + "learning_rate": 5e-05, + "loss": 0.1637, + "loss/crossentropy": 2.771426022052765, + "loss/hidden": 0.0, + "loss/logits": 0.16372710466384888, + "loss/reg": 1.2412630319595337, + "step": 1374 + }, + { + "epoch": 0.01375, + "grad_norm": 0.39372017979621887, + "grad_norm_var": 0.007644236740270749, + "learning_rate": 5e-05, + "loss": 0.1984, + "loss/crossentropy": 2.7911269068717957, + "loss/hidden": 0.0, + "loss/logits": 0.19837456196546555, + "loss/reg": 1.2404334545135498, + "step": 1375 + }, + { + "epoch": 0.01376, + "grad_norm": 0.37725409865379333, + "grad_norm_var": 0.007809791729928861, + "learning_rate": 5e-05, + "loss": 0.1804, + "loss/crossentropy": 2.705751419067383, + "loss/hidden": 0.0, + "loss/logits": 0.18037205934524536, + "loss/reg": 1.239854335784912, + "step": 1376 + }, + { + "epoch": 0.01377, + "grad_norm": 0.3886111080646515, + "grad_norm_var": 0.007768951436301293, + "learning_rate": 5e-05, + "loss": 0.1887, + "loss/crossentropy": 2.786842703819275, + "loss/hidden": 0.0, + "loss/logits": 0.18867934867739677, + "loss/reg": 1.2384082078933716, + "step": 1377 + }, + { + "epoch": 0.01378, + "grad_norm": 0.37910887598991394, + "grad_norm_var": 0.007913883052354128, + "learning_rate": 5e-05, + "loss": 0.1883, + "loss/crossentropy": 2.793938934803009, + "loss/hidden": 0.0, + "loss/logits": 0.18826347962021828, + "loss/reg": 1.2371619939804077, + "step": 1378 + }, + { + "epoch": 0.01379, + "grad_norm": 0.3881366550922394, + "grad_norm_var": 0.008005739815983074, + "learning_rate": 5e-05, + "loss": 0.1788, + "loss/crossentropy": 2.756206750869751, + "loss/hidden": 0.0, + "loss/logits": 0.17876296862959862, + "loss/reg": 1.2357430458068848, + "step": 1379 + }, + { + "epoch": 0.0138, + "grad_norm": 0.39993518590927124, + "grad_norm_var": 0.0017986913450700728, + "learning_rate": 5e-05, + "loss": 0.1854, + "loss/crossentropy": 2.8165774941444397, + "loss/hidden": 0.0, + "loss/logits": 0.1853647120296955, + "loss/reg": 1.2344536781311035, + "step": 1380 + }, + { + "epoch": 0.01381, + "grad_norm": 0.37117767333984375, + "grad_norm_var": 0.0018637489993303094, + "learning_rate": 5e-05, + "loss": 0.1809, + "loss/crossentropy": 2.833341598510742, + "loss/hidden": 0.0, + "loss/logits": 0.18087629228830338, + "loss/reg": 1.2332347631454468, + "step": 1381 + }, + { + "epoch": 0.01382, + "grad_norm": 0.44362714886665344, + "grad_norm_var": 0.0019418828305195295, + "learning_rate": 5e-05, + "loss": 0.206, + "loss/crossentropy": 2.8209856152534485, + "loss/hidden": 0.0, + "loss/logits": 0.20602793619036674, + "loss/reg": 1.232313632965088, + "step": 1382 + }, + { + "epoch": 0.01383, + "grad_norm": 0.4029310345649719, + "grad_norm_var": 0.0018776474781246222, + "learning_rate": 5e-05, + "loss": 0.203, + "loss/crossentropy": 2.745832860469818, + "loss/hidden": 0.0, + "loss/logits": 0.20303602144122124, + "loss/reg": 1.2314114570617676, + "step": 1383 + }, + { + "epoch": 0.01384, + "grad_norm": 0.4554927945137024, + "grad_norm_var": 0.002018806747643931, + "learning_rate": 5e-05, + "loss": 0.2238, + "loss/crossentropy": 2.7095658779144287, + "loss/hidden": 0.0, + "loss/logits": 0.22382865101099014, + "loss/reg": 1.2301923036575317, + "step": 1384 + }, + { + "epoch": 0.01385, + "grad_norm": 0.4818246066570282, + "grad_norm_var": 0.001353271385290237, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.8538730144500732, + "loss/hidden": 0.0, + "loss/logits": 0.17802419140934944, + "loss/reg": 1.2288519144058228, + "step": 1385 + }, + { + "epoch": 0.01386, + "grad_norm": 0.47836628556251526, + "grad_norm_var": 0.001543655778380985, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.9244675636291504, + "loss/hidden": 0.0, + "loss/logits": 0.2098112478852272, + "loss/reg": 1.2279032468795776, + "step": 1386 + }, + { + "epoch": 0.01387, + "grad_norm": 0.4103398323059082, + "grad_norm_var": 0.0015422512386726428, + "learning_rate": 5e-05, + "loss": 0.2022, + "loss/crossentropy": 2.692758023738861, + "loss/hidden": 0.0, + "loss/logits": 0.20222963392734528, + "loss/reg": 1.2269119024276733, + "step": 1387 + }, + { + "epoch": 0.01388, + "grad_norm": 0.4230862557888031, + "grad_norm_var": 0.0015546177559936443, + "learning_rate": 5e-05, + "loss": 0.1805, + "loss/crossentropy": 2.6759063601493835, + "loss/hidden": 0.0, + "loss/logits": 0.18045221269130707, + "loss/reg": 1.225614070892334, + "step": 1388 + }, + { + "epoch": 0.01389, + "grad_norm": 0.39890357851982117, + "grad_norm_var": 0.0015062573807308984, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.7969236969947815, + "loss/hidden": 0.0, + "loss/logits": 0.18138685822486877, + "loss/reg": 1.2248356342315674, + "step": 1389 + }, + { + "epoch": 0.0139, + "grad_norm": 0.38309741020202637, + "grad_norm_var": 0.0012614423849095946, + "learning_rate": 5e-05, + "loss": 0.1879, + "loss/crossentropy": 2.840832471847534, + "loss/hidden": 0.0, + "loss/logits": 0.18787847086787224, + "loss/reg": 1.2237766981124878, + "step": 1390 + }, + { + "epoch": 0.01391, + "grad_norm": 0.3969389796257019, + "grad_norm_var": 0.001254684277324917, + "learning_rate": 5e-05, + "loss": 0.1805, + "loss/crossentropy": 2.8534193634986877, + "loss/hidden": 0.0, + "loss/logits": 0.18053820729255676, + "loss/reg": 1.223007321357727, + "step": 1391 + }, + { + "epoch": 0.01392, + "grad_norm": 0.35356929898262024, + "grad_norm_var": 0.0013968724081272735, + "learning_rate": 5e-05, + "loss": 0.1766, + "loss/crossentropy": 2.804259717464447, + "loss/hidden": 0.0, + "loss/logits": 0.1765923760831356, + "loss/reg": 1.2222816944122314, + "step": 1392 + }, + { + "epoch": 0.01393, + "grad_norm": 0.39217162132263184, + "grad_norm_var": 0.001387654680048911, + "learning_rate": 5e-05, + "loss": 0.1876, + "loss/crossentropy": 2.804800271987915, + "loss/hidden": 0.0, + "loss/logits": 0.1875942163169384, + "loss/reg": 1.2213808298110962, + "step": 1393 + }, + { + "epoch": 0.01394, + "grad_norm": 0.377047598361969, + "grad_norm_var": 0.0013963880523254369, + "learning_rate": 5e-05, + "loss": 0.1973, + "loss/crossentropy": 2.8182566165924072, + "loss/hidden": 0.0, + "loss/logits": 0.19730354100465775, + "loss/reg": 1.2202363014221191, + "step": 1394 + }, + { + "epoch": 0.01395, + "grad_norm": 0.3902486562728882, + "grad_norm_var": 0.0013905691464131417, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.7491560578346252, + "loss/hidden": 0.0, + "loss/logits": 0.189250610768795, + "loss/reg": 1.2191710472106934, + "step": 1395 + }, + { + "epoch": 0.01396, + "grad_norm": 0.5830679535865784, + "grad_norm_var": 0.003242805657963747, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.810506761074066, + "loss/hidden": 0.0, + "loss/logits": 0.1978430114686489, + "loss/reg": 1.2181440591812134, + "step": 1396 + }, + { + "epoch": 0.01397, + "grad_norm": 0.41446352005004883, + "grad_norm_var": 0.0030702379351137993, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.829331338405609, + "loss/hidden": 0.0, + "loss/logits": 0.19816021993756294, + "loss/reg": 1.2169166803359985, + "step": 1397 + }, + { + "epoch": 0.01398, + "grad_norm": 0.4109343886375427, + "grad_norm_var": 0.003051804093662188, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.9817265272140503, + "loss/hidden": 0.0, + "loss/logits": 0.18317783251404762, + "loss/reg": 1.2161028385162354, + "step": 1398 + }, + { + "epoch": 0.01399, + "grad_norm": 0.4030122756958008, + "grad_norm_var": 0.003051597620713731, + "learning_rate": 5e-05, + "loss": 0.1845, + "loss/crossentropy": 2.7645658254623413, + "loss/hidden": 0.0, + "loss/logits": 0.18450627103447914, + "loss/reg": 1.2153202295303345, + "step": 1399 + }, + { + "epoch": 0.014, + "grad_norm": 0.9211554527282715, + "grad_norm_var": 0.018681524358094104, + "learning_rate": 5e-05, + "loss": 0.2265, + "loss/crossentropy": 2.772430121898651, + "loss/hidden": 0.0, + "loss/logits": 0.22651683166623116, + "loss/reg": 1.2144694328308105, + "step": 1400 + }, + { + "epoch": 0.01401, + "grad_norm": 0.5132554173469543, + "grad_norm_var": 0.018871863342353364, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 2.8380813002586365, + "loss/hidden": 0.0, + "loss/logits": 0.19385619089007378, + "loss/reg": 1.2133592367172241, + "step": 1401 + }, + { + "epoch": 0.01402, + "grad_norm": 0.3983222544193268, + "grad_norm_var": 0.019002687433299182, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.7452619075775146, + "loss/hidden": 0.0, + "loss/logits": 0.18066129088401794, + "loss/reg": 1.2125701904296875, + "step": 1402 + }, + { + "epoch": 0.01403, + "grad_norm": 0.38246041536331177, + "grad_norm_var": 0.01919163386322751, + "learning_rate": 5e-05, + "loss": 0.1766, + "loss/crossentropy": 2.8391433358192444, + "loss/hidden": 0.0, + "loss/logits": 0.17655245959758759, + "loss/reg": 1.2120070457458496, + "step": 1403 + }, + { + "epoch": 0.01404, + "grad_norm": 0.41270893812179565, + "grad_norm_var": 0.01923056479132349, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.694108784198761, + "loss/hidden": 0.0, + "loss/logits": 0.19414566829800606, + "loss/reg": 1.211472511291504, + "step": 1404 + }, + { + "epoch": 0.01405, + "grad_norm": 0.37444761395454407, + "grad_norm_var": 0.0194205713803524, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.756447434425354, + "loss/hidden": 0.0, + "loss/logits": 0.188365388661623, + "loss/reg": 1.2103835344314575, + "step": 1405 + }, + { + "epoch": 0.01406, + "grad_norm": 0.3629455864429474, + "grad_norm_var": 0.019610079451670957, + "learning_rate": 5e-05, + "loss": 0.1854, + "loss/crossentropy": 2.8100785613059998, + "loss/hidden": 0.0, + "loss/logits": 0.18542328104376793, + "loss/reg": 1.210111379623413, + "step": 1406 + }, + { + "epoch": 0.01407, + "grad_norm": 0.36319929361343384, + "grad_norm_var": 0.01988808713783746, + "learning_rate": 5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.7677817940711975, + "loss/hidden": 0.0, + "loss/logits": 0.1782296933233738, + "loss/reg": 1.209017276763916, + "step": 1407 + }, + { + "epoch": 0.01408, + "grad_norm": 0.7494403123855591, + "grad_norm_var": 0.025077728825443388, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.7342721819877625, + "loss/hidden": 0.0, + "loss/logits": 0.21190783753991127, + "loss/reg": 1.207706332206726, + "step": 1408 + }, + { + "epoch": 0.01409, + "grad_norm": 0.359062135219574, + "grad_norm_var": 0.025470202190572132, + "learning_rate": 5e-05, + "loss": 0.1667, + "loss/crossentropy": 2.7984707355499268, + "loss/hidden": 0.0, + "loss/logits": 0.1667410060763359, + "loss/reg": 1.2061673402786255, + "step": 1409 + }, + { + "epoch": 0.0141, + "grad_norm": 0.3760697543621063, + "grad_norm_var": 0.02548153168728078, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.8415337800979614, + "loss/hidden": 0.0, + "loss/logits": 0.17982684448361397, + "loss/reg": 1.2050466537475586, + "step": 1410 + }, + { + "epoch": 0.01411, + "grad_norm": 0.4107130467891693, + "grad_norm_var": 0.025308039267595398, + "learning_rate": 5e-05, + "loss": 0.1743, + "loss/crossentropy": 2.8946332335472107, + "loss/hidden": 0.0, + "loss/logits": 0.17427558451890945, + "loss/reg": 1.204237699508667, + "step": 1411 + }, + { + "epoch": 0.01412, + "grad_norm": 0.4082576632499695, + "grad_norm_var": 0.02445911428786796, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.7471116185188293, + "loss/hidden": 0.0, + "loss/logits": 0.18230270966887474, + "loss/reg": 1.2035255432128906, + "step": 1412 + }, + { + "epoch": 0.01413, + "grad_norm": 0.7261360883712769, + "grad_norm_var": 0.028896584983750567, + "learning_rate": 5e-05, + "loss": 0.2141, + "loss/crossentropy": 2.7916662096977234, + "loss/hidden": 0.0, + "loss/logits": 0.2141283005475998, + "loss/reg": 1.2025699615478516, + "step": 1413 + }, + { + "epoch": 0.01414, + "grad_norm": 0.4041743576526642, + "grad_norm_var": 0.028955615300985102, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.785338282585144, + "loss/hidden": 0.0, + "loss/logits": 0.18472053483128548, + "loss/reg": 1.201487421989441, + "step": 1414 + }, + { + "epoch": 0.01415, + "grad_norm": 0.42084982991218567, + "grad_norm_var": 0.028809439139849055, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.7464206218719482, + "loss/hidden": 0.0, + "loss/logits": 0.1779973767697811, + "loss/reg": 1.200051188468933, + "step": 1415 + }, + { + "epoch": 0.01416, + "grad_norm": 0.45047450065612793, + "grad_norm_var": 0.01459023840276699, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.907436788082123, + "loss/hidden": 0.0, + "loss/logits": 0.18275701627135277, + "loss/reg": 1.19915771484375, + "step": 1416 + }, + { + "epoch": 0.01417, + "grad_norm": 0.4240787625312805, + "grad_norm_var": 0.014270135412653876, + "learning_rate": 5e-05, + "loss": 0.1825, + "loss/crossentropy": 2.6383658051490784, + "loss/hidden": 0.0, + "loss/logits": 0.1825261414051056, + "loss/reg": 1.1980401277542114, + "step": 1417 + }, + { + "epoch": 0.01418, + "grad_norm": 0.3851097524166107, + "grad_norm_var": 0.014352634082185445, + "learning_rate": 5e-05, + "loss": 0.1909, + "loss/crossentropy": 2.715300142765045, + "loss/hidden": 0.0, + "loss/logits": 0.1909404620528221, + "loss/reg": 1.1972358226776123, + "step": 1418 + }, + { + "epoch": 0.01419, + "grad_norm": 0.4259493350982666, + "grad_norm_var": 0.014148020705458992, + "learning_rate": 5e-05, + "loss": 0.1954, + "loss/crossentropy": 2.795480966567993, + "loss/hidden": 0.0, + "loss/logits": 0.19538183510303497, + "loss/reg": 1.1965022087097168, + "step": 1419 + }, + { + "epoch": 0.0142, + "grad_norm": 0.41580289602279663, + "grad_norm_var": 0.014137009585771305, + "learning_rate": 5e-05, + "loss": 0.1873, + "loss/crossentropy": 2.878966212272644, + "loss/hidden": 0.0, + "loss/logits": 0.1872658208012581, + "loss/reg": 1.1956799030303955, + "step": 1420 + }, + { + "epoch": 0.01421, + "grad_norm": 0.47517159581184387, + "grad_norm_var": 0.013876705878671021, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.8808937072753906, + "loss/hidden": 0.0, + "loss/logits": 0.2071789838373661, + "loss/reg": 1.1947969198226929, + "step": 1421 + }, + { + "epoch": 0.01422, + "grad_norm": 0.3958096206188202, + "grad_norm_var": 0.013574404616716058, + "learning_rate": 5e-05, + "loss": 0.1873, + "loss/crossentropy": 2.647930860519409, + "loss/hidden": 0.0, + "loss/logits": 0.18728242069482803, + "loss/reg": 1.194016933441162, + "step": 1422 + }, + { + "epoch": 0.01423, + "grad_norm": 0.4426226317882538, + "grad_norm_var": 0.01305587928614605, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.8057883977890015, + "loss/hidden": 0.0, + "loss/logits": 0.2040250115096569, + "loss/reg": 1.1931583881378174, + "step": 1423 + }, + { + "epoch": 0.01424, + "grad_norm": 0.41137373447418213, + "grad_norm_var": 0.006897930700183829, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.8716946840286255, + "loss/hidden": 0.0, + "loss/logits": 0.19638033583760262, + "loss/reg": 1.1923456192016602, + "step": 1424 + }, + { + "epoch": 0.01425, + "grad_norm": 0.42817750573158264, + "grad_norm_var": 0.0065130178351596475, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.7237064242362976, + "loss/hidden": 0.0, + "loss/logits": 0.18527411296963692, + "loss/reg": 1.1919002532958984, + "step": 1425 + }, + { + "epoch": 0.01426, + "grad_norm": 0.4095504879951477, + "grad_norm_var": 0.0063086320451773165, + "learning_rate": 5e-05, + "loss": 0.184, + "loss/crossentropy": 2.836295783519745, + "loss/hidden": 0.0, + "loss/logits": 0.18403561040759087, + "loss/reg": 1.1909234523773193, + "step": 1426 + }, + { + "epoch": 0.01427, + "grad_norm": 0.4399351477622986, + "grad_norm_var": 0.006249292355520449, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.7117483019828796, + "loss/hidden": 0.0, + "loss/logits": 0.2037728913128376, + "loss/reg": 1.189060091972351, + "step": 1427 + }, + { + "epoch": 0.01428, + "grad_norm": 0.3880847692489624, + "grad_norm_var": 0.006364050570575138, + "learning_rate": 5e-05, + "loss": 0.1862, + "loss/crossentropy": 2.7042183876037598, + "loss/hidden": 0.0, + "loss/logits": 0.1861833930015564, + "loss/reg": 1.1879582405090332, + "step": 1428 + }, + { + "epoch": 0.01429, + "grad_norm": 0.45626869797706604, + "grad_norm_var": 0.0006274098049336244, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.824433743953705, + "loss/hidden": 0.0, + "loss/logits": 0.20241528376936913, + "loss/reg": 1.1868696212768555, + "step": 1429 + }, + { + "epoch": 0.0143, + "grad_norm": 0.4194117486476898, + "grad_norm_var": 0.0006029838264533775, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.7539783120155334, + "loss/hidden": 0.0, + "loss/logits": 0.19634708017110825, + "loss/reg": 1.1856297254562378, + "step": 1430 + }, + { + "epoch": 0.01431, + "grad_norm": 0.3876844048500061, + "grad_norm_var": 0.0006869516146238024, + "learning_rate": 5e-05, + "loss": 0.1954, + "loss/crossentropy": 2.681014835834503, + "loss/hidden": 0.0, + "loss/logits": 0.19539380818605423, + "loss/reg": 1.1842849254608154, + "step": 1431 + }, + { + "epoch": 0.01432, + "grad_norm": 0.4049186110496521, + "grad_norm_var": 0.000645033648734498, + "learning_rate": 5e-05, + "loss": 0.1788, + "loss/crossentropy": 2.772512137889862, + "loss/hidden": 0.0, + "loss/logits": 0.17883709073066711, + "loss/reg": 1.183428406715393, + "step": 1432 + }, + { + "epoch": 0.01433, + "grad_norm": 0.4411693513393402, + "grad_norm_var": 0.0006740150025339157, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 2.8390111923217773, + "loss/hidden": 0.0, + "loss/logits": 0.19385584443807602, + "loss/reg": 1.1821532249450684, + "step": 1433 + }, + { + "epoch": 0.01434, + "grad_norm": 0.37364938855171204, + "grad_norm_var": 0.0007362101089197252, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.7289167046546936, + "loss/hidden": 0.0, + "loss/logits": 0.18471518531441689, + "loss/reg": 1.1817139387130737, + "step": 1434 + }, + { + "epoch": 0.01435, + "grad_norm": 0.3801406919956207, + "grad_norm_var": 0.0008293373200224867, + "learning_rate": 5e-05, + "loss": 0.177, + "loss/crossentropy": 2.7374696731567383, + "loss/hidden": 0.0, + "loss/logits": 0.17699915915727615, + "loss/reg": 1.1811518669128418, + "step": 1435 + }, + { + "epoch": 0.01436, + "grad_norm": 0.4569256901741028, + "grad_norm_var": 0.0009292300730142089, + "learning_rate": 5e-05, + "loss": 0.1851, + "loss/crossentropy": 2.712208926677704, + "loss/hidden": 0.0, + "loss/logits": 0.1851220801472664, + "loss/reg": 1.1803579330444336, + "step": 1436 + }, + { + "epoch": 0.01437, + "grad_norm": 0.4534268379211426, + "grad_norm_var": 0.0007971731126398485, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.938688635826111, + "loss/hidden": 0.0, + "loss/logits": 0.19052105024456978, + "loss/reg": 1.1800974607467651, + "step": 1437 + }, + { + "epoch": 0.01438, + "grad_norm": 3.6807291507720947, + "grad_norm_var": 0.6654650831373831, + "learning_rate": 5e-05, + "loss": 0.3041, + "loss/crossentropy": 2.723634898662567, + "loss/hidden": 0.0, + "loss/logits": 0.3040575832128525, + "loss/reg": 1.1786816120147705, + "step": 1438 + }, + { + "epoch": 0.01439, + "grad_norm": 0.478336364030838, + "grad_norm_var": 0.6646840673393943, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.604291319847107, + "loss/hidden": 0.0, + "loss/logits": 0.1942521370947361, + "loss/reg": 1.177720546722412, + "step": 1439 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5175783038139343, + "grad_norm_var": 0.6623552944700096, + "learning_rate": 5e-05, + "loss": 0.1927, + "loss/crossentropy": 2.9179378747940063, + "loss/hidden": 0.0, + "loss/logits": 0.19269224256277084, + "loss/reg": 1.1768488883972168, + "step": 1440 + }, + { + "epoch": 0.01441, + "grad_norm": 0.44153159856796265, + "grad_norm_var": 0.6620030812325127, + "learning_rate": 5e-05, + "loss": 0.2094, + "loss/crossentropy": 2.699263036251068, + "loss/hidden": 0.0, + "loss/logits": 0.20937075465917587, + "loss/reg": 1.1756783723831177, + "step": 1441 + }, + { + "epoch": 0.01442, + "grad_norm": 0.456778347492218, + "grad_norm_var": 0.6607348854967332, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.8013316988945007, + "loss/hidden": 0.0, + "loss/logits": 0.2043425552546978, + "loss/reg": 1.174521565437317, + "step": 1442 + }, + { + "epoch": 0.01443, + "grad_norm": 0.3832143247127533, + "grad_norm_var": 0.6624190273713508, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.7974475622177124, + "loss/hidden": 0.0, + "loss/logits": 0.16904297098517418, + "loss/reg": 1.1735647916793823, + "step": 1443 + }, + { + "epoch": 0.01444, + "grad_norm": 0.40240901708602905, + "grad_norm_var": 0.6619650609078701, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.6992990374565125, + "loss/hidden": 0.0, + "loss/logits": 0.19426801800727844, + "loss/reg": 1.1723049879074097, + "step": 1444 + }, + { + "epoch": 0.01445, + "grad_norm": 0.413047194480896, + "grad_norm_var": 0.6631025192839431, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.6653656363487244, + "loss/hidden": 0.0, + "loss/logits": 0.17078221589326859, + "loss/reg": 1.171108603477478, + "step": 1445 + }, + { + "epoch": 0.01446, + "grad_norm": 0.4184585213661194, + "grad_norm_var": 0.6631294281930682, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.701816439628601, + "loss/hidden": 0.0, + "loss/logits": 0.21016155928373337, + "loss/reg": 1.1695311069488525, + "step": 1446 + }, + { + "epoch": 0.01447, + "grad_norm": 0.36679932475090027, + "grad_norm_var": 0.6638332006424051, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.8065091371536255, + "loss/hidden": 0.0, + "loss/logits": 0.182582326233387, + "loss/reg": 1.1683489084243774, + "step": 1447 + }, + { + "epoch": 0.01448, + "grad_norm": 0.37978798151016235, + "grad_norm_var": 0.6646245839910803, + "learning_rate": 5e-05, + "loss": 0.1913, + "loss/crossentropy": 2.7764082551002502, + "loss/hidden": 0.0, + "loss/logits": 0.19127563014626503, + "loss/reg": 1.1672946214675903, + "step": 1448 + }, + { + "epoch": 0.01449, + "grad_norm": 0.41191771626472473, + "grad_norm_var": 0.665405763465797, + "learning_rate": 5e-05, + "loss": 0.1907, + "loss/crossentropy": 2.740322172641754, + "loss/hidden": 0.0, + "loss/logits": 0.1907496452331543, + "loss/reg": 1.166246771812439, + "step": 1449 + }, + { + "epoch": 0.0145, + "grad_norm": 0.4475890100002289, + "grad_norm_var": 0.6632604096159077, + "learning_rate": 5e-05, + "loss": 0.2022, + "loss/crossentropy": 2.8474504351615906, + "loss/hidden": 0.0, + "loss/logits": 0.20218346267938614, + "loss/reg": 1.1651406288146973, + "step": 1450 + }, + { + "epoch": 0.01451, + "grad_norm": 0.3883093595504761, + "grad_norm_var": 0.6629918541871671, + "learning_rate": 5e-05, + "loss": 0.2002, + "loss/crossentropy": 2.771294593811035, + "loss/hidden": 0.0, + "loss/logits": 0.20018937811255455, + "loss/reg": 1.1640667915344238, + "step": 1451 + }, + { + "epoch": 0.01452, + "grad_norm": 0.38779163360595703, + "grad_norm_var": 0.6648956523531993, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.838146388530731, + "loss/hidden": 0.0, + "loss/logits": 0.1846405677497387, + "loss/reg": 1.1632236242294312, + "step": 1452 + }, + { + "epoch": 0.01453, + "grad_norm": 0.44142037630081177, + "grad_norm_var": 0.6651820988867188, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.7226682901382446, + "loss/hidden": 0.0, + "loss/logits": 0.19923654943704605, + "loss/reg": 1.1626334190368652, + "step": 1453 + }, + { + "epoch": 0.01454, + "grad_norm": 0.4469018578529358, + "grad_norm_var": 0.0016475347938865329, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.8452677726745605, + "loss/hidden": 0.0, + "loss/logits": 0.19433218985795975, + "loss/reg": 1.1622459888458252, + "step": 1454 + }, + { + "epoch": 0.01455, + "grad_norm": 0.4018121063709259, + "grad_norm_var": 0.0014577680517240807, + "learning_rate": 5e-05, + "loss": 0.1998, + "loss/crossentropy": 2.738975167274475, + "loss/hidden": 0.0, + "loss/logits": 0.19976536184549332, + "loss/reg": 1.1620265245437622, + "step": 1455 + }, + { + "epoch": 0.01456, + "grad_norm": 0.42509543895721436, + "grad_norm_var": 0.0007777995787594194, + "learning_rate": 5e-05, + "loss": 0.1924, + "loss/crossentropy": 2.826959192752838, + "loss/hidden": 0.0, + "loss/logits": 0.19238118827342987, + "loss/reg": 1.161539912223816, + "step": 1456 + }, + { + "epoch": 0.01457, + "grad_norm": 0.440341979265213, + "grad_norm_var": 0.0007734106803447698, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.6413469314575195, + "loss/hidden": 0.0, + "loss/logits": 0.19783642515540123, + "loss/reg": 1.1616432666778564, + "step": 1457 + }, + { + "epoch": 0.01458, + "grad_norm": 0.39853590726852417, + "grad_norm_var": 0.0006472376220099108, + "learning_rate": 5e-05, + "loss": 0.1848, + "loss/crossentropy": 2.698549807071686, + "loss/hidden": 0.0, + "loss/logits": 0.18476466834545135, + "loss/reg": 1.1611636877059937, + "step": 1458 + }, + { + "epoch": 0.01459, + "grad_norm": 0.38865047693252563, + "grad_norm_var": 0.0006299673554417813, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.7270365953445435, + "loss/hidden": 0.0, + "loss/logits": 0.18844184651970863, + "loss/reg": 1.1610968112945557, + "step": 1459 + }, + { + "epoch": 0.0146, + "grad_norm": 0.4249439239501953, + "grad_norm_var": 0.000639110550863183, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.7351967096328735, + "loss/hidden": 0.0, + "loss/logits": 0.2041994333267212, + "loss/reg": 1.1608000993728638, + "step": 1460 + }, + { + "epoch": 0.01461, + "grad_norm": 0.3722969889640808, + "grad_norm_var": 0.0007336083208048571, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.7943307161331177, + "loss/hidden": 0.0, + "loss/logits": 0.18180421367287636, + "loss/reg": 1.1603269577026367, + "step": 1461 + }, + { + "epoch": 0.01462, + "grad_norm": 0.43652695417404175, + "grad_norm_var": 0.0007773033601769286, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.662752866744995, + "loss/hidden": 0.0, + "loss/logits": 0.2022755742073059, + "loss/reg": 1.1591933965682983, + "step": 1462 + }, + { + "epoch": 0.01463, + "grad_norm": 0.4718083441257477, + "grad_norm_var": 0.0008627420285323261, + "learning_rate": 5e-05, + "loss": 0.1951, + "loss/crossentropy": 2.9354034662246704, + "loss/hidden": 0.0, + "loss/logits": 0.19507525488734245, + "loss/reg": 1.158470869064331, + "step": 1463 + }, + { + "epoch": 0.01464, + "grad_norm": 0.385351300239563, + "grad_norm_var": 0.0008374568626427566, + "learning_rate": 5e-05, + "loss": 0.1869, + "loss/crossentropy": 2.822139620780945, + "loss/hidden": 0.0, + "loss/logits": 0.186939537525177, + "loss/reg": 1.1580942869186401, + "step": 1464 + }, + { + "epoch": 0.01465, + "grad_norm": 0.3974020779132843, + "grad_norm_var": 0.0008601347897435793, + "learning_rate": 5e-05, + "loss": 0.2006, + "loss/crossentropy": 2.843861997127533, + "loss/hidden": 0.0, + "loss/logits": 0.20057183876633644, + "loss/reg": 1.157541275024414, + "step": 1465 + }, + { + "epoch": 0.01466, + "grad_norm": 0.38286757469177246, + "grad_norm_var": 0.0008486814366043779, + "learning_rate": 5e-05, + "loss": 0.189, + "loss/crossentropy": 2.6517167687416077, + "loss/hidden": 0.0, + "loss/logits": 0.18896743655204773, + "loss/reg": 1.1566143035888672, + "step": 1466 + }, + { + "epoch": 0.01467, + "grad_norm": 0.42905157804489136, + "grad_norm_var": 0.0008243923150278073, + "learning_rate": 5e-05, + "loss": 0.1984, + "loss/crossentropy": 2.897773861885071, + "loss/hidden": 0.0, + "loss/logits": 0.19835495948791504, + "loss/reg": 1.156038761138916, + "step": 1467 + }, + { + "epoch": 0.01468, + "grad_norm": 0.3854566514492035, + "grad_norm_var": 0.0008330248364320133, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.8033392429351807, + "loss/hidden": 0.0, + "loss/logits": 0.18027521297335625, + "loss/reg": 1.1554698944091797, + "step": 1468 + }, + { + "epoch": 0.01469, + "grad_norm": 0.4645752012729645, + "grad_norm_var": 0.0009503278882020183, + "learning_rate": 5e-05, + "loss": 0.2117, + "loss/crossentropy": 2.7153387665748596, + "loss/hidden": 0.0, + "loss/logits": 0.21166526898741722, + "loss/reg": 1.1556744575500488, + "step": 1469 + }, + { + "epoch": 0.0147, + "grad_norm": 0.3899242877960205, + "grad_norm_var": 0.0009163884442397426, + "learning_rate": 5e-05, + "loss": 0.1912, + "loss/crossentropy": 2.720719635486603, + "loss/hidden": 0.0, + "loss/logits": 0.19123655557632446, + "loss/reg": 1.1561858654022217, + "step": 1470 + }, + { + "epoch": 0.01471, + "grad_norm": 0.3779591917991638, + "grad_norm_var": 0.0009848749223748934, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.8007051944732666, + "loss/hidden": 0.0, + "loss/logits": 0.1796492263674736, + "loss/reg": 1.1558936834335327, + "step": 1471 + }, + { + "epoch": 0.01472, + "grad_norm": 0.6328070759773254, + "grad_norm_var": 0.004080776063957622, + "learning_rate": 5e-05, + "loss": 0.2355, + "loss/crossentropy": 2.686060070991516, + "loss/hidden": 0.0, + "loss/logits": 0.23549868538975716, + "loss/reg": 1.1561976671218872, + "step": 1472 + }, + { + "epoch": 0.01473, + "grad_norm": 0.472528874874115, + "grad_norm_var": 0.004217134203378577, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 2.742728590965271, + "loss/hidden": 0.0, + "loss/logits": 0.18668468296527863, + "loss/reg": 1.1556323766708374, + "step": 1473 + }, + { + "epoch": 0.01474, + "grad_norm": 0.38901230692863464, + "grad_norm_var": 0.004257255456704956, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.911827266216278, + "loss/hidden": 0.0, + "loss/logits": 0.17221413552761078, + "loss/reg": 1.1559501886367798, + "step": 1474 + }, + { + "epoch": 0.01475, + "grad_norm": 0.4404449462890625, + "grad_norm_var": 0.004173393020864262, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.542245864868164, + "loss/hidden": 0.0, + "loss/logits": 0.19110393524169922, + "loss/reg": 1.1563279628753662, + "step": 1475 + }, + { + "epoch": 0.01476, + "grad_norm": 0.47592392563819885, + "grad_norm_var": 0.004312948871115774, + "learning_rate": 5e-05, + "loss": 0.2094, + "loss/crossentropy": 2.9327672123908997, + "loss/hidden": 0.0, + "loss/logits": 0.20938289538025856, + "loss/reg": 1.1564908027648926, + "step": 1476 + }, + { + "epoch": 0.01477, + "grad_norm": 0.37822598218917847, + "grad_norm_var": 0.004268347129857242, + "learning_rate": 5e-05, + "loss": 0.1757, + "loss/crossentropy": 2.8230528831481934, + "loss/hidden": 0.0, + "loss/logits": 0.17565542086958885, + "loss/reg": 1.156173586845398, + "step": 1477 + }, + { + "epoch": 0.01478, + "grad_norm": 0.3917330503463745, + "grad_norm_var": 0.004365919184094539, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.8280991911888123, + "loss/hidden": 0.0, + "loss/logits": 0.20451990514993668, + "loss/reg": 1.1554744243621826, + "step": 1478 + }, + { + "epoch": 0.01479, + "grad_norm": 0.37878331542015076, + "grad_norm_var": 0.004376637666552326, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.8972471356391907, + "loss/hidden": 0.0, + "loss/logits": 0.18294430896639824, + "loss/reg": 1.1554595232009888, + "step": 1479 + }, + { + "epoch": 0.0148, + "grad_norm": 0.37876391410827637, + "grad_norm_var": 0.004412639484719334, + "learning_rate": 5e-05, + "loss": 0.2315, + "loss/crossentropy": 2.6865721940994263, + "loss/hidden": 0.0, + "loss/logits": 0.2314772792160511, + "loss/reg": 1.1548503637313843, + "step": 1480 + }, + { + "epoch": 0.01481, + "grad_norm": 0.39876559376716614, + "grad_norm_var": 0.004408130788441378, + "learning_rate": 5e-05, + "loss": 0.1891, + "loss/crossentropy": 2.991089105606079, + "loss/hidden": 0.0, + "loss/logits": 0.18908962607383728, + "loss/reg": 1.153546929359436, + "step": 1481 + }, + { + "epoch": 0.01482, + "grad_norm": 0.42962586879730225, + "grad_norm_var": 0.004295032189139869, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.81504887342453, + "loss/hidden": 0.0, + "loss/logits": 0.17986131832003593, + "loss/reg": 1.1533591747283936, + "step": 1482 + }, + { + "epoch": 0.01483, + "grad_norm": 0.39817601442337036, + "grad_norm_var": 0.0043414287038569055, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.8316676020622253, + "loss/hidden": 0.0, + "loss/logits": 0.1767561361193657, + "loss/reg": 1.1531206369400024, + "step": 1483 + }, + { + "epoch": 0.01484, + "grad_norm": 0.3925790786743164, + "grad_norm_var": 0.004308073096685746, + "learning_rate": 5e-05, + "loss": 0.1703, + "loss/crossentropy": 2.805059552192688, + "loss/hidden": 0.0, + "loss/logits": 0.1703021265566349, + "loss/reg": 1.1523410081863403, + "step": 1484 + }, + { + "epoch": 0.01485, + "grad_norm": 0.3768901228904724, + "grad_norm_var": 0.004318495561248111, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.7650709748268127, + "loss/hidden": 0.0, + "loss/logits": 0.18137887865304947, + "loss/reg": 1.151556134223938, + "step": 1485 + }, + { + "epoch": 0.01486, + "grad_norm": 0.40476369857788086, + "grad_norm_var": 0.004274959281858711, + "learning_rate": 5e-05, + "loss": 0.1834, + "loss/crossentropy": 2.8746532201766968, + "loss/hidden": 0.0, + "loss/logits": 0.18340081721544266, + "loss/reg": 1.1500896215438843, + "step": 1486 + }, + { + "epoch": 0.01487, + "grad_norm": 0.44126570224761963, + "grad_norm_var": 0.004172172160196756, + "learning_rate": 5e-05, + "loss": 0.1932, + "loss/crossentropy": 2.694306790828705, + "loss/hidden": 0.0, + "loss/logits": 0.19317873939871788, + "loss/reg": 1.149519443511963, + "step": 1487 + }, + { + "epoch": 0.01488, + "grad_norm": 0.47749271988868713, + "grad_norm_var": 0.0013509307920328006, + "learning_rate": 5e-05, + "loss": 0.1916, + "loss/crossentropy": 2.719083070755005, + "loss/hidden": 0.0, + "loss/logits": 0.1916189342737198, + "loss/reg": 1.1487241983413696, + "step": 1488 + }, + { + "epoch": 0.01489, + "grad_norm": 0.3436330556869507, + "grad_norm_var": 0.0013844778204995265, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.7870516180992126, + "loss/hidden": 0.0, + "loss/logits": 0.17992451414465904, + "loss/reg": 1.1486061811447144, + "step": 1489 + }, + { + "epoch": 0.0149, + "grad_norm": 0.38551199436187744, + "grad_norm_var": 0.001393174193328169, + "learning_rate": 5e-05, + "loss": 0.1891, + "loss/crossentropy": 2.8131721019744873, + "loss/hidden": 0.0, + "loss/logits": 0.18907897546887398, + "loss/reg": 1.1475741863250732, + "step": 1490 + }, + { + "epoch": 0.01491, + "grad_norm": 0.4178476333618164, + "grad_norm_var": 0.001320663123918564, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.775062322616577, + "loss/hidden": 0.0, + "loss/logits": 0.20159471407532692, + "loss/reg": 1.146192193031311, + "step": 1491 + }, + { + "epoch": 0.01492, + "grad_norm": 0.3631218373775482, + "grad_norm_var": 0.001039799575198197, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.776621460914612, + "loss/hidden": 0.0, + "loss/logits": 0.16820930317044258, + "loss/reg": 1.1452136039733887, + "step": 1492 + }, + { + "epoch": 0.01493, + "grad_norm": 4.2707037925720215, + "grad_norm_var": 0.9380895971594498, + "learning_rate": 5e-05, + "loss": 0.3158, + "loss/crossentropy": 2.92247998714447, + "loss/hidden": 0.0, + "loss/logits": 0.3158273994922638, + "loss/reg": 1.1442078351974487, + "step": 1493 + }, + { + "epoch": 0.01494, + "grad_norm": 0.4497520625591278, + "grad_norm_var": 0.9363747553262743, + "learning_rate": 5e-05, + "loss": 0.1921, + "loss/crossentropy": 2.8871845602989197, + "loss/hidden": 0.0, + "loss/logits": 0.19214729592204094, + "loss/reg": 1.1430447101593018, + "step": 1494 + }, + { + "epoch": 0.01495, + "grad_norm": 0.45193424820899963, + "grad_norm_var": 0.9341201756923909, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.8296074271202087, + "loss/hidden": 0.0, + "loss/logits": 0.19437526538968086, + "loss/reg": 1.1418877840042114, + "step": 1495 + }, + { + "epoch": 0.01496, + "grad_norm": 0.4302029609680176, + "grad_norm_var": 0.9324334842618291, + "learning_rate": 5e-05, + "loss": 0.2005, + "loss/crossentropy": 2.7669071555137634, + "loss/hidden": 0.0, + "loss/logits": 0.2004767581820488, + "loss/reg": 1.1404200792312622, + "step": 1496 + }, + { + "epoch": 0.01497, + "grad_norm": 0.4347190260887146, + "grad_norm_var": 0.9313002422194117, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.8296796679496765, + "loss/hidden": 0.0, + "loss/logits": 0.19576232135295868, + "loss/reg": 1.139227032661438, + "step": 1497 + }, + { + "epoch": 0.01498, + "grad_norm": 0.5106848478317261, + "grad_norm_var": 0.9292830465830048, + "learning_rate": 5e-05, + "loss": 0.193, + "loss/crossentropy": 2.730704605579376, + "loss/hidden": 0.0, + "loss/logits": 0.1929554119706154, + "loss/reg": 1.1384540796279907, + "step": 1498 + }, + { + "epoch": 0.01499, + "grad_norm": 0.3934985101222992, + "grad_norm_var": 0.9294472871619052, + "learning_rate": 5e-05, + "loss": 0.1763, + "loss/crossentropy": 2.88557106256485, + "loss/hidden": 0.0, + "loss/logits": 0.17628108337521553, + "loss/reg": 1.1374356746673584, + "step": 1499 + }, + { + "epoch": 0.015, + "grad_norm": 0.38453835248947144, + "grad_norm_var": 0.9297369973056901, + "learning_rate": 5e-05, + "loss": 0.1891, + "loss/crossentropy": 2.733524262905121, + "loss/hidden": 0.0, + "loss/logits": 0.18906502798199654, + "loss/reg": 1.1368626356124878, + "step": 1500 + }, + { + "epoch": 0.01501, + "grad_norm": 0.38647788763046265, + "grad_norm_var": 0.9293826966113136, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.6723055839538574, + "loss/hidden": 0.0, + "loss/logits": 0.18317405134439468, + "loss/reg": 1.1361547708511353, + "step": 1501 + }, + { + "epoch": 0.01502, + "grad_norm": 0.38139402866363525, + "grad_norm_var": 0.9302094379605873, + "learning_rate": 5e-05, + "loss": 0.1736, + "loss/crossentropy": 2.8804500699043274, + "loss/hidden": 0.0, + "loss/logits": 0.1735868975520134, + "loss/reg": 1.1355422735214233, + "step": 1502 + }, + { + "epoch": 0.01503, + "grad_norm": 0.3744097948074341, + "grad_norm_var": 0.9324178817578533, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.929677963256836, + "loss/hidden": 0.0, + "loss/logits": 0.16901340708136559, + "loss/reg": 1.1346811056137085, + "step": 1503 + }, + { + "epoch": 0.01504, + "grad_norm": 0.3716701567173004, + "grad_norm_var": 0.9356011201024083, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.940733551979065, + "loss/hidden": 0.0, + "loss/logits": 0.16800947487354279, + "loss/reg": 1.1341255903244019, + "step": 1504 + }, + { + "epoch": 0.01505, + "grad_norm": 0.4324607849121094, + "grad_norm_var": 0.9325026880056351, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.7044883370399475, + "loss/hidden": 0.0, + "loss/logits": 0.19945520162582397, + "loss/reg": 1.133963942527771, + "step": 1505 + }, + { + "epoch": 0.01506, + "grad_norm": 0.3936706781387329, + "grad_norm_var": 0.9322164850582716, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.8886741995811462, + "loss/hidden": 0.0, + "loss/logits": 0.1905427686870098, + "loss/reg": 1.133609652519226, + "step": 1506 + }, + { + "epoch": 0.01507, + "grad_norm": 0.41537290811538696, + "grad_norm_var": 0.9322944406545675, + "learning_rate": 5e-05, + "loss": 0.1973, + "loss/crossentropy": 2.8340283036231995, + "loss/hidden": 0.0, + "loss/logits": 0.1972740888595581, + "loss/reg": 1.133344054222107, + "step": 1507 + }, + { + "epoch": 0.01508, + "grad_norm": 0.46224504709243774, + "grad_norm_var": 0.9290801736495811, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.8049793243408203, + "loss/hidden": 0.0, + "loss/logits": 0.1792147532105446, + "loss/reg": 1.133016586303711, + "step": 1508 + }, + { + "epoch": 0.01509, + "grad_norm": 0.3661869168281555, + "grad_norm_var": 0.001638684958712311, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.787099599838257, + "loss/hidden": 0.0, + "loss/logits": 0.18104476854205132, + "loss/reg": 1.1325634717941284, + "step": 1509 + }, + { + "epoch": 0.0151, + "grad_norm": 0.5948194265365601, + "grad_norm_var": 0.00362709916255467, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.9607895612716675, + "loss/hidden": 0.0, + "loss/logits": 0.20714980363845825, + "loss/reg": 1.1316790580749512, + "step": 1510 + }, + { + "epoch": 0.01511, + "grad_norm": 0.41173163056373596, + "grad_norm_var": 0.003578473170561654, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.8691484332084656, + "loss/hidden": 0.0, + "loss/logits": 0.18565627187490463, + "loss/reg": 1.1312055587768555, + "step": 1511 + }, + { + "epoch": 0.01512, + "grad_norm": 0.4034125506877899, + "grad_norm_var": 0.003592262118630174, + "learning_rate": 5e-05, + "loss": 0.1852, + "loss/crossentropy": 2.8478543758392334, + "loss/hidden": 0.0, + "loss/logits": 0.18515947088599205, + "loss/reg": 1.1307348012924194, + "step": 1512 + }, + { + "epoch": 0.01513, + "grad_norm": 0.3978036046028137, + "grad_norm_var": 0.003604153126838305, + "learning_rate": 5e-05, + "loss": 0.1859, + "loss/crossentropy": 2.8709346055984497, + "loss/hidden": 0.0, + "loss/logits": 0.1858948990702629, + "loss/reg": 1.1300814151763916, + "step": 1513 + }, + { + "epoch": 0.01514, + "grad_norm": 0.3758530914783478, + "grad_norm_var": 0.00306556512898114, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.767078220844269, + "loss/hidden": 0.0, + "loss/logits": 0.18287447094917297, + "loss/reg": 1.1289632320404053, + "step": 1514 + }, + { + "epoch": 0.01515, + "grad_norm": 0.37056660652160645, + "grad_norm_var": 0.0031461246167071356, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.8232129216194153, + "loss/hidden": 0.0, + "loss/logits": 0.1713615171611309, + "loss/reg": 1.1283029317855835, + "step": 1515 + }, + { + "epoch": 0.01516, + "grad_norm": 0.4838574230670929, + "grad_norm_var": 0.0034564083210849917, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.7439082264900208, + "loss/hidden": 0.0, + "loss/logits": 0.20981475710868835, + "loss/reg": 1.1274932622909546, + "step": 1516 + }, + { + "epoch": 0.01517, + "grad_norm": 0.4795788526535034, + "grad_norm_var": 0.003658104504286123, + "learning_rate": 5e-05, + "loss": 0.222, + "loss/crossentropy": 2.814515173435211, + "loss/hidden": 0.0, + "loss/logits": 0.22201483324170113, + "loss/reg": 1.126589298248291, + "step": 1517 + }, + { + "epoch": 0.01518, + "grad_norm": 0.43272629380226135, + "grad_norm_var": 0.003560685680539777, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.6954237818717957, + "loss/hidden": 0.0, + "loss/logits": 0.1866161711513996, + "loss/reg": 1.1255377531051636, + "step": 1518 + }, + { + "epoch": 0.01519, + "grad_norm": 0.44345027208328247, + "grad_norm_var": 0.0034122455072800612, + "learning_rate": 5e-05, + "loss": 0.187, + "loss/crossentropy": 2.97675484418869, + "loss/hidden": 0.0, + "loss/logits": 0.18700629100203514, + "loss/reg": 1.1250556707382202, + "step": 1519 + }, + { + "epoch": 0.0152, + "grad_norm": 0.37308746576309204, + "grad_norm_var": 0.003401874892602097, + "learning_rate": 5e-05, + "loss": 0.1712, + "loss/crossentropy": 2.7208551168441772, + "loss/hidden": 0.0, + "loss/logits": 0.17121143266558647, + "loss/reg": 1.1241092681884766, + "step": 1520 + }, + { + "epoch": 0.01521, + "grad_norm": 0.38577350974082947, + "grad_norm_var": 0.003505989678647703, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.7316006422042847, + "loss/hidden": 0.0, + "loss/logits": 0.19670463353395462, + "loss/reg": 1.1233248710632324, + "step": 1521 + }, + { + "epoch": 0.01522, + "grad_norm": 0.4153127372264862, + "grad_norm_var": 0.0034466381379363717, + "learning_rate": 5e-05, + "loss": 0.1854, + "loss/crossentropy": 2.852459490299225, + "loss/hidden": 0.0, + "loss/logits": 0.18538159132003784, + "loss/reg": 1.122011661529541, + "step": 1522 + }, + { + "epoch": 0.01523, + "grad_norm": 0.3831879794597626, + "grad_norm_var": 0.0035558519997709315, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.785507321357727, + "loss/hidden": 0.0, + "loss/logits": 0.1772114671766758, + "loss/reg": 1.1209535598754883, + "step": 1523 + }, + { + "epoch": 0.01524, + "grad_norm": 0.47046220302581787, + "grad_norm_var": 0.0036022759188557058, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.862886071205139, + "loss/hidden": 0.0, + "loss/logits": 0.2032475359737873, + "loss/reg": 1.119650959968567, + "step": 1524 + }, + { + "epoch": 0.01525, + "grad_norm": 0.4245082437992096, + "grad_norm_var": 0.0033634452527405304, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.7247531414031982, + "loss/hidden": 0.0, + "loss/logits": 0.20639275014400482, + "loss/reg": 1.1184643507003784, + "step": 1525 + }, + { + "epoch": 0.01526, + "grad_norm": 0.39092564582824707, + "grad_norm_var": 0.0014234374246410498, + "learning_rate": 5e-05, + "loss": 0.1809, + "loss/crossentropy": 2.6980656385421753, + "loss/hidden": 0.0, + "loss/logits": 0.18094860762357712, + "loss/reg": 1.1170532703399658, + "step": 1526 + }, + { + "epoch": 0.01527, + "grad_norm": 0.46039342880249023, + "grad_norm_var": 0.0015493220358197223, + "learning_rate": 5e-05, + "loss": 0.1912, + "loss/crossentropy": 2.7977852821350098, + "loss/hidden": 0.0, + "loss/logits": 0.1911815144121647, + "loss/reg": 1.1160552501678467, + "step": 1527 + }, + { + "epoch": 0.01528, + "grad_norm": 0.5170849561691284, + "grad_norm_var": 0.002133071464395893, + "learning_rate": 5e-05, + "loss": 0.2056, + "loss/crossentropy": 2.8577861189842224, + "loss/hidden": 0.0, + "loss/logits": 0.2055775336921215, + "loss/reg": 1.1147196292877197, + "step": 1528 + }, + { + "epoch": 0.01529, + "grad_norm": 0.40889468789100647, + "grad_norm_var": 0.0021001187915984892, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 2.659440815448761, + "loss/hidden": 0.0, + "loss/logits": 0.1939220428466797, + "loss/reg": 1.112278938293457, + "step": 1529 + }, + { + "epoch": 0.0153, + "grad_norm": 0.4347395598888397, + "grad_norm_var": 0.0019232800669717184, + "learning_rate": 5e-05, + "loss": 0.1986, + "loss/crossentropy": 2.8192054629325867, + "loss/hidden": 0.0, + "loss/logits": 0.1986180804669857, + "loss/reg": 1.1106244325637817, + "step": 1530 + }, + { + "epoch": 0.01531, + "grad_norm": 0.3992787003517151, + "grad_norm_var": 0.0017485805047590564, + "learning_rate": 5e-05, + "loss": 0.197, + "loss/crossentropy": 2.817795991897583, + "loss/hidden": 0.0, + "loss/logits": 0.19699057191610336, + "loss/reg": 1.1089487075805664, + "step": 1531 + }, + { + "epoch": 0.01532, + "grad_norm": 0.4026416540145874, + "grad_norm_var": 0.0015933646211622964, + "learning_rate": 5e-05, + "loss": 0.1914, + "loss/crossentropy": 2.862860321998596, + "loss/hidden": 0.0, + "loss/logits": 0.19142070785164833, + "loss/reg": 1.107688307762146, + "step": 1532 + }, + { + "epoch": 0.01533, + "grad_norm": 0.3913553059101105, + "grad_norm_var": 0.0014540163735419368, + "learning_rate": 5e-05, + "loss": 0.183, + "loss/crossentropy": 2.7344642281532288, + "loss/hidden": 0.0, + "loss/logits": 0.1830131895840168, + "loss/reg": 1.1065510511398315, + "step": 1533 + }, + { + "epoch": 0.01534, + "grad_norm": 0.5044860243797302, + "grad_norm_var": 0.0018893563688275254, + "learning_rate": 5e-05, + "loss": 0.2228, + "loss/crossentropy": 2.7584890127182007, + "loss/hidden": 0.0, + "loss/logits": 0.22278590872883797, + "loss/reg": 1.1057510375976562, + "step": 1534 + }, + { + "epoch": 0.01535, + "grad_norm": 0.4163862466812134, + "grad_norm_var": 0.0018698157391243955, + "learning_rate": 5e-05, + "loss": 0.2103, + "loss/crossentropy": 2.730086088180542, + "loss/hidden": 0.0, + "loss/logits": 0.21033897623419762, + "loss/reg": 1.1047923564910889, + "step": 1535 + }, + { + "epoch": 0.01536, + "grad_norm": 0.49695706367492676, + "grad_norm_var": 0.00199358529955254, + "learning_rate": 5e-05, + "loss": 0.2009, + "loss/crossentropy": 2.966780424118042, + "loss/hidden": 0.0, + "loss/logits": 0.20090845972299576, + "loss/reg": 1.1038782596588135, + "step": 1536 + }, + { + "epoch": 0.01537, + "grad_norm": 0.5590111613273621, + "grad_norm_var": 0.002815411142489177, + "learning_rate": 5e-05, + "loss": 0.2021, + "loss/crossentropy": 2.7672842741012573, + "loss/hidden": 0.0, + "loss/logits": 0.20211173966526985, + "loss/reg": 1.101956844329834, + "step": 1537 + }, + { + "epoch": 0.01538, + "grad_norm": 0.41098716855049133, + "grad_norm_var": 0.0028321029196908634, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.83419930934906, + "loss/hidden": 0.0, + "loss/logits": 0.17854700610041618, + "loss/reg": 1.100677490234375, + "step": 1538 + }, + { + "epoch": 0.01539, + "grad_norm": 0.5386168956756592, + "grad_norm_var": 0.0031240819845108885, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.6488757133483887, + "loss/hidden": 0.0, + "loss/logits": 0.21346936747431755, + "loss/reg": 1.0994669198989868, + "step": 1539 + }, + { + "epoch": 0.0154, + "grad_norm": 1.3307965993881226, + "grad_norm_var": 0.051540649597419906, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.665076494216919, + "loss/hidden": 0.0, + "loss/logits": 0.19429544359445572, + "loss/reg": 1.0974620580673218, + "step": 1540 + }, + { + "epoch": 0.01541, + "grad_norm": 0.4543551504611969, + "grad_norm_var": 0.05127424614667148, + "learning_rate": 5e-05, + "loss": 0.2127, + "loss/crossentropy": 2.8328760862350464, + "loss/hidden": 0.0, + "loss/logits": 0.2126643992960453, + "loss/reg": 1.0955594778060913, + "step": 1541 + }, + { + "epoch": 0.01542, + "grad_norm": 0.45249900221824646, + "grad_norm_var": 0.05055573652069357, + "learning_rate": 5e-05, + "loss": 0.2003, + "loss/crossentropy": 2.802838683128357, + "loss/hidden": 0.0, + "loss/logits": 0.20028693228960037, + "loss/reg": 1.0934381484985352, + "step": 1542 + }, + { + "epoch": 0.01543, + "grad_norm": 0.4271480143070221, + "grad_norm_var": 0.0508498280485958, + "learning_rate": 5e-05, + "loss": 0.1916, + "loss/crossentropy": 2.7794657349586487, + "loss/hidden": 0.0, + "loss/logits": 0.19155221432447433, + "loss/reg": 1.09139084815979, + "step": 1543 + }, + { + "epoch": 0.01544, + "grad_norm": 0.4178108870983124, + "grad_norm_var": 0.051359794317072924, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.984102964401245, + "loss/hidden": 0.0, + "loss/logits": 0.18070745840668678, + "loss/reg": 1.0887871980667114, + "step": 1544 + }, + { + "epoch": 0.01545, + "grad_norm": 0.43329110741615295, + "grad_norm_var": 0.05109129627541022, + "learning_rate": 5e-05, + "loss": 0.1711, + "loss/crossentropy": 2.7687519192695618, + "loss/hidden": 0.0, + "loss/logits": 0.17112785205245018, + "loss/reg": 1.0872141122817993, + "step": 1545 + }, + { + "epoch": 0.01546, + "grad_norm": 0.4670257866382599, + "grad_norm_var": 0.050856580550541, + "learning_rate": 5e-05, + "loss": 0.1948, + "loss/crossentropy": 2.6094987988471985, + "loss/hidden": 0.0, + "loss/logits": 0.19480736926198006, + "loss/reg": 1.0858657360076904, + "step": 1546 + }, + { + "epoch": 0.01547, + "grad_norm": 0.40323877334594727, + "grad_norm_var": 0.05080099145439707, + "learning_rate": 5e-05, + "loss": 0.1918, + "loss/crossentropy": 2.9429529309272766, + "loss/hidden": 0.0, + "loss/logits": 0.1918117143213749, + "loss/reg": 1.0839253664016724, + "step": 1547 + }, + { + "epoch": 0.01548, + "grad_norm": 0.37677061557769775, + "grad_norm_var": 0.05120164181760689, + "learning_rate": 5e-05, + "loss": 0.184, + "loss/crossentropy": 2.781135082244873, + "loss/hidden": 0.0, + "loss/logits": 0.18400835990905762, + "loss/reg": 1.0825845003128052, + "step": 1548 + }, + { + "epoch": 0.01549, + "grad_norm": 0.4067257046699524, + "grad_norm_var": 0.05098341124146938, + "learning_rate": 5e-05, + "loss": 0.1959, + "loss/crossentropy": 3.0511435866355896, + "loss/hidden": 0.0, + "loss/logits": 0.195915374904871, + "loss/reg": 1.0807474851608276, + "step": 1549 + }, + { + "epoch": 0.0155, + "grad_norm": 0.41131263971328735, + "grad_norm_var": 0.05154488197435978, + "learning_rate": 5e-05, + "loss": 0.2111, + "loss/crossentropy": 2.7555925250053406, + "loss/hidden": 0.0, + "loss/logits": 0.21109894663095474, + "loss/reg": 1.079037070274353, + "step": 1550 + }, + { + "epoch": 0.01551, + "grad_norm": 0.395988792181015, + "grad_norm_var": 0.05179878503089578, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.9266753792762756, + "loss/hidden": 0.0, + "loss/logits": 0.2093418724834919, + "loss/reg": 1.0777536630630493, + "step": 1551 + }, + { + "epoch": 0.01552, + "grad_norm": 0.377541184425354, + "grad_norm_var": 0.0527211149077442, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.864599108695984, + "loss/hidden": 0.0, + "loss/logits": 0.17924968153238297, + "loss/reg": 1.0760817527770996, + "step": 1552 + }, + { + "epoch": 0.01553, + "grad_norm": 0.3881519138813019, + "grad_norm_var": 0.05300642886035351, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.7285693287849426, + "loss/hidden": 0.0, + "loss/logits": 0.18472957983613014, + "loss/reg": 1.0742844343185425, + "step": 1553 + }, + { + "epoch": 0.01554, + "grad_norm": 0.706222653388977, + "grad_norm_var": 0.05570734295763023, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.935713231563568, + "loss/hidden": 0.0, + "loss/logits": 0.21790685132145882, + "loss/reg": 1.0728493928909302, + "step": 1554 + }, + { + "epoch": 0.01555, + "grad_norm": 0.43061429262161255, + "grad_norm_var": 0.055869027275741875, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.8454378843307495, + "loss/hidden": 0.0, + "loss/logits": 0.1798604428768158, + "loss/reg": 1.0717158317565918, + "step": 1555 + }, + { + "epoch": 0.01556, + "grad_norm": 0.48051029443740845, + "grad_norm_var": 0.006013161612129384, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.836775243282318, + "loss/hidden": 0.0, + "loss/logits": 0.19637484848499298, + "loss/reg": 1.0707546472549438, + "step": 1556 + }, + { + "epoch": 0.01557, + "grad_norm": 0.4592551290988922, + "grad_norm_var": 0.006024481601364788, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.8153125047683716, + "loss/hidden": 0.0, + "loss/logits": 0.1982250064611435, + "loss/reg": 1.0695213079452515, + "step": 1557 + }, + { + "epoch": 0.01558, + "grad_norm": 0.4210123121738434, + "grad_norm_var": 0.006032424887954294, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.8357452750205994, + "loss/hidden": 0.0, + "loss/logits": 0.1964869536459446, + "loss/reg": 1.0685877799987793, + "step": 1558 + }, + { + "epoch": 0.01559, + "grad_norm": 0.4791198968887329, + "grad_norm_var": 0.00612837245298895, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.9406415820121765, + "loss/hidden": 0.0, + "loss/logits": 0.20718776062130928, + "loss/reg": 1.0677986145019531, + "step": 1559 + }, + { + "epoch": 0.0156, + "grad_norm": 0.401761531829834, + "grad_norm_var": 0.006193905709313973, + "learning_rate": 5e-05, + "loss": 0.1813, + "loss/crossentropy": 2.733909487724304, + "loss/hidden": 0.0, + "loss/logits": 0.18126443028450012, + "loss/reg": 1.0671542882919312, + "step": 1560 + }, + { + "epoch": 0.01561, + "grad_norm": 0.37871912121772766, + "grad_norm_var": 0.006428189979391045, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.821140468120575, + "loss/hidden": 0.0, + "loss/logits": 0.18263829499483109, + "loss/reg": 1.0664088726043701, + "step": 1561 + }, + { + "epoch": 0.01562, + "grad_norm": 0.40654441714286804, + "grad_norm_var": 0.0064106344187308505, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.847402274608612, + "loss/hidden": 0.0, + "loss/logits": 0.19583850726485252, + "loss/reg": 1.0656745433807373, + "step": 1562 + }, + { + "epoch": 0.01563, + "grad_norm": 0.39909881353378296, + "grad_norm_var": 0.006427978041990023, + "learning_rate": 5e-05, + "loss": 0.187, + "loss/crossentropy": 2.8001545667648315, + "loss/hidden": 0.0, + "loss/logits": 0.1870410367846489, + "loss/reg": 1.0649704933166504, + "step": 1563 + }, + { + "epoch": 0.01564, + "grad_norm": 0.44327160716056824, + "grad_norm_var": 0.006210596260875845, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.895441710948944, + "loss/hidden": 0.0, + "loss/logits": 0.19354696199297905, + "loss/reg": 1.0645458698272705, + "step": 1564 + }, + { + "epoch": 0.01565, + "grad_norm": 0.43015867471694946, + "grad_norm_var": 0.006151527259060663, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.6846776008605957, + "loss/hidden": 0.0, + "loss/logits": 0.20071979612112045, + "loss/reg": 1.0638952255249023, + "step": 1565 + }, + { + "epoch": 0.01566, + "grad_norm": 0.4013887643814087, + "grad_norm_var": 0.006193100862394488, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.6647936701774597, + "loss/hidden": 0.0, + "loss/logits": 0.19105948507785797, + "loss/reg": 1.063437819480896, + "step": 1566 + }, + { + "epoch": 0.01567, + "grad_norm": 0.3776607811450958, + "grad_norm_var": 0.00631544015384116, + "learning_rate": 5e-05, + "loss": 0.1794, + "loss/crossentropy": 2.7828028202056885, + "loss/hidden": 0.0, + "loss/logits": 0.17943480983376503, + "loss/reg": 1.0625545978546143, + "step": 1567 + }, + { + "epoch": 0.01568, + "grad_norm": 0.4237598478794098, + "grad_norm_var": 0.006086760813740237, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.6718530654907227, + "loss/hidden": 0.0, + "loss/logits": 0.20108015462756157, + "loss/reg": 1.0613658428192139, + "step": 1568 + }, + { + "epoch": 0.01569, + "grad_norm": 0.3995186686515808, + "grad_norm_var": 0.006017464457152703, + "learning_rate": 5e-05, + "loss": 0.1917, + "loss/crossentropy": 2.5245614051818848, + "loss/hidden": 0.0, + "loss/logits": 0.1916515864431858, + "loss/reg": 1.0607917308807373, + "step": 1569 + }, + { + "epoch": 0.0157, + "grad_norm": 0.38678285479545593, + "grad_norm_var": 0.0010524458516556479, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.8043148517608643, + "loss/hidden": 0.0, + "loss/logits": 0.18437979742884636, + "loss/reg": 1.0602530241012573, + "step": 1570 + }, + { + "epoch": 0.01571, + "grad_norm": 0.40594568848609924, + "grad_norm_var": 0.0010553984485499417, + "learning_rate": 5e-05, + "loss": 0.1904, + "loss/crossentropy": 2.783930778503418, + "loss/hidden": 0.0, + "loss/logits": 0.19035086408257484, + "loss/reg": 1.0592772960662842, + "step": 1571 + }, + { + "epoch": 0.01572, + "grad_norm": 0.4098511040210724, + "grad_norm_var": 0.0007823522709671089, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.5536091327667236, + "loss/hidden": 0.0, + "loss/logits": 0.20764099434018135, + "loss/reg": 1.0582836866378784, + "step": 1572 + }, + { + "epoch": 0.01573, + "grad_norm": 2.0963947772979736, + "grad_norm_var": 0.17817707747996261, + "learning_rate": 5e-05, + "loss": 0.2349, + "loss/crossentropy": 2.964250385761261, + "loss/hidden": 0.0, + "loss/logits": 0.234896432608366, + "loss/reg": 1.0572166442871094, + "step": 1573 + }, + { + "epoch": 0.01574, + "grad_norm": 0.45353516936302185, + "grad_norm_var": 0.17782993109395742, + "learning_rate": 5e-05, + "loss": 0.1932, + "loss/crossentropy": 2.8702264428138733, + "loss/hidden": 0.0, + "loss/logits": 0.19320252165198326, + "loss/reg": 1.056014060974121, + "step": 1574 + }, + { + "epoch": 0.01575, + "grad_norm": 0.44855397939682007, + "grad_norm_var": 0.17804818136024572, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.949871063232422, + "loss/hidden": 0.0, + "loss/logits": 0.1982114426791668, + "loss/reg": 1.0550678968429565, + "step": 1575 + }, + { + "epoch": 0.01576, + "grad_norm": 0.4332554340362549, + "grad_norm_var": 0.17762864137172196, + "learning_rate": 5e-05, + "loss": 0.191, + "loss/crossentropy": 2.7771596908569336, + "loss/hidden": 0.0, + "loss/logits": 0.191011194139719, + "loss/reg": 1.05426025390625, + "step": 1576 + }, + { + "epoch": 0.01577, + "grad_norm": 0.38208287954330444, + "grad_norm_var": 0.17756670040897735, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.774085760116577, + "loss/hidden": 0.0, + "loss/logits": 0.17731792107224464, + "loss/reg": 1.0530881881713867, + "step": 1577 + }, + { + "epoch": 0.01578, + "grad_norm": 0.4420539140701294, + "grad_norm_var": 0.17711490965873147, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.8334729075431824, + "loss/hidden": 0.0, + "loss/logits": 0.20999984815716743, + "loss/reg": 1.052188515663147, + "step": 1578 + }, + { + "epoch": 0.01579, + "grad_norm": 0.42621752619743347, + "grad_norm_var": 0.17672070717077115, + "learning_rate": 5e-05, + "loss": 0.1972, + "loss/crossentropy": 2.653954863548279, + "loss/hidden": 0.0, + "loss/logits": 0.1972290314733982, + "loss/reg": 1.051479697227478, + "step": 1579 + }, + { + "epoch": 0.0158, + "grad_norm": 0.4033251404762268, + "grad_norm_var": 0.17724256929511586, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.7112594842910767, + "loss/hidden": 0.0, + "loss/logits": 0.19781038537621498, + "loss/reg": 1.0506196022033691, + "step": 1580 + }, + { + "epoch": 0.01581, + "grad_norm": 0.39284470677375793, + "grad_norm_var": 0.1777767191095864, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.7284326553344727, + "loss/hidden": 0.0, + "loss/logits": 0.18470782414078712, + "loss/reg": 1.0500985383987427, + "step": 1581 + }, + { + "epoch": 0.01582, + "grad_norm": 0.37770310044288635, + "grad_norm_var": 0.17817909777804894, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.775408685207367, + "loss/hidden": 0.0, + "loss/logits": 0.19050265476107597, + "loss/reg": 1.0495445728302002, + "step": 1582 + }, + { + "epoch": 0.01583, + "grad_norm": 0.4136863946914673, + "grad_norm_var": 0.17759466596601922, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.937465488910675, + "loss/hidden": 0.0, + "loss/logits": 0.18836676329374313, + "loss/reg": 1.0486500263214111, + "step": 1583 + }, + { + "epoch": 0.01584, + "grad_norm": 0.6148908138275146, + "grad_norm_var": 0.17746426467375553, + "learning_rate": 5e-05, + "loss": 0.1929, + "loss/crossentropy": 2.843154191970825, + "loss/hidden": 0.0, + "loss/logits": 0.1928807981312275, + "loss/reg": 1.0476218461990356, + "step": 1584 + }, + { + "epoch": 0.01585, + "grad_norm": 0.44110244512557983, + "grad_norm_var": 0.17684658441090229, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.7909162044525146, + "loss/hidden": 0.0, + "loss/logits": 0.2037404477596283, + "loss/reg": 1.0467737913131714, + "step": 1585 + }, + { + "epoch": 0.01586, + "grad_norm": 0.397225022315979, + "grad_norm_var": 0.17664980315666612, + "learning_rate": 5e-05, + "loss": 0.1753, + "loss/crossentropy": 2.7290444374084473, + "loss/hidden": 0.0, + "loss/logits": 0.17533109337091446, + "loss/reg": 1.0455046892166138, + "step": 1586 + }, + { + "epoch": 0.01587, + "grad_norm": 0.43149325251579285, + "grad_norm_var": 0.17625553391335924, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.9028236269950867, + "loss/hidden": 0.0, + "loss/logits": 0.18315931409597397, + "loss/reg": 1.0444459915161133, + "step": 1587 + }, + { + "epoch": 0.01588, + "grad_norm": 0.5195469260215759, + "grad_norm_var": 0.17517331211055512, + "learning_rate": 5e-05, + "loss": 0.2144, + "loss/crossentropy": 2.842075824737549, + "loss/hidden": 0.0, + "loss/logits": 0.21435903385281563, + "loss/reg": 1.043382167816162, + "step": 1588 + }, + { + "epoch": 0.01589, + "grad_norm": 0.39358392357826233, + "grad_norm_var": 0.0035111967362226287, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.796789824962616, + "loss/hidden": 0.0, + "loss/logits": 0.17651647701859474, + "loss/reg": 1.042464256286621, + "step": 1589 + }, + { + "epoch": 0.0159, + "grad_norm": 0.38947010040283203, + "grad_norm_var": 0.00361531631975656, + "learning_rate": 5e-05, + "loss": 0.1797, + "loss/crossentropy": 2.7255443334579468, + "loss/hidden": 0.0, + "loss/logits": 0.17971770092844963, + "loss/reg": 1.0416078567504883, + "step": 1590 + }, + { + "epoch": 0.01591, + "grad_norm": 0.3702358901500702, + "grad_norm_var": 0.003822570496432457, + "learning_rate": 5e-05, + "loss": 0.1738, + "loss/crossentropy": 2.6745933890342712, + "loss/hidden": 0.0, + "loss/logits": 0.17375827953219414, + "loss/reg": 1.0408596992492676, + "step": 1591 + }, + { + "epoch": 0.01592, + "grad_norm": 0.4127400815486908, + "grad_norm_var": 0.0038312033002130524, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.825109601020813, + "loss/hidden": 0.0, + "loss/logits": 0.1792273335158825, + "loss/reg": 1.0395222902297974, + "step": 1592 + }, + { + "epoch": 0.01593, + "grad_norm": 0.44498032331466675, + "grad_norm_var": 0.00371424276920929, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.847081184387207, + "loss/hidden": 0.0, + "loss/logits": 0.19815902039408684, + "loss/reg": 1.038297176361084, + "step": 1593 + }, + { + "epoch": 0.01594, + "grad_norm": 0.3808656930923462, + "grad_norm_var": 0.0038453633106530346, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.8437030911445618, + "loss/hidden": 0.0, + "loss/logits": 0.18859212845563889, + "loss/reg": 1.037431240081787, + "step": 1594 + }, + { + "epoch": 0.01595, + "grad_norm": 0.4359935522079468, + "grad_norm_var": 0.0038521160414555075, + "learning_rate": 5e-05, + "loss": 0.2084, + "loss/crossentropy": 2.7890624403953552, + "loss/hidden": 0.0, + "loss/logits": 0.20844709128141403, + "loss/reg": 1.035805344581604, + "step": 1595 + }, + { + "epoch": 0.01596, + "grad_norm": 0.3994602560997009, + "grad_norm_var": 0.0038648531464093713, + "learning_rate": 5e-05, + "loss": 0.1794, + "loss/crossentropy": 2.976751923561096, + "loss/hidden": 0.0, + "loss/logits": 0.17941803485155106, + "loss/reg": 1.034838080406189, + "step": 1596 + }, + { + "epoch": 0.01597, + "grad_norm": 0.4446309208869934, + "grad_norm_var": 0.0038036113876331397, + "learning_rate": 5e-05, + "loss": 0.2044, + "loss/crossentropy": 2.6885805130004883, + "loss/hidden": 0.0, + "loss/logits": 0.20441588386893272, + "loss/reg": 1.0337451696395874, + "step": 1597 + }, + { + "epoch": 0.01598, + "grad_norm": 0.38748055696487427, + "grad_norm_var": 0.0037424185106978165, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 2.676852226257324, + "loss/hidden": 0.0, + "loss/logits": 0.18669695407152176, + "loss/reg": 1.0320935249328613, + "step": 1598 + }, + { + "epoch": 0.01599, + "grad_norm": 0.3517172932624817, + "grad_norm_var": 0.004115871219999908, + "learning_rate": 5e-05, + "loss": 0.1697, + "loss/crossentropy": 2.7314975261688232, + "loss/hidden": 0.0, + "loss/logits": 0.16971135139465332, + "loss/reg": 1.03118097782135, + "step": 1599 + }, + { + "epoch": 0.016, + "grad_norm": 0.3928627669811249, + "grad_norm_var": 0.001603946516322626, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.722111701965332, + "loss/hidden": 0.0, + "loss/logits": 0.1881674863398075, + "loss/reg": 1.0301860570907593, + "step": 1600 + }, + { + "epoch": 0.01601, + "grad_norm": 0.43568360805511475, + "grad_norm_var": 0.0015848176222982625, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.847353994846344, + "loss/hidden": 0.0, + "loss/logits": 0.20354657620191574, + "loss/reg": 1.0290837287902832, + "step": 1601 + }, + { + "epoch": 0.01602, + "grad_norm": 0.4481942355632782, + "grad_norm_var": 0.0016484863625255031, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.8443565368652344, + "loss/hidden": 0.0, + "loss/logits": 0.19950192049145699, + "loss/reg": 1.0280483961105347, + "step": 1602 + }, + { + "epoch": 0.01603, + "grad_norm": 0.375472754240036, + "grad_norm_var": 0.0017209400432947792, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.907568633556366, + "loss/hidden": 0.0, + "loss/logits": 0.18281982839107513, + "loss/reg": 1.0267391204833984, + "step": 1603 + }, + { + "epoch": 0.01604, + "grad_norm": 0.43082571029663086, + "grad_norm_var": 0.000933965990925841, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.843041956424713, + "loss/hidden": 0.0, + "loss/logits": 0.19929581135511398, + "loss/reg": 1.0255545377731323, + "step": 1604 + }, + { + "epoch": 0.01605, + "grad_norm": 0.46895185112953186, + "grad_norm_var": 0.0011653483970377318, + "learning_rate": 5e-05, + "loss": 0.1959, + "loss/crossentropy": 2.8054389357566833, + "loss/hidden": 0.0, + "loss/logits": 0.1959143802523613, + "loss/reg": 1.024338722229004, + "step": 1605 + }, + { + "epoch": 0.01606, + "grad_norm": 0.40604454278945923, + "grad_norm_var": 0.0011358271508217518, + "learning_rate": 5e-05, + "loss": 0.1909, + "loss/crossentropy": 2.6769182085990906, + "loss/hidden": 0.0, + "loss/logits": 0.19093864411115646, + "loss/reg": 1.0232833623886108, + "step": 1606 + }, + { + "epoch": 0.01607, + "grad_norm": 0.3953154683113098, + "grad_norm_var": 0.0010367066058140713, + "learning_rate": 5e-05, + "loss": 0.1903, + "loss/crossentropy": 2.784206509590149, + "loss/hidden": 0.0, + "loss/logits": 0.1903013363480568, + "loss/reg": 1.0223585367202759, + "step": 1607 + }, + { + "epoch": 0.01608, + "grad_norm": 0.4797690510749817, + "grad_norm_var": 0.0013133904404337778, + "learning_rate": 5e-05, + "loss": 0.2239, + "loss/crossentropy": 2.7858375906944275, + "loss/hidden": 0.0, + "loss/logits": 0.2238890789449215, + "loss/reg": 1.0218397378921509, + "step": 1608 + }, + { + "epoch": 0.01609, + "grad_norm": 0.39962831139564514, + "grad_norm_var": 0.0012751071067100051, + "learning_rate": 5e-05, + "loss": 0.188, + "loss/crossentropy": 2.769327938556671, + "loss/hidden": 0.0, + "loss/logits": 0.1880376748740673, + "loss/reg": 1.0211185216903687, + "step": 1609 + }, + { + "epoch": 0.0161, + "grad_norm": 0.4070293605327606, + "grad_norm_var": 0.0012003623105043696, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.886406660079956, + "loss/hidden": 0.0, + "loss/logits": 0.1911228708922863, + "loss/reg": 1.0199602842330933, + "step": 1610 + }, + { + "epoch": 0.01611, + "grad_norm": 0.3688849210739136, + "grad_norm_var": 0.001304648081967604, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.839527428150177, + "loss/hidden": 0.0, + "loss/logits": 0.188365176320076, + "loss/reg": 1.0188500881195068, + "step": 1611 + }, + { + "epoch": 0.01612, + "grad_norm": 0.3744886815547943, + "grad_norm_var": 0.001385363352798156, + "learning_rate": 5e-05, + "loss": 0.1894, + "loss/crossentropy": 2.902321934700012, + "loss/hidden": 0.0, + "loss/logits": 0.18943318352103233, + "loss/reg": 1.017642617225647, + "step": 1612 + }, + { + "epoch": 0.01613, + "grad_norm": 0.39889490604400635, + "grad_norm_var": 0.0013075760766252942, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.8871779441833496, + "loss/hidden": 0.0, + "loss/logits": 0.20626594126224518, + "loss/reg": 1.016883134841919, + "step": 1613 + }, + { + "epoch": 0.01614, + "grad_norm": 0.583963930606842, + "grad_norm_var": 0.00319393139732812, + "learning_rate": 5e-05, + "loss": 0.1922, + "loss/crossentropy": 2.8867177963256836, + "loss/hidden": 0.0, + "loss/logits": 0.19224493950605392, + "loss/reg": 1.0164117813110352, + "step": 1614 + }, + { + "epoch": 0.01615, + "grad_norm": 0.46384310722351074, + "grad_norm_var": 0.0029609833884467586, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.8498631715774536, + "loss/hidden": 0.0, + "loss/logits": 0.18652616068720818, + "loss/reg": 1.01522958278656, + "step": 1615 + }, + { + "epoch": 0.01616, + "grad_norm": 0.45459720492362976, + "grad_norm_var": 0.002919291729982876, + "learning_rate": 5e-05, + "loss": 0.1997, + "loss/crossentropy": 2.9225170612335205, + "loss/hidden": 0.0, + "loss/logits": 0.19967232644557953, + "loss/reg": 1.0138804912567139, + "step": 1616 + }, + { + "epoch": 0.01617, + "grad_norm": 0.4312615394592285, + "grad_norm_var": 0.0029175898021926217, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.8372015953063965, + "loss/hidden": 0.0, + "loss/logits": 0.1956445276737213, + "loss/reg": 1.0130113363265991, + "step": 1617 + }, + { + "epoch": 0.01618, + "grad_norm": 0.41878917813301086, + "grad_norm_var": 0.0029020530857978622, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.7289316654205322, + "loss/hidden": 0.0, + "loss/logits": 0.20003606751561165, + "loss/reg": 1.0116477012634277, + "step": 1618 + }, + { + "epoch": 0.01619, + "grad_norm": 0.3937804698944092, + "grad_norm_var": 0.002793291740125229, + "learning_rate": 5e-05, + "loss": 0.1809, + "loss/crossentropy": 2.9862471222877502, + "loss/hidden": 0.0, + "loss/logits": 0.18085769563913345, + "loss/reg": 1.0105339288711548, + "step": 1619 + }, + { + "epoch": 0.0162, + "grad_norm": 0.38236817717552185, + "grad_norm_var": 0.0029331274073497783, + "learning_rate": 5e-05, + "loss": 0.1781, + "loss/crossentropy": 2.660866141319275, + "loss/hidden": 0.0, + "loss/logits": 0.17813289538025856, + "loss/reg": 1.0092902183532715, + "step": 1620 + }, + { + "epoch": 0.01621, + "grad_norm": 2.7290685176849365, + "grad_norm_var": 0.3349158996765001, + "learning_rate": 5e-05, + "loss": 0.2398, + "loss/crossentropy": 2.8020981550216675, + "loss/hidden": 0.0, + "loss/logits": 0.2397916540503502, + "loss/reg": 1.0086039304733276, + "step": 1621 + }, + { + "epoch": 0.01622, + "grad_norm": 0.3994118869304657, + "grad_norm_var": 0.33506186009023053, + "learning_rate": 5e-05, + "loss": 0.18, + "loss/crossentropy": 2.760943591594696, + "loss/hidden": 0.0, + "loss/logits": 0.1800093874335289, + "loss/reg": 1.0072078704833984, + "step": 1622 + }, + { + "epoch": 0.01623, + "grad_norm": 0.5013747215270996, + "grad_norm_var": 0.3333290261657852, + "learning_rate": 5e-05, + "loss": 0.206, + "loss/crossentropy": 2.761034667491913, + "loss/hidden": 0.0, + "loss/logits": 0.20601341128349304, + "loss/reg": 1.0063005685806274, + "step": 1623 + }, + { + "epoch": 0.01624, + "grad_norm": 0.5265280604362488, + "grad_norm_var": 0.3328769613967595, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.8885090947151184, + "loss/hidden": 0.0, + "loss/logits": 0.18442480266094208, + "loss/reg": 1.0057168006896973, + "step": 1624 + }, + { + "epoch": 0.01625, + "grad_norm": 0.4721510708332062, + "grad_norm_var": 0.3314893959527416, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.6390693187713623, + "loss/hidden": 0.0, + "loss/logits": 0.19406652450561523, + "loss/reg": 1.0045299530029297, + "step": 1625 + }, + { + "epoch": 0.01626, + "grad_norm": 0.45044615864753723, + "grad_norm_var": 0.3305963341312829, + "learning_rate": 5e-05, + "loss": 0.1854, + "loss/crossentropy": 2.8466389775276184, + "loss/hidden": 0.0, + "loss/logits": 0.18537183478474617, + "loss/reg": 1.0036040544509888, + "step": 1626 + }, + { + "epoch": 0.01627, + "grad_norm": 0.4176648259162903, + "grad_norm_var": 0.3293435667823592, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.8753581643104553, + "loss/hidden": 0.0, + "loss/logits": 0.1729147806763649, + "loss/reg": 1.0026119947433472, + "step": 1627 + }, + { + "epoch": 0.01628, + "grad_norm": 0.45000559091567993, + "grad_norm_var": 0.32755605843470553, + "learning_rate": 5e-05, + "loss": 0.1877, + "loss/crossentropy": 2.8436471819877625, + "loss/hidden": 0.0, + "loss/logits": 0.18771208450198174, + "loss/reg": 1.0014100074768066, + "step": 1628 + }, + { + "epoch": 0.01629, + "grad_norm": 0.6647088527679443, + "grad_norm_var": 0.32512335965386835, + "learning_rate": 5e-05, + "loss": 0.247, + "loss/crossentropy": 3.064721703529358, + "loss/hidden": 0.0, + "loss/logits": 0.24700120091438293, + "loss/reg": 1.0006461143493652, + "step": 1629 + }, + { + "epoch": 0.0163, + "grad_norm": 0.4309985935688019, + "grad_norm_var": 0.3270912337702334, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.7790536284446716, + "loss/hidden": 0.0, + "loss/logits": 0.20761951059103012, + "loss/reg": 0.9996326565742493, + "step": 1630 + }, + { + "epoch": 0.01631, + "grad_norm": 0.4092963933944702, + "grad_norm_var": 0.32826153742197073, + "learning_rate": 5e-05, + "loss": 0.1858, + "loss/crossentropy": 2.753304898738861, + "loss/hidden": 0.0, + "loss/logits": 0.1858477033674717, + "loss/reg": 0.998843252658844, + "step": 1631 + }, + { + "epoch": 0.01632, + "grad_norm": 0.43260130286216736, + "grad_norm_var": 0.32870582994017805, + "learning_rate": 5e-05, + "loss": 0.2128, + "loss/crossentropy": 2.7732598185539246, + "loss/hidden": 0.0, + "loss/logits": 0.2127918191254139, + "loss/reg": 0.9981245994567871, + "step": 1632 + }, + { + "epoch": 0.01633, + "grad_norm": 0.4002577066421509, + "grad_norm_var": 0.3294403105987868, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.7656468749046326, + "loss/hidden": 0.0, + "loss/logits": 0.18241503462195396, + "loss/reg": 0.996983528137207, + "step": 1633 + }, + { + "epoch": 0.01634, + "grad_norm": 0.4860977530479431, + "grad_norm_var": 0.3281648073013982, + "learning_rate": 5e-05, + "loss": 0.2161, + "loss/crossentropy": 2.7581685185432434, + "loss/hidden": 0.0, + "loss/logits": 0.2160632722079754, + "loss/reg": 0.9963746666908264, + "step": 1634 + }, + { + "epoch": 0.01635, + "grad_norm": 0.7037771940231323, + "grad_norm_var": 0.3257848148583186, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.8083826303482056, + "loss/hidden": 0.0, + "loss/logits": 0.1846979483962059, + "loss/reg": 0.9959037899971008, + "step": 1635 + }, + { + "epoch": 0.01636, + "grad_norm": 0.3976283669471741, + "grad_norm_var": 0.3253239044098365, + "learning_rate": 5e-05, + "loss": 0.1766, + "loss/crossentropy": 2.87517386674881, + "loss/hidden": 0.0, + "loss/logits": 0.17655937373638153, + "loss/reg": 0.9958146214485168, + "step": 1636 + }, + { + "epoch": 0.01637, + "grad_norm": 0.40411144495010376, + "grad_norm_var": 0.008434168810514371, + "learning_rate": 5e-05, + "loss": 0.188, + "loss/crossentropy": 2.9585282802581787, + "loss/hidden": 0.0, + "loss/logits": 0.18801874667406082, + "loss/reg": 0.9954627156257629, + "step": 1637 + }, + { + "epoch": 0.01638, + "grad_norm": 0.47144263982772827, + "grad_norm_var": 0.008064267432894153, + "learning_rate": 5e-05, + "loss": 0.2147, + "loss/crossentropy": 2.8516178727149963, + "loss/hidden": 0.0, + "loss/logits": 0.21468612551689148, + "loss/reg": 0.9962711930274963, + "step": 1638 + }, + { + "epoch": 0.01639, + "grad_norm": 0.4136725068092346, + "grad_norm_var": 0.00825053359325154, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.710533022880554, + "loss/hidden": 0.0, + "loss/logits": 0.189330842345953, + "loss/reg": 0.9961523413658142, + "step": 1639 + }, + { + "epoch": 0.0164, + "grad_norm": 0.4156164824962616, + "grad_norm_var": 0.008193946810416774, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.6554945707321167, + "loss/hidden": 0.0, + "loss/logits": 0.206108208745718, + "loss/reg": 0.9952605962753296, + "step": 1640 + }, + { + "epoch": 0.01641, + "grad_norm": 0.5130243897438049, + "grad_norm_var": 0.008343982594008044, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.844529688358307, + "loss/hidden": 0.0, + "loss/logits": 0.2015930339694023, + "loss/reg": 0.9954249858856201, + "step": 1641 + }, + { + "epoch": 0.01642, + "grad_norm": 0.40717145800590515, + "grad_norm_var": 0.00855270077346663, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.8812676668167114, + "loss/hidden": 0.0, + "loss/logits": 0.18932242318987846, + "loss/reg": 0.9956634044647217, + "step": 1642 + }, + { + "epoch": 0.01643, + "grad_norm": 0.4588758051395416, + "grad_norm_var": 0.008406279557532174, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.959150493144989, + "loss/hidden": 0.0, + "loss/logits": 0.20350589975714684, + "loss/reg": 0.9954665899276733, + "step": 1643 + }, + { + "epoch": 0.01644, + "grad_norm": 0.41808342933654785, + "grad_norm_var": 0.00853891966052895, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.720207452774048, + "loss/hidden": 0.0, + "loss/logits": 0.17933955788612366, + "loss/reg": 0.9951499700546265, + "step": 1644 + }, + { + "epoch": 0.01645, + "grad_norm": 0.4281888008117676, + "grad_norm_var": 0.0057123534006301175, + "learning_rate": 5e-05, + "loss": 0.2003, + "loss/crossentropy": 2.7021594643592834, + "loss/hidden": 0.0, + "loss/logits": 0.20027698203921318, + "loss/reg": 0.9951516389846802, + "step": 1645 + }, + { + "epoch": 0.01646, + "grad_norm": 0.4419175088405609, + "grad_norm_var": 0.0056929746093528485, + "learning_rate": 5e-05, + "loss": 0.1888, + "loss/crossentropy": 2.7900770902633667, + "loss/hidden": 0.0, + "loss/logits": 0.18883102387189865, + "loss/reg": 0.9944278597831726, + "step": 1646 + }, + { + "epoch": 0.01647, + "grad_norm": 0.406677782535553, + "grad_norm_var": 0.005707653242778928, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.8403636813163757, + "loss/hidden": 0.0, + "loss/logits": 0.20182918012142181, + "loss/reg": 0.9936374425888062, + "step": 1647 + }, + { + "epoch": 0.01648, + "grad_norm": 0.39660346508026123, + "grad_norm_var": 0.005871895630400322, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.6434635519981384, + "loss/hidden": 0.0, + "loss/logits": 0.19490381330251694, + "loss/reg": 0.9928113222122192, + "step": 1648 + }, + { + "epoch": 0.01649, + "grad_norm": 0.38837823271751404, + "grad_norm_var": 0.005955855741034688, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.7492038011550903, + "loss/hidden": 0.0, + "loss/logits": 0.19599515572190285, + "loss/reg": 0.9914868474006653, + "step": 1649 + }, + { + "epoch": 0.0165, + "grad_norm": 0.39013776183128357, + "grad_norm_var": 0.006030547116352913, + "learning_rate": 5e-05, + "loss": 0.1913, + "loss/crossentropy": 2.8989550471305847, + "loss/hidden": 0.0, + "loss/logits": 0.19131910800933838, + "loss/reg": 0.9906550645828247, + "step": 1650 + }, + { + "epoch": 0.01651, + "grad_norm": 0.46770238876342773, + "grad_norm_var": 0.0012410480978832794, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 3.0211783051490784, + "loss/hidden": 0.0, + "loss/logits": 0.19395827502012253, + "loss/reg": 0.9903994798660278, + "step": 1651 + }, + { + "epoch": 0.01652, + "grad_norm": 0.4251278340816498, + "grad_norm_var": 0.0011835438271421083, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.7187276482582092, + "loss/hidden": 0.0, + "loss/logits": 0.1982722319662571, + "loss/reg": 0.9893469214439392, + "step": 1652 + }, + { + "epoch": 0.01653, + "grad_norm": 0.48329293727874756, + "grad_norm_var": 0.0013240331607979549, + "learning_rate": 5e-05, + "loss": 0.2075, + "loss/crossentropy": 2.614067316055298, + "loss/hidden": 0.0, + "loss/logits": 0.2074788697063923, + "loss/reg": 0.988538384437561, + "step": 1653 + }, + { + "epoch": 0.01654, + "grad_norm": 0.7802302241325378, + "grad_norm_var": 0.008871511150981197, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 2.801050305366516, + "loss/hidden": 0.0, + "loss/logits": 0.18674134090542793, + "loss/reg": 0.9879339933395386, + "step": 1654 + }, + { + "epoch": 0.01655, + "grad_norm": 0.5053502917289734, + "grad_norm_var": 0.008926244689548623, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.7182639241218567, + "loss/hidden": 0.0, + "loss/logits": 0.20081807672977448, + "loss/reg": 0.9869012236595154, + "step": 1655 + }, + { + "epoch": 0.01656, + "grad_norm": 0.47397318482398987, + "grad_norm_var": 0.00881009549445061, + "learning_rate": 5e-05, + "loss": 0.1863, + "loss/crossentropy": 3.0697373151779175, + "loss/hidden": 0.0, + "loss/logits": 0.18633192032575607, + "loss/reg": 0.9862724542617798, + "step": 1656 + }, + { + "epoch": 0.01657, + "grad_norm": 0.44242963194847107, + "grad_norm_var": 0.008637024175784314, + "learning_rate": 5e-05, + "loss": 0.191, + "loss/crossentropy": 2.746702015399933, + "loss/hidden": 0.0, + "loss/logits": 0.19096798822283745, + "loss/reg": 0.9855261445045471, + "step": 1657 + }, + { + "epoch": 0.01658, + "grad_norm": 0.4351765811443329, + "grad_norm_var": 0.008499481917788154, + "learning_rate": 5e-05, + "loss": 0.19, + "loss/crossentropy": 2.7543291449546814, + "loss/hidden": 0.0, + "loss/logits": 0.1900062970817089, + "loss/reg": 0.9842270016670227, + "step": 1658 + }, + { + "epoch": 0.01659, + "grad_norm": 0.4238717555999756, + "grad_norm_var": 0.008576100925275236, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.738615930080414, + "loss/hidden": 0.0, + "loss/logits": 0.1884973607957363, + "loss/reg": 0.9834668636322021, + "step": 1659 + }, + { + "epoch": 0.0166, + "grad_norm": 0.4111272394657135, + "grad_norm_var": 0.008614938397928507, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.6902480721473694, + "loss/hidden": 0.0, + "loss/logits": 0.19869986921548843, + "loss/reg": 0.9826908111572266, + "step": 1660 + }, + { + "epoch": 0.01661, + "grad_norm": 0.41095390915870667, + "grad_norm_var": 0.008698014381044639, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.7943766117095947, + "loss/hidden": 0.0, + "loss/logits": 0.18661296740174294, + "loss/reg": 0.9816150665283203, + "step": 1661 + }, + { + "epoch": 0.01662, + "grad_norm": 0.41411253809928894, + "grad_norm_var": 0.008795518968246081, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.859815239906311, + "loss/hidden": 0.0, + "loss/logits": 0.20330817252397537, + "loss/reg": 0.9813138246536255, + "step": 1662 + }, + { + "epoch": 0.01663, + "grad_norm": 0.4265795946121216, + "grad_norm_var": 0.0086961695125602, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.9196831583976746, + "loss/hidden": 0.0, + "loss/logits": 0.19435005262494087, + "loss/reg": 0.9806890487670898, + "step": 1663 + }, + { + "epoch": 0.01664, + "grad_norm": 0.4456392526626587, + "grad_norm_var": 0.008466672332987986, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.7248425483703613, + "loss/hidden": 0.0, + "loss/logits": 0.20353588461875916, + "loss/reg": 0.980077862739563, + "step": 1664 + }, + { + "epoch": 0.01665, + "grad_norm": 0.3930458724498749, + "grad_norm_var": 0.008424857113765827, + "learning_rate": 5e-05, + "loss": 0.1924, + "loss/crossentropy": 2.8926807045936584, + "loss/hidden": 0.0, + "loss/logits": 0.19242505729198456, + "loss/reg": 0.979702353477478, + "step": 1665 + }, + { + "epoch": 0.01666, + "grad_norm": 0.3947600722312927, + "grad_norm_var": 0.008384339501580958, + "learning_rate": 5e-05, + "loss": 0.189, + "loss/crossentropy": 2.923057436943054, + "loss/hidden": 0.0, + "loss/logits": 0.18904650956392288, + "loss/reg": 0.9788217544555664, + "step": 1666 + }, + { + "epoch": 0.01667, + "grad_norm": 0.44706106185913086, + "grad_norm_var": 0.008385190103097765, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.7078863978385925, + "loss/hidden": 0.0, + "loss/logits": 0.2040410153567791, + "loss/reg": 0.9782266616821289, + "step": 1667 + }, + { + "epoch": 0.01668, + "grad_norm": 0.39598697423934937, + "grad_norm_var": 0.00856227985747096, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.7848398089408875, + "loss/hidden": 0.0, + "loss/logits": 0.20268257707357407, + "loss/reg": 0.9783090949058533, + "step": 1668 + }, + { + "epoch": 0.01669, + "grad_norm": 0.43456798791885376, + "grad_norm_var": 0.008528310952534387, + "learning_rate": 5e-05, + "loss": 0.2359, + "loss/crossentropy": 2.9408648014068604, + "loss/hidden": 0.0, + "loss/logits": 0.23594681918621063, + "loss/reg": 0.9780111908912659, + "step": 1669 + }, + { + "epoch": 0.0167, + "grad_norm": 0.44477957487106323, + "grad_norm_var": 0.0008885970048522614, + "learning_rate": 5e-05, + "loss": 0.2097, + "loss/crossentropy": 2.7345515489578247, + "loss/hidden": 0.0, + "loss/logits": 0.20972703397274017, + "loss/reg": 0.9774311184883118, + "step": 1670 + }, + { + "epoch": 0.01671, + "grad_norm": 0.4334873557090759, + "grad_norm_var": 0.0005010059813244501, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.802227795124054, + "loss/hidden": 0.0, + "loss/logits": 0.1957888901233673, + "loss/reg": 0.976824939250946, + "step": 1671 + }, + { + "epoch": 0.01672, + "grad_norm": 0.44768232107162476, + "grad_norm_var": 0.00037857010970889784, + "learning_rate": 5e-05, + "loss": 0.2204, + "loss/crossentropy": 2.687469720840454, + "loss/hidden": 0.0, + "loss/logits": 0.22035083919763565, + "loss/reg": 0.9762384295463562, + "step": 1672 + }, + { + "epoch": 0.01673, + "grad_norm": 0.4268612861633301, + "grad_norm_var": 0.0003577020661750141, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.8833935260772705, + "loss/hidden": 0.0, + "loss/logits": 0.2018217258155346, + "loss/reg": 0.9758449792861938, + "step": 1673 + }, + { + "epoch": 0.01674, + "grad_norm": 0.4269968867301941, + "grad_norm_var": 0.00034980973717595453, + "learning_rate": 5e-05, + "loss": 0.197, + "loss/crossentropy": 2.7447726726531982, + "loss/hidden": 0.0, + "loss/logits": 0.19696272909641266, + "loss/reg": 0.9751297831535339, + "step": 1674 + }, + { + "epoch": 0.01675, + "grad_norm": 0.4018579125404358, + "grad_norm_var": 0.00037928433144642784, + "learning_rate": 5e-05, + "loss": 0.1974, + "loss/crossentropy": 2.845329523086548, + "loss/hidden": 0.0, + "loss/logits": 0.1974216252565384, + "loss/reg": 0.9747862219810486, + "step": 1675 + }, + { + "epoch": 0.01676, + "grad_norm": 0.4319971203804016, + "grad_norm_var": 0.0003756425543625282, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.912673830986023, + "loss/hidden": 0.0, + "loss/logits": 0.19628258049488068, + "loss/reg": 0.9739908576011658, + "step": 1676 + }, + { + "epoch": 0.01677, + "grad_norm": 0.42803555727005005, + "grad_norm_var": 0.00036525195673638616, + "learning_rate": 5e-05, + "loss": 0.1863, + "loss/crossentropy": 2.8631667494773865, + "loss/hidden": 0.0, + "loss/logits": 0.18627800792455673, + "loss/reg": 0.9731059074401855, + "step": 1677 + }, + { + "epoch": 0.01678, + "grad_norm": 0.42730480432510376, + "grad_norm_var": 0.00035769842100891923, + "learning_rate": 5e-05, + "loss": 0.1918, + "loss/crossentropy": 2.771612226963043, + "loss/hidden": 0.0, + "loss/logits": 0.19182706996798515, + "loss/reg": 0.971794605255127, + "step": 1678 + }, + { + "epoch": 0.01679, + "grad_norm": 0.4047600030899048, + "grad_norm_var": 0.0003840668623575424, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.746255576610565, + "loss/hidden": 0.0, + "loss/logits": 0.1884135976433754, + "loss/reg": 0.9709142446517944, + "step": 1679 + }, + { + "epoch": 0.0168, + "grad_norm": 0.40580958127975464, + "grad_norm_var": 0.000368572634361064, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.7972524762153625, + "loss/hidden": 0.0, + "loss/logits": 0.19399097189307213, + "loss/reg": 0.9700757265090942, + "step": 1680 + }, + { + "epoch": 0.01681, + "grad_norm": 0.4170977473258972, + "grad_norm_var": 0.00031327910748755924, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.680404782295227, + "loss/hidden": 0.0, + "loss/logits": 0.19815754517912865, + "loss/reg": 0.9692074656486511, + "step": 1681 + }, + { + "epoch": 0.01682, + "grad_norm": 0.4438115656375885, + "grad_norm_var": 0.00027853475307475136, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.7973380088806152, + "loss/hidden": 0.0, + "loss/logits": 0.20738579705357552, + "loss/reg": 0.9686961770057678, + "step": 1682 + }, + { + "epoch": 0.01683, + "grad_norm": 0.4141625463962555, + "grad_norm_var": 0.0002543706883667192, + "learning_rate": 5e-05, + "loss": 0.1951, + "loss/crossentropy": 2.7903233766555786, + "loss/hidden": 0.0, + "loss/logits": 0.1950708031654358, + "loss/reg": 0.9684246778488159, + "step": 1683 + }, + { + "epoch": 0.01684, + "grad_norm": 0.3853682577610016, + "grad_norm_var": 0.0003011857786094733, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.902022361755371, + "loss/hidden": 0.0, + "loss/logits": 0.18294353038072586, + "loss/reg": 0.96816486120224, + "step": 1684 + }, + { + "epoch": 0.01685, + "grad_norm": 0.4095843732357025, + "grad_norm_var": 0.0003030324449647607, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.8587907552719116, + "loss/hidden": 0.0, + "loss/logits": 0.20112555101513863, + "loss/reg": 0.967812180519104, + "step": 1685 + }, + { + "epoch": 0.01686, + "grad_norm": 0.398264616727829, + "grad_norm_var": 0.00029604972872923445, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.7175191044807434, + "loss/hidden": 0.0, + "loss/logits": 0.19377201423048973, + "loss/reg": 0.9674014449119568, + "step": 1686 + }, + { + "epoch": 0.01687, + "grad_norm": 0.3565382957458496, + "grad_norm_var": 0.0005168949377238404, + "learning_rate": 5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.8121435046195984, + "loss/hidden": 0.0, + "loss/logits": 0.17816277965903282, + "loss/reg": 0.9671922922134399, + "step": 1687 + }, + { + "epoch": 0.01688, + "grad_norm": 0.38014495372772217, + "grad_norm_var": 0.0004998676381996507, + "learning_rate": 5e-05, + "loss": 0.18, + "loss/crossentropy": 2.8066705465316772, + "loss/hidden": 0.0, + "loss/logits": 0.18004292249679565, + "loss/reg": 0.9661559462547302, + "step": 1688 + }, + { + "epoch": 0.01689, + "grad_norm": 0.439016729593277, + "grad_norm_var": 0.0005365721033507261, + "learning_rate": 5e-05, + "loss": 0.1977, + "loss/crossentropy": 2.816479742527008, + "loss/hidden": 0.0, + "loss/logits": 0.19771433249115944, + "loss/reg": 0.965285062789917, + "step": 1689 + }, + { + "epoch": 0.0169, + "grad_norm": 0.423315167427063, + "grad_norm_var": 0.0005294054421746204, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.815659761428833, + "loss/hidden": 0.0, + "loss/logits": 0.20774946361780167, + "loss/reg": 0.9643948078155518, + "step": 1690 + }, + { + "epoch": 0.01691, + "grad_norm": 0.4508022367954254, + "grad_norm_var": 0.0006231092694258996, + "learning_rate": 5e-05, + "loss": 0.2144, + "loss/crossentropy": 2.7999706864356995, + "loss/hidden": 0.0, + "loss/logits": 0.21436100453138351, + "loss/reg": 0.9632377028465271, + "step": 1691 + }, + { + "epoch": 0.01692, + "grad_norm": 0.3995899260044098, + "grad_norm_var": 0.0006088267676019518, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.879646420478821, + "loss/hidden": 0.0, + "loss/logits": 0.19112277030944824, + "loss/reg": 0.962549090385437, + "step": 1692 + }, + { + "epoch": 0.01693, + "grad_norm": 0.5071508288383484, + "grad_norm_var": 0.0011747166082547742, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.949573576450348, + "loss/hidden": 0.0, + "loss/logits": 0.1898249238729477, + "loss/reg": 0.9616365432739258, + "step": 1693 + }, + { + "epoch": 0.01694, + "grad_norm": 0.3963351845741272, + "grad_norm_var": 0.0011897154306865331, + "learning_rate": 5e-05, + "loss": 0.199, + "loss/crossentropy": 2.7994354367256165, + "loss/hidden": 0.0, + "loss/logits": 0.19900693371891975, + "loss/reg": 0.960827648639679, + "step": 1694 + }, + { + "epoch": 0.01695, + "grad_norm": 0.4103904068470001, + "grad_norm_var": 0.0011843963912345878, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.764429032802582, + "loss/hidden": 0.0, + "loss/logits": 0.19081446528434753, + "loss/reg": 0.9593966007232666, + "step": 1695 + }, + { + "epoch": 0.01696, + "grad_norm": 0.3964170515537262, + "grad_norm_var": 0.0012012147403077358, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.680214524269104, + "loss/hidden": 0.0, + "loss/logits": 0.1987549141049385, + "loss/reg": 0.9580814838409424, + "step": 1696 + }, + { + "epoch": 0.01697, + "grad_norm": 0.4489021897315979, + "grad_norm_var": 0.001276513715549162, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.820174813270569, + "loss/hidden": 0.0, + "loss/logits": 0.21000191941857338, + "loss/reg": 0.9565646648406982, + "step": 1697 + }, + { + "epoch": 0.01698, + "grad_norm": 0.4084279537200928, + "grad_norm_var": 0.0012246727050356085, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.754970967769623, + "loss/hidden": 0.0, + "loss/logits": 0.1853071115911007, + "loss/reg": 0.9545485377311707, + "step": 1698 + }, + { + "epoch": 0.01699, + "grad_norm": 0.4521838128566742, + "grad_norm_var": 0.0013157176445986315, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.8605205416679382, + "loss/hidden": 0.0, + "loss/logits": 0.18919217213988304, + "loss/reg": 0.9533572793006897, + "step": 1699 + }, + { + "epoch": 0.017, + "grad_norm": 0.40928900241851807, + "grad_norm_var": 0.0012525002442726437, + "learning_rate": 5e-05, + "loss": 0.2029, + "loss/crossentropy": 2.7299290895462036, + "loss/hidden": 0.0, + "loss/logits": 0.20293082669377327, + "loss/reg": 0.9519543051719666, + "step": 1700 + }, + { + "epoch": 0.01701, + "grad_norm": 0.4375574588775635, + "grad_norm_var": 0.0012704019431191704, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.7553977370262146, + "loss/hidden": 0.0, + "loss/logits": 0.21000895649194717, + "loss/reg": 0.9501931667327881, + "step": 1701 + }, + { + "epoch": 0.01702, + "grad_norm": 0.40013188123703003, + "grad_norm_var": 0.001265296725807552, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.8755266666412354, + "loss/hidden": 0.0, + "loss/logits": 0.19107430800795555, + "loss/reg": 0.9485136866569519, + "step": 1702 + }, + { + "epoch": 0.01703, + "grad_norm": 0.3753577470779419, + "grad_norm_var": 0.0011287875673520631, + "learning_rate": 5e-05, + "loss": 0.1879, + "loss/crossentropy": 2.7124125957489014, + "loss/hidden": 0.0, + "loss/logits": 0.18789031356573105, + "loss/reg": 0.9467406272888184, + "step": 1703 + }, + { + "epoch": 0.01704, + "grad_norm": 0.37156128883361816, + "grad_norm_var": 0.0011800800264768559, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.7135950922966003, + "loss/hidden": 0.0, + "loss/logits": 0.18276651576161385, + "loss/reg": 0.9453279376029968, + "step": 1704 + }, + { + "epoch": 0.01705, + "grad_norm": 0.3806215524673462, + "grad_norm_var": 0.0012482685718171423, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.664647400379181, + "loss/hidden": 0.0, + "loss/logits": 0.17725708335638046, + "loss/reg": 0.944482684135437, + "step": 1705 + }, + { + "epoch": 0.01706, + "grad_norm": 0.38079720735549927, + "grad_norm_var": 0.0013240482296425864, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.859362006187439, + "loss/hidden": 0.0, + "loss/logits": 0.18818871304392815, + "loss/reg": 0.9428150653839111, + "step": 1706 + }, + { + "epoch": 0.01707, + "grad_norm": 0.3702297508716583, + "grad_norm_var": 0.0013354449290563766, + "learning_rate": 5e-05, + "loss": 0.1834, + "loss/crossentropy": 2.7437954545021057, + "loss/hidden": 0.0, + "loss/logits": 0.1833883598446846, + "loss/reg": 0.9419358372688293, + "step": 1707 + }, + { + "epoch": 0.01708, + "grad_norm": 0.38292014598846436, + "grad_norm_var": 0.0013738587391907587, + "learning_rate": 5e-05, + "loss": 0.1883, + "loss/crossentropy": 2.8175190687179565, + "loss/hidden": 0.0, + "loss/logits": 0.18825723230838776, + "loss/reg": 0.9411101341247559, + "step": 1708 + }, + { + "epoch": 0.01709, + "grad_norm": 0.4336113929748535, + "grad_norm_var": 0.000739829895405449, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.7488914132118225, + "loss/hidden": 0.0, + "loss/logits": 0.2003837488591671, + "loss/reg": 0.9400129914283752, + "step": 1709 + }, + { + "epoch": 0.0171, + "grad_norm": 0.3866313397884369, + "grad_norm_var": 0.0007548829773588358, + "learning_rate": 5e-05, + "loss": 0.1859, + "loss/crossentropy": 2.865144968032837, + "loss/hidden": 0.0, + "loss/logits": 0.18591173365712166, + "loss/reg": 0.9393734335899353, + "step": 1710 + }, + { + "epoch": 0.01711, + "grad_norm": 0.3687353730201721, + "grad_norm_var": 0.0008212520908905674, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.923434257507324, + "loss/hidden": 0.0, + "loss/logits": 0.1896546706557274, + "loss/reg": 0.9383866786956787, + "step": 1711 + }, + { + "epoch": 0.01712, + "grad_norm": 0.38244619965553284, + "grad_norm_var": 0.0008405183279570849, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.958771765232086, + "loss/hidden": 0.0, + "loss/logits": 0.18655554950237274, + "loss/reg": 0.9373697638511658, + "step": 1712 + }, + { + "epoch": 0.01713, + "grad_norm": 0.3985337018966675, + "grad_norm_var": 0.0006662152040347369, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.790857195854187, + "loss/hidden": 0.0, + "loss/logits": 0.1754690706729889, + "loss/reg": 0.9367863535881042, + "step": 1713 + }, + { + "epoch": 0.01714, + "grad_norm": 0.46521270275115967, + "grad_norm_var": 0.0009604061373006178, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.8150055408477783, + "loss/hidden": 0.0, + "loss/logits": 0.20824339613318443, + "loss/reg": 0.9358097314834595, + "step": 1714 + }, + { + "epoch": 0.01715, + "grad_norm": 0.36640942096710205, + "grad_norm_var": 0.0008204419803184575, + "learning_rate": 5e-05, + "loss": 0.1915, + "loss/crossentropy": 2.9064807295799255, + "loss/hidden": 0.0, + "loss/logits": 0.19147857278585434, + "loss/reg": 0.9348723888397217, + "step": 1715 + }, + { + "epoch": 0.01716, + "grad_norm": 0.4536788761615753, + "grad_norm_var": 0.001031849466324708, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.7744264602661133, + "loss/hidden": 0.0, + "loss/logits": 0.19603459164500237, + "loss/reg": 0.9344622492790222, + "step": 1716 + }, + { + "epoch": 0.01717, + "grad_norm": 0.37372827529907227, + "grad_norm_var": 0.0009426139138331162, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.7083056569099426, + "loss/hidden": 0.0, + "loss/logits": 0.18066375702619553, + "loss/reg": 0.9334821701049805, + "step": 1717 + }, + { + "epoch": 0.01718, + "grad_norm": 0.3796086311340332, + "grad_norm_var": 0.0009498690764029612, + "learning_rate": 5e-05, + "loss": 0.1873, + "loss/crossentropy": 2.9045706391334534, + "loss/hidden": 0.0, + "loss/logits": 0.18725696206092834, + "loss/reg": 0.9323367476463318, + "step": 1718 + }, + { + "epoch": 0.01719, + "grad_norm": 0.3763802945613861, + "grad_norm_var": 0.0009476817574635772, + "learning_rate": 5e-05, + "loss": 0.1874, + "loss/crossentropy": 2.670018196105957, + "loss/hidden": 0.0, + "loss/logits": 0.1874183751642704, + "loss/reg": 0.9317356944084167, + "step": 1719 + }, + { + "epoch": 0.0172, + "grad_norm": 0.39076095819473267, + "grad_norm_var": 0.0009185417773364161, + "learning_rate": 5e-05, + "loss": 0.1851, + "loss/crossentropy": 2.751845359802246, + "loss/hidden": 0.0, + "loss/logits": 0.1850602775812149, + "loss/reg": 0.9313357472419739, + "step": 1720 + }, + { + "epoch": 0.01721, + "grad_norm": 0.42364755272865295, + "grad_norm_var": 0.0009624046398820714, + "learning_rate": 5e-05, + "loss": 0.2073, + "loss/crossentropy": 2.7992833256721497, + "loss/hidden": 0.0, + "loss/logits": 0.2073342464864254, + "loss/reg": 0.9311196208000183, + "step": 1721 + }, + { + "epoch": 0.01722, + "grad_norm": 0.44127926230430603, + "grad_norm_var": 0.0010697798969358, + "learning_rate": 5e-05, + "loss": 0.219, + "loss/crossentropy": 2.814840614795685, + "loss/hidden": 0.0, + "loss/logits": 0.21901006624102592, + "loss/reg": 0.9308744668960571, + "step": 1722 + }, + { + "epoch": 0.01723, + "grad_norm": 0.4107615351676941, + "grad_norm_var": 0.0010136604388472008, + "learning_rate": 5e-05, + "loss": 0.1874, + "loss/crossentropy": 2.699351191520691, + "loss/hidden": 0.0, + "loss/logits": 0.18741191178560257, + "loss/reg": 0.9312902092933655, + "step": 1723 + }, + { + "epoch": 0.01724, + "grad_norm": 0.5666653513908386, + "grad_norm_var": 0.0026527682925742368, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.854068696498871, + "loss/hidden": 0.0, + "loss/logits": 0.19326234608888626, + "loss/reg": 0.9310418367385864, + "step": 1724 + }, + { + "epoch": 0.01725, + "grad_norm": 0.3888385593891144, + "grad_norm_var": 0.0026587771173752217, + "learning_rate": 5e-05, + "loss": 0.1921, + "loss/crossentropy": 2.8429774045944214, + "loss/hidden": 0.0, + "loss/logits": 0.1921473778784275, + "loss/reg": 0.9303467869758606, + "step": 1725 + }, + { + "epoch": 0.01726, + "grad_norm": 0.4140770435333252, + "grad_norm_var": 0.002617294349685162, + "learning_rate": 5e-05, + "loss": 0.1912, + "loss/crossentropy": 2.6299397349357605, + "loss/hidden": 0.0, + "loss/logits": 0.19124232232570648, + "loss/reg": 0.9298058748245239, + "step": 1726 + }, + { + "epoch": 0.01727, + "grad_norm": 0.3864987790584564, + "grad_norm_var": 0.002533247945002985, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.730083465576172, + "loss/hidden": 0.0, + "loss/logits": 0.18100566416978836, + "loss/reg": 0.9297086000442505, + "step": 1727 + }, + { + "epoch": 0.01728, + "grad_norm": 0.41775962710380554, + "grad_norm_var": 0.0024642286621332312, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.6796945929527283, + "loss/hidden": 0.0, + "loss/logits": 0.19562242552638054, + "loss/reg": 0.9300265908241272, + "step": 1728 + }, + { + "epoch": 0.01729, + "grad_norm": 0.4079970121383667, + "grad_norm_var": 0.0024479575636529288, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.7550109028816223, + "loss/hidden": 0.0, + "loss/logits": 0.19564897194504738, + "loss/reg": 0.9302366375923157, + "step": 1729 + }, + { + "epoch": 0.0173, + "grad_norm": 0.4165860116481781, + "grad_norm_var": 0.0022796285006947414, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.813656747341156, + "loss/hidden": 0.0, + "loss/logits": 0.1943197064101696, + "loss/reg": 0.9302929639816284, + "step": 1730 + }, + { + "epoch": 0.01731, + "grad_norm": 0.43071046471595764, + "grad_norm_var": 0.0021350215473328604, + "learning_rate": 5e-05, + "loss": 0.1972, + "loss/crossentropy": 2.8851414918899536, + "loss/hidden": 0.0, + "loss/logits": 0.1972125954926014, + "loss/reg": 0.9297398924827576, + "step": 1731 + }, + { + "epoch": 0.01732, + "grad_norm": 0.3740787208080292, + "grad_norm_var": 0.0021463760989535135, + "learning_rate": 5e-05, + "loss": 0.176, + "loss/crossentropy": 2.8831735253334045, + "loss/hidden": 0.0, + "loss/logits": 0.17595698684453964, + "loss/reg": 0.9294543862342834, + "step": 1732 + }, + { + "epoch": 0.01733, + "grad_norm": 0.41656848788261414, + "grad_norm_var": 0.002039838173721368, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.7793456315994263, + "loss/hidden": 0.0, + "loss/logits": 0.19613324478268623, + "loss/reg": 0.9291931390762329, + "step": 1733 + }, + { + "epoch": 0.01734, + "grad_norm": 0.4268653094768524, + "grad_norm_var": 0.001955542062938446, + "learning_rate": 5e-05, + "loss": 0.199, + "loss/crossentropy": 2.697963237762451, + "loss/hidden": 0.0, + "loss/logits": 0.19901638850569725, + "loss/reg": 0.9292311668395996, + "step": 1734 + }, + { + "epoch": 0.01735, + "grad_norm": 0.4020274877548218, + "grad_norm_var": 0.0018540141631924782, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.826058506965637, + "loss/hidden": 0.0, + "loss/logits": 0.1985473819077015, + "loss/reg": 0.9292099475860596, + "step": 1735 + }, + { + "epoch": 0.01736, + "grad_norm": 0.4104580283164978, + "grad_norm_var": 0.0018022734387696784, + "learning_rate": 5e-05, + "loss": 0.2002, + "loss/crossentropy": 2.895944595336914, + "loss/hidden": 0.0, + "loss/logits": 0.20016875490546227, + "loss/reg": 0.9295958280563354, + "step": 1736 + }, + { + "epoch": 0.01737, + "grad_norm": 0.3876482844352722, + "grad_norm_var": 0.0018702079285484432, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.8814991116523743, + "loss/hidden": 0.0, + "loss/logits": 0.1926160380244255, + "loss/reg": 0.9298628568649292, + "step": 1737 + }, + { + "epoch": 0.01738, + "grad_norm": 0.3946945369243622, + "grad_norm_var": 0.001865447438802307, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.8898362517356873, + "loss/hidden": 0.0, + "loss/logits": 0.18102619051933289, + "loss/reg": 0.9295737147331238, + "step": 1738 + }, + { + "epoch": 0.01739, + "grad_norm": 0.4132324457168579, + "grad_norm_var": 0.0018641807090493641, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.8055079579353333, + "loss/hidden": 0.0, + "loss/logits": 0.2006586194038391, + "loss/reg": 0.929633617401123, + "step": 1739 + }, + { + "epoch": 0.0174, + "grad_norm": 0.3722969591617584, + "grad_norm_var": 0.00031866605833391994, + "learning_rate": 5e-05, + "loss": 0.1788, + "loss/crossentropy": 2.881201148033142, + "loss/hidden": 0.0, + "loss/logits": 0.17884038016200066, + "loss/reg": 0.9294272661209106, + "step": 1740 + }, + { + "epoch": 0.01741, + "grad_norm": 0.3921547830104828, + "grad_norm_var": 0.0003127507684731838, + "learning_rate": 5e-05, + "loss": 0.1989, + "loss/crossentropy": 2.870577871799469, + "loss/hidden": 0.0, + "loss/logits": 0.19886576384305954, + "loss/reg": 0.9293291568756104, + "step": 1741 + }, + { + "epoch": 0.01742, + "grad_norm": 0.4074016213417053, + "grad_norm_var": 0.0003065474628230665, + "learning_rate": 5e-05, + "loss": 0.1887, + "loss/crossentropy": 2.692419111728668, + "loss/hidden": 0.0, + "loss/logits": 0.188666682690382, + "loss/reg": 0.9284968972206116, + "step": 1742 + }, + { + "epoch": 0.01743, + "grad_norm": 0.3687678873538971, + "grad_norm_var": 0.0003665339924477558, + "learning_rate": 5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.7657968401908875, + "loss/hidden": 0.0, + "loss/logits": 0.1842886470258236, + "loss/reg": 0.9279156923294067, + "step": 1743 + }, + { + "epoch": 0.01744, + "grad_norm": 0.46656665205955505, + "grad_norm_var": 0.0006150264403531095, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.838911771774292, + "loss/hidden": 0.0, + "loss/logits": 0.20953642204403877, + "loss/reg": 0.927099883556366, + "step": 1744 + }, + { + "epoch": 0.01745, + "grad_norm": 0.39029231667518616, + "grad_norm_var": 0.0006287310128329874, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.7523877024650574, + "loss/hidden": 0.0, + "loss/logits": 0.19562172889709473, + "loss/reg": 0.9264803528785706, + "step": 1745 + }, + { + "epoch": 0.01746, + "grad_norm": 0.3831719756126404, + "grad_norm_var": 0.0006442070246771051, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.740496337413788, + "loss/hidden": 0.0, + "loss/logits": 0.19047483429312706, + "loss/reg": 0.9256248474121094, + "step": 1746 + }, + { + "epoch": 0.01747, + "grad_norm": 0.5675250887870789, + "grad_norm_var": 0.0023322043705761336, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.843151092529297, + "loss/hidden": 0.0, + "loss/logits": 0.2027064487338066, + "loss/reg": 0.9250745177268982, + "step": 1747 + }, + { + "epoch": 0.01748, + "grad_norm": 0.4640829563140869, + "grad_norm_var": 0.0023971129605368327, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.857450842857361, + "loss/hidden": 0.0, + "loss/logits": 0.20526857301592827, + "loss/reg": 0.9246965646743774, + "step": 1748 + }, + { + "epoch": 0.01749, + "grad_norm": 0.4389997124671936, + "grad_norm_var": 0.002428811116496136, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 3.014944314956665, + "loss/hidden": 0.0, + "loss/logits": 0.20782027766108513, + "loss/reg": 0.9241983890533447, + "step": 1749 + }, + { + "epoch": 0.0175, + "grad_norm": 0.4219127297401428, + "grad_norm_var": 0.002424415101141249, + "learning_rate": 5e-05, + "loss": 0.1952, + "loss/crossentropy": 2.778669834136963, + "loss/hidden": 0.0, + "loss/logits": 0.19519609212875366, + "loss/reg": 0.9234949946403503, + "step": 1750 + }, + { + "epoch": 0.01751, + "grad_norm": 0.44038647413253784, + "grad_norm_var": 0.0024368494019202675, + "learning_rate": 5e-05, + "loss": 0.2084, + "loss/crossentropy": 2.752516746520996, + "loss/hidden": 0.0, + "loss/logits": 0.20839788019657135, + "loss/reg": 0.922467052936554, + "step": 1751 + }, + { + "epoch": 0.01752, + "grad_norm": 0.41221359372138977, + "grad_norm_var": 0.0024348144491298704, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.842723250389099, + "loss/hidden": 0.0, + "loss/logits": 0.19562644511461258, + "loss/reg": 0.9218528866767883, + "step": 1752 + }, + { + "epoch": 0.01753, + "grad_norm": 0.4004841148853302, + "grad_norm_var": 0.002389599515625429, + "learning_rate": 5e-05, + "loss": 0.1954, + "loss/crossentropy": 2.939370810985565, + "loss/hidden": 0.0, + "loss/logits": 0.19543294981122017, + "loss/reg": 0.9208647012710571, + "step": 1753 + }, + { + "epoch": 0.01754, + "grad_norm": 0.4379563331604004, + "grad_norm_var": 0.002355491992859593, + "learning_rate": 5e-05, + "loss": 0.2182, + "loss/crossentropy": 2.744364857673645, + "loss/hidden": 0.0, + "loss/logits": 0.21822267770767212, + "loss/reg": 0.9200965166091919, + "step": 1754 + }, + { + "epoch": 0.01755, + "grad_norm": 0.3759958744049072, + "grad_norm_var": 0.002493577858945939, + "learning_rate": 5e-05, + "loss": 0.1754, + "loss/crossentropy": 2.886771500110626, + "loss/hidden": 0.0, + "loss/logits": 0.17540156841278076, + "loss/reg": 0.9194830656051636, + "step": 1755 + }, + { + "epoch": 0.01756, + "grad_norm": 0.4182957112789154, + "grad_norm_var": 0.00232550336918698, + "learning_rate": 5e-05, + "loss": 0.197, + "loss/crossentropy": 2.7246428728103638, + "loss/hidden": 0.0, + "loss/logits": 0.1970425397157669, + "loss/reg": 0.919228732585907, + "step": 1756 + }, + { + "epoch": 0.01757, + "grad_norm": 0.4069487452507019, + "grad_norm_var": 0.0022760944225958942, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.910832703113556, + "loss/hidden": 0.0, + "loss/logits": 0.19881373643875122, + "loss/reg": 0.9180617928504944, + "step": 1757 + }, + { + "epoch": 0.01758, + "grad_norm": 0.4275709092617035, + "grad_norm_var": 0.0022540248202831217, + "learning_rate": 5e-05, + "loss": 0.2148, + "loss/crossentropy": 2.7431459426879883, + "loss/hidden": 0.0, + "loss/logits": 0.2147684395313263, + "loss/reg": 0.9176238179206848, + "step": 1758 + }, + { + "epoch": 0.01759, + "grad_norm": 0.39265963435173035, + "grad_norm_var": 0.0021063545561460987, + "learning_rate": 5e-05, + "loss": 0.1873, + "loss/crossentropy": 2.80772066116333, + "loss/hidden": 0.0, + "loss/logits": 0.18733786791563034, + "loss/reg": 0.9171220064163208, + "step": 1759 + }, + { + "epoch": 0.0176, + "grad_norm": 0.3848111927509308, + "grad_norm_var": 0.0021016960850855507, + "learning_rate": 5e-05, + "loss": 0.1889, + "loss/crossentropy": 2.85300612449646, + "loss/hidden": 0.0, + "loss/logits": 0.18891701474785805, + "loss/reg": 0.9165941476821899, + "step": 1760 + }, + { + "epoch": 0.01761, + "grad_norm": 0.4237481355667114, + "grad_norm_var": 0.0020270584799056005, + "learning_rate": 5e-05, + "loss": 0.1808, + "loss/crossentropy": 2.8984915018081665, + "loss/hidden": 0.0, + "loss/logits": 0.18082596361637115, + "loss/reg": 0.9160624146461487, + "step": 1761 + }, + { + "epoch": 0.01762, + "grad_norm": 0.5142870545387268, + "grad_norm_var": 0.0023738048932404015, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.873230457305908, + "loss/hidden": 0.0, + "loss/logits": 0.19349532574415207, + "loss/reg": 0.916169285774231, + "step": 1762 + }, + { + "epoch": 0.01763, + "grad_norm": 0.4356222152709961, + "grad_norm_var": 0.0010951696449246946, + "learning_rate": 5e-05, + "loss": 0.1864, + "loss/crossentropy": 2.8136460185050964, + "loss/hidden": 0.0, + "loss/logits": 0.18640804663300514, + "loss/reg": 0.9159582257270813, + "step": 1763 + }, + { + "epoch": 0.01764, + "grad_norm": 0.393518328666687, + "grad_norm_var": 0.001036296866566729, + "learning_rate": 5e-05, + "loss": 0.1895, + "loss/crossentropy": 2.749118745326996, + "loss/hidden": 0.0, + "loss/logits": 0.1894579976797104, + "loss/reg": 0.9152243733406067, + "step": 1764 + }, + { + "epoch": 0.01765, + "grad_norm": 0.3967163562774658, + "grad_norm_var": 0.0010428298323417332, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.7416646480560303, + "loss/hidden": 0.0, + "loss/logits": 0.1922891065478325, + "loss/reg": 0.9147580862045288, + "step": 1765 + }, + { + "epoch": 0.01766, + "grad_norm": 0.40663349628448486, + "grad_norm_var": 0.0010488292205994862, + "learning_rate": 5e-05, + "loss": 0.1912, + "loss/crossentropy": 2.7590591311454773, + "loss/hidden": 0.0, + "loss/logits": 0.19123412668704987, + "loss/reg": 0.914356529712677, + "step": 1766 + }, + { + "epoch": 0.01767, + "grad_norm": 0.4140712022781372, + "grad_norm_var": 0.0010091434052932632, + "learning_rate": 5e-05, + "loss": 0.1919, + "loss/crossentropy": 2.687725067138672, + "loss/hidden": 0.0, + "loss/logits": 0.191885843873024, + "loss/reg": 0.9137458801269531, + "step": 1767 + }, + { + "epoch": 0.01768, + "grad_norm": 0.3837500512599945, + "grad_norm_var": 0.0010707176300563356, + "learning_rate": 5e-05, + "loss": 0.1946, + "loss/crossentropy": 2.7783952951431274, + "loss/hidden": 0.0, + "loss/logits": 0.1945618949830532, + "loss/reg": 0.9133679270744324, + "step": 1768 + }, + { + "epoch": 0.01769, + "grad_norm": 0.39224839210510254, + "grad_norm_var": 0.0010890483887377974, + "learning_rate": 5e-05, + "loss": 0.2031, + "loss/crossentropy": 2.741241693496704, + "loss/hidden": 0.0, + "loss/logits": 0.20311570912599564, + "loss/reg": 0.912524938583374, + "step": 1769 + }, + { + "epoch": 0.0177, + "grad_norm": 0.3952781558036804, + "grad_norm_var": 0.0010597493335637827, + "learning_rate": 5e-05, + "loss": 0.1776, + "loss/crossentropy": 2.7046775817871094, + "loss/hidden": 0.0, + "loss/logits": 0.17761826515197754, + "loss/reg": 0.9117524027824402, + "step": 1770 + }, + { + "epoch": 0.01771, + "grad_norm": 0.44809678196907043, + "grad_norm_var": 0.0010564659434679176, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.6977961659431458, + "loss/hidden": 0.0, + "loss/logits": 0.18849042057991028, + "loss/reg": 0.9117720723152161, + "step": 1771 + }, + { + "epoch": 0.01772, + "grad_norm": 0.4902764856815338, + "grad_norm_var": 0.0014153685782453203, + "learning_rate": 5e-05, + "loss": 0.1989, + "loss/crossentropy": 2.854245662689209, + "loss/hidden": 0.0, + "loss/logits": 0.19886160641908646, + "loss/reg": 0.9108124375343323, + "step": 1772 + }, + { + "epoch": 0.01773, + "grad_norm": 0.42389383912086487, + "grad_norm_var": 0.001405770734557695, + "learning_rate": 5e-05, + "loss": 0.1969, + "loss/crossentropy": 2.941379427909851, + "loss/hidden": 0.0, + "loss/logits": 0.19685113802552223, + "loss/reg": 0.9101334810256958, + "step": 1773 + }, + { + "epoch": 0.01774, + "grad_norm": 0.45862945914268494, + "grad_norm_var": 0.0014965888956112356, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.79221248626709, + "loss/hidden": 0.0, + "loss/logits": 0.1856122799217701, + "loss/reg": 0.9094204306602478, + "step": 1774 + }, + { + "epoch": 0.01775, + "grad_norm": 0.4052278697490692, + "grad_norm_var": 0.001457059190942837, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.8591490983963013, + "loss/hidden": 0.0, + "loss/logits": 0.18978318944573402, + "loss/reg": 0.9089229106903076, + "step": 1775 + }, + { + "epoch": 0.01776, + "grad_norm": 0.3891145884990692, + "grad_norm_var": 0.0014363471457248366, + "learning_rate": 5e-05, + "loss": 0.1751, + "loss/crossentropy": 2.749578833580017, + "loss/hidden": 0.0, + "loss/logits": 0.17506226524710655, + "loss/reg": 0.9084846377372742, + "step": 1776 + }, + { + "epoch": 0.01777, + "grad_norm": 0.39679303765296936, + "grad_norm_var": 0.0014797685463354545, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 3.001101851463318, + "loss/hidden": 0.0, + "loss/logits": 0.19014138355851173, + "loss/reg": 0.9078476428985596, + "step": 1777 + }, + { + "epoch": 0.01778, + "grad_norm": 0.3706504702568054, + "grad_norm_var": 0.0009924082079833714, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.7723464965820312, + "loss/hidden": 0.0, + "loss/logits": 0.18464886024594307, + "loss/reg": 0.9068264365196228, + "step": 1778 + }, + { + "epoch": 0.01779, + "grad_norm": 0.3968674838542938, + "grad_norm_var": 0.0009669675906873842, + "learning_rate": 5e-05, + "loss": 0.1895, + "loss/crossentropy": 2.761741518974304, + "loss/hidden": 0.0, + "loss/logits": 0.18945276364684105, + "loss/reg": 0.906024694442749, + "step": 1779 + }, + { + "epoch": 0.0178, + "grad_norm": 0.39632725715637207, + "grad_norm_var": 0.0009612466044541615, + "learning_rate": 5e-05, + "loss": 0.1862, + "loss/crossentropy": 2.8378600478172302, + "loss/hidden": 0.0, + "loss/logits": 0.1862327829003334, + "loss/reg": 0.9050126075744629, + "step": 1780 + }, + { + "epoch": 0.01781, + "grad_norm": 0.3713095486164093, + "grad_norm_var": 0.0010475586715930519, + "learning_rate": 5e-05, + "loss": 0.1713, + "loss/crossentropy": 2.830242097377777, + "loss/hidden": 0.0, + "loss/logits": 0.1712886579334736, + "loss/reg": 0.9039465188980103, + "step": 1781 + }, + { + "epoch": 0.01782, + "grad_norm": 0.4007900059223175, + "grad_norm_var": 0.0010513013471431778, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.8021321892738342, + "loss/hidden": 0.0, + "loss/logits": 0.18226920068264008, + "loss/reg": 0.9026550054550171, + "step": 1782 + }, + { + "epoch": 0.01783, + "grad_norm": 0.5193232297897339, + "grad_norm_var": 0.0018242062912838624, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.5900211334228516, + "loss/hidden": 0.0, + "loss/logits": 0.18178314715623856, + "loss/reg": 0.9019155502319336, + "step": 1783 + }, + { + "epoch": 0.01784, + "grad_norm": 0.403618723154068, + "grad_norm_var": 0.0017663287180598138, + "learning_rate": 5e-05, + "loss": 0.189, + "loss/crossentropy": 2.8122188448905945, + "loss/hidden": 0.0, + "loss/logits": 0.18899159505963326, + "loss/reg": 0.9012361168861389, + "step": 1784 + }, + { + "epoch": 0.01785, + "grad_norm": 0.4231337904930115, + "grad_norm_var": 0.0017275082001659947, + "learning_rate": 5e-05, + "loss": 0.2196, + "loss/crossentropy": 2.824800193309784, + "loss/hidden": 0.0, + "loss/logits": 0.21959979087114334, + "loss/reg": 0.8999006748199463, + "step": 1785 + }, + { + "epoch": 0.01786, + "grad_norm": 0.44012317061424255, + "grad_norm_var": 0.0017168415806013914, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.738426446914673, + "loss/hidden": 0.0, + "loss/logits": 0.18970657885074615, + "loss/reg": 0.8987688422203064, + "step": 1786 + }, + { + "epoch": 0.01787, + "grad_norm": 0.4444918632507324, + "grad_norm_var": 0.0017045747668082202, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.7222702503204346, + "loss/hidden": 0.0, + "loss/logits": 0.19678190723061562, + "loss/reg": 0.8982342481613159, + "step": 1787 + }, + { + "epoch": 0.01788, + "grad_norm": 0.41831323504447937, + "grad_norm_var": 0.0013602734497308237, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.7708369493484497, + "loss/hidden": 0.0, + "loss/logits": 0.19575121998786926, + "loss/reg": 0.896869957447052, + "step": 1788 + }, + { + "epoch": 0.01789, + "grad_norm": 0.39989328384399414, + "grad_norm_var": 0.0013715357724878973, + "learning_rate": 5e-05, + "loss": 0.1859, + "loss/crossentropy": 2.7035407423973083, + "loss/hidden": 0.0, + "loss/logits": 0.18592968955636024, + "loss/reg": 0.8959349989891052, + "step": 1789 + }, + { + "epoch": 0.0179, + "grad_norm": 0.41748160123825073, + "grad_norm_var": 0.0012361403251163084, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.8941848278045654, + "loss/hidden": 0.0, + "loss/logits": 0.17486931383609772, + "loss/reg": 0.8951555490493774, + "step": 1790 + }, + { + "epoch": 0.01791, + "grad_norm": 0.44419723749160767, + "grad_norm_var": 0.0012953922793792927, + "learning_rate": 5e-05, + "loss": 0.2246, + "loss/crossentropy": 2.8155258893966675, + "loss/hidden": 0.0, + "loss/logits": 0.2246222421526909, + "loss/reg": 0.8940463662147522, + "step": 1791 + }, + { + "epoch": 0.01792, + "grad_norm": 0.406249076128006, + "grad_norm_var": 0.0012556850385435415, + "learning_rate": 5e-05, + "loss": 0.1927, + "loss/crossentropy": 2.8289509415626526, + "loss/hidden": 0.0, + "loss/logits": 0.19272882491350174, + "loss/reg": 0.8930648565292358, + "step": 1792 + }, + { + "epoch": 0.01793, + "grad_norm": 0.3946913182735443, + "grad_norm_var": 0.0012612307282537335, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.834330677986145, + "loss/hidden": 0.0, + "loss/logits": 0.1943346932530403, + "loss/reg": 0.8924468159675598, + "step": 1793 + }, + { + "epoch": 0.01794, + "grad_norm": 0.4092795252799988, + "grad_norm_var": 0.0011236675583754518, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.8072381019592285, + "loss/hidden": 0.0, + "loss/logits": 0.19915487617254257, + "loss/reg": 0.8920629620552063, + "step": 1794 + }, + { + "epoch": 0.01795, + "grad_norm": 0.4286874234676361, + "grad_norm_var": 0.0010977976660271338, + "learning_rate": 5e-05, + "loss": 0.1936, + "loss/crossentropy": 2.908243238925934, + "loss/hidden": 0.0, + "loss/logits": 0.19364729523658752, + "loss/reg": 0.8911548852920532, + "step": 1795 + }, + { + "epoch": 0.01796, + "grad_norm": 0.3970155119895935, + "grad_norm_var": 0.0010956668734328988, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.6946675181388855, + "loss/hidden": 0.0, + "loss/logits": 0.19396164640784264, + "loss/reg": 0.8905576467514038, + "step": 1796 + }, + { + "epoch": 0.01797, + "grad_norm": 0.4096953570842743, + "grad_norm_var": 0.0009390040878516359, + "learning_rate": 5e-05, + "loss": 0.1915, + "loss/crossentropy": 2.8345912098884583, + "loss/hidden": 0.0, + "loss/logits": 0.1914876624941826, + "loss/reg": 0.889485239982605, + "step": 1797 + }, + { + "epoch": 0.01798, + "grad_norm": 0.3989505469799042, + "grad_norm_var": 0.0009444939561368048, + "learning_rate": 5e-05, + "loss": 0.1952, + "loss/crossentropy": 2.85462749004364, + "loss/hidden": 0.0, + "loss/logits": 0.19523821398615837, + "loss/reg": 0.8887150883674622, + "step": 1798 + }, + { + "epoch": 0.01799, + "grad_norm": 0.4110300540924072, + "grad_norm_var": 0.0002750364050750983, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.7547779083251953, + "loss/hidden": 0.0, + "loss/logits": 0.2033349722623825, + "loss/reg": 0.8880630135536194, + "step": 1799 + }, + { + "epoch": 0.018, + "grad_norm": 0.40208929777145386, + "grad_norm_var": 0.00027759083654341394, + "learning_rate": 5e-05, + "loss": 0.1839, + "loss/crossentropy": 2.8043800592422485, + "loss/hidden": 0.0, + "loss/logits": 0.18389111012220383, + "loss/reg": 0.8876356482505798, + "step": 1800 + }, + { + "epoch": 0.01801, + "grad_norm": 0.4190218448638916, + "grad_norm_var": 0.0002743705401916411, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.8892692923545837, + "loss/hidden": 0.0, + "loss/logits": 0.1957857720553875, + "loss/reg": 0.8869847655296326, + "step": 1801 + }, + { + "epoch": 0.01802, + "grad_norm": 0.43014124035835266, + "grad_norm_var": 0.00024726162186390713, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.7159982323646545, + "loss/hidden": 0.0, + "loss/logits": 0.20992730557918549, + "loss/reg": 0.8867628574371338, + "step": 1802 + }, + { + "epoch": 0.01803, + "grad_norm": 0.4165356457233429, + "grad_norm_var": 0.00018413420812303608, + "learning_rate": 5e-05, + "loss": 0.1914, + "loss/crossentropy": 2.7939720153808594, + "loss/hidden": 0.0, + "loss/logits": 0.19136624038219452, + "loss/reg": 0.8857271075248718, + "step": 1803 + }, + { + "epoch": 0.01804, + "grad_norm": 0.412634015083313, + "grad_norm_var": 0.00018190296511185015, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.863085687160492, + "loss/hidden": 0.0, + "loss/logits": 0.1892290711402893, + "loss/reg": 0.8853285312652588, + "step": 1804 + }, + { + "epoch": 0.01805, + "grad_norm": 0.4068357050418854, + "grad_norm_var": 0.0001733850609784889, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.7323646545410156, + "loss/hidden": 0.0, + "loss/logits": 0.1867602802813053, + "loss/reg": 0.8856411576271057, + "step": 1805 + }, + { + "epoch": 0.01806, + "grad_norm": 0.4004560112953186, + "grad_norm_var": 0.00018083683617112396, + "learning_rate": 5e-05, + "loss": 0.1831, + "loss/crossentropy": 2.775236189365387, + "loss/hidden": 0.0, + "loss/logits": 0.18314369022846222, + "loss/reg": 0.8853859901428223, + "step": 1806 + }, + { + "epoch": 0.01807, + "grad_norm": 0.43723416328430176, + "grad_norm_var": 0.00015371433146776543, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.7276336550712585, + "loss/hidden": 0.0, + "loss/logits": 0.20644037798047066, + "loss/reg": 0.8855364918708801, + "step": 1807 + }, + { + "epoch": 0.01808, + "grad_norm": 0.3960845470428467, + "grad_norm_var": 0.0001669956005184307, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.749648630619049, + "loss/hidden": 0.0, + "loss/logits": 0.19049681723117828, + "loss/reg": 0.8853706121444702, + "step": 1808 + }, + { + "epoch": 0.01809, + "grad_norm": 0.47138044238090515, + "grad_norm_var": 0.0003714023544985031, + "learning_rate": 5e-05, + "loss": 0.2125, + "loss/crossentropy": 2.845011830329895, + "loss/hidden": 0.0, + "loss/logits": 0.2125193141400814, + "loss/reg": 0.8859027624130249, + "step": 1809 + }, + { + "epoch": 0.0181, + "grad_norm": 0.40787723660469055, + "grad_norm_var": 0.00037267745666983804, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.754331648349762, + "loss/hidden": 0.0, + "loss/logits": 0.20103870332241058, + "loss/reg": 0.8857205510139465, + "step": 1810 + }, + { + "epoch": 0.01811, + "grad_norm": 0.46917012333869934, + "grad_norm_var": 0.0005470735478984542, + "learning_rate": 5e-05, + "loss": 0.225, + "loss/crossentropy": 2.6890124678611755, + "loss/hidden": 0.0, + "loss/logits": 0.22502825409173965, + "loss/reg": 0.8853986859321594, + "step": 1811 + }, + { + "epoch": 0.01812, + "grad_norm": 0.4247317612171173, + "grad_norm_var": 0.0005179641686820132, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.6866010427474976, + "loss/hidden": 0.0, + "loss/logits": 0.20160917937755585, + "loss/reg": 0.8847169280052185, + "step": 1812 + }, + { + "epoch": 0.01813, + "grad_norm": 0.38973426818847656, + "grad_norm_var": 0.0005692725583755849, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.746491849422455, + "loss/hidden": 0.0, + "loss/logits": 0.18436771258711815, + "loss/reg": 0.8842182159423828, + "step": 1813 + }, + { + "epoch": 0.01814, + "grad_norm": 0.4148683547973633, + "grad_norm_var": 0.0005438949840141445, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.878942310810089, + "loss/hidden": 0.0, + "loss/logits": 0.20079002529382706, + "loss/reg": 0.8834556341171265, + "step": 1814 + }, + { + "epoch": 0.01815, + "grad_norm": 0.4110451936721802, + "grad_norm_var": 0.0005438781752580094, + "learning_rate": 5e-05, + "loss": 0.1863, + "loss/crossentropy": 3.0287649035453796, + "loss/hidden": 0.0, + "loss/logits": 0.18630259484052658, + "loss/reg": 0.8829677104949951, + "step": 1815 + }, + { + "epoch": 0.01816, + "grad_norm": 0.3800681531429291, + "grad_norm_var": 0.0006249104218365738, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.6700210571289062, + "loss/hidden": 0.0, + "loss/logits": 0.20325781777501106, + "loss/reg": 0.8822290897369385, + "step": 1816 + }, + { + "epoch": 0.01817, + "grad_norm": 0.4045352339744568, + "grad_norm_var": 0.0006360311616276112, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.6943440437316895, + "loss/hidden": 0.0, + "loss/logits": 0.18424637615680695, + "loss/reg": 0.8814281225204468, + "step": 1817 + }, + { + "epoch": 0.01818, + "grad_norm": 0.38361549377441406, + "grad_norm_var": 0.0006903171502589604, + "learning_rate": 5e-05, + "loss": 0.1903, + "loss/crossentropy": 2.660310387611389, + "loss/hidden": 0.0, + "loss/logits": 0.190257228910923, + "loss/reg": 0.8801477551460266, + "step": 1818 + }, + { + "epoch": 0.01819, + "grad_norm": 0.4341121315956116, + "grad_norm_var": 0.0007151567713507371, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.691216230392456, + "loss/hidden": 0.0, + "loss/logits": 0.19436386600136757, + "loss/reg": 0.879374623298645, + "step": 1819 + }, + { + "epoch": 0.0182, + "grad_norm": 0.41172513365745544, + "grad_norm_var": 0.0007155283160471648, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.8783839344978333, + "loss/hidden": 0.0, + "loss/logits": 0.19605087116360664, + "loss/reg": 0.878777027130127, + "step": 1820 + }, + { + "epoch": 0.01821, + "grad_norm": 0.41848644614219666, + "grad_norm_var": 0.0007109920889231902, + "learning_rate": 5e-05, + "loss": 0.1942, + "loss/crossentropy": 2.805996596813202, + "loss/hidden": 0.0, + "loss/logits": 0.19415006786584854, + "loss/reg": 0.8780494928359985, + "step": 1821 + }, + { + "epoch": 0.01822, + "grad_norm": 0.4151057004928589, + "grad_norm_var": 0.0006941503368942059, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.691302001476288, + "loss/hidden": 0.0, + "loss/logits": 0.20735519751906395, + "loss/reg": 0.8774803876876831, + "step": 1822 + }, + { + "epoch": 0.01823, + "grad_norm": 0.40295282006263733, + "grad_norm_var": 0.0006744779437835765, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.7196571826934814, + "loss/hidden": 0.0, + "loss/logits": 0.19634438306093216, + "loss/reg": 0.8762959241867065, + "step": 1823 + }, + { + "epoch": 0.01824, + "grad_norm": 0.38869625329971313, + "grad_norm_var": 0.0006962458575604883, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.771869957447052, + "loss/hidden": 0.0, + "loss/logits": 0.18820732459425926, + "loss/reg": 0.8756154179573059, + "step": 1824 + }, + { + "epoch": 0.01825, + "grad_norm": 0.38419196009635925, + "grad_norm_var": 0.0005072875532496912, + "learning_rate": 5e-05, + "loss": 0.1916, + "loss/crossentropy": 2.6809815168380737, + "loss/hidden": 0.0, + "loss/logits": 0.1916203871369362, + "loss/reg": 0.8744083642959595, + "step": 1825 + }, + { + "epoch": 0.01826, + "grad_norm": 0.4487023949623108, + "grad_norm_var": 0.0006063934180459326, + "learning_rate": 5e-05, + "loss": 0.2178, + "loss/crossentropy": 2.8156700134277344, + "loss/hidden": 0.0, + "loss/logits": 0.21776284649968147, + "loss/reg": 0.8737536072731018, + "step": 1826 + }, + { + "epoch": 0.01827, + "grad_norm": 0.3885592520236969, + "grad_norm_var": 0.0003911630525487042, + "learning_rate": 5e-05, + "loss": 0.1871, + "loss/crossentropy": 2.7764856219291687, + "loss/hidden": 0.0, + "loss/logits": 0.18714625388383865, + "loss/reg": 0.8732184767723083, + "step": 1827 + }, + { + "epoch": 0.01828, + "grad_norm": 0.4574081599712372, + "grad_norm_var": 0.0005381117093431365, + "learning_rate": 5e-05, + "loss": 0.1966, + "loss/crossentropy": 2.8399417996406555, + "loss/hidden": 0.0, + "loss/logits": 0.19662820547819138, + "loss/reg": 0.8722001910209656, + "step": 1828 + }, + { + "epoch": 0.01829, + "grad_norm": 0.4126952588558197, + "grad_norm_var": 0.0005140311352538636, + "learning_rate": 5e-05, + "loss": 0.2029, + "loss/crossentropy": 2.8371532559394836, + "loss/hidden": 0.0, + "loss/logits": 0.20286116003990173, + "loss/reg": 0.8717215061187744, + "step": 1829 + }, + { + "epoch": 0.0183, + "grad_norm": 0.44622132182121277, + "grad_norm_var": 0.0005966652735205321, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.9045282006263733, + "loss/hidden": 0.0, + "loss/logits": 0.18746205791831017, + "loss/reg": 0.8713307976722717, + "step": 1830 + }, + { + "epoch": 0.01831, + "grad_norm": 0.42101621627807617, + "grad_norm_var": 0.000601932039182614, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.7737737894058228, + "loss/hidden": 0.0, + "loss/logits": 0.19638841599225998, + "loss/reg": 0.8712663650512695, + "step": 1831 + }, + { + "epoch": 0.01832, + "grad_norm": 0.43446776270866394, + "grad_norm_var": 0.0005525170621598444, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.769006133079529, + "loss/hidden": 0.0, + "loss/logits": 0.20632325112819672, + "loss/reg": 0.8707724809646606, + "step": 1832 + }, + { + "epoch": 0.01833, + "grad_norm": 0.42732134461402893, + "grad_norm_var": 0.0005508020339593708, + "learning_rate": 5e-05, + "loss": 0.1962, + "loss/crossentropy": 2.6726030111312866, + "loss/hidden": 0.0, + "loss/logits": 0.19617467373609543, + "loss/reg": 0.8704696893692017, + "step": 1833 + }, + { + "epoch": 0.01834, + "grad_norm": 0.36816877126693726, + "grad_norm_var": 0.000634894013014827, + "learning_rate": 5e-05, + "loss": 0.1791, + "loss/crossentropy": 2.8035976886749268, + "loss/hidden": 0.0, + "loss/logits": 0.1790780983865261, + "loss/reg": 0.870456337928772, + "step": 1834 + }, + { + "epoch": 0.01835, + "grad_norm": 0.4553738534450531, + "grad_norm_var": 0.0007138150602069756, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.8905494809150696, + "loss/hidden": 0.0, + "loss/logits": 0.2036396525800228, + "loss/reg": 0.8699305653572083, + "step": 1835 + }, + { + "epoch": 0.01836, + "grad_norm": 0.40657714009284973, + "grad_norm_var": 0.0007194821629816439, + "learning_rate": 5e-05, + "loss": 0.1953, + "loss/crossentropy": 2.873349666595459, + "loss/hidden": 0.0, + "loss/logits": 0.19530102238059044, + "loss/reg": 0.8697292804718018, + "step": 1836 + }, + { + "epoch": 0.01837, + "grad_norm": 0.4543238580226898, + "grad_norm_var": 0.0008056768340881389, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 3.1166374683380127, + "loss/hidden": 0.0, + "loss/logits": 0.20240385457873344, + "loss/reg": 0.869637131690979, + "step": 1837 + }, + { + "epoch": 0.01838, + "grad_norm": 0.421379953622818, + "grad_norm_var": 0.000804472493204796, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.50061959028244, + "loss/hidden": 0.0, + "loss/logits": 0.20833520963788033, + "loss/reg": 0.8689462542533875, + "step": 1838 + }, + { + "epoch": 0.01839, + "grad_norm": 0.4087032377719879, + "grad_norm_var": 0.0007935618870177426, + "learning_rate": 5e-05, + "loss": 0.2058, + "loss/crossentropy": 2.8472816348075867, + "loss/hidden": 0.0, + "loss/logits": 0.2057865411043167, + "loss/reg": 0.8682885766029358, + "step": 1839 + }, + { + "epoch": 0.0184, + "grad_norm": 0.37850746512413025, + "grad_norm_var": 0.0008428996161608304, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.649823546409607, + "loss/hidden": 0.0, + "loss/logits": 0.17720508575439453, + "loss/reg": 0.8673396110534668, + "step": 1840 + }, + { + "epoch": 0.01841, + "grad_norm": 0.4339388310909271, + "grad_norm_var": 0.0007627055638354899, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.782778263092041, + "loss/hidden": 0.0, + "loss/logits": 0.19349564984440804, + "loss/reg": 0.8672105669975281, + "step": 1841 + }, + { + "epoch": 0.01842, + "grad_norm": 0.4713878333568573, + "grad_norm_var": 0.0008734888219704917, + "learning_rate": 5e-05, + "loss": 0.2131, + "loss/crossentropy": 2.9232393503189087, + "loss/hidden": 0.0, + "loss/logits": 0.21310868114233017, + "loss/reg": 0.8665778040885925, + "step": 1842 + }, + { + "epoch": 0.01843, + "grad_norm": 0.4213177263736725, + "grad_norm_var": 0.0007852010018439011, + "learning_rate": 5e-05, + "loss": 0.1883, + "loss/crossentropy": 2.7351691126823425, + "loss/hidden": 0.0, + "loss/logits": 0.18833507969975471, + "loss/reg": 0.8655192852020264, + "step": 1843 + }, + { + "epoch": 0.01844, + "grad_norm": 0.3939739465713501, + "grad_norm_var": 0.0007725325420692782, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.9118587970733643, + "loss/hidden": 0.0, + "loss/logits": 0.19397348538041115, + "loss/reg": 0.8646129369735718, + "step": 1844 + }, + { + "epoch": 0.01845, + "grad_norm": 0.47830748558044434, + "grad_norm_var": 0.0009583470904952189, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.748432219028473, + "loss/hidden": 0.0, + "loss/logits": 0.21417668461799622, + "loss/reg": 0.8641530871391296, + "step": 1845 + }, + { + "epoch": 0.01846, + "grad_norm": 0.40475931763648987, + "grad_norm_var": 0.000955724836401813, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.8377654552459717, + "loss/hidden": 0.0, + "loss/logits": 0.17845793813467026, + "loss/reg": 0.8638774156570435, + "step": 1846 + }, + { + "epoch": 0.01847, + "grad_norm": 0.412720263004303, + "grad_norm_var": 0.00096301732033884, + "learning_rate": 5e-05, + "loss": 0.1763, + "loss/crossentropy": 2.839793860912323, + "loss/hidden": 0.0, + "loss/logits": 0.17627452313899994, + "loss/reg": 0.8637427091598511, + "step": 1847 + }, + { + "epoch": 0.01848, + "grad_norm": 0.40244048833847046, + "grad_norm_var": 0.0009790173845574976, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.894149899482727, + "loss/hidden": 0.0, + "loss/logits": 0.19404540956020355, + "loss/reg": 0.8632751703262329, + "step": 1848 + }, + { + "epoch": 0.01849, + "grad_norm": 0.4253540635108948, + "grad_norm_var": 0.0009776536425150676, + "learning_rate": 5e-05, + "loss": 0.1916, + "loss/crossentropy": 2.7934269309043884, + "loss/hidden": 0.0, + "loss/logits": 0.19155260547995567, + "loss/reg": 0.8631685376167297, + "step": 1849 + }, + { + "epoch": 0.0185, + "grad_norm": 0.43653759360313416, + "grad_norm_var": 0.0007874927555983254, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.778549313545227, + "loss/hidden": 0.0, + "loss/logits": 0.2000497616827488, + "loss/reg": 0.8622583150863647, + "step": 1850 + }, + { + "epoch": 0.01851, + "grad_norm": 0.4531484544277191, + "grad_norm_var": 0.0007788936634817945, + "learning_rate": 5e-05, + "loss": 0.2031, + "loss/crossentropy": 2.610984146595001, + "loss/hidden": 0.0, + "loss/logits": 0.2030741162598133, + "loss/reg": 0.8614676594734192, + "step": 1851 + }, + { + "epoch": 0.01852, + "grad_norm": 0.47085142135620117, + "grad_norm_var": 0.0008774013336590041, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.735887825489044, + "loss/hidden": 0.0, + "loss/logits": 0.20505616813898087, + "loss/reg": 0.861035168170929, + "step": 1852 + }, + { + "epoch": 0.01853, + "grad_norm": 0.4040316045284271, + "grad_norm_var": 0.0008672012734844969, + "learning_rate": 5e-05, + "loss": 0.2319, + "loss/crossentropy": 2.7442225217819214, + "loss/hidden": 0.0, + "loss/logits": 0.231895312666893, + "loss/reg": 0.8610429167747498, + "step": 1853 + }, + { + "epoch": 0.01854, + "grad_norm": 0.41742706298828125, + "grad_norm_var": 0.0008706576516620286, + "learning_rate": 5e-05, + "loss": 0.2052, + "loss/crossentropy": 2.814299166202545, + "loss/hidden": 0.0, + "loss/logits": 0.20518312603235245, + "loss/reg": 0.8606191873550415, + "step": 1854 + }, + { + "epoch": 0.01855, + "grad_norm": 0.463815838098526, + "grad_norm_var": 0.000934583450987155, + "learning_rate": 5e-05, + "loss": 0.213, + "loss/crossentropy": 2.772307276725769, + "loss/hidden": 0.0, + "loss/logits": 0.21297916769981384, + "loss/reg": 0.8602007627487183, + "step": 1855 + }, + { + "epoch": 0.01856, + "grad_norm": 0.4082344174385071, + "grad_norm_var": 0.0007885627261811449, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.8122514486312866, + "loss/hidden": 0.0, + "loss/logits": 0.18972306698560715, + "loss/reg": 0.86012864112854, + "step": 1856 + }, + { + "epoch": 0.01857, + "grad_norm": 0.4443175792694092, + "grad_norm_var": 0.0007991676930914182, + "learning_rate": 5e-05, + "loss": 0.2157, + "loss/crossentropy": 2.6094889044761658, + "loss/hidden": 0.0, + "loss/logits": 0.21571212634444237, + "loss/reg": 0.8605313301086426, + "step": 1857 + }, + { + "epoch": 0.01858, + "grad_norm": 0.47876986861228943, + "grad_norm_var": 0.0008415495263010108, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.6963194012641907, + "loss/hidden": 0.0, + "loss/logits": 0.22212516888976097, + "loss/reg": 0.8600078821182251, + "step": 1858 + }, + { + "epoch": 0.01859, + "grad_norm": 0.8267810940742493, + "grad_norm_var": 0.01052554114220392, + "learning_rate": 5e-05, + "loss": 0.2183, + "loss/crossentropy": 2.7884796261787415, + "loss/hidden": 0.0, + "loss/logits": 0.21834344044327736, + "loss/reg": 0.8599786758422852, + "step": 1859 + }, + { + "epoch": 0.0186, + "grad_norm": 0.5109309554100037, + "grad_norm_var": 0.01038839950993009, + "learning_rate": 5e-05, + "loss": 0.1894, + "loss/crossentropy": 2.711488723754883, + "loss/hidden": 0.0, + "loss/logits": 0.18943556025624275, + "loss/reg": 0.8601222038269043, + "step": 1860 + }, + { + "epoch": 0.01861, + "grad_norm": 0.44234102964401245, + "grad_norm_var": 0.0104049609113968, + "learning_rate": 5e-05, + "loss": 0.1861, + "loss/crossentropy": 2.85012149810791, + "loss/hidden": 0.0, + "loss/logits": 0.1861085630953312, + "loss/reg": 0.8600141406059265, + "step": 1861 + }, + { + "epoch": 0.01862, + "grad_norm": 0.4488767981529236, + "grad_norm_var": 0.010186053331192524, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.6850525736808777, + "loss/hidden": 0.0, + "loss/logits": 0.18929185718297958, + "loss/reg": 0.8600738644599915, + "step": 1862 + }, + { + "epoch": 0.01863, + "grad_norm": 0.42296475172042847, + "grad_norm_var": 0.01012064050706446, + "learning_rate": 5e-05, + "loss": 0.1931, + "loss/crossentropy": 2.823063313961029, + "loss/hidden": 0.0, + "loss/logits": 0.19307823106646538, + "loss/reg": 0.860470712184906, + "step": 1863 + }, + { + "epoch": 0.01864, + "grad_norm": 0.4614562392234802, + "grad_norm_var": 0.009837779451030537, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.8724761605262756, + "loss/hidden": 0.0, + "loss/logits": 0.19572097808122635, + "loss/reg": 0.8609605431556702, + "step": 1864 + }, + { + "epoch": 0.01865, + "grad_norm": 0.4670039117336273, + "grad_norm_var": 0.009699710240888716, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.908924877643585, + "loss/hidden": 0.0, + "loss/logits": 0.1882306933403015, + "loss/reg": 0.8610231280326843, + "step": 1865 + }, + { + "epoch": 0.01866, + "grad_norm": 0.4499320387840271, + "grad_norm_var": 0.009646977562170866, + "learning_rate": 5e-05, + "loss": 0.2121, + "loss/crossentropy": 2.960861086845398, + "loss/hidden": 0.0, + "loss/logits": 0.21211780235171318, + "loss/reg": 0.8618782162666321, + "step": 1866 + }, + { + "epoch": 0.01867, + "grad_norm": 0.4010671079158783, + "grad_norm_var": 0.009955610707336915, + "learning_rate": 5e-05, + "loss": 0.1869, + "loss/crossentropy": 2.8528873920440674, + "loss/hidden": 0.0, + "loss/logits": 0.18694154918193817, + "loss/reg": 0.8613529205322266, + "step": 1867 + }, + { + "epoch": 0.01868, + "grad_norm": 0.4370492100715637, + "grad_norm_var": 0.010022847689133333, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.8674468398094177, + "loss/hidden": 0.0, + "loss/logits": 0.20157770067453384, + "loss/reg": 0.8607704043388367, + "step": 1868 + }, + { + "epoch": 0.01869, + "grad_norm": 0.44706204533576965, + "grad_norm_var": 0.009772638036635502, + "learning_rate": 5e-05, + "loss": 0.2073, + "loss/crossentropy": 2.6901365518569946, + "loss/hidden": 0.0, + "loss/logits": 0.20734887197613716, + "loss/reg": 0.8604735136032104, + "step": 1869 + }, + { + "epoch": 0.0187, + "grad_norm": 0.4227212071418762, + "grad_norm_var": 0.009736925025791706, + "learning_rate": 5e-05, + "loss": 0.1888, + "loss/crossentropy": 2.9464325308799744, + "loss/hidden": 0.0, + "loss/logits": 0.18880737200379372, + "loss/reg": 0.8606537580490112, + "step": 1870 + }, + { + "epoch": 0.01871, + "grad_norm": 0.4298408329486847, + "grad_norm_var": 0.00984085547868165, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.808286964893341, + "loss/hidden": 0.0, + "loss/logits": 0.19672463834285736, + "loss/reg": 0.8605533838272095, + "step": 1871 + }, + { + "epoch": 0.01872, + "grad_norm": 0.4341031014919281, + "grad_norm_var": 0.009674092300272127, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.726570785045624, + "loss/hidden": 0.0, + "loss/logits": 0.21193281561136246, + "loss/reg": 0.8598572611808777, + "step": 1872 + }, + { + "epoch": 0.01873, + "grad_norm": 0.40797707438468933, + "grad_norm_var": 0.009882653573959918, + "learning_rate": 5e-05, + "loss": 0.2006, + "loss/crossentropy": 2.7558920979499817, + "loss/hidden": 0.0, + "loss/logits": 0.2006087228655815, + "loss/reg": 0.8593345880508423, + "step": 1873 + }, + { + "epoch": 0.01874, + "grad_norm": 0.4248046576976776, + "grad_norm_var": 0.00998757024144767, + "learning_rate": 5e-05, + "loss": 0.2014, + "loss/crossentropy": 2.8610259294509888, + "loss/hidden": 0.0, + "loss/logits": 0.20143526792526245, + "loss/reg": 0.8589630722999573, + "step": 1874 + }, + { + "epoch": 0.01875, + "grad_norm": 0.415412962436676, + "grad_norm_var": 0.000703250459169195, + "learning_rate": 5e-05, + "loss": 0.1902, + "loss/crossentropy": 2.6619481444358826, + "loss/hidden": 0.0, + "loss/logits": 0.1901528425514698, + "loss/reg": 0.8587875962257385, + "step": 1875 + }, + { + "epoch": 0.01876, + "grad_norm": 0.3922654688358307, + "grad_norm_var": 0.0004447968186206778, + "learning_rate": 5e-05, + "loss": 0.1917, + "loss/crossentropy": 2.875247359275818, + "loss/hidden": 0.0, + "loss/logits": 0.19171729311347008, + "loss/reg": 0.8583662509918213, + "step": 1876 + }, + { + "epoch": 0.01877, + "grad_norm": 0.41292887926101685, + "grad_norm_var": 0.00045656488741578903, + "learning_rate": 5e-05, + "loss": 0.2055, + "loss/crossentropy": 2.69901043176651, + "loss/hidden": 0.0, + "loss/logits": 0.20547962561249733, + "loss/reg": 0.8579082489013672, + "step": 1877 + }, + { + "epoch": 0.01878, + "grad_norm": 0.49054309725761414, + "grad_norm_var": 0.0006715142851825543, + "learning_rate": 5e-05, + "loss": 0.2157, + "loss/crossentropy": 2.7710567116737366, + "loss/hidden": 0.0, + "loss/logits": 0.21566850692033768, + "loss/reg": 0.8571624159812927, + "step": 1878 + }, + { + "epoch": 0.01879, + "grad_norm": 0.4159642457962036, + "grad_norm_var": 0.0006833101582665221, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.7000783681869507, + "loss/hidden": 0.0, + "loss/logits": 0.20450333133339882, + "loss/reg": 0.8566645979881287, + "step": 1879 + }, + { + "epoch": 0.0188, + "grad_norm": 0.41294634342193604, + "grad_norm_var": 0.0006391081317155453, + "learning_rate": 5e-05, + "loss": 0.1999, + "loss/crossentropy": 2.711737096309662, + "loss/hidden": 0.0, + "loss/logits": 0.19987360760569572, + "loss/reg": 0.8563292622566223, + "step": 1880 + }, + { + "epoch": 0.01881, + "grad_norm": 0.3933762013912201, + "grad_norm_var": 0.0006033787049420665, + "learning_rate": 5e-05, + "loss": 0.1932, + "loss/crossentropy": 2.5067747831344604, + "loss/hidden": 0.0, + "loss/logits": 0.1932012103497982, + "loss/reg": 0.856222927570343, + "step": 1881 + }, + { + "epoch": 0.01882, + "grad_norm": 0.39141905307769775, + "grad_norm_var": 0.0006169972349161612, + "learning_rate": 5e-05, + "loss": 0.1942, + "loss/crossentropy": 2.7386473417282104, + "loss/hidden": 0.0, + "loss/logits": 0.19419138133525848, + "loss/reg": 0.8559076189994812, + "step": 1882 + }, + { + "epoch": 0.01883, + "grad_norm": 0.40779832005500793, + "grad_norm_var": 0.0006023050366477952, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.7678736448287964, + "loss/hidden": 0.0, + "loss/logits": 0.18222039192914963, + "loss/reg": 0.8556982278823853, + "step": 1883 + }, + { + "epoch": 0.01884, + "grad_norm": 0.3862144649028778, + "grad_norm_var": 0.0006551248482100055, + "learning_rate": 5e-05, + "loss": 0.1922, + "loss/crossentropy": 2.750099837779999, + "loss/hidden": 0.0, + "loss/logits": 0.19219841435551643, + "loss/reg": 0.8550093173980713, + "step": 1884 + }, + { + "epoch": 0.01885, + "grad_norm": 0.3694680333137512, + "grad_norm_var": 0.0007290592163974356, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.774383544921875, + "loss/hidden": 0.0, + "loss/logits": 0.17750544473528862, + "loss/reg": 0.8547077775001526, + "step": 1885 + }, + { + "epoch": 0.01886, + "grad_norm": 0.3932516574859619, + "grad_norm_var": 0.0007450872750751143, + "learning_rate": 5e-05, + "loss": 0.1907, + "loss/crossentropy": 2.850859224796295, + "loss/hidden": 0.0, + "loss/logits": 0.19068051874637604, + "loss/reg": 0.8540486693382263, + "step": 1886 + }, + { + "epoch": 0.01887, + "grad_norm": 0.4164002239704132, + "grad_norm_var": 0.0007228728195628069, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.7217097878456116, + "loss/hidden": 0.0, + "loss/logits": 0.20161013677716255, + "loss/reg": 0.8533588647842407, + "step": 1887 + }, + { + "epoch": 0.01888, + "grad_norm": 0.4806760251522064, + "grad_norm_var": 0.001006219679751382, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.9720823168754578, + "loss/hidden": 0.0, + "loss/logits": 0.19651786983013153, + "loss/reg": 0.8531802892684937, + "step": 1888 + }, + { + "epoch": 0.01889, + "grad_norm": 0.4182739555835724, + "grad_norm_var": 0.0010056544745641364, + "learning_rate": 5e-05, + "loss": 0.199, + "loss/crossentropy": 2.8201688528060913, + "loss/hidden": 0.0, + "loss/logits": 0.19896001368761063, + "loss/reg": 0.852277398109436, + "step": 1889 + }, + { + "epoch": 0.0189, + "grad_norm": 0.45667311549186707, + "grad_norm_var": 0.0011156389935623918, + "learning_rate": 5e-05, + "loss": 0.202, + "loss/crossentropy": 2.78415310382843, + "loss/hidden": 0.0, + "loss/logits": 0.20198464766144753, + "loss/reg": 0.8516212105751038, + "step": 1890 + }, + { + "epoch": 0.01891, + "grad_norm": 0.4377621114253998, + "grad_norm_var": 0.0011455522062469323, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.7693448662757874, + "loss/hidden": 0.0, + "loss/logits": 0.20859498530626297, + "loss/reg": 0.8512976169586182, + "step": 1891 + }, + { + "epoch": 0.01892, + "grad_norm": 0.39116787910461426, + "grad_norm_var": 0.0011492835139240007, + "learning_rate": 5e-05, + "loss": 0.1986, + "loss/crossentropy": 2.9238383769989014, + "loss/hidden": 0.0, + "loss/logits": 0.19861867651343346, + "loss/reg": 0.8505960702896118, + "step": 1892 + }, + { + "epoch": 0.01893, + "grad_norm": 0.49259769916534424, + "grad_norm_var": 0.001500831881940127, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.7086846828460693, + "loss/hidden": 0.0, + "loss/logits": 0.18557342514395714, + "loss/reg": 0.8502125144004822, + "step": 1893 + }, + { + "epoch": 0.01894, + "grad_norm": 0.4160771667957306, + "grad_norm_var": 0.001168426734231694, + "learning_rate": 5e-05, + "loss": 0.1981, + "loss/crossentropy": 2.7745996713638306, + "loss/hidden": 0.0, + "loss/logits": 0.198074109852314, + "loss/reg": 0.8496638536453247, + "step": 1894 + }, + { + "epoch": 0.01895, + "grad_norm": 0.41058945655822754, + "grad_norm_var": 0.001171335815736004, + "learning_rate": 5e-05, + "loss": 0.1913, + "loss/crossentropy": 2.8345044255256653, + "loss/hidden": 0.0, + "loss/logits": 0.1913003958761692, + "loss/reg": 0.8487659692764282, + "step": 1895 + }, + { + "epoch": 0.01896, + "grad_norm": 0.39137691259384155, + "grad_norm_var": 0.0012125551676121801, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.756899118423462, + "loss/hidden": 0.0, + "loss/logits": 0.18267197161912918, + "loss/reg": 0.8477123379707336, + "step": 1896 + }, + { + "epoch": 0.01897, + "grad_norm": 0.4524717628955841, + "grad_norm_var": 0.0012539780327675333, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.8982661962509155, + "loss/hidden": 0.0, + "loss/logits": 0.2035953588783741, + "loss/reg": 0.8468810319900513, + "step": 1897 + }, + { + "epoch": 0.01898, + "grad_norm": 0.4107264280319214, + "grad_norm_var": 0.0012049521548068006, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.693164646625519, + "loss/hidden": 0.0, + "loss/logits": 0.20767628774046898, + "loss/reg": 0.8459100127220154, + "step": 1898 + }, + { + "epoch": 0.01899, + "grad_norm": 0.403199702501297, + "grad_norm_var": 0.0012141969750022205, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.6866546869277954, + "loss/hidden": 0.0, + "loss/logits": 0.19577254354953766, + "loss/reg": 0.8448077440261841, + "step": 1899 + }, + { + "epoch": 0.019, + "grad_norm": 0.40245264768600464, + "grad_norm_var": 0.0011565908327204018, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.6641695499420166, + "loss/hidden": 0.0, + "loss/logits": 0.18845220282673836, + "loss/reg": 0.8437227010726929, + "step": 1900 + }, + { + "epoch": 0.01901, + "grad_norm": 0.3748202621936798, + "grad_norm_var": 0.001121286883045806, + "learning_rate": 5e-05, + "loss": 0.1874, + "loss/crossentropy": 2.87463241815567, + "loss/hidden": 0.0, + "loss/logits": 0.18736360594630241, + "loss/reg": 0.8430832624435425, + "step": 1901 + }, + { + "epoch": 0.01902, + "grad_norm": 0.4243621528148651, + "grad_norm_var": 0.0010634312725284515, + "learning_rate": 5e-05, + "loss": 0.213, + "loss/crossentropy": 2.8850821256637573, + "loss/hidden": 0.0, + "loss/logits": 0.2129940651357174, + "loss/reg": 0.8422893285751343, + "step": 1902 + }, + { + "epoch": 0.01903, + "grad_norm": 0.375873863697052, + "grad_norm_var": 0.0012056692127686082, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 2.9269325137138367, + "loss/hidden": 0.0, + "loss/logits": 0.19388863816857338, + "loss/reg": 0.8414551019668579, + "step": 1903 + }, + { + "epoch": 0.01904, + "grad_norm": 0.42073309421539307, + "grad_norm_var": 0.0009548363804637934, + "learning_rate": 5e-05, + "loss": 0.2025, + "loss/crossentropy": 2.8575509190559387, + "loss/hidden": 0.0, + "loss/logits": 0.20251396670937538, + "loss/reg": 0.8409824371337891, + "step": 1904 + }, + { + "epoch": 0.01905, + "grad_norm": 0.44542282819747925, + "grad_norm_var": 0.001003894760507947, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.881743311882019, + "loss/hidden": 0.0, + "loss/logits": 0.2086678370833397, + "loss/reg": 0.8402405381202698, + "step": 1905 + }, + { + "epoch": 0.01906, + "grad_norm": 0.4176529049873352, + "grad_norm_var": 0.0009038042833217001, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.762676239013672, + "loss/hidden": 0.0, + "loss/logits": 0.20464308559894562, + "loss/reg": 0.8392151594161987, + "step": 1906 + }, + { + "epoch": 0.01907, + "grad_norm": 0.4172472059726715, + "grad_norm_var": 0.0008725113390647336, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.9665162563323975, + "loss/hidden": 0.0, + "loss/logits": 0.19487519562244415, + "loss/reg": 0.8380366563796997, + "step": 1907 + }, + { + "epoch": 0.01908, + "grad_norm": 0.4117933511734009, + "grad_norm_var": 0.0008323956791387459, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.9393438696861267, + "loss/hidden": 0.0, + "loss/logits": 0.18285026401281357, + "loss/reg": 0.8377097249031067, + "step": 1908 + }, + { + "epoch": 0.01909, + "grad_norm": 0.4283354580402374, + "grad_norm_var": 0.0004402894728716948, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.8670120239257812, + "loss/hidden": 0.0, + "loss/logits": 0.20037797465920448, + "loss/reg": 0.83690345287323, + "step": 1909 + }, + { + "epoch": 0.0191, + "grad_norm": 0.4141778349876404, + "grad_norm_var": 0.0004396586654966228, + "learning_rate": 5e-05, + "loss": 0.192, + "loss/crossentropy": 2.7264450788497925, + "loss/hidden": 0.0, + "loss/logits": 0.19202467799186707, + "loss/reg": 0.8363360166549683, + "step": 1910 + }, + { + "epoch": 0.01911, + "grad_norm": 0.3901000916957855, + "grad_norm_var": 0.00047132750558592784, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.662300944328308, + "loss/hidden": 0.0, + "loss/logits": 0.17456327751278877, + "loss/reg": 0.8360676765441895, + "step": 1911 + }, + { + "epoch": 0.01912, + "grad_norm": 0.4108651280403137, + "grad_norm_var": 0.00044330438000829143, + "learning_rate": 5e-05, + "loss": 0.1979, + "loss/crossentropy": 2.7080438137054443, + "loss/hidden": 0.0, + "loss/logits": 0.19789596647024155, + "loss/reg": 0.8352373242378235, + "step": 1912 + }, + { + "epoch": 0.01913, + "grad_norm": 0.3944729268550873, + "grad_norm_var": 0.0003445506227322981, + "learning_rate": 5e-05, + "loss": 0.1837, + "loss/crossentropy": 2.5074052810668945, + "loss/hidden": 0.0, + "loss/logits": 0.18367857486009598, + "loss/reg": 0.8342070579528809, + "step": 1913 + }, + { + "epoch": 0.01914, + "grad_norm": 0.40939974784851074, + "grad_norm_var": 0.0003443357351905522, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.710819363594055, + "loss/hidden": 0.0, + "loss/logits": 0.19682180508971214, + "loss/reg": 0.8332958221435547, + "step": 1914 + }, + { + "epoch": 0.01915, + "grad_norm": 0.7204210758209229, + "grad_norm_var": 0.006396513333546554, + "learning_rate": 5e-05, + "loss": 0.2156, + "loss/crossentropy": 2.985659956932068, + "loss/hidden": 0.0, + "loss/logits": 0.215598963201046, + "loss/reg": 0.8321488499641418, + "step": 1915 + }, + { + "epoch": 0.01916, + "grad_norm": 0.5111802220344543, + "grad_norm_var": 0.006755829513090935, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.765591084957123, + "loss/hidden": 0.0, + "loss/logits": 0.20833655446767807, + "loss/reg": 0.8310338258743286, + "step": 1916 + }, + { + "epoch": 0.01917, + "grad_norm": 0.4084041714668274, + "grad_norm_var": 0.006554926472281564, + "learning_rate": 5e-05, + "loss": 0.1914, + "loss/crossentropy": 2.8071027994155884, + "loss/hidden": 0.0, + "loss/logits": 0.19143269956111908, + "loss/reg": 0.8302226662635803, + "step": 1917 + }, + { + "epoch": 0.01918, + "grad_norm": 0.44032594561576843, + "grad_norm_var": 0.006542831349444234, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.7133736610412598, + "loss/hidden": 0.0, + "loss/logits": 0.20351077988743782, + "loss/reg": 0.8293207883834839, + "step": 1918 + }, + { + "epoch": 0.01919, + "grad_norm": 0.46147823333740234, + "grad_norm_var": 0.00628573912805136, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.8348661065101624, + "loss/hidden": 0.0, + "loss/logits": 0.2017073966562748, + "loss/reg": 0.8288859724998474, + "step": 1919 + }, + { + "epoch": 0.0192, + "grad_norm": 0.3947501480579376, + "grad_norm_var": 0.0064081085864480395, + "learning_rate": 5e-05, + "loss": 0.1891, + "loss/crossentropy": 2.7925063967704773, + "loss/hidden": 0.0, + "loss/logits": 0.1890932358801365, + "loss/reg": 0.8279058933258057, + "step": 1920 + }, + { + "epoch": 0.01921, + "grad_norm": 0.4013521373271942, + "grad_norm_var": 0.006510863884384721, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.7446338534355164, + "loss/hidden": 0.0, + "loss/logits": 0.19776693731546402, + "loss/reg": 0.8273690342903137, + "step": 1921 + }, + { + "epoch": 0.01922, + "grad_norm": 0.3815420866012573, + "grad_norm_var": 0.006697539133585843, + "learning_rate": 5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.911185562610626, + "loss/hidden": 0.0, + "loss/logits": 0.17821616306900978, + "loss/reg": 0.8266914486885071, + "step": 1922 + }, + { + "epoch": 0.01923, + "grad_norm": 0.36894431710243225, + "grad_norm_var": 0.006972125815576998, + "learning_rate": 5e-05, + "loss": 0.184, + "loss/crossentropy": 2.8202965259552, + "loss/hidden": 0.0, + "loss/logits": 0.1839989572763443, + "loss/reg": 0.826550304889679, + "step": 1923 + }, + { + "epoch": 0.01924, + "grad_norm": 0.41022172570228577, + "grad_norm_var": 0.006976979996265461, + "learning_rate": 5e-05, + "loss": 0.1841, + "loss/crossentropy": 2.8062856197357178, + "loss/hidden": 0.0, + "loss/logits": 0.18412147462368011, + "loss/reg": 0.8260629177093506, + "step": 1924 + }, + { + "epoch": 0.01925, + "grad_norm": 0.42535021901130676, + "grad_norm_var": 0.006979840681534692, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.9198508858680725, + "loss/hidden": 0.0, + "loss/logits": 0.20462898537516594, + "loss/reg": 0.8256332874298096, + "step": 1925 + }, + { + "epoch": 0.01926, + "grad_norm": 0.41190701723098755, + "grad_norm_var": 0.006986145451577327, + "learning_rate": 5e-05, + "loss": 0.1974, + "loss/crossentropy": 2.8014107942581177, + "loss/hidden": 0.0, + "loss/logits": 0.1974184885621071, + "loss/reg": 0.8247300386428833, + "step": 1926 + }, + { + "epoch": 0.01927, + "grad_norm": 0.44452086091041565, + "grad_norm_var": 0.006854194152772293, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.822911560535431, + "loss/hidden": 0.0, + "loss/logits": 0.20071227848529816, + "loss/reg": 0.8244222402572632, + "step": 1927 + }, + { + "epoch": 0.01928, + "grad_norm": 0.5008922219276428, + "grad_norm_var": 0.007044683448397379, + "learning_rate": 5e-05, + "loss": 0.2141, + "loss/crossentropy": 2.8664467334747314, + "loss/hidden": 0.0, + "loss/logits": 0.2140694372355938, + "loss/reg": 0.8241965174674988, + "step": 1928 + }, + { + "epoch": 0.01929, + "grad_norm": 0.42727142572402954, + "grad_norm_var": 0.006900477335719032, + "learning_rate": 5e-05, + "loss": 0.1976, + "loss/crossentropy": 2.8532140851020813, + "loss/hidden": 0.0, + "loss/logits": 0.19760168716311455, + "loss/reg": 0.8241434097290039, + "step": 1929 + }, + { + "epoch": 0.0193, + "grad_norm": 0.3961251378059387, + "grad_norm_var": 0.006974275889228206, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.774713635444641, + "loss/hidden": 0.0, + "loss/logits": 0.18682782351970673, + "loss/reg": 0.8236058354377747, + "step": 1930 + }, + { + "epoch": 0.01931, + "grad_norm": 0.4069480299949646, + "grad_norm_var": 0.001564247241811408, + "learning_rate": 5e-05, + "loss": 0.183, + "loss/crossentropy": 2.7575384974479675, + "loss/hidden": 0.0, + "loss/logits": 0.1830037534236908, + "loss/reg": 0.8234732747077942, + "step": 1931 + }, + { + "epoch": 0.01932, + "grad_norm": 0.3841603994369507, + "grad_norm_var": 0.001103778174189011, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.786939561367035, + "loss/hidden": 0.0, + "loss/logits": 0.1853381060063839, + "loss/reg": 0.8227251768112183, + "step": 1932 + }, + { + "epoch": 0.01933, + "grad_norm": 0.42709746956825256, + "grad_norm_var": 0.0011054095386222298, + "learning_rate": 5e-05, + "loss": 0.2276, + "loss/crossentropy": 2.7064501643180847, + "loss/hidden": 0.0, + "loss/logits": 0.22762993723154068, + "loss/reg": 0.8224155306816101, + "step": 1933 + }, + { + "epoch": 0.01934, + "grad_norm": 0.4189620614051819, + "grad_norm_var": 0.0010694294421293808, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.8003990650177, + "loss/hidden": 0.0, + "loss/logits": 0.20464803650975227, + "loss/reg": 0.8217348456382751, + "step": 1934 + }, + { + "epoch": 0.01935, + "grad_norm": 0.42320385575294495, + "grad_norm_var": 0.0009306623657763987, + "learning_rate": 5e-05, + "loss": 0.1894, + "loss/crossentropy": 2.7599529027938843, + "loss/hidden": 0.0, + "loss/logits": 0.18942034244537354, + "loss/reg": 0.8217450976371765, + "step": 1935 + }, + { + "epoch": 0.01936, + "grad_norm": 0.43891292810440063, + "grad_norm_var": 0.0009394853920891936, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.941531538963318, + "loss/hidden": 0.0, + "loss/logits": 0.19884098693728447, + "loss/reg": 0.8212395906448364, + "step": 1936 + }, + { + "epoch": 0.01937, + "grad_norm": 0.38071680068969727, + "grad_norm_var": 0.0010083631744948343, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.777757167816162, + "loss/hidden": 0.0, + "loss/logits": 0.1832246519625187, + "loss/reg": 0.8206287622451782, + "step": 1937 + }, + { + "epoch": 0.01938, + "grad_norm": 0.4053645431995392, + "grad_norm_var": 0.0009362139371330458, + "learning_rate": 5e-05, + "loss": 0.1942, + "loss/crossentropy": 2.6445663571357727, + "loss/hidden": 0.0, + "loss/logits": 0.19421947374939919, + "loss/reg": 0.8199341893196106, + "step": 1938 + }, + { + "epoch": 0.01939, + "grad_norm": 0.3973230719566345, + "grad_norm_var": 0.0008050451379228551, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.7140950560569763, + "loss/hidden": 0.0, + "loss/logits": 0.19255831465125084, + "loss/reg": 0.8188463449478149, + "step": 1939 + }, + { + "epoch": 0.0194, + "grad_norm": 0.3956315219402313, + "grad_norm_var": 0.0008348160456935834, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.704246759414673, + "loss/hidden": 0.0, + "loss/logits": 0.19747210666537285, + "loss/reg": 0.8183398246765137, + "step": 1940 + }, + { + "epoch": 0.01941, + "grad_norm": 0.4323425889015198, + "grad_norm_var": 0.0008449350953672311, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.6787729263305664, + "loss/hidden": 0.0, + "loss/logits": 0.2032655067741871, + "loss/reg": 0.8175671100616455, + "step": 1941 + }, + { + "epoch": 0.01942, + "grad_norm": 0.42692750692367554, + "grad_norm_var": 0.000846410359529115, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.741370141506195, + "loss/hidden": 0.0, + "loss/logits": 0.1897616647183895, + "loss/reg": 0.8167078495025635, + "step": 1942 + }, + { + "epoch": 0.01943, + "grad_norm": 0.5386472344398499, + "grad_norm_var": 0.0017185548646476998, + "learning_rate": 5e-05, + "loss": 0.2308, + "loss/crossentropy": 2.8304240703582764, + "loss/hidden": 0.0, + "loss/logits": 0.23077243193984032, + "loss/reg": 0.8157216310501099, + "step": 1943 + }, + { + "epoch": 0.01944, + "grad_norm": 0.39153966307640076, + "grad_norm_var": 0.001359874314681516, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.911763846874237, + "loss/hidden": 0.0, + "loss/logits": 0.18419482558965683, + "loss/reg": 0.8156632781028748, + "step": 1944 + }, + { + "epoch": 0.01945, + "grad_norm": 0.46944209933280945, + "grad_norm_var": 0.0015220376425207978, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.9459604024887085, + "loss/hidden": 0.0, + "loss/logits": 0.20351316034793854, + "loss/reg": 0.8148277401924133, + "step": 1945 + }, + { + "epoch": 0.01946, + "grad_norm": 0.5551398396492004, + "grad_norm_var": 0.002578514831995271, + "learning_rate": 5e-05, + "loss": 0.2106, + "loss/crossentropy": 2.837183356285095, + "loss/hidden": 0.0, + "loss/logits": 0.21064143255352974, + "loss/reg": 0.8145296573638916, + "step": 1946 + }, + { + "epoch": 0.01947, + "grad_norm": 0.5528098940849304, + "grad_norm_var": 0.0034449012988506072, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.9039652347564697, + "loss/hidden": 0.0, + "loss/logits": 0.19607802107930183, + "loss/reg": 0.8138279318809509, + "step": 1947 + }, + { + "epoch": 0.01948, + "grad_norm": 0.415836900472641, + "grad_norm_var": 0.0032722428995358302, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.6907394528388977, + "loss/hidden": 0.0, + "loss/logits": 0.20860682427883148, + "loss/reg": 0.8135868310928345, + "step": 1948 + }, + { + "epoch": 0.01949, + "grad_norm": 0.38912898302078247, + "grad_norm_var": 0.00343712172172476, + "learning_rate": 5e-05, + "loss": 0.1984, + "loss/crossentropy": 2.80822890996933, + "loss/hidden": 0.0, + "loss/logits": 0.19835097342729568, + "loss/reg": 0.813839316368103, + "step": 1949 + }, + { + "epoch": 0.0195, + "grad_norm": 0.40182438492774963, + "grad_norm_var": 0.0035023975724768897, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.7360247373580933, + "loss/hidden": 0.0, + "loss/logits": 0.19426304474473, + "loss/reg": 0.8130156993865967, + "step": 1950 + }, + { + "epoch": 0.01951, + "grad_norm": 0.5061089992523193, + "grad_norm_var": 0.0037637273327097245, + "learning_rate": 5e-05, + "loss": 0.2302, + "loss/crossentropy": 2.810616374015808, + "loss/hidden": 0.0, + "loss/logits": 0.23021040111780167, + "loss/reg": 0.8126420974731445, + "step": 1951 + }, + { + "epoch": 0.01952, + "grad_norm": 0.40442657470703125, + "grad_norm_var": 0.0038596389857129174, + "learning_rate": 5e-05, + "loss": 0.1996, + "loss/crossentropy": 2.7186889052391052, + "loss/hidden": 0.0, + "loss/logits": 0.1996094360947609, + "loss/reg": 0.812751293182373, + "step": 1952 + }, + { + "epoch": 0.01953, + "grad_norm": 0.3843165338039398, + "grad_norm_var": 0.0038312987729529585, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.771174371242523, + "loss/hidden": 0.0, + "loss/logits": 0.19567937403917313, + "loss/reg": 0.8128808736801147, + "step": 1953 + }, + { + "epoch": 0.01954, + "grad_norm": 0.46193453669548035, + "grad_norm_var": 0.00375742651823829, + "learning_rate": 5e-05, + "loss": 0.1962, + "loss/crossentropy": 2.7205318808555603, + "loss/hidden": 0.0, + "loss/logits": 0.19615139067173004, + "loss/reg": 0.8125832676887512, + "step": 1954 + }, + { + "epoch": 0.01955, + "grad_norm": 0.4301360249519348, + "grad_norm_var": 0.003615205873543022, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.7779752612113953, + "loss/hidden": 0.0, + "loss/logits": 0.2022506631910801, + "loss/reg": 0.8117827773094177, + "step": 1955 + }, + { + "epoch": 0.01956, + "grad_norm": 0.4634301960468292, + "grad_norm_var": 0.0034357660159029303, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.9738866090774536, + "loss/hidden": 0.0, + "loss/logits": 0.20504064112901688, + "loss/reg": 0.8119235038757324, + "step": 1956 + }, + { + "epoch": 0.01957, + "grad_norm": 0.4130321741104126, + "grad_norm_var": 0.003508395486898848, + "learning_rate": 5e-05, + "loss": 0.1876, + "loss/crossentropy": 2.8298826217651367, + "loss/hidden": 0.0, + "loss/logits": 0.18758833408355713, + "loss/reg": 0.8116447925567627, + "step": 1957 + }, + { + "epoch": 0.01958, + "grad_norm": 0.47880375385284424, + "grad_norm_var": 0.003514979627616995, + "learning_rate": 5e-05, + "loss": 0.2127, + "loss/crossentropy": 2.865356743335724, + "loss/hidden": 0.0, + "loss/logits": 0.21270957589149475, + "loss/reg": 0.8115543127059937, + "step": 1958 + }, + { + "epoch": 0.01959, + "grad_norm": 0.5161337852478027, + "grad_norm_var": 0.0032911683468492196, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 2.847059905529022, + "loss/hidden": 0.0, + "loss/logits": 0.19394532218575478, + "loss/reg": 0.8105104565620422, + "step": 1959 + }, + { + "epoch": 0.0196, + "grad_norm": 0.41269540786743164, + "grad_norm_var": 0.0031482363185309096, + "learning_rate": 5e-05, + "loss": 0.1936, + "loss/crossentropy": 2.763217329978943, + "loss/hidden": 0.0, + "loss/logits": 0.19357923790812492, + "loss/reg": 0.8093386888504028, + "step": 1960 + }, + { + "epoch": 0.01961, + "grad_norm": 0.4319096505641937, + "grad_norm_var": 0.0031562494539512987, + "learning_rate": 5e-05, + "loss": 0.2449, + "loss/crossentropy": 2.7221856117248535, + "loss/hidden": 0.0, + "loss/logits": 0.24492282792925835, + "loss/reg": 0.8088083267211914, + "step": 1961 + }, + { + "epoch": 0.01962, + "grad_norm": 0.5298008322715759, + "grad_norm_var": 0.0028448906488864, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.8264525532722473, + "loss/hidden": 0.0, + "loss/logits": 0.20010977983474731, + "loss/reg": 0.8084967136383057, + "step": 1962 + }, + { + "epoch": 0.01963, + "grad_norm": 0.47640174627304077, + "grad_norm_var": 0.0021574920282253717, + "learning_rate": 5e-05, + "loss": 0.228, + "loss/crossentropy": 2.74677574634552, + "loss/hidden": 0.0, + "loss/logits": 0.2279682606458664, + "loss/reg": 0.8080564141273499, + "step": 1963 + }, + { + "epoch": 0.01964, + "grad_norm": 0.44032299518585205, + "grad_norm_var": 0.0021005854531945625, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.8946619629859924, + "loss/hidden": 0.0, + "loss/logits": 0.2062392719089985, + "loss/reg": 0.8072237968444824, + "step": 1964 + }, + { + "epoch": 0.01965, + "grad_norm": 0.4359903037548065, + "grad_norm_var": 0.0018807734680394244, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.8006544709205627, + "loss/hidden": 0.0, + "loss/logits": 0.1934516429901123, + "loss/reg": 0.8061104416847229, + "step": 1965 + }, + { + "epoch": 0.01966, + "grad_norm": 0.4103769063949585, + "grad_norm_var": 0.0018313161015808082, + "learning_rate": 5e-05, + "loss": 0.1877, + "loss/crossentropy": 2.85004460811615, + "loss/hidden": 0.0, + "loss/logits": 0.18770591914653778, + "loss/reg": 0.805844247341156, + "step": 1966 + }, + { + "epoch": 0.01967, + "grad_norm": 0.45468464493751526, + "grad_norm_var": 0.0016100881394809363, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.8525925278663635, + "loss/hidden": 0.0, + "loss/logits": 0.20628943294286728, + "loss/reg": 0.8054131865501404, + "step": 1967 + }, + { + "epoch": 0.01968, + "grad_norm": 0.41498661041259766, + "grad_norm_var": 0.0015577833495147076, + "learning_rate": 5e-05, + "loss": 0.1929, + "loss/crossentropy": 2.7838199138641357, + "loss/hidden": 0.0, + "loss/logits": 0.1929318793118, + "loss/reg": 0.8045769929885864, + "step": 1968 + }, + { + "epoch": 0.01969, + "grad_norm": 0.4064616858959198, + "grad_norm_var": 0.0014028036544412833, + "learning_rate": 5e-05, + "loss": 0.1996, + "loss/crossentropy": 2.7728294730186462, + "loss/hidden": 0.0, + "loss/logits": 0.1995893009006977, + "loss/reg": 0.8039410710334778, + "step": 1969 + }, + { + "epoch": 0.0197, + "grad_norm": 0.3879988193511963, + "grad_norm_var": 0.0016126988674289818, + "learning_rate": 5e-05, + "loss": 0.1848, + "loss/crossentropy": 2.737799882888794, + "loss/hidden": 0.0, + "loss/logits": 0.18477895110845566, + "loss/reg": 0.8039387464523315, + "step": 1970 + }, + { + "epoch": 0.01971, + "grad_norm": 0.4101479649543762, + "grad_norm_var": 0.0016744785608907168, + "learning_rate": 5e-05, + "loss": 0.1831, + "loss/crossentropy": 2.82141375541687, + "loss/hidden": 0.0, + "loss/logits": 0.18309226632118225, + "loss/reg": 0.80408775806427, + "step": 1971 + }, + { + "epoch": 0.01972, + "grad_norm": 0.6650723814964294, + "grad_norm_var": 0.004773083863224518, + "learning_rate": 5e-05, + "loss": 0.2239, + "loss/crossentropy": 3.000373601913452, + "loss/hidden": 0.0, + "loss/logits": 0.22394775971770287, + "loss/reg": 0.8045146465301514, + "step": 1972 + }, + { + "epoch": 0.01973, + "grad_norm": 0.458319753408432, + "grad_norm_var": 0.004646034076159048, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.6664801836013794, + "loss/hidden": 0.0, + "loss/logits": 0.1992279216647148, + "loss/reg": 0.8046901226043701, + "step": 1973 + }, + { + "epoch": 0.01974, + "grad_norm": 0.4235832989215851, + "grad_norm_var": 0.004684412564852858, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.769956588745117, + "loss/hidden": 0.0, + "loss/logits": 0.1941225603222847, + "loss/reg": 0.8042617440223694, + "step": 1974 + }, + { + "epoch": 0.01975, + "grad_norm": 0.4769344925880432, + "grad_norm_var": 0.0044592586608794465, + "learning_rate": 5e-05, + "loss": 0.2022, + "loss/crossentropy": 2.8257822394371033, + "loss/hidden": 0.0, + "loss/logits": 0.2021910548210144, + "loss/reg": 0.8034544587135315, + "step": 1975 + }, + { + "epoch": 0.01976, + "grad_norm": 0.4848521649837494, + "grad_norm_var": 0.004404308109408961, + "learning_rate": 5e-05, + "loss": 0.2129, + "loss/crossentropy": 2.8919442892074585, + "loss/hidden": 0.0, + "loss/logits": 0.21290498971939087, + "loss/reg": 0.8028760552406311, + "step": 1976 + }, + { + "epoch": 0.01977, + "grad_norm": 0.3805828094482422, + "grad_norm_var": 0.004738891169894188, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.8650283813476562, + "loss/hidden": 0.0, + "loss/logits": 0.18921462818980217, + "loss/reg": 0.8020926713943481, + "step": 1977 + }, + { + "epoch": 0.01978, + "grad_norm": 0.4142460227012634, + "grad_norm_var": 0.004398356120446204, + "learning_rate": 5e-05, + "loss": 0.1878, + "loss/crossentropy": 2.8711732625961304, + "loss/hidden": 0.0, + "loss/logits": 0.18783869594335556, + "loss/reg": 0.8014591932296753, + "step": 1978 + }, + { + "epoch": 0.01979, + "grad_norm": 0.44899171590805054, + "grad_norm_var": 0.0043353381509027384, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.5625649094581604, + "loss/hidden": 0.0, + "loss/logits": 0.2080221101641655, + "loss/reg": 0.8012404441833496, + "step": 1979 + }, + { + "epoch": 0.0198, + "grad_norm": 0.44338473677635193, + "grad_norm_var": 0.004334179241258988, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.8066795468330383, + "loss/hidden": 0.0, + "loss/logits": 0.21121276542544365, + "loss/reg": 0.8007535934448242, + "step": 1980 + }, + { + "epoch": 0.01981, + "grad_norm": 0.3934018313884735, + "grad_norm_var": 0.004497499997501934, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.74673193693161, + "loss/hidden": 0.0, + "loss/logits": 0.18748626857995987, + "loss/reg": 0.8003087043762207, + "step": 1981 + }, + { + "epoch": 0.01982, + "grad_norm": 0.47937899827957153, + "grad_norm_var": 0.004502974365799129, + "learning_rate": 5e-05, + "loss": 0.2173, + "loss/crossentropy": 2.823216140270233, + "loss/hidden": 0.0, + "loss/logits": 0.21725162491202354, + "loss/reg": 0.8001217842102051, + "step": 1982 + }, + { + "epoch": 0.01983, + "grad_norm": 0.41723868250846863, + "grad_norm_var": 0.0045494442842594855, + "learning_rate": 5e-05, + "loss": 0.1981, + "loss/crossentropy": 2.7272141575813293, + "loss/hidden": 0.0, + "loss/logits": 0.19813034310936928, + "loss/reg": 0.8001703023910522, + "step": 1983 + }, + { + "epoch": 0.01984, + "grad_norm": 0.4236186444759369, + "grad_norm_var": 0.004520594879298286, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.7918258905410767, + "loss/hidden": 0.0, + "loss/logits": 0.21141138672828674, + "loss/reg": 0.7996878027915955, + "step": 1984 + }, + { + "epoch": 0.01985, + "grad_norm": 0.43094608187675476, + "grad_norm_var": 0.004433431641809552, + "learning_rate": 5e-05, + "loss": 0.2094, + "loss/crossentropy": 2.647144854068756, + "loss/hidden": 0.0, + "loss/logits": 0.20941178873181343, + "loss/reg": 0.7999981045722961, + "step": 1985 + }, + { + "epoch": 0.01986, + "grad_norm": 0.4323650598526001, + "grad_norm_var": 0.004212350788636702, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.794140636920929, + "loss/hidden": 0.0, + "loss/logits": 0.21128448471426964, + "loss/reg": 0.7999016046524048, + "step": 1986 + }, + { + "epoch": 0.01987, + "grad_norm": 0.45955783128738403, + "grad_norm_var": 0.004109362838493628, + "learning_rate": 5e-05, + "loss": 0.2124, + "loss/crossentropy": 2.7951821088790894, + "loss/hidden": 0.0, + "loss/logits": 0.21235604584217072, + "loss/reg": 0.7995927333831787, + "step": 1987 + }, + { + "epoch": 0.01988, + "grad_norm": 0.4196851849555969, + "grad_norm_var": 0.0009023983358114496, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.8231590390205383, + "loss/hidden": 0.0, + "loss/logits": 0.19013461470603943, + "loss/reg": 0.7984586358070374, + "step": 1988 + }, + { + "epoch": 0.01989, + "grad_norm": 0.49775421619415283, + "grad_norm_var": 0.0011133027865477651, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 3.028570830821991, + "loss/hidden": 0.0, + "loss/logits": 0.20801334083080292, + "loss/reg": 0.7981004118919373, + "step": 1989 + }, + { + "epoch": 0.0199, + "grad_norm": 0.4030906856060028, + "grad_norm_var": 0.0011821039332649108, + "learning_rate": 5e-05, + "loss": 0.195, + "loss/crossentropy": 2.868641436100006, + "loss/hidden": 0.0, + "loss/logits": 0.19496846944093704, + "loss/reg": 0.7968870401382446, + "step": 1990 + }, + { + "epoch": 0.01991, + "grad_norm": 0.5345245003700256, + "grad_norm_var": 0.0016893028660055229, + "learning_rate": 5e-05, + "loss": 0.2065, + "loss/crossentropy": 2.789147198200226, + "loss/hidden": 0.0, + "loss/logits": 0.20645881444215775, + "loss/reg": 0.7962032556533813, + "step": 1991 + }, + { + "epoch": 0.01992, + "grad_norm": 0.40988919138908386, + "grad_norm_var": 0.0016069727992540472, + "learning_rate": 5e-05, + "loss": 0.2057, + "loss/crossentropy": 2.6691997051239014, + "loss/hidden": 0.0, + "loss/logits": 0.20573721826076508, + "loss/reg": 0.7955927848815918, + "step": 1992 + }, + { + "epoch": 0.01993, + "grad_norm": 0.42493656277656555, + "grad_norm_var": 0.0013975202967209408, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.8890662789344788, + "loss/hidden": 0.0, + "loss/logits": 0.18143537640571594, + "loss/reg": 0.7952697277069092, + "step": 1993 + }, + { + "epoch": 0.01994, + "grad_norm": 0.5122359395027161, + "grad_norm_var": 0.0016668707279849186, + "learning_rate": 5e-05, + "loss": 0.2132, + "loss/crossentropy": 2.8018561005592346, + "loss/hidden": 0.0, + "loss/logits": 0.21317176520824432, + "loss/reg": 0.7952179312705994, + "step": 1994 + }, + { + "epoch": 0.01995, + "grad_norm": 0.39479175209999084, + "grad_norm_var": 0.0018265944699556594, + "learning_rate": 5e-05, + "loss": 0.1812, + "loss/crossentropy": 2.8895358443260193, + "loss/hidden": 0.0, + "loss/logits": 0.18123216927051544, + "loss/reg": 0.7947417497634888, + "step": 1995 + }, + { + "epoch": 0.01996, + "grad_norm": 0.4241412281990051, + "grad_norm_var": 0.0018469557738500933, + "learning_rate": 5e-05, + "loss": 0.2372, + "loss/crossentropy": 2.824765920639038, + "loss/hidden": 0.0, + "loss/logits": 0.2372448891401291, + "loss/reg": 0.7937231063842773, + "step": 1996 + }, + { + "epoch": 0.01997, + "grad_norm": 0.46130889654159546, + "grad_norm_var": 0.0017033186931065185, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 3.0325597524642944, + "loss/hidden": 0.0, + "loss/logits": 0.19254948571324348, + "loss/reg": 0.7932283282279968, + "step": 1997 + }, + { + "epoch": 0.01998, + "grad_norm": 0.4147680103778839, + "grad_norm_var": 0.0016710034497371704, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 2.8635621070861816, + "loss/hidden": 0.0, + "loss/logits": 0.19247011095285416, + "loss/reg": 0.7929666042327881, + "step": 1998 + }, + { + "epoch": 0.01999, + "grad_norm": 0.3953905701637268, + "grad_norm_var": 0.001770939335639273, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.6696969866752625, + "loss/hidden": 0.0, + "loss/logits": 0.19630739465355873, + "loss/reg": 0.7929765582084656, + "step": 1999 + }, + { + "epoch": 0.02, + "grad_norm": 0.46709325909614563, + "grad_norm_var": 0.0017944712625772832, + "learning_rate": 5e-05, + "loss": 0.2193, + "loss/crossentropy": 2.767913043498993, + "loss/hidden": 0.0, + "loss/logits": 0.21925517171621323, + "loss/reg": 0.792606770992279, + "step": 2000 + }, + { + "epoch": 0.02001, + "grad_norm": 0.45867905020713806, + "grad_norm_var": 0.0017992449389282942, + "learning_rate": 5e-05, + "loss": 0.2153, + "loss/crossentropy": 2.922570765018463, + "loss/hidden": 0.0, + "loss/logits": 0.21534525230526924, + "loss/reg": 0.7923213243484497, + "step": 2001 + }, + { + "epoch": 0.02002, + "grad_norm": 0.43041613698005676, + "grad_norm_var": 0.0018026066344080031, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.7274850606918335, + "loss/hidden": 0.0, + "loss/logits": 0.21420374512672424, + "loss/reg": 0.7912262678146362, + "step": 2002 + }, + { + "epoch": 0.02003, + "grad_norm": 0.42794111371040344, + "grad_norm_var": 0.0018006208666221195, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.8696178197860718, + "loss/hidden": 0.0, + "loss/logits": 0.19926932454109192, + "loss/reg": 0.7907223701477051, + "step": 2003 + }, + { + "epoch": 0.02004, + "grad_norm": 0.4015541672706604, + "grad_norm_var": 0.0018758141210452532, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.6416637897491455, + "loss/hidden": 0.0, + "loss/logits": 0.1901119463145733, + "loss/reg": 0.79008549451828, + "step": 2004 + }, + { + "epoch": 0.02005, + "grad_norm": 0.3912619650363922, + "grad_norm_var": 0.00178098250965076, + "learning_rate": 5e-05, + "loss": 0.1863, + "loss/crossentropy": 2.787119746208191, + "loss/hidden": 0.0, + "loss/logits": 0.1862845979630947, + "loss/reg": 0.7895205020904541, + "step": 2005 + }, + { + "epoch": 0.02006, + "grad_norm": 0.38186684250831604, + "grad_norm_var": 0.0018980233172146033, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.7930368185043335, + "loss/hidden": 0.0, + "loss/logits": 0.18358103185892105, + "loss/reg": 0.7883749008178711, + "step": 2006 + }, + { + "epoch": 0.02007, + "grad_norm": 0.4110807478427887, + "grad_norm_var": 0.0011822916301412634, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.853162944316864, + "loss/hidden": 0.0, + "loss/logits": 0.19330233335494995, + "loss/reg": 0.7876327037811279, + "step": 2007 + }, + { + "epoch": 0.02008, + "grad_norm": 0.42640382051467896, + "grad_norm_var": 0.0011650519189251086, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.7722811102867126, + "loss/hidden": 0.0, + "loss/logits": 0.20003266260027885, + "loss/reg": 0.7878388166427612, + "step": 2008 + }, + { + "epoch": 0.02009, + "grad_norm": 0.3917252719402313, + "grad_norm_var": 0.0012408759914644297, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.796004891395569, + "loss/hidden": 0.0, + "loss/logits": 0.18473106622695923, + "loss/reg": 0.7874587774276733, + "step": 2009 + }, + { + "epoch": 0.0201, + "grad_norm": 0.40263262391090393, + "grad_norm_var": 0.0007083030749790257, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.715982139110565, + "loss/hidden": 0.0, + "loss/logits": 0.19602053612470627, + "loss/reg": 0.786444365978241, + "step": 2010 + }, + { + "epoch": 0.02011, + "grad_norm": 0.5482214093208313, + "grad_norm_var": 0.0017136956623622357, + "learning_rate": 5e-05, + "loss": 0.2137, + "loss/crossentropy": 2.9367846250534058, + "loss/hidden": 0.0, + "loss/logits": 0.21374288946390152, + "loss/reg": 0.7857764959335327, + "step": 2011 + }, + { + "epoch": 0.02012, + "grad_norm": 0.46156400442123413, + "grad_norm_var": 0.001786185251215248, + "learning_rate": 5e-05, + "loss": 0.2091, + "loss/crossentropy": 2.7322534322738647, + "loss/hidden": 0.0, + "loss/logits": 0.20913825184106827, + "loss/reg": 0.7854152917861938, + "step": 2012 + }, + { + "epoch": 0.02013, + "grad_norm": 0.42086315155029297, + "grad_norm_var": 0.0017168574129462636, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.8187963366508484, + "loss/hidden": 0.0, + "loss/logits": 0.18663941696286201, + "loss/reg": 0.7846931219100952, + "step": 2013 + }, + { + "epoch": 0.02014, + "grad_norm": 0.41395890712738037, + "grad_norm_var": 0.0017182142942895144, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.8044846057891846, + "loss/hidden": 0.0, + "loss/logits": 0.20327125489711761, + "loss/reg": 0.7836028337478638, + "step": 2014 + }, + { + "epoch": 0.02015, + "grad_norm": 0.41405707597732544, + "grad_norm_var": 0.0016615295410278249, + "learning_rate": 5e-05, + "loss": 0.1921, + "loss/crossentropy": 2.714905261993408, + "loss/hidden": 0.0, + "loss/logits": 0.19213714450597763, + "loss/reg": 0.7821534276008606, + "step": 2015 + }, + { + "epoch": 0.02016, + "grad_norm": 0.4387468993663788, + "grad_norm_var": 0.0015643074562327979, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.8306316137313843, + "loss/hidden": 0.0, + "loss/logits": 0.2042708769440651, + "loss/reg": 0.7808225750923157, + "step": 2016 + }, + { + "epoch": 0.02017, + "grad_norm": 0.4950420558452606, + "grad_norm_var": 0.0018038833485815216, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.6279239654541016, + "loss/hidden": 0.0, + "loss/logits": 0.20624027773737907, + "loss/reg": 0.7798194289207458, + "step": 2017 + }, + { + "epoch": 0.02018, + "grad_norm": 0.39097896218299866, + "grad_norm_var": 0.0018914525477079244, + "learning_rate": 5e-05, + "loss": 0.1861, + "loss/crossentropy": 2.710873246192932, + "loss/hidden": 0.0, + "loss/logits": 0.1860567443072796, + "loss/reg": 0.7789042592048645, + "step": 2018 + }, + { + "epoch": 0.02019, + "grad_norm": 0.4121586084365845, + "grad_norm_var": 0.0019031855236857945, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.755960702896118, + "loss/hidden": 0.0, + "loss/logits": 0.20395539328455925, + "loss/reg": 0.7776239514350891, + "step": 2019 + }, + { + "epoch": 0.0202, + "grad_norm": 0.5050489902496338, + "grad_norm_var": 0.0022472724635592173, + "learning_rate": 5e-05, + "loss": 0.2215, + "loss/crossentropy": 3.09210604429245, + "loss/hidden": 0.0, + "loss/logits": 0.2215287908911705, + "loss/reg": 0.776678204536438, + "step": 2020 + }, + { + "epoch": 0.02021, + "grad_norm": 0.44800421595573425, + "grad_norm_var": 0.0021433145062942434, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.828740417957306, + "loss/hidden": 0.0, + "loss/logits": 0.20157499238848686, + "loss/reg": 0.7761678695678711, + "step": 2021 + }, + { + "epoch": 0.02022, + "grad_norm": 0.44801315665245056, + "grad_norm_var": 0.0019468672564254626, + "learning_rate": 5e-05, + "loss": 0.2247, + "loss/crossentropy": 2.783461630344391, + "loss/hidden": 0.0, + "loss/logits": 0.22474189475178719, + "loss/reg": 0.7752158045768738, + "step": 2022 + }, + { + "epoch": 0.02023, + "grad_norm": 0.37896543741226196, + "grad_norm_var": 0.0021320850202390163, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.7922526597976685, + "loss/hidden": 0.0, + "loss/logits": 0.1868385560810566, + "loss/reg": 0.7744381427764893, + "step": 2023 + }, + { + "epoch": 0.02024, + "grad_norm": 0.41788995265960693, + "grad_norm_var": 0.002148955070002556, + "learning_rate": 5e-05, + "loss": 0.2052, + "loss/crossentropy": 2.819299638271332, + "loss/hidden": 0.0, + "loss/logits": 0.20517907291650772, + "loss/reg": 0.7740638852119446, + "step": 2024 + }, + { + "epoch": 0.02025, + "grad_norm": 0.40508779883384705, + "grad_norm_var": 0.0020799100672643455, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.7847508788108826, + "loss/hidden": 0.0, + "loss/logits": 0.1810491904616356, + "loss/reg": 0.7732000946998596, + "step": 2025 + }, + { + "epoch": 0.02026, + "grad_norm": 0.4470130205154419, + "grad_norm_var": 0.001996231460853733, + "learning_rate": 5e-05, + "loss": 0.2006, + "loss/crossentropy": 2.8389241099357605, + "loss/hidden": 0.0, + "loss/logits": 0.20058820396661758, + "loss/reg": 0.7725628614425659, + "step": 2026 + }, + { + "epoch": 0.02027, + "grad_norm": 0.40527573227882385, + "grad_norm_var": 0.001217372493147369, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.7487269043922424, + "loss/hidden": 0.0, + "loss/logits": 0.18441111594438553, + "loss/reg": 0.772181510925293, + "step": 2027 + }, + { + "epoch": 0.02028, + "grad_norm": 0.4763030707836151, + "grad_norm_var": 0.00129019565286607, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.7631067037582397, + "loss/hidden": 0.0, + "loss/logits": 0.20074495300650597, + "loss/reg": 0.7724031209945679, + "step": 2028 + }, + { + "epoch": 0.02029, + "grad_norm": 0.43850651383399963, + "grad_norm_var": 0.0012826573213887138, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.907250940799713, + "loss/hidden": 0.0, + "loss/logits": 0.20495330914855003, + "loss/reg": 0.771911084651947, + "step": 2029 + }, + { + "epoch": 0.0203, + "grad_norm": 0.4471217393875122, + "grad_norm_var": 0.0012652505259942992, + "learning_rate": 5e-05, + "loss": 0.227, + "loss/crossentropy": 2.7142640948295593, + "loss/hidden": 0.0, + "loss/logits": 0.22702933102846146, + "loss/reg": 0.7713619470596313, + "step": 2030 + }, + { + "epoch": 0.02031, + "grad_norm": 0.5346863269805908, + "grad_norm_var": 0.0018296138412881432, + "learning_rate": 5e-05, + "loss": 0.2326, + "loss/crossentropy": 2.8878950476646423, + "loss/hidden": 0.0, + "loss/logits": 0.23255538940429688, + "loss/reg": 0.7706761956214905, + "step": 2031 + }, + { + "epoch": 0.02032, + "grad_norm": 0.3999450206756592, + "grad_norm_var": 0.0019459891413056004, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.7558109760284424, + "loss/hidden": 0.0, + "loss/logits": 0.1864626482129097, + "loss/reg": 0.7697986364364624, + "step": 2032 + }, + { + "epoch": 0.02033, + "grad_norm": 0.40023139119148254, + "grad_norm_var": 0.0018199286398646653, + "learning_rate": 5e-05, + "loss": 0.1953, + "loss/crossentropy": 2.7935965061187744, + "loss/hidden": 0.0, + "loss/logits": 0.19533519074320793, + "loss/reg": 0.7691691517829895, + "step": 2033 + }, + { + "epoch": 0.02034, + "grad_norm": 0.43439429998397827, + "grad_norm_var": 0.0016846350472792518, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.7175851464271545, + "loss/hidden": 0.0, + "loss/logits": 0.19227071478962898, + "loss/reg": 0.7690432667732239, + "step": 2034 + }, + { + "epoch": 0.02035, + "grad_norm": 0.4358638525009155, + "grad_norm_var": 0.0016399273105053625, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.7960886359214783, + "loss/hidden": 0.0, + "loss/logits": 0.1857166662812233, + "loss/reg": 0.7683178782463074, + "step": 2035 + }, + { + "epoch": 0.02036, + "grad_norm": 0.507217526435852, + "grad_norm_var": 0.0016593483111965216, + "learning_rate": 5e-05, + "loss": 0.2232, + "loss/crossentropy": 2.6404988169670105, + "loss/hidden": 0.0, + "loss/logits": 0.22318963706493378, + "loss/reg": 0.7678051590919495, + "step": 2036 + }, + { + "epoch": 0.02037, + "grad_norm": 0.44423407316207886, + "grad_norm_var": 0.0016557267017157872, + "learning_rate": 5e-05, + "loss": 0.2149, + "loss/crossentropy": 2.9561411142349243, + "loss/hidden": 0.0, + "loss/logits": 0.2148638814687729, + "loss/reg": 0.7671772837638855, + "step": 2037 + }, + { + "epoch": 0.02038, + "grad_norm": 0.592780590057373, + "grad_norm_var": 0.003143474282350367, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 3.065129578113556, + "loss/hidden": 0.0, + "loss/logits": 0.20983118563890457, + "loss/reg": 0.766478419303894, + "step": 2038 + }, + { + "epoch": 0.02039, + "grad_norm": 0.44167473912239075, + "grad_norm_var": 0.002813336154450363, + "learning_rate": 5e-05, + "loss": 0.1874, + "loss/crossentropy": 2.6283465027809143, + "loss/hidden": 0.0, + "loss/logits": 0.1873929686844349, + "loss/reg": 0.765807569026947, + "step": 2039 + }, + { + "epoch": 0.0204, + "grad_norm": 0.4720562696456909, + "grad_norm_var": 0.002752065460761921, + "learning_rate": 5e-05, + "loss": 0.2107, + "loss/crossentropy": 2.7698283195495605, + "loss/hidden": 0.0, + "loss/logits": 0.2106912024319172, + "loss/reg": 0.7647716403007507, + "step": 2040 + }, + { + "epoch": 0.02041, + "grad_norm": 0.4778895080089569, + "grad_norm_var": 0.002597377380887744, + "learning_rate": 5e-05, + "loss": 0.2195, + "loss/crossentropy": 2.894709527492523, + "loss/hidden": 0.0, + "loss/logits": 0.21950312331318855, + "loss/reg": 0.7642495036125183, + "step": 2041 + }, + { + "epoch": 0.02042, + "grad_norm": 0.44014400243759155, + "grad_norm_var": 0.0026119455940839605, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.808235228061676, + "loss/hidden": 0.0, + "loss/logits": 0.19009733200073242, + "loss/reg": 0.7634332776069641, + "step": 2042 + }, + { + "epoch": 0.02043, + "grad_norm": 0.5013467669487, + "grad_norm_var": 0.0024971565753850104, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.887919783592224, + "loss/hidden": 0.0, + "loss/logits": 0.21637145057320595, + "loss/reg": 0.7626502513885498, + "step": 2043 + }, + { + "epoch": 0.02044, + "grad_norm": 0.4834643304347992, + "grad_norm_var": 0.0025108920437893036, + "learning_rate": 5e-05, + "loss": 0.1955, + "loss/crossentropy": 2.902633309364319, + "loss/hidden": 0.0, + "loss/logits": 0.19545603543519974, + "loss/reg": 0.7619370222091675, + "step": 2044 + }, + { + "epoch": 0.02045, + "grad_norm": 0.5475570559501648, + "grad_norm_var": 0.002858423704662257, + "learning_rate": 5e-05, + "loss": 0.2055, + "loss/crossentropy": 2.875135123729706, + "loss/hidden": 0.0, + "loss/logits": 0.2055196538567543, + "loss/reg": 0.7612566947937012, + "step": 2045 + }, + { + "epoch": 0.02046, + "grad_norm": 0.4333057105541229, + "grad_norm_var": 0.0029171740506706125, + "learning_rate": 5e-05, + "loss": 0.1906, + "loss/crossentropy": 2.753622591495514, + "loss/hidden": 0.0, + "loss/logits": 0.19055837392807007, + "loss/reg": 0.7604986429214478, + "step": 2046 + }, + { + "epoch": 0.02047, + "grad_norm": 0.4104223847389221, + "grad_norm_var": 0.002838255884097522, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.787410259246826, + "loss/hidden": 0.0, + "loss/logits": 0.20372052118182182, + "loss/reg": 0.7602266669273376, + "step": 2047 + }, + { + "epoch": 0.02048, + "grad_norm": 0.3856104612350464, + "grad_norm_var": 0.0029733491188438563, + "learning_rate": 5e-05, + "loss": 0.1863, + "loss/crossentropy": 2.816024959087372, + "loss/hidden": 0.0, + "loss/logits": 0.18626219779253006, + "loss/reg": 0.7592066526412964, + "step": 2048 + }, + { + "epoch": 0.02049, + "grad_norm": 0.4980980157852173, + "grad_norm_var": 0.0027527487604953973, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.7231903672218323, + "loss/hidden": 0.0, + "loss/logits": 0.20614684745669365, + "loss/reg": 0.7587620615959167, + "step": 2049 + }, + { + "epoch": 0.0205, + "grad_norm": 0.4526423215866089, + "grad_norm_var": 0.0026890493840079797, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.8997724056243896, + "loss/hidden": 0.0, + "loss/logits": 0.19084924086928368, + "loss/reg": 0.7578598856925964, + "step": 2050 + }, + { + "epoch": 0.02051, + "grad_norm": 0.4213176369667053, + "grad_norm_var": 0.002769002971053173, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.855745255947113, + "loss/hidden": 0.0, + "loss/logits": 0.19679997861385345, + "loss/reg": 0.7572537660598755, + "step": 2051 + }, + { + "epoch": 0.02052, + "grad_norm": 0.46856489777565, + "grad_norm_var": 0.002667274304655687, + "learning_rate": 5e-05, + "loss": 0.2167, + "loss/crossentropy": 2.9302791953086853, + "loss/hidden": 0.0, + "loss/logits": 0.2166934683918953, + "loss/reg": 0.756544291973114, + "step": 2052 + }, + { + "epoch": 0.02053, + "grad_norm": 0.47074151039123535, + "grad_norm_var": 0.002630924255147138, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 3.0150145888328552, + "loss/hidden": 0.0, + "loss/logits": 0.20169193670153618, + "loss/reg": 0.7555555701255798, + "step": 2053 + }, + { + "epoch": 0.02054, + "grad_norm": 0.42726972699165344, + "grad_norm_var": 0.0016026304119011402, + "learning_rate": 5e-05, + "loss": 0.1854, + "loss/crossentropy": 2.699667513370514, + "loss/hidden": 0.0, + "loss/logits": 0.18540003523230553, + "loss/reg": 0.7541791796684265, + "step": 2054 + }, + { + "epoch": 0.02055, + "grad_norm": 0.4207848310470581, + "grad_norm_var": 0.0016760904391970718, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.756662607192993, + "loss/hidden": 0.0, + "loss/logits": 0.19937605410814285, + "loss/reg": 0.7526571154594421, + "step": 2055 + }, + { + "epoch": 0.02056, + "grad_norm": 0.569922924041748, + "grad_norm_var": 0.0024718154918546417, + "learning_rate": 5e-05, + "loss": 0.2171, + "loss/crossentropy": 2.889861047267914, + "loss/hidden": 0.0, + "loss/logits": 0.2170621156692505, + "loss/reg": 0.7518259286880493, + "step": 2056 + }, + { + "epoch": 0.02057, + "grad_norm": 0.6570650339126587, + "grad_norm_var": 0.004832403007177636, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.8951274752616882, + "loss/hidden": 0.0, + "loss/logits": 0.21785294264554977, + "loss/reg": 0.7505285739898682, + "step": 2057 + }, + { + "epoch": 0.02058, + "grad_norm": 0.5134196877479553, + "grad_norm_var": 0.004834609868108908, + "learning_rate": 5e-05, + "loss": 0.2121, + "loss/crossentropy": 2.876060128211975, + "loss/hidden": 0.0, + "loss/logits": 0.21208372339606285, + "loss/reg": 0.7500611543655396, + "step": 2058 + }, + { + "epoch": 0.02059, + "grad_norm": 0.5163374543190002, + "grad_norm_var": 0.0048936288451811635, + "learning_rate": 5e-05, + "loss": 0.2123, + "loss/crossentropy": 2.7346776723861694, + "loss/hidden": 0.0, + "loss/logits": 0.21225189417600632, + "loss/reg": 0.749172031879425, + "step": 2059 + }, + { + "epoch": 0.0206, + "grad_norm": 0.4336012899875641, + "grad_norm_var": 0.0050245473742638914, + "learning_rate": 5e-05, + "loss": 0.1936, + "loss/crossentropy": 2.763307809829712, + "loss/hidden": 0.0, + "loss/logits": 0.19356538355350494, + "loss/reg": 0.7488576173782349, + "step": 2060 + }, + { + "epoch": 0.02061, + "grad_norm": 0.4777183532714844, + "grad_norm_var": 0.004669265275000578, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.8021973967552185, + "loss/hidden": 0.0, + "loss/logits": 0.20260873064398766, + "loss/reg": 0.7478004097938538, + "step": 2061 + }, + { + "epoch": 0.02062, + "grad_norm": 0.5013943314552307, + "grad_norm_var": 0.004604997438704391, + "learning_rate": 5e-05, + "loss": 0.1981, + "loss/crossentropy": 2.747301399707794, + "loss/hidden": 0.0, + "loss/logits": 0.19807493686676025, + "loss/reg": 0.7468694448471069, + "step": 2062 + }, + { + "epoch": 0.02063, + "grad_norm": 0.4923614263534546, + "grad_norm_var": 0.004302089359458317, + "learning_rate": 5e-05, + "loss": 0.2267, + "loss/crossentropy": 2.74979430437088, + "loss/hidden": 0.0, + "loss/logits": 0.22671936079859734, + "loss/reg": 0.7461242079734802, + "step": 2063 + }, + { + "epoch": 0.02064, + "grad_norm": 0.4585872292518616, + "grad_norm_var": 0.0037001789454152445, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.7241804003715515, + "loss/hidden": 0.0, + "loss/logits": 0.1968403123319149, + "loss/reg": 0.7452719211578369, + "step": 2064 + }, + { + "epoch": 0.02065, + "grad_norm": 0.4442161023616791, + "grad_norm_var": 0.0037964357369112633, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.6155949234962463, + "loss/hidden": 0.0, + "loss/logits": 0.1985289826989174, + "loss/reg": 0.7442812323570251, + "step": 2065 + }, + { + "epoch": 0.02066, + "grad_norm": 0.7877947092056274, + "grad_norm_var": 0.00946602796148852, + "learning_rate": 5e-05, + "loss": 0.2133, + "loss/crossentropy": 2.825562536716461, + "loss/hidden": 0.0, + "loss/logits": 0.21333030611276627, + "loss/reg": 0.743831217288971, + "step": 2066 + }, + { + "epoch": 0.02067, + "grad_norm": 0.40486806631088257, + "grad_norm_var": 0.00966388706196278, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.7725449800491333, + "loss/hidden": 0.0, + "loss/logits": 0.19678157940506935, + "loss/reg": 0.7429914474487305, + "step": 2067 + }, + { + "epoch": 0.02068, + "grad_norm": 0.42132043838500977, + "grad_norm_var": 0.010018985338277922, + "learning_rate": 5e-05, + "loss": 0.1922, + "loss/crossentropy": 2.7504552602767944, + "loss/hidden": 0.0, + "loss/logits": 0.19219397753477097, + "loss/reg": 0.7418984770774841, + "step": 2068 + }, + { + "epoch": 0.02069, + "grad_norm": 0.445605993270874, + "grad_norm_var": 0.010155985495560607, + "learning_rate": 5e-05, + "loss": 0.1929, + "loss/crossentropy": 2.7287486791610718, + "loss/hidden": 0.0, + "loss/logits": 0.19293370097875595, + "loss/reg": 0.7408042550086975, + "step": 2069 + }, + { + "epoch": 0.0207, + "grad_norm": 0.4684641361236572, + "grad_norm_var": 0.00987208945418024, + "learning_rate": 5e-05, + "loss": 0.1997, + "loss/crossentropy": 2.8292333483695984, + "loss/hidden": 0.0, + "loss/logits": 0.19970304146409035, + "loss/reg": 0.7397096157073975, + "step": 2070 + }, + { + "epoch": 0.02071, + "grad_norm": 0.40488651394844055, + "grad_norm_var": 0.010057588645068652, + "learning_rate": 5e-05, + "loss": 0.1862, + "loss/crossentropy": 2.8630303740501404, + "loss/hidden": 0.0, + "loss/logits": 0.18624230101704597, + "loss/reg": 0.7384559512138367, + "step": 2071 + }, + { + "epoch": 0.02072, + "grad_norm": 0.3865104019641876, + "grad_norm_var": 0.010446409162130917, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.6214022040367126, + "loss/hidden": 0.0, + "loss/logits": 0.1993757300078869, + "loss/reg": 0.737078070640564, + "step": 2072 + }, + { + "epoch": 0.02073, + "grad_norm": 0.44390615820884705, + "grad_norm_var": 0.008492101432279287, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.754049599170685, + "loss/hidden": 0.0, + "loss/logits": 0.20278025791049004, + "loss/reg": 0.7356628775596619, + "step": 2073 + }, + { + "epoch": 0.02074, + "grad_norm": 0.42425239086151123, + "grad_norm_var": 0.008532993565663486, + "learning_rate": 5e-05, + "loss": 0.2005, + "loss/crossentropy": 2.816314995288849, + "loss/hidden": 0.0, + "loss/logits": 0.20054404065012932, + "loss/reg": 0.7346155643463135, + "step": 2074 + }, + { + "epoch": 0.02075, + "grad_norm": 0.4910520613193512, + "grad_norm_var": 0.008415009008559456, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.8799827694892883, + "loss/hidden": 0.0, + "loss/logits": 0.21417224779725075, + "loss/reg": 0.7328586578369141, + "step": 2075 + }, + { + "epoch": 0.02076, + "grad_norm": 0.47552022337913513, + "grad_norm_var": 0.008333083007723122, + "learning_rate": 5e-05, + "loss": 0.2148, + "loss/crossentropy": 2.753495752811432, + "loss/hidden": 0.0, + "loss/logits": 0.21484605595469475, + "loss/reg": 0.7316145300865173, + "step": 2076 + }, + { + "epoch": 0.02077, + "grad_norm": 0.5269806385040283, + "grad_norm_var": 0.008531980410446243, + "learning_rate": 5e-05, + "loss": 0.2311, + "loss/crossentropy": 2.987771213054657, + "loss/hidden": 0.0, + "loss/logits": 0.23114652931690216, + "loss/reg": 0.7309261560440063, + "step": 2077 + }, + { + "epoch": 0.02078, + "grad_norm": 0.4908853769302368, + "grad_norm_var": 0.008499948125407879, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.704439699649811, + "loss/hidden": 0.0, + "loss/logits": 0.19849738106131554, + "loss/reg": 0.7301661372184753, + "step": 2078 + }, + { + "epoch": 0.02079, + "grad_norm": 0.46514371037483215, + "grad_norm_var": 0.008475806445585148, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.751134514808655, + "loss/hidden": 0.0, + "loss/logits": 0.20273500680923462, + "loss/reg": 0.7291837334632874, + "step": 2079 + }, + { + "epoch": 0.0208, + "grad_norm": 0.41338494420051575, + "grad_norm_var": 0.008679825309377224, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.8265891075134277, + "loss/hidden": 0.0, + "loss/logits": 0.19234300032258034, + "loss/reg": 0.7284231781959534, + "step": 2080 + }, + { + "epoch": 0.02081, + "grad_norm": 0.42556217312812805, + "grad_norm_var": 0.008761784253873476, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.7246302366256714, + "loss/hidden": 0.0, + "loss/logits": 0.19444361701607704, + "loss/reg": 0.7273736596107483, + "step": 2081 + }, + { + "epoch": 0.02082, + "grad_norm": 0.4288738965988159, + "grad_norm_var": 0.0014736838453877816, + "learning_rate": 5e-05, + "loss": 0.206, + "loss/crossentropy": 2.7712178826332092, + "loss/hidden": 0.0, + "loss/logits": 0.20597194507718086, + "loss/reg": 0.7261093854904175, + "step": 2082 + }, + { + "epoch": 0.02083, + "grad_norm": 0.45129653811454773, + "grad_norm_var": 0.0013610504914753723, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.762493908405304, + "loss/hidden": 0.0, + "loss/logits": 0.20281768590211868, + "loss/reg": 0.7254123687744141, + "step": 2083 + }, + { + "epoch": 0.02084, + "grad_norm": 0.42450350522994995, + "grad_norm_var": 0.0013504761954087822, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.768760085105896, + "loss/hidden": 0.0, + "loss/logits": 0.1978367082774639, + "loss/reg": 0.7250544428825378, + "step": 2084 + }, + { + "epoch": 0.02085, + "grad_norm": 0.4185989797115326, + "grad_norm_var": 0.001404419412730166, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.941966950893402, + "loss/hidden": 0.0, + "loss/logits": 0.18660201132297516, + "loss/reg": 0.7242149710655212, + "step": 2085 + }, + { + "epoch": 0.02086, + "grad_norm": 0.47596874833106995, + "grad_norm_var": 0.0014301783137724974, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.6456299424171448, + "loss/hidden": 0.0, + "loss/logits": 0.20631443709135056, + "loss/reg": 0.7236818075180054, + "step": 2086 + }, + { + "epoch": 0.02087, + "grad_norm": 0.44746094942092896, + "grad_norm_var": 0.0013060618526516373, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.8238280415534973, + "loss/hidden": 0.0, + "loss/logits": 0.20815334841609, + "loss/reg": 0.7229150533676147, + "step": 2087 + }, + { + "epoch": 0.02088, + "grad_norm": 0.45542579889297485, + "grad_norm_var": 0.0010253069646252203, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.761290967464447, + "loss/hidden": 0.0, + "loss/logits": 0.2119154967367649, + "loss/reg": 0.7227045893669128, + "step": 2088 + }, + { + "epoch": 0.02089, + "grad_norm": 0.42840731143951416, + "grad_norm_var": 0.0010605098714797498, + "learning_rate": 5e-05, + "loss": 0.1902, + "loss/crossentropy": 2.7376968264579773, + "loss/hidden": 0.0, + "loss/logits": 0.19021346420049667, + "loss/reg": 0.7225263714790344, + "step": 2089 + }, + { + "epoch": 0.0209, + "grad_norm": 0.406536728143692, + "grad_norm_var": 0.0011473382412116966, + "learning_rate": 5e-05, + "loss": 0.1991, + "loss/crossentropy": 2.8355453610420227, + "loss/hidden": 0.0, + "loss/logits": 0.19908315688371658, + "loss/reg": 0.7219676971435547, + "step": 2090 + }, + { + "epoch": 0.02091, + "grad_norm": 0.39009061455726624, + "grad_norm_var": 0.0012533304734618296, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.8361605405807495, + "loss/hidden": 0.0, + "loss/logits": 0.1835748590528965, + "loss/reg": 0.7220778465270996, + "step": 2091 + }, + { + "epoch": 0.02092, + "grad_norm": 0.3975403606891632, + "grad_norm_var": 0.0013190710932737344, + "learning_rate": 5e-05, + "loss": 0.1903, + "loss/crossentropy": 2.6920266151428223, + "loss/hidden": 0.0, + "loss/logits": 0.19031701236963272, + "loss/reg": 0.7214723229408264, + "step": 2092 + }, + { + "epoch": 0.02093, + "grad_norm": 0.43469369411468506, + "grad_norm_var": 0.00078620792301702, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.6953823566436768, + "loss/hidden": 0.0, + "loss/logits": 0.2076844982802868, + "loss/reg": 0.7212677001953125, + "step": 2093 + }, + { + "epoch": 0.02094, + "grad_norm": 0.4071224629878998, + "grad_norm_var": 0.0005966448776881202, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.771484136581421, + "loss/hidden": 0.0, + "loss/logits": 0.203644510358572, + "loss/reg": 0.7214339375495911, + "step": 2094 + }, + { + "epoch": 0.02095, + "grad_norm": 1.391247272491455, + "grad_norm_var": 0.05861290930529792, + "learning_rate": 5e-05, + "loss": 0.2448, + "loss/crossentropy": 2.897869110107422, + "loss/hidden": 0.0, + "loss/logits": 0.24478957056999207, + "loss/reg": 0.7214595079421997, + "step": 2095 + }, + { + "epoch": 0.02096, + "grad_norm": 0.4210922420024872, + "grad_norm_var": 0.05854066943645456, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.741064727306366, + "loss/hidden": 0.0, + "loss/logits": 0.17292725667357445, + "loss/reg": 0.7207840085029602, + "step": 2096 + }, + { + "epoch": 0.02097, + "grad_norm": 0.45224177837371826, + "grad_norm_var": 0.05836384380140959, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.5982664227485657, + "loss/hidden": 0.0, + "loss/logits": 0.19603603333234787, + "loss/reg": 0.7212735414505005, + "step": 2097 + }, + { + "epoch": 0.02098, + "grad_norm": 0.5133381485939026, + "grad_norm_var": 0.05812759964966805, + "learning_rate": 5e-05, + "loss": 0.1934, + "loss/crossentropy": 2.792477607727051, + "loss/hidden": 0.0, + "loss/logits": 0.19335027039051056, + "loss/reg": 0.7217422723770142, + "step": 2098 + }, + { + "epoch": 0.02099, + "grad_norm": 0.5267723202705383, + "grad_norm_var": 0.05804661906832807, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.865249812602997, + "loss/hidden": 0.0, + "loss/logits": 0.19375771284103394, + "loss/reg": 0.7221139073371887, + "step": 2099 + }, + { + "epoch": 0.021, + "grad_norm": 0.4996579587459564, + "grad_norm_var": 0.057648722312777484, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 2.8032256364822388, + "loss/hidden": 0.0, + "loss/logits": 0.20792688056826591, + "loss/reg": 0.7217006087303162, + "step": 2100 + }, + { + "epoch": 0.02101, + "grad_norm": 0.5150094628334045, + "grad_norm_var": 0.0571300876865543, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 2.9016406536102295, + "loss/hidden": 0.0, + "loss/logits": 0.21699880808591843, + "loss/reg": 0.7220216989517212, + "step": 2101 + }, + { + "epoch": 0.02102, + "grad_norm": 0.45621562004089355, + "grad_norm_var": 0.057244533088116716, + "learning_rate": 5e-05, + "loss": 0.2169, + "loss/crossentropy": 2.710154116153717, + "loss/hidden": 0.0, + "loss/logits": 0.21690407767891884, + "loss/reg": 0.7216529846191406, + "step": 2102 + }, + { + "epoch": 0.02103, + "grad_norm": 0.45114558935165405, + "grad_norm_var": 0.05721518361892574, + "learning_rate": 5e-05, + "loss": 0.1979, + "loss/crossentropy": 2.709060311317444, + "loss/hidden": 0.0, + "loss/logits": 0.19787561148405075, + "loss/reg": 0.7216683030128479, + "step": 2103 + }, + { + "epoch": 0.02104, + "grad_norm": 0.50423663854599, + "grad_norm_var": 0.0570143907591466, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.7507155537605286, + "loss/hidden": 0.0, + "loss/logits": 0.20979801565408707, + "loss/reg": 0.7212762236595154, + "step": 2104 + }, + { + "epoch": 0.02105, + "grad_norm": 0.45337167382240295, + "grad_norm_var": 0.05677440033368632, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.8006672859191895, + "loss/hidden": 0.0, + "loss/logits": 0.20916958525776863, + "loss/reg": 0.7212364673614502, + "step": 2105 + }, + { + "epoch": 0.02106, + "grad_norm": 0.41459375619888306, + "grad_norm_var": 0.05666326053352654, + "learning_rate": 5e-05, + "loss": 0.1981, + "loss/crossentropy": 2.8682488203048706, + "loss/hidden": 0.0, + "loss/logits": 0.1981184296309948, + "loss/reg": 0.7207834124565125, + "step": 2106 + }, + { + "epoch": 0.02107, + "grad_norm": 0.4104544520378113, + "grad_norm_var": 0.05635200080838608, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.849208354949951, + "loss/hidden": 0.0, + "loss/logits": 0.1993747167289257, + "loss/reg": 0.7205051779747009, + "step": 2107 + }, + { + "epoch": 0.02108, + "grad_norm": 0.4306841194629669, + "grad_norm_var": 0.05589917158874545, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.963488519191742, + "loss/hidden": 0.0, + "loss/logits": 0.21002190560102463, + "loss/reg": 0.7205133438110352, + "step": 2108 + }, + { + "epoch": 0.02109, + "grad_norm": 0.4335455596446991, + "grad_norm_var": 0.05591194830747893, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.7145076394081116, + "loss/hidden": 0.0, + "loss/logits": 0.21841401234269142, + "loss/reg": 0.7203344106674194, + "step": 2109 + }, + { + "epoch": 0.0211, + "grad_norm": 0.40822094678878784, + "grad_norm_var": 0.055895850658830334, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.780724048614502, + "loss/hidden": 0.0, + "loss/logits": 0.1994817852973938, + "loss/reg": 0.7200361490249634, + "step": 2110 + }, + { + "epoch": 0.02111, + "grad_norm": 0.40742164850234985, + "grad_norm_var": 0.0017900600667299592, + "learning_rate": 5e-05, + "loss": 0.1899, + "loss/crossentropy": 2.8590556979179382, + "loss/hidden": 0.0, + "loss/logits": 0.18990258127450943, + "loss/reg": 0.7193716168403625, + "step": 2111 + }, + { + "epoch": 0.02112, + "grad_norm": 0.4198627769947052, + "grad_norm_var": 0.0017958974337097626, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.7853169441223145, + "loss/hidden": 0.0, + "loss/logits": 0.2033110074698925, + "loss/reg": 0.7188916206359863, + "step": 2112 + }, + { + "epoch": 0.02113, + "grad_norm": 0.41504207253456116, + "grad_norm_var": 0.0019012661554018706, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.8004114627838135, + "loss/hidden": 0.0, + "loss/logits": 0.18751712888479233, + "loss/reg": 0.7185385823249817, + "step": 2113 + }, + { + "epoch": 0.02114, + "grad_norm": 0.4135783314704895, + "grad_norm_var": 0.0017303119095431658, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.858197033405304, + "loss/hidden": 0.0, + "loss/logits": 0.20497623085975647, + "loss/reg": 0.7183231115341187, + "step": 2114 + }, + { + "epoch": 0.02115, + "grad_norm": 0.42267823219299316, + "grad_norm_var": 0.0013071360129919274, + "learning_rate": 5e-05, + "loss": 0.2057, + "loss/crossentropy": 2.8748321533203125, + "loss/hidden": 0.0, + "loss/logits": 0.20568474382162094, + "loss/reg": 0.7179359197616577, + "step": 2115 + }, + { + "epoch": 0.02116, + "grad_norm": 0.656605064868927, + "grad_norm_var": 0.00407452129050692, + "learning_rate": 5e-05, + "loss": 0.2144, + "loss/crossentropy": 2.9658663272857666, + "loss/hidden": 0.0, + "loss/logits": 0.21444855630397797, + "loss/reg": 0.7175853848457336, + "step": 2116 + }, + { + "epoch": 0.02117, + "grad_norm": 0.3986641466617584, + "grad_norm_var": 0.003924343155112216, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.75046569108963, + "loss/hidden": 0.0, + "loss/logits": 0.1823912039399147, + "loss/reg": 0.7169254422187805, + "step": 2117 + }, + { + "epoch": 0.02118, + "grad_norm": 0.3981616199016571, + "grad_norm_var": 0.004036714272380767, + "learning_rate": 5e-05, + "loss": 0.1855, + "loss/crossentropy": 2.85973197221756, + "loss/hidden": 0.0, + "loss/logits": 0.18552669882774353, + "loss/reg": 0.7165629863739014, + "step": 2118 + }, + { + "epoch": 0.02119, + "grad_norm": 0.45171210169792175, + "grad_norm_var": 0.004037584395892916, + "learning_rate": 5e-05, + "loss": 0.2139, + "loss/crossentropy": 2.7842766642570496, + "loss/hidden": 0.0, + "loss/logits": 0.2139478214085102, + "loss/reg": 0.7160403728485107, + "step": 2119 + }, + { + "epoch": 0.0212, + "grad_norm": 0.4330573081970215, + "grad_norm_var": 0.003743905538799893, + "learning_rate": 5e-05, + "loss": 0.2165, + "loss/crossentropy": 2.697123169898987, + "loss/hidden": 0.0, + "loss/logits": 0.21645596250891685, + "loss/reg": 0.715305745601654, + "step": 2120 + }, + { + "epoch": 0.02121, + "grad_norm": 0.39919087290763855, + "grad_norm_var": 0.0037981148011965024, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.7567432522773743, + "loss/hidden": 0.0, + "loss/logits": 0.18682954460382462, + "loss/reg": 0.7145901322364807, + "step": 2121 + }, + { + "epoch": 0.02122, + "grad_norm": 0.4769824147224426, + "grad_norm_var": 0.0038958268929462155, + "learning_rate": 5e-05, + "loss": 0.2209, + "loss/crossentropy": 2.954287350177765, + "loss/hidden": 0.0, + "loss/logits": 0.2208559811115265, + "loss/reg": 0.7139098644256592, + "step": 2122 + }, + { + "epoch": 0.02123, + "grad_norm": 0.520776629447937, + "grad_norm_var": 0.004280875098341008, + "learning_rate": 5e-05, + "loss": 0.2266, + "loss/crossentropy": 2.7800422310829163, + "loss/hidden": 0.0, + "loss/logits": 0.22664930671453476, + "loss/reg": 0.7132286429405212, + "step": 2123 + }, + { + "epoch": 0.02124, + "grad_norm": 0.4547531306743622, + "grad_norm_var": 0.004277922562663387, + "learning_rate": 5e-05, + "loss": 0.2163, + "loss/crossentropy": 2.726837158203125, + "loss/hidden": 0.0, + "loss/logits": 0.21631214022636414, + "loss/reg": 0.7120151519775391, + "step": 2124 + }, + { + "epoch": 0.02125, + "grad_norm": 0.4272185266017914, + "grad_norm_var": 0.004289573617419921, + "learning_rate": 5e-05, + "loss": 0.2057, + "loss/crossentropy": 2.860227942466736, + "loss/hidden": 0.0, + "loss/logits": 0.20567086711525917, + "loss/reg": 0.7110404968261719, + "step": 2125 + }, + { + "epoch": 0.02126, + "grad_norm": 0.4602227210998535, + "grad_norm_var": 0.004210540727909977, + "learning_rate": 5e-05, + "loss": 0.1845, + "loss/crossentropy": 2.9573395252227783, + "loss/hidden": 0.0, + "loss/logits": 0.18453369289636612, + "loss/reg": 0.7102705836296082, + "step": 2126 + }, + { + "epoch": 0.02127, + "grad_norm": 0.4463096261024475, + "grad_norm_var": 0.004098568827145721, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.78344464302063, + "loss/hidden": 0.0, + "loss/logits": 0.18653558567166328, + "loss/reg": 0.7091106176376343, + "step": 2127 + }, + { + "epoch": 0.02128, + "grad_norm": 0.49815741181373596, + "grad_norm_var": 0.004170468497763465, + "learning_rate": 5e-05, + "loss": 0.1903, + "loss/crossentropy": 3.0202105045318604, + "loss/hidden": 0.0, + "loss/logits": 0.19034047797322273, + "loss/reg": 0.7076919674873352, + "step": 2128 + }, + { + "epoch": 0.02129, + "grad_norm": 0.46333765983581543, + "grad_norm_var": 0.004061714825833725, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.74775093793869, + "loss/hidden": 0.0, + "loss/logits": 0.20167529210448265, + "loss/reg": 0.7065026760101318, + "step": 2129 + }, + { + "epoch": 0.0213, + "grad_norm": 0.4539763629436493, + "grad_norm_var": 0.003926661763002537, + "learning_rate": 5e-05, + "loss": 0.2204, + "loss/crossentropy": 2.783592104911804, + "loss/hidden": 0.0, + "loss/logits": 0.22035584598779678, + "loss/reg": 0.7057085633277893, + "step": 2130 + }, + { + "epoch": 0.02131, + "grad_norm": 0.4040832817554474, + "grad_norm_var": 0.0040410848984061, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.7474722862243652, + "loss/hidden": 0.0, + "loss/logits": 0.194016944617033, + "loss/reg": 0.7045621275901794, + "step": 2131 + }, + { + "epoch": 0.02132, + "grad_norm": 0.9614255428314209, + "grad_norm_var": 0.017881524092894625, + "learning_rate": 5e-05, + "loss": 0.3017, + "loss/crossentropy": 2.9067559838294983, + "loss/hidden": 0.0, + "loss/logits": 0.3017450012266636, + "loss/reg": 0.7034192085266113, + "step": 2132 + }, + { + "epoch": 0.02133, + "grad_norm": 0.42803218960762024, + "grad_norm_var": 0.017624763530123117, + "learning_rate": 5e-05, + "loss": 0.2012, + "loss/crossentropy": 2.7625133991241455, + "loss/hidden": 0.0, + "loss/logits": 0.20123181492090225, + "loss/reg": 0.7029147744178772, + "step": 2133 + }, + { + "epoch": 0.02134, + "grad_norm": 0.4184335768222809, + "grad_norm_var": 0.017429684494454364, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.8571687936782837, + "loss/hidden": 0.0, + "loss/logits": 0.1939922235906124, + "loss/reg": 0.7021790742874146, + "step": 2134 + }, + { + "epoch": 0.02135, + "grad_norm": 0.42721158266067505, + "grad_norm_var": 0.01756321837129809, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.832636833190918, + "loss/hidden": 0.0, + "loss/logits": 0.19009744748473167, + "loss/reg": 0.7015358209609985, + "step": 2135 + }, + { + "epoch": 0.02136, + "grad_norm": 0.414785772562027, + "grad_norm_var": 0.017697405811683734, + "learning_rate": 5e-05, + "loss": 0.1858, + "loss/crossentropy": 2.6991535425186157, + "loss/hidden": 0.0, + "loss/logits": 0.18576501309871674, + "loss/reg": 0.701002299785614, + "step": 2136 + }, + { + "epoch": 0.02137, + "grad_norm": 0.44407960772514343, + "grad_norm_var": 0.017349077524662908, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.814389169216156, + "loss/hidden": 0.0, + "loss/logits": 0.19636918604373932, + "loss/reg": 0.7007303833961487, + "step": 2137 + }, + { + "epoch": 0.02138, + "grad_norm": 0.39341655373573303, + "grad_norm_var": 0.01783293177001443, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.802081346511841, + "loss/hidden": 0.0, + "loss/logits": 0.17981423810124397, + "loss/reg": 0.7004950642585754, + "step": 2138 + }, + { + "epoch": 0.02139, + "grad_norm": 0.4358324110507965, + "grad_norm_var": 0.0177769222019774, + "learning_rate": 5e-05, + "loss": 0.1927, + "loss/crossentropy": 2.852492332458496, + "loss/hidden": 0.0, + "loss/logits": 0.1926623359322548, + "loss/reg": 0.7001385688781738, + "step": 2139 + }, + { + "epoch": 0.0214, + "grad_norm": 0.4297977685928345, + "grad_norm_var": 0.017868922449052662, + "learning_rate": 5e-05, + "loss": 0.1999, + "loss/crossentropy": 2.9621934294700623, + "loss/hidden": 0.0, + "loss/logits": 0.1999046579003334, + "loss/reg": 0.6997998356819153, + "step": 2140 + }, + { + "epoch": 0.02141, + "grad_norm": 0.5451433658599854, + "grad_norm_var": 0.018078840577590086, + "learning_rate": 5e-05, + "loss": 0.2239, + "loss/crossentropy": 2.6318374276161194, + "loss/hidden": 0.0, + "loss/logits": 0.22391459345817566, + "loss/reg": 0.699411928653717, + "step": 2141 + }, + { + "epoch": 0.02142, + "grad_norm": 0.44166216254234314, + "grad_norm_var": 0.01814069148697642, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.7267025113105774, + "loss/hidden": 0.0, + "loss/logits": 0.2050914727151394, + "loss/reg": 0.6993353962898254, + "step": 2142 + }, + { + "epoch": 0.02143, + "grad_norm": 0.45107167959213257, + "grad_norm_var": 0.018123666532333615, + "learning_rate": 5e-05, + "loss": 0.216, + "loss/crossentropy": 2.7990114092826843, + "loss/hidden": 0.0, + "loss/logits": 0.21597999334335327, + "loss/reg": 0.69881671667099, + "step": 2143 + }, + { + "epoch": 0.02144, + "grad_norm": 0.4275515675544739, + "grad_norm_var": 0.01822338091435839, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.8991892337799072, + "loss/hidden": 0.0, + "loss/logits": 0.20100881904363632, + "loss/reg": 0.6984178423881531, + "step": 2144 + }, + { + "epoch": 0.02145, + "grad_norm": 0.4591313898563385, + "grad_norm_var": 0.01822891866343661, + "learning_rate": 5e-05, + "loss": 0.1914, + "loss/crossentropy": 2.840840756893158, + "loss/hidden": 0.0, + "loss/logits": 0.19139451161026955, + "loss/reg": 0.6978896260261536, + "step": 2145 + }, + { + "epoch": 0.02146, + "grad_norm": 0.4136827886104584, + "grad_norm_var": 0.01842172839156562, + "learning_rate": 5e-05, + "loss": 0.2019, + "loss/crossentropy": 2.7860628366470337, + "loss/hidden": 0.0, + "loss/logits": 0.20194584876298904, + "loss/reg": 0.6977356672286987, + "step": 2146 + }, + { + "epoch": 0.02147, + "grad_norm": 0.4279073476791382, + "grad_norm_var": 0.018252710890857785, + "learning_rate": 5e-05, + "loss": 0.1859, + "loss/crossentropy": 2.6382302045822144, + "loss/hidden": 0.0, + "loss/logits": 0.18589358404278755, + "loss/reg": 0.6971644759178162, + "step": 2147 + }, + { + "epoch": 0.02148, + "grad_norm": 0.40666037797927856, + "grad_norm_var": 0.0011340233579855549, + "learning_rate": 5e-05, + "loss": 0.2014, + "loss/crossentropy": 2.6020625829696655, + "loss/hidden": 0.0, + "loss/logits": 0.20142236724495888, + "loss/reg": 0.6966284513473511, + "step": 2148 + }, + { + "epoch": 0.02149, + "grad_norm": 0.5555269718170166, + "grad_norm_var": 0.0020268329370143187, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.762993812561035, + "loss/hidden": 0.0, + "loss/logits": 0.21117206662893295, + "loss/reg": 0.696413516998291, + "step": 2149 + }, + { + "epoch": 0.0215, + "grad_norm": 0.42094966769218445, + "grad_norm_var": 0.002018905426315539, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.86466646194458, + "loss/hidden": 0.0, + "loss/logits": 0.20980717986822128, + "loss/reg": 0.6962559223175049, + "step": 2150 + }, + { + "epoch": 0.02151, + "grad_norm": 0.44652220606803894, + "grad_norm_var": 0.002000528790596094, + "learning_rate": 5e-05, + "loss": 0.2183, + "loss/crossentropy": 2.7573264837265015, + "loss/hidden": 0.0, + "loss/logits": 0.2183287926018238, + "loss/reg": 0.695887565612793, + "step": 2151 + }, + { + "epoch": 0.02152, + "grad_norm": 0.48507681488990784, + "grad_norm_var": 0.002029836370972519, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.735235571861267, + "loss/hidden": 0.0, + "loss/logits": 0.20264559239149094, + "loss/reg": 0.6949869394302368, + "step": 2152 + }, + { + "epoch": 0.02153, + "grad_norm": 0.41482922434806824, + "grad_norm_var": 0.002102503301435669, + "learning_rate": 5e-05, + "loss": 0.2005, + "loss/crossentropy": 2.7061509490013123, + "loss/hidden": 0.0, + "loss/logits": 0.20053908228874207, + "loss/reg": 0.6943217515945435, + "step": 2153 + }, + { + "epoch": 0.02154, + "grad_norm": 0.4805706739425659, + "grad_norm_var": 0.0019525680473374033, + "learning_rate": 5e-05, + "loss": 0.2218, + "loss/crossentropy": 2.848098576068878, + "loss/hidden": 0.0, + "loss/logits": 0.2218431904911995, + "loss/reg": 0.6937242746353149, + "step": 2154 + }, + { + "epoch": 0.02155, + "grad_norm": 0.4552999436855316, + "grad_norm_var": 0.0019326801220143222, + "learning_rate": 5e-05, + "loss": 0.2206, + "loss/crossentropy": 2.7905853390693665, + "loss/hidden": 0.0, + "loss/logits": 0.2206210121512413, + "loss/reg": 0.6934657096862793, + "step": 2155 + }, + { + "epoch": 0.02156, + "grad_norm": 0.4011346101760864, + "grad_norm_var": 0.0020758987797539746, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.887292504310608, + "loss/hidden": 0.0, + "loss/logits": 0.1896429806947708, + "loss/reg": 0.6924875378608704, + "step": 2156 + }, + { + "epoch": 0.02157, + "grad_norm": 0.43871423602104187, + "grad_norm_var": 0.0014627298866179714, + "learning_rate": 5e-05, + "loss": 0.2192, + "loss/crossentropy": 2.7467220425605774, + "loss/hidden": 0.0, + "loss/logits": 0.2192327156662941, + "loss/reg": 0.6915358901023865, + "step": 2157 + }, + { + "epoch": 0.02158, + "grad_norm": 0.41332143545150757, + "grad_norm_var": 0.0015270285051698807, + "learning_rate": 5e-05, + "loss": 0.2128, + "loss/crossentropy": 2.7894550561904907, + "loss/hidden": 0.0, + "loss/logits": 0.212764460593462, + "loss/reg": 0.6912392377853394, + "step": 2158 + }, + { + "epoch": 0.02159, + "grad_norm": 0.46855345368385315, + "grad_norm_var": 0.001563493918946135, + "learning_rate": 5e-05, + "loss": 0.2258, + "loss/crossentropy": 2.8350266814231873, + "loss/hidden": 0.0, + "loss/logits": 0.2257903330028057, + "loss/reg": 0.6905578374862671, + "step": 2159 + }, + { + "epoch": 0.0216, + "grad_norm": 0.41814684867858887, + "grad_norm_var": 0.0015905436945837458, + "learning_rate": 5e-05, + "loss": 0.2009, + "loss/crossentropy": 2.7631781101226807, + "loss/hidden": 0.0, + "loss/logits": 0.2008948102593422, + "loss/reg": 0.6902044415473938, + "step": 2160 + }, + { + "epoch": 0.02161, + "grad_norm": 0.4215793013572693, + "grad_norm_var": 0.0016035512425466524, + "learning_rate": 5e-05, + "loss": 0.1969, + "loss/crossentropy": 2.7035427689552307, + "loss/hidden": 0.0, + "loss/logits": 0.19686412066221237, + "loss/reg": 0.6898484826087952, + "step": 2161 + }, + { + "epoch": 0.02162, + "grad_norm": 0.4309665858745575, + "grad_norm_var": 0.0015574722350559344, + "learning_rate": 5e-05, + "loss": 0.2195, + "loss/crossentropy": 2.7287055253982544, + "loss/hidden": 0.0, + "loss/logits": 0.2195291370153427, + "loss/reg": 0.6893757581710815, + "step": 2162 + }, + { + "epoch": 0.02163, + "grad_norm": 0.4318053424358368, + "grad_norm_var": 0.0015506505111380186, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.8184618949890137, + "loss/hidden": 0.0, + "loss/logits": 0.19441154599189758, + "loss/reg": 0.6885489821434021, + "step": 2163 + }, + { + "epoch": 0.02164, + "grad_norm": 0.4206545352935791, + "grad_norm_var": 0.0014948913199979708, + "learning_rate": 5e-05, + "loss": 0.1954, + "loss/crossentropy": 2.824199080467224, + "loss/hidden": 0.0, + "loss/logits": 0.19543399661779404, + "loss/reg": 0.6880328059196472, + "step": 2164 + }, + { + "epoch": 0.02165, + "grad_norm": 0.48655855655670166, + "grad_norm_var": 0.0007664029028332409, + "learning_rate": 5e-05, + "loss": 0.2186, + "loss/crossentropy": 2.863620102405548, + "loss/hidden": 0.0, + "loss/logits": 0.21863248199224472, + "loss/reg": 0.6872255206108093, + "step": 2165 + }, + { + "epoch": 0.02166, + "grad_norm": 0.41735610365867615, + "grad_norm_var": 0.0007761786093330892, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.7099905610084534, + "loss/hidden": 0.0, + "loss/logits": 0.205287616699934, + "loss/reg": 0.6865286231040955, + "step": 2166 + }, + { + "epoch": 0.02167, + "grad_norm": 0.41900861263275146, + "grad_norm_var": 0.0007975214828756331, + "learning_rate": 5e-05, + "loss": 0.1948, + "loss/crossentropy": 2.843027651309967, + "loss/hidden": 0.0, + "loss/logits": 0.19483652338385582, + "loss/reg": 0.6857686042785645, + "step": 2167 + }, + { + "epoch": 0.02168, + "grad_norm": 0.41439321637153625, + "grad_norm_var": 0.0006635019809905292, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.7984771132469177, + "loss/hidden": 0.0, + "loss/logits": 0.20531423389911652, + "loss/reg": 0.685128927230835, + "step": 2168 + }, + { + "epoch": 0.02169, + "grad_norm": 0.38311371207237244, + "grad_norm_var": 0.0008045015983382105, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.7924957871437073, + "loss/hidden": 0.0, + "loss/logits": 0.1802998222410679, + "loss/reg": 0.6848140954971313, + "step": 2169 + }, + { + "epoch": 0.0217, + "grad_norm": 0.43701696395874023, + "grad_norm_var": 0.0006370735913676502, + "learning_rate": 5e-05, + "loss": 0.2052, + "loss/crossentropy": 2.8674927353858948, + "loss/hidden": 0.0, + "loss/logits": 0.20517823472619057, + "loss/reg": 0.684516191482544, + "step": 2170 + }, + { + "epoch": 0.02171, + "grad_norm": 0.4245651364326477, + "grad_norm_var": 0.0005867031899962923, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.759288966655731, + "loss/hidden": 0.0, + "loss/logits": 0.20629791170358658, + "loss/reg": 0.6836106777191162, + "step": 2171 + }, + { + "epoch": 0.02172, + "grad_norm": 0.40898624062538147, + "grad_norm_var": 0.0005638125688654702, + "learning_rate": 5e-05, + "loss": 0.1969, + "loss/crossentropy": 2.8363333344459534, + "loss/hidden": 0.0, + "loss/logits": 0.19685319811105728, + "loss/reg": 0.6828330755233765, + "step": 2172 + }, + { + "epoch": 0.02173, + "grad_norm": 0.4359913468360901, + "grad_norm_var": 0.0005600852551820487, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.736388862133026, + "loss/hidden": 0.0, + "loss/logits": 0.1983194276690483, + "loss/reg": 0.6821224689483643, + "step": 2173 + }, + { + "epoch": 0.02174, + "grad_norm": 0.5771790742874146, + "grad_norm_var": 0.0019392993977182156, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.7388861179351807, + "loss/hidden": 0.0, + "loss/logits": 0.2046787217259407, + "loss/reg": 0.6815424561500549, + "step": 2174 + }, + { + "epoch": 0.02175, + "grad_norm": 0.5016182065010071, + "grad_norm_var": 0.0021456691602406005, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.67424213886261, + "loss/hidden": 0.0, + "loss/logits": 0.21016032993793488, + "loss/reg": 0.6806396842002869, + "step": 2175 + }, + { + "epoch": 0.02176, + "grad_norm": 0.5204771757125854, + "grad_norm_var": 0.0025114039298453237, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.884347915649414, + "loss/hidden": 0.0, + "loss/logits": 0.21353007480502129, + "loss/reg": 0.679993212223053, + "step": 2176 + }, + { + "epoch": 0.02177, + "grad_norm": 0.4536159038543701, + "grad_norm_var": 0.0024724990081046155, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.7048420310020447, + "loss/hidden": 0.0, + "loss/logits": 0.19598710164427757, + "loss/reg": 0.6792700886726379, + "step": 2177 + }, + { + "epoch": 0.02178, + "grad_norm": 0.4368113875389099, + "grad_norm_var": 0.002461588452222089, + "learning_rate": 5e-05, + "loss": 0.1946, + "loss/crossentropy": 2.713423490524292, + "loss/hidden": 0.0, + "loss/logits": 0.1945551261305809, + "loss/reg": 0.678525984287262, + "step": 2178 + }, + { + "epoch": 0.02179, + "grad_norm": 0.402449369430542, + "grad_norm_var": 0.002579118963506725, + "learning_rate": 5e-05, + "loss": 0.1912, + "loss/crossentropy": 2.827597141265869, + "loss/hidden": 0.0, + "loss/logits": 0.1912226378917694, + "loss/reg": 0.6779956221580505, + "step": 2179 + }, + { + "epoch": 0.0218, + "grad_norm": 0.4402579069137573, + "grad_norm_var": 0.0025362696377524566, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.7179870009422302, + "loss/hidden": 0.0, + "loss/logits": 0.20495088025927544, + "loss/reg": 0.6774372458457947, + "step": 2180 + }, + { + "epoch": 0.02181, + "grad_norm": 0.43025892972946167, + "grad_norm_var": 0.0024408930074824625, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.886035144329071, + "loss/hidden": 0.0, + "loss/logits": 0.2015029489994049, + "loss/reg": 0.6766461133956909, + "step": 2181 + }, + { + "epoch": 0.02182, + "grad_norm": 0.448896199464798, + "grad_norm_var": 0.002391256542667423, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.6068813800811768, + "loss/hidden": 0.0, + "loss/logits": 0.1977914310991764, + "loss/reg": 0.6760473251342773, + "step": 2182 + }, + { + "epoch": 0.02183, + "grad_norm": 0.41257068514823914, + "grad_norm_var": 0.002416943124190433, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.751125752925873, + "loss/hidden": 0.0, + "loss/logits": 0.20326784998178482, + "loss/reg": 0.6757020950317383, + "step": 2183 + }, + { + "epoch": 0.02184, + "grad_norm": 0.6041910648345947, + "grad_norm_var": 0.003880875867707549, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 3.20320200920105, + "loss/hidden": 0.0, + "loss/logits": 0.20922166109085083, + "loss/reg": 0.6751511096954346, + "step": 2184 + }, + { + "epoch": 0.02185, + "grad_norm": 0.4808026850223541, + "grad_norm_var": 0.0035100545399124883, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.871992290019989, + "loss/hidden": 0.0, + "loss/logits": 0.20859647169709206, + "loss/reg": 0.6743239760398865, + "step": 2185 + }, + { + "epoch": 0.02186, + "grad_norm": 0.46273502707481384, + "grad_norm_var": 0.0034606477018267654, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.8617104291915894, + "loss/hidden": 0.0, + "loss/logits": 0.19668934494256973, + "loss/reg": 0.6734572649002075, + "step": 2186 + }, + { + "epoch": 0.02187, + "grad_norm": 0.4677128493785858, + "grad_norm_var": 0.0033438769350355576, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.8544273376464844, + "loss/hidden": 0.0, + "loss/logits": 0.19444719329476357, + "loss/reg": 0.6727681756019592, + "step": 2187 + }, + { + "epoch": 0.02188, + "grad_norm": 0.4619734585285187, + "grad_norm_var": 0.003103946366710216, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.7287431955337524, + "loss/hidden": 0.0, + "loss/logits": 0.207965437322855, + "loss/reg": 0.6725651025772095, + "step": 2188 + }, + { + "epoch": 0.02189, + "grad_norm": 0.5305468440055847, + "grad_norm_var": 0.003220160200706701, + "learning_rate": 5e-05, + "loss": 0.2188, + "loss/crossentropy": 2.952690064907074, + "loss/hidden": 0.0, + "loss/logits": 0.21882307529449463, + "loss/reg": 0.6716290712356567, + "step": 2189 + }, + { + "epoch": 0.0219, + "grad_norm": 0.5237597227096558, + "grad_norm_var": 0.002685021480990339, + "learning_rate": 5e-05, + "loss": 0.2227, + "loss/crossentropy": 2.9236454367637634, + "loss/hidden": 0.0, + "loss/logits": 0.2227388396859169, + "loss/reg": 0.6711364984512329, + "step": 2190 + }, + { + "epoch": 0.02191, + "grad_norm": 0.4429808557033539, + "grad_norm_var": 0.0026813890157582026, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.6307492852211, + "loss/hidden": 0.0, + "loss/logits": 0.19941957667469978, + "loss/reg": 0.6705188155174255, + "step": 2191 + }, + { + "epoch": 0.02192, + "grad_norm": 0.4466393291950226, + "grad_norm_var": 0.002525215264992875, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.848267436027527, + "loss/hidden": 0.0, + "loss/logits": 0.1992984376847744, + "loss/reg": 0.6696529388427734, + "step": 2192 + }, + { + "epoch": 0.02193, + "grad_norm": 0.41059961915016174, + "grad_norm_var": 0.0027083821512908224, + "learning_rate": 5e-05, + "loss": 0.203, + "loss/crossentropy": 2.89735746383667, + "loss/hidden": 0.0, + "loss/logits": 0.20299606397747993, + "loss/reg": 0.6685121059417725, + "step": 2193 + }, + { + "epoch": 0.02194, + "grad_norm": 0.40344536304473877, + "grad_norm_var": 0.002893132308264272, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.807808578014374, + "loss/hidden": 0.0, + "loss/logits": 0.1923409029841423, + "loss/reg": 0.6674231886863708, + "step": 2194 + }, + { + "epoch": 0.02195, + "grad_norm": 0.46464207768440247, + "grad_norm_var": 0.002652558118381032, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.7143484354019165, + "loss/hidden": 0.0, + "loss/logits": 0.2042921595275402, + "loss/reg": 0.6665908098220825, + "step": 2195 + }, + { + "epoch": 0.02196, + "grad_norm": 0.4110061824321747, + "grad_norm_var": 0.002800589892713179, + "learning_rate": 5e-05, + "loss": 0.1986, + "loss/crossentropy": 2.71746289730072, + "loss/hidden": 0.0, + "loss/logits": 0.19860674813389778, + "loss/reg": 0.6659626960754395, + "step": 2196 + }, + { + "epoch": 0.02197, + "grad_norm": 0.43590620160102844, + "grad_norm_var": 0.0027781766481714005, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.8812056183815002, + "loss/hidden": 0.0, + "loss/logits": 0.20434146001935005, + "loss/reg": 0.6653489470481873, + "step": 2197 + }, + { + "epoch": 0.02198, + "grad_norm": 0.381786048412323, + "grad_norm_var": 0.003186091779586209, + "learning_rate": 5e-05, + "loss": 0.1881, + "loss/crossentropy": 2.7886149287223816, + "loss/hidden": 0.0, + "loss/logits": 0.188095360994339, + "loss/reg": 0.665099024772644, + "step": 2198 + }, + { + "epoch": 0.02199, + "grad_norm": 0.47209522128105164, + "grad_norm_var": 0.0030403890929959726, + "learning_rate": 5e-05, + "loss": 0.1953, + "loss/crossentropy": 2.762462854385376, + "loss/hidden": 0.0, + "loss/logits": 0.19527531415224075, + "loss/reg": 0.6646116971969604, + "step": 2199 + }, + { + "epoch": 0.022, + "grad_norm": 0.39929649233818054, + "grad_norm_var": 0.0017947578208100741, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.772857427597046, + "loss/hidden": 0.0, + "loss/logits": 0.18919479101896286, + "loss/reg": 0.663826048374176, + "step": 2200 + }, + { + "epoch": 0.02201, + "grad_norm": 0.4259912967681885, + "grad_norm_var": 0.0017555541713624796, + "learning_rate": 5e-05, + "loss": 0.1873, + "loss/crossentropy": 2.925145983695984, + "loss/hidden": 0.0, + "loss/logits": 0.18727776035666466, + "loss/reg": 0.6631879210472107, + "step": 2201 + }, + { + "epoch": 0.02202, + "grad_norm": 0.41853076219558716, + "grad_norm_var": 0.0017809304189222144, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.8232511281967163, + "loss/hidden": 0.0, + "loss/logits": 0.2046380490064621, + "loss/reg": 0.6626787185668945, + "step": 2202 + }, + { + "epoch": 0.02203, + "grad_norm": 0.4105367064476013, + "grad_norm_var": 0.0018010982581391037, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.8416303396224976, + "loss/hidden": 0.0, + "loss/logits": 0.19848119094967842, + "loss/reg": 0.6623620390892029, + "step": 2203 + }, + { + "epoch": 0.02204, + "grad_norm": 0.41036859154701233, + "grad_norm_var": 0.0018162345450625339, + "learning_rate": 5e-05, + "loss": 0.1813, + "loss/crossentropy": 2.6539486050605774, + "loss/hidden": 0.0, + "loss/logits": 0.18125443533062935, + "loss/reg": 0.6616894006729126, + "step": 2204 + }, + { + "epoch": 0.02205, + "grad_norm": 0.40184709429740906, + "grad_norm_var": 0.0012420512651165764, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.810888350009918, + "loss/hidden": 0.0, + "loss/logits": 0.1932699754834175, + "loss/reg": 0.6615607142448425, + "step": 2205 + }, + { + "epoch": 0.02206, + "grad_norm": 0.4064597487449646, + "grad_norm_var": 0.0006154991297322592, + "learning_rate": 5e-05, + "loss": 0.199, + "loss/crossentropy": 2.797262191772461, + "loss/hidden": 0.0, + "loss/logits": 0.19896076247096062, + "loss/reg": 0.6614593267440796, + "step": 2206 + }, + { + "epoch": 0.02207, + "grad_norm": 0.4584852457046509, + "grad_norm_var": 0.0006751710080178719, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.5982040762901306, + "loss/hidden": 0.0, + "loss/logits": 0.1956261582672596, + "loss/reg": 0.6615445613861084, + "step": 2207 + }, + { + "epoch": 0.02208, + "grad_norm": 0.41586074233055115, + "grad_norm_var": 0.0006347089942988012, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.897459328174591, + "loss/hidden": 0.0, + "loss/logits": 0.19579169154167175, + "loss/reg": 0.6611100435256958, + "step": 2208 + }, + { + "epoch": 0.02209, + "grad_norm": 0.4266660511493683, + "grad_norm_var": 0.0006297866080468635, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.788914978504181, + "loss/hidden": 0.0, + "loss/logits": 0.20345713198184967, + "loss/reg": 0.6611925363540649, + "step": 2209 + }, + { + "epoch": 0.0221, + "grad_norm": 0.48467713594436646, + "grad_norm_var": 0.0008473796395787011, + "learning_rate": 5e-05, + "loss": 0.2378, + "loss/crossentropy": 2.7921268343925476, + "loss/hidden": 0.0, + "loss/logits": 0.23782894015312195, + "loss/reg": 0.6612811088562012, + "step": 2210 + }, + { + "epoch": 0.02211, + "grad_norm": 0.412604957818985, + "grad_norm_var": 0.0007520479633503663, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.7436190247535706, + "loss/hidden": 0.0, + "loss/logits": 0.18662283942103386, + "loss/reg": 0.6610515117645264, + "step": 2211 + }, + { + "epoch": 0.02212, + "grad_norm": 0.43781405687332153, + "grad_norm_var": 0.0007531737819775231, + "learning_rate": 5e-05, + "loss": 0.2181, + "loss/crossentropy": 2.8103312849998474, + "loss/hidden": 0.0, + "loss/logits": 0.21812525391578674, + "loss/reg": 0.6607252359390259, + "step": 2212 + }, + { + "epoch": 0.02213, + "grad_norm": 0.3758794069290161, + "grad_norm_var": 0.0008905491473294706, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.8121737241744995, + "loss/hidden": 0.0, + "loss/logits": 0.19082265719771385, + "loss/reg": 0.660344123840332, + "step": 2213 + }, + { + "epoch": 0.02214, + "grad_norm": 0.40602922439575195, + "grad_norm_var": 0.0007999404863282581, + "learning_rate": 5e-05, + "loss": 0.1928, + "loss/crossentropy": 2.8053001761436462, + "loss/hidden": 0.0, + "loss/logits": 0.19281848147511482, + "loss/reg": 0.6598142385482788, + "step": 2214 + }, + { + "epoch": 0.02215, + "grad_norm": 0.4154147505760193, + "grad_norm_var": 0.0006274064204118801, + "learning_rate": 5e-05, + "loss": 0.2052, + "loss/crossentropy": 2.8110750913619995, + "loss/hidden": 0.0, + "loss/logits": 0.2051638960838318, + "loss/reg": 0.6595247387886047, + "step": 2215 + }, + { + "epoch": 0.02216, + "grad_norm": 0.41716277599334717, + "grad_norm_var": 0.0006000529526535403, + "learning_rate": 5e-05, + "loss": 0.2129, + "loss/crossentropy": 2.6529202461242676, + "loss/hidden": 0.0, + "loss/logits": 0.21293757110834122, + "loss/reg": 0.6591362357139587, + "step": 2216 + }, + { + "epoch": 0.02217, + "grad_norm": 0.37741243839263916, + "grad_norm_var": 0.0007104926301494433, + "learning_rate": 5e-05, + "loss": 0.187, + "loss/crossentropy": 2.8193177580833435, + "loss/hidden": 0.0, + "loss/logits": 0.18699510395526886, + "loss/reg": 0.6589513421058655, + "step": 2217 + }, + { + "epoch": 0.02218, + "grad_norm": 0.39788559079170227, + "grad_norm_var": 0.0007335629691482974, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.7125787138938904, + "loss/hidden": 0.0, + "loss/logits": 0.21007565781474113, + "loss/reg": 0.6587637066841125, + "step": 2218 + }, + { + "epoch": 0.02219, + "grad_norm": 0.3994773328304291, + "grad_norm_var": 0.0007491808798861836, + "learning_rate": 5e-05, + "loss": 0.1953, + "loss/crossentropy": 2.6460587978363037, + "loss/hidden": 0.0, + "loss/logits": 0.19534288719296455, + "loss/reg": 0.6586536169052124, + "step": 2219 + }, + { + "epoch": 0.0222, + "grad_norm": 0.415223628282547, + "grad_norm_var": 0.000747492342450177, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.830595314502716, + "loss/hidden": 0.0, + "loss/logits": 0.20481149479746819, + "loss/reg": 0.6581150889396667, + "step": 2220 + }, + { + "epoch": 0.02221, + "grad_norm": 0.4300384223461151, + "grad_norm_var": 0.000745633661896176, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.735981583595276, + "loss/hidden": 0.0, + "loss/logits": 0.20806902647018433, + "loss/reg": 0.6579728126525879, + "step": 2221 + }, + { + "epoch": 0.02222, + "grad_norm": 0.424029141664505, + "grad_norm_var": 0.0007394894867327177, + "learning_rate": 5e-05, + "loss": 0.1959, + "loss/crossentropy": 2.626233160495758, + "loss/hidden": 0.0, + "loss/logits": 0.19588829949498177, + "loss/reg": 0.658187747001648, + "step": 2222 + }, + { + "epoch": 0.02223, + "grad_norm": 0.41887691617012024, + "grad_norm_var": 0.0006259322218636047, + "learning_rate": 5e-05, + "loss": 0.1966, + "loss/crossentropy": 2.742566704750061, + "loss/hidden": 0.0, + "loss/logits": 0.1966009922325611, + "loss/reg": 0.6574844121932983, + "step": 2223 + }, + { + "epoch": 0.02224, + "grad_norm": 0.40258875489234924, + "grad_norm_var": 0.0006370829697786425, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 2.833762288093567, + "loss/hidden": 0.0, + "loss/logits": 0.19246890768408775, + "loss/reg": 0.6576274633407593, + "step": 2224 + }, + { + "epoch": 0.02225, + "grad_norm": 0.4399501383304596, + "grad_norm_var": 0.000668578088888246, + "learning_rate": 5e-05, + "loss": 0.195, + "loss/crossentropy": 2.8631489276885986, + "loss/hidden": 0.0, + "loss/logits": 0.19495688006281853, + "loss/reg": 0.6572732925415039, + "step": 2225 + }, + { + "epoch": 0.02226, + "grad_norm": 0.45047199726104736, + "grad_norm_var": 0.0004282211553628697, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.75924551486969, + "loss/hidden": 0.0, + "loss/logits": 0.21844982728362083, + "loss/reg": 0.6573055982589722, + "step": 2226 + }, + { + "epoch": 0.02227, + "grad_norm": 1.0621033906936646, + "grad_norm_var": 0.026689921921804624, + "learning_rate": 5e-05, + "loss": 0.2529, + "loss/crossentropy": 2.9512603878974915, + "loss/hidden": 0.0, + "loss/logits": 0.25287147983908653, + "loss/reg": 0.6576065421104431, + "step": 2227 + }, + { + "epoch": 0.02228, + "grad_norm": 0.4119041860103607, + "grad_norm_var": 0.026789169053382175, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.7873952388763428, + "loss/hidden": 0.0, + "loss/logits": 0.18844201788306236, + "loss/reg": 0.6577906012535095, + "step": 2228 + }, + { + "epoch": 0.02229, + "grad_norm": 0.4193934500217438, + "grad_norm_var": 0.026461355189684576, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.842826247215271, + "loss/hidden": 0.0, + "loss/logits": 0.1957399807870388, + "loss/reg": 0.6582816243171692, + "step": 2229 + }, + { + "epoch": 0.0223, + "grad_norm": 0.4402545094490051, + "grad_norm_var": 0.02630882310212582, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.769850790500641, + "loss/hidden": 0.0, + "loss/logits": 0.18677347525954247, + "loss/reg": 0.6578382253646851, + "step": 2230 + }, + { + "epoch": 0.02231, + "grad_norm": 0.48326408863067627, + "grad_norm_var": 0.026214579591820792, + "learning_rate": 5e-05, + "loss": 0.1986, + "loss/crossentropy": 2.703267514705658, + "loss/hidden": 0.0, + "loss/logits": 0.1986374482512474, + "loss/reg": 0.6582728028297424, + "step": 2231 + }, + { + "epoch": 0.02232, + "grad_norm": 0.4916859567165375, + "grad_norm_var": 0.026117383670837338, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.724755346775055, + "loss/hidden": 0.0, + "loss/logits": 0.20072279125452042, + "loss/reg": 0.6576346755027771, + "step": 2232 + }, + { + "epoch": 0.02233, + "grad_norm": 0.46447932720184326, + "grad_norm_var": 0.02555655735308611, + "learning_rate": 5e-05, + "loss": 0.2059, + "loss/crossentropy": 2.8488651514053345, + "loss/hidden": 0.0, + "loss/logits": 0.20591600984334946, + "loss/reg": 0.6575437188148499, + "step": 2233 + }, + { + "epoch": 0.02234, + "grad_norm": 0.43081918358802795, + "grad_norm_var": 0.025299001444115215, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.6137781143188477, + "loss/hidden": 0.0, + "loss/logits": 0.2038256861269474, + "loss/reg": 0.6572914123535156, + "step": 2234 + }, + { + "epoch": 0.02235, + "grad_norm": 0.4186486005783081, + "grad_norm_var": 0.025131390470558405, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.8289464116096497, + "loss/hidden": 0.0, + "loss/logits": 0.19643982127308846, + "loss/reg": 0.6575180888175964, + "step": 2235 + }, + { + "epoch": 0.02236, + "grad_norm": 0.44940683245658875, + "grad_norm_var": 0.024930911786940277, + "learning_rate": 5e-05, + "loss": 0.1932, + "loss/crossentropy": 2.7464110255241394, + "loss/hidden": 0.0, + "loss/logits": 0.1932026483118534, + "loss/reg": 0.6572230458259583, + "step": 2236 + }, + { + "epoch": 0.02237, + "grad_norm": 0.48707953095436096, + "grad_norm_var": 0.024774290295996322, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.7238675951957703, + "loss/hidden": 0.0, + "loss/logits": 0.20767732337117195, + "loss/reg": 0.6567288041114807, + "step": 2237 + }, + { + "epoch": 0.02238, + "grad_norm": 0.6050520539283752, + "grad_norm_var": 0.025448875352469886, + "learning_rate": 5e-05, + "loss": 0.1999, + "loss/crossentropy": 2.729361414909363, + "loss/hidden": 0.0, + "loss/logits": 0.1998637430369854, + "loss/reg": 0.6562169790267944, + "step": 2238 + }, + { + "epoch": 0.02239, + "grad_norm": 0.47344842553138733, + "grad_norm_var": 0.025101135718260267, + "learning_rate": 5e-05, + "loss": 0.225, + "loss/crossentropy": 2.778840482234955, + "loss/hidden": 0.0, + "loss/logits": 0.22495561838150024, + "loss/reg": 0.6560289263725281, + "step": 2239 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4637131094932556, + "grad_norm_var": 0.024576129626628727, + "learning_rate": 5e-05, + "loss": 0.1946, + "loss/crossentropy": 2.863752841949463, + "loss/hidden": 0.0, + "loss/logits": 0.19456767290830612, + "loss/reg": 0.6564752459526062, + "step": 2240 + }, + { + "epoch": 0.02241, + "grad_norm": 0.4673026204109192, + "grad_norm_var": 0.02440578544524078, + "learning_rate": 5e-05, + "loss": 0.1915, + "loss/crossentropy": 2.740968406200409, + "loss/hidden": 0.0, + "loss/logits": 0.1915142983198166, + "loss/reg": 0.6564088463783264, + "step": 2241 + }, + { + "epoch": 0.02242, + "grad_norm": 0.44082167744636536, + "grad_norm_var": 0.024476864289205702, + "learning_rate": 5e-05, + "loss": 0.1904, + "loss/crossentropy": 2.8120031356811523, + "loss/hidden": 0.0, + "loss/logits": 0.19039550051093102, + "loss/reg": 0.6564489603042603, + "step": 2242 + }, + { + "epoch": 0.02243, + "grad_norm": 0.4410727024078369, + "grad_norm_var": 0.0020858763568367394, + "learning_rate": 5e-05, + "loss": 0.1915, + "loss/crossentropy": 2.859192371368408, + "loss/hidden": 0.0, + "loss/logits": 0.1914524808526039, + "loss/reg": 0.6563150882720947, + "step": 2243 + }, + { + "epoch": 0.02244, + "grad_norm": 0.597095787525177, + "grad_norm_var": 0.002998034091058991, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.8128835558891296, + "loss/hidden": 0.0, + "loss/logits": 0.20924413949251175, + "loss/reg": 0.655822217464447, + "step": 2244 + }, + { + "epoch": 0.02245, + "grad_norm": 0.5483686923980713, + "grad_norm_var": 0.003109889656134198, + "learning_rate": 5e-05, + "loss": 0.2488, + "loss/crossentropy": 2.8830860257148743, + "loss/hidden": 0.0, + "loss/logits": 0.24882910400629044, + "loss/reg": 0.655737578868866, + "step": 2245 + }, + { + "epoch": 0.02246, + "grad_norm": 0.4254612624645233, + "grad_norm_var": 0.00320473782891858, + "learning_rate": 5e-05, + "loss": 0.1986, + "loss/crossentropy": 2.7620843052864075, + "loss/hidden": 0.0, + "loss/logits": 0.1985759325325489, + "loss/reg": 0.6556224226951599, + "step": 2246 + }, + { + "epoch": 0.02247, + "grad_norm": 0.4196719229221344, + "grad_norm_var": 0.003433900505547275, + "learning_rate": 5e-05, + "loss": 0.1972, + "loss/crossentropy": 2.6765187978744507, + "loss/hidden": 0.0, + "loss/logits": 0.19721293821930885, + "loss/reg": 0.6555527448654175, + "step": 2247 + }, + { + "epoch": 0.02248, + "grad_norm": 0.4392344355583191, + "grad_norm_var": 0.003499700408750217, + "learning_rate": 5e-05, + "loss": 0.2202, + "loss/crossentropy": 2.8308944702148438, + "loss/hidden": 0.0, + "loss/logits": 0.22015779465436935, + "loss/reg": 0.6554519534111023, + "step": 2248 + }, + { + "epoch": 0.02249, + "grad_norm": 0.41292694211006165, + "grad_norm_var": 0.0037259508605585355, + "learning_rate": 5e-05, + "loss": 0.2005, + "loss/crossentropy": 2.852585017681122, + "loss/hidden": 0.0, + "loss/logits": 0.2004982978105545, + "loss/reg": 0.6551834344863892, + "step": 2249 + }, + { + "epoch": 0.0225, + "grad_norm": 0.44400349259376526, + "grad_norm_var": 0.0036679251207930807, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.8694820404052734, + "loss/hidden": 0.0, + "loss/logits": 0.2060687355697155, + "loss/reg": 0.6543809771537781, + "step": 2250 + }, + { + "epoch": 0.02251, + "grad_norm": 0.4148118495941162, + "grad_norm_var": 0.003695540331417376, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.8827332854270935, + "loss/hidden": 0.0, + "loss/logits": 0.20171796530485153, + "loss/reg": 0.6540721654891968, + "step": 2251 + }, + { + "epoch": 0.02252, + "grad_norm": 0.41835105419158936, + "grad_norm_var": 0.0038435419106991517, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.9073886275291443, + "loss/hidden": 0.0, + "loss/logits": 0.19753522798419, + "loss/reg": 0.6539055109024048, + "step": 2252 + }, + { + "epoch": 0.02253, + "grad_norm": 0.6406252980232239, + "grad_norm_var": 0.005694344442027818, + "learning_rate": 5e-05, + "loss": 0.2393, + "loss/crossentropy": 2.897995173931122, + "loss/hidden": 0.0, + "loss/logits": 0.23933952674269676, + "loss/reg": 0.6533620357513428, + "step": 2253 + }, + { + "epoch": 0.02254, + "grad_norm": 0.4354853630065918, + "grad_norm_var": 0.004624489753011914, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 3.0027363300323486, + "loss/hidden": 0.0, + "loss/logits": 0.2046515829861164, + "loss/reg": 0.6530390977859497, + "step": 2254 + }, + { + "epoch": 0.02255, + "grad_norm": 0.46653592586517334, + "grad_norm_var": 0.004622131644458434, + "learning_rate": 5e-05, + "loss": 0.2056, + "loss/crossentropy": 2.866030991077423, + "loss/hidden": 0.0, + "loss/logits": 0.2055933140218258, + "loss/reg": 0.6522712707519531, + "step": 2255 + }, + { + "epoch": 0.02256, + "grad_norm": 0.4899330139160156, + "grad_norm_var": 0.004652847584934957, + "learning_rate": 5e-05, + "loss": 0.2132, + "loss/crossentropy": 2.793207585811615, + "loss/hidden": 0.0, + "loss/logits": 0.2132275551557541, + "loss/reg": 0.6518944501876831, + "step": 2256 + }, + { + "epoch": 0.02257, + "grad_norm": 0.5109931826591492, + "grad_norm_var": 0.00476310039218234, + "learning_rate": 5e-05, + "loss": 0.2282, + "loss/crossentropy": 2.660158932209015, + "loss/hidden": 0.0, + "loss/logits": 0.22820783033967018, + "loss/reg": 0.6515896916389465, + "step": 2257 + }, + { + "epoch": 0.02258, + "grad_norm": 0.5232426524162292, + "grad_norm_var": 0.00484958166390399, + "learning_rate": 5e-05, + "loss": 0.2106, + "loss/crossentropy": 2.913247287273407, + "loss/hidden": 0.0, + "loss/logits": 0.21055587381124496, + "loss/reg": 0.6516128182411194, + "step": 2258 + }, + { + "epoch": 0.02259, + "grad_norm": 0.48126721382141113, + "grad_norm_var": 0.004759414822563457, + "learning_rate": 5e-05, + "loss": 0.2225, + "loss/crossentropy": 2.699722707271576, + "loss/hidden": 0.0, + "loss/logits": 0.22250869125127792, + "loss/reg": 0.6510916948318481, + "step": 2259 + }, + { + "epoch": 0.0226, + "grad_norm": 0.51381516456604, + "grad_norm_var": 0.003884329908351569, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.7305811643600464, + "loss/hidden": 0.0, + "loss/logits": 0.20461101084947586, + "loss/reg": 0.6511334180831909, + "step": 2260 + }, + { + "epoch": 0.02261, + "grad_norm": 0.95738685131073, + "grad_norm_var": 0.01839359399902786, + "learning_rate": 5e-05, + "loss": 0.2867, + "loss/crossentropy": 2.94184148311615, + "loss/hidden": 0.0, + "loss/logits": 0.2866925150156021, + "loss/reg": 0.6505939364433289, + "step": 2261 + }, + { + "epoch": 0.02262, + "grad_norm": 0.45802873373031616, + "grad_norm_var": 0.018137909661330668, + "learning_rate": 5e-05, + "loss": 0.2202, + "loss/crossentropy": 2.7122802138328552, + "loss/hidden": 0.0, + "loss/logits": 0.22022777423262596, + "loss/reg": 0.6503925323486328, + "step": 2262 + }, + { + "epoch": 0.02263, + "grad_norm": 0.6308577656745911, + "grad_norm_var": 0.01861718095681959, + "learning_rate": 5e-05, + "loss": 0.2362, + "loss/crossentropy": 2.843177318572998, + "loss/hidden": 0.0, + "loss/logits": 0.23622819036245346, + "loss/reg": 0.6501750946044922, + "step": 2263 + }, + { + "epoch": 0.02264, + "grad_norm": 0.4880024492740631, + "grad_norm_var": 0.01827418419110881, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.8725591897964478, + "loss/hidden": 0.0, + "loss/logits": 0.19932042807340622, + "loss/reg": 0.6497499346733093, + "step": 2264 + }, + { + "epoch": 0.02265, + "grad_norm": 0.4666669964790344, + "grad_norm_var": 0.01770257565261392, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.758443057537079, + "loss/hidden": 0.0, + "loss/logits": 0.20261453092098236, + "loss/reg": 0.6491585969924927, + "step": 2265 + }, + { + "epoch": 0.02266, + "grad_norm": 0.4420171082019806, + "grad_norm_var": 0.01772328121050628, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.8608362078666687, + "loss/hidden": 0.0, + "loss/logits": 0.19936417788267136, + "loss/reg": 0.6493498682975769, + "step": 2266 + }, + { + "epoch": 0.02267, + "grad_norm": 0.461956262588501, + "grad_norm_var": 0.017193909195642697, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.8266310691833496, + "loss/hidden": 0.0, + "loss/logits": 0.20099755376577377, + "loss/reg": 0.6488476991653442, + "step": 2267 + }, + { + "epoch": 0.02268, + "grad_norm": 0.514008641242981, + "grad_norm_var": 0.016417395766547438, + "learning_rate": 5e-05, + "loss": 0.2426, + "loss/crossentropy": 2.718923807144165, + "loss/hidden": 0.0, + "loss/logits": 0.24257025495171547, + "loss/reg": 0.6486131548881531, + "step": 2268 + }, + { + "epoch": 0.02269, + "grad_norm": 2.9227592945098877, + "grad_norm_var": 0.37557179205689967, + "learning_rate": 5e-05, + "loss": 0.2929, + "loss/crossentropy": 2.821976602077484, + "loss/hidden": 0.0, + "loss/logits": 0.2928861491382122, + "loss/reg": 0.6476935148239136, + "step": 2269 + }, + { + "epoch": 0.0227, + "grad_norm": 0.4250056743621826, + "grad_norm_var": 0.3759100928660886, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.8084466457366943, + "loss/hidden": 0.0, + "loss/logits": 0.1987301968038082, + "loss/reg": 0.6472914814949036, + "step": 2270 + }, + { + "epoch": 0.02271, + "grad_norm": 0.4536933898925781, + "grad_norm_var": 0.37627227604680213, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.725661337375641, + "loss/hidden": 0.0, + "loss/logits": 0.21139056608080864, + "loss/reg": 0.6460861563682556, + "step": 2271 + }, + { + "epoch": 0.02272, + "grad_norm": 0.4636286497116089, + "grad_norm_var": 0.3769513646169643, + "learning_rate": 5e-05, + "loss": 0.2239, + "loss/crossentropy": 2.771003305912018, + "loss/hidden": 0.0, + "loss/logits": 0.22385068237781525, + "loss/reg": 0.6454333662986755, + "step": 2272 + }, + { + "epoch": 0.02273, + "grad_norm": 0.4696909785270691, + "grad_norm_var": 0.3779313301878248, + "learning_rate": 5e-05, + "loss": 0.2193, + "loss/crossentropy": 2.868667781352997, + "loss/hidden": 0.0, + "loss/logits": 0.21933767199516296, + "loss/reg": 0.644971489906311, + "step": 2273 + }, + { + "epoch": 0.02274, + "grad_norm": 0.8008906841278076, + "grad_norm_var": 0.37742743336964557, + "learning_rate": 5e-05, + "loss": 0.2436, + "loss/crossentropy": 2.828441619873047, + "loss/hidden": 0.0, + "loss/logits": 0.24362245202064514, + "loss/reg": 0.6448448896408081, + "step": 2274 + }, + { + "epoch": 0.02275, + "grad_norm": 0.49082106351852417, + "grad_norm_var": 0.37717443576090054, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.73420113325119, + "loss/hidden": 0.0, + "loss/logits": 0.20925306156277657, + "loss/reg": 0.6451004147529602, + "step": 2275 + }, + { + "epoch": 0.02276, + "grad_norm": 0.6399700045585632, + "grad_norm_var": 0.3752904963869677, + "learning_rate": 5e-05, + "loss": 0.2347, + "loss/crossentropy": 3.006948947906494, + "loss/hidden": 0.0, + "loss/logits": 0.23469094559550285, + "loss/reg": 0.6452967524528503, + "step": 2276 + }, + { + "epoch": 0.02277, + "grad_norm": 0.46703338623046875, + "grad_norm_var": 0.37302198197604, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.696506142616272, + "loss/hidden": 0.0, + "loss/logits": 0.19578645378351212, + "loss/reg": 0.6449645757675171, + "step": 2277 + }, + { + "epoch": 0.02278, + "grad_norm": 0.41870376467704773, + "grad_norm_var": 0.3741891171085768, + "learning_rate": 5e-05, + "loss": 0.1872, + "loss/crossentropy": 2.8157997131347656, + "loss/hidden": 0.0, + "loss/logits": 0.1872161142528057, + "loss/reg": 0.6446811556816101, + "step": 2278 + }, + { + "epoch": 0.02279, + "grad_norm": 0.45508822798728943, + "grad_norm_var": 0.37679673641093026, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.8438076972961426, + "loss/hidden": 0.0, + "loss/logits": 0.19626940414309502, + "loss/reg": 0.6446341872215271, + "step": 2279 + }, + { + "epoch": 0.0228, + "grad_norm": 0.8779433369636536, + "grad_norm_var": 0.3779427053181054, + "learning_rate": 5e-05, + "loss": 0.2557, + "loss/crossentropy": 2.989022374153137, + "loss/hidden": 0.0, + "loss/logits": 0.2556800991296768, + "loss/reg": 0.644194483757019, + "step": 2280 + }, + { + "epoch": 0.02281, + "grad_norm": 0.46257394552230835, + "grad_norm_var": 0.37805642062235173, + "learning_rate": 5e-05, + "loss": 0.2006, + "loss/crossentropy": 2.7652095556259155, + "loss/hidden": 0.0, + "loss/logits": 0.20064184814691544, + "loss/reg": 0.6444392204284668, + "step": 2281 + }, + { + "epoch": 0.02282, + "grad_norm": 0.5013570785522461, + "grad_norm_var": 0.3764500575080739, + "learning_rate": 5e-05, + "loss": 0.214, + "loss/crossentropy": 2.803834617137909, + "loss/hidden": 0.0, + "loss/logits": 0.21403327584266663, + "loss/reg": 0.6442294716835022, + "step": 2282 + }, + { + "epoch": 0.02283, + "grad_norm": 0.4890649914741516, + "grad_norm_var": 0.3757202659671127, + "learning_rate": 5e-05, + "loss": 0.2194, + "loss/crossentropy": 2.742681860923767, + "loss/hidden": 0.0, + "loss/logits": 0.21939171105623245, + "loss/reg": 0.6439263224601746, + "step": 2283 + }, + { + "epoch": 0.02284, + "grad_norm": 0.6570902466773987, + "grad_norm_var": 0.37386618732501925, + "learning_rate": 5e-05, + "loss": 0.2578, + "loss/crossentropy": 2.809187114238739, + "loss/hidden": 0.0, + "loss/logits": 0.25776413455605507, + "loss/reg": 0.6434304714202881, + "step": 2284 + }, + { + "epoch": 0.02285, + "grad_norm": 0.48095011711120605, + "grad_norm_var": 0.0186792983891932, + "learning_rate": 5e-05, + "loss": 0.2235, + "loss/crossentropy": 2.8083168268203735, + "loss/hidden": 0.0, + "loss/logits": 0.22352950647473335, + "loss/reg": 0.6429867148399353, + "step": 2285 + }, + { + "epoch": 0.02286, + "grad_norm": 0.4240189790725708, + "grad_norm_var": 0.01869377662112335, + "learning_rate": 5e-05, + "loss": 0.2133, + "loss/crossentropy": 2.8698734641075134, + "loss/hidden": 0.0, + "loss/logits": 0.2133381925523281, + "loss/reg": 0.6421407461166382, + "step": 2286 + }, + { + "epoch": 0.02287, + "grad_norm": 0.42852783203125, + "grad_norm_var": 0.019004606133941948, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.848720133304596, + "loss/hidden": 0.0, + "loss/logits": 0.20724551379680634, + "loss/reg": 0.6415654420852661, + "step": 2287 + }, + { + "epoch": 0.02288, + "grad_norm": 0.4214157462120056, + "grad_norm_var": 0.019506198028654238, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.7180134654045105, + "loss/hidden": 0.0, + "loss/logits": 0.19436291232705116, + "loss/reg": 0.6408950686454773, + "step": 2288 + }, + { + "epoch": 0.02289, + "grad_norm": 0.5080603957176208, + "grad_norm_var": 0.019288031409682545, + "learning_rate": 5e-05, + "loss": 0.2253, + "loss/crossentropy": 2.7465009093284607, + "loss/hidden": 0.0, + "loss/logits": 0.22526658326387405, + "loss/reg": 0.6404685378074646, + "step": 2289 + }, + { + "epoch": 0.0229, + "grad_norm": 0.393258273601532, + "grad_norm_var": 0.015097916490568863, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.768847942352295, + "loss/hidden": 0.0, + "loss/logits": 0.18824293091893196, + "loss/reg": 0.6401697397232056, + "step": 2290 + }, + { + "epoch": 0.02291, + "grad_norm": 0.5133631229400635, + "grad_norm_var": 0.01508031960896757, + "learning_rate": 5e-05, + "loss": 0.234, + "loss/crossentropy": 2.7502260208129883, + "loss/hidden": 0.0, + "loss/logits": 0.23396670073270798, + "loss/reg": 0.6396507024765015, + "step": 2291 + }, + { + "epoch": 0.02292, + "grad_norm": 0.42501306533813477, + "grad_norm_var": 0.01420450697436575, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.820326089859009, + "loss/hidden": 0.0, + "loss/logits": 0.20486219227313995, + "loss/reg": 0.6389489769935608, + "step": 2292 + }, + { + "epoch": 0.02293, + "grad_norm": 0.42067885398864746, + "grad_norm_var": 0.014512991489169558, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.6890400052070618, + "loss/hidden": 0.0, + "loss/logits": 0.1978270560503006, + "loss/reg": 0.6384695768356323, + "step": 2293 + }, + { + "epoch": 0.02294, + "grad_norm": 0.4248322546482086, + "grad_norm_var": 0.014455185321162967, + "learning_rate": 5e-05, + "loss": 0.2005, + "loss/crossentropy": 2.8716264963150024, + "loss/hidden": 0.0, + "loss/logits": 0.20054928585886955, + "loss/reg": 0.6384212970733643, + "step": 2294 + }, + { + "epoch": 0.02295, + "grad_norm": 0.43125322461128235, + "grad_norm_var": 0.014610229504596006, + "learning_rate": 5e-05, + "loss": 0.2121, + "loss/crossentropy": 2.7577659487724304, + "loss/hidden": 0.0, + "loss/logits": 0.21213258430361748, + "loss/reg": 0.638514518737793, + "step": 2295 + }, + { + "epoch": 0.02296, + "grad_norm": 0.4548972249031067, + "grad_norm_var": 0.003981738543298855, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.7305163741111755, + "loss/hidden": 0.0, + "loss/logits": 0.20929726213216782, + "loss/reg": 0.6382765769958496, + "step": 2296 + }, + { + "epoch": 0.02297, + "grad_norm": 0.47402554750442505, + "grad_norm_var": 0.003986578256624599, + "learning_rate": 5e-05, + "loss": 0.2262, + "loss/crossentropy": 2.7139264345169067, + "loss/hidden": 0.0, + "loss/logits": 0.2262122519314289, + "loss/reg": 0.6376994252204895, + "step": 2297 + }, + { + "epoch": 0.02298, + "grad_norm": 0.5048164129257202, + "grad_norm_var": 0.004003870642095701, + "learning_rate": 5e-05, + "loss": 0.2139, + "loss/crossentropy": 2.735919415950775, + "loss/hidden": 0.0, + "loss/logits": 0.21385344117879868, + "loss/reg": 0.6378376483917236, + "step": 2298 + }, + { + "epoch": 0.02299, + "grad_norm": 0.4408647119998932, + "grad_norm_var": 0.003998941569542528, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.7567177414894104, + "loss/hidden": 0.0, + "loss/logits": 0.20511357486248016, + "loss/reg": 0.6380979418754578, + "step": 2299 + }, + { + "epoch": 0.023, + "grad_norm": 0.4789188504219055, + "grad_norm_var": 0.0013648371387985987, + "learning_rate": 5e-05, + "loss": 0.2255, + "loss/crossentropy": 2.9137268662452698, + "loss/hidden": 0.0, + "loss/logits": 0.22550494223833084, + "loss/reg": 0.638322651386261, + "step": 2300 + }, + { + "epoch": 0.02301, + "grad_norm": 0.41670024394989014, + "grad_norm_var": 0.001371030177625221, + "learning_rate": 5e-05, + "loss": 0.1906, + "loss/crossentropy": 2.8916245698928833, + "loss/hidden": 0.0, + "loss/logits": 0.19059956818819046, + "loss/reg": 0.6391466856002808, + "step": 2301 + }, + { + "epoch": 0.02302, + "grad_norm": 0.45289623737335205, + "grad_norm_var": 0.0013325845270320694, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.7781118750572205, + "loss/hidden": 0.0, + "loss/logits": 0.21348335593938828, + "loss/reg": 0.6398518681526184, + "step": 2302 + }, + { + "epoch": 0.02303, + "grad_norm": 0.47499120235443115, + "grad_norm_var": 0.0013385467809591749, + "learning_rate": 5e-05, + "loss": 0.2057, + "loss/crossentropy": 2.9010541439056396, + "loss/hidden": 0.0, + "loss/logits": 0.205675158649683, + "loss/reg": 0.6403761506080627, + "step": 2303 + }, + { + "epoch": 0.02304, + "grad_norm": 0.8079317808151245, + "grad_norm_var": 0.009086701420856906, + "learning_rate": 5e-05, + "loss": 0.2331, + "loss/crossentropy": 2.7909732460975647, + "loss/hidden": 0.0, + "loss/logits": 0.23312172666192055, + "loss/reg": 0.641444206237793, + "step": 2304 + }, + { + "epoch": 0.02305, + "grad_norm": 0.4529048204421997, + "grad_norm_var": 0.00904404864292042, + "learning_rate": 5e-05, + "loss": 0.2156, + "loss/crossentropy": 2.8021045923233032, + "loss/hidden": 0.0, + "loss/logits": 0.21561753377318382, + "loss/reg": 0.6419049501419067, + "step": 2305 + }, + { + "epoch": 0.02306, + "grad_norm": 0.4624601900577545, + "grad_norm_var": 0.008607961765128503, + "learning_rate": 5e-05, + "loss": 0.2131, + "loss/crossentropy": 2.7249650359153748, + "loss/hidden": 0.0, + "loss/logits": 0.2131466008722782, + "loss/reg": 0.6419620513916016, + "step": 2306 + }, + { + "epoch": 0.02307, + "grad_norm": 0.4756065011024475, + "grad_norm_var": 0.008515430492197378, + "learning_rate": 5e-05, + "loss": 0.1976, + "loss/crossentropy": 2.8718077540397644, + "loss/hidden": 0.0, + "loss/logits": 0.19755731150507927, + "loss/reg": 0.6424854397773743, + "step": 2307 + }, + { + "epoch": 0.02308, + "grad_norm": 0.4885087311267853, + "grad_norm_var": 0.008344857543338273, + "learning_rate": 5e-05, + "loss": 0.2252, + "loss/crossentropy": 2.8741283416748047, + "loss/hidden": 0.0, + "loss/logits": 0.22517314553260803, + "loss/reg": 0.6421217918395996, + "step": 2308 + }, + { + "epoch": 0.02309, + "grad_norm": 0.4277467727661133, + "grad_norm_var": 0.008293119451190257, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.795592427253723, + "loss/hidden": 0.0, + "loss/logits": 0.20493775233626366, + "loss/reg": 0.6426491141319275, + "step": 2309 + }, + { + "epoch": 0.0231, + "grad_norm": 0.5211502313613892, + "grad_norm_var": 0.008172999851816343, + "learning_rate": 5e-05, + "loss": 0.2117, + "loss/crossentropy": 2.8724001049995422, + "loss/hidden": 0.0, + "loss/logits": 0.21168380975723267, + "loss/reg": 0.6429906487464905, + "step": 2310 + }, + { + "epoch": 0.02311, + "grad_norm": 0.7134928703308105, + "grad_norm_var": 0.011115762704886711, + "learning_rate": 5e-05, + "loss": 0.2385, + "loss/crossentropy": 2.8153502345085144, + "loss/hidden": 0.0, + "loss/logits": 0.23845964670181274, + "loss/reg": 0.6427722573280334, + "step": 2311 + }, + { + "epoch": 0.02312, + "grad_norm": 0.4354418218135834, + "grad_norm_var": 0.01126418671923588, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.8604108095169067, + "loss/hidden": 0.0, + "loss/logits": 0.20278507471084595, + "loss/reg": 0.6428348422050476, + "step": 2312 + }, + { + "epoch": 0.02313, + "grad_norm": 0.4990330636501312, + "grad_norm_var": 0.011210734863325375, + "learning_rate": 5e-05, + "loss": 0.2152, + "loss/crossentropy": 2.904971122741699, + "loss/hidden": 0.0, + "loss/logits": 0.21523715928196907, + "loss/reg": 0.6428065896034241, + "step": 2313 + }, + { + "epoch": 0.02314, + "grad_norm": 0.5782985687255859, + "grad_norm_var": 0.011562661931197211, + "learning_rate": 5e-05, + "loss": 0.2327, + "loss/crossentropy": 2.68763530254364, + "loss/hidden": 0.0, + "loss/logits": 0.23272082582116127, + "loss/reg": 0.64236980676651, + "step": 2314 + }, + { + "epoch": 0.02315, + "grad_norm": 0.473962664604187, + "grad_norm_var": 0.011335147553317585, + "learning_rate": 5e-05, + "loss": 0.218, + "loss/crossentropy": 2.6716864705085754, + "loss/hidden": 0.0, + "loss/logits": 0.21799590811133385, + "loss/reg": 0.6420884132385254, + "step": 2315 + }, + { + "epoch": 0.02316, + "grad_norm": 0.4475126564502716, + "grad_norm_var": 0.011526958108506144, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.7679070830345154, + "loss/hidden": 0.0, + "loss/logits": 0.2110486775636673, + "loss/reg": 0.6415696144104004, + "step": 2316 + }, + { + "epoch": 0.02317, + "grad_norm": 0.45600003004074097, + "grad_norm_var": 0.01114487050420344, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.8766338229179382, + "loss/hidden": 0.0, + "loss/logits": 0.20742720738053322, + "loss/reg": 0.6410428285598755, + "step": 2317 + }, + { + "epoch": 0.02318, + "grad_norm": 0.4907890558242798, + "grad_norm_var": 0.01094359578672709, + "learning_rate": 5e-05, + "loss": 0.1959, + "loss/crossentropy": 2.8390864729881287, + "loss/hidden": 0.0, + "loss/logits": 0.19589436426758766, + "loss/reg": 0.6408200263977051, + "step": 2318 + }, + { + "epoch": 0.02319, + "grad_norm": 1.1138983964920044, + "grad_norm_var": 0.03322991517530266, + "learning_rate": 5e-05, + "loss": 0.3027, + "loss/crossentropy": 2.886744976043701, + "loss/hidden": 0.0, + "loss/logits": 0.3026874288916588, + "loss/reg": 0.6402056217193604, + "step": 2319 + }, + { + "epoch": 0.0232, + "grad_norm": 0.4708307981491089, + "grad_norm_var": 0.028864701939447186, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.7965083718299866, + "loss/hidden": 0.0, + "loss/logits": 0.18961480259895325, + "loss/reg": 0.639504075050354, + "step": 2320 + }, + { + "epoch": 0.02321, + "grad_norm": 0.6505262851715088, + "grad_norm_var": 0.029228656351045615, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.834762454032898, + "loss/hidden": 0.0, + "loss/logits": 0.20328499004244804, + "loss/reg": 0.6393487453460693, + "step": 2321 + }, + { + "epoch": 0.02322, + "grad_norm": 0.6051056385040283, + "grad_norm_var": 0.028948055240186635, + "learning_rate": 5e-05, + "loss": 0.2199, + "loss/crossentropy": 2.6770065426826477, + "loss/hidden": 0.0, + "loss/logits": 0.21985314413905144, + "loss/reg": 0.6385535001754761, + "step": 2322 + }, + { + "epoch": 0.02323, + "grad_norm": 0.48217689990997314, + "grad_norm_var": 0.028882957805189104, + "learning_rate": 5e-05, + "loss": 0.2039, + "loss/crossentropy": 2.8045378923416138, + "loss/hidden": 0.0, + "loss/logits": 0.20390581339597702, + "loss/reg": 0.6383077502250671, + "step": 2323 + }, + { + "epoch": 0.02324, + "grad_norm": 0.4804023206233978, + "grad_norm_var": 0.028957207990246304, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.723294258117676, + "loss/hidden": 0.0, + "loss/logits": 0.20989179611206055, + "loss/reg": 0.6378633379936218, + "step": 2324 + }, + { + "epoch": 0.02325, + "grad_norm": 0.4616905450820923, + "grad_norm_var": 0.02846280523083061, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.670876979827881, + "loss/hidden": 0.0, + "loss/logits": 0.2036052718758583, + "loss/reg": 0.6375999450683594, + "step": 2325 + }, + { + "epoch": 0.02326, + "grad_norm": 0.4369848370552063, + "grad_norm_var": 0.029285626186562993, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.7181246280670166, + "loss/hidden": 0.0, + "loss/logits": 0.21119512990117073, + "loss/reg": 0.6367100477218628, + "step": 2326 + }, + { + "epoch": 0.02327, + "grad_norm": 0.4409303069114685, + "grad_norm_var": 0.027978415570877265, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.720276415348053, + "loss/hidden": 0.0, + "loss/logits": 0.21099728718400002, + "loss/reg": 0.63608318567276, + "step": 2327 + }, + { + "epoch": 0.02328, + "grad_norm": 0.43456465005874634, + "grad_norm_var": 0.02798984141665279, + "learning_rate": 5e-05, + "loss": 0.2122, + "loss/crossentropy": 2.7633227109909058, + "loss/hidden": 0.0, + "loss/logits": 0.2121913656592369, + "loss/reg": 0.6359296441078186, + "step": 2328 + }, + { + "epoch": 0.02329, + "grad_norm": 0.4653891623020172, + "grad_norm_var": 0.028211472567838607, + "learning_rate": 5e-05, + "loss": 0.2256, + "loss/crossentropy": 2.627290904521942, + "loss/hidden": 0.0, + "loss/logits": 0.22563253715634346, + "loss/reg": 0.635438084602356, + "step": 2329 + }, + { + "epoch": 0.0233, + "grad_norm": 0.4926622211933136, + "grad_norm_var": 0.028124807387216188, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.8919323682785034, + "loss/hidden": 0.0, + "loss/logits": 0.2008451260626316, + "loss/reg": 0.6348303556442261, + "step": 2330 + }, + { + "epoch": 0.02331, + "grad_norm": 0.40452897548675537, + "grad_norm_var": 0.02890059954464316, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.892720401287079, + "loss/hidden": 0.0, + "loss/logits": 0.20395006239414215, + "loss/reg": 0.6345018744468689, + "step": 2331 + }, + { + "epoch": 0.02332, + "grad_norm": 0.4458627700805664, + "grad_norm_var": 0.028916908182268955, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.7548307180404663, + "loss/hidden": 0.0, + "loss/logits": 0.20156589522957802, + "loss/reg": 0.6343698501586914, + "step": 2332 + }, + { + "epoch": 0.02333, + "grad_norm": 0.8211351037025452, + "grad_norm_var": 0.034096259866480676, + "learning_rate": 5e-05, + "loss": 0.2516, + "loss/crossentropy": 2.9307456612586975, + "loss/hidden": 0.0, + "loss/logits": 0.25161348283290863, + "loss/reg": 0.6339602470397949, + "step": 2333 + }, + { + "epoch": 0.02334, + "grad_norm": 0.5167918801307678, + "grad_norm_var": 0.033955447662380824, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.93019437789917, + "loss/hidden": 0.0, + "loss/logits": 0.20476159080863, + "loss/reg": 0.6337273120880127, + "step": 2334 + }, + { + "epoch": 0.02335, + "grad_norm": 0.5606244206428528, + "grad_norm_var": 0.011135945001819652, + "learning_rate": 5e-05, + "loss": 0.2259, + "loss/crossentropy": 2.8020551204681396, + "loss/hidden": 0.0, + "loss/logits": 0.22587483748793602, + "loss/reg": 0.6334927678108215, + "step": 2335 + }, + { + "epoch": 0.02336, + "grad_norm": 0.5032116174697876, + "grad_norm_var": 0.011029612354715406, + "learning_rate": 5e-05, + "loss": 0.218, + "loss/crossentropy": 2.6723164319992065, + "loss/hidden": 0.0, + "loss/logits": 0.21800407767295837, + "loss/reg": 0.6330782771110535, + "step": 2336 + }, + { + "epoch": 0.02337, + "grad_norm": 0.44819605350494385, + "grad_norm_var": 0.0098689851248993, + "learning_rate": 5e-05, + "loss": 0.2189, + "loss/crossentropy": 2.7837693095207214, + "loss/hidden": 0.0, + "loss/logits": 0.21894612908363342, + "loss/reg": 0.6331006288528442, + "step": 2337 + }, + { + "epoch": 0.02338, + "grad_norm": 0.4284153878688812, + "grad_norm_var": 0.009344427206519868, + "learning_rate": 5e-05, + "loss": 0.2147, + "loss/crossentropy": 2.847710371017456, + "loss/hidden": 0.0, + "loss/logits": 0.21465658023953438, + "loss/reg": 0.6332995295524597, + "step": 2338 + }, + { + "epoch": 0.02339, + "grad_norm": 0.4317200481891632, + "grad_norm_var": 0.009549266526419376, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.877539098262787, + "loss/hidden": 0.0, + "loss/logits": 0.20283129811286926, + "loss/reg": 0.6329610347747803, + "step": 2339 + }, + { + "epoch": 0.0234, + "grad_norm": 0.5034545063972473, + "grad_norm_var": 0.009565829180528774, + "learning_rate": 5e-05, + "loss": 0.2478, + "loss/crossentropy": 2.7589054107666016, + "loss/hidden": 0.0, + "loss/logits": 0.24779587984085083, + "loss/reg": 0.6325832605361938, + "step": 2340 + }, + { + "epoch": 0.02341, + "grad_norm": 0.4617745876312256, + "grad_norm_var": 0.009565543097278351, + "learning_rate": 5e-05, + "loss": 0.1953, + "loss/crossentropy": 2.883599817752838, + "loss/hidden": 0.0, + "loss/logits": 0.19532188400626183, + "loss/reg": 0.6324228048324585, + "step": 2341 + }, + { + "epoch": 0.02342, + "grad_norm": 0.7896232008934021, + "grad_norm_var": 0.014973542137485592, + "learning_rate": 5e-05, + "loss": 0.2265, + "loss/crossentropy": 2.835487723350525, + "loss/hidden": 0.0, + "loss/logits": 0.2264947146177292, + "loss/reg": 0.6318197846412659, + "step": 2342 + }, + { + "epoch": 0.02343, + "grad_norm": 0.45438581705093384, + "grad_norm_var": 0.014862188410815472, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.8573489785194397, + "loss/hidden": 0.0, + "loss/logits": 0.20491677150130272, + "loss/reg": 0.6310228109359741, + "step": 2343 + }, + { + "epoch": 0.02344, + "grad_norm": 0.4458915889263153, + "grad_norm_var": 0.014756059339380463, + "learning_rate": 5e-05, + "loss": 0.212, + "loss/crossentropy": 2.786020517349243, + "loss/hidden": 0.0, + "loss/logits": 0.2120414450764656, + "loss/reg": 0.6305103302001953, + "step": 2344 + }, + { + "epoch": 0.02345, + "grad_norm": 0.4996333718299866, + "grad_norm_var": 0.014621762346612839, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.837901294231415, + "loss/hidden": 0.0, + "loss/logits": 0.2007434330880642, + "loss/reg": 0.629725456237793, + "step": 2345 + }, + { + "epoch": 0.02346, + "grad_norm": 0.5147289633750916, + "grad_norm_var": 0.0145923739613341, + "learning_rate": 5e-05, + "loss": 0.2462, + "loss/crossentropy": 2.852295398712158, + "loss/hidden": 0.0, + "loss/logits": 0.2461700476706028, + "loss/reg": 0.6294355392456055, + "step": 2346 + }, + { + "epoch": 0.02347, + "grad_norm": 0.4098435938358307, + "grad_norm_var": 0.014516301619381741, + "learning_rate": 5e-05, + "loss": 0.1951, + "loss/crossentropy": 2.982568860054016, + "loss/hidden": 0.0, + "loss/logits": 0.1951267048716545, + "loss/reg": 0.6292477250099182, + "step": 2347 + }, + { + "epoch": 0.02348, + "grad_norm": 0.6898613572120667, + "grad_norm_var": 0.015997578029016055, + "learning_rate": 5e-05, + "loss": 0.2407, + "loss/crossentropy": 2.991710662841797, + "loss/hidden": 0.0, + "loss/logits": 0.24073050171136856, + "loss/reg": 0.6285145878791809, + "step": 2348 + }, + { + "epoch": 0.02349, + "grad_norm": 0.5064083337783813, + "grad_norm_var": 0.009969460451236204, + "learning_rate": 5e-05, + "loss": 0.2338, + "loss/crossentropy": 2.698013126850128, + "loss/hidden": 0.0, + "loss/logits": 0.2337537296116352, + "loss/reg": 0.627836287021637, + "step": 2349 + }, + { + "epoch": 0.0235, + "grad_norm": 0.41762736439704895, + "grad_norm_var": 0.010498030883773667, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.857945740222931, + "loss/hidden": 0.0, + "loss/logits": 0.20263588801026344, + "loss/reg": 0.6277275681495667, + "step": 2350 + }, + { + "epoch": 0.02351, + "grad_norm": 0.4151814877986908, + "grad_norm_var": 0.010723747987946612, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.796386420726776, + "loss/hidden": 0.0, + "loss/logits": 0.20130639895796776, + "loss/reg": 0.6276928782463074, + "step": 2351 + }, + { + "epoch": 0.02352, + "grad_norm": 0.4268440008163452, + "grad_norm_var": 0.01100460805175422, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.7920687198638916, + "loss/hidden": 0.0, + "loss/logits": 0.21190569549798965, + "loss/reg": 0.6281136274337769, + "step": 2352 + }, + { + "epoch": 0.02353, + "grad_norm": 0.4252191483974457, + "grad_norm_var": 0.011166361556820058, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.777624487876892, + "loss/hidden": 0.0, + "loss/logits": 0.20329252630472183, + "loss/reg": 0.628277599811554, + "step": 2353 + }, + { + "epoch": 0.02354, + "grad_norm": 0.4438311457633972, + "grad_norm_var": 0.011057121852057473, + "learning_rate": 5e-05, + "loss": 0.1945, + "loss/crossentropy": 2.8656126260757446, + "loss/hidden": 0.0, + "loss/logits": 0.1945394165813923, + "loss/reg": 0.6287229657173157, + "step": 2354 + }, + { + "epoch": 0.02355, + "grad_norm": 0.514130711555481, + "grad_norm_var": 0.010843933864936157, + "learning_rate": 5e-05, + "loss": 0.2481, + "loss/crossentropy": 2.72928386926651, + "loss/hidden": 0.0, + "loss/logits": 0.24811574444174767, + "loss/reg": 0.6288861632347107, + "step": 2355 + }, + { + "epoch": 0.02356, + "grad_norm": 0.4875626266002655, + "grad_norm_var": 0.01084159725215567, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.6636879444122314, + "loss/hidden": 0.0, + "loss/logits": 0.2060905285179615, + "loss/reg": 0.6288546919822693, + "step": 2356 + }, + { + "epoch": 0.02357, + "grad_norm": 0.42352166771888733, + "grad_norm_var": 0.01109695168538372, + "learning_rate": 5e-05, + "loss": 0.2039, + "loss/crossentropy": 2.830349326133728, + "loss/hidden": 0.0, + "loss/logits": 0.2038898654282093, + "loss/reg": 0.6291917562484741, + "step": 2357 + }, + { + "epoch": 0.02358, + "grad_norm": 0.4163033366203308, + "grad_norm_var": 0.004968975014820768, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.907049238681793, + "loss/hidden": 0.0, + "loss/logits": 0.1965312324464321, + "loss/reg": 0.6289337277412415, + "step": 2358 + }, + { + "epoch": 0.02359, + "grad_norm": 0.4276890754699707, + "grad_norm_var": 0.005062642091017303, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.7691001892089844, + "loss/hidden": 0.0, + "loss/logits": 0.2112610973417759, + "loss/reg": 0.6294490694999695, + "step": 2359 + }, + { + "epoch": 0.0236, + "grad_norm": 0.40343669056892395, + "grad_norm_var": 0.005292048580926182, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.8914926052093506, + "loss/hidden": 0.0, + "loss/logits": 0.19489578157663345, + "loss/reg": 0.6290164589881897, + "step": 2360 + }, + { + "epoch": 0.02361, + "grad_norm": 0.42505788803100586, + "grad_norm_var": 0.005283972711642919, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.902132272720337, + "loss/hidden": 0.0, + "loss/logits": 0.20815201476216316, + "loss/reg": 0.6282820105552673, + "step": 2361 + }, + { + "epoch": 0.02362, + "grad_norm": 0.43126583099365234, + "grad_norm_var": 0.0051014370819990424, + "learning_rate": 5e-05, + "loss": 0.2075, + "loss/crossentropy": 2.7266613841056824, + "loss/hidden": 0.0, + "loss/logits": 0.2074590139091015, + "loss/reg": 0.6279135346412659, + "step": 2362 + }, + { + "epoch": 0.02363, + "grad_norm": 0.4245338439941406, + "grad_norm_var": 0.005028462054595891, + "learning_rate": 5e-05, + "loss": 0.212, + "loss/crossentropy": 2.7949140071868896, + "loss/hidden": 0.0, + "loss/logits": 0.2119845375418663, + "loss/reg": 0.6272363066673279, + "step": 2363 + }, + { + "epoch": 0.02364, + "grad_norm": 0.46379032731056213, + "grad_norm_var": 0.0011404652004450436, + "learning_rate": 5e-05, + "loss": 0.2303, + "loss/crossentropy": 2.925580382347107, + "loss/hidden": 0.0, + "loss/logits": 0.23027806356549263, + "loss/reg": 0.626708984375, + "step": 2364 + }, + { + "epoch": 0.02365, + "grad_norm": 0.4108589291572571, + "grad_norm_var": 0.0008749100543359571, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.8026323914527893, + "loss/hidden": 0.0, + "loss/logits": 0.19566894695162773, + "loss/reg": 0.6264309883117676, + "step": 2365 + }, + { + "epoch": 0.02366, + "grad_norm": 0.40429288148880005, + "grad_norm_var": 0.0009165608524856594, + "learning_rate": 5e-05, + "loss": 0.1884, + "loss/crossentropy": 2.769781708717346, + "loss/hidden": 0.0, + "loss/logits": 0.18835539743304253, + "loss/reg": 0.6261439323425293, + "step": 2366 + }, + { + "epoch": 0.02367, + "grad_norm": 0.49481290578842163, + "grad_norm_var": 0.0011133963445998558, + "learning_rate": 5e-05, + "loss": 0.2002, + "loss/crossentropy": 2.7985722422599792, + "loss/hidden": 0.0, + "loss/logits": 0.20017676427960396, + "loss/reg": 0.6257706880569458, + "step": 2367 + }, + { + "epoch": 0.02368, + "grad_norm": 0.4848945140838623, + "grad_norm_var": 0.0012303351620891652, + "learning_rate": 5e-05, + "loss": 0.2177, + "loss/crossentropy": 2.734506845474243, + "loss/hidden": 0.0, + "loss/logits": 0.21771074831485748, + "loss/reg": 0.6254269480705261, + "step": 2368 + }, + { + "epoch": 0.02369, + "grad_norm": 0.42091602087020874, + "grad_norm_var": 0.0012414504490849835, + "learning_rate": 5e-05, + "loss": 0.2052, + "loss/crossentropy": 2.6914102435112, + "loss/hidden": 0.0, + "loss/logits": 0.20518701151013374, + "loss/reg": 0.6253869533538818, + "step": 2369 + }, + { + "epoch": 0.0237, + "grad_norm": 0.41647371649742126, + "grad_norm_var": 0.0012826645948488, + "learning_rate": 5e-05, + "loss": 0.2021, + "loss/crossentropy": 2.77417916059494, + "loss/hidden": 0.0, + "loss/logits": 0.2021188586950302, + "loss/reg": 0.6251001358032227, + "step": 2370 + }, + { + "epoch": 0.02371, + "grad_norm": 0.45126456022262573, + "grad_norm_var": 0.0009132975176663967, + "learning_rate": 5e-05, + "loss": 0.2003, + "loss/crossentropy": 2.7900009155273438, + "loss/hidden": 0.0, + "loss/logits": 0.20031750202178955, + "loss/reg": 0.6249777674674988, + "step": 2371 + }, + { + "epoch": 0.02372, + "grad_norm": 0.4075815975666046, + "grad_norm_var": 0.0007703517618221079, + "learning_rate": 5e-05, + "loss": 0.1997, + "loss/crossentropy": 2.7443358302116394, + "loss/hidden": 0.0, + "loss/logits": 0.19966210052371025, + "loss/reg": 0.6250377893447876, + "step": 2372 + }, + { + "epoch": 0.02373, + "grad_norm": 0.5169476866722107, + "grad_norm_var": 0.001214396317878593, + "learning_rate": 5e-05, + "loss": 0.2054, + "loss/crossentropy": 2.656807601451874, + "loss/hidden": 0.0, + "loss/logits": 0.20535742118954659, + "loss/reg": 0.6247585415840149, + "step": 2373 + }, + { + "epoch": 0.02374, + "grad_norm": 0.4856709837913513, + "grad_norm_var": 0.0013190206118508148, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.8747299909591675, + "loss/hidden": 0.0, + "loss/logits": 0.20778127759695053, + "loss/reg": 0.6244027614593506, + "step": 2374 + }, + { + "epoch": 0.02375, + "grad_norm": 0.4853595793247223, + "grad_norm_var": 0.001418053618842067, + "learning_rate": 5e-05, + "loss": 0.2188, + "loss/crossentropy": 2.680226683616638, + "loss/hidden": 0.0, + "loss/logits": 0.21883514896035194, + "loss/reg": 0.6237896680831909, + "step": 2375 + }, + { + "epoch": 0.02376, + "grad_norm": 0.46014684438705444, + "grad_norm_var": 0.0013013985859490725, + "learning_rate": 5e-05, + "loss": 0.214, + "loss/crossentropy": 2.941298723220825, + "loss/hidden": 0.0, + "loss/logits": 0.21399037167429924, + "loss/reg": 0.6232113838195801, + "step": 2376 + }, + { + "epoch": 0.02377, + "grad_norm": 0.4384094178676605, + "grad_norm_var": 0.0012699328767099597, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.863753080368042, + "loss/hidden": 0.0, + "loss/logits": 0.20469892024993896, + "loss/reg": 0.6228435635566711, + "step": 2377 + }, + { + "epoch": 0.02378, + "grad_norm": 0.4748360216617584, + "grad_norm_var": 0.0012807564694280978, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.7548925280570984, + "loss/hidden": 0.0, + "loss/logits": 0.20624235272407532, + "loss/reg": 0.6221728920936584, + "step": 2378 + }, + { + "epoch": 0.02379, + "grad_norm": 0.4624280631542206, + "grad_norm_var": 0.0012289545403452729, + "learning_rate": 5e-05, + "loss": 0.2156, + "loss/crossentropy": 2.831230640411377, + "loss/hidden": 0.0, + "loss/logits": 0.21557524427771568, + "loss/reg": 0.6216285824775696, + "step": 2379 + }, + { + "epoch": 0.0238, + "grad_norm": 0.4147163927555084, + "grad_norm_var": 0.0013214152810874346, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.7272024154663086, + "loss/hidden": 0.0, + "loss/logits": 0.20263291522860527, + "loss/reg": 0.6214691400527954, + "step": 2380 + }, + { + "epoch": 0.02381, + "grad_norm": 0.4634335935115814, + "grad_norm_var": 0.001206821235458344, + "learning_rate": 5e-05, + "loss": 0.2147, + "loss/crossentropy": 2.6593633890151978, + "loss/hidden": 0.0, + "loss/logits": 0.21471738070249557, + "loss/reg": 0.6214774250984192, + "step": 2381 + }, + { + "epoch": 0.02382, + "grad_norm": 0.5388256907463074, + "grad_norm_var": 0.001425994681830156, + "learning_rate": 5e-05, + "loss": 0.2214, + "loss/crossentropy": 2.8513535857200623, + "loss/hidden": 0.0, + "loss/logits": 0.22139018774032593, + "loss/reg": 0.6215904355049133, + "step": 2382 + }, + { + "epoch": 0.02383, + "grad_norm": 0.4625283181667328, + "grad_norm_var": 0.0013565412529649933, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.864177644252777, + "loss/hidden": 0.0, + "loss/logits": 0.21016574651002884, + "loss/reg": 0.6213264465332031, + "step": 2383 + }, + { + "epoch": 0.02384, + "grad_norm": 0.4466176927089691, + "grad_norm_var": 0.0013288533749153063, + "learning_rate": 5e-05, + "loss": 0.2214, + "loss/crossentropy": 2.763855218887329, + "loss/hidden": 0.0, + "loss/logits": 0.22137384489178658, + "loss/reg": 0.6208057999610901, + "step": 2384 + }, + { + "epoch": 0.02385, + "grad_norm": 0.4485272169113159, + "grad_norm_var": 0.0012357999913707117, + "learning_rate": 5e-05, + "loss": 0.2029, + "loss/crossentropy": 2.6498608589172363, + "loss/hidden": 0.0, + "loss/logits": 0.2029251754283905, + "loss/reg": 0.6201765537261963, + "step": 2385 + }, + { + "epoch": 0.02386, + "grad_norm": 0.4225854277610779, + "grad_norm_var": 0.0012019640259396428, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.744654655456543, + "loss/hidden": 0.0, + "loss/logits": 0.20525849983096123, + "loss/reg": 0.6194151043891907, + "step": 2386 + }, + { + "epoch": 0.02387, + "grad_norm": 0.4451136887073517, + "grad_norm_var": 0.0012125116255017184, + "learning_rate": 5e-05, + "loss": 0.2166, + "loss/crossentropy": 2.781644105911255, + "loss/hidden": 0.0, + "loss/logits": 0.21663788706064224, + "loss/reg": 0.6187278628349304, + "step": 2387 + }, + { + "epoch": 0.02388, + "grad_norm": 0.4336879253387451, + "grad_norm_var": 0.0010696610205097343, + "learning_rate": 5e-05, + "loss": 0.2085, + "loss/crossentropy": 2.672874093055725, + "loss/hidden": 0.0, + "loss/logits": 0.2084796205163002, + "loss/reg": 0.618300199508667, + "step": 2388 + }, + { + "epoch": 0.02389, + "grad_norm": 0.4334910809993744, + "grad_norm_var": 0.0008989895490526904, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.745558977127075, + "loss/hidden": 0.0, + "loss/logits": 0.21078234538435936, + "loss/reg": 0.617465078830719, + "step": 2389 + }, + { + "epoch": 0.0239, + "grad_norm": 0.42380258440971375, + "grad_norm_var": 0.0009039674765972709, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.745790958404541, + "loss/hidden": 0.0, + "loss/logits": 0.19111737236380577, + "loss/reg": 0.6171723008155823, + "step": 2390 + }, + { + "epoch": 0.02391, + "grad_norm": 0.4253969192504883, + "grad_norm_var": 0.0008732247305954714, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.724030315876007, + "loss/hidden": 0.0, + "loss/logits": 0.20234277471899986, + "loss/reg": 0.6164988279342651, + "step": 2391 + }, + { + "epoch": 0.02392, + "grad_norm": 0.4185367822647095, + "grad_norm_var": 0.0009232514104947261, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.945016086101532, + "loss/hidden": 0.0, + "loss/logits": 0.2037295177578926, + "loss/reg": 0.6161252856254578, + "step": 2392 + }, + { + "epoch": 0.02393, + "grad_norm": 0.41686755418777466, + "grad_norm_var": 0.000977097113293394, + "learning_rate": 5e-05, + "loss": 0.193, + "loss/crossentropy": 2.8395623564720154, + "loss/hidden": 0.0, + "loss/logits": 0.19302665069699287, + "loss/reg": 0.6156157851219177, + "step": 2393 + }, + { + "epoch": 0.02394, + "grad_norm": 0.4640732705593109, + "grad_norm_var": 0.0009425432326445659, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.6625736355781555, + "loss/hidden": 0.0, + "loss/logits": 0.2126368097960949, + "loss/reg": 0.6151567101478577, + "step": 2394 + }, + { + "epoch": 0.02395, + "grad_norm": 0.8228129148483276, + "grad_norm_var": 0.00989541351132813, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.7208781838417053, + "loss/hidden": 0.0, + "loss/logits": 0.21014802902936935, + "loss/reg": 0.6151673793792725, + "step": 2395 + }, + { + "epoch": 0.02396, + "grad_norm": 0.4248697757720947, + "grad_norm_var": 0.009830313031899426, + "learning_rate": 5e-05, + "loss": 0.2012, + "loss/crossentropy": 2.7677990794181824, + "loss/hidden": 0.0, + "loss/logits": 0.20115802437067032, + "loss/reg": 0.6149205565452576, + "step": 2396 + }, + { + "epoch": 0.02397, + "grad_norm": 0.5107229351997375, + "grad_norm_var": 0.009940038933850677, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.967633843421936, + "loss/hidden": 0.0, + "loss/logits": 0.19555510953068733, + "loss/reg": 0.6152756810188293, + "step": 2397 + }, + { + "epoch": 0.02398, + "grad_norm": 0.5129979848861694, + "grad_norm_var": 0.009748689321330757, + "learning_rate": 5e-05, + "loss": 0.2229, + "loss/crossentropy": 2.7032816410064697, + "loss/hidden": 0.0, + "loss/logits": 0.2228730320930481, + "loss/reg": 0.6153419017791748, + "step": 2398 + }, + { + "epoch": 0.02399, + "grad_norm": 0.4326947033405304, + "grad_norm_var": 0.009832206311512515, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 2.970871090888977, + "loss/hidden": 0.0, + "loss/logits": 0.1924908459186554, + "loss/reg": 0.6155155301094055, + "step": 2399 + }, + { + "epoch": 0.024, + "grad_norm": 0.45099180936813354, + "grad_norm_var": 0.009821121224636124, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.7635421752929688, + "loss/hidden": 0.0, + "loss/logits": 0.20103586837649345, + "loss/reg": 0.6148897409439087, + "step": 2400 + }, + { + "epoch": 0.02401, + "grad_norm": 0.4729997515678406, + "grad_norm_var": 0.009795181746437362, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.748467206954956, + "loss/hidden": 0.0, + "loss/logits": 0.20238135010004044, + "loss/reg": 0.6148425936698914, + "step": 2401 + }, + { + "epoch": 0.02402, + "grad_norm": 0.48339179158210754, + "grad_norm_var": 0.009646089338132728, + "learning_rate": 5e-05, + "loss": 0.2005, + "loss/crossentropy": 2.986349105834961, + "loss/hidden": 0.0, + "loss/logits": 0.20049353316426277, + "loss/reg": 0.6148819327354431, + "step": 2402 + }, + { + "epoch": 0.02403, + "grad_norm": 0.5055953860282898, + "grad_norm_var": 0.009647591439865677, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.9469191431999207, + "loss/hidden": 0.0, + "loss/logits": 0.20716840773820877, + "loss/reg": 0.6148055791854858, + "step": 2403 + }, + { + "epoch": 0.02404, + "grad_norm": 0.4750330448150635, + "grad_norm_var": 0.00951534288443986, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.7149519324302673, + "loss/hidden": 0.0, + "loss/logits": 0.19631870836019516, + "loss/reg": 0.6145406365394592, + "step": 2404 + }, + { + "epoch": 0.02405, + "grad_norm": 0.4176200032234192, + "grad_norm_var": 0.00962874888723988, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.785822570323944, + "loss/hidden": 0.0, + "loss/logits": 0.20043864846229553, + "loss/reg": 0.6145894527435303, + "step": 2405 + }, + { + "epoch": 0.02406, + "grad_norm": 0.42603835463523865, + "grad_norm_var": 0.009612711007167896, + "learning_rate": 5e-05, + "loss": 0.2089, + "loss/crossentropy": 2.777181923389435, + "loss/hidden": 0.0, + "loss/logits": 0.2088986374437809, + "loss/reg": 0.6144052743911743, + "step": 2406 + }, + { + "epoch": 0.02407, + "grad_norm": 0.421411395072937, + "grad_norm_var": 0.009642077136610302, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.6668550968170166, + "loss/hidden": 0.0, + "loss/logits": 0.19079336524009705, + "loss/reg": 0.6140021085739136, + "step": 2407 + }, + { + "epoch": 0.02408, + "grad_norm": 0.4130021631717682, + "grad_norm_var": 0.009688271769575485, + "learning_rate": 5e-05, + "loss": 0.1917, + "loss/crossentropy": 2.87978059053421, + "loss/hidden": 0.0, + "loss/logits": 0.1916884407401085, + "loss/reg": 0.6136030554771423, + "step": 2408 + }, + { + "epoch": 0.02409, + "grad_norm": 0.41854622960090637, + "grad_norm_var": 0.009674721335092921, + "learning_rate": 5e-05, + "loss": 0.2054, + "loss/crossentropy": 2.6859437227249146, + "loss/hidden": 0.0, + "loss/logits": 0.20543144270777702, + "loss/reg": 0.6131571531295776, + "step": 2409 + }, + { + "epoch": 0.0241, + "grad_norm": 0.4186927080154419, + "grad_norm_var": 0.009889516388260525, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.7654284238815308, + "loss/hidden": 0.0, + "loss/logits": 0.20165761932730675, + "loss/reg": 0.6126837730407715, + "step": 2410 + }, + { + "epoch": 0.02411, + "grad_norm": 0.4544939398765564, + "grad_norm_var": 0.0013101600681157326, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.8269935250282288, + "loss/hidden": 0.0, + "loss/logits": 0.19670745357871056, + "loss/reg": 0.6123327016830444, + "step": 2411 + }, + { + "epoch": 0.02412, + "grad_norm": 0.43378445506095886, + "grad_norm_var": 0.0012823518055573727, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.9235960245132446, + "loss/hidden": 0.0, + "loss/logits": 0.20035070925951004, + "loss/reg": 0.6122226715087891, + "step": 2412 + }, + { + "epoch": 0.02413, + "grad_norm": 0.41004472970962524, + "grad_norm_var": 0.0011410132246501893, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.7366536259651184, + "loss/hidden": 0.0, + "loss/logits": 0.2098781280219555, + "loss/reg": 0.6116908192634583, + "step": 2413 + }, + { + "epoch": 0.02414, + "grad_norm": 0.38210535049438477, + "grad_norm_var": 0.0010549136310553653, + "learning_rate": 5e-05, + "loss": 0.192, + "loss/crossentropy": 2.714003264904022, + "loss/hidden": 0.0, + "loss/logits": 0.1920245625078678, + "loss/reg": 0.6113671660423279, + "step": 2414 + }, + { + "epoch": 0.02415, + "grad_norm": 0.44822657108306885, + "grad_norm_var": 0.0010579110803860758, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.7015531063079834, + "loss/hidden": 0.0, + "loss/logits": 0.19675321131944656, + "loss/reg": 0.6114233136177063, + "step": 2415 + }, + { + "epoch": 0.02416, + "grad_norm": 0.4882771968841553, + "grad_norm_var": 0.001201935730819914, + "learning_rate": 5e-05, + "loss": 0.2193, + "loss/crossentropy": 2.871196687221527, + "loss/hidden": 0.0, + "loss/logits": 0.21932413429021835, + "loss/reg": 0.6109235286712646, + "step": 2416 + }, + { + "epoch": 0.02417, + "grad_norm": 0.4202914834022522, + "grad_norm_var": 0.0011565095741638332, + "learning_rate": 5e-05, + "loss": 0.2067, + "loss/crossentropy": 2.8948104977607727, + "loss/hidden": 0.0, + "loss/logits": 0.20665007829666138, + "loss/reg": 0.6108868718147278, + "step": 2417 + }, + { + "epoch": 0.02418, + "grad_norm": 0.45309826731681824, + "grad_norm_var": 0.0010326816556566383, + "learning_rate": 5e-05, + "loss": 0.2289, + "loss/crossentropy": 2.758852541446686, + "loss/hidden": 0.0, + "loss/logits": 0.2288607656955719, + "loss/reg": 0.6108802556991577, + "step": 2418 + }, + { + "epoch": 0.02419, + "grad_norm": 0.4295118451118469, + "grad_norm_var": 0.0006949732304443365, + "learning_rate": 5e-05, + "loss": 0.1999, + "loss/crossentropy": 2.6620365977287292, + "loss/hidden": 0.0, + "loss/logits": 0.19994743913412094, + "loss/reg": 0.6103068590164185, + "step": 2419 + }, + { + "epoch": 0.0242, + "grad_norm": 0.4199262261390686, + "grad_norm_var": 0.0005677454365330589, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.8004820942878723, + "loss/hidden": 0.0, + "loss/logits": 0.20446155220270157, + "loss/reg": 0.6100711822509766, + "step": 2420 + }, + { + "epoch": 0.02421, + "grad_norm": 0.4160470962524414, + "grad_norm_var": 0.0005701696488549939, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.9780563712120056, + "loss/hidden": 0.0, + "loss/logits": 0.20243285596370697, + "loss/reg": 0.6096611022949219, + "step": 2421 + }, + { + "epoch": 0.02422, + "grad_norm": 0.4500364661216736, + "grad_norm_var": 0.0005987876964759072, + "learning_rate": 5e-05, + "loss": 0.1952, + "loss/crossentropy": 2.736417591571808, + "loss/hidden": 0.0, + "loss/logits": 0.19521570205688477, + "loss/reg": 0.6087237000465393, + "step": 2422 + }, + { + "epoch": 0.02423, + "grad_norm": 0.4263134300708771, + "grad_norm_var": 0.0005947783000057272, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.7574185729026794, + "loss/hidden": 0.0, + "loss/logits": 0.20229537785053253, + "loss/reg": 0.6079822778701782, + "step": 2423 + }, + { + "epoch": 0.02424, + "grad_norm": 0.4212382733821869, + "grad_norm_var": 0.0005801871576726551, + "learning_rate": 5e-05, + "loss": 0.2014, + "loss/crossentropy": 2.8864298462867737, + "loss/hidden": 0.0, + "loss/logits": 0.201363705098629, + "loss/reg": 0.606991708278656, + "step": 2424 + }, + { + "epoch": 0.02425, + "grad_norm": 0.4393732249736786, + "grad_norm_var": 0.000573645375930835, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.606777787208557, + "loss/hidden": 0.0, + "loss/logits": 0.20404522493481636, + "loss/reg": 0.6065829992294312, + "step": 2425 + }, + { + "epoch": 0.02426, + "grad_norm": 0.4372482895851135, + "grad_norm_var": 0.0005623247588216788, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.785549581050873, + "loss/hidden": 0.0, + "loss/logits": 0.20350359752774239, + "loss/reg": 0.6062883138656616, + "step": 2426 + }, + { + "epoch": 0.02427, + "grad_norm": 0.5289545059204102, + "grad_norm_var": 0.0011209902474666245, + "learning_rate": 5e-05, + "loss": 0.2352, + "loss/crossentropy": 2.7544320225715637, + "loss/hidden": 0.0, + "loss/logits": 0.23516413941979408, + "loss/reg": 0.605414092540741, + "step": 2427 + }, + { + "epoch": 0.02428, + "grad_norm": 0.44168800115585327, + "grad_norm_var": 0.0011206840156934442, + "learning_rate": 5e-05, + "loss": 0.1972, + "loss/crossentropy": 2.767220139503479, + "loss/hidden": 0.0, + "loss/logits": 0.1971777193248272, + "loss/reg": 0.6044897437095642, + "step": 2428 + }, + { + "epoch": 0.02429, + "grad_norm": 0.43984389305114746, + "grad_norm_var": 0.0010640230031723134, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.871880531311035, + "loss/hidden": 0.0, + "loss/logits": 0.20737262815237045, + "loss/reg": 0.6037927269935608, + "step": 2429 + }, + { + "epoch": 0.0243, + "grad_norm": 0.419048547744751, + "grad_norm_var": 0.0008634766926070459, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.719171464443207, + "loss/hidden": 0.0, + "loss/logits": 0.1966518834233284, + "loss/reg": 0.6033772826194763, + "step": 2430 + }, + { + "epoch": 0.02431, + "grad_norm": 1.078757882118225, + "grad_norm_var": 0.026197629190601986, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.5126050114631653, + "loss/hidden": 0.0, + "loss/logits": 0.20820065215229988, + "loss/reg": 0.6028873920440674, + "step": 2431 + }, + { + "epoch": 0.02432, + "grad_norm": 0.39769697189331055, + "grad_norm_var": 0.026632845407153605, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.714447498321533, + "loss/hidden": 0.0, + "loss/logits": 0.1960112527012825, + "loss/reg": 0.602187991142273, + "step": 2432 + }, + { + "epoch": 0.02433, + "grad_norm": 0.4483490288257599, + "grad_norm_var": 0.026472922289809075, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.7179154753684998, + "loss/hidden": 0.0, + "loss/logits": 0.20899199694395065, + "loss/reg": 0.6017115712165833, + "step": 2433 + }, + { + "epoch": 0.02434, + "grad_norm": 0.4355013966560364, + "grad_norm_var": 0.026550573790363016, + "learning_rate": 5e-05, + "loss": 0.2176, + "loss/crossentropy": 2.7355130314826965, + "loss/hidden": 0.0, + "loss/logits": 0.21763329207897186, + "loss/reg": 0.6012679934501648, + "step": 2434 + }, + { + "epoch": 0.02435, + "grad_norm": 0.49007782340049744, + "grad_norm_var": 0.026397593860115528, + "learning_rate": 5e-05, + "loss": 0.2128, + "loss/crossentropy": 2.6995787024497986, + "loss/hidden": 0.0, + "loss/logits": 0.2127709574997425, + "loss/reg": 0.601233184337616, + "step": 2435 + }, + { + "epoch": 0.02436, + "grad_norm": 0.39531245827674866, + "grad_norm_var": 0.02663468284039053, + "learning_rate": 5e-05, + "loss": 0.1955, + "loss/crossentropy": 2.8342103958129883, + "loss/hidden": 0.0, + "loss/logits": 0.1955052614212036, + "loss/reg": 0.6011907458305359, + "step": 2436 + }, + { + "epoch": 0.02437, + "grad_norm": 0.44183245301246643, + "grad_norm_var": 0.02645948346597739, + "learning_rate": 5e-05, + "loss": 0.2096, + "loss/crossentropy": 2.8581228256225586, + "loss/hidden": 0.0, + "loss/logits": 0.2096470631659031, + "loss/reg": 0.6008337736129761, + "step": 2437 + }, + { + "epoch": 0.02438, + "grad_norm": 0.5140143632888794, + "grad_norm_var": 0.02645369615362846, + "learning_rate": 5e-05, + "loss": 0.2144, + "loss/crossentropy": 2.8347874879837036, + "loss/hidden": 0.0, + "loss/logits": 0.21435070782899857, + "loss/reg": 0.6003647446632385, + "step": 2438 + }, + { + "epoch": 0.02439, + "grad_norm": 0.4482136070728302, + "grad_norm_var": 0.026313172876804958, + "learning_rate": 5e-05, + "loss": 0.2255, + "loss/crossentropy": 2.810681939125061, + "loss/hidden": 0.0, + "loss/logits": 0.2255197986960411, + "loss/reg": 0.5999137759208679, + "step": 2439 + }, + { + "epoch": 0.0244, + "grad_norm": 0.39792290329933167, + "grad_norm_var": 0.02654869767730116, + "learning_rate": 5e-05, + "loss": 0.1918, + "loss/crossentropy": 2.6554084420204163, + "loss/hidden": 0.0, + "loss/logits": 0.1917927786707878, + "loss/reg": 0.5991533994674683, + "step": 2440 + }, + { + "epoch": 0.02441, + "grad_norm": 0.43067991733551025, + "grad_norm_var": 0.026605860779431233, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.758982539176941, + "loss/hidden": 0.0, + "loss/logits": 0.19225477054715157, + "loss/reg": 0.5983273983001709, + "step": 2441 + }, + { + "epoch": 0.02442, + "grad_norm": 0.6358528137207031, + "grad_norm_var": 0.027831191975493042, + "learning_rate": 5e-05, + "loss": 0.2262, + "loss/crossentropy": 2.8911746740341187, + "loss/hidden": 0.0, + "loss/logits": 0.2261839397251606, + "loss/reg": 0.5980895757675171, + "step": 2442 + }, + { + "epoch": 0.02443, + "grad_norm": 0.43324828147888184, + "grad_norm_var": 0.02798932350628398, + "learning_rate": 5e-05, + "loss": 0.199, + "loss/crossentropy": 2.7125543355941772, + "loss/hidden": 0.0, + "loss/logits": 0.1989964246749878, + "loss/reg": 0.5974434614181519, + "step": 2443 + }, + { + "epoch": 0.02444, + "grad_norm": 0.41978228092193604, + "grad_norm_var": 0.02816189042229961, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.864742338657379, + "loss/hidden": 0.0, + "loss/logits": 0.20643724501132965, + "loss/reg": 0.5969957113265991, + "step": 2444 + }, + { + "epoch": 0.02445, + "grad_norm": 0.5701945424079895, + "grad_norm_var": 0.02836719001947839, + "learning_rate": 5e-05, + "loss": 0.2245, + "loss/crossentropy": 2.7456684708595276, + "loss/hidden": 0.0, + "loss/logits": 0.2244725376367569, + "loss/reg": 0.596680223941803, + "step": 2445 + }, + { + "epoch": 0.02446, + "grad_norm": 0.42100149393081665, + "grad_norm_var": 0.02834705739977156, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.7281334400177, + "loss/hidden": 0.0, + "loss/logits": 0.20858720317482948, + "loss/reg": 0.5960950255393982, + "step": 2446 + }, + { + "epoch": 0.02447, + "grad_norm": 0.47768640518188477, + "grad_norm_var": 0.00433599590891635, + "learning_rate": 5e-05, + "loss": 0.2109, + "loss/crossentropy": 2.977556347846985, + "loss/hidden": 0.0, + "loss/logits": 0.21092526987195015, + "loss/reg": 0.5954275727272034, + "step": 2447 + }, + { + "epoch": 0.02448, + "grad_norm": 0.43607234954833984, + "grad_norm_var": 0.0041100928978652805, + "learning_rate": 5e-05, + "loss": 0.1972, + "loss/crossentropy": 2.701544463634491, + "loss/hidden": 0.0, + "loss/logits": 0.19721229374408722, + "loss/reg": 0.5951036810874939, + "step": 2448 + }, + { + "epoch": 0.02449, + "grad_norm": 0.4819416403770447, + "grad_norm_var": 0.00411843140412863, + "learning_rate": 5e-05, + "loss": 0.212, + "loss/crossentropy": 2.6057817935943604, + "loss/hidden": 0.0, + "loss/logits": 0.212038304656744, + "loss/reg": 0.5943446755409241, + "step": 2449 + }, + { + "epoch": 0.0245, + "grad_norm": 0.4397919476032257, + "grad_norm_var": 0.004103087920146331, + "learning_rate": 5e-05, + "loss": 0.2151, + "loss/crossentropy": 2.877394914627075, + "loss/hidden": 0.0, + "loss/logits": 0.21511249616742134, + "loss/reg": 0.593959629535675, + "step": 2450 + }, + { + "epoch": 0.02451, + "grad_norm": 0.42385128140449524, + "grad_norm_var": 0.004152249632537656, + "learning_rate": 5e-05, + "loss": 0.1948, + "loss/crossentropy": 2.840520203113556, + "loss/hidden": 0.0, + "loss/logits": 0.19477657228708267, + "loss/reg": 0.5928636789321899, + "step": 2451 + }, + { + "epoch": 0.02452, + "grad_norm": 0.5425018668174744, + "grad_norm_var": 0.004227710155897428, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.782041072845459, + "loss/hidden": 0.0, + "loss/logits": 0.21143995225429535, + "loss/reg": 0.5921515226364136, + "step": 2452 + }, + { + "epoch": 0.02453, + "grad_norm": 0.4344117045402527, + "grad_norm_var": 0.00425868711266942, + "learning_rate": 5e-05, + "loss": 0.1953, + "loss/crossentropy": 2.6724759936332703, + "loss/hidden": 0.0, + "loss/logits": 0.1952899768948555, + "loss/reg": 0.5912289619445801, + "step": 2453 + }, + { + "epoch": 0.02454, + "grad_norm": 0.4714544713497162, + "grad_norm_var": 0.004117578647449938, + "learning_rate": 5e-05, + "loss": 0.2166, + "loss/crossentropy": 2.665508806705475, + "loss/hidden": 0.0, + "loss/logits": 0.21656208857893944, + "loss/reg": 0.5903236269950867, + "step": 2454 + }, + { + "epoch": 0.02455, + "grad_norm": 0.40555208921432495, + "grad_norm_var": 0.004335561646784436, + "learning_rate": 5e-05, + "loss": 0.1915, + "loss/crossentropy": 2.769826829433441, + "loss/hidden": 0.0, + "loss/logits": 0.1914786696434021, + "loss/reg": 0.5894419550895691, + "step": 2455 + }, + { + "epoch": 0.02456, + "grad_norm": 0.43317025899887085, + "grad_norm_var": 0.004103274414986357, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.763172686100006, + "loss/hidden": 0.0, + "loss/logits": 0.2035318687558174, + "loss/reg": 0.5879473090171814, + "step": 2456 + }, + { + "epoch": 0.02457, + "grad_norm": 0.4515876770019531, + "grad_norm_var": 0.004031925557854014, + "learning_rate": 5e-05, + "loss": 0.1998, + "loss/crossentropy": 2.802380859851837, + "loss/hidden": 0.0, + "loss/logits": 0.19983287900686264, + "loss/reg": 0.5873618125915527, + "step": 2457 + }, + { + "epoch": 0.02458, + "grad_norm": 0.9067421555519104, + "grad_norm_var": 0.014703191252883866, + "learning_rate": 5e-05, + "loss": 0.2055, + "loss/crossentropy": 2.678984522819519, + "loss/hidden": 0.0, + "loss/logits": 0.20553578063845634, + "loss/reg": 0.5859858393669128, + "step": 2458 + }, + { + "epoch": 0.02459, + "grad_norm": 0.46635305881500244, + "grad_norm_var": 0.014546293336608444, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.8421053290367126, + "loss/hidden": 0.0, + "loss/logits": 0.2024044394493103, + "loss/reg": 0.5854579210281372, + "step": 2459 + }, + { + "epoch": 0.0246, + "grad_norm": 0.40132805705070496, + "grad_norm_var": 0.014731448472074919, + "learning_rate": 5e-05, + "loss": 0.1946, + "loss/crossentropy": 2.726720094680786, + "loss/hidden": 0.0, + "loss/logits": 0.1946224682033062, + "loss/reg": 0.5842671990394592, + "step": 2460 + }, + { + "epoch": 0.02461, + "grad_norm": 0.430347740650177, + "grad_norm_var": 0.014369454250099226, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.7490745186805725, + "loss/hidden": 0.0, + "loss/logits": 0.20495863631367683, + "loss/reg": 0.5837558507919312, + "step": 2461 + }, + { + "epoch": 0.02462, + "grad_norm": 0.4891451895236969, + "grad_norm_var": 0.014155544046926549, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.773320257663727, + "loss/hidden": 0.0, + "loss/logits": 0.220025185495615, + "loss/reg": 0.5832417607307434, + "step": 2462 + }, + { + "epoch": 0.02463, + "grad_norm": 0.4171487092971802, + "grad_norm_var": 0.014409291900582632, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.897250771522522, + "loss/hidden": 0.0, + "loss/logits": 0.20525911822915077, + "loss/reg": 0.5827232003211975, + "step": 2463 + }, + { + "epoch": 0.02464, + "grad_norm": 0.4398230314254761, + "grad_norm_var": 0.01438972232747731, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.7803579568862915, + "loss/hidden": 0.0, + "loss/logits": 0.20723994448781013, + "loss/reg": 0.5825102925300598, + "step": 2464 + }, + { + "epoch": 0.02465, + "grad_norm": 0.4308079481124878, + "grad_norm_var": 0.01452078962456336, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.702156960964203, + "loss/hidden": 0.0, + "loss/logits": 0.20767095312476158, + "loss/reg": 0.581748366355896, + "step": 2465 + }, + { + "epoch": 0.02466, + "grad_norm": 0.4506363570690155, + "grad_norm_var": 0.014478675997086207, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.8078810572624207, + "loss/hidden": 0.0, + "loss/logits": 0.20075619220733643, + "loss/reg": 0.580965518951416, + "step": 2466 + }, + { + "epoch": 0.02467, + "grad_norm": 0.4482618570327759, + "grad_norm_var": 0.014350487566095405, + "learning_rate": 5e-05, + "loss": 0.2117, + "loss/crossentropy": 2.7655863165855408, + "loss/hidden": 0.0, + "loss/logits": 0.2117096483707428, + "loss/reg": 0.58036208152771, + "step": 2467 + }, + { + "epoch": 0.02468, + "grad_norm": 0.42369329929351807, + "grad_norm_var": 0.014182478944860044, + "learning_rate": 5e-05, + "loss": 0.1954, + "loss/crossentropy": 2.841384708881378, + "loss/hidden": 0.0, + "loss/logits": 0.19541235268115997, + "loss/reg": 0.5799896121025085, + "step": 2468 + }, + { + "epoch": 0.02469, + "grad_norm": 0.5219395160675049, + "grad_norm_var": 0.01426021987365389, + "learning_rate": 5e-05, + "loss": 0.2225, + "loss/crossentropy": 2.8222363591194153, + "loss/hidden": 0.0, + "loss/logits": 0.22250224277377129, + "loss/reg": 0.5794958472251892, + "step": 2469 + }, + { + "epoch": 0.0247, + "grad_norm": 0.45128709077835083, + "grad_norm_var": 0.014293155765559803, + "learning_rate": 5e-05, + "loss": 0.2025, + "loss/crossentropy": 2.742594063282013, + "loss/hidden": 0.0, + "loss/logits": 0.20252467319369316, + "loss/reg": 0.5788162350654602, + "step": 2470 + }, + { + "epoch": 0.02471, + "grad_norm": 0.4234894812107086, + "grad_norm_var": 0.014151979496668037, + "learning_rate": 5e-05, + "loss": 0.2067, + "loss/crossentropy": 2.8864450454711914, + "loss/hidden": 0.0, + "loss/logits": 0.20670805871486664, + "loss/reg": 0.5781189203262329, + "step": 2471 + }, + { + "epoch": 0.02472, + "grad_norm": 0.43583935499191284, + "grad_norm_var": 0.014137855106342261, + "learning_rate": 5e-05, + "loss": 0.213, + "loss/crossentropy": 2.7736966609954834, + "loss/hidden": 0.0, + "loss/logits": 0.21295856311917305, + "loss/reg": 0.5775285363197327, + "step": 2472 + }, + { + "epoch": 0.02473, + "grad_norm": 0.43680688738822937, + "grad_norm_var": 0.014196224889668077, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.8773239850997925, + "loss/hidden": 0.0, + "loss/logits": 0.20512542128562927, + "loss/reg": 0.5764645338058472, + "step": 2473 + }, + { + "epoch": 0.02474, + "grad_norm": 0.3981086015701294, + "grad_norm_var": 0.0009739858125403603, + "learning_rate": 5e-05, + "loss": 0.1942, + "loss/crossentropy": 2.769380271434784, + "loss/hidden": 0.0, + "loss/logits": 0.194191575050354, + "loss/reg": 0.5755466222763062, + "step": 2474 + }, + { + "epoch": 0.02475, + "grad_norm": 0.44826868176460266, + "grad_norm_var": 0.0009346523025701565, + "learning_rate": 5e-05, + "loss": 0.195, + "loss/crossentropy": 2.805808424949646, + "loss/hidden": 0.0, + "loss/logits": 0.19498459622263908, + "loss/reg": 0.5743960738182068, + "step": 2475 + }, + { + "epoch": 0.02476, + "grad_norm": 0.40798959136009216, + "grad_norm_var": 0.0009026924046857084, + "learning_rate": 5e-05, + "loss": 0.2012, + "loss/crossentropy": 2.73064124584198, + "loss/hidden": 0.0, + "loss/logits": 0.20116568729281425, + "loss/reg": 0.5736358165740967, + "step": 2476 + }, + { + "epoch": 0.02477, + "grad_norm": 0.4161427617073059, + "grad_norm_var": 0.0009351942049128888, + "learning_rate": 5e-05, + "loss": 0.1931, + "loss/crossentropy": 2.6585023999214172, + "loss/hidden": 0.0, + "loss/logits": 0.19313867390155792, + "loss/reg": 0.5731735229492188, + "step": 2477 + }, + { + "epoch": 0.02478, + "grad_norm": 0.41280585527420044, + "grad_norm_var": 0.0007988078345607404, + "learning_rate": 5e-05, + "loss": 0.1942, + "loss/crossentropy": 2.8232686519622803, + "loss/hidden": 0.0, + "loss/logits": 0.19419541954994202, + "loss/reg": 0.5724911689758301, + "step": 2478 + }, + { + "epoch": 0.02479, + "grad_norm": 0.45089125633239746, + "grad_norm_var": 0.0007887973845356518, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.673654556274414, + "loss/hidden": 0.0, + "loss/logits": 0.21554561331868172, + "loss/reg": 0.5719723105430603, + "step": 2479 + }, + { + "epoch": 0.0248, + "grad_norm": 0.428713321685791, + "grad_norm_var": 0.0007927733544556189, + "learning_rate": 5e-05, + "loss": 0.2068, + "loss/crossentropy": 2.767833948135376, + "loss/hidden": 0.0, + "loss/logits": 0.20680801570415497, + "loss/reg": 0.5708890557289124, + "step": 2480 + }, + { + "epoch": 0.02481, + "grad_norm": 0.8086097240447998, + "grad_norm_var": 0.009421635662198584, + "learning_rate": 5e-05, + "loss": 0.2268, + "loss/crossentropy": 2.81828373670578, + "loss/hidden": 0.0, + "loss/logits": 0.22679607197642326, + "loss/reg": 0.5702183246612549, + "step": 2481 + }, + { + "epoch": 0.02482, + "grad_norm": 0.45609942078590393, + "grad_norm_var": 0.009416521827261297, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.940394163131714, + "loss/hidden": 0.0, + "loss/logits": 0.20797326043248177, + "loss/reg": 0.569516122341156, + "step": 2482 + }, + { + "epoch": 0.02483, + "grad_norm": 0.4065435528755188, + "grad_norm_var": 0.009593700949473757, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.7684956192970276, + "loss/hidden": 0.0, + "loss/logits": 0.1960587427020073, + "loss/reg": 0.5684271454811096, + "step": 2483 + }, + { + "epoch": 0.02484, + "grad_norm": 0.4588800072669983, + "grad_norm_var": 0.009510356745733969, + "learning_rate": 5e-05, + "loss": 0.2106, + "loss/crossentropy": 2.9209553599357605, + "loss/hidden": 0.0, + "loss/logits": 0.2105855904519558, + "loss/reg": 0.56758713722229, + "step": 2484 + }, + { + "epoch": 0.02485, + "grad_norm": 0.41982463002204895, + "grad_norm_var": 0.009320801302600623, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.6672328114509583, + "loss/hidden": 0.0, + "loss/logits": 0.1983148530125618, + "loss/reg": 0.5664477944374084, + "step": 2485 + }, + { + "epoch": 0.02486, + "grad_norm": 0.45277613401412964, + "grad_norm_var": 0.009320447171608596, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.883937656879425, + "loss/hidden": 0.0, + "loss/logits": 0.2073887586593628, + "loss/reg": 0.5655733942985535, + "step": 2486 + }, + { + "epoch": 0.02487, + "grad_norm": 0.4694211483001709, + "grad_norm_var": 0.009266297540877958, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.8682127594947815, + "loss/hidden": 0.0, + "loss/logits": 0.21134477853775024, + "loss/reg": 0.5648528933525085, + "step": 2487 + }, + { + "epoch": 0.02488, + "grad_norm": 0.6089085936546326, + "grad_norm_var": 0.010656228192876028, + "learning_rate": 5e-05, + "loss": 0.2251, + "loss/crossentropy": 2.649954915046692, + "loss/hidden": 0.0, + "loss/logits": 0.22513502091169357, + "loss/reg": 0.5644046664237976, + "step": 2488 + }, + { + "epoch": 0.02489, + "grad_norm": 0.45947757363319397, + "grad_norm_var": 0.010595423556582809, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.7428256273269653, + "loss/hidden": 0.0, + "loss/logits": 0.19676993414759636, + "loss/reg": 0.5639327764511108, + "step": 2489 + }, + { + "epoch": 0.0249, + "grad_norm": 0.44068512320518494, + "grad_norm_var": 0.010306471138783958, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.688177466392517, + "loss/hidden": 0.0, + "loss/logits": 0.19883006438612938, + "loss/reg": 0.5636557340621948, + "step": 2490 + }, + { + "epoch": 0.02491, + "grad_norm": 0.4223071336746216, + "grad_norm_var": 0.010429453172503201, + "learning_rate": 5e-05, + "loss": 0.2056, + "loss/crossentropy": 2.857997417449951, + "loss/hidden": 0.0, + "loss/logits": 0.20556020364165306, + "loss/reg": 0.563023567199707, + "step": 2491 + }, + { + "epoch": 0.02492, + "grad_norm": 0.43173637986183167, + "grad_norm_var": 0.010268342798515174, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.90788334608078, + "loss/hidden": 0.0, + "loss/logits": 0.20715632662177086, + "loss/reg": 0.5627500414848328, + "step": 2492 + }, + { + "epoch": 0.02493, + "grad_norm": 0.4579859673976898, + "grad_norm_var": 0.010068989776315658, + "learning_rate": 5e-05, + "loss": 0.218, + "loss/crossentropy": 2.9963285326957703, + "loss/hidden": 0.0, + "loss/logits": 0.21795838326215744, + "loss/reg": 0.5624452829360962, + "step": 2493 + }, + { + "epoch": 0.02494, + "grad_norm": 0.4575490951538086, + "grad_norm_var": 0.009828421095817186, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.7028623819351196, + "loss/hidden": 0.0, + "loss/logits": 0.21258383989334106, + "loss/reg": 0.5622525811195374, + "step": 2494 + }, + { + "epoch": 0.02495, + "grad_norm": 0.4384433925151825, + "grad_norm_var": 0.009881273474493801, + "learning_rate": 5e-05, + "loss": 0.2163, + "loss/crossentropy": 2.8180198073387146, + "loss/hidden": 0.0, + "loss/logits": 0.21625970676541328, + "loss/reg": 0.5621782541275024, + "step": 2495 + }, + { + "epoch": 0.02496, + "grad_norm": 0.442807674407959, + "grad_norm_var": 0.009804595449916671, + "learning_rate": 5e-05, + "loss": 0.2107, + "loss/crossentropy": 2.757575273513794, + "loss/hidden": 0.0, + "loss/logits": 0.21068798378109932, + "loss/reg": 0.5625290274620056, + "step": 2496 + }, + { + "epoch": 0.02497, + "grad_norm": 0.4132538437843323, + "grad_norm_var": 0.0020934065592683997, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.8229694962501526, + "loss/hidden": 0.0, + "loss/logits": 0.19867859035730362, + "loss/reg": 0.5622415542602539, + "step": 2497 + }, + { + "epoch": 0.02498, + "grad_norm": 0.4141785204410553, + "grad_norm_var": 0.0021819699426020283, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.7077760696411133, + "loss/hidden": 0.0, + "loss/logits": 0.1995224393904209, + "loss/reg": 0.5620848536491394, + "step": 2498 + }, + { + "epoch": 0.02499, + "grad_norm": 0.4733083248138428, + "grad_norm_var": 0.002076622846784082, + "learning_rate": 5e-05, + "loss": 0.2838, + "loss/crossentropy": 2.892969787120819, + "loss/hidden": 0.0, + "loss/logits": 0.28383559361100197, + "loss/reg": 0.5618056654930115, + "step": 2499 + }, + { + "epoch": 0.025, + "grad_norm": 0.45247259736061096, + "grad_norm_var": 0.0020748885211168875, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.6589337587356567, + "loss/hidden": 0.0, + "loss/logits": 0.2200109176337719, + "loss/reg": 0.5620695948600769, + "step": 2500 + }, + { + "epoch": 0.02501, + "grad_norm": 0.4027824401855469, + "grad_norm_var": 0.0021694383738458853, + "learning_rate": 5e-05, + "loss": 0.1903, + "loss/crossentropy": 2.788836419582367, + "loss/hidden": 0.0, + "loss/logits": 0.19026359543204308, + "loss/reg": 0.5617285370826721, + "step": 2501 + }, + { + "epoch": 0.02502, + "grad_norm": 0.41833147406578064, + "grad_norm_var": 0.0022417752447706127, + "learning_rate": 5e-05, + "loss": 0.2075, + "loss/crossentropy": 2.856125056743622, + "loss/hidden": 0.0, + "loss/logits": 0.20748553797602654, + "loss/reg": 0.5614275932312012, + "step": 2502 + }, + { + "epoch": 0.02503, + "grad_norm": 0.4432801306247711, + "grad_norm_var": 0.002217587950600619, + "learning_rate": 5e-05, + "loss": 0.2146, + "loss/crossentropy": 2.8253413438796997, + "loss/hidden": 0.0, + "loss/logits": 0.21463757753372192, + "loss/reg": 0.5610749125480652, + "step": 2503 + }, + { + "epoch": 0.02504, + "grad_norm": 0.4230194687843323, + "grad_norm_var": 0.00040383554284825947, + "learning_rate": 5e-05, + "loss": 0.2153, + "loss/crossentropy": 2.776782214641571, + "loss/hidden": 0.0, + "loss/logits": 0.21526965871453285, + "loss/reg": 0.560530960559845, + "step": 2504 + }, + { + "epoch": 0.02505, + "grad_norm": 0.43840688467025757, + "grad_norm_var": 0.00036836797712435455, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.7886182069778442, + "loss/hidden": 0.0, + "loss/logits": 0.2200198583304882, + "loss/reg": 0.5601452589035034, + "step": 2505 + }, + { + "epoch": 0.02506, + "grad_norm": 0.40774574875831604, + "grad_norm_var": 0.0004141075342925996, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.874053418636322, + "loss/hidden": 0.0, + "loss/logits": 0.20756380259990692, + "loss/reg": 0.5592626929283142, + "step": 2506 + }, + { + "epoch": 0.02507, + "grad_norm": 0.3885657787322998, + "grad_norm_var": 0.0005360699074550724, + "learning_rate": 5e-05, + "loss": 0.177, + "loss/crossentropy": 2.6676279306411743, + "loss/hidden": 0.0, + "loss/logits": 0.1769523061811924, + "loss/reg": 0.5585103631019592, + "step": 2507 + }, + { + "epoch": 0.02508, + "grad_norm": 0.4245500862598419, + "grad_norm_var": 0.0005390631691622497, + "learning_rate": 5e-05, + "loss": 0.1962, + "loss/crossentropy": 2.8318361043930054, + "loss/hidden": 0.0, + "loss/logits": 0.1962013579905033, + "loss/reg": 0.5579128265380859, + "step": 2508 + }, + { + "epoch": 0.02509, + "grad_norm": 0.42939281463623047, + "grad_norm_var": 0.0004874417095659648, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.817953944206238, + "loss/hidden": 0.0, + "loss/logits": 0.20014868676662445, + "loss/reg": 0.5567783713340759, + "step": 2509 + }, + { + "epoch": 0.0251, + "grad_norm": 0.41922691464424133, + "grad_norm_var": 0.00043465900762111137, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.7899520993232727, + "loss/hidden": 0.0, + "loss/logits": 0.2098364382982254, + "loss/reg": 0.5560342669487, + "step": 2510 + }, + { + "epoch": 0.02511, + "grad_norm": 0.42519885301589966, + "grad_norm_var": 0.0004251677344973855, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.912817597389221, + "loss/hidden": 0.0, + "loss/logits": 0.19678284600377083, + "loss/reg": 0.55498868227005, + "step": 2511 + }, + { + "epoch": 0.02512, + "grad_norm": 0.4234619140625, + "grad_norm_var": 0.00040528867074511495, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 2.6732526421546936, + "loss/hidden": 0.0, + "loss/logits": 0.20789264515042305, + "loss/reg": 0.5544357895851135, + "step": 2512 + }, + { + "epoch": 0.02513, + "grad_norm": 0.5458094477653503, + "grad_norm_var": 0.001298992620875039, + "learning_rate": 5e-05, + "loss": 0.225, + "loss/crossentropy": 2.853252112865448, + "loss/hidden": 0.0, + "loss/logits": 0.2250494249165058, + "loss/reg": 0.5534573197364807, + "step": 2513 + }, + { + "epoch": 0.02514, + "grad_norm": 0.4273252785205841, + "grad_norm_var": 0.0012766130714017998, + "learning_rate": 5e-05, + "loss": 0.2118, + "loss/crossentropy": 2.9148301482200623, + "loss/hidden": 0.0, + "loss/logits": 0.21181168407201767, + "loss/reg": 0.5526054501533508, + "step": 2514 + }, + { + "epoch": 0.02515, + "grad_norm": 0.43446993827819824, + "grad_norm_var": 0.001166969994970446, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.831529915332794, + "loss/hidden": 0.0, + "loss/logits": 0.1977701261639595, + "loss/reg": 0.551939070224762, + "step": 2515 + }, + { + "epoch": 0.02516, + "grad_norm": 0.42194291949272156, + "grad_norm_var": 0.0011398623878308182, + "learning_rate": 5e-05, + "loss": 0.1991, + "loss/crossentropy": 2.8191142082214355, + "loss/hidden": 0.0, + "loss/logits": 0.19909179955720901, + "loss/reg": 0.5516489744186401, + "step": 2516 + }, + { + "epoch": 0.02517, + "grad_norm": 0.4939989447593689, + "grad_norm_var": 0.0013337983567356937, + "learning_rate": 5e-05, + "loss": 0.2208, + "loss/crossentropy": 2.898717999458313, + "loss/hidden": 0.0, + "loss/logits": 0.22084418311715126, + "loss/reg": 0.551250159740448, + "step": 2517 + }, + { + "epoch": 0.02518, + "grad_norm": 0.44375526905059814, + "grad_norm_var": 0.0013166914161024668, + "learning_rate": 5e-05, + "loss": 0.2116, + "loss/crossentropy": 2.79244726896286, + "loss/hidden": 0.0, + "loss/logits": 0.21164287999272346, + "loss/reg": 0.5506622791290283, + "step": 2518 + }, + { + "epoch": 0.02519, + "grad_norm": 0.43564674258232117, + "grad_norm_var": 0.0013138237247535046, + "learning_rate": 5e-05, + "loss": 0.2222, + "loss/crossentropy": 2.7589996457099915, + "loss/hidden": 0.0, + "loss/logits": 0.22221456840634346, + "loss/reg": 0.5504961609840393, + "step": 2519 + }, + { + "epoch": 0.0252, + "grad_norm": 0.4234123229980469, + "grad_norm_var": 0.0013131321078669433, + "learning_rate": 5e-05, + "loss": 0.199, + "loss/crossentropy": 2.985638439655304, + "loss/hidden": 0.0, + "loss/logits": 0.19903402775526047, + "loss/reg": 0.550297200679779, + "step": 2520 + }, + { + "epoch": 0.02521, + "grad_norm": 0.4368554949760437, + "grad_norm_var": 0.001312873997038233, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.844280779361725, + "loss/hidden": 0.0, + "loss/logits": 0.20779069885611534, + "loss/reg": 0.5505556464195251, + "step": 2521 + }, + { + "epoch": 0.02522, + "grad_norm": 0.41397765278816223, + "grad_norm_var": 0.0012915459698079315, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.7090278267860413, + "loss/hidden": 0.0, + "loss/logits": 0.200420830398798, + "loss/reg": 0.5504327416419983, + "step": 2522 + }, + { + "epoch": 0.02523, + "grad_norm": 0.4127320349216461, + "grad_norm_var": 0.0011728713275674077, + "learning_rate": 5e-05, + "loss": 0.2012, + "loss/crossentropy": 2.976333498954773, + "loss/hidden": 0.0, + "loss/logits": 0.2011500895023346, + "loss/reg": 0.5503354668617249, + "step": 2523 + }, + { + "epoch": 0.02524, + "grad_norm": 0.5731796026229858, + "grad_norm_var": 0.0022823487092659502, + "learning_rate": 5e-05, + "loss": 0.2356, + "loss/crossentropy": 2.7545700073242188, + "loss/hidden": 0.0, + "loss/logits": 0.23556442186236382, + "loss/reg": 0.5501889586448669, + "step": 2524 + }, + { + "epoch": 0.02525, + "grad_norm": 0.4420778453350067, + "grad_norm_var": 0.002261739405355571, + "learning_rate": 5e-05, + "loss": 0.1989, + "loss/crossentropy": 2.937168836593628, + "loss/hidden": 0.0, + "loss/logits": 0.19890838861465454, + "loss/reg": 0.5499712824821472, + "step": 2525 + }, + { + "epoch": 0.02526, + "grad_norm": 0.4234352111816406, + "grad_norm_var": 0.0022465236668465375, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.825637936592102, + "loss/hidden": 0.0, + "loss/logits": 0.19673464074730873, + "loss/reg": 0.5502470135688782, + "step": 2526 + }, + { + "epoch": 0.02527, + "grad_norm": 0.4312266707420349, + "grad_norm_var": 0.002230002966043602, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.780506908893585, + "loss/hidden": 0.0, + "loss/logits": 0.20722387731075287, + "loss/reg": 0.5499589443206787, + "step": 2527 + }, + { + "epoch": 0.02528, + "grad_norm": 0.4663938581943512, + "grad_norm_var": 0.002199261159597828, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.8749152421951294, + "loss/hidden": 0.0, + "loss/logits": 0.19877149164676666, + "loss/reg": 0.5501741766929626, + "step": 2528 + }, + { + "epoch": 0.02529, + "grad_norm": 0.44269999861717224, + "grad_norm_var": 0.0015690982566499035, + "learning_rate": 5e-05, + "loss": 0.1851, + "loss/crossentropy": 2.851559281349182, + "loss/hidden": 0.0, + "loss/logits": 0.18508239835500717, + "loss/reg": 0.5503494739532471, + "step": 2529 + }, + { + "epoch": 0.0253, + "grad_norm": 0.44874244928359985, + "grad_norm_var": 0.001546735776943875, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.8295534253120422, + "loss/hidden": 0.0, + "loss/logits": 0.20418165624141693, + "loss/reg": 0.550430953502655, + "step": 2530 + }, + { + "epoch": 0.02531, + "grad_norm": 0.4400537312030792, + "grad_norm_var": 0.0015397025478705473, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.787615120410919, + "loss/hidden": 0.0, + "loss/logits": 0.20352720096707344, + "loss/reg": 0.5502205491065979, + "step": 2531 + }, + { + "epoch": 0.02532, + "grad_norm": 0.4186142086982727, + "grad_norm_var": 0.0015514642525340627, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.697046399116516, + "loss/hidden": 0.0, + "loss/logits": 0.1992897167801857, + "loss/reg": 0.5502745509147644, + "step": 2532 + }, + { + "epoch": 0.02533, + "grad_norm": 0.4206503927707672, + "grad_norm_var": 0.0014248970851371556, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.82133686542511, + "loss/hidden": 0.0, + "loss/logits": 0.201254490762949, + "loss/reg": 0.5506644248962402, + "step": 2533 + }, + { + "epoch": 0.02534, + "grad_norm": 0.7551618218421936, + "grad_norm_var": 0.007554883084351853, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.8442057967185974, + "loss/hidden": 0.0, + "loss/logits": 0.2154698185622692, + "loss/reg": 0.5511941313743591, + "step": 2534 + }, + { + "epoch": 0.02535, + "grad_norm": 0.39897021651268005, + "grad_norm_var": 0.007765646606706571, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.8167617321014404, + "loss/hidden": 0.0, + "loss/logits": 0.18327052146196365, + "loss/reg": 0.5509635806083679, + "step": 2535 + }, + { + "epoch": 0.02536, + "grad_norm": 0.441813588142395, + "grad_norm_var": 0.007698853563202353, + "learning_rate": 5e-05, + "loss": 0.1931, + "loss/crossentropy": 2.7964720726013184, + "loss/hidden": 0.0, + "loss/logits": 0.1930750347673893, + "loss/reg": 0.5508456230163574, + "step": 2536 + }, + { + "epoch": 0.02537, + "grad_norm": 0.7968040108680725, + "grad_norm_var": 0.014666008071015509, + "learning_rate": 5e-05, + "loss": 0.2137, + "loss/crossentropy": 2.7459107637405396, + "loss/hidden": 0.0, + "loss/logits": 0.21374249830842018, + "loss/reg": 0.5509466528892517, + "step": 2537 + }, + { + "epoch": 0.02538, + "grad_norm": 0.44139617681503296, + "grad_norm_var": 0.01446099704354182, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.763332784175873, + "loss/hidden": 0.0, + "loss/logits": 0.20660928264260292, + "loss/reg": 0.550536572933197, + "step": 2538 + }, + { + "epoch": 0.02539, + "grad_norm": 0.5968551635742188, + "grad_norm_var": 0.014814949524534705, + "learning_rate": 5e-05, + "loss": 0.2218, + "loss/crossentropy": 2.717930257320404, + "loss/hidden": 0.0, + "loss/logits": 0.22179723531007767, + "loss/reg": 0.5504844784736633, + "step": 2539 + }, + { + "epoch": 0.0254, + "grad_norm": 0.4726264774799347, + "grad_norm_var": 0.0144138680312763, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.8354408144950867, + "loss/hidden": 0.0, + "loss/logits": 0.20758863165974617, + "loss/reg": 0.5505013465881348, + "step": 2540 + }, + { + "epoch": 0.02541, + "grad_norm": 0.45826902985572815, + "grad_norm_var": 0.01432713153035667, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.6961912512779236, + "loss/hidden": 0.0, + "loss/logits": 0.1967562697827816, + "loss/reg": 0.550010621547699, + "step": 2541 + }, + { + "epoch": 0.02542, + "grad_norm": 0.41058483719825745, + "grad_norm_var": 0.014452971755157702, + "learning_rate": 5e-05, + "loss": 0.1952, + "loss/crossentropy": 2.8003902435302734, + "loss/hidden": 0.0, + "loss/logits": 0.19523901492357254, + "loss/reg": 0.5497992038726807, + "step": 2542 + }, + { + "epoch": 0.02543, + "grad_norm": 0.5624150037765503, + "grad_norm_var": 0.014499627352902526, + "learning_rate": 5e-05, + "loss": 0.2471, + "loss/crossentropy": 2.9444841742515564, + "loss/hidden": 0.0, + "loss/logits": 0.24706361815333366, + "loss/reg": 0.5498580932617188, + "step": 2543 + }, + { + "epoch": 0.02544, + "grad_norm": 0.44264090061187744, + "grad_norm_var": 0.014635790472741479, + "learning_rate": 5e-05, + "loss": 0.2362, + "loss/crossentropy": 2.797008216381073, + "loss/hidden": 0.0, + "loss/logits": 0.23618372157216072, + "loss/reg": 0.5495421886444092, + "step": 2544 + }, + { + "epoch": 0.02545, + "grad_norm": 0.3931272029876709, + "grad_norm_var": 0.015146759583646707, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.833752155303955, + "loss/hidden": 0.0, + "loss/logits": 0.1848716326057911, + "loss/reg": 0.5492741465568542, + "step": 2545 + }, + { + "epoch": 0.02546, + "grad_norm": 0.4413206875324249, + "grad_norm_var": 0.015194661442190737, + "learning_rate": 5e-05, + "loss": 0.2146, + "loss/crossentropy": 2.71982604265213, + "loss/hidden": 0.0, + "loss/logits": 0.21464554220438004, + "loss/reg": 0.5494512319564819, + "step": 2546 + }, + { + "epoch": 0.02547, + "grad_norm": 0.4243248999118805, + "grad_norm_var": 0.01532159441952038, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.936064302921295, + "loss/hidden": 0.0, + "loss/logits": 0.20776668563485146, + "loss/reg": 0.5488707423210144, + "step": 2547 + }, + { + "epoch": 0.02548, + "grad_norm": 0.41279980540275574, + "grad_norm_var": 0.015380773188731078, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.890467643737793, + "loss/hidden": 0.0, + "loss/logits": 0.2023102529346943, + "loss/reg": 0.5489451289176941, + "step": 2548 + }, + { + "epoch": 0.02549, + "grad_norm": 0.4138180911540985, + "grad_norm_var": 0.015448560791187305, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.836877405643463, + "loss/hidden": 0.0, + "loss/logits": 0.189823966473341, + "loss/reg": 0.5485518574714661, + "step": 2549 + }, + { + "epoch": 0.0255, + "grad_norm": 0.4836493730545044, + "grad_norm_var": 0.010508577613362968, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.770488977432251, + "loss/hidden": 0.0, + "loss/logits": 0.19830937683582306, + "loss/reg": 0.5483593344688416, + "step": 2550 + }, + { + "epoch": 0.02551, + "grad_norm": 0.409914493560791, + "grad_norm_var": 0.010405901149206361, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.9024451971054077, + "loss/hidden": 0.0, + "loss/logits": 0.18472952023148537, + "loss/reg": 0.5478702783584595, + "step": 2551 + }, + { + "epoch": 0.02552, + "grad_norm": 0.4463954567909241, + "grad_norm_var": 0.010386849039989961, + "learning_rate": 5e-05, + "loss": 0.1998, + "loss/crossentropy": 2.755467116832733, + "loss/hidden": 0.0, + "loss/logits": 0.19977246597409248, + "loss/reg": 0.5472862124443054, + "step": 2552 + }, + { + "epoch": 0.02553, + "grad_norm": 0.480539470911026, + "grad_norm_var": 0.003086571292285119, + "learning_rate": 5e-05, + "loss": 0.219, + "loss/crossentropy": 2.8570539951324463, + "loss/hidden": 0.0, + "loss/logits": 0.21900739148259163, + "loss/reg": 0.5465055704116821, + "step": 2553 + }, + { + "epoch": 0.02554, + "grad_norm": 0.402667373418808, + "grad_norm_var": 0.0032540101961155937, + "learning_rate": 5e-05, + "loss": 0.1912, + "loss/crossentropy": 2.7164729237556458, + "loss/hidden": 0.0, + "loss/logits": 0.19116409122943878, + "loss/reg": 0.5457414388656616, + "step": 2554 + }, + { + "epoch": 0.02555, + "grad_norm": 0.4199041724205017, + "grad_norm_var": 0.0018227687702213205, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 2.6879688501358032, + "loss/hidden": 0.0, + "loss/logits": 0.19247112050652504, + "loss/reg": 0.5456957221031189, + "step": 2555 + }, + { + "epoch": 0.02556, + "grad_norm": 0.4406300485134125, + "grad_norm_var": 0.0017568952831501763, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.8118010759353638, + "loss/hidden": 0.0, + "loss/logits": 0.20705407112836838, + "loss/reg": 0.5449569821357727, + "step": 2556 + }, + { + "epoch": 0.02557, + "grad_norm": 0.4363040030002594, + "grad_norm_var": 0.0017340944999869046, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.8322975039482117, + "loss/hidden": 0.0, + "loss/logits": 0.2087150290608406, + "loss/reg": 0.544964611530304, + "step": 2557 + }, + { + "epoch": 0.02558, + "grad_norm": 0.43009093403816223, + "grad_norm_var": 0.0016844542958575404, + "learning_rate": 5e-05, + "loss": 0.2065, + "loss/crossentropy": 2.7985161542892456, + "loss/hidden": 0.0, + "loss/logits": 0.20646902918815613, + "loss/reg": 0.5444036722183228, + "step": 2558 + }, + { + "epoch": 0.02559, + "grad_norm": 0.4451965391635895, + "grad_norm_var": 0.0006305043410487432, + "learning_rate": 5e-05, + "loss": 0.2169, + "loss/crossentropy": 2.7708137035369873, + "loss/hidden": 0.0, + "loss/logits": 0.21691185608506203, + "loss/reg": 0.5441729426383972, + "step": 2559 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4010668694972992, + "grad_norm_var": 0.0006834676736219773, + "learning_rate": 5e-05, + "loss": 0.1912, + "loss/crossentropy": 2.6713278889656067, + "loss/hidden": 0.0, + "loss/logits": 0.19123424217104912, + "loss/reg": 0.5439656376838684, + "step": 2560 + }, + { + "epoch": 0.02561, + "grad_norm": 0.3885950446128845, + "grad_norm_var": 0.0007070993052358513, + "learning_rate": 5e-05, + "loss": 0.1997, + "loss/crossentropy": 2.770969033241272, + "loss/hidden": 0.0, + "loss/logits": 0.19974718987941742, + "loss/reg": 0.5437681078910828, + "step": 2561 + }, + { + "epoch": 0.02562, + "grad_norm": 0.40832313895225525, + "grad_norm_var": 0.0007245791727639136, + "learning_rate": 5e-05, + "loss": 0.2003, + "loss/crossentropy": 2.764010787010193, + "loss/hidden": 0.0, + "loss/logits": 0.20027167722582817, + "loss/reg": 0.5433593392372131, + "step": 2562 + }, + { + "epoch": 0.02563, + "grad_norm": 0.4734903872013092, + "grad_norm_var": 0.0008531140790122257, + "learning_rate": 5e-05, + "loss": 0.2193, + "loss/crossentropy": 2.652809977531433, + "loss/hidden": 0.0, + "loss/logits": 0.2193353809416294, + "loss/reg": 0.5431020259857178, + "step": 2563 + }, + { + "epoch": 0.02564, + "grad_norm": 0.42476779222488403, + "grad_norm_var": 0.000833284280348856, + "learning_rate": 5e-05, + "loss": 0.2138, + "loss/crossentropy": 2.6855757236480713, + "loss/hidden": 0.0, + "loss/logits": 0.21383944898843765, + "loss/reg": 0.5424613356590271, + "step": 2564 + }, + { + "epoch": 0.02565, + "grad_norm": 0.47078070044517517, + "grad_norm_var": 0.0009011437606734063, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.8521881699562073, + "loss/hidden": 0.0, + "loss/logits": 0.20999043434858322, + "loss/reg": 0.542188286781311, + "step": 2565 + }, + { + "epoch": 0.02566, + "grad_norm": 0.43568429350852966, + "grad_norm_var": 0.0007347304862577138, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.831596076488495, + "loss/hidden": 0.0, + "loss/logits": 0.206312894821167, + "loss/reg": 0.5419338941574097, + "step": 2566 + }, + { + "epoch": 0.02567, + "grad_norm": 0.42786744236946106, + "grad_norm_var": 0.0007016564112002281, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.9559253454208374, + "loss/hidden": 0.0, + "loss/logits": 0.2015715166926384, + "loss/reg": 0.5418655872344971, + "step": 2567 + }, + { + "epoch": 0.02568, + "grad_norm": 0.42222917079925537, + "grad_norm_var": 0.0006958612358014538, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.69766628742218, + "loss/hidden": 0.0, + "loss/logits": 0.20476100221276283, + "loss/reg": 0.541411280632019, + "step": 2568 + }, + { + "epoch": 0.02569, + "grad_norm": 0.415901243686676, + "grad_norm_var": 0.0005365778582940887, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.818337559700012, + "loss/hidden": 0.0, + "loss/logits": 0.1967054195702076, + "loss/reg": 0.5414465665817261, + "step": 2569 + }, + { + "epoch": 0.0257, + "grad_norm": 0.3977818787097931, + "grad_norm_var": 0.0005543880265369598, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.874771237373352, + "loss/hidden": 0.0, + "loss/logits": 0.2011183612048626, + "loss/reg": 0.5411196947097778, + "step": 2570 + }, + { + "epoch": 0.02571, + "grad_norm": 0.4456484615802765, + "grad_norm_var": 0.0005700352485087971, + "learning_rate": 5e-05, + "loss": 0.2106, + "loss/crossentropy": 2.712975323200226, + "loss/hidden": 0.0, + "loss/logits": 0.21063318476080894, + "loss/reg": 0.5410975217819214, + "step": 2571 + }, + { + "epoch": 0.02572, + "grad_norm": 0.4426000714302063, + "grad_norm_var": 0.0005733267956284002, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.773666501045227, + "loss/hidden": 0.0, + "loss/logits": 0.20812853798270226, + "loss/reg": 0.541155993938446, + "step": 2572 + }, + { + "epoch": 0.02573, + "grad_norm": 0.41351503133773804, + "grad_norm_var": 0.0005840340440666836, + "learning_rate": 5e-05, + "loss": 0.197, + "loss/crossentropy": 2.8845558762550354, + "loss/hidden": 0.0, + "loss/logits": 0.19696297124028206, + "loss/reg": 0.5414305925369263, + "step": 2573 + }, + { + "epoch": 0.02574, + "grad_norm": 0.433570921421051, + "grad_norm_var": 0.0005858904969982092, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.8925362825393677, + "loss/hidden": 0.0, + "loss/logits": 0.1964482069015503, + "loss/reg": 0.5415604710578918, + "step": 2574 + }, + { + "epoch": 0.02575, + "grad_norm": 0.4247989356517792, + "grad_norm_var": 0.0005649585419991276, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.8365857005119324, + "loss/hidden": 0.0, + "loss/logits": 0.19576196745038033, + "loss/reg": 0.5411614179611206, + "step": 2575 + }, + { + "epoch": 0.02576, + "grad_norm": 0.43207255005836487, + "grad_norm_var": 0.0005192228720419837, + "learning_rate": 5e-05, + "loss": 0.1984, + "loss/crossentropy": 2.779477298259735, + "loss/hidden": 0.0, + "loss/logits": 0.19839901477098465, + "loss/reg": 0.5414489507675171, + "step": 2576 + }, + { + "epoch": 0.02577, + "grad_norm": 0.4159988760948181, + "grad_norm_var": 0.00041998044119392344, + "learning_rate": 5e-05, + "loss": 0.2052, + "loss/crossentropy": 2.7850451469421387, + "loss/hidden": 0.0, + "loss/logits": 0.2051517814397812, + "loss/reg": 0.541140079498291, + "step": 2577 + }, + { + "epoch": 0.02578, + "grad_norm": 0.42943474650382996, + "grad_norm_var": 0.00038593381932324757, + "learning_rate": 5e-05, + "loss": 0.2014, + "loss/crossentropy": 2.87819504737854, + "loss/hidden": 0.0, + "loss/logits": 0.20142462849617004, + "loss/reg": 0.5408768057823181, + "step": 2578 + }, + { + "epoch": 0.02579, + "grad_norm": 0.50783771276474, + "grad_norm_var": 0.0006513552488828456, + "learning_rate": 5e-05, + "loss": 0.2209, + "loss/crossentropy": 2.8141302466392517, + "loss/hidden": 0.0, + "loss/logits": 0.22085712105035782, + "loss/reg": 0.5405548214912415, + "step": 2579 + }, + { + "epoch": 0.0258, + "grad_norm": 0.4425601363182068, + "grad_norm_var": 0.0006497594873037915, + "learning_rate": 5e-05, + "loss": 0.1989, + "loss/crossentropy": 2.95922589302063, + "loss/hidden": 0.0, + "loss/logits": 0.19886454194784164, + "loss/reg": 0.5404248833656311, + "step": 2580 + }, + { + "epoch": 0.02581, + "grad_norm": 0.4714566469192505, + "grad_norm_var": 0.0006530224985494789, + "learning_rate": 5e-05, + "loss": 0.2151, + "loss/crossentropy": 2.9030808806419373, + "loss/hidden": 0.0, + "loss/logits": 0.21509142592549324, + "loss/reg": 0.539705753326416, + "step": 2581 + }, + { + "epoch": 0.02582, + "grad_norm": 0.41765519976615906, + "grad_norm_var": 0.0006715365187356567, + "learning_rate": 5e-05, + "loss": 0.1922, + "loss/crossentropy": 2.7505761981010437, + "loss/hidden": 0.0, + "loss/logits": 0.192151490598917, + "loss/reg": 0.539389431476593, + "step": 2582 + }, + { + "epoch": 0.02583, + "grad_norm": 0.41287410259246826, + "grad_norm_var": 0.000697462501578731, + "learning_rate": 5e-05, + "loss": 0.1974, + "loss/crossentropy": 2.808144748210907, + "loss/hidden": 0.0, + "loss/logits": 0.19738909229636192, + "loss/reg": 0.5390953421592712, + "step": 2583 + }, + { + "epoch": 0.02584, + "grad_norm": 0.45872962474823, + "grad_norm_var": 0.0007289394137978503, + "learning_rate": 5e-05, + "loss": 0.2196, + "loss/crossentropy": 2.6323054432868958, + "loss/hidden": 0.0, + "loss/logits": 0.21956656128168106, + "loss/reg": 0.5383455157279968, + "step": 2584 + }, + { + "epoch": 0.02585, + "grad_norm": 0.41521814465522766, + "grad_norm_var": 0.0007307219577792901, + "learning_rate": 5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.9003251791000366, + "loss/hidden": 0.0, + "loss/logits": 0.1842983514070511, + "loss/reg": 0.5378218293190002, + "step": 2585 + }, + { + "epoch": 0.02586, + "grad_norm": 0.4990185797214508, + "grad_norm_var": 0.0008674187337737357, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.9992847442626953, + "loss/hidden": 0.0, + "loss/logits": 0.21352282539010048, + "loss/reg": 0.537528932094574, + "step": 2586 + }, + { + "epoch": 0.02587, + "grad_norm": 0.4412449598312378, + "grad_norm_var": 0.0008661578871688983, + "learning_rate": 5e-05, + "loss": 0.1955, + "loss/crossentropy": 2.878824770450592, + "loss/hidden": 0.0, + "loss/logits": 0.19554269686341286, + "loss/reg": 0.5371176600456238, + "step": 2587 + }, + { + "epoch": 0.02588, + "grad_norm": 0.4823025166988373, + "grad_norm_var": 0.0009722902132668687, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.906935393810272, + "loss/hidden": 0.0, + "loss/logits": 0.20661735907196999, + "loss/reg": 0.5367439985275269, + "step": 2588 + }, + { + "epoch": 0.02589, + "grad_norm": 0.44674089550971985, + "grad_norm_var": 0.0009078170290962362, + "learning_rate": 5e-05, + "loss": 0.2171, + "loss/crossentropy": 2.896657109260559, + "loss/hidden": 0.0, + "loss/logits": 0.21705543622374535, + "loss/reg": 0.5365266799926758, + "step": 2589 + }, + { + "epoch": 0.0259, + "grad_norm": 0.42683106660842896, + "grad_norm_var": 0.0009215735623975263, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.89300137758255, + "loss/hidden": 0.0, + "loss/logits": 0.21082526817917824, + "loss/reg": 0.5362415909767151, + "step": 2590 + }, + { + "epoch": 0.02591, + "grad_norm": 0.4368167519569397, + "grad_norm_var": 0.0008977524376205914, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.9134311079978943, + "loss/hidden": 0.0, + "loss/logits": 0.20622878894209862, + "loss/reg": 0.5360001921653748, + "step": 2591 + }, + { + "epoch": 0.02592, + "grad_norm": 0.4137730300426483, + "grad_norm_var": 0.0009527849059200381, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.7248812317848206, + "loss/hidden": 0.0, + "loss/logits": 0.2088310532271862, + "loss/reg": 0.5357056260108948, + "step": 2592 + }, + { + "epoch": 0.02593, + "grad_norm": 0.43711450695991516, + "grad_norm_var": 0.0008992666810847538, + "learning_rate": 5e-05, + "loss": 0.2143, + "loss/crossentropy": 2.8187623023986816, + "loss/hidden": 0.0, + "loss/logits": 0.2142937146127224, + "loss/reg": 0.5355096459388733, + "step": 2593 + }, + { + "epoch": 0.02594, + "grad_norm": 0.39129528403282166, + "grad_norm_var": 0.001075565916038516, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.7801267504692078, + "loss/hidden": 0.0, + "loss/logits": 0.19080017507076263, + "loss/reg": 0.5350464582443237, + "step": 2594 + }, + { + "epoch": 0.02595, + "grad_norm": 0.4266131520271301, + "grad_norm_var": 0.0007948335012002872, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.8751733899116516, + "loss/hidden": 0.0, + "loss/logits": 0.20774037390947342, + "loss/reg": 0.534430742263794, + "step": 2595 + }, + { + "epoch": 0.02596, + "grad_norm": 0.4446522295475006, + "grad_norm_var": 0.0007961656116108847, + "learning_rate": 5e-05, + "loss": 0.2044, + "loss/crossentropy": 2.805653750896454, + "loss/hidden": 0.0, + "loss/logits": 0.20436738803982735, + "loss/reg": 0.5340574979782104, + "step": 2596 + }, + { + "epoch": 0.02597, + "grad_norm": 0.438000351190567, + "grad_norm_var": 0.0007208757195188061, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.868275284767151, + "loss/hidden": 0.0, + "loss/logits": 0.20066888257861137, + "loss/reg": 0.5334810614585876, + "step": 2597 + }, + { + "epoch": 0.02598, + "grad_norm": 0.6230400800704956, + "grad_norm_var": 0.0028328987675226144, + "learning_rate": 5e-05, + "loss": 0.2338, + "loss/crossentropy": 2.841219961643219, + "loss/hidden": 0.0, + "loss/logits": 0.23376845568418503, + "loss/reg": 0.5331010818481445, + "step": 2598 + }, + { + "epoch": 0.02599, + "grad_norm": 0.4347565770149231, + "grad_norm_var": 0.00275555131828501, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.733871340751648, + "loss/hidden": 0.0, + "loss/logits": 0.20712962001562119, + "loss/reg": 0.5322952270507812, + "step": 2599 + }, + { + "epoch": 0.026, + "grad_norm": 0.4292391836643219, + "grad_norm_var": 0.0027795496716512603, + "learning_rate": 5e-05, + "loss": 0.1951, + "loss/crossentropy": 2.771967053413391, + "loss/hidden": 0.0, + "loss/logits": 0.19509857520461082, + "loss/reg": 0.5319327116012573, + "step": 2600 + }, + { + "epoch": 0.02601, + "grad_norm": 1.3506224155426025, + "grad_norm_var": 0.053231865488185516, + "learning_rate": 5e-05, + "loss": 0.2315, + "loss/crossentropy": 2.8156538605690002, + "loss/hidden": 0.0, + "loss/logits": 0.23150773346424103, + "loss/reg": 0.5311727523803711, + "step": 2601 + }, + { + "epoch": 0.02602, + "grad_norm": 0.460089772939682, + "grad_norm_var": 0.05337127290474258, + "learning_rate": 5e-05, + "loss": 0.2154, + "loss/crossentropy": 2.721584916114807, + "loss/hidden": 0.0, + "loss/logits": 0.21538399159908295, + "loss/reg": 0.5307376980781555, + "step": 2602 + }, + { + "epoch": 0.02603, + "grad_norm": 0.4268496036529541, + "grad_norm_var": 0.05350697056033284, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.791659116744995, + "loss/hidden": 0.0, + "loss/logits": 0.20125148817896843, + "loss/reg": 0.530678927898407, + "step": 2603 + }, + { + "epoch": 0.02604, + "grad_norm": 0.4951406419277191, + "grad_norm_var": 0.05347962415418689, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.881497383117676, + "loss/hidden": 0.0, + "loss/logits": 0.21259039267897606, + "loss/reg": 0.5308570265769958, + "step": 2604 + }, + { + "epoch": 0.02605, + "grad_norm": 0.5374444127082825, + "grad_norm_var": 0.05328805467085673, + "learning_rate": 5e-05, + "loss": 0.2116, + "loss/crossentropy": 2.830753982067108, + "loss/hidden": 0.0, + "loss/logits": 0.21158229187130928, + "loss/reg": 0.5308424830436707, + "step": 2605 + }, + { + "epoch": 0.02606, + "grad_norm": 0.5248410105705261, + "grad_norm_var": 0.05279154657399799, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.8558992743492126, + "loss/hidden": 0.0, + "loss/logits": 0.20416738465428352, + "loss/reg": 0.5307204127311707, + "step": 2606 + }, + { + "epoch": 0.02607, + "grad_norm": 0.5762840509414673, + "grad_norm_var": 0.0525181718048571, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.9106109738349915, + "loss/hidden": 0.0, + "loss/logits": 0.20126280561089516, + "loss/reg": 0.5305396318435669, + "step": 2607 + }, + { + "epoch": 0.02608, + "grad_norm": 0.4873623847961426, + "grad_norm_var": 0.051759301415372945, + "learning_rate": 5e-05, + "loss": 0.1869, + "loss/crossentropy": 2.8261622190475464, + "loss/hidden": 0.0, + "loss/logits": 0.1869312971830368, + "loss/reg": 0.5306512117385864, + "step": 2608 + }, + { + "epoch": 0.02609, + "grad_norm": 0.510123610496521, + "grad_norm_var": 0.05118621325102606, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.8890808820724487, + "loss/hidden": 0.0, + "loss/logits": 0.20833243429660797, + "loss/reg": 0.5303729176521301, + "step": 2609 + }, + { + "epoch": 0.0261, + "grad_norm": 0.5831773281097412, + "grad_norm_var": 0.04981663135842492, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.6961584091186523, + "loss/hidden": 0.0, + "loss/logits": 0.20948517695069313, + "loss/reg": 0.5296767354011536, + "step": 2610 + }, + { + "epoch": 0.02611, + "grad_norm": 0.4680619239807129, + "grad_norm_var": 0.04925998796593408, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.9065521955490112, + "loss/hidden": 0.0, + "loss/logits": 0.20462613552808762, + "loss/reg": 0.5293965935707092, + "step": 2611 + }, + { + "epoch": 0.02612, + "grad_norm": 0.46184584498405457, + "grad_norm_var": 0.04903843421986803, + "learning_rate": 5e-05, + "loss": 0.1881, + "loss/crossentropy": 2.774681031703949, + "loss/hidden": 0.0, + "loss/logits": 0.188124168664217, + "loss/reg": 0.5288440585136414, + "step": 2612 + }, + { + "epoch": 0.02613, + "grad_norm": 0.5287350416183472, + "grad_norm_var": 0.048192814582690306, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.822981595993042, + "loss/hidden": 0.0, + "loss/logits": 0.20100250095129013, + "loss/reg": 0.5282597541809082, + "step": 2613 + }, + { + "epoch": 0.02614, + "grad_norm": 0.4631010591983795, + "grad_norm_var": 0.048364102954362394, + "learning_rate": 5e-05, + "loss": 0.1951, + "loss/crossentropy": 2.690920829772949, + "loss/hidden": 0.0, + "loss/logits": 0.1951279453933239, + "loss/reg": 0.5278733968734741, + "step": 2614 + }, + { + "epoch": 0.02615, + "grad_norm": 0.42318376898765564, + "grad_norm_var": 0.048544288266499064, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.7097662687301636, + "loss/hidden": 0.0, + "loss/logits": 0.19105671718716621, + "loss/reg": 0.5275110006332397, + "step": 2615 + }, + { + "epoch": 0.02616, + "grad_norm": 0.4997815787792206, + "grad_norm_var": 0.04776290946554092, + "learning_rate": 5e-05, + "loss": 0.2143, + "loss/crossentropy": 2.8465227484703064, + "loss/hidden": 0.0, + "loss/logits": 0.2143278308212757, + "loss/reg": 0.5278035402297974, + "step": 2616 + }, + { + "epoch": 0.02617, + "grad_norm": 0.46174725890159607, + "grad_norm_var": 0.0022321275036855585, + "learning_rate": 5e-05, + "loss": 0.1989, + "loss/crossentropy": 2.733360528945923, + "loss/hidden": 0.0, + "loss/logits": 0.19889701157808304, + "loss/reg": 0.5274414420127869, + "step": 2617 + }, + { + "epoch": 0.02618, + "grad_norm": 0.45418161153793335, + "grad_norm_var": 0.0022612076777431007, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.717996895313263, + "loss/hidden": 0.0, + "loss/logits": 0.21418941766023636, + "loss/reg": 0.5274969339370728, + "step": 2618 + }, + { + "epoch": 0.02619, + "grad_norm": 0.45905518531799316, + "grad_norm_var": 0.0020382576653902233, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.8800554275512695, + "loss/hidden": 0.0, + "loss/logits": 0.205328106880188, + "loss/reg": 0.5275300741195679, + "step": 2619 + }, + { + "epoch": 0.0262, + "grad_norm": 0.4228639006614685, + "grad_norm_var": 0.0023718702394115556, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.832355260848999, + "loss/hidden": 0.0, + "loss/logits": 0.2046058215200901, + "loss/reg": 0.5280609130859375, + "step": 2620 + }, + { + "epoch": 0.02621, + "grad_norm": 0.4382401406764984, + "grad_norm_var": 0.002377418576029422, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.8222222328186035, + "loss/hidden": 0.0, + "loss/logits": 0.19262420758605003, + "loss/reg": 0.527805507183075, + "step": 2621 + }, + { + "epoch": 0.02622, + "grad_norm": 0.4532676935195923, + "grad_norm_var": 0.002318924929760513, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.8229929208755493, + "loss/hidden": 0.0, + "loss/logits": 0.20146819576621056, + "loss/reg": 0.5277212262153625, + "step": 2622 + }, + { + "epoch": 0.02623, + "grad_norm": 0.4287426769733429, + "grad_norm_var": 0.0017988758557177817, + "learning_rate": 5e-05, + "loss": 0.2022, + "loss/crossentropy": 2.823437809944153, + "loss/hidden": 0.0, + "loss/logits": 0.2021707445383072, + "loss/reg": 0.527053713798523, + "step": 2623 + }, + { + "epoch": 0.02624, + "grad_norm": 0.43862003087997437, + "grad_norm_var": 0.0018440602233982658, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.801253616809845, + "loss/hidden": 0.0, + "loss/logits": 0.2060890980064869, + "loss/reg": 0.526616096496582, + "step": 2624 + }, + { + "epoch": 0.02625, + "grad_norm": 0.40915781259536743, + "grad_norm_var": 0.001919779835634113, + "learning_rate": 5e-05, + "loss": 0.2006, + "loss/crossentropy": 2.8015910983085632, + "loss/hidden": 0.0, + "loss/logits": 0.2006482109427452, + "loss/reg": 0.5263166427612305, + "step": 2625 + }, + { + "epoch": 0.02626, + "grad_norm": 0.48433706164360046, + "grad_norm_var": 0.0009348593180592483, + "learning_rate": 5e-05, + "loss": 0.2169, + "loss/crossentropy": 2.9656863808631897, + "loss/hidden": 0.0, + "loss/logits": 0.2169286049902439, + "loss/reg": 0.52571702003479, + "step": 2626 + }, + { + "epoch": 0.02627, + "grad_norm": 0.857741117477417, + "grad_norm_var": 0.011055679242502474, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.7929531931877136, + "loss/hidden": 0.0, + "loss/logits": 0.21189912036061287, + "loss/reg": 0.5252140760421753, + "step": 2627 + }, + { + "epoch": 0.02628, + "grad_norm": 0.4756178855895996, + "grad_norm_var": 0.01103366946046069, + "learning_rate": 5e-05, + "loss": 0.2303, + "loss/crossentropy": 2.8412453532218933, + "loss/hidden": 0.0, + "loss/logits": 0.230278130620718, + "loss/reg": 0.5248338580131531, + "step": 2628 + }, + { + "epoch": 0.02629, + "grad_norm": 0.5045449733734131, + "grad_norm_var": 0.010916758592587098, + "learning_rate": 5e-05, + "loss": 0.2273, + "loss/crossentropy": 2.7457079887390137, + "loss/hidden": 0.0, + "loss/logits": 0.22728293389081955, + "loss/reg": 0.5242133140563965, + "step": 2629 + }, + { + "epoch": 0.0263, + "grad_norm": 0.49859246611595154, + "grad_norm_var": 0.010917237354791083, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.74990576505661, + "loss/hidden": 0.0, + "loss/logits": 0.19872421026229858, + "loss/reg": 0.5239636898040771, + "step": 2630 + }, + { + "epoch": 0.02631, + "grad_norm": 0.580632209777832, + "grad_norm_var": 0.011234926908154226, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.8466989398002625, + "loss/hidden": 0.0, + "loss/logits": 0.20809010416269302, + "loss/reg": 0.5236205458641052, + "step": 2631 + }, + { + "epoch": 0.02632, + "grad_norm": 0.4662504196166992, + "grad_norm_var": 0.011269045431813689, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.781718671321869, + "loss/hidden": 0.0, + "loss/logits": 0.1977783851325512, + "loss/reg": 0.5231173038482666, + "step": 2632 + }, + { + "epoch": 0.02633, + "grad_norm": 0.4562200605869293, + "grad_norm_var": 0.011291480803711418, + "learning_rate": 5e-05, + "loss": 0.2173, + "loss/crossentropy": 2.7935692071914673, + "loss/hidden": 0.0, + "loss/logits": 0.21725642308592796, + "loss/reg": 0.5224733352661133, + "step": 2633 + }, + { + "epoch": 0.02634, + "grad_norm": 0.3951549828052521, + "grad_norm_var": 0.011785267661664382, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.7473865151405334, + "loss/hidden": 0.0, + "loss/logits": 0.19925842806696892, + "loss/reg": 0.5222446322441101, + "step": 2634 + }, + { + "epoch": 0.02635, + "grad_norm": 0.4547794461250305, + "grad_norm_var": 0.01180152344116235, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.749935567378998, + "loss/hidden": 0.0, + "loss/logits": 0.20071029663085938, + "loss/reg": 0.5219306945800781, + "step": 2635 + }, + { + "epoch": 0.02636, + "grad_norm": 0.46452295780181885, + "grad_norm_var": 0.01156319977269858, + "learning_rate": 5e-05, + "loss": 0.207, + "loss/crossentropy": 2.7920570373535156, + "loss/hidden": 0.0, + "loss/logits": 0.20704756677150726, + "loss/reg": 0.5213372707366943, + "step": 2636 + }, + { + "epoch": 0.02637, + "grad_norm": 0.43983954191207886, + "grad_norm_var": 0.011552769221888677, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.8299378156661987, + "loss/hidden": 0.0, + "loss/logits": 0.20782317221164703, + "loss/reg": 0.5211907029151917, + "step": 2637 + }, + { + "epoch": 0.02638, + "grad_norm": 0.4226306080818176, + "grad_norm_var": 0.011753318659268248, + "learning_rate": 5e-05, + "loss": 0.1968, + "loss/crossentropy": 2.626910448074341, + "loss/hidden": 0.0, + "loss/logits": 0.19679496809840202, + "loss/reg": 0.5207825303077698, + "step": 2638 + }, + { + "epoch": 0.02639, + "grad_norm": 0.4697932004928589, + "grad_norm_var": 0.011544774305078423, + "learning_rate": 5e-05, + "loss": 0.2059, + "loss/crossentropy": 2.6778536438941956, + "loss/hidden": 0.0, + "loss/logits": 0.20591752603650093, + "loss/reg": 0.5202928781509399, + "step": 2639 + }, + { + "epoch": 0.0264, + "grad_norm": 0.5112264752388, + "grad_norm_var": 0.011389901160338178, + "learning_rate": 5e-05, + "loss": 0.2192, + "loss/crossentropy": 2.9026376008987427, + "loss/hidden": 0.0, + "loss/logits": 0.2192208170890808, + "loss/reg": 0.5200670957565308, + "step": 2640 + }, + { + "epoch": 0.02641, + "grad_norm": 0.6163347363471985, + "grad_norm_var": 0.011751270736131744, + "learning_rate": 5e-05, + "loss": 0.2369, + "loss/crossentropy": 2.6764743328094482, + "loss/hidden": 0.0, + "loss/logits": 0.23691648244857788, + "loss/reg": 0.5194461941719055, + "step": 2641 + }, + { + "epoch": 0.02642, + "grad_norm": 0.46747079491615295, + "grad_norm_var": 0.011818078321830624, + "learning_rate": 5e-05, + "loss": 0.221, + "loss/crossentropy": 2.682372212409973, + "loss/hidden": 0.0, + "loss/logits": 0.22104769200086594, + "loss/reg": 0.5190898180007935, + "step": 2642 + }, + { + "epoch": 0.02643, + "grad_norm": 0.4604860842227936, + "grad_norm_var": 0.0030020183287800097, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.8597342371940613, + "loss/hidden": 0.0, + "loss/logits": 0.19849200546741486, + "loss/reg": 0.5189164280891418, + "step": 2643 + }, + { + "epoch": 0.02644, + "grad_norm": 0.42130711674690247, + "grad_norm_var": 0.0032199590440307774, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.805720031261444, + "loss/hidden": 0.0, + "loss/logits": 0.19433778524398804, + "loss/reg": 0.5186786651611328, + "step": 2644 + }, + { + "epoch": 0.02645, + "grad_norm": 0.4330293834209442, + "grad_norm_var": 0.0032756419315143467, + "learning_rate": 5e-05, + "loss": 0.2219, + "loss/crossentropy": 2.752020239830017, + "loss/hidden": 0.0, + "loss/logits": 0.2219337411224842, + "loss/reg": 0.5182340145111084, + "step": 2645 + }, + { + "epoch": 0.02646, + "grad_norm": 0.5088123083114624, + "grad_norm_var": 0.0033178718345673835, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.7935571670532227, + "loss/hidden": 0.0, + "loss/logits": 0.2087486796081066, + "loss/reg": 0.518045961856842, + "step": 2646 + }, + { + "epoch": 0.02647, + "grad_norm": 0.4569924771785736, + "grad_norm_var": 0.002499451982306496, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.8601977825164795, + "loss/hidden": 0.0, + "loss/logits": 0.20328442007303238, + "loss/reg": 0.5178834795951843, + "step": 2647 + }, + { + "epoch": 0.02648, + "grad_norm": 0.47098761796951294, + "grad_norm_var": 0.0025014528607127163, + "learning_rate": 5e-05, + "loss": 0.218, + "loss/crossentropy": 2.964486539363861, + "loss/hidden": 0.0, + "loss/logits": 0.21797813475131989, + "loss/reg": 0.5175227522850037, + "step": 2648 + }, + { + "epoch": 0.02649, + "grad_norm": 0.4377838671207428, + "grad_norm_var": 0.002545751695702825, + "learning_rate": 5e-05, + "loss": 0.1913, + "loss/crossentropy": 2.796652615070343, + "loss/hidden": 0.0, + "loss/logits": 0.19125105440616608, + "loss/reg": 0.5174310803413391, + "step": 2649 + }, + { + "epoch": 0.0265, + "grad_norm": 0.42444074153900146, + "grad_norm_var": 0.0023287860329807676, + "learning_rate": 5e-05, + "loss": 0.2003, + "loss/crossentropy": 2.866143226623535, + "loss/hidden": 0.0, + "loss/logits": 0.20031168684363365, + "loss/reg": 0.517005980014801, + "step": 2650 + }, + { + "epoch": 0.02651, + "grad_norm": 0.44882041215896606, + "grad_norm_var": 0.002340140921261839, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.7818318605422974, + "loss/hidden": 0.0, + "loss/logits": 0.20926685631275177, + "loss/reg": 0.51676344871521, + "step": 2651 + }, + { + "epoch": 0.02652, + "grad_norm": 0.43426263332366943, + "grad_norm_var": 0.002402947090216149, + "learning_rate": 5e-05, + "loss": 0.2196, + "loss/crossentropy": 2.8635306358337402, + "loss/hidden": 0.0, + "loss/logits": 0.21962060034275055, + "loss/reg": 0.5161485075950623, + "step": 2652 + }, + { + "epoch": 0.02653, + "grad_norm": 0.4355899691581726, + "grad_norm_var": 0.002417773039445814, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.862092673778534, + "loss/hidden": 0.0, + "loss/logits": 0.20897655189037323, + "loss/reg": 0.5157283544540405, + "step": 2653 + }, + { + "epoch": 0.02654, + "grad_norm": 0.421115905046463, + "grad_norm_var": 0.0024262205252687926, + "learning_rate": 5e-05, + "loss": 0.2056, + "loss/crossentropy": 2.748015880584717, + "loss/hidden": 0.0, + "loss/logits": 0.20555206760764122, + "loss/reg": 0.5153583884239197, + "step": 2654 + }, + { + "epoch": 0.02655, + "grad_norm": 0.4659707248210907, + "grad_norm_var": 0.002424004479490982, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.685142159461975, + "loss/hidden": 0.0, + "loss/logits": 0.20658260211348534, + "loss/reg": 0.5148130655288696, + "step": 2655 + }, + { + "epoch": 0.02656, + "grad_norm": 0.4709790349006653, + "grad_norm_var": 0.0022686706393573175, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.982099175453186, + "loss/hidden": 0.0, + "loss/logits": 0.20985327288508415, + "loss/reg": 0.5144376158714294, + "step": 2656 + }, + { + "epoch": 0.02657, + "grad_norm": 0.46308234333992004, + "grad_norm_var": 0.000560444083555256, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.8071009516716003, + "loss/hidden": 0.0, + "loss/logits": 0.2010471671819687, + "loss/reg": 0.5138293504714966, + "step": 2657 + }, + { + "epoch": 0.02658, + "grad_norm": 0.4737538993358612, + "grad_norm_var": 0.0005764411076125501, + "learning_rate": 5e-05, + "loss": 0.2349, + "loss/crossentropy": 2.867660105228424, + "loss/hidden": 0.0, + "loss/logits": 0.23493582010269165, + "loss/reg": 0.513386607170105, + "step": 2658 + }, + { + "epoch": 0.02659, + "grad_norm": 0.4169728457927704, + "grad_norm_var": 0.0006438817171652844, + "learning_rate": 5e-05, + "loss": 0.1883, + "loss/crossentropy": 2.8713210821151733, + "loss/hidden": 0.0, + "loss/logits": 0.1882842220366001, + "loss/reg": 0.5130794644355774, + "step": 2659 + }, + { + "epoch": 0.0266, + "grad_norm": 0.46998944878578186, + "grad_norm_var": 0.0006122909722079369, + "learning_rate": 5e-05, + "loss": 0.223, + "loss/crossentropy": 2.6248444318771362, + "loss/hidden": 0.0, + "loss/logits": 0.22303304821252823, + "loss/reg": 0.5127474069595337, + "step": 2660 + }, + { + "epoch": 0.02661, + "grad_norm": 0.45122289657592773, + "grad_norm_var": 0.0005868712793243821, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.962437629699707, + "loss/hidden": 0.0, + "loss/logits": 0.2075929418206215, + "loss/reg": 0.51222825050354, + "step": 2661 + }, + { + "epoch": 0.02662, + "grad_norm": 0.41702502965927124, + "grad_norm_var": 0.0004325040324381598, + "learning_rate": 5e-05, + "loss": 0.198, + "loss/crossentropy": 2.8591843843460083, + "loss/hidden": 0.0, + "loss/logits": 0.19804061204195023, + "loss/reg": 0.5116597414016724, + "step": 2662 + }, + { + "epoch": 0.02663, + "grad_norm": 0.48242413997650146, + "grad_norm_var": 0.0005053291372923578, + "learning_rate": 5e-05, + "loss": 0.2105, + "loss/crossentropy": 2.9002949595451355, + "loss/hidden": 0.0, + "loss/logits": 0.21049673110246658, + "loss/reg": 0.5110887289047241, + "step": 2663 + }, + { + "epoch": 0.02664, + "grad_norm": 0.4562484622001648, + "grad_norm_var": 0.0004757480557940045, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.985000431537628, + "loss/hidden": 0.0, + "loss/logits": 0.203524362295866, + "loss/reg": 0.510291576385498, + "step": 2664 + }, + { + "epoch": 0.02665, + "grad_norm": 0.4563778042793274, + "grad_norm_var": 0.00047176803112011594, + "learning_rate": 5e-05, + "loss": 0.2034, + "loss/crossentropy": 2.8551156520843506, + "loss/hidden": 0.0, + "loss/logits": 0.20335150510072708, + "loss/reg": 0.5098468661308289, + "step": 2665 + }, + { + "epoch": 0.02666, + "grad_norm": 0.4413127899169922, + "grad_norm_var": 0.0004337097426749518, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.794453203678131, + "loss/hidden": 0.0, + "loss/logits": 0.2031778134405613, + "loss/reg": 0.5093777775764465, + "step": 2666 + }, + { + "epoch": 0.02667, + "grad_norm": 0.46010512113571167, + "grad_norm_var": 0.0004394097970649824, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.8909578919410706, + "loss/hidden": 0.0, + "loss/logits": 0.20428812503814697, + "loss/reg": 0.5085967779159546, + "step": 2667 + }, + { + "epoch": 0.02668, + "grad_norm": 0.4390087425708771, + "grad_norm_var": 0.0004302088672399784, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.909857213497162, + "loss/hidden": 0.0, + "loss/logits": 0.2060927376151085, + "loss/reg": 0.5081804394721985, + "step": 2668 + }, + { + "epoch": 0.02669, + "grad_norm": 0.40289121866226196, + "grad_norm_var": 0.0005656308186297883, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.7919223308563232, + "loss/hidden": 0.0, + "loss/logits": 0.1897556632757187, + "loss/reg": 0.5074996948242188, + "step": 2669 + }, + { + "epoch": 0.0267, + "grad_norm": 0.4173761308193207, + "grad_norm_var": 0.0005805485982932882, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.948771119117737, + "loss/hidden": 0.0, + "loss/logits": 0.19749267399311066, + "loss/reg": 0.5070592164993286, + "step": 2670 + }, + { + "epoch": 0.02671, + "grad_norm": 0.4362142086029053, + "grad_norm_var": 0.0005687409416929393, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.885682463645935, + "loss/hidden": 0.0, + "loss/logits": 0.20376916229724884, + "loss/reg": 0.5064482092857361, + "step": 2671 + }, + { + "epoch": 0.02672, + "grad_norm": 0.4874463677406311, + "grad_norm_var": 0.0006379291868861141, + "learning_rate": 5e-05, + "loss": 0.2273, + "loss/crossentropy": 2.7172476649284363, + "loss/hidden": 0.0, + "loss/logits": 0.22731425240635872, + "loss/reg": 0.5062919855117798, + "step": 2672 + }, + { + "epoch": 0.02673, + "grad_norm": 0.42697882652282715, + "grad_norm_var": 0.0006478306762847812, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.7278050780296326, + "loss/hidden": 0.0, + "loss/logits": 0.20042840763926506, + "loss/reg": 0.5059646964073181, + "step": 2673 + }, + { + "epoch": 0.02674, + "grad_norm": 0.42211660742759705, + "grad_norm_var": 0.0006231158774990402, + "learning_rate": 5e-05, + "loss": 0.2068, + "loss/crossentropy": 2.854327380657196, + "loss/hidden": 0.0, + "loss/logits": 0.20676289498806, + "loss/reg": 0.5055683851242065, + "step": 2674 + }, + { + "epoch": 0.02675, + "grad_norm": 0.7901020050048828, + "grad_norm_var": 0.008043173488264681, + "learning_rate": 5e-05, + "loss": 0.2441, + "loss/crossentropy": 2.9003363251686096, + "loss/hidden": 0.0, + "loss/logits": 0.2441476471722126, + "loss/reg": 0.5050994753837585, + "step": 2675 + }, + { + "epoch": 0.02676, + "grad_norm": 0.4878675043582916, + "grad_norm_var": 0.008072534737743808, + "learning_rate": 5e-05, + "loss": 0.2394, + "loss/crossentropy": 2.716213822364807, + "loss/hidden": 0.0, + "loss/logits": 0.23937305063009262, + "loss/reg": 0.5046854615211487, + "step": 2676 + }, + { + "epoch": 0.02677, + "grad_norm": 0.3972325026988983, + "grad_norm_var": 0.008369517656648636, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.8334112763404846, + "loss/hidden": 0.0, + "loss/logits": 0.20471908524632454, + "loss/reg": 0.5039368867874146, + "step": 2677 + }, + { + "epoch": 0.02678, + "grad_norm": 0.5002397298812866, + "grad_norm_var": 0.00828337883668372, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.9182010293006897, + "loss/hidden": 0.0, + "loss/logits": 0.2095079980790615, + "loss/reg": 0.5028918981552124, + "step": 2678 + }, + { + "epoch": 0.02679, + "grad_norm": 0.4768761396408081, + "grad_norm_var": 0.008275369647450707, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.9058250784873962, + "loss/hidden": 0.0, + "loss/logits": 0.21640894189476967, + "loss/reg": 0.5023829936981201, + "step": 2679 + }, + { + "epoch": 0.0268, + "grad_norm": 0.44214802980422974, + "grad_norm_var": 0.008311110954558028, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.900021195411682, + "loss/hidden": 0.0, + "loss/logits": 0.20430920273065567, + "loss/reg": 0.5014623999595642, + "step": 2680 + }, + { + "epoch": 0.02681, + "grad_norm": 0.4228157103061676, + "grad_norm_var": 0.008432483950598464, + "learning_rate": 5e-05, + "loss": 0.1883, + "loss/crossentropy": 2.78199964761734, + "loss/hidden": 0.0, + "loss/logits": 0.18827009573578835, + "loss/reg": 0.5011174082756042, + "step": 2681 + }, + { + "epoch": 0.02682, + "grad_norm": 0.4369513690471649, + "grad_norm_var": 0.00844783752056311, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.882530629634857, + "loss/hidden": 0.0, + "loss/logits": 0.1993602067232132, + "loss/reg": 0.5004196166992188, + "step": 2682 + }, + { + "epoch": 0.02683, + "grad_norm": 0.6355071663856506, + "grad_norm_var": 0.010246917389360333, + "learning_rate": 5e-05, + "loss": 0.2513, + "loss/crossentropy": 2.716792106628418, + "loss/hidden": 0.0, + "loss/logits": 0.2512522153556347, + "loss/reg": 0.4996320307254791, + "step": 2683 + }, + { + "epoch": 0.02684, + "grad_norm": 0.4017265737056732, + "grad_norm_var": 0.010519465156970987, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.7997671961784363, + "loss/hidden": 0.0, + "loss/logits": 0.20533058047294617, + "loss/reg": 0.49912288784980774, + "step": 2684 + }, + { + "epoch": 0.02685, + "grad_norm": 0.5914204716682434, + "grad_norm_var": 0.010952672082235947, + "learning_rate": 5e-05, + "loss": 0.2287, + "loss/crossentropy": 2.8028494715690613, + "loss/hidden": 0.0, + "loss/logits": 0.22867391258478165, + "loss/reg": 0.49832239747047424, + "step": 2685 + }, + { + "epoch": 0.02686, + "grad_norm": 0.42868176102638245, + "grad_norm_var": 0.010857496668430585, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.762281060218811, + "loss/hidden": 0.0, + "loss/logits": 0.2072487585246563, + "loss/reg": 0.49790605902671814, + "step": 2686 + }, + { + "epoch": 0.02687, + "grad_norm": 0.5279750823974609, + "grad_norm_var": 0.010768266037480153, + "learning_rate": 5e-05, + "loss": 0.2246, + "loss/crossentropy": 2.8855302333831787, + "loss/hidden": 0.0, + "loss/logits": 0.22463449835777283, + "loss/reg": 0.4971890449523926, + "step": 2687 + }, + { + "epoch": 0.02688, + "grad_norm": 0.5609726309776306, + "grad_norm_var": 0.011059002981293567, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 2.8281803131103516, + "loss/hidden": 0.0, + "loss/logits": 0.21702812984585762, + "loss/reg": 0.4968169629573822, + "step": 2688 + }, + { + "epoch": 0.02689, + "grad_norm": 0.439956933259964, + "grad_norm_var": 0.010948622551934288, + "learning_rate": 5e-05, + "loss": 0.2327, + "loss/crossentropy": 2.893004536628723, + "loss/hidden": 0.0, + "loss/logits": 0.23273225873708725, + "loss/reg": 0.49707284569740295, + "step": 2689 + }, + { + "epoch": 0.0269, + "grad_norm": 0.4424091577529907, + "grad_norm_var": 0.010769958420850278, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.886108100414276, + "loss/hidden": 0.0, + "loss/logits": 0.202349241822958, + "loss/reg": 0.496567964553833, + "step": 2690 + }, + { + "epoch": 0.02691, + "grad_norm": 0.421037882566452, + "grad_norm_var": 0.004954835270687852, + "learning_rate": 5e-05, + "loss": 0.2054, + "loss/crossentropy": 2.80495947599411, + "loss/hidden": 0.0, + "loss/logits": 0.20537864416837692, + "loss/reg": 0.49587953090667725, + "step": 2691 + }, + { + "epoch": 0.02692, + "grad_norm": 0.42684081196784973, + "grad_norm_var": 0.005089927399211014, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.829827308654785, + "loss/hidden": 0.0, + "loss/logits": 0.2000039741396904, + "loss/reg": 0.49548622965812683, + "step": 2692 + }, + { + "epoch": 0.02693, + "grad_norm": 0.4825857877731323, + "grad_norm_var": 0.004693801187267606, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.843507766723633, + "loss/hidden": 0.0, + "loss/logits": 0.20755332335829735, + "loss/reg": 0.49538809061050415, + "step": 2693 + }, + { + "epoch": 0.02694, + "grad_norm": 0.4072496294975281, + "grad_norm_var": 0.0049508686876926595, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.811407446861267, + "loss/hidden": 0.0, + "loss/logits": 0.20451678335666656, + "loss/reg": 0.4953059256076813, + "step": 2694 + }, + { + "epoch": 0.02695, + "grad_norm": 0.5818904638290405, + "grad_norm_var": 0.005714384544861325, + "learning_rate": 5e-05, + "loss": 0.2367, + "loss/crossentropy": 2.8804036378860474, + "loss/hidden": 0.0, + "loss/logits": 0.2366720587015152, + "loss/reg": 0.49509456753730774, + "step": 2695 + }, + { + "epoch": 0.02696, + "grad_norm": 0.45094430446624756, + "grad_norm_var": 0.005677012917727083, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.8479952216148376, + "loss/hidden": 0.0, + "loss/logits": 0.20764769986271858, + "loss/reg": 0.4949437379837036, + "step": 2696 + }, + { + "epoch": 0.02697, + "grad_norm": 0.5007409453392029, + "grad_norm_var": 0.005476046912691841, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.676621735095978, + "loss/hidden": 0.0, + "loss/logits": 0.2048255279660225, + "loss/reg": 0.49491044878959656, + "step": 2697 + }, + { + "epoch": 0.02698, + "grad_norm": 0.44982513785362244, + "grad_norm_var": 0.005406408856451354, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.7590322494506836, + "loss/hidden": 0.0, + "loss/logits": 0.21019677817821503, + "loss/reg": 0.4948073923587799, + "step": 2698 + }, + { + "epoch": 0.02699, + "grad_norm": 0.4295031428337097, + "grad_norm_var": 0.003907180678697234, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.843106210231781, + "loss/hidden": 0.0, + "loss/logits": 0.21552268043160439, + "loss/reg": 0.49498945474624634, + "step": 2699 + }, + { + "epoch": 0.027, + "grad_norm": 0.5483637452125549, + "grad_norm_var": 0.00388719311412216, + "learning_rate": 5e-05, + "loss": 0.2393, + "loss/crossentropy": 2.916628658771515, + "loss/hidden": 0.0, + "loss/logits": 0.23932239413261414, + "loss/reg": 0.49485525488853455, + "step": 2700 + }, + { + "epoch": 0.02701, + "grad_norm": 0.7517091631889343, + "grad_norm_var": 0.007860342169022216, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 3.0502232909202576, + "loss/hidden": 0.0, + "loss/logits": 0.20510347560048103, + "loss/reg": 0.49471724033355713, + "step": 2701 + }, + { + "epoch": 0.02702, + "grad_norm": 0.472533255815506, + "grad_norm_var": 0.007618102056275181, + "learning_rate": 5e-05, + "loss": 0.2216, + "loss/crossentropy": 2.751072585582733, + "loss/hidden": 0.0, + "loss/logits": 0.22163891792297363, + "loss/reg": 0.4943944811820984, + "step": 2702 + }, + { + "epoch": 0.02703, + "grad_norm": 0.43951353430747986, + "grad_norm_var": 0.007699485476738607, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.7923761010169983, + "loss/hidden": 0.0, + "loss/logits": 0.19950497150421143, + "loss/reg": 0.4943939447402954, + "step": 2703 + }, + { + "epoch": 0.02704, + "grad_norm": 0.45529529452323914, + "grad_norm_var": 0.00736756569212957, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.9060553312301636, + "loss/hidden": 0.0, + "loss/logits": 0.21256953105330467, + "loss/reg": 0.4941692352294922, + "step": 2704 + }, + { + "epoch": 0.02705, + "grad_norm": 0.4454949200153351, + "grad_norm_var": 0.007338973373068565, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.7437862753868103, + "loss/hidden": 0.0, + "loss/logits": 0.20176933705806732, + "loss/reg": 0.4937989115715027, + "step": 2705 + }, + { + "epoch": 0.02706, + "grad_norm": 0.4485436975955963, + "grad_norm_var": 0.007309252467952243, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.923901617527008, + "loss/hidden": 0.0, + "loss/logits": 0.2071411944925785, + "loss/reg": 0.4936811327934265, + "step": 2706 + }, + { + "epoch": 0.02707, + "grad_norm": 0.5375815033912659, + "grad_norm_var": 0.007210784335841292, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 3.0090640783309937, + "loss/hidden": 0.0, + "loss/logits": 0.20996586605906487, + "loss/reg": 0.49395912885665894, + "step": 2707 + }, + { + "epoch": 0.02708, + "grad_norm": 0.4422512352466583, + "grad_norm_var": 0.007097314285126392, + "learning_rate": 5e-05, + "loss": 0.203, + "loss/crossentropy": 2.880916714668274, + "loss/hidden": 0.0, + "loss/logits": 0.20301148667931557, + "loss/reg": 0.4938308894634247, + "step": 2708 + }, + { + "epoch": 0.02709, + "grad_norm": 0.47793814539909363, + "grad_norm_var": 0.007103414721416594, + "learning_rate": 5e-05, + "loss": 0.2117, + "loss/crossentropy": 2.897944927215576, + "loss/hidden": 0.0, + "loss/logits": 0.2116544097661972, + "loss/reg": 0.4941149950027466, + "step": 2709 + }, + { + "epoch": 0.0271, + "grad_norm": 0.49204546213150024, + "grad_norm_var": 0.006617665114071218, + "learning_rate": 5e-05, + "loss": 0.2295, + "loss/crossentropy": 2.8218055963516235, + "loss/hidden": 0.0, + "loss/logits": 0.22950421273708344, + "loss/reg": 0.49380600452423096, + "step": 2710 + }, + { + "epoch": 0.02711, + "grad_norm": 0.4426657259464264, + "grad_norm_var": 0.006221004628452227, + "learning_rate": 5e-05, + "loss": 0.2235, + "loss/crossentropy": 2.8301597237586975, + "loss/hidden": 0.0, + "loss/logits": 0.2234809249639511, + "loss/reg": 0.4936286509037018, + "step": 2711 + }, + { + "epoch": 0.02712, + "grad_norm": 0.4114704132080078, + "grad_norm_var": 0.006505839848202474, + "learning_rate": 5e-05, + "loss": 0.2068, + "loss/crossentropy": 2.80025452375412, + "loss/hidden": 0.0, + "loss/logits": 0.2068319246172905, + "loss/reg": 0.49337029457092285, + "step": 2712 + }, + { + "epoch": 0.02713, + "grad_norm": 0.8172334432601929, + "grad_norm_var": 0.01346886875388436, + "learning_rate": 5e-05, + "loss": 0.216, + "loss/crossentropy": 2.8597967624664307, + "loss/hidden": 0.0, + "loss/logits": 0.21595800668001175, + "loss/reg": 0.49330413341522217, + "step": 2713 + }, + { + "epoch": 0.02714, + "grad_norm": 0.45851320028305054, + "grad_norm_var": 0.01341097692531954, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 3.0621681213378906, + "loss/hidden": 0.0, + "loss/logits": 0.19387879222631454, + "loss/reg": 0.4931429922580719, + "step": 2714 + }, + { + "epoch": 0.02715, + "grad_norm": 0.49427828192710876, + "grad_norm_var": 0.013026216888396407, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.8689560294151306, + "loss/hidden": 0.0, + "loss/logits": 0.21119757741689682, + "loss/reg": 0.4933239221572876, + "step": 2715 + }, + { + "epoch": 0.02716, + "grad_norm": 0.5070939064025879, + "grad_norm_var": 0.012913115137429746, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.956903040409088, + "loss/hidden": 0.0, + "loss/logits": 0.2086162231862545, + "loss/reg": 0.49291473627090454, + "step": 2716 + }, + { + "epoch": 0.02717, + "grad_norm": 0.46927252411842346, + "grad_norm_var": 0.008641471303578226, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.756330192089081, + "loss/hidden": 0.0, + "loss/logits": 0.19833580404520035, + "loss/reg": 0.4927137494087219, + "step": 2717 + }, + { + "epoch": 0.02718, + "grad_norm": 0.4514217674732208, + "grad_norm_var": 0.008713519291420665, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.9121298789978027, + "loss/hidden": 0.0, + "loss/logits": 0.20458823069930077, + "loss/reg": 0.49226832389831543, + "step": 2718 + }, + { + "epoch": 0.02719, + "grad_norm": 0.494526743888855, + "grad_norm_var": 0.008554990735303535, + "learning_rate": 5e-05, + "loss": 0.2457, + "loss/crossentropy": 2.8027809858322144, + "loss/hidden": 0.0, + "loss/logits": 0.24570264294743538, + "loss/reg": 0.49154040217399597, + "step": 2719 + }, + { + "epoch": 0.0272, + "grad_norm": 0.41557425260543823, + "grad_norm_var": 0.008839264092461422, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.7638903856277466, + "loss/hidden": 0.0, + "loss/logits": 0.18144457787275314, + "loss/reg": 0.4913667142391205, + "step": 2720 + }, + { + "epoch": 0.02721, + "grad_norm": 0.40607714653015137, + "grad_norm_var": 0.009159080133180893, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.7841968536376953, + "loss/hidden": 0.0, + "loss/logits": 0.19746608659625053, + "loss/reg": 0.4909701943397522, + "step": 2721 + }, + { + "epoch": 0.02722, + "grad_norm": 0.4192264676094055, + "grad_norm_var": 0.009356890205550528, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.9196910858154297, + "loss/hidden": 0.0, + "loss/logits": 0.19638753309845924, + "loss/reg": 0.4903043210506439, + "step": 2722 + }, + { + "epoch": 0.02723, + "grad_norm": 0.48189038038253784, + "grad_norm_var": 0.009149695831990398, + "learning_rate": 5e-05, + "loss": 0.2309, + "loss/crossentropy": 2.804868996143341, + "loss/hidden": 0.0, + "loss/logits": 0.2308640331029892, + "loss/reg": 0.4893714487552643, + "step": 2723 + }, + { + "epoch": 0.02724, + "grad_norm": 0.4295250177383423, + "grad_norm_var": 0.009224028179889739, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.8346243500709534, + "loss/hidden": 0.0, + "loss/logits": 0.20738471299409866, + "loss/reg": 0.48909807205200195, + "step": 2724 + }, + { + "epoch": 0.02725, + "grad_norm": 0.4399339258670807, + "grad_norm_var": 0.00932118412993885, + "learning_rate": 5e-05, + "loss": 0.2181, + "loss/crossentropy": 2.8263626098632812, + "loss/hidden": 0.0, + "loss/logits": 0.21813852339982986, + "loss/reg": 0.48882365226745605, + "step": 2725 + }, + { + "epoch": 0.02726, + "grad_norm": 0.4290226995944977, + "grad_norm_var": 0.009442341181151075, + "learning_rate": 5e-05, + "loss": 0.207, + "loss/crossentropy": 2.823870301246643, + "loss/hidden": 0.0, + "loss/logits": 0.20698799192905426, + "loss/reg": 0.48837563395500183, + "step": 2726 + }, + { + "epoch": 0.02727, + "grad_norm": 0.8225472569465637, + "grad_norm_var": 0.01692612510768792, + "learning_rate": 5e-05, + "loss": 0.2315, + "loss/crossentropy": 2.8211364150047302, + "loss/hidden": 0.0, + "loss/logits": 0.23154771327972412, + "loss/reg": 0.48784375190734863, + "step": 2727 + }, + { + "epoch": 0.02728, + "grad_norm": 0.5462093949317932, + "grad_norm_var": 0.016529163347392677, + "learning_rate": 5e-05, + "loss": 0.2133, + "loss/crossentropy": 2.8850455284118652, + "loss/hidden": 0.0, + "loss/logits": 0.21331286802887917, + "loss/reg": 0.48744627833366394, + "step": 2728 + }, + { + "epoch": 0.02729, + "grad_norm": 0.47115883231163025, + "grad_norm_var": 0.009613931905116961, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.7841280698776245, + "loss/hidden": 0.0, + "loss/logits": 0.20736776292324066, + "loss/reg": 0.4869265556335449, + "step": 2729 + }, + { + "epoch": 0.0273, + "grad_norm": 0.4366164803504944, + "grad_norm_var": 0.009716898674810286, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.9152287244796753, + "loss/hidden": 0.0, + "loss/logits": 0.2024410106241703, + "loss/reg": 0.48651543259620667, + "step": 2730 + }, + { + "epoch": 0.02731, + "grad_norm": 0.47324204444885254, + "grad_norm_var": 0.009710534222329589, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.813485562801361, + "loss/hidden": 0.0, + "loss/logits": 0.2039998322725296, + "loss/reg": 0.4864475429058075, + "step": 2731 + }, + { + "epoch": 0.02732, + "grad_norm": 0.4294508993625641, + "grad_norm_var": 0.009815455088653675, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.712465524673462, + "loss/hidden": 0.0, + "loss/logits": 0.1885991059243679, + "loss/reg": 0.48705872893333435, + "step": 2732 + }, + { + "epoch": 0.02733, + "grad_norm": 0.4322575032711029, + "grad_norm_var": 0.009934195606300718, + "learning_rate": 5e-05, + "loss": 0.1979, + "loss/crossentropy": 2.7683774828910828, + "loss/hidden": 0.0, + "loss/logits": 0.1979178860783577, + "loss/reg": 0.48735907673835754, + "step": 2733 + }, + { + "epoch": 0.02734, + "grad_norm": 0.5307685136795044, + "grad_norm_var": 0.010092339002801771, + "learning_rate": 5e-05, + "loss": 0.2189, + "loss/crossentropy": 2.9596521258354187, + "loss/hidden": 0.0, + "loss/logits": 0.21886243671178818, + "loss/reg": 0.487589567899704, + "step": 2734 + }, + { + "epoch": 0.02735, + "grad_norm": 0.42579784989356995, + "grad_norm_var": 0.010241862355615305, + "learning_rate": 5e-05, + "loss": 0.1971, + "loss/crossentropy": 2.8340309262275696, + "loss/hidden": 0.0, + "loss/logits": 0.19712654501199722, + "loss/reg": 0.48765915632247925, + "step": 2735 + }, + { + "epoch": 0.02736, + "grad_norm": 0.43546414375305176, + "grad_norm_var": 0.010110765358108271, + "learning_rate": 5e-05, + "loss": 0.2041, + "loss/crossentropy": 2.713832139968872, + "loss/hidden": 0.0, + "loss/logits": 0.20410532131791115, + "loss/reg": 0.4877520501613617, + "step": 2736 + }, + { + "epoch": 0.02737, + "grad_norm": 0.44803476333618164, + "grad_norm_var": 0.009832001719103664, + "learning_rate": 5e-05, + "loss": 0.2136, + "loss/crossentropy": 2.819288194179535, + "loss/hidden": 0.0, + "loss/logits": 0.21360984817147255, + "loss/reg": 0.48735707998275757, + "step": 2737 + }, + { + "epoch": 0.02738, + "grad_norm": 5.030335903167725, + "grad_norm_var": 1.3024719210001696, + "learning_rate": 5e-05, + "loss": 0.3771, + "loss/crossentropy": 3.019712209701538, + "loss/hidden": 0.0, + "loss/logits": 0.37709441408514977, + "loss/reg": 0.4874415993690491, + "step": 2738 + }, + { + "epoch": 0.02739, + "grad_norm": 0.46844637393951416, + "grad_norm_var": 1.3029931943766202, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.8346911668777466, + "loss/hidden": 0.0, + "loss/logits": 0.20099680870771408, + "loss/reg": 0.4877273440361023, + "step": 2739 + }, + { + "epoch": 0.0274, + "grad_norm": 0.46742016077041626, + "grad_norm_var": 1.301385114693839, + "learning_rate": 5e-05, + "loss": 0.2149, + "loss/crossentropy": 2.651547133922577, + "loss/hidden": 0.0, + "loss/logits": 0.21491020917892456, + "loss/reg": 0.48811763525009155, + "step": 2740 + }, + { + "epoch": 0.02741, + "grad_norm": 0.4408324956893921, + "grad_norm_var": 1.3013458694722078, + "learning_rate": 5e-05, + "loss": 0.1984, + "loss/crossentropy": 2.8628365993499756, + "loss/hidden": 0.0, + "loss/logits": 0.19844652339816093, + "loss/reg": 0.48821038007736206, + "step": 2741 + }, + { + "epoch": 0.02742, + "grad_norm": 0.4630972743034363, + "grad_norm_var": 1.2998784811885697, + "learning_rate": 5e-05, + "loss": 0.2162, + "loss/crossentropy": 2.7797998785972595, + "loss/hidden": 0.0, + "loss/logits": 0.21620075777173042, + "loss/reg": 0.4880053400993347, + "step": 2742 + }, + { + "epoch": 0.02743, + "grad_norm": 1.7904237508773804, + "grad_norm_var": 1.3651952224984893, + "learning_rate": 5e-05, + "loss": 0.238, + "loss/crossentropy": 2.8886029720306396, + "loss/hidden": 0.0, + "loss/logits": 0.23804711550474167, + "loss/reg": 0.4883866608142853, + "step": 2743 + }, + { + "epoch": 0.02744, + "grad_norm": 0.4395548105239868, + "grad_norm_var": 1.3699503419957013, + "learning_rate": 5e-05, + "loss": 0.1969, + "loss/crossentropy": 2.739936649799347, + "loss/hidden": 0.0, + "loss/logits": 0.19692730531096458, + "loss/reg": 0.48859313130378723, + "step": 2744 + }, + { + "epoch": 0.02745, + "grad_norm": 0.5783601999282837, + "grad_norm_var": 1.3656262406296642, + "learning_rate": 5e-05, + "loss": 0.2579, + "loss/crossentropy": 2.74318128824234, + "loss/hidden": 0.0, + "loss/logits": 0.2579203136265278, + "loss/reg": 0.48877862095832825, + "step": 2745 + }, + { + "epoch": 0.02746, + "grad_norm": 0.4566448926925659, + "grad_norm_var": 1.3645991124789532, + "learning_rate": 5e-05, + "loss": 0.2213, + "loss/crossentropy": 2.6746232509613037, + "loss/hidden": 0.0, + "loss/logits": 0.22128013893961906, + "loss/reg": 0.488521933555603, + "step": 2746 + }, + { + "epoch": 0.02747, + "grad_norm": 0.41291606426239014, + "grad_norm_var": 1.3677112813951549, + "learning_rate": 5e-05, + "loss": 0.1872, + "loss/crossentropy": 2.868433892726898, + "loss/hidden": 0.0, + "loss/logits": 0.18722086399793625, + "loss/reg": 0.48819610476493835, + "step": 2747 + }, + { + "epoch": 0.02748, + "grad_norm": 0.6127803921699524, + "grad_norm_var": 1.3600670220677116, + "learning_rate": 5e-05, + "loss": 0.2214, + "loss/crossentropy": 3.0859760642051697, + "loss/hidden": 0.0, + "loss/logits": 0.2213713675737381, + "loss/reg": 0.4879070222377777, + "step": 2748 + }, + { + "epoch": 0.02749, + "grad_norm": 0.5004788637161255, + "grad_norm_var": 1.356652909760155, + "learning_rate": 5e-05, + "loss": 0.206, + "loss/crossentropy": 2.9830501675605774, + "loss/hidden": 0.0, + "loss/logits": 0.20597152784466743, + "loss/reg": 0.4875172972679138, + "step": 2749 + }, + { + "epoch": 0.0275, + "grad_norm": 0.47342923283576965, + "grad_norm_var": 1.3592518627598917, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 3.0037205815315247, + "loss/hidden": 0.0, + "loss/logits": 0.20014699175953865, + "loss/reg": 0.4872077703475952, + "step": 2750 + }, + { + "epoch": 0.02751, + "grad_norm": 0.4295707941055298, + "grad_norm_var": 1.3590442579354556, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.7643763422966003, + "loss/hidden": 0.0, + "loss/logits": 0.18849025666713715, + "loss/reg": 0.4870486557483673, + "step": 2751 + }, + { + "epoch": 0.02752, + "grad_norm": 0.8130649924278259, + "grad_norm_var": 1.3475640673541873, + "learning_rate": 5e-05, + "loss": 0.2403, + "loss/crossentropy": 3.0180888175964355, + "loss/hidden": 0.0, + "loss/logits": 0.24030575156211853, + "loss/reg": 0.48642897605895996, + "step": 2752 + }, + { + "epoch": 0.02753, + "grad_norm": 0.5237664580345154, + "grad_norm_var": 1.3437214114776563, + "learning_rate": 5e-05, + "loss": 0.2147, + "loss/crossentropy": 2.76887971162796, + "loss/hidden": 0.0, + "loss/logits": 0.21474824845790863, + "loss/reg": 0.4859190583229065, + "step": 2753 + }, + { + "epoch": 0.02754, + "grad_norm": 0.4932967722415924, + "grad_norm_var": 0.11280536719743435, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.6622884273529053, + "loss/hidden": 0.0, + "loss/logits": 0.20863648876547813, + "loss/reg": 0.48588013648986816, + "step": 2754 + }, + { + "epoch": 0.02755, + "grad_norm": 0.43882760405540466, + "grad_norm_var": 0.11332149458104664, + "learning_rate": 5e-05, + "loss": 0.1811, + "loss/crossentropy": 2.770517647266388, + "loss/hidden": 0.0, + "loss/logits": 0.18111392855644226, + "loss/reg": 0.48529940843582153, + "step": 2755 + }, + { + "epoch": 0.02756, + "grad_norm": 0.5302618145942688, + "grad_norm_var": 0.11259649577151827, + "learning_rate": 5e-05, + "loss": 0.2439, + "loss/crossentropy": 2.8216320872306824, + "loss/hidden": 0.0, + "loss/logits": 0.24389966949820518, + "loss/reg": 0.48500895500183105, + "step": 2756 + }, + { + "epoch": 0.02757, + "grad_norm": 0.4503178894519806, + "grad_norm_var": 0.11241683877535748, + "learning_rate": 5e-05, + "loss": 0.2384, + "loss/crossentropy": 2.777120530605316, + "loss/hidden": 0.0, + "loss/logits": 0.2383824624121189, + "loss/reg": 0.484750360250473, + "step": 2757 + }, + { + "epoch": 0.02758, + "grad_norm": 0.44505950808525085, + "grad_norm_var": 0.11273738771800075, + "learning_rate": 5e-05, + "loss": 0.2158, + "loss/crossentropy": 2.7890790700912476, + "loss/hidden": 0.0, + "loss/logits": 0.21584049239754677, + "loss/reg": 0.4842144548892975, + "step": 2758 + }, + { + "epoch": 0.02759, + "grad_norm": 0.4434998035430908, + "grad_norm_var": 0.009966008855130144, + "learning_rate": 5e-05, + "loss": 0.2128, + "loss/crossentropy": 2.9288337230682373, + "loss/hidden": 0.0, + "loss/logits": 0.21275563910603523, + "loss/reg": 0.48395270109176636, + "step": 2759 + }, + { + "epoch": 0.0276, + "grad_norm": 0.49238672852516174, + "grad_norm_var": 0.009696251838199983, + "learning_rate": 5e-05, + "loss": 0.2395, + "loss/crossentropy": 2.8521869778633118, + "loss/hidden": 0.0, + "loss/logits": 0.2394689843058586, + "loss/reg": 0.4834091365337372, + "step": 2760 + }, + { + "epoch": 0.02761, + "grad_norm": 0.43678662180900574, + "grad_norm_var": 0.009581460026098857, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.7796119451522827, + "loss/hidden": 0.0, + "loss/logits": 0.20271554961800575, + "loss/reg": 0.4828082323074341, + "step": 2761 + }, + { + "epoch": 0.02762, + "grad_norm": 0.4237591326236725, + "grad_norm_var": 0.009826298168348761, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.783937096595764, + "loss/hidden": 0.0, + "loss/logits": 0.19665150716900826, + "loss/reg": 0.4822863042354584, + "step": 2762 + }, + { + "epoch": 0.02763, + "grad_norm": 0.47288358211517334, + "grad_norm_var": 0.009394637352571051, + "learning_rate": 5e-05, + "loss": 0.2105, + "loss/crossentropy": 2.769045412540436, + "loss/hidden": 0.0, + "loss/logits": 0.2104603797197342, + "loss/reg": 0.4818100333213806, + "step": 2763 + }, + { + "epoch": 0.02764, + "grad_norm": 0.5302960872650146, + "grad_norm_var": 0.008565887436070991, + "learning_rate": 5e-05, + "loss": 0.2152, + "loss/crossentropy": 2.804442822933197, + "loss/hidden": 0.0, + "loss/logits": 0.21523810178041458, + "loss/reg": 0.4810851514339447, + "step": 2764 + }, + { + "epoch": 0.02765, + "grad_norm": 0.4107396602630615, + "grad_norm_var": 0.008986964786522595, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.844242215156555, + "loss/hidden": 0.0, + "loss/logits": 0.19937966391444206, + "loss/reg": 0.48041123151779175, + "step": 2765 + }, + { + "epoch": 0.02766, + "grad_norm": 0.4369696080684662, + "grad_norm_var": 0.00914086272889952, + "learning_rate": 5e-05, + "loss": 0.1981, + "loss/crossentropy": 2.8472766280174255, + "loss/hidden": 0.0, + "loss/logits": 0.19812384620308876, + "loss/reg": 0.4792887270450592, + "step": 2766 + }, + { + "epoch": 0.02767, + "grad_norm": 0.4638201892375946, + "grad_norm_var": 0.008957775769299566, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.941045582294464, + "loss/hidden": 0.0, + "loss/logits": 0.20877783372998238, + "loss/reg": 0.4784465432167053, + "step": 2767 + }, + { + "epoch": 0.02768, + "grad_norm": 0.4169032573699951, + "grad_norm_var": 0.001588869010582879, + "learning_rate": 5e-05, + "loss": 0.2058, + "loss/crossentropy": 2.77127867937088, + "loss/hidden": 0.0, + "loss/logits": 0.20584552735090256, + "loss/reg": 0.478077232837677, + "step": 2768 + }, + { + "epoch": 0.02769, + "grad_norm": 0.4756011664867401, + "grad_norm_var": 0.0013442499677515467, + "learning_rate": 5e-05, + "loss": 0.2165, + "loss/crossentropy": 2.9689376950263977, + "loss/hidden": 0.0, + "loss/logits": 0.21651886403560638, + "loss/reg": 0.477236807346344, + "step": 2769 + }, + { + "epoch": 0.0277, + "grad_norm": 0.44378915429115295, + "grad_norm_var": 0.0012782266928213677, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.800460457801819, + "loss/hidden": 0.0, + "loss/logits": 0.21186111122369766, + "loss/reg": 0.4768323004245758, + "step": 2770 + }, + { + "epoch": 0.02771, + "grad_norm": 0.4458007514476776, + "grad_norm_var": 0.0012643756083769258, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.7388806343078613, + "loss/hidden": 0.0, + "loss/logits": 0.22070271894335747, + "loss/reg": 0.47619932889938354, + "step": 2771 + }, + { + "epoch": 0.02772, + "grad_norm": 0.4545891582965851, + "grad_norm_var": 0.0008874191092536917, + "learning_rate": 5e-05, + "loss": 0.2091, + "loss/crossentropy": 2.8388237953186035, + "loss/hidden": 0.0, + "loss/logits": 0.20909543707966805, + "loss/reg": 0.47597333788871765, + "step": 2772 + }, + { + "epoch": 0.02773, + "grad_norm": 0.42682191729545593, + "grad_norm_var": 0.0009293860206788869, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.846268594264984, + "loss/hidden": 0.0, + "loss/logits": 0.20878321677446365, + "loss/reg": 0.47523069381713867, + "step": 2773 + }, + { + "epoch": 0.02774, + "grad_norm": 1.037343978881836, + "grad_norm_var": 0.022367020438005255, + "learning_rate": 5e-05, + "loss": 0.2587, + "loss/crossentropy": 2.898918092250824, + "loss/hidden": 0.0, + "loss/logits": 0.2586851119995117, + "loss/reg": 0.47476115822792053, + "step": 2774 + }, + { + "epoch": 0.02775, + "grad_norm": 0.4403015375137329, + "grad_norm_var": 0.02238674257042311, + "learning_rate": 5e-05, + "loss": 0.2122, + "loss/crossentropy": 2.7450714111328125, + "loss/hidden": 0.0, + "loss/logits": 0.21221909672021866, + "loss/reg": 0.4744551479816437, + "step": 2775 + }, + { + "epoch": 0.02776, + "grad_norm": 0.44348451495170593, + "grad_norm_var": 0.02250792693270197, + "learning_rate": 5e-05, + "loss": 0.2118, + "loss/crossentropy": 2.7346010208129883, + "loss/hidden": 0.0, + "loss/logits": 0.21181001886725426, + "loss/reg": 0.4742518961429596, + "step": 2776 + }, + { + "epoch": 0.02777, + "grad_norm": 0.4409497082233429, + "grad_norm_var": 0.02248225174873455, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.847502648830414, + "loss/hidden": 0.0, + "loss/logits": 0.20084399357438087, + "loss/reg": 0.4743136167526245, + "step": 2777 + }, + { + "epoch": 0.02778, + "grad_norm": 0.4770478308200836, + "grad_norm_var": 0.022222805931596888, + "learning_rate": 5e-05, + "loss": 0.2314, + "loss/crossentropy": 2.535971224308014, + "loss/hidden": 0.0, + "loss/logits": 0.23144260793924332, + "loss/reg": 0.47391852736473083, + "step": 2778 + }, + { + "epoch": 0.02779, + "grad_norm": 0.45485442876815796, + "grad_norm_var": 0.022280863326886724, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.7433395385742188, + "loss/hidden": 0.0, + "loss/logits": 0.20232609659433365, + "loss/reg": 0.4740648567676544, + "step": 2779 + }, + { + "epoch": 0.0278, + "grad_norm": 0.4213268458843231, + "grad_norm_var": 0.022400588616330538, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.736327588558197, + "loss/hidden": 0.0, + "loss/logits": 0.1821596696972847, + "loss/reg": 0.4738776981830597, + "step": 2780 + }, + { + "epoch": 0.02781, + "grad_norm": 0.49712780117988586, + "grad_norm_var": 0.022061804501078027, + "learning_rate": 5e-05, + "loss": 0.2241, + "loss/crossentropy": 2.7281267046928406, + "loss/hidden": 0.0, + "loss/logits": 0.22411000356078148, + "loss/reg": 0.47379863262176514, + "step": 2781 + }, + { + "epoch": 0.02782, + "grad_norm": 0.40840572118759155, + "grad_norm_var": 0.02229970532440095, + "learning_rate": 5e-05, + "loss": 0.203, + "loss/crossentropy": 2.9373412132263184, + "loss/hidden": 0.0, + "loss/logits": 0.20300345867872238, + "loss/reg": 0.4741005003452301, + "step": 2782 + }, + { + "epoch": 0.02783, + "grad_norm": 0.5561774373054504, + "grad_norm_var": 0.022581113569634784, + "learning_rate": 5e-05, + "loss": 0.2343, + "loss/crossentropy": 2.910749912261963, + "loss/hidden": 0.0, + "loss/logits": 0.23427263647317886, + "loss/reg": 0.47393694519996643, + "step": 2783 + }, + { + "epoch": 0.02784, + "grad_norm": 0.6062034368515015, + "grad_norm_var": 0.022974981388959495, + "learning_rate": 5e-05, + "loss": 0.2747, + "loss/crossentropy": 2.913097620010376, + "loss/hidden": 0.0, + "loss/logits": 0.27474479377269745, + "loss/reg": 0.4740515649318695, + "step": 2784 + }, + { + "epoch": 0.02785, + "grad_norm": 0.45984625816345215, + "grad_norm_var": 0.023045664280298493, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.8111666440963745, + "loss/hidden": 0.0, + "loss/logits": 0.21119607985019684, + "loss/reg": 0.4735124409198761, + "step": 2785 + }, + { + "epoch": 0.02786, + "grad_norm": 0.4405754804611206, + "grad_norm_var": 0.0230707723541652, + "learning_rate": 5e-05, + "loss": 0.2197, + "loss/crossentropy": 2.631424903869629, + "loss/hidden": 0.0, + "loss/logits": 0.21970823779702187, + "loss/reg": 0.47307515144348145, + "step": 2786 + }, + { + "epoch": 0.02787, + "grad_norm": 0.45616576075553894, + "grad_norm_var": 0.023001645756457245, + "learning_rate": 5e-05, + "loss": 0.2167, + "loss/crossentropy": 2.730878233909607, + "loss/hidden": 0.0, + "loss/logits": 0.21666047722101212, + "loss/reg": 0.4730065166950226, + "step": 2787 + }, + { + "epoch": 0.02788, + "grad_norm": 0.4624628722667694, + "grad_norm_var": 0.022956454415747098, + "learning_rate": 5e-05, + "loss": 0.231, + "loss/crossentropy": 2.7670437693595886, + "loss/hidden": 0.0, + "loss/logits": 0.2309555634856224, + "loss/reg": 0.47289201617240906, + "step": 2788 + }, + { + "epoch": 0.02789, + "grad_norm": 0.8682827353477478, + "grad_norm_var": 0.030722527476555118, + "learning_rate": 5e-05, + "loss": 0.2439, + "loss/crossentropy": 2.704658269882202, + "loss/hidden": 0.0, + "loss/logits": 0.24392283707857132, + "loss/reg": 0.4728879928588867, + "step": 2789 + }, + { + "epoch": 0.0279, + "grad_norm": 0.5829588770866394, + "grad_norm_var": 0.012853609752808614, + "learning_rate": 5e-05, + "loss": 0.2472, + "loss/crossentropy": 2.9476634860038757, + "loss/hidden": 0.0, + "loss/logits": 0.2472483292222023, + "loss/reg": 0.4728856384754181, + "step": 2790 + }, + { + "epoch": 0.02791, + "grad_norm": 0.46524012088775635, + "grad_norm_var": 0.012690613817185907, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 3.017505466938019, + "loss/hidden": 0.0, + "loss/logits": 0.2071487493813038, + "loss/reg": 0.4723857343196869, + "step": 2791 + }, + { + "epoch": 0.02792, + "grad_norm": 0.47604405879974365, + "grad_norm_var": 0.012500368127881823, + "learning_rate": 5e-05, + "loss": 0.2198, + "loss/crossentropy": 2.8384276628494263, + "loss/hidden": 0.0, + "loss/logits": 0.2198197804391384, + "loss/reg": 0.4715425968170166, + "step": 2792 + }, + { + "epoch": 0.02793, + "grad_norm": 0.42586222290992737, + "grad_norm_var": 0.01264264690917482, + "learning_rate": 5e-05, + "loss": 0.1991, + "loss/crossentropy": 2.805938959121704, + "loss/hidden": 0.0, + "loss/logits": 0.19911884889006615, + "loss/reg": 0.4711867570877075, + "step": 2793 + }, + { + "epoch": 0.02794, + "grad_norm": 0.5171465277671814, + "grad_norm_var": 0.012600851914226777, + "learning_rate": 5e-05, + "loss": 0.2291, + "loss/crossentropy": 2.6323679089546204, + "loss/hidden": 0.0, + "loss/logits": 0.229069996625185, + "loss/reg": 0.47057151794433594, + "step": 2794 + }, + { + "epoch": 0.02795, + "grad_norm": 0.7830535173416138, + "grad_norm_var": 0.01708756165997146, + "learning_rate": 5e-05, + "loss": 0.2507, + "loss/crossentropy": 2.7495815753936768, + "loss/hidden": 0.0, + "loss/logits": 0.25072456151247025, + "loss/reg": 0.4698469638824463, + "step": 2795 + }, + { + "epoch": 0.02796, + "grad_norm": 0.48294103145599365, + "grad_norm_var": 0.016459331230302122, + "learning_rate": 5e-05, + "loss": 0.203, + "loss/crossentropy": 2.8198480010032654, + "loss/hidden": 0.0, + "loss/logits": 0.20303139835596085, + "loss/reg": 0.468721866607666, + "step": 2796 + }, + { + "epoch": 0.02797, + "grad_norm": 0.5084864497184753, + "grad_norm_var": 0.0164168064211973, + "learning_rate": 5e-05, + "loss": 0.252, + "loss/crossentropy": 2.715098261833191, + "loss/hidden": 0.0, + "loss/logits": 0.25204743072390556, + "loss/reg": 0.4676380455493927, + "step": 2797 + }, + { + "epoch": 0.02798, + "grad_norm": 0.4653077721595764, + "grad_norm_var": 0.01568722938355057, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.780573308467865, + "loss/hidden": 0.0, + "loss/logits": 0.20763636007905006, + "loss/reg": 0.46664535999298096, + "step": 2798 + }, + { + "epoch": 0.02799, + "grad_norm": 0.46740907430648804, + "grad_norm_var": 0.01592666597927907, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.685659170150757, + "loss/hidden": 0.0, + "loss/logits": 0.20953763648867607, + "loss/reg": 0.46577781438827515, + "step": 2799 + }, + { + "epoch": 0.028, + "grad_norm": 0.5289340019226074, + "grad_norm_var": 0.015506997628440896, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.7469412088394165, + "loss/hidden": 0.0, + "loss/logits": 0.21843300387263298, + "loss/reg": 0.46451595425605774, + "step": 2800 + }, + { + "epoch": 0.02801, + "grad_norm": 0.48317569494247437, + "grad_norm_var": 0.015340152134815007, + "learning_rate": 5e-05, + "loss": 0.2211, + "loss/crossentropy": 2.950335383415222, + "loss/hidden": 0.0, + "loss/logits": 0.22105418890714645, + "loss/reg": 0.4635762870311737, + "step": 2801 + }, + { + "epoch": 0.02802, + "grad_norm": 0.46565622091293335, + "grad_norm_var": 0.015094207685775685, + "learning_rate": 5e-05, + "loss": 0.2084, + "loss/crossentropy": 2.709966003894806, + "loss/hidden": 0.0, + "loss/logits": 0.20841461792588234, + "loss/reg": 0.4626789689064026, + "step": 2802 + }, + { + "epoch": 0.02803, + "grad_norm": 0.49599260091781616, + "grad_norm_var": 0.014814831208655817, + "learning_rate": 5e-05, + "loss": 0.2197, + "loss/crossentropy": 2.8181806802749634, + "loss/hidden": 0.0, + "loss/logits": 0.21970366314053535, + "loss/reg": 0.4622538983821869, + "step": 2803 + }, + { + "epoch": 0.02804, + "grad_norm": 4.541540145874023, + "grad_norm_var": 1.0180479690587574, + "learning_rate": 5e-05, + "loss": 0.3133, + "loss/crossentropy": 2.8754201531410217, + "loss/hidden": 0.0, + "loss/logits": 0.3132971450686455, + "loss/reg": 0.4617142975330353, + "step": 2804 + }, + { + "epoch": 0.02805, + "grad_norm": 0.4235670268535614, + "grad_norm_var": 1.0254631406247932, + "learning_rate": 5e-05, + "loss": 0.2021, + "loss/crossentropy": 3.0043500661849976, + "loss/hidden": 0.0, + "loss/logits": 0.2021479643881321, + "loss/reg": 0.4611737132072449, + "step": 2805 + }, + { + "epoch": 0.02806, + "grad_norm": 0.43490007519721985, + "grad_norm_var": 1.0302706275962195, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.7861366868019104, + "loss/hidden": 0.0, + "loss/logits": 0.20320403575897217, + "loss/reg": 0.4604767858982086, + "step": 2806 + }, + { + "epoch": 0.02807, + "grad_norm": 0.4773966670036316, + "grad_norm_var": 1.0298218240704924, + "learning_rate": 5e-05, + "loss": 0.2009, + "loss/crossentropy": 2.6422954201698303, + "loss/hidden": 0.0, + "loss/logits": 0.20088626071810722, + "loss/reg": 0.4593361020088196, + "step": 2807 + }, + { + "epoch": 0.02808, + "grad_norm": 0.45974504947662354, + "grad_norm_var": 1.03043072119995, + "learning_rate": 5e-05, + "loss": 0.2107, + "loss/crossentropy": 2.7720006108283997, + "loss/hidden": 0.0, + "loss/logits": 0.2107018418610096, + "loss/reg": 0.4588342607021332, + "step": 2808 + }, + { + "epoch": 0.02809, + "grad_norm": 0.4311244487762451, + "grad_norm_var": 1.0302067322836244, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.7455742359161377, + "loss/hidden": 0.0, + "loss/logits": 0.20756198465824127, + "loss/reg": 0.45858126878738403, + "step": 2809 + }, + { + "epoch": 0.0281, + "grad_norm": 0.4278031289577484, + "grad_norm_var": 1.0334544447565217, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.759481430053711, + "loss/hidden": 0.0, + "loss/logits": 0.20401455089449883, + "loss/reg": 0.45833343267440796, + "step": 2810 + }, + { + "epoch": 0.02811, + "grad_norm": 0.46259742975234985, + "grad_norm_var": 1.0381320281987703, + "learning_rate": 5e-05, + "loss": 0.2181, + "loss/crossentropy": 2.9239853024482727, + "loss/hidden": 0.0, + "loss/logits": 0.2181418426334858, + "loss/reg": 0.4577609896659851, + "step": 2811 + }, + { + "epoch": 0.02812, + "grad_norm": 0.475321501493454, + "grad_norm_var": 1.0383788163159213, + "learning_rate": 5e-05, + "loss": 0.2192, + "loss/crossentropy": 2.700235664844513, + "loss/hidden": 0.0, + "loss/logits": 0.2192099392414093, + "loss/reg": 0.45773032307624817, + "step": 2812 + }, + { + "epoch": 0.02813, + "grad_norm": 0.44477590918540955, + "grad_norm_var": 1.0404446322633916, + "learning_rate": 5e-05, + "loss": 0.2138, + "loss/crossentropy": 2.7975884675979614, + "loss/hidden": 0.0, + "loss/logits": 0.21384206041693687, + "loss/reg": 0.45748886466026306, + "step": 2813 + }, + { + "epoch": 0.02814, + "grad_norm": 0.47548604011535645, + "grad_norm_var": 1.040108411385007, + "learning_rate": 5e-05, + "loss": 0.2303, + "loss/crossentropy": 2.711508631706238, + "loss/hidden": 0.0, + "loss/logits": 0.23031622543931007, + "loss/reg": 0.45761197805404663, + "step": 2814 + }, + { + "epoch": 0.02815, + "grad_norm": 0.44389015436172485, + "grad_norm_var": 1.0409302549019996, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.8788601756095886, + "loss/hidden": 0.0, + "loss/logits": 0.2098495289683342, + "loss/reg": 0.4570474326610565, + "step": 2815 + }, + { + "epoch": 0.02816, + "grad_norm": 0.4832332134246826, + "grad_norm_var": 1.0422067228557497, + "learning_rate": 5e-05, + "loss": 0.2346, + "loss/crossentropy": 2.76019024848938, + "loss/hidden": 0.0, + "loss/logits": 0.23459240049123764, + "loss/reg": 0.4567791819572449, + "step": 2816 + }, + { + "epoch": 0.02817, + "grad_norm": 0.43828243017196655, + "grad_norm_var": 1.0437151715964486, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.9324379563331604, + "loss/hidden": 0.0, + "loss/logits": 0.20081858709454536, + "loss/reg": 0.45680752396583557, + "step": 2817 + }, + { + "epoch": 0.02818, + "grad_norm": 0.44994044303894043, + "grad_norm_var": 1.0442454063364015, + "learning_rate": 5e-05, + "loss": 0.2169, + "loss/crossentropy": 2.71914279460907, + "loss/hidden": 0.0, + "loss/logits": 0.21691810339689255, + "loss/reg": 0.4567461311817169, + "step": 2818 + }, + { + "epoch": 0.02819, + "grad_norm": 0.45157700777053833, + "grad_norm_var": 1.0456381429810815, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.6957846879959106, + "loss/hidden": 0.0, + "loss/logits": 0.20657053589820862, + "loss/reg": 0.4563171863555908, + "step": 2819 + }, + { + "epoch": 0.0282, + "grad_norm": 0.5022869110107422, + "grad_norm_var": 0.0005129284565939825, + "learning_rate": 5e-05, + "loss": 0.2124, + "loss/crossentropy": 3.037287652492523, + "loss/hidden": 0.0, + "loss/logits": 0.21237129718065262, + "loss/reg": 0.4561023712158203, + "step": 2820 + }, + { + "epoch": 0.02821, + "grad_norm": 0.4568628668785095, + "grad_norm_var": 0.0004421370038907302, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.868661105632782, + "loss/hidden": 0.0, + "loss/logits": 0.21844060719013214, + "loss/reg": 0.4558677673339844, + "step": 2821 + }, + { + "epoch": 0.02822, + "grad_norm": 0.447999507188797, + "grad_norm_var": 0.0004139103114272865, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.728588879108429, + "loss/hidden": 0.0, + "loss/logits": 0.20480403304100037, + "loss/reg": 0.45604148507118225, + "step": 2822 + }, + { + "epoch": 0.02823, + "grad_norm": 0.45520225167274475, + "grad_norm_var": 0.0003873573108583828, + "learning_rate": 5e-05, + "loss": 0.2232, + "loss/crossentropy": 2.690777361392975, + "loss/hidden": 0.0, + "loss/logits": 0.22323986887931824, + "loss/reg": 0.4557596445083618, + "step": 2823 + }, + { + "epoch": 0.02824, + "grad_norm": 0.5820746421813965, + "grad_norm_var": 0.0013733995227566613, + "learning_rate": 5e-05, + "loss": 0.2265, + "loss/crossentropy": 2.7574767470359802, + "loss/hidden": 0.0, + "loss/logits": 0.22653541713953018, + "loss/reg": 0.45526859164237976, + "step": 2824 + }, + { + "epoch": 0.02825, + "grad_norm": 0.44519567489624023, + "grad_norm_var": 0.0013235718441802194, + "learning_rate": 5e-05, + "loss": 0.2122, + "loss/crossentropy": 2.8236334919929504, + "loss/hidden": 0.0, + "loss/logits": 0.21223018318414688, + "loss/reg": 0.4549461901187897, + "step": 2825 + }, + { + "epoch": 0.02826, + "grad_norm": 0.492565393447876, + "grad_norm_var": 0.0012631475369059657, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.6619699597358704, + "loss/hidden": 0.0, + "loss/logits": 0.20947019010782242, + "loss/reg": 0.45470142364501953, + "step": 2826 + }, + { + "epoch": 0.02827, + "grad_norm": 0.5405644178390503, + "grad_norm_var": 0.001574378700437921, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.680967092514038, + "loss/hidden": 0.0, + "loss/logits": 0.21007801592350006, + "loss/reg": 0.4546498656272888, + "step": 2827 + }, + { + "epoch": 0.02828, + "grad_norm": 0.45908603072166443, + "grad_norm_var": 0.001588162665395032, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.8818032145500183, + "loss/hidden": 0.0, + "loss/logits": 0.20513414591550827, + "loss/reg": 0.454274445772171, + "step": 2828 + }, + { + "epoch": 0.02829, + "grad_norm": 0.45710518956184387, + "grad_norm_var": 0.001551160569139857, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.8063295483589172, + "loss/hidden": 0.0, + "loss/logits": 0.2164478786289692, + "loss/reg": 0.45417892932891846, + "step": 2829 + }, + { + "epoch": 0.0283, + "grad_norm": 0.5659121870994568, + "grad_norm_var": 0.002082128262688284, + "learning_rate": 5e-05, + "loss": 0.2213, + "loss/crossentropy": 2.7590752243995667, + "loss/hidden": 0.0, + "loss/logits": 0.22126565128564835, + "loss/reg": 0.45354872941970825, + "step": 2830 + }, + { + "epoch": 0.02831, + "grad_norm": 0.4645305871963501, + "grad_norm_var": 0.0020107927586687038, + "learning_rate": 5e-05, + "loss": 0.2515, + "loss/crossentropy": 2.7477108240127563, + "loss/hidden": 0.0, + "loss/logits": 0.2515309937298298, + "loss/reg": 0.4533059000968933, + "step": 2831 + }, + { + "epoch": 0.02832, + "grad_norm": 0.41779667139053345, + "grad_norm_var": 0.002256976701805513, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.8637927770614624, + "loss/hidden": 0.0, + "loss/logits": 0.19845005497336388, + "loss/reg": 0.45307794213294983, + "step": 2832 + }, + { + "epoch": 0.02833, + "grad_norm": 0.4265285134315491, + "grad_norm_var": 0.002325797618500225, + "learning_rate": 5e-05, + "loss": 0.2059, + "loss/crossentropy": 2.77262544631958, + "loss/hidden": 0.0, + "loss/logits": 0.20589154213666916, + "loss/reg": 0.4527614116668701, + "step": 2833 + }, + { + "epoch": 0.02834, + "grad_norm": 0.4445587694644928, + "grad_norm_var": 0.0023462723641732297, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.8611199259757996, + "loss/hidden": 0.0, + "loss/logits": 0.20776279270648956, + "loss/reg": 0.4524039924144745, + "step": 2834 + }, + { + "epoch": 0.02835, + "grad_norm": 0.392612487077713, + "grad_norm_var": 0.002752561695390554, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.819760799407959, + "loss/hidden": 0.0, + "loss/logits": 0.1884596347808838, + "loss/reg": 0.4516162574291229, + "step": 2835 + }, + { + "epoch": 0.02836, + "grad_norm": 0.45119181275367737, + "grad_norm_var": 0.002708919993174386, + "learning_rate": 5e-05, + "loss": 0.2003, + "loss/crossentropy": 2.850569725036621, + "loss/hidden": 0.0, + "loss/logits": 0.200263861566782, + "loss/reg": 0.4512544572353363, + "step": 2836 + }, + { + "epoch": 0.02837, + "grad_norm": 0.6889685392379761, + "grad_norm_var": 0.005708521034918024, + "learning_rate": 5e-05, + "loss": 0.2162, + "loss/crossentropy": 2.883878469467163, + "loss/hidden": 0.0, + "loss/logits": 0.21617483720183372, + "loss/reg": 0.45047903060913086, + "step": 2837 + }, + { + "epoch": 0.02838, + "grad_norm": 0.4877657890319824, + "grad_norm_var": 0.005620487286427655, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.854725956916809, + "loss/hidden": 0.0, + "loss/logits": 0.20810750126838684, + "loss/reg": 0.45008766651153564, + "step": 2838 + }, + { + "epoch": 0.02839, + "grad_norm": 0.45416489243507385, + "grad_norm_var": 0.0056247767938858665, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.8403486013412476, + "loss/hidden": 0.0, + "loss/logits": 0.20165572315454483, + "loss/reg": 0.44952845573425293, + "step": 2839 + }, + { + "epoch": 0.0284, + "grad_norm": 0.459900438785553, + "grad_norm_var": 0.004987163725605429, + "learning_rate": 5e-05, + "loss": 0.1997, + "loss/crossentropy": 2.8550066351890564, + "loss/hidden": 0.0, + "loss/logits": 0.1996704563498497, + "loss/reg": 0.44924047589302063, + "step": 2840 + }, + { + "epoch": 0.02841, + "grad_norm": 0.4411807060241699, + "grad_norm_var": 0.005005747305554387, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 2.688932418823242, + "loss/hidden": 0.0, + "loss/logits": 0.20793037861585617, + "loss/reg": 0.4489123523235321, + "step": 2841 + }, + { + "epoch": 0.02842, + "grad_norm": 0.4405941963195801, + "grad_norm_var": 0.0050720842548740205, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.7241137623786926, + "loss/hidden": 0.0, + "loss/logits": 0.20095163583755493, + "loss/reg": 0.44856101274490356, + "step": 2842 + }, + { + "epoch": 0.02843, + "grad_norm": 0.4979267716407776, + "grad_norm_var": 0.0048102936870089595, + "learning_rate": 5e-05, + "loss": 0.231, + "loss/crossentropy": 3.0183266401290894, + "loss/hidden": 0.0, + "loss/logits": 0.23095272853970528, + "loss/reg": 0.44800013303756714, + "step": 2843 + }, + { + "epoch": 0.02844, + "grad_norm": 0.4643518924713135, + "grad_norm_var": 0.004803055183970584, + "learning_rate": 5e-05, + "loss": 0.2191, + "loss/crossentropy": 2.6867568492889404, + "loss/hidden": 0.0, + "loss/logits": 0.21913553029298782, + "loss/reg": 0.4477306008338928, + "step": 2844 + }, + { + "epoch": 0.02845, + "grad_norm": 0.45990869402885437, + "grad_norm_var": 0.004797906545039832, + "learning_rate": 5e-05, + "loss": 0.2149, + "loss/crossentropy": 2.745564341545105, + "loss/hidden": 0.0, + "loss/logits": 0.2149292193353176, + "loss/reg": 0.4474531412124634, + "step": 2845 + }, + { + "epoch": 0.02846, + "grad_norm": 0.43941348791122437, + "grad_norm_var": 0.004220269419472411, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.9024176001548767, + "loss/hidden": 0.0, + "loss/logits": 0.19822592288255692, + "loss/reg": 0.447593629360199, + "step": 2846 + }, + { + "epoch": 0.02847, + "grad_norm": 0.4183911979198456, + "grad_norm_var": 0.0043529010441073265, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.867360234260559, + "loss/hidden": 0.0, + "loss/logits": 0.20783520489931107, + "loss/reg": 0.4472410976886749, + "step": 2847 + }, + { + "epoch": 0.02848, + "grad_norm": 0.8789546489715576, + "grad_norm_var": 0.014952527671800657, + "learning_rate": 5e-05, + "loss": 0.2982, + "loss/crossentropy": 2.9557228684425354, + "loss/hidden": 0.0, + "loss/logits": 0.29819074645638466, + "loss/reg": 0.4474218785762787, + "step": 2848 + }, + { + "epoch": 0.02849, + "grad_norm": 0.4329950511455536, + "grad_norm_var": 0.014900070169991711, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.904766857624054, + "loss/hidden": 0.0, + "loss/logits": 0.1938365213572979, + "loss/reg": 0.4470826983451843, + "step": 2849 + }, + { + "epoch": 0.0285, + "grad_norm": 0.4885863959789276, + "grad_norm_var": 0.014749740823990332, + "learning_rate": 5e-05, + "loss": 0.2287, + "loss/crossentropy": 2.9103984236717224, + "loss/hidden": 0.0, + "loss/logits": 0.2286950685083866, + "loss/reg": 0.446959912776947, + "step": 2850 + }, + { + "epoch": 0.02851, + "grad_norm": 0.5715169906616211, + "grad_norm_var": 0.014342250883571772, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.973586916923523, + "loss/hidden": 0.0, + "loss/logits": 0.19652536511421204, + "loss/reg": 0.4468175768852234, + "step": 2851 + }, + { + "epoch": 0.02852, + "grad_norm": 0.45727914571762085, + "grad_norm_var": 0.014301106219405772, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.7873034477233887, + "loss/hidden": 0.0, + "loss/logits": 0.20770062878727913, + "loss/reg": 0.4463878870010376, + "step": 2852 + }, + { + "epoch": 0.02853, + "grad_norm": 0.4700813591480255, + "grad_norm_var": 0.011929930693987467, + "learning_rate": 5e-05, + "loss": 0.2326, + "loss/crossentropy": 2.878050684928894, + "loss/hidden": 0.0, + "loss/logits": 0.2325824238359928, + "loss/reg": 0.4460793137550354, + "step": 2853 + }, + { + "epoch": 0.02854, + "grad_norm": 1.4903044700622559, + "grad_norm_var": 0.07425676692807313, + "learning_rate": 5e-05, + "loss": 0.2991, + "loss/crossentropy": 2.9930173754692078, + "loss/hidden": 0.0, + "loss/logits": 0.29914769157767296, + "loss/reg": 0.44557204842567444, + "step": 2854 + }, + { + "epoch": 0.02855, + "grad_norm": 0.4951091706752777, + "grad_norm_var": 0.07381599154580355, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.7768259048461914, + "loss/hidden": 0.0, + "loss/logits": 0.21352139860391617, + "loss/reg": 0.44532668590545654, + "step": 2855 + }, + { + "epoch": 0.02856, + "grad_norm": 0.48731565475463867, + "grad_norm_var": 0.07350928987656892, + "learning_rate": 5e-05, + "loss": 0.221, + "loss/crossentropy": 2.6807295083999634, + "loss/hidden": 0.0, + "loss/logits": 0.22097661718726158, + "loss/reg": 0.4446757137775421, + "step": 2856 + }, + { + "epoch": 0.02857, + "grad_norm": 0.45502719283103943, + "grad_norm_var": 0.0733049192060762, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.8643643856048584, + "loss/hidden": 0.0, + "loss/logits": 0.21007532626390457, + "loss/reg": 0.4443725049495697, + "step": 2857 + }, + { + "epoch": 0.02858, + "grad_norm": 0.5700808167457581, + "grad_norm_var": 0.07230452766931625, + "learning_rate": 5e-05, + "loss": 0.2358, + "loss/crossentropy": 2.862323224544525, + "loss/hidden": 0.0, + "loss/logits": 0.23579580709338188, + "loss/reg": 0.44386300444602966, + "step": 2858 + }, + { + "epoch": 0.02859, + "grad_norm": 0.4562830924987793, + "grad_norm_var": 0.07279826282913202, + "learning_rate": 5e-05, + "loss": 0.2215, + "loss/crossentropy": 2.8194819688796997, + "loss/hidden": 0.0, + "loss/logits": 0.22148104384541512, + "loss/reg": 0.4435482919216156, + "step": 2859 + }, + { + "epoch": 0.0286, + "grad_norm": 0.419639527797699, + "grad_norm_var": 0.07352160147444166, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.8357619643211365, + "loss/hidden": 0.0, + "loss/logits": 0.20855940133333206, + "loss/reg": 0.4430595636367798, + "step": 2860 + }, + { + "epoch": 0.02861, + "grad_norm": 0.4108657240867615, + "grad_norm_var": 0.07433905381171363, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.690234422683716, + "loss/hidden": 0.0, + "loss/logits": 0.202754907310009, + "loss/reg": 0.4430074691772461, + "step": 2861 + }, + { + "epoch": 0.02862, + "grad_norm": 0.45748135447502136, + "grad_norm_var": 0.07407169167725704, + "learning_rate": 5e-05, + "loss": 0.2273, + "loss/crossentropy": 2.8170511722564697, + "loss/hidden": 0.0, + "loss/logits": 0.22729111835360527, + "loss/reg": 0.4426112771034241, + "step": 2862 + }, + { + "epoch": 0.02863, + "grad_norm": 0.5977925062179565, + "grad_norm_var": 0.07269606813771154, + "learning_rate": 5e-05, + "loss": 0.2166, + "loss/crossentropy": 2.763596296310425, + "loss/hidden": 0.0, + "loss/logits": 0.21657276153564453, + "loss/reg": 0.442623496055603, + "step": 2863 + }, + { + "epoch": 0.02864, + "grad_norm": 0.43456876277923584, + "grad_norm_var": 0.06680400331332083, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.7858778834342957, + "loss/hidden": 0.0, + "loss/logits": 0.20507103204727173, + "loss/reg": 0.4421454966068268, + "step": 2864 + }, + { + "epoch": 0.02865, + "grad_norm": 0.4324786365032196, + "grad_norm_var": 0.06681162421421465, + "learning_rate": 5e-05, + "loss": 0.2104, + "loss/crossentropy": 2.8077516555786133, + "loss/hidden": 0.0, + "loss/logits": 0.2103988155722618, + "loss/reg": 0.4415656328201294, + "step": 2865 + }, + { + "epoch": 0.02866, + "grad_norm": 0.43425074219703674, + "grad_norm_var": 0.06739326287742396, + "learning_rate": 5e-05, + "loss": 0.2107, + "loss/crossentropy": 2.684048652648926, + "loss/hidden": 0.0, + "loss/logits": 0.21073822677135468, + "loss/reg": 0.4412558376789093, + "step": 2866 + }, + { + "epoch": 0.02867, + "grad_norm": 0.4690887928009033, + "grad_norm_var": 0.06761861752890229, + "learning_rate": 5e-05, + "loss": 0.2021, + "loss/crossentropy": 2.6845428347587585, + "loss/hidden": 0.0, + "loss/logits": 0.20206031948328018, + "loss/reg": 0.44092607498168945, + "step": 2867 + }, + { + "epoch": 0.02868, + "grad_norm": 0.41999655961990356, + "grad_norm_var": 0.06808489840990514, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.8988237977027893, + "loss/hidden": 0.0, + "loss/logits": 0.20421407371759415, + "loss/reg": 0.4407598078250885, + "step": 2868 + }, + { + "epoch": 0.02869, + "grad_norm": 0.44096383452415466, + "grad_norm_var": 0.06837545346536664, + "learning_rate": 5e-05, + "loss": 0.214, + "loss/crossentropy": 2.8452200293540955, + "loss/hidden": 0.0, + "loss/logits": 0.21404718235135078, + "loss/reg": 0.44066742062568665, + "step": 2869 + }, + { + "epoch": 0.0287, + "grad_norm": 0.470173716545105, + "grad_norm_var": 0.0027245645108871248, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 2.7973289489746094, + "loss/hidden": 0.0, + "loss/logits": 0.20790128409862518, + "loss/reg": 0.4401834309101105, + "step": 2870 + }, + { + "epoch": 0.02871, + "grad_norm": 0.411109983921051, + "grad_norm_var": 0.002836117709308002, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.7796115279197693, + "loss/hidden": 0.0, + "loss/logits": 0.209464393556118, + "loss/reg": 0.4395567774772644, + "step": 2871 + }, + { + "epoch": 0.02872, + "grad_norm": 0.43543246388435364, + "grad_norm_var": 0.0028184732537607005, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.7762089371681213, + "loss/hidden": 0.0, + "loss/logits": 0.19854741171002388, + "loss/reg": 0.43918320536613464, + "step": 2872 + }, + { + "epoch": 0.02873, + "grad_norm": 0.44884034991264343, + "grad_norm_var": 0.0028226596812492843, + "learning_rate": 5e-05, + "loss": 0.2139, + "loss/crossentropy": 2.6633995175361633, + "loss/hidden": 0.0, + "loss/logits": 0.2138892151415348, + "loss/reg": 0.43913817405700684, + "step": 2873 + }, + { + "epoch": 0.02874, + "grad_norm": 0.41936764121055603, + "grad_norm_var": 0.001966235343164776, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.81494140625, + "loss/hidden": 0.0, + "loss/logits": 0.19825275242328644, + "loss/reg": 0.43911609053611755, + "step": 2874 + }, + { + "epoch": 0.02875, + "grad_norm": 0.4571692645549774, + "grad_norm_var": 0.001967334507276005, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.8269460797309875, + "loss/hidden": 0.0, + "loss/logits": 0.20076368004083633, + "loss/reg": 0.4389042556285858, + "step": 2875 + }, + { + "epoch": 0.02876, + "grad_norm": 0.4609224796295166, + "grad_norm_var": 0.0019207655220100002, + "learning_rate": 5e-05, + "loss": 0.1915, + "loss/crossentropy": 2.8614678382873535, + "loss/hidden": 0.0, + "loss/logits": 0.1914963573217392, + "loss/reg": 0.4393470287322998, + "step": 2876 + }, + { + "epoch": 0.02877, + "grad_norm": 0.45160263776779175, + "grad_norm_var": 0.0018117520558449099, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.853985607624054, + "loss/hidden": 0.0, + "loss/logits": 0.20802179723978043, + "loss/reg": 0.4394395053386688, + "step": 2877 + }, + { + "epoch": 0.02878, + "grad_norm": 0.4564264416694641, + "grad_norm_var": 0.0018111318541542058, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.86997252702713, + "loss/hidden": 0.0, + "loss/logits": 0.195590578019619, + "loss/reg": 0.43931296467781067, + "step": 2878 + }, + { + "epoch": 0.02879, + "grad_norm": 0.49965763092041016, + "grad_norm_var": 0.0005120847552019729, + "learning_rate": 5e-05, + "loss": 0.2208, + "loss/crossentropy": 2.719831883907318, + "loss/hidden": 0.0, + "loss/logits": 0.22080516815185547, + "loss/reg": 0.43935999274253845, + "step": 2879 + }, + { + "epoch": 0.0288, + "grad_norm": 0.42510664463043213, + "grad_norm_var": 0.0005325793577519536, + "learning_rate": 5e-05, + "loss": 0.1999, + "loss/crossentropy": 2.8110690116882324, + "loss/hidden": 0.0, + "loss/logits": 0.1999005265533924, + "loss/reg": 0.43925940990448, + "step": 2880 + }, + { + "epoch": 0.02881, + "grad_norm": 0.43318071961402893, + "grad_norm_var": 0.0005313643794504015, + "learning_rate": 5e-05, + "loss": 0.2067, + "loss/crossentropy": 2.759833872318268, + "loss/hidden": 0.0, + "loss/logits": 0.20667526498436928, + "loss/reg": 0.4387754201889038, + "step": 2881 + }, + { + "epoch": 0.02882, + "grad_norm": 0.45197704434394836, + "grad_norm_var": 0.0005236341219386358, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.784320652484894, + "loss/hidden": 0.0, + "loss/logits": 0.21551981568336487, + "loss/reg": 0.43896326422691345, + "step": 2882 + }, + { + "epoch": 0.02883, + "grad_norm": 0.43831557035446167, + "grad_norm_var": 0.0004919363408685744, + "learning_rate": 5e-05, + "loss": 0.2136, + "loss/crossentropy": 2.750246524810791, + "loss/hidden": 0.0, + "loss/logits": 0.21355559304356575, + "loss/reg": 0.43863677978515625, + "step": 2883 + }, + { + "epoch": 0.02884, + "grad_norm": 0.7667464017868042, + "grad_norm_var": 0.006849958169046799, + "learning_rate": 5e-05, + "loss": 0.2533, + "loss/crossentropy": 2.7600563168525696, + "loss/hidden": 0.0, + "loss/logits": 0.2533155083656311, + "loss/reg": 0.43825453519821167, + "step": 2884 + }, + { + "epoch": 0.02885, + "grad_norm": 0.42572563886642456, + "grad_norm_var": 0.006916734220569376, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.8089194297790527, + "loss/hidden": 0.0, + "loss/logits": 0.19409573078155518, + "loss/reg": 0.4381675124168396, + "step": 2885 + }, + { + "epoch": 0.02886, + "grad_norm": 0.47641533613204956, + "grad_norm_var": 0.006922863334027637, + "learning_rate": 5e-05, + "loss": 0.2178, + "loss/crossentropy": 2.795935273170471, + "loss/hidden": 0.0, + "loss/logits": 0.21775543689727783, + "loss/reg": 0.4377654790878296, + "step": 2886 + }, + { + "epoch": 0.02887, + "grad_norm": 0.4405703842639923, + "grad_norm_var": 0.006761007034326823, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.801277816295624, + "loss/hidden": 0.0, + "loss/logits": 0.20397379249334335, + "loss/reg": 0.4373535215854645, + "step": 2887 + }, + { + "epoch": 0.02888, + "grad_norm": 0.5539225339889526, + "grad_norm_var": 0.007124513140865914, + "learning_rate": 5e-05, + "loss": 0.2247, + "loss/crossentropy": 2.8959710001945496, + "loss/hidden": 0.0, + "loss/logits": 0.22470634058117867, + "loss/reg": 0.436652272939682, + "step": 2888 + }, + { + "epoch": 0.02889, + "grad_norm": 1.322914958000183, + "grad_norm_var": 0.05178286733058259, + "learning_rate": 5e-05, + "loss": 0.2945, + "loss/crossentropy": 2.828279972076416, + "loss/hidden": 0.0, + "loss/logits": 0.2944767475128174, + "loss/reg": 0.4362984299659729, + "step": 2889 + }, + { + "epoch": 0.0289, + "grad_norm": 0.4668998420238495, + "grad_norm_var": 0.05122291916550908, + "learning_rate": 5e-05, + "loss": 0.2143, + "loss/crossentropy": 2.8046092987060547, + "loss/hidden": 0.0, + "loss/logits": 0.2143494188785553, + "loss/reg": 0.4358566999435425, + "step": 2890 + }, + { + "epoch": 0.02891, + "grad_norm": 1.3104586601257324, + "grad_norm_var": 0.08810511012542742, + "learning_rate": 5e-05, + "loss": 0.2561, + "loss/crossentropy": 3.050446569919586, + "loss/hidden": 0.0, + "loss/logits": 0.2560820244252682, + "loss/reg": 0.435291588306427, + "step": 2891 + }, + { + "epoch": 0.02892, + "grad_norm": 0.46758073568344116, + "grad_norm_var": 0.08799657243257995, + "learning_rate": 5e-05, + "loss": 0.2251, + "loss/crossentropy": 2.8260610103607178, + "loss/hidden": 0.0, + "loss/logits": 0.2251061201095581, + "loss/reg": 0.435118705034256, + "step": 2892 + }, + { + "epoch": 0.02893, + "grad_norm": 0.4710516035556793, + "grad_norm_var": 0.08766983114729969, + "learning_rate": 5e-05, + "loss": 0.2173, + "loss/crossentropy": 2.73756343126297, + "loss/hidden": 0.0, + "loss/logits": 0.21728923544287682, + "loss/reg": 0.43532776832580566, + "step": 2893 + }, + { + "epoch": 0.02894, + "grad_norm": 0.4977162480354309, + "grad_norm_var": 0.08705239249022875, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.8385772109031677, + "loss/hidden": 0.0, + "loss/logits": 0.20532149821519852, + "loss/reg": 0.4350016117095947, + "step": 2894 + }, + { + "epoch": 0.02895, + "grad_norm": 0.6021206378936768, + "grad_norm_var": 0.08646729003610103, + "learning_rate": 5e-05, + "loss": 0.2105, + "loss/crossentropy": 2.7629491090774536, + "loss/hidden": 0.0, + "loss/logits": 0.21053143218159676, + "loss/reg": 0.43496209383010864, + "step": 2895 + }, + { + "epoch": 0.02896, + "grad_norm": 0.5295746922492981, + "grad_norm_var": 0.0847562024099644, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.766226887702942, + "loss/hidden": 0.0, + "loss/logits": 0.20146498456597328, + "loss/reg": 0.43495821952819824, + "step": 2896 + }, + { + "epoch": 0.02897, + "grad_norm": 0.46542394161224365, + "grad_norm_var": 0.08408918275792439, + "learning_rate": 5e-05, + "loss": 0.2091, + "loss/crossentropy": 2.8468995094299316, + "loss/hidden": 0.0, + "loss/logits": 0.2091306820511818, + "loss/reg": 0.4347335994243622, + "step": 2897 + }, + { + "epoch": 0.02898, + "grad_norm": 0.4753681421279907, + "grad_norm_var": 0.08364468401008886, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.9253774285316467, + "loss/hidden": 0.0, + "loss/logits": 0.2100818008184433, + "loss/reg": 0.4348641633987427, + "step": 2898 + }, + { + "epoch": 0.02899, + "grad_norm": 0.49682146310806274, + "grad_norm_var": 0.0825433291032469, + "learning_rate": 5e-05, + "loss": 0.2211, + "loss/crossentropy": 2.6903114914894104, + "loss/hidden": 0.0, + "loss/logits": 0.22107788547873497, + "loss/reg": 0.4349150061607361, + "step": 2899 + }, + { + "epoch": 0.029, + "grad_norm": 0.4417564570903778, + "grad_norm_var": 0.08237756629224548, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.9534873366355896, + "loss/hidden": 0.0, + "loss/logits": 0.20269013196229935, + "loss/reg": 0.43467414379119873, + "step": 2900 + }, + { + "epoch": 0.02901, + "grad_norm": 0.4322088360786438, + "grad_norm_var": 0.08223795674130371, + "learning_rate": 5e-05, + "loss": 0.2025, + "loss/crossentropy": 2.727617383003235, + "loss/hidden": 0.0, + "loss/logits": 0.2025168538093567, + "loss/reg": 0.43458107113838196, + "step": 2901 + }, + { + "epoch": 0.02902, + "grad_norm": 0.6878774166107178, + "grad_norm_var": 0.08181116724688532, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.8063265085220337, + "loss/hidden": 0.0, + "loss/logits": 0.20928652957081795, + "loss/reg": 0.434550940990448, + "step": 2902 + }, + { + "epoch": 0.02903, + "grad_norm": 0.5270038843154907, + "grad_norm_var": 0.08039589852372317, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.8273242712020874, + "loss/hidden": 0.0, + "loss/logits": 0.2049809880554676, + "loss/reg": 0.43495526909828186, + "step": 2903 + }, + { + "epoch": 0.02904, + "grad_norm": 0.5345247387886047, + "grad_norm_var": 0.08056262628696453, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.8473464250564575, + "loss/hidden": 0.0, + "loss/logits": 0.2047046385705471, + "loss/reg": 0.434909850358963, + "step": 2904 + }, + { + "epoch": 0.02905, + "grad_norm": 0.4608438313007355, + "grad_norm_var": 0.04484555671349521, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.804013729095459, + "loss/hidden": 0.0, + "loss/logits": 0.21839474886655807, + "loss/reg": 0.4345337748527527, + "step": 2905 + }, + { + "epoch": 0.02906, + "grad_norm": 0.6053507328033447, + "grad_norm_var": 0.044431990083754806, + "learning_rate": 5e-05, + "loss": 0.2104, + "loss/crossentropy": 2.7997120022773743, + "loss/hidden": 0.0, + "loss/logits": 0.2103927582502365, + "loss/reg": 0.43413323163986206, + "step": 2906 + }, + { + "epoch": 0.02907, + "grad_norm": 0.6289812326431274, + "grad_norm_var": 0.005527716779769624, + "learning_rate": 5e-05, + "loss": 0.2505, + "loss/crossentropy": 2.744085133075714, + "loss/hidden": 0.0, + "loss/logits": 0.25048669800162315, + "loss/reg": 0.4340226650238037, + "step": 2907 + }, + { + "epoch": 0.02908, + "grad_norm": 0.5207942128181458, + "grad_norm_var": 0.0053309103954739204, + "learning_rate": 5e-05, + "loss": 0.2271, + "loss/crossentropy": 2.768359422683716, + "loss/hidden": 0.0, + "loss/logits": 0.2271416336297989, + "loss/reg": 0.43376651406288147, + "step": 2908 + }, + { + "epoch": 0.02909, + "grad_norm": 0.7949036359786987, + "grad_norm_var": 0.009617356062349458, + "learning_rate": 5e-05, + "loss": 0.2633, + "loss/crossentropy": 2.73787122964859, + "loss/hidden": 0.0, + "loss/logits": 0.26328161731362343, + "loss/reg": 0.4332568943500519, + "step": 2909 + }, + { + "epoch": 0.0291, + "grad_norm": 0.4809052050113678, + "grad_norm_var": 0.009738380576331656, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.7631189823150635, + "loss/hidden": 0.0, + "loss/logits": 0.21416160464286804, + "loss/reg": 0.43308326601982117, + "step": 2910 + }, + { + "epoch": 0.02911, + "grad_norm": 0.5630953907966614, + "grad_norm_var": 0.00952478831177646, + "learning_rate": 5e-05, + "loss": 0.265, + "loss/crossentropy": 2.597219705581665, + "loss/hidden": 0.0, + "loss/logits": 0.2649861052632332, + "loss/reg": 0.4332560896873474, + "step": 2911 + }, + { + "epoch": 0.02912, + "grad_norm": 0.47166475653648376, + "grad_norm_var": 0.009817505304391938, + "learning_rate": 5e-05, + "loss": 0.2173, + "loss/crossentropy": 2.7941606044769287, + "loss/hidden": 0.0, + "loss/logits": 0.21725543215870857, + "loss/reg": 0.43294525146484375, + "step": 2912 + }, + { + "epoch": 0.02913, + "grad_norm": 0.4496498703956604, + "grad_norm_var": 0.009983007693775114, + "learning_rate": 5e-05, + "loss": 0.2167, + "loss/crossentropy": 2.764277219772339, + "loss/hidden": 0.0, + "loss/logits": 0.21674080193042755, + "loss/reg": 0.43276655673980713, + "step": 2913 + }, + { + "epoch": 0.02914, + "grad_norm": 0.6527702808380127, + "grad_norm_var": 0.010522098063843544, + "learning_rate": 5e-05, + "loss": 0.2526, + "loss/crossentropy": 2.7757217288017273, + "loss/hidden": 0.0, + "loss/logits": 0.252617422491312, + "loss/reg": 0.4325600266456604, + "step": 2914 + }, + { + "epoch": 0.02915, + "grad_norm": 0.451974481344223, + "grad_norm_var": 0.010946784367987539, + "learning_rate": 5e-05, + "loss": 0.2089, + "loss/crossentropy": 2.7318632006645203, + "loss/hidden": 0.0, + "loss/logits": 0.20888511836528778, + "loss/reg": 0.432567834854126, + "step": 2915 + }, + { + "epoch": 0.02916, + "grad_norm": 0.44678446650505066, + "grad_norm_var": 0.010879807445404484, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.8584572672843933, + "loss/hidden": 0.0, + "loss/logits": 0.2017306312918663, + "loss/reg": 0.43244996666908264, + "step": 2916 + }, + { + "epoch": 0.02917, + "grad_norm": 0.4978337585926056, + "grad_norm_var": 0.010167883853457838, + "learning_rate": 5e-05, + "loss": 0.2148, + "loss/crossentropy": 2.794499158859253, + "loss/hidden": 0.0, + "loss/logits": 0.21482348442077637, + "loss/reg": 0.43255454301834106, + "step": 2917 + }, + { + "epoch": 0.02918, + "grad_norm": 0.5144265294075012, + "grad_norm_var": 0.008823351669898694, + "learning_rate": 5e-05, + "loss": 0.2267, + "loss/crossentropy": 3.0272130370140076, + "loss/hidden": 0.0, + "loss/logits": 0.22667042165994644, + "loss/reg": 0.432171493768692, + "step": 2918 + }, + { + "epoch": 0.02919, + "grad_norm": 0.43666350841522217, + "grad_norm_var": 0.00946100240858585, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.7977908849716187, + "loss/hidden": 0.0, + "loss/logits": 0.21101978421211243, + "loss/reg": 0.43189969658851624, + "step": 2919 + }, + { + "epoch": 0.0292, + "grad_norm": 0.4572715759277344, + "grad_norm_var": 0.009807463230091785, + "learning_rate": 5e-05, + "loss": 0.2016, + "loss/crossentropy": 2.6837905645370483, + "loss/hidden": 0.0, + "loss/logits": 0.20155343040823936, + "loss/reg": 0.4312141239643097, + "step": 2920 + }, + { + "epoch": 0.02921, + "grad_norm": 0.46218210458755493, + "grad_norm_var": 0.009795749155402626, + "learning_rate": 5e-05, + "loss": 0.2067, + "loss/crossentropy": 2.661113142967224, + "loss/hidden": 0.0, + "loss/logits": 0.20674116164445877, + "loss/reg": 0.4311680197715759, + "step": 2921 + }, + { + "epoch": 0.02922, + "grad_norm": 0.4486991763114929, + "grad_norm_var": 0.009697224061569124, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.817441463470459, + "loss/hidden": 0.0, + "loss/logits": 0.20757173001766205, + "loss/reg": 0.4307200312614441, + "step": 2922 + }, + { + "epoch": 0.02923, + "grad_norm": 0.4407883286476135, + "grad_norm_var": 0.009111234084253565, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.812344968318939, + "loss/hidden": 0.0, + "loss/logits": 0.20664214715361595, + "loss/reg": 0.4306470453739166, + "step": 2923 + }, + { + "epoch": 0.02924, + "grad_norm": 0.43172478675842285, + "grad_norm_var": 0.009427223143019097, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.63931941986084, + "loss/hidden": 0.0, + "loss/logits": 0.20927761867642403, + "loss/reg": 0.4303757846355438, + "step": 2924 + }, + { + "epoch": 0.02925, + "grad_norm": 0.46868380904197693, + "grad_norm_var": 0.0032549493250898435, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.7613174319267273, + "loss/hidden": 0.0, + "loss/logits": 0.20268363878130913, + "loss/reg": 0.42978519201278687, + "step": 2925 + }, + { + "epoch": 0.02926, + "grad_norm": 0.4842621386051178, + "grad_norm_var": 0.003256195369968973, + "learning_rate": 5e-05, + "loss": 0.2304, + "loss/crossentropy": 2.857293725013733, + "loss/hidden": 0.0, + "loss/logits": 0.23036231473088264, + "loss/reg": 0.4293884038925171, + "step": 2926 + }, + { + "epoch": 0.02927, + "grad_norm": 0.4133976995944977, + "grad_norm_var": 0.0029963219170064687, + "learning_rate": 5e-05, + "loss": 0.2025, + "loss/crossentropy": 2.8262358903884888, + "loss/hidden": 0.0, + "loss/logits": 0.20250090956687927, + "loss/reg": 0.42856737971305847, + "step": 2927 + }, + { + "epoch": 0.02928, + "grad_norm": 0.4278252124786377, + "grad_norm_var": 0.0031099166592948387, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.7385696172714233, + "loss/hidden": 0.0, + "loss/logits": 0.20430608838796616, + "loss/reg": 0.4280559718608856, + "step": 2928 + }, + { + "epoch": 0.02929, + "grad_norm": 0.4801313281059265, + "grad_norm_var": 0.0030941859744619974, + "learning_rate": 5e-05, + "loss": 0.2157, + "loss/crossentropy": 2.899641692638397, + "loss/hidden": 0.0, + "loss/logits": 0.21570446714758873, + "loss/reg": 0.4272966980934143, + "step": 2929 + }, + { + "epoch": 0.0293, + "grad_norm": 0.43685558438301086, + "grad_norm_var": 0.0007379357287445469, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.8251423239707947, + "loss/hidden": 0.0, + "loss/logits": 0.2014743648469448, + "loss/reg": 0.4267607629299164, + "step": 2930 + }, + { + "epoch": 0.02931, + "grad_norm": 0.44084784388542175, + "grad_norm_var": 0.0007519703655822737, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.6310330629348755, + "loss/hidden": 0.0, + "loss/logits": 0.2037803828716278, + "loss/reg": 0.4258156418800354, + "step": 2931 + }, + { + "epoch": 0.02932, + "grad_norm": 0.6596364974975586, + "grad_norm_var": 0.003335575138394198, + "learning_rate": 5e-05, + "loss": 0.2422, + "loss/crossentropy": 2.9863718152046204, + "loss/hidden": 0.0, + "loss/logits": 0.24216420948505402, + "loss/reg": 0.425359845161438, + "step": 2932 + }, + { + "epoch": 0.02933, + "grad_norm": 0.4427257776260376, + "grad_norm_var": 0.0033122459103471593, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.83390474319458, + "loss/hidden": 0.0, + "loss/logits": 0.22071523219347, + "loss/reg": 0.4251004755496979, + "step": 2933 + }, + { + "epoch": 0.02934, + "grad_norm": 0.5108687281608582, + "grad_norm_var": 0.0032897719007115023, + "learning_rate": 5e-05, + "loss": 0.2286, + "loss/crossentropy": 2.849147856235504, + "loss/hidden": 0.0, + "loss/logits": 0.22855570167303085, + "loss/reg": 0.42414724826812744, + "step": 2934 + }, + { + "epoch": 0.02935, + "grad_norm": 0.46716398000717163, + "grad_norm_var": 0.0032320257572898137, + "learning_rate": 5e-05, + "loss": 0.2191, + "loss/crossentropy": 2.836436152458191, + "loss/hidden": 0.0, + "loss/logits": 0.2190553918480873, + "loss/reg": 0.4234054386615753, + "step": 2935 + }, + { + "epoch": 0.02936, + "grad_norm": 0.4688037633895874, + "grad_norm_var": 0.0032252767415079913, + "learning_rate": 5e-05, + "loss": 0.2158, + "loss/crossentropy": 2.894056558609009, + "loss/hidden": 0.0, + "loss/logits": 0.21578609198331833, + "loss/reg": 0.42306944727897644, + "step": 2936 + }, + { + "epoch": 0.02937, + "grad_norm": 0.46226829290390015, + "grad_norm_var": 0.0032252127921666134, + "learning_rate": 5e-05, + "loss": 0.2067, + "loss/crossentropy": 2.776409327983856, + "loss/hidden": 0.0, + "loss/logits": 0.2067032866179943, + "loss/reg": 0.42247679829597473, + "step": 2937 + }, + { + "epoch": 0.02938, + "grad_norm": 0.43042436242103577, + "grad_norm_var": 0.00329260988404112, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.829971671104431, + "loss/hidden": 0.0, + "loss/logits": 0.2081555612385273, + "loss/reg": 0.4219067692756653, + "step": 2938 + }, + { + "epoch": 0.02939, + "grad_norm": 0.45234474539756775, + "grad_norm_var": 0.0032611069234958637, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.828520655632019, + "loss/hidden": 0.0, + "loss/logits": 0.22000523656606674, + "loss/reg": 0.4217160642147064, + "step": 2939 + }, + { + "epoch": 0.0294, + "grad_norm": 0.4976692199707031, + "grad_norm_var": 0.0032194604239382846, + "learning_rate": 5e-05, + "loss": 0.2116, + "loss/crossentropy": 2.7917208671569824, + "loss/hidden": 0.0, + "loss/logits": 0.2115989811718464, + "loss/reg": 0.4211964011192322, + "step": 2940 + }, + { + "epoch": 0.02941, + "grad_norm": 0.4692341983318329, + "grad_norm_var": 0.0032192731075702677, + "learning_rate": 5e-05, + "loss": 0.2104, + "loss/crossentropy": 2.9231185913085938, + "loss/hidden": 0.0, + "loss/logits": 0.21041087433695793, + "loss/reg": 0.420807808637619, + "step": 2941 + }, + { + "epoch": 0.02942, + "grad_norm": 0.4621233344078064, + "grad_norm_var": 0.003212318974609696, + "learning_rate": 5e-05, + "loss": 0.2176, + "loss/crossentropy": 2.772266685962677, + "loss/hidden": 0.0, + "loss/logits": 0.21759070456027985, + "loss/reg": 0.42004191875457764, + "step": 2942 + }, + { + "epoch": 0.02943, + "grad_norm": 0.43007317185401917, + "grad_norm_var": 0.003103526618714847, + "learning_rate": 5e-05, + "loss": 0.2106, + "loss/crossentropy": 2.8438419103622437, + "loss/hidden": 0.0, + "loss/logits": 0.21057388931512833, + "loss/reg": 0.41981741786003113, + "step": 2943 + }, + { + "epoch": 0.02944, + "grad_norm": 0.43187806010246277, + "grad_norm_var": 0.0030811212516048494, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.777413785457611, + "loss/hidden": 0.0, + "loss/logits": 0.21842098236083984, + "loss/reg": 0.4195232391357422, + "step": 2944 + }, + { + "epoch": 0.02945, + "grad_norm": 0.4012479782104492, + "grad_norm_var": 0.0033786250483900343, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.7778549790382385, + "loss/hidden": 0.0, + "loss/logits": 0.20011166483163834, + "loss/reg": 0.419201523065567, + "step": 2945 + }, + { + "epoch": 0.02946, + "grad_norm": 0.4389004409313202, + "grad_norm_var": 0.003370801090443065, + "learning_rate": 5e-05, + "loss": 0.207, + "loss/crossentropy": 2.8169692158699036, + "loss/hidden": 0.0, + "loss/logits": 0.20698180422186852, + "loss/reg": 0.4188322424888611, + "step": 2946 + }, + { + "epoch": 0.02947, + "grad_norm": 0.8466213345527649, + "grad_norm_var": 0.012266222626710111, + "learning_rate": 5e-05, + "loss": 0.2649, + "loss/crossentropy": 2.6959760189056396, + "loss/hidden": 0.0, + "loss/logits": 0.26486995071172714, + "loss/reg": 0.41867193579673767, + "step": 2947 + }, + { + "epoch": 0.02948, + "grad_norm": 0.4189806580543518, + "grad_norm_var": 0.01050686543942363, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.8492271304130554, + "loss/hidden": 0.0, + "loss/logits": 0.20329753309488297, + "loss/reg": 0.41824716329574585, + "step": 2948 + }, + { + "epoch": 0.02949, + "grad_norm": 0.4209999740123749, + "grad_norm_var": 0.010635529112133802, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.897164821624756, + "loss/hidden": 0.0, + "loss/logits": 0.2109513059258461, + "loss/reg": 0.41802337765693665, + "step": 2949 + }, + { + "epoch": 0.0295, + "grad_norm": 0.5650335550308228, + "grad_norm_var": 0.011073602319834678, + "learning_rate": 5e-05, + "loss": 0.2262, + "loss/crossentropy": 2.9027532935142517, + "loss/hidden": 0.0, + "loss/logits": 0.22624987363815308, + "loss/reg": 0.4178922474384308, + "step": 2950 + }, + { + "epoch": 0.02951, + "grad_norm": 0.5040097236633301, + "grad_norm_var": 0.011100376797746251, + "learning_rate": 5e-05, + "loss": 0.2284, + "loss/crossentropy": 2.962436556816101, + "loss/hidden": 0.0, + "loss/logits": 0.22835344076156616, + "loss/reg": 0.4176394045352936, + "step": 2951 + }, + { + "epoch": 0.02952, + "grad_norm": 0.4552556872367859, + "grad_norm_var": 0.011134400890902922, + "learning_rate": 5e-05, + "loss": 0.1955, + "loss/crossentropy": 2.772472620010376, + "loss/hidden": 0.0, + "loss/logits": 0.19549401104450226, + "loss/reg": 0.4172772467136383, + "step": 2952 + }, + { + "epoch": 0.02953, + "grad_norm": 0.4560832679271698, + "grad_norm_var": 0.011151778735905716, + "learning_rate": 5e-05, + "loss": 0.2151, + "loss/crossentropy": 2.9248319268226624, + "loss/hidden": 0.0, + "loss/logits": 0.21511637419462204, + "loss/reg": 0.41695258021354675, + "step": 2953 + }, + { + "epoch": 0.02954, + "grad_norm": 0.4492620825767517, + "grad_norm_var": 0.011049300437476388, + "learning_rate": 5e-05, + "loss": 0.2143, + "loss/crossentropy": 2.8482340574264526, + "loss/hidden": 0.0, + "loss/logits": 0.21425675600767136, + "loss/reg": 0.41689223051071167, + "step": 2954 + }, + { + "epoch": 0.02955, + "grad_norm": 0.44360166788101196, + "grad_norm_var": 0.011087753555208064, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.806627571582794, + "loss/hidden": 0.0, + "loss/logits": 0.20778347551822662, + "loss/reg": 0.41666102409362793, + "step": 2955 + }, + { + "epoch": 0.02956, + "grad_norm": 0.45185887813568115, + "grad_norm_var": 0.011115180359174677, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.8933902978897095, + "loss/hidden": 0.0, + "loss/logits": 0.20655349642038345, + "loss/reg": 0.41660138964653015, + "step": 2956 + }, + { + "epoch": 0.02957, + "grad_norm": 0.5129558444023132, + "grad_norm_var": 0.011184586834910049, + "learning_rate": 5e-05, + "loss": 0.2418, + "loss/crossentropy": 2.86870801448822, + "loss/hidden": 0.0, + "loss/logits": 0.24177123606204987, + "loss/reg": 0.4165757894515991, + "step": 2957 + }, + { + "epoch": 0.02958, + "grad_norm": 0.4195878803730011, + "grad_norm_var": 0.011402201129203116, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.9676802158355713, + "loss/hidden": 0.0, + "loss/logits": 0.1982722021639347, + "loss/reg": 0.41636115312576294, + "step": 2958 + }, + { + "epoch": 0.02959, + "grad_norm": 0.5062670707702637, + "grad_norm_var": 0.011279195468136676, + "learning_rate": 5e-05, + "loss": 0.2212, + "loss/crossentropy": 2.852513611316681, + "loss/hidden": 0.0, + "loss/logits": 0.22121194377541542, + "loss/reg": 0.41601788997650146, + "step": 2959 + }, + { + "epoch": 0.0296, + "grad_norm": 0.46564754843711853, + "grad_norm_var": 0.011121822904308938, + "learning_rate": 5e-05, + "loss": 0.214, + "loss/crossentropy": 2.696369707584381, + "loss/hidden": 0.0, + "loss/logits": 0.2140330858528614, + "loss/reg": 0.41595664620399475, + "step": 2960 + }, + { + "epoch": 0.02961, + "grad_norm": 0.4084406793117523, + "grad_norm_var": 0.01104495686429375, + "learning_rate": 5e-05, + "loss": 0.1934, + "loss/crossentropy": 2.7269797325134277, + "loss/hidden": 0.0, + "loss/logits": 0.19342916458845139, + "loss/reg": 0.41576412320137024, + "step": 2961 + }, + { + "epoch": 0.02962, + "grad_norm": 0.4234437942504883, + "grad_norm_var": 0.011155346185401553, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.8256075382232666, + "loss/hidden": 0.0, + "loss/logits": 0.2043049857020378, + "loss/reg": 0.4155352711677551, + "step": 2962 + }, + { + "epoch": 0.02963, + "grad_norm": 0.45543137192726135, + "grad_norm_var": 0.0018190533652009103, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.8743650913238525, + "loss/hidden": 0.0, + "loss/logits": 0.2050127200782299, + "loss/reg": 0.41537606716156006, + "step": 2963 + }, + { + "epoch": 0.02964, + "grad_norm": 0.42601439356803894, + "grad_norm_var": 0.001783860295882204, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.8320910930633545, + "loss/hidden": 0.0, + "loss/logits": 0.20625050365924835, + "loss/reg": 0.4153366982936859, + "step": 2964 + }, + { + "epoch": 0.02965, + "grad_norm": 0.45843085646629333, + "grad_norm_var": 0.001675572061438102, + "learning_rate": 5e-05, + "loss": 0.1974, + "loss/crossentropy": 2.870903789997101, + "loss/hidden": 0.0, + "loss/logits": 0.1974387988448143, + "loss/reg": 0.41534003615379333, + "step": 2965 + }, + { + "epoch": 0.02966, + "grad_norm": 0.45849472284317017, + "grad_norm_var": 0.0009296481189787505, + "learning_rate": 5e-05, + "loss": 0.2198, + "loss/crossentropy": 2.6051303148269653, + "loss/hidden": 0.0, + "loss/logits": 0.21984346583485603, + "loss/reg": 0.41506198048591614, + "step": 2966 + }, + { + "epoch": 0.02967, + "grad_norm": 3.0287957191467285, + "grad_norm_var": 0.41552612560704694, + "learning_rate": 5e-05, + "loss": 0.3476, + "loss/crossentropy": 2.924436092376709, + "loss/hidden": 0.0, + "loss/logits": 0.3476310223340988, + "loss/reg": 0.4149434566497803, + "step": 2967 + }, + { + "epoch": 0.02968, + "grad_norm": 0.474252849817276, + "grad_norm_var": 0.4151472902461947, + "learning_rate": 5e-05, + "loss": 0.2201, + "loss/crossentropy": 2.7435556650161743, + "loss/hidden": 0.0, + "loss/logits": 0.22010482475161552, + "loss/reg": 0.41469597816467285, + "step": 2968 + }, + { + "epoch": 0.02969, + "grad_norm": 0.4726714491844177, + "grad_norm_var": 0.41481320082235024, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.831117272377014, + "loss/hidden": 0.0, + "loss/logits": 0.20860330387949944, + "loss/reg": 0.41413137316703796, + "step": 2969 + }, + { + "epoch": 0.0297, + "grad_norm": 0.4573473632335663, + "grad_norm_var": 0.4146375936550494, + "learning_rate": 5e-05, + "loss": 0.2177, + "loss/crossentropy": 2.750133216381073, + "loss/hidden": 0.0, + "loss/logits": 0.2177412062883377, + "loss/reg": 0.41368725895881653, + "step": 2970 + }, + { + "epoch": 0.02971, + "grad_norm": 0.5396948456764221, + "grad_norm_var": 0.41300007255875165, + "learning_rate": 5e-05, + "loss": 0.2341, + "loss/crossentropy": 2.8374890089035034, + "loss/hidden": 0.0, + "loss/logits": 0.23408197611570358, + "loss/reg": 0.41330164670944214, + "step": 2971 + }, + { + "epoch": 0.02972, + "grad_norm": 0.63718181848526, + "grad_norm_var": 0.4109311408974749, + "learning_rate": 5e-05, + "loss": 0.2365, + "loss/crossentropy": 2.7857717275619507, + "loss/hidden": 0.0, + "loss/logits": 0.2364787496626377, + "loss/reg": 0.41307657957077026, + "step": 2972 + }, + { + "epoch": 0.02973, + "grad_norm": 0.5609456896781921, + "grad_norm_var": 0.4103002980416261, + "learning_rate": 5e-05, + "loss": 0.2021, + "loss/crossentropy": 2.7734429836273193, + "loss/hidden": 0.0, + "loss/logits": 0.20208783447742462, + "loss/reg": 0.41312071681022644, + "step": 2973 + }, + { + "epoch": 0.02974, + "grad_norm": 0.44507384300231934, + "grad_norm_var": 0.4096019620118688, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.8414387106895447, + "loss/hidden": 0.0, + "loss/logits": 0.19881393015384674, + "loss/reg": 0.4127810001373291, + "step": 2974 + }, + { + "epoch": 0.02975, + "grad_norm": 0.477894127368927, + "grad_norm_var": 0.4101530255648533, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.8319895267486572, + "loss/hidden": 0.0, + "loss/logits": 0.20870358869433403, + "loss/reg": 0.41251644492149353, + "step": 2975 + }, + { + "epoch": 0.02976, + "grad_norm": 0.4555509686470032, + "grad_norm_var": 0.41038988498656775, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.799358904361725, + "loss/hidden": 0.0, + "loss/logits": 0.19652298092842102, + "loss/reg": 0.41222381591796875, + "step": 2976 + }, + { + "epoch": 0.02977, + "grad_norm": 0.43295231461524963, + "grad_norm_var": 0.4096829742534434, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.949941873550415, + "loss/hidden": 0.0, + "loss/logits": 0.20376397296786308, + "loss/reg": 0.41205185651779175, + "step": 2977 + }, + { + "epoch": 0.02978, + "grad_norm": 0.43264952301979065, + "grad_norm_var": 0.40942521135820337, + "learning_rate": 5e-05, + "loss": 0.202, + "loss/crossentropy": 2.8312630653381348, + "loss/hidden": 0.0, + "loss/logits": 0.20201675221323967, + "loss/reg": 0.4116247594356537, + "step": 2978 + }, + { + "epoch": 0.02979, + "grad_norm": 0.4517178535461426, + "grad_norm_var": 0.40951663605456234, + "learning_rate": 5e-05, + "loss": 0.2235, + "loss/crossentropy": 2.8521989583969116, + "loss/hidden": 0.0, + "loss/logits": 0.22349534556269646, + "loss/reg": 0.41152116656303406, + "step": 2979 + }, + { + "epoch": 0.0298, + "grad_norm": 0.45812955498695374, + "grad_norm_var": 0.4086729241086346, + "learning_rate": 5e-05, + "loss": 0.2301, + "loss/crossentropy": 3.049180746078491, + "loss/hidden": 0.0, + "loss/logits": 0.23012201860547066, + "loss/reg": 0.41161587834358215, + "step": 2980 + }, + { + "epoch": 0.02981, + "grad_norm": 0.45392271876335144, + "grad_norm_var": 0.40878339981097145, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.952320873737335, + "loss/hidden": 0.0, + "loss/logits": 0.20351208746433258, + "loss/reg": 0.4112384021282196, + "step": 2981 + }, + { + "epoch": 0.02982, + "grad_norm": 0.4621219336986542, + "grad_norm_var": 0.40869652340421986, + "learning_rate": 5e-05, + "loss": 0.2002, + "loss/crossentropy": 2.8466333150863647, + "loss/hidden": 0.0, + "loss/logits": 0.20022378489375114, + "loss/reg": 0.4111112058162689, + "step": 2982 + }, + { + "epoch": 0.02983, + "grad_norm": 0.4917875826358795, + "grad_norm_var": 0.0029386913636527457, + "learning_rate": 5e-05, + "loss": 0.2199, + "loss/crossentropy": 2.8287700414657593, + "loss/hidden": 0.0, + "loss/logits": 0.219918143004179, + "loss/reg": 0.4106608033180237, + "step": 2983 + }, + { + "epoch": 0.02984, + "grad_norm": 0.5056775212287903, + "grad_norm_var": 0.0029700731394197376, + "learning_rate": 5e-05, + "loss": 0.233, + "loss/crossentropy": 2.7453145384788513, + "loss/hidden": 0.0, + "loss/logits": 0.2329668514430523, + "loss/reg": 0.41041040420532227, + "step": 2984 + }, + { + "epoch": 0.02985, + "grad_norm": 0.4265885055065155, + "grad_norm_var": 0.0031690738825403742, + "learning_rate": 5e-05, + "loss": 0.2125, + "loss/crossentropy": 2.881341814994812, + "loss/hidden": 0.0, + "loss/logits": 0.21252062544226646, + "loss/reg": 0.4104888439178467, + "step": 2985 + }, + { + "epoch": 0.02986, + "grad_norm": 0.505568265914917, + "grad_norm_var": 0.0031650468013554645, + "learning_rate": 5e-05, + "loss": 0.2198, + "loss/crossentropy": 2.8155256509780884, + "loss/hidden": 0.0, + "loss/logits": 0.21983979642391205, + "loss/reg": 0.4105539321899414, + "step": 2986 + }, + { + "epoch": 0.02987, + "grad_norm": 0.4987407922744751, + "grad_norm_var": 0.002963517002138109, + "learning_rate": 5e-05, + "loss": 0.2129, + "loss/crossentropy": 2.741531789302826, + "loss/hidden": 0.0, + "loss/logits": 0.21285096928477287, + "loss/reg": 0.4104522466659546, + "step": 2987 + }, + { + "epoch": 0.02988, + "grad_norm": 0.4507018029689789, + "grad_norm_var": 0.0012544183887361153, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.8095011711120605, + "loss/hidden": 0.0, + "loss/logits": 0.22207725420594215, + "loss/reg": 0.41037923097610474, + "step": 2988 + }, + { + "epoch": 0.02989, + "grad_norm": 0.5525490641593933, + "grad_norm_var": 0.0011563084834823082, + "learning_rate": 5e-05, + "loss": 0.2291, + "loss/crossentropy": 2.6801156401634216, + "loss/hidden": 0.0, + "loss/logits": 0.2290969230234623, + "loss/reg": 0.409972608089447, + "step": 2989 + }, + { + "epoch": 0.0299, + "grad_norm": 0.41515612602233887, + "grad_norm_var": 0.0013071006984379943, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.853612780570984, + "loss/hidden": 0.0, + "loss/logits": 0.19822300225496292, + "loss/reg": 0.40966442227363586, + "step": 2990 + }, + { + "epoch": 0.02991, + "grad_norm": 0.4788128435611725, + "grad_norm_var": 0.001308490162923681, + "learning_rate": 5e-05, + "loss": 0.2365, + "loss/crossentropy": 2.7433003783226013, + "loss/hidden": 0.0, + "loss/logits": 0.23647205531597137, + "loss/reg": 0.40945765376091003, + "step": 2991 + }, + { + "epoch": 0.02992, + "grad_norm": 0.4172877371311188, + "grad_norm_var": 0.0014586051454637873, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.775649845600128, + "loss/hidden": 0.0, + "loss/logits": 0.2023325301706791, + "loss/reg": 0.4093344807624817, + "step": 2992 + }, + { + "epoch": 0.02993, + "grad_norm": 0.4973691701889038, + "grad_norm_var": 0.001445721404281027, + "learning_rate": 5e-05, + "loss": 0.2171, + "loss/crossentropy": 3.028523087501526, + "loss/hidden": 0.0, + "loss/logits": 0.21711710467934608, + "loss/reg": 0.4092247188091278, + "step": 2993 + }, + { + "epoch": 0.02994, + "grad_norm": 0.46081361174583435, + "grad_norm_var": 0.001360018560644068, + "learning_rate": 5e-05, + "loss": 0.1977, + "loss/crossentropy": 2.868141233921051, + "loss/hidden": 0.0, + "loss/logits": 0.1977062188088894, + "loss/reg": 0.40902450680732727, + "step": 2994 + }, + { + "epoch": 0.02995, + "grad_norm": 0.43707484006881714, + "grad_norm_var": 0.0014099612461600372, + "learning_rate": 5e-05, + "loss": 0.2174, + "loss/crossentropy": 2.95755273103714, + "loss/hidden": 0.0, + "loss/logits": 0.2173781655728817, + "loss/reg": 0.4086230397224426, + "step": 2995 + }, + { + "epoch": 0.02996, + "grad_norm": 0.43464338779449463, + "grad_norm_var": 0.0014801017929861734, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.943657338619232, + "loss/hidden": 0.0, + "loss/logits": 0.21084435656666756, + "loss/reg": 0.40813905000686646, + "step": 2996 + }, + { + "epoch": 0.02997, + "grad_norm": 0.43105897307395935, + "grad_norm_var": 0.0015558437610451391, + "learning_rate": 5e-05, + "loss": 0.2116, + "loss/crossentropy": 2.790625810623169, + "loss/hidden": 0.0, + "loss/logits": 0.21155769750475883, + "loss/reg": 0.4077605903148651, + "step": 2997 + }, + { + "epoch": 0.02998, + "grad_norm": 0.6937394738197327, + "grad_norm_var": 0.004769788690252208, + "learning_rate": 5e-05, + "loss": 0.212, + "loss/crossentropy": 2.865418255329132, + "loss/hidden": 0.0, + "loss/logits": 0.211971715092659, + "loss/reg": 0.4074494242668152, + "step": 2998 + }, + { + "epoch": 0.02999, + "grad_norm": 0.4569818675518036, + "grad_norm_var": 0.00479589623642556, + "learning_rate": 5e-05, + "loss": 0.2216, + "loss/crossentropy": 2.8738847970962524, + "loss/hidden": 0.0, + "loss/logits": 0.2215680405497551, + "loss/reg": 0.4072476327419281, + "step": 2999 + }, + { + "epoch": 0.03, + "grad_norm": 0.4593050479888916, + "grad_norm_var": 0.004764871966963317, + "learning_rate": 5e-05, + "loss": 0.2118, + "loss/crossentropy": 2.7084482312202454, + "loss/hidden": 0.0, + "loss/logits": 0.21175257116556168, + "loss/reg": 0.40698838233947754, + "step": 3000 + }, + { + "epoch": 0.03001, + "grad_norm": 0.4397144615650177, + "grad_norm_var": 0.0046891208985484455, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.747729480266571, + "loss/hidden": 0.0, + "loss/logits": 0.20641162991523743, + "loss/reg": 0.407015323638916, + "step": 3001 + }, + { + "epoch": 0.03002, + "grad_norm": 0.4318627119064331, + "grad_norm_var": 0.004746375904967156, + "learning_rate": 5e-05, + "loss": 0.206, + "loss/crossentropy": 2.787463963031769, + "loss/hidden": 0.0, + "loss/logits": 0.20604726672172546, + "loss/reg": 0.4069214165210724, + "step": 3002 + }, + { + "epoch": 0.03003, + "grad_norm": 0.44153761863708496, + "grad_norm_var": 0.004748751284728636, + "learning_rate": 5e-05, + "loss": 0.2173, + "loss/crossentropy": 2.8046445846557617, + "loss/hidden": 0.0, + "loss/logits": 0.21730247884988785, + "loss/reg": 0.40676385164260864, + "step": 3003 + }, + { + "epoch": 0.03004, + "grad_norm": 0.4525713324546814, + "grad_norm_var": 0.004744492521182882, + "learning_rate": 5e-05, + "loss": 0.2206, + "loss/crossentropy": 2.7294583320617676, + "loss/hidden": 0.0, + "loss/logits": 0.22060563787817955, + "loss/reg": 0.4063129425048828, + "step": 3004 + }, + { + "epoch": 0.03005, + "grad_norm": 0.4181974232196808, + "grad_norm_var": 0.004372036798314153, + "learning_rate": 5e-05, + "loss": 0.2068, + "loss/crossentropy": 2.696288824081421, + "loss/hidden": 0.0, + "loss/logits": 0.20675234124064445, + "loss/reg": 0.405872106552124, + "step": 3005 + }, + { + "epoch": 0.03006, + "grad_norm": 0.47249698638916016, + "grad_norm_var": 0.004231756130832868, + "learning_rate": 5e-05, + "loss": 0.2244, + "loss/crossentropy": 2.89243221282959, + "loss/hidden": 0.0, + "loss/logits": 0.22441335022449493, + "loss/reg": 0.4053153395652771, + "step": 3006 + }, + { + "epoch": 0.03007, + "grad_norm": 0.4448484182357788, + "grad_norm_var": 0.004236623000908309, + "learning_rate": 5e-05, + "loss": 0.2085, + "loss/crossentropy": 2.7324312925338745, + "loss/hidden": 0.0, + "loss/logits": 0.20850449055433273, + "loss/reg": 0.40495994687080383, + "step": 3007 + }, + { + "epoch": 0.03008, + "grad_norm": 0.4372553527355194, + "grad_norm_var": 0.004142917950953103, + "learning_rate": 5e-05, + "loss": 0.1989, + "loss/crossentropy": 2.894156754016876, + "loss/hidden": 0.0, + "loss/logits": 0.19885458052158356, + "loss/reg": 0.40466898679733276, + "step": 3008 + }, + { + "epoch": 0.03009, + "grad_norm": 0.5639891624450684, + "grad_norm_var": 0.004724780287416171, + "learning_rate": 5e-05, + "loss": 0.2264, + "loss/crossentropy": 2.7507764101028442, + "loss/hidden": 0.0, + "loss/logits": 0.2263515554368496, + "loss/reg": 0.40433433651924133, + "step": 3009 + }, + { + "epoch": 0.0301, + "grad_norm": 0.4454791247844696, + "grad_norm_var": 0.004752648357734835, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.6277093291282654, + "loss/hidden": 0.0, + "loss/logits": 0.20228134468197823, + "loss/reg": 0.4042360484600067, + "step": 3010 + }, + { + "epoch": 0.03011, + "grad_norm": 0.46629035472869873, + "grad_norm_var": 0.004692161924048444, + "learning_rate": 5e-05, + "loss": 0.2305, + "loss/crossentropy": 2.8652483224868774, + "loss/hidden": 0.0, + "loss/logits": 0.2304811030626297, + "loss/reg": 0.40383026003837585, + "step": 3011 + }, + { + "epoch": 0.03012, + "grad_norm": 0.4272403120994568, + "grad_norm_var": 0.004728634444874124, + "learning_rate": 5e-05, + "loss": 0.2132, + "loss/crossentropy": 2.817522644996643, + "loss/hidden": 0.0, + "loss/logits": 0.2132273204624653, + "loss/reg": 0.4037969410419464, + "step": 3012 + }, + { + "epoch": 0.03013, + "grad_norm": 0.434648722410202, + "grad_norm_var": 0.004711921112448299, + "learning_rate": 5e-05, + "loss": 0.2094, + "loss/crossentropy": 2.735086441040039, + "loss/hidden": 0.0, + "loss/logits": 0.2094290480017662, + "loss/reg": 0.4036831855773926, + "step": 3013 + }, + { + "epoch": 0.03014, + "grad_norm": 1.4197618961334229, + "grad_norm_var": 0.05951960411449542, + "learning_rate": 5e-05, + "loss": 0.2492, + "loss/crossentropy": 3.003745436668396, + "loss/hidden": 0.0, + "loss/logits": 0.249249380081892, + "loss/reg": 0.40324291586875916, + "step": 3014 + }, + { + "epoch": 0.03015, + "grad_norm": 0.43403294682502747, + "grad_norm_var": 0.059724726887875924, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.7997838854789734, + "loss/hidden": 0.0, + "loss/logits": 0.2037663869559765, + "loss/reg": 0.4032883942127228, + "step": 3015 + }, + { + "epoch": 0.03016, + "grad_norm": 0.5060783624649048, + "grad_norm_var": 0.059533910767967116, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.8521798849105835, + "loss/hidden": 0.0, + "loss/logits": 0.20374320447444916, + "loss/reg": 0.4033139944076538, + "step": 3016 + }, + { + "epoch": 0.03017, + "grad_norm": 0.5364549160003662, + "grad_norm_var": 0.059150963361671936, + "learning_rate": 5e-05, + "loss": 0.2152, + "loss/crossentropy": 2.8101565837860107, + "loss/hidden": 0.0, + "loss/logits": 0.21517881751060486, + "loss/reg": 0.4031355679035187, + "step": 3017 + }, + { + "epoch": 0.03018, + "grad_norm": 0.561177670955658, + "grad_norm_var": 0.058662713118680744, + "learning_rate": 5e-05, + "loss": 0.2206, + "loss/crossentropy": 2.7943480014801025, + "loss/hidden": 0.0, + "loss/logits": 0.2205517739057541, + "loss/reg": 0.40295350551605225, + "step": 3018 + }, + { + "epoch": 0.03019, + "grad_norm": 0.4853002429008484, + "grad_norm_var": 0.05827277390621719, + "learning_rate": 5e-05, + "loss": 0.2236, + "loss/crossentropy": 2.6148573756217957, + "loss/hidden": 0.0, + "loss/logits": 0.22360238060355186, + "loss/reg": 0.4028816223144531, + "step": 3019 + }, + { + "epoch": 0.0302, + "grad_norm": 0.4350965917110443, + "grad_norm_var": 0.05847602588665346, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.7065069675445557, + "loss/hidden": 0.0, + "loss/logits": 0.20534758642315865, + "loss/reg": 0.4025703966617584, + "step": 3020 + }, + { + "epoch": 0.03021, + "grad_norm": 0.41484951972961426, + "grad_norm_var": 0.05852686656285524, + "learning_rate": 5e-05, + "loss": 0.1928, + "loss/crossentropy": 2.9356210231781006, + "loss/hidden": 0.0, + "loss/logits": 0.1927533522248268, + "loss/reg": 0.40253397822380066, + "step": 3021 + }, + { + "epoch": 0.03022, + "grad_norm": 0.49251025915145874, + "grad_norm_var": 0.05839762263977747, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.8911741971969604, + "loss/hidden": 0.0, + "loss/logits": 0.20323756337165833, + "loss/reg": 0.40223628282546997, + "step": 3022 + }, + { + "epoch": 0.03023, + "grad_norm": 0.5433621406555176, + "grad_norm_var": 0.05786516726355509, + "learning_rate": 5e-05, + "loss": 0.2125, + "loss/crossentropy": 2.78800767660141, + "loss/hidden": 0.0, + "loss/logits": 0.21251022815704346, + "loss/reg": 0.40198686718940735, + "step": 3023 + }, + { + "epoch": 0.03024, + "grad_norm": 0.4588299095630646, + "grad_norm_var": 0.057605259879881535, + "learning_rate": 5e-05, + "loss": 0.2146, + "loss/crossentropy": 2.8331373929977417, + "loss/hidden": 0.0, + "loss/logits": 0.21456065028905869, + "loss/reg": 0.40154939889907837, + "step": 3024 + }, + { + "epoch": 0.03025, + "grad_norm": 0.4346790611743927, + "grad_norm_var": 0.05822066959679386, + "learning_rate": 5e-05, + "loss": 0.2128, + "loss/crossentropy": 2.8522785305976868, + "loss/hidden": 0.0, + "loss/logits": 0.2127925232052803, + "loss/reg": 0.40095213055610657, + "step": 3025 + }, + { + "epoch": 0.03026, + "grad_norm": 0.48451489210128784, + "grad_norm_var": 0.05787085779615249, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.7884156703948975, + "loss/hidden": 0.0, + "loss/logits": 0.20078472048044205, + "loss/reg": 0.40065431594848633, + "step": 3026 + }, + { + "epoch": 0.03027, + "grad_norm": 0.4133094847202301, + "grad_norm_var": 0.05852055277800764, + "learning_rate": 5e-05, + "loss": 0.2029, + "loss/crossentropy": 2.911341965198517, + "loss/hidden": 0.0, + "loss/logits": 0.20290425419807434, + "loss/reg": 0.39991655945777893, + "step": 3027 + }, + { + "epoch": 0.03028, + "grad_norm": 0.45020559430122375, + "grad_norm_var": 0.05823850804848683, + "learning_rate": 5e-05, + "loss": 0.2196, + "loss/crossentropy": 2.8901261687278748, + "loss/hidden": 0.0, + "loss/logits": 0.2195754460990429, + "loss/reg": 0.3996748924255371, + "step": 3028 + }, + { + "epoch": 0.03029, + "grad_norm": 0.4794008433818817, + "grad_norm_var": 0.05778547048910945, + "learning_rate": 5e-05, + "loss": 0.2197, + "loss/crossentropy": 2.7915091514587402, + "loss/hidden": 0.0, + "loss/logits": 0.21972005814313889, + "loss/reg": 0.3994423449039459, + "step": 3029 + }, + { + "epoch": 0.0303, + "grad_norm": 0.4068341553211212, + "grad_norm_var": 0.0023304795328047117, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.8088658452033997, + "loss/hidden": 0.0, + "loss/logits": 0.20358917117118835, + "loss/reg": 0.3989386558532715, + "step": 3030 + }, + { + "epoch": 0.03031, + "grad_norm": 0.4388318657875061, + "grad_norm_var": 0.0023082398406893117, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.755333960056305, + "loss/hidden": 0.0, + "loss/logits": 0.2164311483502388, + "loss/reg": 0.39855140447616577, + "step": 3031 + }, + { + "epoch": 0.03032, + "grad_norm": 0.45733383297920227, + "grad_norm_var": 0.0022309658211788837, + "learning_rate": 5e-05, + "loss": 0.2156, + "loss/crossentropy": 2.9809648394584656, + "loss/hidden": 0.0, + "loss/logits": 0.2156263366341591, + "loss/reg": 0.39832744002342224, + "step": 3032 + }, + { + "epoch": 0.03033, + "grad_norm": 0.4208433926105499, + "grad_norm_var": 0.0020156381544362352, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.9573736786842346, + "loss/hidden": 0.0, + "loss/logits": 0.18751448020339012, + "loss/reg": 0.3983627259731293, + "step": 3033 + }, + { + "epoch": 0.03034, + "grad_norm": 0.4416068196296692, + "grad_norm_var": 0.001313177444841159, + "learning_rate": 5e-05, + "loss": 0.2177, + "loss/crossentropy": 2.594088852405548, + "loss/hidden": 0.0, + "loss/logits": 0.21766668185591698, + "loss/reg": 0.39849621057510376, + "step": 3034 + }, + { + "epoch": 0.03035, + "grad_norm": 0.4776979386806488, + "grad_norm_var": 0.0012846511920712326, + "learning_rate": 5e-05, + "loss": 0.2229, + "loss/crossentropy": 2.8138654232025146, + "loss/hidden": 0.0, + "loss/logits": 0.22291235625743866, + "loss/reg": 0.39837443828582764, + "step": 3035 + }, + { + "epoch": 0.03036, + "grad_norm": 0.41231635212898254, + "grad_norm_var": 0.0013718259733462043, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.77077978849411, + "loss/hidden": 0.0, + "loss/logits": 0.20708630234003067, + "loss/reg": 0.39841732382774353, + "step": 3036 + }, + { + "epoch": 0.03037, + "grad_norm": 0.4618535041809082, + "grad_norm_var": 0.0012789915909739917, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.8846208453178406, + "loss/hidden": 0.0, + "loss/logits": 0.20496291294693947, + "loss/reg": 0.398523211479187, + "step": 3037 + }, + { + "epoch": 0.03038, + "grad_norm": 0.4427744150161743, + "grad_norm_var": 0.0011824148317892617, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.896592140197754, + "loss/hidden": 0.0, + "loss/logits": 0.20867354795336723, + "loss/reg": 0.39841124415397644, + "step": 3038 + }, + { + "epoch": 0.03039, + "grad_norm": 0.472601979970932, + "grad_norm_var": 0.0006288941570083121, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.7712117433547974, + "loss/hidden": 0.0, + "loss/logits": 0.21548501402139664, + "loss/reg": 0.3983190953731537, + "step": 3039 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4372209906578064, + "grad_norm_var": 0.0006242882844309961, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.8383272886276245, + "loss/hidden": 0.0, + "loss/logits": 0.21785932406783104, + "loss/reg": 0.39815738797187805, + "step": 3040 + }, + { + "epoch": 0.03041, + "grad_norm": 0.4225694537162781, + "grad_norm_var": 0.0006513312781409615, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.9678894877433777, + "loss/hidden": 0.0, + "loss/logits": 0.21190844103693962, + "loss/reg": 0.39790207147598267, + "step": 3041 + }, + { + "epoch": 0.03042, + "grad_norm": 0.4791715443134308, + "grad_norm_var": 0.0006249597350523218, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.9027262330055237, + "loss/hidden": 0.0, + "loss/logits": 0.20280738919973373, + "loss/reg": 0.39794525504112244, + "step": 3042 + }, + { + "epoch": 0.03043, + "grad_norm": 0.4933237135410309, + "grad_norm_var": 0.0006906289491075412, + "learning_rate": 5e-05, + "loss": 0.2362, + "loss/crossentropy": 2.7814919352531433, + "loss/hidden": 0.0, + "loss/logits": 0.23622025921940804, + "loss/reg": 0.39792802929878235, + "step": 3043 + }, + { + "epoch": 0.03044, + "grad_norm": 0.4225125014781952, + "grad_norm_var": 0.0007365521934462382, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.7942729592323303, + "loss/hidden": 0.0, + "loss/logits": 0.20500367134809494, + "loss/reg": 0.3980974555015564, + "step": 3044 + }, + { + "epoch": 0.03045, + "grad_norm": 0.4311182200908661, + "grad_norm_var": 0.0006796589613730309, + "learning_rate": 5e-05, + "loss": 0.2031, + "loss/crossentropy": 2.7577481865882874, + "loss/hidden": 0.0, + "loss/logits": 0.20309137180447578, + "loss/reg": 0.39828598499298096, + "step": 3045 + }, + { + "epoch": 0.03046, + "grad_norm": 0.4314897954463959, + "grad_norm_var": 0.0005924710921448788, + "learning_rate": 5e-05, + "loss": 0.2006, + "loss/crossentropy": 2.7588038444519043, + "loss/hidden": 0.0, + "loss/logits": 0.20063773542642593, + "loss/reg": 0.39819660782814026, + "step": 3046 + }, + { + "epoch": 0.03047, + "grad_norm": 0.40626370906829834, + "grad_norm_var": 0.0006918630387390194, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.7322017550468445, + "loss/hidden": 0.0, + "loss/logits": 0.19568296521902084, + "loss/reg": 0.39800480008125305, + "step": 3047 + }, + { + "epoch": 0.03048, + "grad_norm": 0.4363268315792084, + "grad_norm_var": 0.0006832693101546994, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.7159923911094666, + "loss/hidden": 0.0, + "loss/logits": 0.20607688277959824, + "loss/reg": 0.3975684344768524, + "step": 3048 + }, + { + "epoch": 0.03049, + "grad_norm": 0.5293384194374084, + "grad_norm_var": 0.0010969209039714618, + "learning_rate": 5e-05, + "loss": 0.2205, + "loss/crossentropy": 2.711432635784149, + "loss/hidden": 0.0, + "loss/logits": 0.22052596509456635, + "loss/reg": 0.39780956506729126, + "step": 3049 + }, + { + "epoch": 0.0305, + "grad_norm": 0.506104052066803, + "grad_norm_var": 0.001285710843558437, + "learning_rate": 5e-05, + "loss": 0.2388, + "loss/crossentropy": 2.78479266166687, + "loss/hidden": 0.0, + "loss/logits": 0.23884322866797447, + "loss/reg": 0.39776161313056946, + "step": 3050 + }, + { + "epoch": 0.03051, + "grad_norm": 0.5201892852783203, + "grad_norm_var": 0.0015332826721788557, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.7641706466674805, + "loss/hidden": 0.0, + "loss/logits": 0.21004071831703186, + "loss/reg": 0.3974432945251465, + "step": 3051 + }, + { + "epoch": 0.03052, + "grad_norm": 0.39363333582878113, + "grad_norm_var": 0.001665346026001598, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.7352213263511658, + "loss/hidden": 0.0, + "loss/logits": 0.19383027404546738, + "loss/reg": 0.3973250687122345, + "step": 3052 + }, + { + "epoch": 0.03053, + "grad_norm": 0.4225972592830658, + "grad_norm_var": 0.001727913126859563, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.796771466732025, + "loss/hidden": 0.0, + "loss/logits": 0.2044854573905468, + "loss/reg": 0.39734533429145813, + "step": 3053 + }, + { + "epoch": 0.03054, + "grad_norm": 0.44614896178245544, + "grad_norm_var": 0.0017240454529127518, + "learning_rate": 5e-05, + "loss": 0.2031, + "loss/crossentropy": 2.8176177740097046, + "loss/hidden": 0.0, + "loss/logits": 0.20314955711364746, + "loss/reg": 0.3971204161643982, + "step": 3054 + }, + { + "epoch": 0.03055, + "grad_norm": 0.8071470856666565, + "grad_norm_var": 0.009586161876492687, + "learning_rate": 5e-05, + "loss": 0.2335, + "loss/crossentropy": 2.7193825244903564, + "loss/hidden": 0.0, + "loss/logits": 0.2335340902209282, + "loss/reg": 0.39717304706573486, + "step": 3055 + }, + { + "epoch": 0.03056, + "grad_norm": 0.4427300691604614, + "grad_norm_var": 0.009560989922628086, + "learning_rate": 5e-05, + "loss": 0.2041, + "loss/crossentropy": 2.776980400085449, + "loss/hidden": 0.0, + "loss/logits": 0.20412253215909004, + "loss/reg": 0.39708614349365234, + "step": 3056 + }, + { + "epoch": 0.03057, + "grad_norm": 0.4620968699455261, + "grad_norm_var": 0.009385390246766637, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.8334758281707764, + "loss/hidden": 0.0, + "loss/logits": 0.2024206519126892, + "loss/reg": 0.39706647396087646, + "step": 3057 + }, + { + "epoch": 0.03058, + "grad_norm": 0.4607102572917938, + "grad_norm_var": 0.009401067971332937, + "learning_rate": 5e-05, + "loss": 0.2249, + "loss/crossentropy": 2.84277480840683, + "loss/hidden": 0.0, + "loss/logits": 0.2248813547194004, + "loss/reg": 0.3970212936401367, + "step": 3058 + }, + { + "epoch": 0.03059, + "grad_norm": 0.4471275806427002, + "grad_norm_var": 0.009426099325550712, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.714890778064728, + "loss/hidden": 0.0, + "loss/logits": 0.2073524221777916, + "loss/reg": 0.3974565863609314, + "step": 3059 + }, + { + "epoch": 0.0306, + "grad_norm": 0.4428834021091461, + "grad_norm_var": 0.009315323648627312, + "learning_rate": 5e-05, + "loss": 0.1927, + "loss/crossentropy": 2.7059999108314514, + "loss/hidden": 0.0, + "loss/logits": 0.19272072613239288, + "loss/reg": 0.39739498496055603, + "step": 3060 + }, + { + "epoch": 0.03061, + "grad_norm": 0.5060510635375977, + "grad_norm_var": 0.00923663336807445, + "learning_rate": 5e-05, + "loss": 0.223, + "loss/crossentropy": 2.757682740688324, + "loss/hidden": 0.0, + "loss/logits": 0.22299232706427574, + "loss/reg": 0.3972777724266052, + "step": 3061 + }, + { + "epoch": 0.03062, + "grad_norm": 0.4319446384906769, + "grad_norm_var": 0.009233776991932289, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.808611214160919, + "loss/hidden": 0.0, + "loss/logits": 0.2070915549993515, + "loss/reg": 0.3971980810165405, + "step": 3062 + }, + { + "epoch": 0.03063, + "grad_norm": 0.4342910647392273, + "grad_norm_var": 0.009011690956267441, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.7680245637893677, + "loss/hidden": 0.0, + "loss/logits": 0.20738160237669945, + "loss/reg": 0.39710837602615356, + "step": 3063 + }, + { + "epoch": 0.03064, + "grad_norm": 0.47819551825523376, + "grad_norm_var": 0.008874195682015702, + "learning_rate": 5e-05, + "loss": 0.207, + "loss/crossentropy": 2.8481932878494263, + "loss/hidden": 0.0, + "loss/logits": 0.20701946318149567, + "loss/reg": 0.3970259428024292, + "step": 3064 + }, + { + "epoch": 0.03065, + "grad_norm": 0.4897119700908661, + "grad_norm_var": 0.008728559407074243, + "learning_rate": 5e-05, + "loss": 0.2105, + "loss/crossentropy": 2.76278156042099, + "loss/hidden": 0.0, + "loss/logits": 0.2105187401175499, + "loss/reg": 0.39654862880706787, + "step": 3065 + }, + { + "epoch": 0.03066, + "grad_norm": 0.4575293958187103, + "grad_norm_var": 0.008711642272262484, + "learning_rate": 5e-05, + "loss": 0.2223, + "loss/crossentropy": 2.8392109274864197, + "loss/hidden": 0.0, + "loss/logits": 0.2223300337791443, + "loss/reg": 0.3964754641056061, + "step": 3066 + }, + { + "epoch": 0.03067, + "grad_norm": 0.45358696579933167, + "grad_norm_var": 0.008611448290133087, + "learning_rate": 5e-05, + "loss": 0.2194, + "loss/crossentropy": 2.882247507572174, + "loss/hidden": 0.0, + "loss/logits": 0.21941563859581947, + "loss/reg": 0.3961576819419861, + "step": 3067 + }, + { + "epoch": 0.03068, + "grad_norm": 0.5554410219192505, + "grad_norm_var": 0.008524214504109914, + "learning_rate": 5e-05, + "loss": 0.2468, + "loss/crossentropy": 2.9757755994796753, + "loss/hidden": 0.0, + "loss/logits": 0.24682281911373138, + "loss/reg": 0.39594757556915283, + "step": 3068 + }, + { + "epoch": 0.03069, + "grad_norm": 0.4306425154209137, + "grad_norm_var": 0.008462782433041448, + "learning_rate": 5e-05, + "loss": 0.2089, + "loss/crossentropy": 2.8143198490142822, + "loss/hidden": 0.0, + "loss/logits": 0.2088850513100624, + "loss/reg": 0.39541342854499817, + "step": 3069 + }, + { + "epoch": 0.0307, + "grad_norm": 0.4220050275325775, + "grad_norm_var": 0.008621515621974185, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.7359976172447205, + "loss/hidden": 0.0, + "loss/logits": 0.19830382615327835, + "loss/reg": 0.3949725031852722, + "step": 3070 + }, + { + "epoch": 0.03071, + "grad_norm": 0.44482648372650146, + "grad_norm_var": 0.0011490927933506812, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.9423199892044067, + "loss/hidden": 0.0, + "loss/logits": 0.19831636175513268, + "loss/reg": 0.3948777914047241, + "step": 3071 + }, + { + "epoch": 0.03072, + "grad_norm": 0.5303253531455994, + "grad_norm_var": 0.0014271143061801994, + "learning_rate": 5e-05, + "loss": 0.2501, + "loss/crossentropy": 2.6319790482521057, + "loss/hidden": 0.0, + "loss/logits": 0.25007568299770355, + "loss/reg": 0.394430935382843, + "step": 3072 + }, + { + "epoch": 0.03073, + "grad_norm": 0.47260767221450806, + "grad_norm_var": 0.0014293050908878299, + "learning_rate": 5e-05, + "loss": 0.2149, + "loss/crossentropy": 2.801904082298279, + "loss/hidden": 0.0, + "loss/logits": 0.214866004884243, + "loss/reg": 0.3942534029483795, + "step": 3073 + }, + { + "epoch": 0.03074, + "grad_norm": 0.4308335781097412, + "grad_norm_var": 0.0015066336318305526, + "learning_rate": 5e-05, + "loss": 0.2232, + "loss/crossentropy": 2.8193368911743164, + "loss/hidden": 0.0, + "loss/logits": 0.2231505587697029, + "loss/reg": 0.3936229348182678, + "step": 3074 + }, + { + "epoch": 0.03075, + "grad_norm": 0.42338499426841736, + "grad_norm_var": 0.001596070245053538, + "learning_rate": 5e-05, + "loss": 0.2163, + "loss/crossentropy": 2.768975019454956, + "loss/hidden": 0.0, + "loss/logits": 0.21634433791041374, + "loss/reg": 0.39347314834594727, + "step": 3075 + }, + { + "epoch": 0.03076, + "grad_norm": 0.4502422511577606, + "grad_norm_var": 0.001579946095298453, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.7641252279281616, + "loss/hidden": 0.0, + "loss/logits": 0.20268548280000687, + "loss/reg": 0.39299461245536804, + "step": 3076 + }, + { + "epoch": 0.03077, + "grad_norm": 0.4843948781490326, + "grad_norm_var": 0.0014856016253207206, + "learning_rate": 5e-05, + "loss": 0.2069, + "loss/crossentropy": 2.8533742427825928, + "loss/hidden": 0.0, + "loss/logits": 0.20689698681235313, + "loss/reg": 0.39270803332328796, + "step": 3077 + }, + { + "epoch": 0.03078, + "grad_norm": 0.4364874064922333, + "grad_norm_var": 0.001468763918073357, + "learning_rate": 5e-05, + "loss": 0.2146, + "loss/crossentropy": 2.875757575035095, + "loss/hidden": 0.0, + "loss/logits": 0.2146349586546421, + "loss/reg": 0.3924676477909088, + "step": 3078 + }, + { + "epoch": 0.03079, + "grad_norm": 0.4508589804172516, + "grad_norm_var": 0.0014243633270431765, + "learning_rate": 5e-05, + "loss": 0.2068, + "loss/crossentropy": 2.7768712639808655, + "loss/hidden": 0.0, + "loss/logits": 0.20679758489131927, + "loss/reg": 0.3920327425003052, + "step": 3079 + }, + { + "epoch": 0.0308, + "grad_norm": 0.4127097725868225, + "grad_norm_var": 0.0015613861449169134, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.7912158370018005, + "loss/hidden": 0.0, + "loss/logits": 0.2026847079396248, + "loss/reg": 0.392025887966156, + "step": 3080 + }, + { + "epoch": 0.03081, + "grad_norm": 0.46597495675086975, + "grad_norm_var": 0.0014997142876565543, + "learning_rate": 5e-05, + "loss": 0.2308, + "loss/crossentropy": 2.851341485977173, + "loss/hidden": 0.0, + "loss/logits": 0.23079203069210052, + "loss/reg": 0.3920225203037262, + "step": 3081 + }, + { + "epoch": 0.03082, + "grad_norm": 0.4098932147026062, + "grad_norm_var": 0.0016420878285708226, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.784606397151947, + "loss/hidden": 0.0, + "loss/logits": 0.20782194659113884, + "loss/reg": 0.3917164206504822, + "step": 3082 + }, + { + "epoch": 0.03083, + "grad_norm": 0.6632619500160217, + "grad_norm_var": 0.004360416998337987, + "learning_rate": 5e-05, + "loss": 0.2394, + "loss/crossentropy": 2.960343897342682, + "loss/hidden": 0.0, + "loss/logits": 0.23943787068128586, + "loss/reg": 0.39117884635925293, + "step": 3083 + }, + { + "epoch": 0.03084, + "grad_norm": 0.4344150424003601, + "grad_norm_var": 0.003860709354318918, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.862846553325653, + "loss/hidden": 0.0, + "loss/logits": 0.20478564500808716, + "loss/reg": 0.3905263841152191, + "step": 3084 + }, + { + "epoch": 0.03085, + "grad_norm": 0.41771531105041504, + "grad_norm_var": 0.003922063790597981, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.7566166520118713, + "loss/hidden": 0.0, + "loss/logits": 0.2082574926316738, + "loss/reg": 0.39018648862838745, + "step": 3085 + }, + { + "epoch": 0.03086, + "grad_norm": 0.4744522273540497, + "grad_norm_var": 0.0038326839572999168, + "learning_rate": 5e-05, + "loss": 0.2089, + "loss/crossentropy": 2.807427167892456, + "loss/hidden": 0.0, + "loss/logits": 0.20893971249461174, + "loss/reg": 0.3899044692516327, + "step": 3086 + }, + { + "epoch": 0.03087, + "grad_norm": 0.41140273213386536, + "grad_norm_var": 0.003981931723220559, + "learning_rate": 5e-05, + "loss": 0.1959, + "loss/crossentropy": 2.8307939767837524, + "loss/hidden": 0.0, + "loss/logits": 0.19591112434864044, + "loss/reg": 0.3898000121116638, + "step": 3087 + }, + { + "epoch": 0.03088, + "grad_norm": 0.77448970079422, + "grad_norm_var": 0.009979173620170305, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.7979838252067566, + "loss/hidden": 0.0, + "loss/logits": 0.22205092385411263, + "loss/reg": 0.3893855810165405, + "step": 3088 + }, + { + "epoch": 0.03089, + "grad_norm": 0.4630696177482605, + "grad_norm_var": 0.0099889451440866, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.8363125324249268, + "loss/hidden": 0.0, + "loss/logits": 0.20714826881885529, + "loss/reg": 0.38919103145599365, + "step": 3089 + }, + { + "epoch": 0.0309, + "grad_norm": 0.4816327393054962, + "grad_norm_var": 0.009849562551472598, + "learning_rate": 5e-05, + "loss": 0.2201, + "loss/crossentropy": 2.7747485637664795, + "loss/hidden": 0.0, + "loss/logits": 0.22006035968661308, + "loss/reg": 0.3885936737060547, + "step": 3090 + }, + { + "epoch": 0.03091, + "grad_norm": 0.4451737701892853, + "grad_norm_var": 0.009719409105592929, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.776797294616699, + "loss/hidden": 0.0, + "loss/logits": 0.1966601312160492, + "loss/reg": 0.38822290301322937, + "step": 3091 + }, + { + "epoch": 0.03092, + "grad_norm": 0.49965107440948486, + "grad_norm_var": 0.009677521804659372, + "learning_rate": 5e-05, + "loss": 0.2407, + "loss/crossentropy": 2.795443296432495, + "loss/hidden": 0.0, + "loss/logits": 0.24069646000862122, + "loss/reg": 0.38770148158073425, + "step": 3092 + }, + { + "epoch": 0.03093, + "grad_norm": 0.6744674444198608, + "grad_norm_var": 0.011974673777051323, + "learning_rate": 5e-05, + "loss": 0.2331, + "loss/crossentropy": 2.865045189857483, + "loss/hidden": 0.0, + "loss/logits": 0.23312364891171455, + "loss/reg": 0.386991024017334, + "step": 3093 + }, + { + "epoch": 0.03094, + "grad_norm": 0.4267069399356842, + "grad_norm_var": 0.012056602376572197, + "learning_rate": 5e-05, + "loss": 0.1974, + "loss/crossentropy": 2.7200207710266113, + "loss/hidden": 0.0, + "loss/logits": 0.19739928096532822, + "loss/reg": 0.38675180077552795, + "step": 3094 + }, + { + "epoch": 0.03095, + "grad_norm": 0.4225987195968628, + "grad_norm_var": 0.012269516063240478, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.9032493829727173, + "loss/hidden": 0.0, + "loss/logits": 0.19650064036250114, + "loss/reg": 0.38677260279655457, + "step": 3095 + }, + { + "epoch": 0.03096, + "grad_norm": 0.4576040506362915, + "grad_norm_var": 0.011918760149098503, + "learning_rate": 5e-05, + "loss": 0.2097, + "loss/crossentropy": 2.8028374910354614, + "loss/hidden": 0.0, + "loss/logits": 0.209733746945858, + "loss/reg": 0.3867338001728058, + "step": 3096 + }, + { + "epoch": 0.03097, + "grad_norm": 0.4458896219730377, + "grad_norm_var": 0.01202212433571431, + "learning_rate": 5e-05, + "loss": 0.2176, + "loss/crossentropy": 2.7899158000946045, + "loss/hidden": 0.0, + "loss/logits": 0.21757405623793602, + "loss/reg": 0.38665956258773804, + "step": 3097 + }, + { + "epoch": 0.03098, + "grad_norm": 0.41944143176078796, + "grad_norm_var": 0.011920871772283178, + "learning_rate": 5e-05, + "loss": 0.2039, + "loss/crossentropy": 2.7526310682296753, + "loss/hidden": 0.0, + "loss/logits": 0.20389534905552864, + "loss/reg": 0.3862384855747223, + "step": 3098 + }, + { + "epoch": 0.03099, + "grad_norm": 0.39820730686187744, + "grad_norm_var": 0.010347531700842702, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.7705804109573364, + "loss/hidden": 0.0, + "loss/logits": 0.19879060238599777, + "loss/reg": 0.3859007656574249, + "step": 3099 + }, + { + "epoch": 0.031, + "grad_norm": 0.41860753297805786, + "grad_norm_var": 0.010454869085998016, + "learning_rate": 5e-05, + "loss": 0.1947, + "loss/crossentropy": 2.8576217889785767, + "loss/hidden": 0.0, + "loss/logits": 0.19469799101352692, + "loss/reg": 0.3855467736721039, + "step": 3100 + }, + { + "epoch": 0.03101, + "grad_norm": 0.44522061944007874, + "grad_norm_var": 0.01028493775371732, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.759491264820099, + "loss/hidden": 0.0, + "loss/logits": 0.20432671904563904, + "loss/reg": 0.38542354106903076, + "step": 3101 + }, + { + "epoch": 0.03102, + "grad_norm": 0.48490917682647705, + "grad_norm_var": 0.010285900423027731, + "learning_rate": 5e-05, + "loss": 0.2295, + "loss/crossentropy": 2.7385717034339905, + "loss/hidden": 0.0, + "loss/logits": 0.2295326590538025, + "loss/reg": 0.3855573236942291, + "step": 3102 + }, + { + "epoch": 0.03103, + "grad_norm": 0.4180155098438263, + "grad_norm_var": 0.010228753187548482, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.8116554021835327, + "loss/hidden": 0.0, + "loss/logits": 0.19944601505994797, + "loss/reg": 0.3852228820323944, + "step": 3103 + }, + { + "epoch": 0.03104, + "grad_norm": 0.5501394271850586, + "grad_norm_var": 0.004557322408432359, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.929330885410309, + "loss/hidden": 0.0, + "loss/logits": 0.2220667265355587, + "loss/reg": 0.38537877798080444, + "step": 3104 + }, + { + "epoch": 0.03105, + "grad_norm": 0.4674672484397888, + "grad_norm_var": 0.004556983832318547, + "learning_rate": 5e-05, + "loss": 0.2151, + "loss/crossentropy": 2.7536840438842773, + "loss/hidden": 0.0, + "loss/logits": 0.21512821689248085, + "loss/reg": 0.3851679265499115, + "step": 3105 + }, + { + "epoch": 0.03106, + "grad_norm": 0.42702817916870117, + "grad_norm_var": 0.00462939993853612, + "learning_rate": 5e-05, + "loss": 0.2068, + "loss/crossentropy": 2.681324541568756, + "loss/hidden": 0.0, + "loss/logits": 0.20684696733951569, + "loss/reg": 0.3850401043891907, + "step": 3106 + }, + { + "epoch": 0.03107, + "grad_norm": 0.4066462516784668, + "grad_norm_var": 0.004811540104903259, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.776600658893585, + "loss/hidden": 0.0, + "loss/logits": 0.19434643909335136, + "loss/reg": 0.38479727506637573, + "step": 3107 + }, + { + "epoch": 0.03108, + "grad_norm": 0.44697463512420654, + "grad_norm_var": 0.004707616794326989, + "learning_rate": 5e-05, + "loss": 0.2185, + "loss/crossentropy": 2.8858057856559753, + "loss/hidden": 0.0, + "loss/logits": 0.21848510205745697, + "loss/reg": 0.3843531310558319, + "step": 3108 + }, + { + "epoch": 0.03109, + "grad_norm": 0.4720774292945862, + "grad_norm_var": 0.0013957910822716617, + "learning_rate": 5e-05, + "loss": 0.2307, + "loss/crossentropy": 2.925503671169281, + "loss/hidden": 0.0, + "loss/logits": 0.23067449778318405, + "loss/reg": 0.383881151676178, + "step": 3109 + }, + { + "epoch": 0.0311, + "grad_norm": 0.4657842814922333, + "grad_norm_var": 0.0013999779002556742, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.9126378297805786, + "loss/hidden": 0.0, + "loss/logits": 0.2049425020813942, + "loss/reg": 0.38368770480155945, + "step": 3110 + }, + { + "epoch": 0.03111, + "grad_norm": 0.4198952615261078, + "grad_norm_var": 0.0014091090066120267, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.7111876606941223, + "loss/hidden": 0.0, + "loss/logits": 0.20109078288078308, + "loss/reg": 0.3834381699562073, + "step": 3111 + }, + { + "epoch": 0.03112, + "grad_norm": 0.41884908080101013, + "grad_norm_var": 0.001445572727021634, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.672496497631073, + "loss/hidden": 0.0, + "loss/logits": 0.2024308182299137, + "loss/reg": 0.3830927312374115, + "step": 3112 + }, + { + "epoch": 0.03113, + "grad_norm": 0.46422043442726135, + "grad_norm_var": 0.0014710162079482243, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.7916512489318848, + "loss/hidden": 0.0, + "loss/logits": 0.20868254825472832, + "loss/reg": 0.3830258250236511, + "step": 3113 + }, + { + "epoch": 0.03114, + "grad_norm": 0.44209468364715576, + "grad_norm_var": 0.001425233700921528, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.8319700360298157, + "loss/hidden": 0.0, + "loss/logits": 0.20612385123968124, + "loss/reg": 0.38264548778533936, + "step": 3114 + }, + { + "epoch": 0.03115, + "grad_norm": 0.460742324590683, + "grad_norm_var": 0.0012658697139770526, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.784672498703003, + "loss/hidden": 0.0, + "loss/logits": 0.20533087477087975, + "loss/reg": 0.3823259770870209, + "step": 3115 + }, + { + "epoch": 0.03116, + "grad_norm": 0.4369744062423706, + "grad_norm_var": 0.0012087487382272167, + "learning_rate": 5e-05, + "loss": 0.2059, + "loss/crossentropy": 2.778048574924469, + "loss/hidden": 0.0, + "loss/logits": 0.2059408314526081, + "loss/reg": 0.3820633888244629, + "step": 3116 + }, + { + "epoch": 0.03117, + "grad_norm": 0.44377362728118896, + "grad_norm_var": 0.0012101277395994091, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.839882791042328, + "loss/hidden": 0.0, + "loss/logits": 0.2043137401342392, + "loss/reg": 0.3816540241241455, + "step": 3117 + }, + { + "epoch": 0.03118, + "grad_norm": 0.4408813714981079, + "grad_norm_var": 0.0011357404394097012, + "learning_rate": 5e-05, + "loss": 0.2068, + "loss/crossentropy": 2.8581590056419373, + "loss/hidden": 0.0, + "loss/logits": 0.20684538409113884, + "loss/reg": 0.38135936856269836, + "step": 3118 + }, + { + "epoch": 0.03119, + "grad_norm": 0.4020978808403015, + "grad_norm_var": 0.001217012963026139, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.821607768535614, + "loss/hidden": 0.0, + "loss/logits": 0.2017582207918167, + "loss/reg": 0.38070547580718994, + "step": 3119 + }, + { + "epoch": 0.0312, + "grad_norm": 0.4077533483505249, + "grad_norm_var": 0.0005422349881561576, + "learning_rate": 5e-05, + "loss": 0.2085, + "loss/crossentropy": 2.797456741333008, + "loss/hidden": 0.0, + "loss/logits": 0.20849959552288055, + "loss/reg": 0.38049301505088806, + "step": 3120 + }, + { + "epoch": 0.03121, + "grad_norm": 0.4301411807537079, + "grad_norm_var": 0.0004874060980804155, + "learning_rate": 5e-05, + "loss": 0.2118, + "loss/crossentropy": 2.7308985590934753, + "loss/hidden": 0.0, + "loss/logits": 0.21178072318434715, + "loss/reg": 0.38022667169570923, + "step": 3121 + }, + { + "epoch": 0.03122, + "grad_norm": 0.45849886536598206, + "grad_norm_var": 0.0005090544255199428, + "learning_rate": 5e-05, + "loss": 0.2097, + "loss/crossentropy": 2.7549915313720703, + "loss/hidden": 0.0, + "loss/logits": 0.20971110835671425, + "loss/reg": 0.3798936903476715, + "step": 3122 + }, + { + "epoch": 0.03123, + "grad_norm": 0.45116475224494934, + "grad_norm_var": 0.0004433242388356332, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.609461486339569, + "loss/hidden": 0.0, + "loss/logits": 0.20984326303005219, + "loss/reg": 0.37964266538619995, + "step": 3123 + }, + { + "epoch": 0.03124, + "grad_norm": 0.41967323422431946, + "grad_norm_var": 0.00046950853653940177, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.850436210632324, + "loss/hidden": 0.0, + "loss/logits": 0.19750242680311203, + "loss/reg": 0.37930527329444885, + "step": 3124 + }, + { + "epoch": 0.03125, + "grad_norm": 0.43050140142440796, + "grad_norm_var": 0.0003978603954797581, + "learning_rate": 5e-05, + "loss": 0.2014, + "loss/crossentropy": 2.775246560573578, + "loss/hidden": 0.0, + "loss/logits": 0.20141823962330818, + "loss/reg": 0.3792084753513336, + "step": 3125 + }, + { + "epoch": 0.03126, + "grad_norm": 0.4274786412715912, + "grad_norm_var": 0.0003428885880761438, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.9402623176574707, + "loss/hidden": 0.0, + "loss/logits": 0.20472651720046997, + "loss/reg": 0.378872275352478, + "step": 3126 + }, + { + "epoch": 0.03127, + "grad_norm": 0.4412992596626282, + "grad_norm_var": 0.0003293529985885465, + "learning_rate": 5e-05, + "loss": 0.2034, + "loss/crossentropy": 2.6967748403549194, + "loss/hidden": 0.0, + "loss/logits": 0.20338549837470055, + "loss/reg": 0.3787257969379425, + "step": 3127 + }, + { + "epoch": 0.03128, + "grad_norm": 0.41138821840286255, + "grad_norm_var": 0.00034990243123736995, + "learning_rate": 5e-05, + "loss": 0.2118, + "loss/crossentropy": 2.8564756512641907, + "loss/hidden": 0.0, + "loss/logits": 0.21183637902140617, + "loss/reg": 0.37852153182029724, + "step": 3128 + }, + { + "epoch": 0.03129, + "grad_norm": 0.440180242061615, + "grad_norm_var": 0.0002941007618558533, + "learning_rate": 5e-05, + "loss": 0.2183, + "loss/crossentropy": 2.898647129535675, + "loss/hidden": 0.0, + "loss/logits": 0.21831082925200462, + "loss/reg": 0.3778568506240845, + "step": 3129 + }, + { + "epoch": 0.0313, + "grad_norm": 0.4751511812210083, + "grad_norm_var": 0.00039789685087444996, + "learning_rate": 5e-05, + "loss": 0.234, + "loss/crossentropy": 2.8254252672195435, + "loss/hidden": 0.0, + "loss/logits": 0.2339526191353798, + "loss/reg": 0.3776779770851135, + "step": 3130 + }, + { + "epoch": 0.03131, + "grad_norm": 0.41846200823783875, + "grad_norm_var": 0.00037074059640584224, + "learning_rate": 5e-05, + "loss": 0.1924, + "loss/crossentropy": 2.8661490082740784, + "loss/hidden": 0.0, + "loss/logits": 0.1924164853990078, + "loss/reg": 0.3772597908973694, + "step": 3131 + }, + { + "epoch": 0.03132, + "grad_norm": 0.4470352828502655, + "grad_norm_var": 0.00038177632629355176, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.742294669151306, + "loss/hidden": 0.0, + "loss/logits": 0.20994874462485313, + "loss/reg": 0.3772224187850952, + "step": 3132 + }, + { + "epoch": 0.03133, + "grad_norm": 0.4869903326034546, + "grad_norm_var": 0.0005542912306107817, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.663846731185913, + "loss/hidden": 0.0, + "loss/logits": 0.20997771993279457, + "loss/reg": 0.37676265835762024, + "step": 3133 + }, + { + "epoch": 0.03134, + "grad_norm": 0.41779398918151855, + "grad_norm_var": 0.0005750218961174114, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.6771955490112305, + "loss/hidden": 0.0, + "loss/logits": 0.20918389782309532, + "loss/reg": 0.37648409605026245, + "step": 3134 + }, + { + "epoch": 0.03135, + "grad_norm": 0.4037352204322815, + "grad_norm_var": 0.0005679299823759624, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.7772940397262573, + "loss/hidden": 0.0, + "loss/logits": 0.19920722395181656, + "loss/reg": 0.37649333477020264, + "step": 3135 + }, + { + "epoch": 0.03136, + "grad_norm": 0.42099225521087646, + "grad_norm_var": 0.0005299892850654353, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.6453572511672974, + "loss/hidden": 0.0, + "loss/logits": 0.20384974777698517, + "loss/reg": 0.3760591149330139, + "step": 3136 + }, + { + "epoch": 0.03137, + "grad_norm": 0.432195246219635, + "grad_norm_var": 0.0005285716087569151, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.8915597200393677, + "loss/hidden": 0.0, + "loss/logits": 0.20110679417848587, + "loss/reg": 0.3759898841381073, + "step": 3137 + }, + { + "epoch": 0.03138, + "grad_norm": 0.7050199508666992, + "grad_norm_var": 0.005052952252721144, + "learning_rate": 5e-05, + "loss": 0.2103, + "loss/crossentropy": 2.8845019340515137, + "loss/hidden": 0.0, + "loss/logits": 0.21026690304279327, + "loss/reg": 0.3757486939430237, + "step": 3138 + }, + { + "epoch": 0.03139, + "grad_norm": 0.5058969855308533, + "grad_norm_var": 0.00523542339279525, + "learning_rate": 5e-05, + "loss": 0.2129, + "loss/crossentropy": 2.897139310836792, + "loss/hidden": 0.0, + "loss/logits": 0.21292290464043617, + "loss/reg": 0.37589675188064575, + "step": 3139 + }, + { + "epoch": 0.0314, + "grad_norm": 0.48990732431411743, + "grad_norm_var": 0.005210685760398562, + "learning_rate": 5e-05, + "loss": 0.2172, + "loss/crossentropy": 3.0301440954208374, + "loss/hidden": 0.0, + "loss/logits": 0.21722887456417084, + "loss/reg": 0.3763388395309448, + "step": 3140 + }, + { + "epoch": 0.03141, + "grad_norm": 0.44014158844947815, + "grad_norm_var": 0.005179057615102304, + "learning_rate": 5e-05, + "loss": 0.1978, + "loss/crossentropy": 2.8169822692871094, + "loss/hidden": 0.0, + "loss/logits": 0.19776873663067818, + "loss/reg": 0.37635737657546997, + "step": 3141 + }, + { + "epoch": 0.03142, + "grad_norm": 0.4349666237831116, + "grad_norm_var": 0.00514986386841176, + "learning_rate": 5e-05, + "loss": 0.2065, + "loss/crossentropy": 2.852429747581482, + "loss/hidden": 0.0, + "loss/logits": 0.20649616792798042, + "loss/reg": 0.3764333128929138, + "step": 3142 + }, + { + "epoch": 0.03143, + "grad_norm": 0.45099425315856934, + "grad_norm_var": 0.00513066332941288, + "learning_rate": 5e-05, + "loss": 0.216, + "loss/crossentropy": 2.7381709814071655, + "loss/hidden": 0.0, + "loss/logits": 0.21602829545736313, + "loss/reg": 0.3764631152153015, + "step": 3143 + }, + { + "epoch": 0.03144, + "grad_norm": 0.4647751450538635, + "grad_norm_var": 0.004953491135304555, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.788229525089264, + "loss/hidden": 0.0, + "loss/logits": 0.22205504402518272, + "loss/reg": 0.3766438066959381, + "step": 3144 + }, + { + "epoch": 0.03145, + "grad_norm": 0.4330652058124542, + "grad_norm_var": 0.004979859253785672, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.807882249355316, + "loss/hidden": 0.0, + "loss/logits": 0.20500856265425682, + "loss/reg": 0.3762151002883911, + "step": 3145 + }, + { + "epoch": 0.03146, + "grad_norm": 0.5392932891845703, + "grad_norm_var": 0.005330696334862909, + "learning_rate": 5e-05, + "loss": 0.2257, + "loss/crossentropy": 2.813581109046936, + "loss/hidden": 0.0, + "loss/logits": 0.22568966448307037, + "loss/reg": 0.3763831853866577, + "step": 3146 + }, + { + "epoch": 0.03147, + "grad_norm": 0.4334738552570343, + "grad_norm_var": 0.005245218413013678, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.8640422224998474, + "loss/hidden": 0.0, + "loss/logits": 0.19567247107625008, + "loss/reg": 0.3766272962093353, + "step": 3147 + }, + { + "epoch": 0.03148, + "grad_norm": 0.47255152463912964, + "grad_norm_var": 0.005210699146163326, + "learning_rate": 5e-05, + "loss": 0.2117, + "loss/crossentropy": 2.8773242235183716, + "loss/hidden": 0.0, + "loss/logits": 0.2117210105061531, + "loss/reg": 0.3764202296733856, + "step": 3148 + }, + { + "epoch": 0.03149, + "grad_norm": 0.44457170367240906, + "grad_norm_var": 0.005231232302390272, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.8501864671707153, + "loss/hidden": 0.0, + "loss/logits": 0.20415519922971725, + "loss/reg": 0.37661027908325195, + "step": 3149 + }, + { + "epoch": 0.0315, + "grad_norm": 0.4542436897754669, + "grad_norm_var": 0.005069851988259441, + "learning_rate": 5e-05, + "loss": 0.2041, + "loss/crossentropy": 2.8384602069854736, + "loss/hidden": 0.0, + "loss/logits": 0.20408564805984497, + "loss/reg": 0.3764829635620117, + "step": 3150 + }, + { + "epoch": 0.03151, + "grad_norm": 0.4649834930896759, + "grad_norm_var": 0.004760191802785248, + "learning_rate": 5e-05, + "loss": 0.1999, + "loss/crossentropy": 2.749688744544983, + "loss/hidden": 0.0, + "loss/logits": 0.19994951039552689, + "loss/reg": 0.37602946162223816, + "step": 3151 + }, + { + "epoch": 0.03152, + "grad_norm": 0.3961663544178009, + "grad_norm_var": 0.0049748097097448395, + "learning_rate": 5e-05, + "loss": 0.198, + "loss/crossentropy": 2.7284331917762756, + "loss/hidden": 0.0, + "loss/logits": 0.19804034382104874, + "loss/reg": 0.375903844833374, + "step": 3152 + }, + { + "epoch": 0.03153, + "grad_norm": 0.5178595185279846, + "grad_norm_var": 0.004971497178416965, + "learning_rate": 5e-05, + "loss": 0.2557, + "loss/crossentropy": 2.7699463963508606, + "loss/hidden": 0.0, + "loss/logits": 0.2556835897266865, + "loss/reg": 0.37545251846313477, + "step": 3153 + }, + { + "epoch": 0.03154, + "grad_norm": 0.4488166570663452, + "grad_norm_var": 0.0013187128671826032, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.8534839749336243, + "loss/hidden": 0.0, + "loss/logits": 0.20799975842237473, + "loss/reg": 0.375466912984848, + "step": 3154 + }, + { + "epoch": 0.03155, + "grad_norm": 0.4126938581466675, + "grad_norm_var": 0.0013159003840731468, + "learning_rate": 5e-05, + "loss": 0.198, + "loss/crossentropy": 2.8207828998565674, + "loss/hidden": 0.0, + "loss/logits": 0.19802149385213852, + "loss/reg": 0.3758436441421509, + "step": 3155 + }, + { + "epoch": 0.03156, + "grad_norm": 0.45187854766845703, + "grad_norm_var": 0.0012351534770335016, + "learning_rate": 5e-05, + "loss": 0.2097, + "loss/crossentropy": 2.8580872416496277, + "loss/hidden": 0.0, + "loss/logits": 0.20971157774329185, + "loss/reg": 0.37599754333496094, + "step": 3156 + }, + { + "epoch": 0.03157, + "grad_norm": 0.41906777024269104, + "grad_norm_var": 0.0013012310537776886, + "learning_rate": 5e-05, + "loss": 0.1977, + "loss/crossentropy": 2.7271682024002075, + "loss/hidden": 0.0, + "loss/logits": 0.19772380217909813, + "loss/reg": 0.37621283531188965, + "step": 3157 + }, + { + "epoch": 0.03158, + "grad_norm": 0.45668354630470276, + "grad_norm_var": 0.001280046451021851, + "learning_rate": 5e-05, + "loss": 0.2188, + "loss/crossentropy": 2.680020749568939, + "loss/hidden": 0.0, + "loss/logits": 0.21880685165524483, + "loss/reg": 0.3763059377670288, + "step": 3158 + }, + { + "epoch": 0.03159, + "grad_norm": 0.478261262178421, + "grad_norm_var": 0.0013162416345416817, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.6823845505714417, + "loss/hidden": 0.0, + "loss/logits": 0.21134750917553902, + "loss/reg": 0.3760102391242981, + "step": 3159 + }, + { + "epoch": 0.0316, + "grad_norm": 0.4267452359199524, + "grad_norm_var": 0.0013597248484158997, + "learning_rate": 5e-05, + "loss": 0.2125, + "loss/crossentropy": 2.9382660388946533, + "loss/hidden": 0.0, + "loss/logits": 0.21247807145118713, + "loss/reg": 0.37550970911979675, + "step": 3160 + }, + { + "epoch": 0.03161, + "grad_norm": 0.4203246831893921, + "grad_norm_var": 0.0014039839523270223, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.861503005027771, + "loss/hidden": 0.0, + "loss/logits": 0.20771582797169685, + "loss/reg": 0.37527230381965637, + "step": 3161 + }, + { + "epoch": 0.03162, + "grad_norm": 0.42335766553878784, + "grad_norm_var": 0.0009000886420167683, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.877030313014984, + "loss/hidden": 0.0, + "loss/logits": 0.19846957921981812, + "loss/reg": 0.3753155767917633, + "step": 3162 + }, + { + "epoch": 0.03163, + "grad_norm": 0.46654513478279114, + "grad_norm_var": 0.0009171580839026767, + "learning_rate": 5e-05, + "loss": 0.2255, + "loss/crossentropy": 2.8887664079666138, + "loss/hidden": 0.0, + "loss/logits": 0.22549311816692352, + "loss/reg": 0.3750663101673126, + "step": 3163 + }, + { + "epoch": 0.03164, + "grad_norm": 0.4282910227775574, + "grad_norm_var": 0.0008898198527471942, + "learning_rate": 5e-05, + "loss": 0.199, + "loss/crossentropy": 2.8998924493789673, + "loss/hidden": 0.0, + "loss/logits": 0.19902435690164566, + "loss/reg": 0.37455540895462036, + "step": 3164 + }, + { + "epoch": 0.03165, + "grad_norm": 0.47788357734680176, + "grad_norm_var": 0.0009599125231827229, + "learning_rate": 5e-05, + "loss": 0.2233, + "loss/crossentropy": 2.8205342292785645, + "loss/hidden": 0.0, + "loss/logits": 0.22328586876392365, + "loss/reg": 0.37428417801856995, + "step": 3165 + }, + { + "epoch": 0.03166, + "grad_norm": 0.4424133598804474, + "grad_norm_var": 0.0009564255782810135, + "learning_rate": 5e-05, + "loss": 0.202, + "loss/crossentropy": 2.8172916173934937, + "loss/hidden": 0.0, + "loss/logits": 0.20195399224758148, + "loss/reg": 0.37391921877861023, + "step": 3166 + }, + { + "epoch": 0.03167, + "grad_norm": 0.47984713315963745, + "grad_norm_var": 0.0010083543682911273, + "learning_rate": 5e-05, + "loss": 0.2356, + "loss/crossentropy": 2.7736324667930603, + "loss/hidden": 0.0, + "loss/logits": 0.23563861101865768, + "loss/reg": 0.3736342489719391, + "step": 3167 + }, + { + "epoch": 0.03168, + "grad_norm": 0.4563756585121155, + "grad_norm_var": 0.0008294304416526269, + "learning_rate": 5e-05, + "loss": 0.2195, + "loss/crossentropy": 2.7438533306121826, + "loss/hidden": 0.0, + "loss/logits": 0.219478290528059, + "loss/reg": 0.3736617863178253, + "step": 3168 + }, + { + "epoch": 0.03169, + "grad_norm": 0.515541136264801, + "grad_norm_var": 0.0008089259000210884, + "learning_rate": 5e-05, + "loss": 0.2161, + "loss/crossentropy": 2.761348009109497, + "loss/hidden": 0.0, + "loss/logits": 0.21606112644076347, + "loss/reg": 0.373412549495697, + "step": 3169 + }, + { + "epoch": 0.0317, + "grad_norm": 0.46756377816200256, + "grad_norm_var": 0.0008271955432374837, + "learning_rate": 5e-05, + "loss": 0.1947, + "loss/crossentropy": 3.1867759227752686, + "loss/hidden": 0.0, + "loss/logits": 0.19467567279934883, + "loss/reg": 0.37338200211524963, + "step": 3170 + }, + { + "epoch": 0.03171, + "grad_norm": 0.4456403851509094, + "grad_norm_var": 0.0007247118876652081, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.798861801624298, + "loss/hidden": 0.0, + "loss/logits": 0.21139798685908318, + "loss/reg": 0.37328818440437317, + "step": 3171 + }, + { + "epoch": 0.03172, + "grad_norm": 0.4351838529109955, + "grad_norm_var": 0.0007457991437421208, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.8185710310935974, + "loss/hidden": 0.0, + "loss/logits": 0.20778124779462814, + "loss/reg": 0.3730847239494324, + "step": 3172 + }, + { + "epoch": 0.03173, + "grad_norm": 0.44151589274406433, + "grad_norm_var": 0.0006772799987032508, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.856807768344879, + "loss/hidden": 0.0, + "loss/logits": 0.2063431330025196, + "loss/reg": 0.3729942739009857, + "step": 3173 + }, + { + "epoch": 0.03174, + "grad_norm": 0.4401620924472809, + "grad_norm_var": 0.0006881769284442544, + "learning_rate": 5e-05, + "loss": 0.1835, + "loss/crossentropy": 2.9926698207855225, + "loss/hidden": 0.0, + "loss/logits": 0.18354609608650208, + "loss/reg": 0.37280207872390747, + "step": 3174 + }, + { + "epoch": 0.03175, + "grad_norm": 0.5176730751991272, + "grad_norm_var": 0.0009187744353656621, + "learning_rate": 5e-05, + "loss": 0.2273, + "loss/crossentropy": 2.8195388317108154, + "loss/hidden": 0.0, + "loss/logits": 0.22732477262616158, + "loss/reg": 0.3726043701171875, + "step": 3175 + }, + { + "epoch": 0.03176, + "grad_norm": 0.43104103207588196, + "grad_norm_var": 0.0009035629696960515, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.860792875289917, + "loss/hidden": 0.0, + "loss/logits": 0.20637111365795135, + "loss/reg": 0.37231892347335815, + "step": 3176 + }, + { + "epoch": 0.03177, + "grad_norm": 0.42835095524787903, + "grad_norm_var": 0.0008698548002522788, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.8032827377319336, + "loss/hidden": 0.0, + "loss/logits": 0.21190067753195763, + "loss/reg": 0.3719193637371063, + "step": 3177 + }, + { + "epoch": 0.03178, + "grad_norm": 0.4543002247810364, + "grad_norm_var": 0.0007946659518183778, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.9426137804985046, + "loss/hidden": 0.0, + "loss/logits": 0.20332912355661392, + "loss/reg": 0.37155595421791077, + "step": 3178 + }, + { + "epoch": 0.03179, + "grad_norm": 0.6794347763061523, + "grad_norm_var": 0.0038692645584429307, + "learning_rate": 5e-05, + "loss": 0.2479, + "loss/crossentropy": 2.8206589818000793, + "loss/hidden": 0.0, + "loss/logits": 0.2478645220398903, + "loss/reg": 0.3715088367462158, + "step": 3179 + }, + { + "epoch": 0.0318, + "grad_norm": 0.4545481503009796, + "grad_norm_var": 0.003761690609740498, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.7489424347877502, + "loss/hidden": 0.0, + "loss/logits": 0.20709169656038284, + "loss/reg": 0.37143614888191223, + "step": 3180 + }, + { + "epoch": 0.03181, + "grad_norm": 0.4211520254611969, + "grad_norm_var": 0.0039256563689113975, + "learning_rate": 5e-05, + "loss": 0.2181, + "loss/crossentropy": 2.820417821407318, + "loss/hidden": 0.0, + "loss/logits": 0.21814584359526634, + "loss/reg": 0.37149256467819214, + "step": 3181 + }, + { + "epoch": 0.03182, + "grad_norm": 0.5502594113349915, + "grad_norm_var": 0.004264217132588621, + "learning_rate": 5e-05, + "loss": 0.2237, + "loss/crossentropy": 2.824505031108856, + "loss/hidden": 0.0, + "loss/logits": 0.2236938178539276, + "loss/reg": 0.37125542759895325, + "step": 3182 + }, + { + "epoch": 0.03183, + "grad_norm": 0.4553758203983307, + "grad_norm_var": 0.004289620454713339, + "learning_rate": 5e-05, + "loss": 0.2131, + "loss/crossentropy": 2.817155599594116, + "loss/hidden": 0.0, + "loss/logits": 0.21313072368502617, + "loss/reg": 0.37125083804130554, + "step": 3183 + }, + { + "epoch": 0.03184, + "grad_norm": 0.42795270681381226, + "grad_norm_var": 0.004409299998931652, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.8978899121284485, + "loss/hidden": 0.0, + "loss/logits": 0.20072172582149506, + "loss/reg": 0.3714345097541809, + "step": 3184 + }, + { + "epoch": 0.03185, + "grad_norm": 0.46656161546707153, + "grad_norm_var": 0.004280477141425243, + "learning_rate": 5e-05, + "loss": 0.2109, + "loss/crossentropy": 2.792574405670166, + "loss/hidden": 0.0, + "loss/logits": 0.21089863404631615, + "loss/reg": 0.3713243305683136, + "step": 3185 + }, + { + "epoch": 0.03186, + "grad_norm": 0.47125568985939026, + "grad_norm_var": 0.004280230829183825, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.7252144813537598, + "loss/hidden": 0.0, + "loss/logits": 0.2099367491900921, + "loss/reg": 0.37113699316978455, + "step": 3186 + }, + { + "epoch": 0.03187, + "grad_norm": 0.5728321671485901, + "grad_norm_var": 0.004877795650008916, + "learning_rate": 5e-05, + "loss": 0.2296, + "loss/crossentropy": 2.795195698738098, + "loss/hidden": 0.0, + "loss/logits": 0.22955923154950142, + "loss/reg": 0.3710094690322876, + "step": 3187 + }, + { + "epoch": 0.03188, + "grad_norm": 0.45578908920288086, + "grad_norm_var": 0.004786768830075265, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.784304141998291, + "loss/hidden": 0.0, + "loss/logits": 0.21078245341777802, + "loss/reg": 0.37102121114730835, + "step": 3188 + }, + { + "epoch": 0.03189, + "grad_norm": 0.48545441031455994, + "grad_norm_var": 0.004686291854147943, + "learning_rate": 5e-05, + "loss": 0.2305, + "loss/crossentropy": 2.7350648045539856, + "loss/hidden": 0.0, + "loss/logits": 0.23047351837158203, + "loss/reg": 0.3709156811237335, + "step": 3189 + }, + { + "epoch": 0.0319, + "grad_norm": 0.4348323345184326, + "grad_norm_var": 0.004717805067114933, + "learning_rate": 5e-05, + "loss": 0.2208, + "loss/crossentropy": 2.9618518948554993, + "loss/hidden": 0.0, + "loss/logits": 0.22084569558501244, + "loss/reg": 0.37058982253074646, + "step": 3190 + }, + { + "epoch": 0.03191, + "grad_norm": 0.42109641432762146, + "grad_norm_var": 0.004837213446789083, + "learning_rate": 5e-05, + "loss": 0.2152, + "loss/crossentropy": 2.89162278175354, + "loss/hidden": 0.0, + "loss/logits": 0.21516217663884163, + "loss/reg": 0.3705490827560425, + "step": 3191 + }, + { + "epoch": 0.03192, + "grad_norm": 0.41352981328964233, + "grad_norm_var": 0.004960509128550295, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.800557851791382, + "loss/hidden": 0.0, + "loss/logits": 0.17930593341588974, + "loss/reg": 0.37007200717926025, + "step": 3192 + }, + { + "epoch": 0.03193, + "grad_norm": 0.4911285638809204, + "grad_norm_var": 0.004820160233853565, + "learning_rate": 5e-05, + "loss": 0.2136, + "loss/crossentropy": 2.747347116470337, + "loss/hidden": 0.0, + "loss/logits": 0.2135833129286766, + "loss/reg": 0.36984696984291077, + "step": 3193 + }, + { + "epoch": 0.03194, + "grad_norm": 0.9794400334358215, + "grad_norm_var": 0.02036363754155688, + "learning_rate": 5e-05, + "loss": 0.253, + "loss/crossentropy": 2.895307421684265, + "loss/hidden": 0.0, + "loss/logits": 0.25296223536133766, + "loss/reg": 0.3695753812789917, + "step": 3194 + }, + { + "epoch": 0.03195, + "grad_norm": 0.5106605887413025, + "grad_norm_var": 0.018360137455953485, + "learning_rate": 5e-05, + "loss": 0.216, + "loss/crossentropy": 2.823952853679657, + "loss/hidden": 0.0, + "loss/logits": 0.2160152643918991, + "loss/reg": 0.369642972946167, + "step": 3195 + }, + { + "epoch": 0.03196, + "grad_norm": 0.48798590898513794, + "grad_norm_var": 0.01822406939643827, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.7356008887290955, + "loss/hidden": 0.0, + "loss/logits": 0.19949671998620033, + "loss/reg": 0.36937016248703003, + "step": 3196 + }, + { + "epoch": 0.03197, + "grad_norm": 0.5175269842147827, + "grad_norm_var": 0.017754994807338362, + "learning_rate": 5e-05, + "loss": 0.2826, + "loss/crossentropy": 2.661824345588684, + "loss/hidden": 0.0, + "loss/logits": 0.2826337702572346, + "loss/reg": 0.3692253828048706, + "step": 3197 + }, + { + "epoch": 0.03198, + "grad_norm": 0.4901745915412903, + "grad_norm_var": 0.01764892863667426, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.850338876247406, + "loss/hidden": 0.0, + "loss/logits": 0.20458348840475082, + "loss/reg": 0.3690522611141205, + "step": 3198 + }, + { + "epoch": 0.03199, + "grad_norm": 0.44657716155052185, + "grad_norm_var": 0.01771210106761966, + "learning_rate": 5e-05, + "loss": 0.1913, + "loss/crossentropy": 2.746738076210022, + "loss/hidden": 0.0, + "loss/logits": 0.19134128838777542, + "loss/reg": 0.3688264787197113, + "step": 3199 + }, + { + "epoch": 0.032, + "grad_norm": 0.5486326217651367, + "grad_norm_var": 0.017389829978657605, + "learning_rate": 5e-05, + "loss": 0.2209, + "loss/crossentropy": 2.822535216808319, + "loss/hidden": 0.0, + "loss/logits": 0.22091369703412056, + "loss/reg": 0.368611216545105, + "step": 3200 + }, + { + "epoch": 0.03201, + "grad_norm": 0.510017454624176, + "grad_norm_var": 0.01724404529015633, + "learning_rate": 5e-05, + "loss": 0.2056, + "loss/crossentropy": 2.7396174669265747, + "loss/hidden": 0.0, + "loss/logits": 0.2056000977754593, + "loss/reg": 0.36843183636665344, + "step": 3201 + }, + { + "epoch": 0.03202, + "grad_norm": 0.4709870219230652, + "grad_norm_var": 0.017245609962512065, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.782393515110016, + "loss/hidden": 0.0, + "loss/logits": 0.20898321270942688, + "loss/reg": 0.3684435188770294, + "step": 3202 + }, + { + "epoch": 0.03203, + "grad_norm": 0.444202721118927, + "grad_norm_var": 0.017284275256138636, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.7235705256462097, + "loss/hidden": 0.0, + "loss/logits": 0.20715422555804253, + "loss/reg": 0.36817440390586853, + "step": 3203 + }, + { + "epoch": 0.03204, + "grad_norm": 0.5809649229049683, + "grad_norm_var": 0.01741300657021979, + "learning_rate": 5e-05, + "loss": 0.214, + "loss/crossentropy": 2.8450931310653687, + "loss/hidden": 0.0, + "loss/logits": 0.21396317332983017, + "loss/reg": 0.3676774501800537, + "step": 3204 + }, + { + "epoch": 0.03205, + "grad_norm": 0.5148164629936218, + "grad_norm_var": 0.017352881506633187, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 2.7670685052871704, + "loss/hidden": 0.0, + "loss/logits": 0.21703018248081207, + "loss/reg": 0.3674672842025757, + "step": 3205 + }, + { + "epoch": 0.03206, + "grad_norm": 0.4270023703575134, + "grad_norm_var": 0.017441880858129942, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.8668400645256042, + "loss/hidden": 0.0, + "loss/logits": 0.19953147694468498, + "loss/reg": 0.367195725440979, + "step": 3206 + }, + { + "epoch": 0.03207, + "grad_norm": 0.45148077607154846, + "grad_norm_var": 0.017115421579565825, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.8125604391098022, + "loss/hidden": 0.0, + "loss/logits": 0.20638568699359894, + "loss/reg": 0.36662063002586365, + "step": 3207 + }, + { + "epoch": 0.03208, + "grad_norm": 0.42709144949913025, + "grad_norm_var": 0.01693833613964178, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.7250061631202698, + "loss/hidden": 0.0, + "loss/logits": 0.2048729620873928, + "loss/reg": 0.3663308620452881, + "step": 3208 + }, + { + "epoch": 0.03209, + "grad_norm": 0.47204315662384033, + "grad_norm_var": 0.017031182378122973, + "learning_rate": 5e-05, + "loss": 0.2387, + "loss/crossentropy": 2.8468820452690125, + "loss/hidden": 0.0, + "loss/logits": 0.2386760339140892, + "loss/reg": 0.3661031126976013, + "step": 3209 + }, + { + "epoch": 0.0321, + "grad_norm": 0.5870795845985413, + "grad_norm_var": 0.002485291927999648, + "learning_rate": 5e-05, + "loss": 0.24, + "loss/crossentropy": 2.7169207334518433, + "loss/hidden": 0.0, + "loss/logits": 0.2399534210562706, + "loss/reg": 0.36584195494651794, + "step": 3210 + }, + { + "epoch": 0.03211, + "grad_norm": 0.45873352885246277, + "grad_norm_var": 0.0025312159198082273, + "learning_rate": 5e-05, + "loss": 0.2263, + "loss/crossentropy": 2.707998275756836, + "loss/hidden": 0.0, + "loss/logits": 0.22634555026888847, + "loss/reg": 0.3658137917518616, + "step": 3211 + }, + { + "epoch": 0.03212, + "grad_norm": 0.44665300846099854, + "grad_norm_var": 0.0026474781108541735, + "learning_rate": 5e-05, + "loss": 0.1921, + "loss/crossentropy": 2.9598397612571716, + "loss/hidden": 0.0, + "loss/logits": 0.1920909471809864, + "loss/reg": 0.3658459782600403, + "step": 3212 + }, + { + "epoch": 0.03213, + "grad_norm": 0.46003440022468567, + "grad_norm_var": 0.0026210058659565604, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.8616157174110413, + "loss/hidden": 0.0, + "loss/logits": 0.20945832133293152, + "loss/reg": 0.3654417097568512, + "step": 3213 + }, + { + "epoch": 0.03214, + "grad_norm": 0.430207222700119, + "grad_norm_var": 0.0027926389894147824, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.8163504600524902, + "loss/hidden": 0.0, + "loss/logits": 0.19405406340956688, + "loss/reg": 0.36562058329582214, + "step": 3214 + }, + { + "epoch": 0.03215, + "grad_norm": 0.48328617215156555, + "grad_norm_var": 0.002714335090688576, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.867554783821106, + "loss/hidden": 0.0, + "loss/logits": 0.20765169709920883, + "loss/reg": 0.3656025826931, + "step": 3215 + }, + { + "epoch": 0.03216, + "grad_norm": 0.4911864995956421, + "grad_norm_var": 0.0024108074184924296, + "learning_rate": 5e-05, + "loss": 0.2274, + "loss/crossentropy": 2.8728580474853516, + "loss/hidden": 0.0, + "loss/logits": 0.22738640755414963, + "loss/reg": 0.36527106165885925, + "step": 3216 + }, + { + "epoch": 0.03217, + "grad_norm": 0.46762752532958984, + "grad_norm_var": 0.0023449023642010845, + "learning_rate": 5e-05, + "loss": 0.2177, + "loss/crossentropy": 3.001012682914734, + "loss/hidden": 0.0, + "loss/logits": 0.21773692965507507, + "loss/reg": 0.36481958627700806, + "step": 3217 + }, + { + "epoch": 0.03218, + "grad_norm": 0.5026620030403137, + "grad_norm_var": 0.002387124555954741, + "learning_rate": 5e-05, + "loss": 0.2331, + "loss/crossentropy": 2.646042287349701, + "loss/hidden": 0.0, + "loss/logits": 0.23311198875308037, + "loss/reg": 0.36434870958328247, + "step": 3218 + }, + { + "epoch": 0.03219, + "grad_norm": 0.473395973443985, + "grad_norm_var": 0.0023095486273285244, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 2.839150547981262, + "loss/hidden": 0.0, + "loss/logits": 0.2079375497996807, + "loss/reg": 0.3639160990715027, + "step": 3219 + }, + { + "epoch": 0.0322, + "grad_norm": 0.4067559242248535, + "grad_norm_var": 0.0018528216733045574, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.8410377502441406, + "loss/hidden": 0.0, + "loss/logits": 0.19873205572366714, + "loss/reg": 0.363482803106308, + "step": 3220 + }, + { + "epoch": 0.03221, + "grad_norm": 0.43369486927986145, + "grad_norm_var": 0.0017658894771473755, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.936634659767151, + "loss/hidden": 0.0, + "loss/logits": 0.20736025273799896, + "loss/reg": 0.363351047039032, + "step": 3221 + }, + { + "epoch": 0.03222, + "grad_norm": 0.44040152430534363, + "grad_norm_var": 0.0017115779177718005, + "learning_rate": 5e-05, + "loss": 0.2214, + "loss/crossentropy": 2.7779428362846375, + "loss/hidden": 0.0, + "loss/logits": 0.22137216106057167, + "loss/reg": 0.3631560802459717, + "step": 3222 + }, + { + "epoch": 0.03223, + "grad_norm": 0.4380062222480774, + "grad_norm_var": 0.0017463535352859902, + "learning_rate": 5e-05, + "loss": 0.212, + "loss/crossentropy": 2.899549722671509, + "loss/hidden": 0.0, + "loss/logits": 0.21199552342295647, + "loss/reg": 0.36278215050697327, + "step": 3223 + }, + { + "epoch": 0.03224, + "grad_norm": 0.4314251244068146, + "grad_norm_var": 0.0017263863697228505, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.748305380344391, + "loss/hidden": 0.0, + "loss/logits": 0.20014015957713127, + "loss/reg": 0.36207252740859985, + "step": 3224 + }, + { + "epoch": 0.03225, + "grad_norm": 0.4427112936973572, + "grad_norm_var": 0.0017485053333430609, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.775816023349762, + "loss/hidden": 0.0, + "loss/logits": 0.22214603424072266, + "loss/reg": 0.3619568943977356, + "step": 3225 + }, + { + "epoch": 0.03226, + "grad_norm": 0.4993099272251129, + "grad_norm_var": 0.0007675769800928619, + "learning_rate": 5e-05, + "loss": 0.2285, + "loss/crossentropy": 2.772304594516754, + "loss/hidden": 0.0, + "loss/logits": 0.228526521474123, + "loss/reg": 0.3616727292537689, + "step": 3226 + }, + { + "epoch": 0.03227, + "grad_norm": 0.4595354497432709, + "grad_norm_var": 0.0007678420126025487, + "learning_rate": 5e-05, + "loss": 0.2267, + "loss/crossentropy": 2.842741310596466, + "loss/hidden": 0.0, + "loss/logits": 0.22674810141324997, + "loss/reg": 0.36108747124671936, + "step": 3227 + }, + { + "epoch": 0.03228, + "grad_norm": 0.4454059600830078, + "grad_norm_var": 0.0007696065638966964, + "learning_rate": 5e-05, + "loss": 0.2169, + "loss/crossentropy": 2.825203537940979, + "loss/hidden": 0.0, + "loss/logits": 0.2169191762804985, + "loss/reg": 0.36091694235801697, + "step": 3228 + }, + { + "epoch": 0.03229, + "grad_norm": 0.45315125584602356, + "grad_norm_var": 0.0007694183827987329, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.8501468300819397, + "loss/hidden": 0.0, + "loss/logits": 0.2072094865143299, + "loss/reg": 0.36083343625068665, + "step": 3229 + }, + { + "epoch": 0.0323, + "grad_norm": 0.46631288528442383, + "grad_norm_var": 0.000725894536610013, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.6978471279144287, + "loss/hidden": 0.0, + "loss/logits": 0.21134812757372856, + "loss/reg": 0.36062148213386536, + "step": 3230 + }, + { + "epoch": 0.03231, + "grad_norm": 0.44928887486457825, + "grad_norm_var": 0.0006854574670588291, + "learning_rate": 5e-05, + "loss": 0.2172, + "loss/crossentropy": 2.811010718345642, + "loss/hidden": 0.0, + "loss/logits": 0.21721578761935234, + "loss/reg": 0.36046040058135986, + "step": 3231 + }, + { + "epoch": 0.03232, + "grad_norm": 0.4217364192008972, + "grad_norm_var": 0.0006639064832472268, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.802652955055237, + "loss/hidden": 0.0, + "loss/logits": 0.2081868089735508, + "loss/reg": 0.36017006635665894, + "step": 3232 + }, + { + "epoch": 0.03233, + "grad_norm": 0.4804527461528778, + "grad_norm_var": 0.000700972261627734, + "learning_rate": 5e-05, + "loss": 0.2321, + "loss/crossentropy": 2.8499671816825867, + "loss/hidden": 0.0, + "loss/logits": 0.23213455826044083, + "loss/reg": 0.36020952463150024, + "step": 3233 + }, + { + "epoch": 0.03234, + "grad_norm": 0.4153884947299957, + "grad_norm_var": 0.0005963936651609547, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.759731113910675, + "loss/hidden": 0.0, + "loss/logits": 0.19919241592288017, + "loss/reg": 0.36013203859329224, + "step": 3234 + }, + { + "epoch": 0.03235, + "grad_norm": 0.524064838886261, + "grad_norm_var": 0.000933079460142372, + "learning_rate": 5e-05, + "loss": 0.2568, + "loss/crossentropy": 2.8707882165908813, + "loss/hidden": 0.0, + "loss/logits": 0.25680655241012573, + "loss/reg": 0.3598106801509857, + "step": 3235 + }, + { + "epoch": 0.03236, + "grad_norm": 0.470485657453537, + "grad_norm_var": 0.0008154056818130417, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.732699394226074, + "loss/hidden": 0.0, + "loss/logits": 0.21995601058006287, + "loss/reg": 0.3595196008682251, + "step": 3236 + }, + { + "epoch": 0.03237, + "grad_norm": 0.4322890043258667, + "grad_norm_var": 0.0008194217415063876, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.7646228075027466, + "loss/hidden": 0.0, + "loss/logits": 0.21015330404043198, + "loss/reg": 0.3590663969516754, + "step": 3237 + }, + { + "epoch": 0.03238, + "grad_norm": 1.1920406818389893, + "grad_norm_var": 0.03472932413291745, + "learning_rate": 5e-05, + "loss": 0.3281, + "loss/crossentropy": 2.9872185587882996, + "loss/hidden": 0.0, + "loss/logits": 0.32812733575701714, + "loss/reg": 0.3588946759700775, + "step": 3238 + }, + { + "epoch": 0.03239, + "grad_norm": 0.4736666977405548, + "grad_norm_var": 0.03450761947577619, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.906652510166168, + "loss/hidden": 0.0, + "loss/logits": 0.20802787318825722, + "loss/reg": 0.35879722237586975, + "step": 3239 + }, + { + "epoch": 0.0324, + "grad_norm": 0.48356953263282776, + "grad_norm_var": 0.034175902710181706, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.816983461380005, + "loss/hidden": 0.0, + "loss/logits": 0.2063377946615219, + "loss/reg": 0.35854190587997437, + "step": 3240 + }, + { + "epoch": 0.03241, + "grad_norm": 0.4673386216163635, + "grad_norm_var": 0.03400323968067985, + "learning_rate": 5e-05, + "loss": 0.197, + "loss/crossentropy": 2.79868483543396, + "loss/hidden": 0.0, + "loss/logits": 0.1970490962266922, + "loss/reg": 0.35839852690696716, + "step": 3241 + }, + { + "epoch": 0.03242, + "grad_norm": 0.47684213519096375, + "grad_norm_var": 0.03406195301141392, + "learning_rate": 5e-05, + "loss": 0.2054, + "loss/crossentropy": 2.9273493885993958, + "loss/hidden": 0.0, + "loss/logits": 0.20541013777256012, + "loss/reg": 0.3583439588546753, + "step": 3242 + }, + { + "epoch": 0.03243, + "grad_norm": 0.49290409684181213, + "grad_norm_var": 0.03392048740884533, + "learning_rate": 5e-05, + "loss": 0.2107, + "loss/crossentropy": 2.699168026447296, + "loss/hidden": 0.0, + "loss/logits": 0.2106897048652172, + "loss/reg": 0.35806161165237427, + "step": 3243 + }, + { + "epoch": 0.03244, + "grad_norm": 0.4355164170265198, + "grad_norm_var": 0.0340105328615732, + "learning_rate": 5e-05, + "loss": 0.1977, + "loss/crossentropy": 2.827685534954071, + "loss/hidden": 0.0, + "loss/logits": 0.19768108427524567, + "loss/reg": 0.3578363060951233, + "step": 3244 + }, + { + "epoch": 0.03245, + "grad_norm": 0.43994027376174927, + "grad_norm_var": 0.03411883105767212, + "learning_rate": 5e-05, + "loss": 0.2014, + "loss/crossentropy": 2.8871012926101685, + "loss/hidden": 0.0, + "loss/logits": 0.20140090957283974, + "loss/reg": 0.3575197160243988, + "step": 3245 + }, + { + "epoch": 0.03246, + "grad_norm": 0.48074615001678467, + "grad_norm_var": 0.03405236807134428, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.7434681057929993, + "loss/hidden": 0.0, + "loss/logits": 0.20465601980686188, + "loss/reg": 0.3573475182056427, + "step": 3246 + }, + { + "epoch": 0.03247, + "grad_norm": 0.4465608596801758, + "grad_norm_var": 0.034074376532951596, + "learning_rate": 5e-05, + "loss": 0.2091, + "loss/crossentropy": 2.791071355342865, + "loss/hidden": 0.0, + "loss/logits": 0.2090749628841877, + "loss/reg": 0.3571856617927551, + "step": 3247 + }, + { + "epoch": 0.03248, + "grad_norm": 0.4418194591999054, + "grad_norm_var": 0.03386766563142663, + "learning_rate": 5e-05, + "loss": 0.2021, + "loss/crossentropy": 2.849919378757477, + "loss/hidden": 0.0, + "loss/logits": 0.202085729688406, + "loss/reg": 0.35694095492362976, + "step": 3248 + }, + { + "epoch": 0.03249, + "grad_norm": 0.43605127930641174, + "grad_norm_var": 0.0341634507130791, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.863614559173584, + "loss/hidden": 0.0, + "loss/logits": 0.20638347789645195, + "loss/reg": 0.3569033443927765, + "step": 3249 + }, + { + "epoch": 0.0325, + "grad_norm": 0.4347374141216278, + "grad_norm_var": 0.033950952594285495, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.6511946320533752, + "loss/hidden": 0.0, + "loss/logits": 0.2086954265832901, + "loss/reg": 0.35669848322868347, + "step": 3250 + }, + { + "epoch": 0.03251, + "grad_norm": 0.41705432534217834, + "grad_norm_var": 0.03443795258050842, + "learning_rate": 5e-05, + "loss": 0.2035, + "loss/crossentropy": 2.8809354305267334, + "loss/hidden": 0.0, + "loss/logits": 0.20349853858351707, + "loss/reg": 0.35634830594062805, + "step": 3251 + }, + { + "epoch": 0.03252, + "grad_norm": 0.4902518391609192, + "grad_norm_var": 0.03438103491742949, + "learning_rate": 5e-05, + "loss": 0.2464, + "loss/crossentropy": 2.8293603658676147, + "loss/hidden": 0.0, + "loss/logits": 0.2464492842555046, + "loss/reg": 0.35592418909072876, + "step": 3252 + }, + { + "epoch": 0.03253, + "grad_norm": 0.4696522355079651, + "grad_norm_var": 0.034118097254266865, + "learning_rate": 5e-05, + "loss": 0.2175, + "loss/crossentropy": 2.7406665086746216, + "loss/hidden": 0.0, + "loss/logits": 0.2175241783261299, + "loss/reg": 0.3555231988430023, + "step": 3253 + }, + { + "epoch": 0.03254, + "grad_norm": 0.4167405068874359, + "grad_norm_var": 0.0006560927412042876, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 3.050465166568756, + "loss/hidden": 0.0, + "loss/logits": 0.1987169198691845, + "loss/reg": 0.35530954599380493, + "step": 3254 + }, + { + "epoch": 0.03255, + "grad_norm": 0.43837636709213257, + "grad_norm_var": 0.0006529760639038873, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.5662304162979126, + "loss/hidden": 0.0, + "loss/logits": 0.20397039130330086, + "loss/reg": 0.3549506664276123, + "step": 3255 + }, + { + "epoch": 0.03256, + "grad_norm": 0.4348146617412567, + "grad_norm_var": 0.0006109861438170939, + "learning_rate": 5e-05, + "loss": 0.2175, + "loss/crossentropy": 2.8748361468315125, + "loss/hidden": 0.0, + "loss/logits": 0.2174939103424549, + "loss/reg": 0.3546925485134125, + "step": 3256 + }, + { + "epoch": 0.03257, + "grad_norm": 0.5853509902954102, + "grad_norm_var": 0.001735215306382054, + "learning_rate": 5e-05, + "loss": 0.2446, + "loss/crossentropy": 2.740552306175232, + "loss/hidden": 0.0, + "loss/logits": 0.24455486983060837, + "loss/reg": 0.354370653629303, + "step": 3257 + }, + { + "epoch": 0.03258, + "grad_norm": 0.43605515360832214, + "grad_norm_var": 0.0017399014620474032, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.721983253955841, + "loss/hidden": 0.0, + "loss/logits": 0.22071374207735062, + "loss/reg": 0.354070246219635, + "step": 3258 + }, + { + "epoch": 0.03259, + "grad_norm": 0.4744669497013092, + "grad_norm_var": 0.0016705140398547127, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.868718147277832, + "loss/hidden": 0.0, + "loss/logits": 0.20741213858127594, + "loss/reg": 0.3541673421859741, + "step": 3259 + }, + { + "epoch": 0.0326, + "grad_norm": 0.46306195855140686, + "grad_norm_var": 0.0016468063615678022, + "learning_rate": 5e-05, + "loss": 0.2111, + "loss/crossentropy": 2.8705337047576904, + "loss/hidden": 0.0, + "loss/logits": 0.21107375994324684, + "loss/reg": 0.35370275378227234, + "step": 3260 + }, + { + "epoch": 0.03261, + "grad_norm": 0.46187031269073486, + "grad_norm_var": 0.0016281364510309815, + "learning_rate": 5e-05, + "loss": 0.2321, + "loss/crossentropy": 2.737088441848755, + "loss/hidden": 0.0, + "loss/logits": 0.2320832870900631, + "loss/reg": 0.3533537685871124, + "step": 3261 + }, + { + "epoch": 0.03262, + "grad_norm": 0.42402103543281555, + "grad_norm_var": 0.0016570239151233678, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.8911256194114685, + "loss/hidden": 0.0, + "loss/logits": 0.20377064496278763, + "loss/reg": 0.35343238711357117, + "step": 3262 + }, + { + "epoch": 0.03263, + "grad_norm": 0.4065952003002167, + "grad_norm_var": 0.0017987867845222069, + "learning_rate": 5e-05, + "loss": 0.1908, + "loss/crossentropy": 2.871249258518219, + "loss/hidden": 0.0, + "loss/logits": 0.19084149599075317, + "loss/reg": 0.35343417525291443, + "step": 3263 + }, + { + "epoch": 0.03264, + "grad_norm": 0.4251510202884674, + "grad_norm_var": 0.0018386273585035639, + "learning_rate": 5e-05, + "loss": 0.195, + "loss/crossentropy": 2.822460412979126, + "loss/hidden": 0.0, + "loss/logits": 0.19495854154229164, + "loss/reg": 0.3534773886203766, + "step": 3264 + }, + { + "epoch": 0.03265, + "grad_norm": 0.5351699590682983, + "grad_norm_var": 0.002256544187861905, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.8036988973617554, + "loss/hidden": 0.0, + "loss/logits": 0.21015527471899986, + "loss/reg": 0.3533090353012085, + "step": 3265 + }, + { + "epoch": 0.03266, + "grad_norm": 0.48570090532302856, + "grad_norm_var": 0.0022670150109320238, + "learning_rate": 5e-05, + "loss": 0.2407, + "loss/crossentropy": 2.8525343537330627, + "loss/hidden": 0.0, + "loss/logits": 0.240671094506979, + "loss/reg": 0.3530595302581787, + "step": 3266 + }, + { + "epoch": 0.03267, + "grad_norm": 0.4781879484653473, + "grad_norm_var": 0.002148333230237572, + "learning_rate": 5e-05, + "loss": 0.2237, + "loss/crossentropy": 2.820028781890869, + "loss/hidden": 0.0, + "loss/logits": 0.22369850426912308, + "loss/reg": 0.3529737591743469, + "step": 3267 + }, + { + "epoch": 0.03268, + "grad_norm": 0.4913827180862427, + "grad_norm_var": 0.0021523576888278625, + "learning_rate": 5e-05, + "loss": 0.2144, + "loss/crossentropy": 2.849581778049469, + "loss/hidden": 0.0, + "loss/logits": 0.2144278734922409, + "loss/reg": 0.3527119755744934, + "step": 3268 + }, + { + "epoch": 0.03269, + "grad_norm": 0.44072675704956055, + "grad_norm_var": 0.0021834774645316875, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.7309719920158386, + "loss/hidden": 0.0, + "loss/logits": 0.20452339202165604, + "loss/reg": 0.3527584969997406, + "step": 3269 + }, + { + "epoch": 0.0327, + "grad_norm": 0.4803256690502167, + "grad_norm_var": 0.002049452862051822, + "learning_rate": 5e-05, + "loss": 0.2125, + "loss/crossentropy": 2.8694698810577393, + "loss/hidden": 0.0, + "loss/logits": 0.21253256127238274, + "loss/reg": 0.35254791378974915, + "step": 3270 + }, + { + "epoch": 0.03271, + "grad_norm": 0.46455883979797363, + "grad_norm_var": 0.0019947168345352, + "learning_rate": 5e-05, + "loss": 0.2122, + "loss/crossentropy": 2.7510117888450623, + "loss/hidden": 0.0, + "loss/logits": 0.21221501007676125, + "loss/reg": 0.3524121642112732, + "step": 3271 + }, + { + "epoch": 0.03272, + "grad_norm": 0.4355923533439636, + "grad_norm_var": 0.0019913172023258447, + "learning_rate": 5e-05, + "loss": 0.2031, + "loss/crossentropy": 2.7414827942848206, + "loss/hidden": 0.0, + "loss/logits": 0.2031046226620674, + "loss/reg": 0.35225507616996765, + "step": 3272 + }, + { + "epoch": 0.03273, + "grad_norm": 0.43772315979003906, + "grad_norm_var": 0.001043805685130635, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 2.6462822556495667, + "loss/hidden": 0.0, + "loss/logits": 0.2170335240662098, + "loss/reg": 0.35193732380867004, + "step": 3273 + }, + { + "epoch": 0.03274, + "grad_norm": 0.415850430727005, + "grad_norm_var": 0.0011305585193011798, + "learning_rate": 5e-05, + "loss": 0.1948, + "loss/crossentropy": 2.740182638168335, + "loss/hidden": 0.0, + "loss/logits": 0.19481822103261948, + "loss/reg": 0.3516504764556885, + "step": 3274 + }, + { + "epoch": 0.03275, + "grad_norm": 0.4890633225440979, + "grad_norm_var": 0.0011768483339018253, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.800735831260681, + "loss/hidden": 0.0, + "loss/logits": 0.20377683639526367, + "loss/reg": 0.35150378942489624, + "step": 3275 + }, + { + "epoch": 0.03276, + "grad_norm": 0.41153162717819214, + "grad_norm_var": 0.0013110280516800631, + "learning_rate": 5e-05, + "loss": 0.19, + "loss/crossentropy": 2.891985595226288, + "loss/hidden": 0.0, + "loss/logits": 0.18995629996061325, + "loss/reg": 0.3510856032371521, + "step": 3276 + }, + { + "epoch": 0.03277, + "grad_norm": 0.46996766328811646, + "grad_norm_var": 0.001322310621547788, + "learning_rate": 5e-05, + "loss": 0.2251, + "loss/crossentropy": 2.724545896053314, + "loss/hidden": 0.0, + "loss/logits": 0.2251247949898243, + "loss/reg": 0.3506470322608948, + "step": 3277 + }, + { + "epoch": 0.03278, + "grad_norm": 0.4248080551624298, + "grad_norm_var": 0.001319022785011311, + "learning_rate": 5e-05, + "loss": 0.207, + "loss/crossentropy": 2.9220770597457886, + "loss/hidden": 0.0, + "loss/logits": 0.20699621737003326, + "loss/reg": 0.3501574695110321, + "step": 3278 + }, + { + "epoch": 0.03279, + "grad_norm": 0.4463200867176056, + "grad_norm_var": 0.0011571849958876463, + "learning_rate": 5e-05, + "loss": 0.2265, + "loss/crossentropy": 2.712380528450012, + "loss/hidden": 0.0, + "loss/logits": 0.2264934554696083, + "loss/reg": 0.34990739822387695, + "step": 3279 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4228689670562744, + "grad_norm_var": 0.0011675827833105463, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.8232643008232117, + "loss/hidden": 0.0, + "loss/logits": 0.1986948773264885, + "loss/reg": 0.34925052523612976, + "step": 3280 + }, + { + "epoch": 0.03281, + "grad_norm": 0.44568783044815063, + "grad_norm_var": 0.000748638703677366, + "learning_rate": 5e-05, + "loss": 0.2209, + "loss/crossentropy": 2.875556230545044, + "loss/hidden": 0.0, + "loss/logits": 0.22088206186890602, + "loss/reg": 0.3486167788505554, + "step": 3281 + }, + { + "epoch": 0.03282, + "grad_norm": 0.43480202555656433, + "grad_norm_var": 0.0006853643750972675, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.872281849384308, + "loss/hidden": 0.0, + "loss/logits": 0.20082836970686913, + "loss/reg": 0.3477589786052704, + "step": 3282 + }, + { + "epoch": 0.03283, + "grad_norm": 0.4948584735393524, + "grad_norm_var": 0.000766860829082446, + "learning_rate": 5e-05, + "loss": 0.2215, + "loss/crossentropy": 2.7852450609207153, + "loss/hidden": 0.0, + "loss/logits": 0.22146426141262054, + "loss/reg": 0.3472515642642975, + "step": 3283 + }, + { + "epoch": 0.03284, + "grad_norm": 0.435166597366333, + "grad_norm_var": 0.0006570357954521544, + "learning_rate": 5e-05, + "loss": 0.2022, + "loss/crossentropy": 2.784038543701172, + "loss/hidden": 0.0, + "loss/logits": 0.20217393711209297, + "loss/reg": 0.34674590826034546, + "step": 3284 + }, + { + "epoch": 0.03285, + "grad_norm": 0.49524563550949097, + "grad_norm_var": 0.0007981796483985448, + "learning_rate": 5e-05, + "loss": 0.2172, + "loss/crossentropy": 2.8294427394866943, + "loss/hidden": 0.0, + "loss/logits": 0.21715334057807922, + "loss/reg": 0.34604397416114807, + "step": 3285 + }, + { + "epoch": 0.03286, + "grad_norm": 0.4368216395378113, + "grad_norm_var": 0.000742146550330658, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.748842418193817, + "loss/hidden": 0.0, + "loss/logits": 0.21142562851309776, + "loss/reg": 0.3456406891345978, + "step": 3286 + }, + { + "epoch": 0.03287, + "grad_norm": 0.4058722257614136, + "grad_norm_var": 0.0008243444285070252, + "learning_rate": 5e-05, + "loss": 0.2022, + "loss/crossentropy": 2.939522862434387, + "loss/hidden": 0.0, + "loss/logits": 0.20216752588748932, + "loss/reg": 0.34547197818756104, + "step": 3287 + }, + { + "epoch": 0.03288, + "grad_norm": 0.4450176954269409, + "grad_norm_var": 0.0008194736964894589, + "learning_rate": 5e-05, + "loss": 0.2054, + "loss/crossentropy": 2.7356417179107666, + "loss/hidden": 0.0, + "loss/logits": 0.20538344606757164, + "loss/reg": 0.3449644148349762, + "step": 3288 + }, + { + "epoch": 0.03289, + "grad_norm": 0.4292418956756592, + "grad_norm_var": 0.0008316050394014322, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.7967238426208496, + "loss/hidden": 0.0, + "loss/logits": 0.20931872725486755, + "loss/reg": 0.34438905119895935, + "step": 3289 + }, + { + "epoch": 0.0329, + "grad_norm": 0.4853546917438507, + "grad_norm_var": 0.0008731712968600018, + "learning_rate": 5e-05, + "loss": 0.202, + "loss/crossentropy": 2.9585098028182983, + "loss/hidden": 0.0, + "loss/logits": 0.20196180418133736, + "loss/reg": 0.34406962990760803, + "step": 3290 + }, + { + "epoch": 0.03291, + "grad_norm": 0.4511321187019348, + "grad_norm_var": 0.0007568803266787882, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.7466835379600525, + "loss/hidden": 0.0, + "loss/logits": 0.22072728350758553, + "loss/reg": 0.3436236083507538, + "step": 3291 + }, + { + "epoch": 0.03292, + "grad_norm": 0.4551484286785126, + "grad_norm_var": 0.0006758020784317716, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.805193305015564, + "loss/hidden": 0.0, + "loss/logits": 0.20642517134547234, + "loss/reg": 0.3434530198574066, + "step": 3292 + }, + { + "epoch": 0.03293, + "grad_norm": 0.45400893688201904, + "grad_norm_var": 0.0006463478371674232, + "learning_rate": 5e-05, + "loss": 0.2234, + "loss/crossentropy": 2.8916329741477966, + "loss/hidden": 0.0, + "loss/logits": 0.22335302457213402, + "loss/reg": 0.3430635929107666, + "step": 3293 + }, + { + "epoch": 0.03294, + "grad_norm": 0.43430471420288086, + "grad_norm_var": 0.0006230650777574121, + "learning_rate": 5e-05, + "loss": 0.1977, + "loss/crossentropy": 2.76904433965683, + "loss/hidden": 0.0, + "loss/logits": 0.1976575255393982, + "loss/reg": 0.3429409861564636, + "step": 3294 + }, + { + "epoch": 0.03295, + "grad_norm": 0.4048338234424591, + "grad_norm_var": 0.0007412585947779271, + "learning_rate": 5e-05, + "loss": 0.1999, + "loss/crossentropy": 2.771683692932129, + "loss/hidden": 0.0, + "loss/logits": 0.19992046058177948, + "loss/reg": 0.34264039993286133, + "step": 3295 + }, + { + "epoch": 0.03296, + "grad_norm": 0.45488741993904114, + "grad_norm_var": 0.0007080864556668232, + "learning_rate": 5e-05, + "loss": 0.2152, + "loss/crossentropy": 2.640643298625946, + "loss/hidden": 0.0, + "loss/logits": 0.21521049737930298, + "loss/reg": 0.3423854410648346, + "step": 3296 + }, + { + "epoch": 0.03297, + "grad_norm": 0.43550756573677063, + "grad_norm_var": 0.0007172258604706531, + "learning_rate": 5e-05, + "loss": 0.2159, + "loss/crossentropy": 2.7846437096595764, + "loss/hidden": 0.0, + "loss/logits": 0.2159421443939209, + "loss/reg": 0.34248262643814087, + "step": 3297 + }, + { + "epoch": 0.03298, + "grad_norm": 0.4601631462574005, + "grad_norm_var": 0.0007161346827104158, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.8827401995658875, + "loss/hidden": 0.0, + "loss/logits": 0.212619137018919, + "loss/reg": 0.34215909242630005, + "step": 3298 + }, + { + "epoch": 0.03299, + "grad_norm": 0.4022318720817566, + "grad_norm_var": 0.0006810361035271365, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.869608759880066, + "loss/hidden": 0.0, + "loss/logits": 0.19611463695764542, + "loss/reg": 0.341789186000824, + "step": 3299 + }, + { + "epoch": 0.033, + "grad_norm": 0.5019791722297668, + "grad_norm_var": 0.0008919530811604275, + "learning_rate": 5e-05, + "loss": 0.2216, + "loss/crossentropy": 2.6767367720603943, + "loss/hidden": 0.0, + "loss/logits": 0.22157222032546997, + "loss/reg": 0.3417852222919464, + "step": 3300 + }, + { + "epoch": 0.03301, + "grad_norm": 0.42123717069625854, + "grad_norm_var": 0.0007580497506439669, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.908818781375885, + "loss/hidden": 0.0, + "loss/logits": 0.19916212558746338, + "loss/reg": 0.341805636882782, + "step": 3301 + }, + { + "epoch": 0.03302, + "grad_norm": 0.4637666940689087, + "grad_norm_var": 0.0007835334007564754, + "learning_rate": 5e-05, + "loss": 0.2347, + "loss/crossentropy": 2.7194618582725525, + "loss/hidden": 0.0, + "loss/logits": 0.23469486832618713, + "loss/reg": 0.3420849144458771, + "step": 3302 + }, + { + "epoch": 0.03303, + "grad_norm": 0.4583110809326172, + "grad_norm_var": 0.000688513956782207, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.884438157081604, + "loss/hidden": 0.0, + "loss/logits": 0.2087065763771534, + "loss/reg": 0.3423635959625244, + "step": 3303 + }, + { + "epoch": 0.03304, + "grad_norm": 0.46903032064437866, + "grad_norm_var": 0.0007171793061347002, + "learning_rate": 5e-05, + "loss": 0.2349, + "loss/crossentropy": 2.754501461982727, + "loss/hidden": 0.0, + "loss/logits": 0.23486686497926712, + "loss/reg": 0.34201860427856445, + "step": 3304 + }, + { + "epoch": 0.03305, + "grad_norm": 0.44147467613220215, + "grad_norm_var": 0.0006945973010811309, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.8503193259239197, + "loss/hidden": 0.0, + "loss/logits": 0.20735208690166473, + "loss/reg": 0.3422415852546692, + "step": 3305 + }, + { + "epoch": 0.03306, + "grad_norm": 0.40413209795951843, + "grad_norm_var": 0.0007195503830555339, + "learning_rate": 5e-05, + "loss": 0.1997, + "loss/crossentropy": 2.8844183683395386, + "loss/hidden": 0.0, + "loss/logits": 0.19970670714974403, + "loss/reg": 0.3419416546821594, + "step": 3306 + }, + { + "epoch": 0.03307, + "grad_norm": 0.5725996494293213, + "grad_norm_var": 0.0017489584941423756, + "learning_rate": 5e-05, + "loss": 0.2366, + "loss/crossentropy": 2.865859627723694, + "loss/hidden": 0.0, + "loss/logits": 0.23662826046347618, + "loss/reg": 0.3421829342842102, + "step": 3307 + }, + { + "epoch": 0.03308, + "grad_norm": 0.42913734912872314, + "grad_norm_var": 0.0017806757558032774, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.6719802021980286, + "loss/hidden": 0.0, + "loss/logits": 0.1984855867922306, + "loss/reg": 0.3418743312358856, + "step": 3308 + }, + { + "epoch": 0.03309, + "grad_norm": 0.4586308002471924, + "grad_norm_var": 0.0017841884210907193, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.9841980934143066, + "loss/hidden": 0.0, + "loss/logits": 0.20096250250935555, + "loss/reg": 0.34157583117485046, + "step": 3309 + }, + { + "epoch": 0.0331, + "grad_norm": 0.4613407254219055, + "grad_norm_var": 0.0017705392814226747, + "learning_rate": 5e-05, + "loss": 0.2191, + "loss/crossentropy": 2.782851994037628, + "loss/hidden": 0.0, + "loss/logits": 0.21906671673059464, + "loss/reg": 0.3412996530532837, + "step": 3310 + }, + { + "epoch": 0.03311, + "grad_norm": 0.4346563220024109, + "grad_norm_var": 0.0016367720422220196, + "learning_rate": 5e-05, + "loss": 0.201, + "loss/crossentropy": 2.770762026309967, + "loss/hidden": 0.0, + "loss/logits": 0.20097051188349724, + "loss/reg": 0.3410702049732208, + "step": 3311 + }, + { + "epoch": 0.03312, + "grad_norm": 0.49678584933280945, + "grad_norm_var": 0.0017496711578167752, + "learning_rate": 5e-05, + "loss": 0.2339, + "loss/crossentropy": 2.7821558713912964, + "loss/hidden": 0.0, + "loss/logits": 0.2338976263999939, + "loss/reg": 0.34074535965919495, + "step": 3312 + }, + { + "epoch": 0.03313, + "grad_norm": 0.44414517283439636, + "grad_norm_var": 0.001729654843112612, + "learning_rate": 5e-05, + "loss": 0.2026, + "loss/crossentropy": 2.7551111578941345, + "loss/hidden": 0.0, + "loss/logits": 0.20263633131980896, + "loss/reg": 0.3401501178741455, + "step": 3313 + }, + { + "epoch": 0.03314, + "grad_norm": 0.49137255549430847, + "grad_norm_var": 0.0018017118579779872, + "learning_rate": 5e-05, + "loss": 0.2182, + "loss/crossentropy": 2.7374969124794006, + "loss/hidden": 0.0, + "loss/logits": 0.21816492453217506, + "loss/reg": 0.3395603597164154, + "step": 3314 + }, + { + "epoch": 0.03315, + "grad_norm": 0.448561429977417, + "grad_norm_var": 0.0015825537949105092, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.8044087290763855, + "loss/hidden": 0.0, + "loss/logits": 0.21077901497483253, + "loss/reg": 0.33908531069755554, + "step": 3315 + }, + { + "epoch": 0.03316, + "grad_norm": 0.4341532588005066, + "grad_norm_var": 0.00151144322034041, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.863246738910675, + "loss/hidden": 0.0, + "loss/logits": 0.20985147356987, + "loss/reg": 0.3387399911880493, + "step": 3316 + }, + { + "epoch": 0.03317, + "grad_norm": 0.43470895290374756, + "grad_norm_var": 0.001456601609520843, + "learning_rate": 5e-05, + "loss": 0.2085, + "loss/crossentropy": 2.7948164343833923, + "loss/hidden": 0.0, + "loss/logits": 0.2084757164120674, + "loss/reg": 0.33823728561401367, + "step": 3317 + }, + { + "epoch": 0.03318, + "grad_norm": 0.4400990605354309, + "grad_norm_var": 0.0014763339234040126, + "learning_rate": 5e-05, + "loss": 0.2067, + "loss/crossentropy": 2.8429133892059326, + "loss/hidden": 0.0, + "loss/logits": 0.20665664225816727, + "loss/reg": 0.3376792371273041, + "step": 3318 + }, + { + "epoch": 0.03319, + "grad_norm": 0.47080379724502563, + "grad_norm_var": 0.001487528788647019, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.676921308040619, + "loss/hidden": 0.0, + "loss/logits": 0.217885360121727, + "loss/reg": 0.337134450674057, + "step": 3319 + }, + { + "epoch": 0.0332, + "grad_norm": 0.48441100120544434, + "grad_norm_var": 0.0015244691095800621, + "learning_rate": 5e-05, + "loss": 0.2258, + "loss/crossentropy": 2.9089431762695312, + "loss/hidden": 0.0, + "loss/logits": 0.22575374692678452, + "loss/reg": 0.3366064727306366, + "step": 3320 + }, + { + "epoch": 0.03321, + "grad_norm": 0.43533167243003845, + "grad_norm_var": 0.0015413362828969176, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.689952790737152, + "loss/hidden": 0.0, + "loss/logits": 0.20767606422305107, + "loss/reg": 0.33617380261421204, + "step": 3321 + }, + { + "epoch": 0.03322, + "grad_norm": 0.4322916567325592, + "grad_norm_var": 0.0013856234415642358, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.7451765537261963, + "loss/hidden": 0.0, + "loss/logits": 0.20780746266245842, + "loss/reg": 0.33565908670425415, + "step": 3322 + }, + { + "epoch": 0.03323, + "grad_norm": 0.49749287962913513, + "grad_norm_var": 0.0006162394783644813, + "learning_rate": 5e-05, + "loss": 0.2242, + "loss/crossentropy": 2.9691489934921265, + "loss/hidden": 0.0, + "loss/logits": 0.22419293969869614, + "loss/reg": 0.3352530300617218, + "step": 3323 + }, + { + "epoch": 0.03324, + "grad_norm": 0.4868699610233307, + "grad_norm_var": 0.0006187747537428382, + "learning_rate": 5e-05, + "loss": 0.2256, + "loss/crossentropy": 2.8639636039733887, + "loss/hidden": 0.0, + "loss/logits": 0.22563356161117554, + "loss/reg": 0.335117906332016, + "step": 3324 + }, + { + "epoch": 0.03325, + "grad_norm": 0.43938568234443665, + "grad_norm_var": 0.0006440982298184276, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.7339144945144653, + "loss/hidden": 0.0, + "loss/logits": 0.1956343576312065, + "loss/reg": 0.3345509171485901, + "step": 3325 + }, + { + "epoch": 0.03326, + "grad_norm": 0.45624011754989624, + "grad_norm_var": 0.0006436397247090516, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.8281431794166565, + "loss/hidden": 0.0, + "loss/logits": 0.22072631120681763, + "loss/reg": 0.33448168635368347, + "step": 3326 + }, + { + "epoch": 0.03327, + "grad_norm": 0.4613226354122162, + "grad_norm_var": 0.0006052378184467504, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.8457218408584595, + "loss/hidden": 0.0, + "loss/logits": 0.2072107456624508, + "loss/reg": 0.33394381403923035, + "step": 3327 + }, + { + "epoch": 0.03328, + "grad_norm": 0.6003445982933044, + "grad_norm_var": 0.0017886455177287411, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.905011236667633, + "loss/hidden": 0.0, + "loss/logits": 0.2094787172973156, + "loss/reg": 0.33364689350128174, + "step": 3328 + }, + { + "epoch": 0.03329, + "grad_norm": 0.4454887807369232, + "grad_norm_var": 0.0017848259247716196, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.894716799259186, + "loss/hidden": 0.0, + "loss/logits": 0.21120398491621017, + "loss/reg": 0.33314260840415955, + "step": 3329 + }, + { + "epoch": 0.0333, + "grad_norm": 0.4312256872653961, + "grad_norm_var": 0.0018088940269670524, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.955307960510254, + "loss/hidden": 0.0, + "loss/logits": 0.20319363474845886, + "loss/reg": 0.3329194188117981, + "step": 3330 + }, + { + "epoch": 0.03331, + "grad_norm": 0.4044085144996643, + "grad_norm_var": 0.0020123268036554027, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.652659833431244, + "loss/hidden": 0.0, + "loss/logits": 0.18239285424351692, + "loss/reg": 0.3326061964035034, + "step": 3331 + }, + { + "epoch": 0.03332, + "grad_norm": 0.659225344657898, + "grad_norm_var": 0.004412935408369837, + "learning_rate": 5e-05, + "loss": 0.2755, + "loss/crossentropy": 2.968281865119934, + "loss/hidden": 0.0, + "loss/logits": 0.27551500871777534, + "loss/reg": 0.3319263458251953, + "step": 3332 + }, + { + "epoch": 0.03333, + "grad_norm": 0.4303460419178009, + "grad_norm_var": 0.004436823397784813, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.834151029586792, + "loss/hidden": 0.0, + "loss/logits": 0.20470694452524185, + "loss/reg": 0.33170631527900696, + "step": 3333 + }, + { + "epoch": 0.03334, + "grad_norm": 0.4330630302429199, + "grad_norm_var": 0.0044712103945624155, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.768010199069977, + "loss/hidden": 0.0, + "loss/logits": 0.2048095390200615, + "loss/reg": 0.33160072565078735, + "step": 3334 + }, + { + "epoch": 0.03335, + "grad_norm": 0.4293711185455322, + "grad_norm_var": 0.004590721483861352, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.8136463165283203, + "loss/hidden": 0.0, + "loss/logits": 0.20617398992180824, + "loss/reg": 0.33117398619651794, + "step": 3335 + }, + { + "epoch": 0.03336, + "grad_norm": 0.4499529004096985, + "grad_norm_var": 0.004600679432539971, + "learning_rate": 5e-05, + "loss": 0.2212, + "loss/crossentropy": 2.6231557726860046, + "loss/hidden": 0.0, + "loss/logits": 0.22117187827825546, + "loss/reg": 0.3306238055229187, + "step": 3336 + }, + { + "epoch": 0.03337, + "grad_norm": 0.46395689249038696, + "grad_norm_var": 0.004526166860620625, + "learning_rate": 5e-05, + "loss": 0.2195, + "loss/crossentropy": 2.701095938682556, + "loss/hidden": 0.0, + "loss/logits": 0.21953120082616806, + "loss/reg": 0.33030036091804504, + "step": 3337 + }, + { + "epoch": 0.03338, + "grad_norm": 0.4392194449901581, + "grad_norm_var": 0.004494278198036724, + "learning_rate": 5e-05, + "loss": 0.203, + "loss/crossentropy": 2.871384024620056, + "loss/hidden": 0.0, + "loss/logits": 0.20299387350678444, + "loss/reg": 0.3298823833465576, + "step": 3338 + }, + { + "epoch": 0.03339, + "grad_norm": 0.4366452991962433, + "grad_norm_var": 0.004506642633865473, + "learning_rate": 5e-05, + "loss": 0.2208, + "loss/crossentropy": 2.8053683042526245, + "loss/hidden": 0.0, + "loss/logits": 0.22080536931753159, + "loss/reg": 0.32952871918678284, + "step": 3339 + }, + { + "epoch": 0.0334, + "grad_norm": 0.47408613562583923, + "grad_norm_var": 0.004482462599529417, + "learning_rate": 5e-05, + "loss": 0.2288, + "loss/crossentropy": 2.78849059343338, + "loss/hidden": 0.0, + "loss/logits": 0.22883687168359756, + "loss/reg": 0.32918334007263184, + "step": 3340 + }, + { + "epoch": 0.03341, + "grad_norm": 0.5844402313232422, + "grad_norm_var": 0.005284853366532701, + "learning_rate": 5e-05, + "loss": 0.243, + "loss/crossentropy": 2.9017791748046875, + "loss/hidden": 0.0, + "loss/logits": 0.24297234788537025, + "loss/reg": 0.3288957178592682, + "step": 3341 + }, + { + "epoch": 0.03342, + "grad_norm": 0.43903791904449463, + "grad_norm_var": 0.005346281181868984, + "learning_rate": 5e-05, + "loss": 0.1966, + "loss/crossentropy": 2.8350831270217896, + "loss/hidden": 0.0, + "loss/logits": 0.19662794470787048, + "loss/reg": 0.3287445902824402, + "step": 3342 + }, + { + "epoch": 0.03343, + "grad_norm": 0.4707552194595337, + "grad_norm_var": 0.005336044625063903, + "learning_rate": 5e-05, + "loss": 0.2089, + "loss/crossentropy": 2.7903363704681396, + "loss/hidden": 0.0, + "loss/logits": 0.20886846259236336, + "loss/reg": 0.32836976647377014, + "step": 3343 + }, + { + "epoch": 0.03344, + "grad_norm": 0.46024543046951294, + "grad_norm_var": 0.004211512204602623, + "learning_rate": 5e-05, + "loss": 0.2344, + "loss/crossentropy": 2.8145530819892883, + "loss/hidden": 0.0, + "loss/logits": 0.23438073694705963, + "loss/reg": 0.32823023200035095, + "step": 3344 + }, + { + "epoch": 0.03345, + "grad_norm": 0.44513022899627686, + "grad_norm_var": 0.004212487276013314, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.8127337098121643, + "loss/hidden": 0.0, + "loss/logits": 0.2119305282831192, + "loss/reg": 0.3281492292881012, + "step": 3345 + }, + { + "epoch": 0.03346, + "grad_norm": 0.4396408498287201, + "grad_norm_var": 0.004178238635654389, + "learning_rate": 5e-05, + "loss": 0.2197, + "loss/crossentropy": 2.759284496307373, + "loss/hidden": 0.0, + "loss/logits": 0.219670832157135, + "loss/reg": 0.3277316093444824, + "step": 3346 + }, + { + "epoch": 0.03347, + "grad_norm": 0.46714770793914795, + "grad_norm_var": 0.003907182096609078, + "learning_rate": 5e-05, + "loss": 0.2176, + "loss/crossentropy": 2.7995559573173523, + "loss/hidden": 0.0, + "loss/logits": 0.2175961174070835, + "loss/reg": 0.327602744102478, + "step": 3347 + }, + { + "epoch": 0.03348, + "grad_norm": 0.44814378023147583, + "grad_norm_var": 0.0013702807487103248, + "learning_rate": 5e-05, + "loss": 0.2131, + "loss/crossentropy": 2.7553564310073853, + "loss/hidden": 0.0, + "loss/logits": 0.21311866864562035, + "loss/reg": 0.32724568247795105, + "step": 3348 + }, + { + "epoch": 0.03349, + "grad_norm": 0.4419894218444824, + "grad_norm_var": 0.0013374541591385587, + "learning_rate": 5e-05, + "loss": 0.2202, + "loss/crossentropy": 2.802499532699585, + "loss/hidden": 0.0, + "loss/logits": 0.2202381007373333, + "loss/reg": 0.3269749581813812, + "step": 3349 + }, + { + "epoch": 0.0335, + "grad_norm": 0.4227686822414398, + "grad_norm_var": 0.0013778615971180093, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.7025352120399475, + "loss/hidden": 0.0, + "loss/logits": 0.2027539163827896, + "loss/reg": 0.3271377682685852, + "step": 3350 + }, + { + "epoch": 0.03351, + "grad_norm": 0.42599451541900635, + "grad_norm_var": 0.001391028038852667, + "learning_rate": 5e-05, + "loss": 0.2111, + "loss/crossentropy": 2.776321589946747, + "loss/hidden": 0.0, + "loss/logits": 0.21106871590018272, + "loss/reg": 0.32700681686401367, + "step": 3351 + }, + { + "epoch": 0.03352, + "grad_norm": 0.42842352390289307, + "grad_norm_var": 0.0014397164684764225, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.8175362944602966, + "loss/hidden": 0.0, + "loss/logits": 0.20416942983865738, + "loss/reg": 0.32655900716781616, + "step": 3352 + }, + { + "epoch": 0.03353, + "grad_norm": 0.4209572672843933, + "grad_norm_var": 0.0015066569205479182, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.9498921632766724, + "loss/hidden": 0.0, + "loss/logits": 0.19566785916686058, + "loss/reg": 0.3261592984199524, + "step": 3353 + }, + { + "epoch": 0.03354, + "grad_norm": 0.7333403825759888, + "grad_norm_var": 0.0063812028620572414, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.8250110745429993, + "loss/hidden": 0.0, + "loss/logits": 0.21837767958641052, + "loss/reg": 0.32596513628959656, + "step": 3354 + }, + { + "epoch": 0.03355, + "grad_norm": 0.47320815920829773, + "grad_norm_var": 0.006296437855803664, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.5203073024749756, + "loss/hidden": 0.0, + "loss/logits": 0.22205721214413643, + "loss/reg": 0.3259431719779968, + "step": 3355 + }, + { + "epoch": 0.03356, + "grad_norm": 0.4423348605632782, + "grad_norm_var": 0.00635678270149828, + "learning_rate": 5e-05, + "loss": 0.2174, + "loss/crossentropy": 2.9015894532203674, + "loss/hidden": 0.0, + "loss/logits": 0.21739209070801735, + "loss/reg": 0.3260539770126343, + "step": 3356 + }, + { + "epoch": 0.03357, + "grad_norm": 0.4725763499736786, + "grad_norm_var": 0.0054539419830412295, + "learning_rate": 5e-05, + "loss": 0.226, + "loss/crossentropy": 2.766432046890259, + "loss/hidden": 0.0, + "loss/logits": 0.22603664919734, + "loss/reg": 0.3258333206176758, + "step": 3357 + }, + { + "epoch": 0.03358, + "grad_norm": 0.42586198449134827, + "grad_norm_var": 0.0055094903070797676, + "learning_rate": 5e-05, + "loss": 0.1945, + "loss/crossentropy": 2.9604466557502747, + "loss/hidden": 0.0, + "loss/logits": 0.19453942775726318, + "loss/reg": 0.3256404995918274, + "step": 3358 + }, + { + "epoch": 0.03359, + "grad_norm": 0.42825081944465637, + "grad_norm_var": 0.005582179154764093, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.7473760843276978, + "loss/hidden": 0.0, + "loss/logits": 0.21096381545066833, + "loss/reg": 0.3257521390914917, + "step": 3359 + }, + { + "epoch": 0.0336, + "grad_norm": 0.47887372970581055, + "grad_norm_var": 0.005601991153162826, + "learning_rate": 5e-05, + "loss": 0.2211, + "loss/crossentropy": 2.767969310283661, + "loss/hidden": 0.0, + "loss/logits": 0.22105859220027924, + "loss/reg": 0.3257397413253784, + "step": 3360 + }, + { + "epoch": 0.03361, + "grad_norm": 0.5057399272918701, + "grad_norm_var": 0.005693923002827831, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.793567180633545, + "loss/hidden": 0.0, + "loss/logits": 0.2126450464129448, + "loss/reg": 0.3255554437637329, + "step": 3361 + }, + { + "epoch": 0.03362, + "grad_norm": 0.4243748188018799, + "grad_norm_var": 0.005762046851764528, + "learning_rate": 5e-05, + "loss": 0.2103, + "loss/crossentropy": 2.6604357957839966, + "loss/hidden": 0.0, + "loss/logits": 0.2103486843407154, + "loss/reg": 0.32521042227745056, + "step": 3362 + }, + { + "epoch": 0.03363, + "grad_norm": 0.4738243818283081, + "grad_norm_var": 0.005766745697344773, + "learning_rate": 5e-05, + "loss": 0.2104, + "loss/crossentropy": 2.850685954093933, + "loss/hidden": 0.0, + "loss/logits": 0.210371483117342, + "loss/reg": 0.3250933587551117, + "step": 3363 + }, + { + "epoch": 0.03364, + "grad_norm": 0.44660061597824097, + "grad_norm_var": 0.005770448466641283, + "learning_rate": 5e-05, + "loss": 0.2132, + "loss/crossentropy": 2.664084792137146, + "loss/hidden": 0.0, + "loss/logits": 0.2132006250321865, + "loss/reg": 0.3247338533401489, + "step": 3364 + }, + { + "epoch": 0.03365, + "grad_norm": 0.4796585440635681, + "grad_norm_var": 0.0057419548286391136, + "learning_rate": 5e-05, + "loss": 0.2219, + "loss/crossentropy": 2.8530101776123047, + "loss/hidden": 0.0, + "loss/logits": 0.22185612469911575, + "loss/reg": 0.324609637260437, + "step": 3365 + }, + { + "epoch": 0.03366, + "grad_norm": 0.4779711067676544, + "grad_norm_var": 0.0056018918040165126, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 3.0463197231292725, + "loss/hidden": 0.0, + "loss/logits": 0.19948142394423485, + "loss/reg": 0.3244163990020752, + "step": 3366 + }, + { + "epoch": 0.03367, + "grad_norm": 0.4843481481075287, + "grad_norm_var": 0.005463580810338408, + "learning_rate": 5e-05, + "loss": 0.2191, + "loss/crossentropy": 2.7274221777915955, + "loss/hidden": 0.0, + "loss/logits": 0.2191130891442299, + "loss/reg": 0.32445353269577026, + "step": 3367 + }, + { + "epoch": 0.03368, + "grad_norm": 0.4979908764362335, + "grad_norm_var": 0.005336149018922128, + "learning_rate": 5e-05, + "loss": 0.2379, + "loss/crossentropy": 2.793381154537201, + "loss/hidden": 0.0, + "loss/logits": 0.2378510646522045, + "loss/reg": 0.3243396580219269, + "step": 3368 + }, + { + "epoch": 0.03369, + "grad_norm": 0.5039383172988892, + "grad_norm_var": 0.005122999868676695, + "learning_rate": 5e-05, + "loss": 0.2328, + "loss/crossentropy": 2.747665822505951, + "loss/hidden": 0.0, + "loss/logits": 0.23283886909484863, + "loss/reg": 0.32398363947868347, + "step": 3369 + }, + { + "epoch": 0.0337, + "grad_norm": 0.5165467262268066, + "grad_norm_var": 0.000861919331754694, + "learning_rate": 5e-05, + "loss": 0.2137, + "loss/crossentropy": 2.81030535697937, + "loss/hidden": 0.0, + "loss/logits": 0.21372531354427338, + "loss/reg": 0.32368314266204834, + "step": 3370 + }, + { + "epoch": 0.03371, + "grad_norm": 0.4463249742984772, + "grad_norm_var": 0.0008982996078778023, + "learning_rate": 5e-05, + "loss": 0.2038, + "loss/crossentropy": 2.9013622403144836, + "loss/hidden": 0.0, + "loss/logits": 0.2038046233355999, + "loss/reg": 0.3234466314315796, + "step": 3371 + }, + { + "epoch": 0.03372, + "grad_norm": 0.4630885720252991, + "grad_norm_var": 0.000851222307970334, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.841128945350647, + "loss/hidden": 0.0, + "loss/logits": 0.20275317877531052, + "loss/reg": 0.32332801818847656, + "step": 3372 + }, + { + "epoch": 0.03373, + "grad_norm": 0.5638495087623596, + "grad_norm_var": 0.0013987094413064228, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.888921320438385, + "loss/hidden": 0.0, + "loss/logits": 0.2099459543824196, + "loss/reg": 0.3232120871543884, + "step": 3373 + }, + { + "epoch": 0.03374, + "grad_norm": 0.47671762108802795, + "grad_norm_var": 0.0012198526995013262, + "learning_rate": 5e-05, + "loss": 0.2257, + "loss/crossentropy": 2.666372537612915, + "loss/hidden": 0.0, + "loss/logits": 0.2257164716720581, + "loss/reg": 0.322893887758255, + "step": 3374 + }, + { + "epoch": 0.03375, + "grad_norm": 0.4806428849697113, + "grad_norm_var": 0.0010351073240850272, + "learning_rate": 5e-05, + "loss": 0.2229, + "loss/crossentropy": 2.6270893216133118, + "loss/hidden": 0.0, + "loss/logits": 0.22289558872580528, + "loss/reg": 0.3225509226322174, + "step": 3375 + }, + { + "epoch": 0.03376, + "grad_norm": 0.47514545917510986, + "grad_norm_var": 0.0010377939502332456, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.7416624426841736, + "loss/hidden": 0.0, + "loss/logits": 0.20923787727952003, + "loss/reg": 0.32235169410705566, + "step": 3376 + }, + { + "epoch": 0.03377, + "grad_norm": 0.41965579986572266, + "grad_norm_var": 0.0012318810906239096, + "learning_rate": 5e-05, + "loss": 0.2057, + "loss/crossentropy": 2.712504029273987, + "loss/hidden": 0.0, + "loss/logits": 0.2057042419910431, + "loss/reg": 0.3220829367637634, + "step": 3377 + }, + { + "epoch": 0.03378, + "grad_norm": 0.4628400504589081, + "grad_norm_var": 0.0010548794750419217, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.650109112262726, + "loss/hidden": 0.0, + "loss/logits": 0.2113422527909279, + "loss/reg": 0.3219255208969116, + "step": 3378 + }, + { + "epoch": 0.03379, + "grad_norm": 0.5326417088508606, + "grad_norm_var": 0.0012279869433879894, + "learning_rate": 5e-05, + "loss": 0.229, + "loss/crossentropy": 2.8088294863700867, + "loss/hidden": 0.0, + "loss/logits": 0.2289992719888687, + "loss/reg": 0.32161253690719604, + "step": 3379 + }, + { + "epoch": 0.0338, + "grad_norm": 0.4348560869693756, + "grad_norm_var": 0.0012936031401779827, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.955207943916321, + "loss/hidden": 0.0, + "loss/logits": 0.20012956112623215, + "loss/reg": 0.3212406635284424, + "step": 3380 + }, + { + "epoch": 0.03381, + "grad_norm": 0.4032558798789978, + "grad_norm_var": 0.0016849755720917958, + "learning_rate": 5e-05, + "loss": 0.1962, + "loss/crossentropy": 2.7673054933547974, + "loss/hidden": 0.0, + "loss/logits": 0.19616872817277908, + "loss/reg": 0.3210945725440979, + "step": 3381 + }, + { + "epoch": 0.03382, + "grad_norm": 0.44532230496406555, + "grad_norm_var": 0.0017494955972587286, + "learning_rate": 5e-05, + "loss": 0.2096, + "loss/crossentropy": 2.7581847310066223, + "loss/hidden": 0.0, + "loss/logits": 0.20961792767047882, + "loss/reg": 0.32048481702804565, + "step": 3382 + }, + { + "epoch": 0.03383, + "grad_norm": 0.4587815999984741, + "grad_norm_var": 0.0017600084895035692, + "learning_rate": 5e-05, + "loss": 0.2094, + "loss/crossentropy": 2.832453191280365, + "loss/hidden": 0.0, + "loss/logits": 0.20936638116836548, + "loss/reg": 0.32001957297325134, + "step": 3383 + }, + { + "epoch": 0.03384, + "grad_norm": 0.5860366821289062, + "grad_norm_var": 0.0025279140670777965, + "learning_rate": 5e-05, + "loss": 0.2319, + "loss/crossentropy": 2.872506022453308, + "loss/hidden": 0.0, + "loss/logits": 0.23190215602517128, + "loss/reg": 0.31983861327171326, + "step": 3384 + }, + { + "epoch": 0.03385, + "grad_norm": 0.41698506474494934, + "grad_norm_var": 0.002715429114408722, + "learning_rate": 5e-05, + "loss": 0.2009, + "loss/crossentropy": 2.61422598361969, + "loss/hidden": 0.0, + "loss/logits": 0.20087514072656631, + "loss/reg": 0.3194868564605713, + "step": 3385 + }, + { + "epoch": 0.03386, + "grad_norm": 0.41842615604400635, + "grad_norm_var": 0.0027594587424522293, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.7608628273010254, + "loss/hidden": 0.0, + "loss/logits": 0.20358722284436226, + "loss/reg": 0.31921035051345825, + "step": 3386 + }, + { + "epoch": 0.03387, + "grad_norm": 0.4228498339653015, + "grad_norm_var": 0.0028610736883493953, + "learning_rate": 5e-05, + "loss": 0.1945, + "loss/crossentropy": 2.813345491886139, + "loss/hidden": 0.0, + "loss/logits": 0.19449561834335327, + "loss/reg": 0.3186832368373871, + "step": 3387 + }, + { + "epoch": 0.03388, + "grad_norm": 0.46133795380592346, + "grad_norm_var": 0.0028620191344919223, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.90591299533844, + "loss/hidden": 0.0, + "loss/logits": 0.2113543152809143, + "loss/reg": 0.3185129761695862, + "step": 3388 + }, + { + "epoch": 0.03389, + "grad_norm": 0.45164352655410767, + "grad_norm_var": 0.002188126202315266, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.7424387335777283, + "loss/hidden": 0.0, + "loss/logits": 0.20469153299927711, + "loss/reg": 0.3183594048023224, + "step": 3389 + }, + { + "epoch": 0.0339, + "grad_norm": 0.43692922592163086, + "grad_norm_var": 0.0021941175520565626, + "learning_rate": 5e-05, + "loss": 0.1995, + "loss/crossentropy": 2.644283652305603, + "loss/hidden": 0.0, + "loss/logits": 0.19947988539934158, + "loss/reg": 0.31780046224594116, + "step": 3390 + }, + { + "epoch": 0.03391, + "grad_norm": 0.45439088344573975, + "grad_norm_var": 0.0021534169017868644, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.8124276399612427, + "loss/hidden": 0.0, + "loss/logits": 0.2081209048628807, + "loss/reg": 0.3175513744354248, + "step": 3391 + }, + { + "epoch": 0.03392, + "grad_norm": 0.47744810581207275, + "grad_norm_var": 0.002159912264599095, + "learning_rate": 5e-05, + "loss": 0.2106, + "loss/crossentropy": 2.922151744365692, + "loss/hidden": 0.0, + "loss/logits": 0.2106141746044159, + "loss/reg": 0.3173667788505554, + "step": 3392 + }, + { + "epoch": 0.03393, + "grad_norm": 0.43970972299575806, + "grad_norm_var": 0.0020899735990327906, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.807827055454254, + "loss/hidden": 0.0, + "loss/logits": 0.20739486068487167, + "loss/reg": 0.31717273592948914, + "step": 3393 + }, + { + "epoch": 0.03394, + "grad_norm": 0.3982374370098114, + "grad_norm_var": 0.002295912507401321, + "learning_rate": 5e-05, + "loss": 0.1928, + "loss/crossentropy": 2.823506474494934, + "loss/hidden": 0.0, + "loss/logits": 0.1928495615720749, + "loss/reg": 0.31674036383628845, + "step": 3394 + }, + { + "epoch": 0.03395, + "grad_norm": 0.4523276090621948, + "grad_norm_var": 0.001840090222162749, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.811570107936859, + "loss/hidden": 0.0, + "loss/logits": 0.20404096692800522, + "loss/reg": 0.3166060447692871, + "step": 3395 + }, + { + "epoch": 0.03396, + "grad_norm": 0.5185806155204773, + "grad_norm_var": 0.0021380750864246068, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.782260477542877, + "loss/hidden": 0.0, + "loss/logits": 0.2207183837890625, + "loss/reg": 0.31610777974128723, + "step": 3396 + }, + { + "epoch": 0.03397, + "grad_norm": 0.45550963282585144, + "grad_norm_var": 0.0019646512480855174, + "learning_rate": 5e-05, + "loss": 0.2246, + "loss/crossentropy": 2.7926118969917297, + "loss/hidden": 0.0, + "loss/logits": 0.22456150874495506, + "loss/reg": 0.31576007604599, + "step": 3397 + }, + { + "epoch": 0.03398, + "grad_norm": 0.4245316982269287, + "grad_norm_var": 0.002021009216530937, + "learning_rate": 5e-05, + "loss": 0.1973, + "loss/crossentropy": 2.8270943760871887, + "loss/hidden": 0.0, + "loss/logits": 0.1972883678972721, + "loss/reg": 0.3156219720840454, + "step": 3398 + }, + { + "epoch": 0.03399, + "grad_norm": 0.42613276839256287, + "grad_norm_var": 0.002069461819859634, + "learning_rate": 5e-05, + "loss": 0.202, + "loss/crossentropy": 2.813815712928772, + "loss/hidden": 0.0, + "loss/logits": 0.20200231671333313, + "loss/reg": 0.3149683177471161, + "step": 3399 + }, + { + "epoch": 0.034, + "grad_norm": 0.4285763204097748, + "grad_norm_var": 0.0008169206920887333, + "learning_rate": 5e-05, + "loss": 0.2074, + "loss/crossentropy": 2.8352705240249634, + "loss/hidden": 0.0, + "loss/logits": 0.20739219710230827, + "loss/reg": 0.31450355052948, + "step": 3400 + }, + { + "epoch": 0.03401, + "grad_norm": 0.42315489053726196, + "grad_norm_var": 0.0007981242239603763, + "learning_rate": 5e-05, + "loss": 0.1917, + "loss/crossentropy": 2.8452664017677307, + "loss/hidden": 0.0, + "loss/logits": 0.19169894233345985, + "loss/reg": 0.31386011838912964, + "step": 3401 + }, + { + "epoch": 0.03402, + "grad_norm": 0.47341740131378174, + "grad_norm_var": 0.0008061284085377356, + "learning_rate": 5e-05, + "loss": 0.2239, + "loss/crossentropy": 2.9343382120132446, + "loss/hidden": 0.0, + "loss/logits": 0.22390039265155792, + "loss/reg": 0.31361958384513855, + "step": 3402 + }, + { + "epoch": 0.03403, + "grad_norm": 0.49255242943763733, + "grad_norm_var": 0.0008895328984112272, + "learning_rate": 5e-05, + "loss": 0.2262, + "loss/crossentropy": 2.849633574485779, + "loss/hidden": 0.0, + "loss/logits": 0.22615572437644005, + "loss/reg": 0.31315940618515015, + "step": 3403 + }, + { + "epoch": 0.03404, + "grad_norm": 0.42623665928840637, + "grad_norm_var": 0.0009177112433719584, + "learning_rate": 5e-05, + "loss": 0.2039, + "loss/crossentropy": 2.730575919151306, + "loss/hidden": 0.0, + "loss/logits": 0.2039080299437046, + "loss/reg": 0.31247958540916443, + "step": 3404 + }, + { + "epoch": 0.03405, + "grad_norm": 0.48908570408821106, + "grad_norm_var": 0.0010199701453092087, + "learning_rate": 5e-05, + "loss": 0.2188, + "loss/crossentropy": 2.7119243144989014, + "loss/hidden": 0.0, + "loss/logits": 0.2188262678682804, + "loss/reg": 0.31157541275024414, + "step": 3405 + }, + { + "epoch": 0.03406, + "grad_norm": 0.4407983422279358, + "grad_norm_var": 0.0010136204380307454, + "learning_rate": 5e-05, + "loss": 0.2073, + "loss/crossentropy": 2.914222776889801, + "loss/hidden": 0.0, + "loss/logits": 0.20734045654535294, + "loss/reg": 0.31122690439224243, + "step": 3406 + }, + { + "epoch": 0.03407, + "grad_norm": 0.43865782022476196, + "grad_norm_var": 0.0010225927495846462, + "learning_rate": 5e-05, + "loss": 0.2139, + "loss/crossentropy": 2.724530816078186, + "loss/hidden": 0.0, + "loss/logits": 0.2138579562306404, + "loss/reg": 0.3104621171951294, + "step": 3407 + }, + { + "epoch": 0.03408, + "grad_norm": 0.4395671784877777, + "grad_norm_var": 0.0009752082613096864, + "learning_rate": 5e-05, + "loss": 0.2065, + "loss/crossentropy": 2.826694130897522, + "loss/hidden": 0.0, + "loss/logits": 0.20651047304272652, + "loss/reg": 0.3103015422821045, + "step": 3408 + }, + { + "epoch": 0.03409, + "grad_norm": 0.5002748966217041, + "grad_norm_var": 0.0011379863209039906, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.9579413533210754, + "loss/hidden": 0.0, + "loss/logits": 0.2036524973809719, + "loss/reg": 0.3099273145198822, + "step": 3409 + }, + { + "epoch": 0.0341, + "grad_norm": 0.4922187924385071, + "grad_norm_var": 0.0010197403533521313, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.8619267344474792, + "loss/hidden": 0.0, + "loss/logits": 0.21130403876304626, + "loss/reg": 0.3094329833984375, + "step": 3410 + }, + { + "epoch": 0.03411, + "grad_norm": 0.49935105443000793, + "grad_norm_var": 0.0011248749125182035, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 2.864948809146881, + "loss/hidden": 0.0, + "loss/logits": 0.2169676013290882, + "loss/reg": 0.30929726362228394, + "step": 3411 + }, + { + "epoch": 0.03412, + "grad_norm": 0.470816045999527, + "grad_norm_var": 0.0008978302614703856, + "learning_rate": 5e-05, + "loss": 0.2189, + "loss/crossentropy": 2.8520806431770325, + "loss/hidden": 0.0, + "loss/logits": 0.21886668354272842, + "loss/reg": 0.3090590536594391, + "step": 3412 + }, + { + "epoch": 0.03413, + "grad_norm": 0.5266633629798889, + "grad_norm_var": 0.001194852890343808, + "learning_rate": 5e-05, + "loss": 0.2265, + "loss/crossentropy": 2.6757559180259705, + "loss/hidden": 0.0, + "loss/logits": 0.22648253664374352, + "loss/reg": 0.3089248538017273, + "step": 3413 + }, + { + "epoch": 0.03414, + "grad_norm": 0.5027031898498535, + "grad_norm_var": 0.0011862266621379901, + "learning_rate": 5e-05, + "loss": 0.2209, + "loss/crossentropy": 2.7924565076828003, + "loss/hidden": 0.0, + "loss/logits": 0.2208872176706791, + "loss/reg": 0.3087013363838196, + "step": 3414 + }, + { + "epoch": 0.03415, + "grad_norm": 0.4646728038787842, + "grad_norm_var": 0.0010696326822876244, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.6517043709754944, + "loss/hidden": 0.0, + "loss/logits": 0.2053392417728901, + "loss/reg": 0.3086439073085785, + "step": 3415 + }, + { + "epoch": 0.03416, + "grad_norm": 0.46045663952827454, + "grad_norm_var": 0.0009600644429555142, + "learning_rate": 5e-05, + "loss": 0.2115, + "loss/crossentropy": 2.6647701263427734, + "loss/hidden": 0.0, + "loss/logits": 0.21149789541959763, + "loss/reg": 0.3087012767791748, + "step": 3416 + }, + { + "epoch": 0.03417, + "grad_norm": 0.4192192852497101, + "grad_norm_var": 0.0009862908575162847, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.7838265895843506, + "loss/hidden": 0.0, + "loss/logits": 0.2070765271782875, + "loss/reg": 0.3082455098628998, + "step": 3417 + }, + { + "epoch": 0.03418, + "grad_norm": 0.46196916699409485, + "grad_norm_var": 0.0009908582205070146, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.772991180419922, + "loss/hidden": 0.0, + "loss/logits": 0.20799624174833298, + "loss/reg": 0.3079775869846344, + "step": 3418 + }, + { + "epoch": 0.03419, + "grad_norm": 0.44500529766082764, + "grad_norm_var": 0.000991257612074851, + "learning_rate": 5e-05, + "loss": 0.1955, + "loss/crossentropy": 2.7502389550209045, + "loss/hidden": 0.0, + "loss/logits": 0.19546188786625862, + "loss/reg": 0.3073209524154663, + "step": 3419 + }, + { + "epoch": 0.0342, + "grad_norm": 0.4305124580860138, + "grad_norm_var": 0.0009689578533085423, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.8956750631332397, + "loss/hidden": 0.0, + "loss/logits": 0.21187294647097588, + "loss/reg": 0.3071844279766083, + "step": 3420 + }, + { + "epoch": 0.03421, + "grad_norm": 0.4897916913032532, + "grad_norm_var": 0.0009710092999609566, + "learning_rate": 5e-05, + "loss": 0.2334, + "loss/crossentropy": 2.814716875553131, + "loss/hidden": 0.0, + "loss/logits": 0.23338667303323746, + "loss/reg": 0.3068380355834961, + "step": 3421 + }, + { + "epoch": 0.03422, + "grad_norm": 0.42954736948013306, + "grad_norm_var": 0.001019227860694516, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.6915127635002136, + "loss/hidden": 0.0, + "loss/logits": 0.19869909808039665, + "loss/reg": 0.306535929441452, + "step": 3422 + }, + { + "epoch": 0.03423, + "grad_norm": 0.44614943861961365, + "grad_norm_var": 0.0009944608954388793, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.912255346775055, + "loss/hidden": 0.0, + "loss/logits": 0.20497187972068787, + "loss/reg": 0.3060789704322815, + "step": 3423 + }, + { + "epoch": 0.03424, + "grad_norm": 0.45084381103515625, + "grad_norm_var": 0.0009605117406699374, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.655648410320282, + "loss/hidden": 0.0, + "loss/logits": 0.2014988511800766, + "loss/reg": 0.30580899119377136, + "step": 3424 + }, + { + "epoch": 0.03425, + "grad_norm": 0.4769604206085205, + "grad_norm_var": 0.000894581419457489, + "learning_rate": 5e-05, + "loss": 0.2383, + "loss/crossentropy": 2.8703418970108032, + "loss/hidden": 0.0, + "loss/logits": 0.23831868916749954, + "loss/reg": 0.3054695725440979, + "step": 3425 + }, + { + "epoch": 0.03426, + "grad_norm": 0.40381941199302673, + "grad_norm_var": 0.0010819700596125248, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.8884570598602295, + "loss/hidden": 0.0, + "loss/logits": 0.1898237019777298, + "loss/reg": 0.3050405979156494, + "step": 3426 + }, + { + "epoch": 0.03427, + "grad_norm": 0.4384555518627167, + "grad_norm_var": 0.0010036081264082953, + "learning_rate": 5e-05, + "loss": 0.2162, + "loss/crossentropy": 2.7600648999214172, + "loss/hidden": 0.0, + "loss/logits": 0.21617481485009193, + "loss/reg": 0.3046659529209137, + "step": 3427 + }, + { + "epoch": 0.03428, + "grad_norm": 0.4599361717700958, + "grad_norm_var": 0.0009914705628220944, + "learning_rate": 5e-05, + "loss": 0.2186, + "loss/crossentropy": 3.006561815738678, + "loss/hidden": 0.0, + "loss/logits": 0.21857880055904388, + "loss/reg": 0.30446070432662964, + "step": 3428 + }, + { + "epoch": 0.03429, + "grad_norm": 0.46557021141052246, + "grad_norm_var": 0.0006545881099752737, + "learning_rate": 5e-05, + "loss": 0.2244, + "loss/crossentropy": 2.826019763946533, + "loss/hidden": 0.0, + "loss/logits": 0.2243686430156231, + "loss/reg": 0.30385369062423706, + "step": 3429 + }, + { + "epoch": 0.0343, + "grad_norm": 0.43080607056617737, + "grad_norm_var": 0.0004997638206038564, + "learning_rate": 5e-05, + "loss": 0.2111, + "loss/crossentropy": 2.8396846652030945, + "loss/hidden": 0.0, + "loss/logits": 0.21108417212963104, + "loss/reg": 0.3035240173339844, + "step": 3430 + }, + { + "epoch": 0.03431, + "grad_norm": 0.40994423627853394, + "grad_norm_var": 0.0005679078172656819, + "learning_rate": 5e-05, + "loss": 0.1913, + "loss/crossentropy": 2.858437180519104, + "loss/hidden": 0.0, + "loss/logits": 0.1913478523492813, + "loss/reg": 0.3034055829048157, + "step": 3431 + }, + { + "epoch": 0.03432, + "grad_norm": 0.4852507412433624, + "grad_norm_var": 0.0006576365892895712, + "learning_rate": 5e-05, + "loss": 0.214, + "loss/crossentropy": 2.7699002623558044, + "loss/hidden": 0.0, + "loss/logits": 0.21401474624872208, + "loss/reg": 0.30310121178627014, + "step": 3432 + }, + { + "epoch": 0.03433, + "grad_norm": 0.4605794847011566, + "grad_norm_var": 0.0006141838138783921, + "learning_rate": 5e-05, + "loss": 0.1976, + "loss/crossentropy": 2.939629375934601, + "loss/hidden": 0.0, + "loss/logits": 0.1975945234298706, + "loss/reg": 0.3030165731906891, + "step": 3433 + }, + { + "epoch": 0.03434, + "grad_norm": 0.42524921894073486, + "grad_norm_var": 0.000635308332459165, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.751047372817993, + "loss/hidden": 0.0, + "loss/logits": 0.2048589624464512, + "loss/reg": 0.3027656078338623, + "step": 3434 + }, + { + "epoch": 0.03435, + "grad_norm": 0.4185696840286255, + "grad_norm_var": 0.0006852284475830898, + "learning_rate": 5e-05, + "loss": 0.2057, + "loss/crossentropy": 2.824570417404175, + "loss/hidden": 0.0, + "loss/logits": 0.20569567009806633, + "loss/reg": 0.3026576340198517, + "step": 3435 + }, + { + "epoch": 0.03436, + "grad_norm": 0.5146148204803467, + "grad_norm_var": 0.000963453833145537, + "learning_rate": 5e-05, + "loss": 0.2159, + "loss/crossentropy": 2.7935361862182617, + "loss/hidden": 0.0, + "loss/logits": 0.21587760001420975, + "loss/reg": 0.30234214663505554, + "step": 3436 + }, + { + "epoch": 0.03437, + "grad_norm": 0.42939940094947815, + "grad_norm_var": 0.0008740548495021372, + "learning_rate": 5e-05, + "loss": 0.212, + "loss/crossentropy": 2.8025014400482178, + "loss/hidden": 0.0, + "loss/logits": 0.21195732429623604, + "loss/reg": 0.30183935165405273, + "step": 3437 + }, + { + "epoch": 0.03438, + "grad_norm": 0.450013667345047, + "grad_norm_var": 0.0008536839078200982, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.8855369091033936, + "loss/hidden": 0.0, + "loss/logits": 0.2134605087339878, + "loss/reg": 0.30164408683776855, + "step": 3438 + }, + { + "epoch": 0.03439, + "grad_norm": 0.4308350384235382, + "grad_norm_var": 0.0008718862625594034, + "learning_rate": 5e-05, + "loss": 0.2063, + "loss/crossentropy": 2.697295129299164, + "loss/hidden": 0.0, + "loss/logits": 0.20631689578294754, + "loss/reg": 0.30145496129989624, + "step": 3439 + }, + { + "epoch": 0.0344, + "grad_norm": 0.42222005128860474, + "grad_norm_var": 0.0009081490271103498, + "learning_rate": 5e-05, + "loss": 0.1955, + "loss/crossentropy": 2.838318943977356, + "loss/hidden": 0.0, + "loss/logits": 0.19551560655236244, + "loss/reg": 0.30134761333465576, + "step": 3440 + }, + { + "epoch": 0.03441, + "grad_norm": 0.4328944683074951, + "grad_norm_var": 0.0008425466097243039, + "learning_rate": 5e-05, + "loss": 0.2182, + "loss/crossentropy": 2.9234573245048523, + "loss/hidden": 0.0, + "loss/logits": 0.21824132651090622, + "loss/reg": 0.3013145327568054, + "step": 3441 + }, + { + "epoch": 0.03442, + "grad_norm": 0.421405166387558, + "grad_norm_var": 0.0007714482136897183, + "learning_rate": 5e-05, + "loss": 0.1979, + "loss/crossentropy": 2.8316492438316345, + "loss/hidden": 0.0, + "loss/logits": 0.19791367277503014, + "loss/reg": 0.3008849322795868, + "step": 3442 + }, + { + "epoch": 0.03443, + "grad_norm": 0.5067538022994995, + "grad_norm_var": 0.0010171976830754658, + "learning_rate": 5e-05, + "loss": 0.2222, + "loss/crossentropy": 2.8968807458877563, + "loss/hidden": 0.0, + "loss/logits": 0.22221988812088966, + "loss/reg": 0.3007335960865021, + "step": 3443 + }, + { + "epoch": 0.03444, + "grad_norm": 0.42312657833099365, + "grad_norm_var": 0.0010420857034547759, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.883744537830353, + "loss/hidden": 0.0, + "loss/logits": 0.19581160321831703, + "loss/reg": 0.30008652806282043, + "step": 3444 + }, + { + "epoch": 0.03445, + "grad_norm": 0.541495144367218, + "grad_norm_var": 0.0016060356935209464, + "learning_rate": 5e-05, + "loss": 0.2425, + "loss/crossentropy": 2.9303057193756104, + "loss/hidden": 0.0, + "loss/logits": 0.24246376380324364, + "loss/reg": 0.29996445775032043, + "step": 3445 + }, + { + "epoch": 0.03446, + "grad_norm": 0.4337821900844574, + "grad_norm_var": 0.0015988945059346707, + "learning_rate": 5e-05, + "loss": 0.2075, + "loss/crossentropy": 2.8507127165794373, + "loss/hidden": 0.0, + "loss/logits": 0.20749986544251442, + "loss/reg": 0.2996787428855896, + "step": 3446 + }, + { + "epoch": 0.03447, + "grad_norm": 0.5210930705070496, + "grad_norm_var": 0.0017717219991186916, + "learning_rate": 5e-05, + "loss": 0.2243, + "loss/crossentropy": 2.820759952068329, + "loss/hidden": 0.0, + "loss/logits": 0.22433586418628693, + "loss/reg": 0.29939916729927063, + "step": 3447 + }, + { + "epoch": 0.03448, + "grad_norm": 0.5004349946975708, + "grad_norm_var": 0.0018426591912261703, + "learning_rate": 5e-05, + "loss": 0.2256, + "loss/crossentropy": 2.783009886741638, + "loss/hidden": 0.0, + "loss/logits": 0.22562990710139275, + "loss/reg": 0.2992880046367645, + "step": 3448 + }, + { + "epoch": 0.03449, + "grad_norm": 0.45130056142807007, + "grad_norm_var": 0.0018451944212075078, + "learning_rate": 5e-05, + "loss": 0.2525, + "loss/crossentropy": 2.8177643418312073, + "loss/hidden": 0.0, + "loss/logits": 0.2524820938706398, + "loss/reg": 0.29873955249786377, + "step": 3449 + }, + { + "epoch": 0.0345, + "grad_norm": 0.5856472253799438, + "grad_norm_var": 0.00275917526843887, + "learning_rate": 5e-05, + "loss": 0.2413, + "loss/crossentropy": 2.71544873714447, + "loss/hidden": 0.0, + "loss/logits": 0.241291094571352, + "loss/reg": 0.29853007197380066, + "step": 3450 + }, + { + "epoch": 0.03451, + "grad_norm": 0.4948863685131073, + "grad_norm_var": 0.0026230162637226923, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.785673975944519, + "loss/hidden": 0.0, + "loss/logits": 0.21104982122778893, + "loss/reg": 0.298255980014801, + "step": 3451 + }, + { + "epoch": 0.03452, + "grad_norm": 0.4726856052875519, + "grad_norm_var": 0.0024974153901039407, + "learning_rate": 5e-05, + "loss": 0.2161, + "loss/crossentropy": 2.9387874603271484, + "loss/hidden": 0.0, + "loss/logits": 0.21610870957374573, + "loss/reg": 0.2981424331665039, + "step": 3452 + }, + { + "epoch": 0.03453, + "grad_norm": 0.4319626986980438, + "grad_norm_var": 0.0024839931474235242, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.9254860877990723, + "loss/hidden": 0.0, + "loss/logits": 0.2017272412776947, + "loss/reg": 0.29804694652557373, + "step": 3453 + }, + { + "epoch": 0.03454, + "grad_norm": 0.4521547555923462, + "grad_norm_var": 0.0024785644218842475, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.9576077461242676, + "loss/hidden": 0.0, + "loss/logits": 0.19583330303430557, + "loss/reg": 0.2978787124156952, + "step": 3454 + }, + { + "epoch": 0.03455, + "grad_norm": 0.5148643255233765, + "grad_norm_var": 0.00247919642606211, + "learning_rate": 5e-05, + "loss": 0.2191, + "loss/crossentropy": 2.880778670310974, + "loss/hidden": 0.0, + "loss/logits": 0.21905693784356117, + "loss/reg": 0.29765868186950684, + "step": 3455 + }, + { + "epoch": 0.03456, + "grad_norm": 0.5567469000816345, + "grad_norm_var": 0.0026560601968747322, + "learning_rate": 5e-05, + "loss": 0.2189, + "loss/crossentropy": 2.837165892124176, + "loss/hidden": 0.0, + "loss/logits": 0.21889247000217438, + "loss/reg": 0.2977781593799591, + "step": 3456 + }, + { + "epoch": 0.03457, + "grad_norm": 0.43610498309135437, + "grad_norm_var": 0.0026349017405923546, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.798915207386017, + "loss/hidden": 0.0, + "loss/logits": 0.20126622542738914, + "loss/reg": 0.29761630296707153, + "step": 3457 + }, + { + "epoch": 0.03458, + "grad_norm": 0.4674636721611023, + "grad_norm_var": 0.0023829145367567526, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.8675752878189087, + "loss/hidden": 0.0, + "loss/logits": 0.20763374119997025, + "loss/reg": 0.2974594533443451, + "step": 3458 + }, + { + "epoch": 0.03459, + "grad_norm": 0.43675512075424194, + "grad_norm_var": 0.002503914345664671, + "learning_rate": 5e-05, + "loss": 0.2025, + "loss/crossentropy": 2.8322522044181824, + "loss/hidden": 0.0, + "loss/logits": 0.20250633358955383, + "loss/reg": 0.2974369525909424, + "step": 3459 + }, + { + "epoch": 0.0346, + "grad_norm": 0.43477073311805725, + "grad_norm_var": 0.0024201591260153863, + "learning_rate": 5e-05, + "loss": 0.1945, + "loss/crossentropy": 2.899631440639496, + "loss/hidden": 0.0, + "loss/logits": 0.19445638731122017, + "loss/reg": 0.29706504940986633, + "step": 3460 + }, + { + "epoch": 0.03461, + "grad_norm": 0.4307887554168701, + "grad_norm_var": 0.0023265420732093032, + "learning_rate": 5e-05, + "loss": 0.1973, + "loss/crossentropy": 2.784806191921234, + "loss/hidden": 0.0, + "loss/logits": 0.19730350375175476, + "loss/reg": 0.29722949862480164, + "step": 3461 + }, + { + "epoch": 0.03462, + "grad_norm": 0.47160109877586365, + "grad_norm_var": 0.002201334664334394, + "learning_rate": 5e-05, + "loss": 0.2215, + "loss/crossentropy": 2.720271408557892, + "loss/hidden": 0.0, + "loss/logits": 0.22154288738965988, + "loss/reg": 0.2970200479030609, + "step": 3462 + }, + { + "epoch": 0.03463, + "grad_norm": 0.4421418309211731, + "grad_norm_var": 0.0021446911223680586, + "learning_rate": 5e-05, + "loss": 0.2367, + "loss/crossentropy": 2.6709551215171814, + "loss/hidden": 0.0, + "loss/logits": 0.2366904504597187, + "loss/reg": 0.2970776855945587, + "step": 3463 + }, + { + "epoch": 0.03464, + "grad_norm": 0.48807594180107117, + "grad_norm_var": 0.0021102961470697936, + "learning_rate": 5e-05, + "loss": 0.2209, + "loss/crossentropy": 2.819739878177643, + "loss/hidden": 0.0, + "loss/logits": 0.22092506289482117, + "loss/reg": 0.2969701886177063, + "step": 3464 + }, + { + "epoch": 0.03465, + "grad_norm": 0.44153526425361633, + "grad_norm_var": 0.0021445057182433985, + "learning_rate": 5e-05, + "loss": 0.2122, + "loss/crossentropy": 3.0078428387641907, + "loss/hidden": 0.0, + "loss/logits": 0.2122441940009594, + "loss/reg": 0.29687702655792236, + "step": 3465 + }, + { + "epoch": 0.03466, + "grad_norm": 0.4897230863571167, + "grad_norm_var": 0.0012710050932615827, + "learning_rate": 5e-05, + "loss": 0.2208, + "loss/crossentropy": 2.8594778180122375, + "loss/hidden": 0.0, + "loss/logits": 0.22082935273647308, + "loss/reg": 0.29667195677757263, + "step": 3466 + }, + { + "epoch": 0.03467, + "grad_norm": 0.44513434171676636, + "grad_norm_var": 0.0012366842567187934, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.8085198998451233, + "loss/hidden": 0.0, + "loss/logits": 0.21140417456626892, + "loss/reg": 0.2966107726097107, + "step": 3467 + }, + { + "epoch": 0.03468, + "grad_norm": 0.49435728788375854, + "grad_norm_var": 0.0012932109020369934, + "learning_rate": 5e-05, + "loss": 0.2265, + "loss/crossentropy": 2.7659301161766052, + "loss/hidden": 0.0, + "loss/logits": 0.22649891674518585, + "loss/reg": 0.2964695394039154, + "step": 3468 + }, + { + "epoch": 0.03469, + "grad_norm": 0.4500627815723419, + "grad_norm_var": 0.0012348340629394865, + "learning_rate": 5e-05, + "loss": 0.2217, + "loss/crossentropy": 2.7386192679405212, + "loss/hidden": 0.0, + "loss/logits": 0.22168244794011116, + "loss/reg": 0.2962060272693634, + "step": 3469 + }, + { + "epoch": 0.0347, + "grad_norm": 0.4220843017101288, + "grad_norm_var": 0.0013459276492261752, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.8527488708496094, + "loss/hidden": 0.0, + "loss/logits": 0.20451489463448524, + "loss/reg": 0.2962695360183716, + "step": 3470 + }, + { + "epoch": 0.03471, + "grad_norm": 0.4673876464366913, + "grad_norm_var": 0.0011641142467220403, + "learning_rate": 5e-05, + "loss": 0.2886, + "loss/crossentropy": 2.814348518848419, + "loss/hidden": 0.0, + "loss/logits": 0.28858909010887146, + "loss/reg": 0.29623720049858093, + "step": 3471 + }, + { + "epoch": 0.03472, + "grad_norm": 0.43469342589378357, + "grad_norm_var": 0.0005357264113342179, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.7805095911026, + "loss/hidden": 0.0, + "loss/logits": 0.20319537073373795, + "loss/reg": 0.2962431311607361, + "step": 3472 + }, + { + "epoch": 0.03473, + "grad_norm": 0.5111095905303955, + "grad_norm_var": 0.0007154462096652973, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.8466604351997375, + "loss/hidden": 0.0, + "loss/logits": 0.21639756858348846, + "loss/reg": 0.29607081413269043, + "step": 3473 + }, + { + "epoch": 0.03474, + "grad_norm": 0.44391965866088867, + "grad_norm_var": 0.0007203210419972183, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.8813791275024414, + "loss/hidden": 0.0, + "loss/logits": 0.2063780166208744, + "loss/reg": 0.2959335446357727, + "step": 3474 + }, + { + "epoch": 0.03475, + "grad_norm": 0.43357813358306885, + "grad_norm_var": 0.0007293194964468341, + "learning_rate": 5e-05, + "loss": 0.2072, + "loss/crossentropy": 2.7121651768684387, + "loss/hidden": 0.0, + "loss/logits": 0.2071615345776081, + "loss/reg": 0.2960609495639801, + "step": 3475 + }, + { + "epoch": 0.03476, + "grad_norm": 0.5286844372749329, + "grad_norm_var": 0.0010108419718928396, + "learning_rate": 5e-05, + "loss": 0.2381, + "loss/crossentropy": 2.7287914156913757, + "loss/hidden": 0.0, + "loss/logits": 0.23806891217827797, + "loss/reg": 0.2959684133529663, + "step": 3476 + }, + { + "epoch": 0.03477, + "grad_norm": 0.44730639457702637, + "grad_norm_var": 0.0009587597630538291, + "learning_rate": 5e-05, + "loss": 0.2055, + "loss/crossentropy": 2.8350663781166077, + "loss/hidden": 0.0, + "loss/logits": 0.20549916103482246, + "loss/reg": 0.2957552373409271, + "step": 3477 + }, + { + "epoch": 0.03478, + "grad_norm": 0.4753485918045044, + "grad_norm_var": 0.0009638291391591113, + "learning_rate": 5e-05, + "loss": 0.2176, + "loss/crossentropy": 2.9086169600486755, + "loss/hidden": 0.0, + "loss/logits": 0.2176027111709118, + "loss/reg": 0.29548409581184387, + "step": 3478 + }, + { + "epoch": 0.03479, + "grad_norm": 0.4515187740325928, + "grad_norm_var": 0.000942688324959858, + "learning_rate": 5e-05, + "loss": 0.2069, + "loss/crossentropy": 2.721864342689514, + "loss/hidden": 0.0, + "loss/logits": 0.20685727521777153, + "loss/reg": 0.29500463604927063, + "step": 3479 + }, + { + "epoch": 0.0348, + "grad_norm": 0.43227124214172363, + "grad_norm_var": 0.0009584251743306423, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.771520972251892, + "loss/hidden": 0.0, + "loss/logits": 0.21787375584244728, + "loss/reg": 0.2949616014957428, + "step": 3480 + }, + { + "epoch": 0.03481, + "grad_norm": 0.46259137988090515, + "grad_norm_var": 0.0009327665667722844, + "learning_rate": 5e-05, + "loss": 0.2172, + "loss/crossentropy": 2.818275570869446, + "loss/hidden": 0.0, + "loss/logits": 0.21722986549139023, + "loss/reg": 0.2947467565536499, + "step": 3481 + }, + { + "epoch": 0.03482, + "grad_norm": 0.4369131624698639, + "grad_norm_var": 0.0009108839481129986, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.760987341403961, + "loss/hidden": 0.0, + "loss/logits": 0.2092873677611351, + "loss/reg": 0.2945036292076111, + "step": 3482 + }, + { + "epoch": 0.03483, + "grad_norm": 0.4489278197288513, + "grad_norm_var": 0.0009049926578683791, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.781856119632721, + "loss/hidden": 0.0, + "loss/logits": 0.2098829410970211, + "loss/reg": 0.29415830969810486, + "step": 3483 + }, + { + "epoch": 0.03484, + "grad_norm": 0.4363648593425751, + "grad_norm_var": 0.0008402253779630701, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.8522695302963257, + "loss/hidden": 0.0, + "loss/logits": 0.20283931866288185, + "loss/reg": 0.2938477694988251, + "step": 3484 + }, + { + "epoch": 0.03485, + "grad_norm": 0.5259154438972473, + "grad_norm_var": 0.0011481475368901872, + "learning_rate": 5e-05, + "loss": 0.2276, + "loss/crossentropy": 2.770556330680847, + "loss/hidden": 0.0, + "loss/logits": 0.22764533013105392, + "loss/reg": 0.2940202057361603, + "step": 3485 + }, + { + "epoch": 0.03486, + "grad_norm": 0.4551275074481964, + "grad_norm_var": 0.0010497222930002438, + "learning_rate": 5e-05, + "loss": 0.215, + "loss/crossentropy": 2.852599620819092, + "loss/hidden": 0.0, + "loss/logits": 0.2150140292942524, + "loss/reg": 0.29384151101112366, + "step": 3486 + }, + { + "epoch": 0.03487, + "grad_norm": 0.8854374289512634, + "grad_norm_var": 0.012274072046449307, + "learning_rate": 5e-05, + "loss": 0.2416, + "loss/crossentropy": 2.8396798372268677, + "loss/hidden": 0.0, + "loss/logits": 0.24163749068975449, + "loss/reg": 0.29383984208106995, + "step": 3487 + }, + { + "epoch": 0.03488, + "grad_norm": 0.5152629613876343, + "grad_norm_var": 0.012105989546798195, + "learning_rate": 5e-05, + "loss": 0.2308, + "loss/crossentropy": 2.7422037720680237, + "loss/hidden": 0.0, + "loss/logits": 0.23080037906765938, + "loss/reg": 0.29374125599861145, + "step": 3488 + }, + { + "epoch": 0.03489, + "grad_norm": 0.42561081051826477, + "grad_norm_var": 0.012358043361188165, + "learning_rate": 5e-05, + "loss": 0.2056, + "loss/crossentropy": 2.83659166097641, + "loss/hidden": 0.0, + "loss/logits": 0.2056441865861416, + "loss/reg": 0.29350730776786804, + "step": 3489 + }, + { + "epoch": 0.0349, + "grad_norm": 0.4867115318775177, + "grad_norm_var": 0.012222134582438437, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.8425204157829285, + "loss/hidden": 0.0, + "loss/logits": 0.22071266919374466, + "loss/reg": 0.29361405968666077, + "step": 3490 + }, + { + "epoch": 0.03491, + "grad_norm": 0.4842241406440735, + "grad_norm_var": 0.011998247455658273, + "learning_rate": 5e-05, + "loss": 0.2148, + "loss/crossentropy": 2.7573837637901306, + "loss/hidden": 0.0, + "loss/logits": 0.2148086130619049, + "loss/reg": 0.2931942939758301, + "step": 3491 + }, + { + "epoch": 0.03492, + "grad_norm": 0.5524334907531738, + "grad_norm_var": 0.012144472834270307, + "learning_rate": 5e-05, + "loss": 0.226, + "loss/crossentropy": 2.7822689414024353, + "loss/hidden": 0.0, + "loss/logits": 0.22597333788871765, + "loss/reg": 0.29317212104797363, + "step": 3492 + }, + { + "epoch": 0.03493, + "grad_norm": 0.4339727759361267, + "grad_norm_var": 0.012240593265021063, + "learning_rate": 5e-05, + "loss": 0.2182, + "loss/crossentropy": 2.8865193724632263, + "loss/hidden": 0.0, + "loss/logits": 0.2182406187057495, + "loss/reg": 0.2929086685180664, + "step": 3493 + }, + { + "epoch": 0.03494, + "grad_norm": 0.47786369919776917, + "grad_norm_var": 0.012234636837907356, + "learning_rate": 5e-05, + "loss": 0.2106, + "loss/crossentropy": 2.762267529964447, + "loss/hidden": 0.0, + "loss/logits": 0.21059424430131912, + "loss/reg": 0.2927929162979126, + "step": 3494 + }, + { + "epoch": 0.03495, + "grad_norm": 0.46272894740104675, + "grad_norm_var": 0.01217832717081458, + "learning_rate": 5e-05, + "loss": 0.2204, + "loss/crossentropy": 2.894877016544342, + "loss/hidden": 0.0, + "loss/logits": 0.2204066440463066, + "loss/reg": 0.29245835542678833, + "step": 3495 + }, + { + "epoch": 0.03496, + "grad_norm": 0.6182472109794617, + "grad_norm_var": 0.012780893089770713, + "learning_rate": 5e-05, + "loss": 0.2282, + "loss/crossentropy": 2.807343602180481, + "loss/hidden": 0.0, + "loss/logits": 0.22821874916553497, + "loss/reg": 0.2924083173274994, + "step": 3496 + }, + { + "epoch": 0.03497, + "grad_norm": 0.461630254983902, + "grad_norm_var": 0.012786612419885775, + "learning_rate": 5e-05, + "loss": 0.213, + "loss/crossentropy": 2.9110602736473083, + "loss/hidden": 0.0, + "loss/logits": 0.21296852827072144, + "loss/reg": 0.2920280992984772, + "step": 3497 + }, + { + "epoch": 0.03498, + "grad_norm": 0.5834048986434937, + "grad_norm_var": 0.012764549025198645, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.86440771818161, + "loss/hidden": 0.0, + "loss/logits": 0.2077944613993168, + "loss/reg": 0.29135945439338684, + "step": 3498 + }, + { + "epoch": 0.03499, + "grad_norm": 0.6049054861068726, + "grad_norm_var": 0.012892988615956969, + "learning_rate": 5e-05, + "loss": 0.2413, + "loss/crossentropy": 2.6971964836120605, + "loss/hidden": 0.0, + "loss/logits": 0.2413032315671444, + "loss/reg": 0.2911268472671509, + "step": 3499 + }, + { + "epoch": 0.035, + "grad_norm": 0.46097418665885925, + "grad_norm_var": 0.012637988049330284, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.9122344255447388, + "loss/hidden": 0.0, + "loss/logits": 0.19637194648385048, + "loss/reg": 0.29109621047973633, + "step": 3500 + }, + { + "epoch": 0.03501, + "grad_norm": 0.5647832155227661, + "grad_norm_var": 0.01272599265677083, + "learning_rate": 5e-05, + "loss": 0.2446, + "loss/crossentropy": 3.0184065103530884, + "loss/hidden": 0.0, + "loss/logits": 0.24457957223057747, + "loss/reg": 0.2909912168979645, + "step": 3501 + }, + { + "epoch": 0.03502, + "grad_norm": 0.6037790179252625, + "grad_norm_var": 0.012631360982239966, + "learning_rate": 5e-05, + "loss": 0.2287, + "loss/crossentropy": 2.7291190028190613, + "loss/hidden": 0.0, + "loss/logits": 0.22873178124427795, + "loss/reg": 0.29077935218811035, + "step": 3502 + }, + { + "epoch": 0.03503, + "grad_norm": 0.46327149868011475, + "grad_norm_var": 0.004262680156909212, + "learning_rate": 5e-05, + "loss": 0.2124, + "loss/crossentropy": 2.710666835308075, + "loss/hidden": 0.0, + "loss/logits": 0.21240443363785744, + "loss/reg": 0.2905091643333435, + "step": 3503 + }, + { + "epoch": 0.03504, + "grad_norm": 0.4714578688144684, + "grad_norm_var": 0.004366401467908162, + "learning_rate": 5e-05, + "loss": 0.2167, + "loss/crossentropy": 2.8594489693641663, + "loss/hidden": 0.0, + "loss/logits": 0.21666935458779335, + "loss/reg": 0.2905624806880951, + "step": 3504 + }, + { + "epoch": 0.03505, + "grad_norm": 0.4621098041534424, + "grad_norm_var": 0.004040196696204849, + "learning_rate": 5e-05, + "loss": 0.2161, + "loss/crossentropy": 2.737568974494934, + "loss/hidden": 0.0, + "loss/logits": 0.2161366194486618, + "loss/reg": 0.2904980778694153, + "step": 3505 + }, + { + "epoch": 0.03506, + "grad_norm": 0.46641144156455994, + "grad_norm_var": 0.004134484558768096, + "learning_rate": 5e-05, + "loss": 0.206, + "loss/crossentropy": 2.698035180568695, + "loss/hidden": 0.0, + "loss/logits": 0.20600149780511856, + "loss/reg": 0.2904922664165497, + "step": 3506 + }, + { + "epoch": 0.03507, + "grad_norm": 0.4269760549068451, + "grad_norm_var": 0.0045418869021391734, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.844978988170624, + "loss/hidden": 0.0, + "loss/logits": 0.20427390560507774, + "loss/reg": 0.2900683879852295, + "step": 3507 + }, + { + "epoch": 0.03508, + "grad_norm": 0.45179930329322815, + "grad_norm_var": 0.004567691525625846, + "learning_rate": 5e-05, + "loss": 0.2105, + "loss/crossentropy": 2.8465536236763, + "loss/hidden": 0.0, + "loss/logits": 0.21054675430059433, + "loss/reg": 0.28985241055488586, + "step": 3508 + }, + { + "epoch": 0.03509, + "grad_norm": 0.47084084153175354, + "grad_norm_var": 0.004323673855974693, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.901433825492859, + "loss/hidden": 0.0, + "loss/logits": 0.21643272787332535, + "loss/reg": 0.28954771161079407, + "step": 3509 + }, + { + "epoch": 0.0351, + "grad_norm": 0.4179805815219879, + "grad_norm_var": 0.004750085531068804, + "learning_rate": 5e-05, + "loss": 0.2047, + "loss/crossentropy": 2.8391751050949097, + "loss/hidden": 0.0, + "loss/logits": 0.20467520505189896, + "loss/reg": 0.289450466632843, + "step": 3510 + }, + { + "epoch": 0.03511, + "grad_norm": 0.4620131552219391, + "grad_norm_var": 0.004753622772587518, + "learning_rate": 5e-05, + "loss": 0.2121, + "loss/crossentropy": 2.7464330196380615, + "loss/hidden": 0.0, + "loss/logits": 0.21205678582191467, + "loss/reg": 0.2892228066921234, + "step": 3511 + }, + { + "epoch": 0.03512, + "grad_norm": 0.44783827662467957, + "grad_norm_var": 0.0038684853283433777, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.7305158376693726, + "loss/hidden": 0.0, + "loss/logits": 0.2036760114133358, + "loss/reg": 0.28856512904167175, + "step": 3512 + }, + { + "epoch": 0.03513, + "grad_norm": 0.4602659046649933, + "grad_norm_var": 0.003873537113201368, + "learning_rate": 5e-05, + "loss": 0.2203, + "loss/crossentropy": 2.800485372543335, + "loss/hidden": 0.0, + "loss/logits": 0.2203170396387577, + "loss/reg": 0.288239449262619, + "step": 3513 + }, + { + "epoch": 0.03514, + "grad_norm": 0.4468267261981964, + "grad_norm_var": 0.003314328542655724, + "learning_rate": 5e-05, + "loss": 0.2141, + "loss/crossentropy": 2.786963939666748, + "loss/hidden": 0.0, + "loss/logits": 0.21405880898237228, + "loss/reg": 0.28792980313301086, + "step": 3514 + }, + { + "epoch": 0.03515, + "grad_norm": 0.4161602258682251, + "grad_norm_var": 0.002401013880453932, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.831013023853302, + "loss/hidden": 0.0, + "loss/logits": 0.2088221162557602, + "loss/reg": 0.2874002754688263, + "step": 3515 + }, + { + "epoch": 0.03516, + "grad_norm": 0.41334033012390137, + "grad_norm_var": 0.002589626105760923, + "learning_rate": 5e-05, + "loss": 0.1932, + "loss/crossentropy": 2.8332353830337524, + "loss/hidden": 0.0, + "loss/logits": 0.1932496540248394, + "loss/reg": 0.28721338510513306, + "step": 3516 + }, + { + "epoch": 0.03517, + "grad_norm": 0.46974971890449524, + "grad_norm_var": 0.0018943563109537784, + "learning_rate": 5e-05, + "loss": 0.2218, + "loss/crossentropy": 2.851756453514099, + "loss/hidden": 0.0, + "loss/logits": 0.2217661589384079, + "loss/reg": 0.2867795526981354, + "step": 3517 + }, + { + "epoch": 0.03518, + "grad_norm": 0.46980446577072144, + "grad_norm_var": 0.00043756794601499023, + "learning_rate": 5e-05, + "loss": 0.2173, + "loss/crossentropy": 2.831221342086792, + "loss/hidden": 0.0, + "loss/logits": 0.21726617589592934, + "loss/reg": 0.28656435012817383, + "step": 3518 + }, + { + "epoch": 0.03519, + "grad_norm": 0.4136711061000824, + "grad_norm_var": 0.0005105239909539556, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.8080859780311584, + "loss/hidden": 0.0, + "loss/logits": 0.20415761321783066, + "loss/reg": 0.2862515449523926, + "step": 3519 + }, + { + "epoch": 0.0352, + "grad_norm": 0.428845077753067, + "grad_norm_var": 0.0004904660923873501, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 2.8880520462989807, + "loss/hidden": 0.0, + "loss/logits": 0.21704135090112686, + "loss/reg": 0.2856188714504242, + "step": 3520 + }, + { + "epoch": 0.03521, + "grad_norm": 0.447121798992157, + "grad_norm_var": 0.00047089253033541685, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.642792761325836, + "loss/hidden": 0.0, + "loss/logits": 0.2141987346112728, + "loss/reg": 0.2853628695011139, + "step": 3521 + }, + { + "epoch": 0.03522, + "grad_norm": 0.4422473609447479, + "grad_norm_var": 0.0004363162578580197, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.785871922969818, + "loss/hidden": 0.0, + "loss/logits": 0.20180003717541695, + "loss/reg": 0.2852080166339874, + "step": 3522 + }, + { + "epoch": 0.03523, + "grad_norm": 0.4292442798614502, + "grad_norm_var": 0.0004318393041934265, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.8003852367401123, + "loss/hidden": 0.0, + "loss/logits": 0.197463970631361, + "loss/reg": 0.2848413586616516, + "step": 3523 + }, + { + "epoch": 0.03524, + "grad_norm": 0.4693783223628998, + "grad_norm_var": 0.0004718143366913119, + "learning_rate": 5e-05, + "loss": 0.2189, + "loss/crossentropy": 2.8674152493476868, + "loss/hidden": 0.0, + "loss/logits": 0.21892580017447472, + "loss/reg": 0.2843046486377716, + "step": 3524 + }, + { + "epoch": 0.03525, + "grad_norm": 0.4428574740886688, + "grad_norm_var": 0.0004209195787033123, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.740976870059967, + "loss/hidden": 0.0, + "loss/logits": 0.20904400944709778, + "loss/reg": 0.2843325138092041, + "step": 3525 + }, + { + "epoch": 0.03526, + "grad_norm": 0.44237223267555237, + "grad_norm_var": 0.00037890127706203446, + "learning_rate": 5e-05, + "loss": 0.2014, + "loss/crossentropy": 2.721426844596863, + "loss/hidden": 0.0, + "loss/logits": 0.20140238106250763, + "loss/reg": 0.2838611900806427, + "step": 3526 + }, + { + "epoch": 0.03527, + "grad_norm": 0.43181681632995605, + "grad_norm_var": 0.0003627961891291183, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.80571973323822, + "loss/hidden": 0.0, + "loss/logits": 0.21186371892690659, + "loss/reg": 0.2836456894874573, + "step": 3527 + }, + { + "epoch": 0.03528, + "grad_norm": 0.42360466718673706, + "grad_norm_var": 0.00038054322105498156, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.976572871208191, + "loss/hidden": 0.0, + "loss/logits": 0.19645503163337708, + "loss/reg": 0.2836010456085205, + "step": 3528 + }, + { + "epoch": 0.03529, + "grad_norm": 0.7888292670249939, + "grad_norm_var": 0.007995473327800024, + "learning_rate": 5e-05, + "loss": 0.2161, + "loss/crossentropy": 2.99369353055954, + "loss/hidden": 0.0, + "loss/logits": 0.21605869755148888, + "loss/reg": 0.2833597958087921, + "step": 3529 + }, + { + "epoch": 0.0353, + "grad_norm": 0.45686376094818115, + "grad_norm_var": 0.007982812905717056, + "learning_rate": 5e-05, + "loss": 0.2136, + "loss/crossentropy": 2.8032811880111694, + "loss/hidden": 0.0, + "loss/logits": 0.2135760523378849, + "loss/reg": 0.28316569328308105, + "step": 3530 + }, + { + "epoch": 0.03531, + "grad_norm": 0.4665684700012207, + "grad_norm_var": 0.007836090696474, + "learning_rate": 5e-05, + "loss": 0.2133, + "loss/crossentropy": 2.8168593645095825, + "loss/hidden": 0.0, + "loss/logits": 0.2133481204509735, + "loss/reg": 0.28309592604637146, + "step": 3531 + }, + { + "epoch": 0.03532, + "grad_norm": 0.4923142194747925, + "grad_norm_var": 0.007684351781145553, + "learning_rate": 5e-05, + "loss": 0.2298, + "loss/crossentropy": 2.60097736120224, + "loss/hidden": 0.0, + "loss/logits": 0.22982950881123543, + "loss/reg": 0.28298187255859375, + "step": 3532 + }, + { + "epoch": 0.03533, + "grad_norm": 0.4571343958377838, + "grad_norm_var": 0.00769422416071495, + "learning_rate": 5e-05, + "loss": 0.215, + "loss/crossentropy": 2.7043333649635315, + "loss/hidden": 0.0, + "loss/logits": 0.21502839401364326, + "loss/reg": 0.28271788358688354, + "step": 3533 + }, + { + "epoch": 0.03534, + "grad_norm": 0.4886736571788788, + "grad_norm_var": 0.00771870955821156, + "learning_rate": 5e-05, + "loss": 0.239, + "loss/crossentropy": 2.856890022754669, + "loss/hidden": 0.0, + "loss/logits": 0.23899228125810623, + "loss/reg": 0.2825174927711487, + "step": 3534 + }, + { + "epoch": 0.03535, + "grad_norm": 0.456377238035202, + "grad_norm_var": 0.007511403594981185, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.7235931754112244, + "loss/hidden": 0.0, + "loss/logits": 0.20617057383060455, + "loss/reg": 0.28229567408561707, + "step": 3535 + }, + { + "epoch": 0.03536, + "grad_norm": 0.42747098207473755, + "grad_norm_var": 0.007519568397211732, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.78042995929718, + "loss/hidden": 0.0, + "loss/logits": 0.2078290916979313, + "loss/reg": 0.2821597456932068, + "step": 3536 + }, + { + "epoch": 0.03537, + "grad_norm": 0.4743330776691437, + "grad_norm_var": 0.00747311838870767, + "learning_rate": 5e-05, + "loss": 0.2188, + "loss/crossentropy": 2.810151159763336, + "loss/hidden": 0.0, + "loss/logits": 0.21876702830195427, + "loss/reg": 0.2822117507457733, + "step": 3537 + }, + { + "epoch": 0.03538, + "grad_norm": 0.4344124495983124, + "grad_norm_var": 0.0075105229267758014, + "learning_rate": 5e-05, + "loss": 0.1989, + "loss/crossentropy": 2.8970921635627747, + "loss/hidden": 0.0, + "loss/logits": 0.19887107238173485, + "loss/reg": 0.282124400138855, + "step": 3538 + }, + { + "epoch": 0.03539, + "grad_norm": 0.4879232347011566, + "grad_norm_var": 0.0073764167371585905, + "learning_rate": 5e-05, + "loss": 0.2048, + "loss/crossentropy": 2.7567803263664246, + "loss/hidden": 0.0, + "loss/logits": 0.20481270551681519, + "loss/reg": 0.2820613980293274, + "step": 3539 + }, + { + "epoch": 0.0354, + "grad_norm": 0.4646925628185272, + "grad_norm_var": 0.007382899497357439, + "learning_rate": 5e-05, + "loss": 0.2115, + "loss/crossentropy": 2.803367257118225, + "loss/hidden": 0.0, + "loss/logits": 0.21152058988809586, + "loss/reg": 0.28185805678367615, + "step": 3540 + }, + { + "epoch": 0.03541, + "grad_norm": 0.4793310761451721, + "grad_norm_var": 0.007298714501439448, + "learning_rate": 5e-05, + "loss": 0.2245, + "loss/crossentropy": 2.7624236941337585, + "loss/hidden": 0.0, + "loss/logits": 0.22447143867611885, + "loss/reg": 0.2819213569164276, + "step": 3541 + }, + { + "epoch": 0.03542, + "grad_norm": 0.4681227505207062, + "grad_norm_var": 0.007212528912805505, + "learning_rate": 5e-05, + "loss": 0.2027, + "loss/crossentropy": 2.85365092754364, + "loss/hidden": 0.0, + "loss/logits": 0.20270080864429474, + "loss/reg": 0.28167110681533813, + "step": 3542 + }, + { + "epoch": 0.03543, + "grad_norm": 0.47110751271247864, + "grad_norm_var": 0.007050546684160486, + "learning_rate": 5e-05, + "loss": 0.2116, + "loss/crossentropy": 2.881369173526764, + "loss/hidden": 0.0, + "loss/logits": 0.21159860119223595, + "loss/reg": 0.28160277009010315, + "step": 3543 + }, + { + "epoch": 0.03544, + "grad_norm": 0.44448980689048767, + "grad_norm_var": 0.006910712650615203, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.840867042541504, + "loss/hidden": 0.0, + "loss/logits": 0.21075089275836945, + "loss/reg": 0.2814091444015503, + "step": 3544 + }, + { + "epoch": 0.03545, + "grad_norm": 0.4539512097835541, + "grad_norm_var": 0.0003497862699277913, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.844698965549469, + "loss/hidden": 0.0, + "loss/logits": 0.19604019448161125, + "loss/reg": 0.28119003772735596, + "step": 3545 + }, + { + "epoch": 0.03546, + "grad_norm": 0.44927334785461426, + "grad_norm_var": 0.00036059465936289317, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.7735085487365723, + "loss/hidden": 0.0, + "loss/logits": 0.19998596608638763, + "loss/reg": 0.28089478611946106, + "step": 3546 + }, + { + "epoch": 0.03547, + "grad_norm": 0.4274989068508148, + "grad_norm_var": 0.00044006937702126454, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.816484570503235, + "loss/hidden": 0.0, + "loss/logits": 0.19380242377519608, + "loss/reg": 0.28094780445098877, + "step": 3547 + }, + { + "epoch": 0.03548, + "grad_norm": 0.4141296446323395, + "grad_norm_var": 0.0004964034662743644, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.7964967489242554, + "loss/hidden": 0.0, + "loss/logits": 0.20134970918297768, + "loss/reg": 0.28074485063552856, + "step": 3548 + }, + { + "epoch": 0.03549, + "grad_norm": 0.4376387298107147, + "grad_norm_var": 0.0005176844503714547, + "learning_rate": 5e-05, + "loss": 0.2237, + "loss/crossentropy": 2.789761543273926, + "loss/hidden": 0.0, + "loss/logits": 0.2236720658838749, + "loss/reg": 0.28063204884529114, + "step": 3549 + }, + { + "epoch": 0.0355, + "grad_norm": 0.41746455430984497, + "grad_norm_var": 0.0005145488429375048, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.793704569339752, + "loss/hidden": 0.0, + "loss/logits": 0.20241760462522507, + "loss/reg": 0.2804163992404938, + "step": 3550 + }, + { + "epoch": 0.03551, + "grad_norm": 0.7507920265197754, + "grad_norm_var": 0.006162233238072778, + "learning_rate": 5e-05, + "loss": 0.2543, + "loss/crossentropy": 3.0283141136169434, + "loss/hidden": 0.0, + "loss/logits": 0.2542958930134773, + "loss/reg": 0.2802495062351227, + "step": 3551 + }, + { + "epoch": 0.03552, + "grad_norm": 0.4687981903553009, + "grad_norm_var": 0.0060406134295741595, + "learning_rate": 5e-05, + "loss": 0.2039, + "loss/crossentropy": 2.8233417868614197, + "loss/hidden": 0.0, + "loss/logits": 0.20388777181506157, + "loss/reg": 0.28015264868736267, + "step": 3552 + }, + { + "epoch": 0.03553, + "grad_norm": 0.48985666036605835, + "grad_norm_var": 0.006061544009596202, + "learning_rate": 5e-05, + "loss": 0.2217, + "loss/crossentropy": 2.783440053462982, + "loss/hidden": 0.0, + "loss/logits": 0.22165561839938164, + "loss/reg": 0.27981939911842346, + "step": 3553 + }, + { + "epoch": 0.03554, + "grad_norm": 0.4771634638309479, + "grad_norm_var": 0.00595885220317716, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.734297215938568, + "loss/hidden": 0.0, + "loss/logits": 0.21644136682152748, + "loss/reg": 0.2796027660369873, + "step": 3554 + }, + { + "epoch": 0.03555, + "grad_norm": 0.45651954412460327, + "grad_norm_var": 0.00596696210216745, + "learning_rate": 5e-05, + "loss": 0.1998, + "loss/crossentropy": 2.676268756389618, + "loss/hidden": 0.0, + "loss/logits": 0.1998187117278576, + "loss/reg": 0.2794448733329773, + "step": 3555 + }, + { + "epoch": 0.03556, + "grad_norm": 0.5619596838951111, + "grad_norm_var": 0.006448235298908763, + "learning_rate": 5e-05, + "loss": 0.2254, + "loss/crossentropy": 2.9027057886123657, + "loss/hidden": 0.0, + "loss/logits": 0.2254171445965767, + "loss/reg": 0.27899038791656494, + "step": 3556 + }, + { + "epoch": 0.03557, + "grad_norm": 0.5448945760726929, + "grad_norm_var": 0.006717551776526683, + "learning_rate": 5e-05, + "loss": 0.2204, + "loss/crossentropy": 2.8669285774230957, + "loss/hidden": 0.0, + "loss/logits": 0.22035924345254898, + "loss/reg": 0.27889689803123474, + "step": 3557 + }, + { + "epoch": 0.03558, + "grad_norm": 0.46167734265327454, + "grad_norm_var": 0.006733237598222674, + "learning_rate": 5e-05, + "loss": 0.2189, + "loss/crossentropy": 2.7351719737052917, + "loss/hidden": 0.0, + "loss/logits": 0.2188594900071621, + "loss/reg": 0.2789222300052643, + "step": 3558 + }, + { + "epoch": 0.03559, + "grad_norm": 0.46591147780418396, + "grad_norm_var": 0.006743130210401734, + "learning_rate": 5e-05, + "loss": 0.2118, + "loss/crossentropy": 2.9333495497703552, + "loss/hidden": 0.0, + "loss/logits": 0.21182797849178314, + "loss/reg": 0.27893635630607605, + "step": 3559 + }, + { + "epoch": 0.0356, + "grad_norm": 0.44463685154914856, + "grad_norm_var": 0.0067423838614392795, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.7975680232048035, + "loss/hidden": 0.0, + "loss/logits": 0.20898044854402542, + "loss/reg": 0.2787291705608368, + "step": 3560 + }, + { + "epoch": 0.03561, + "grad_norm": 0.6086260080337524, + "grad_norm_var": 0.007646089587040909, + "learning_rate": 5e-05, + "loss": 0.2351, + "loss/crossentropy": 2.8059693574905396, + "loss/hidden": 0.0, + "loss/logits": 0.23511432111263275, + "loss/reg": 0.2788389027118683, + "step": 3561 + }, + { + "epoch": 0.03562, + "grad_norm": 0.5030738115310669, + "grad_norm_var": 0.007518329641453751, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.75816011428833, + "loss/hidden": 0.0, + "loss/logits": 0.1893228255212307, + "loss/reg": 0.2788075804710388, + "step": 3562 + }, + { + "epoch": 0.03563, + "grad_norm": 0.5023828148841858, + "grad_norm_var": 0.007188197909429065, + "learning_rate": 5e-05, + "loss": 0.2333, + "loss/crossentropy": 2.7504284977912903, + "loss/hidden": 0.0, + "loss/logits": 0.23330477997660637, + "loss/reg": 0.278866171836853, + "step": 3563 + }, + { + "epoch": 0.03564, + "grad_norm": 0.5184929966926575, + "grad_norm_var": 0.006669228593778091, + "learning_rate": 5e-05, + "loss": 0.244, + "loss/crossentropy": 2.6866130232810974, + "loss/hidden": 0.0, + "loss/logits": 0.24398411065340042, + "loss/reg": 0.27868354320526123, + "step": 3564 + }, + { + "epoch": 0.03565, + "grad_norm": 0.4746047258377075, + "grad_norm_var": 0.0064134164818079285, + "learning_rate": 5e-05, + "loss": 0.2115, + "loss/crossentropy": 2.8229872584342957, + "loss/hidden": 0.0, + "loss/logits": 0.2115282565355301, + "loss/reg": 0.2787753939628601, + "step": 3565 + }, + { + "epoch": 0.03566, + "grad_norm": 0.49341538548469543, + "grad_norm_var": 0.005845183613122496, + "learning_rate": 5e-05, + "loss": 0.2277, + "loss/crossentropy": 2.79108202457428, + "loss/hidden": 0.0, + "loss/logits": 0.22768378257751465, + "loss/reg": 0.27862784266471863, + "step": 3566 + }, + { + "epoch": 0.03567, + "grad_norm": 0.49316930770874023, + "grad_norm_var": 0.0018569768110048882, + "learning_rate": 5e-05, + "loss": 0.2017, + "loss/crossentropy": 2.709626019001007, + "loss/hidden": 0.0, + "loss/logits": 0.20169832184910774, + "loss/reg": 0.2786957323551178, + "step": 3567 + }, + { + "epoch": 0.03568, + "grad_norm": 0.43333667516708374, + "grad_norm_var": 0.0020728113016111474, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.833505690097809, + "loss/hidden": 0.0, + "loss/logits": 0.20371923595666885, + "loss/reg": 0.2786054015159607, + "step": 3568 + }, + { + "epoch": 0.03569, + "grad_norm": 0.44542914628982544, + "grad_norm_var": 0.0022302406106757783, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.7718849778175354, + "loss/hidden": 0.0, + "loss/logits": 0.20710446313023567, + "loss/reg": 0.2785038948059082, + "step": 3569 + }, + { + "epoch": 0.0357, + "grad_norm": 0.4701756238937378, + "grad_norm_var": 0.0022478899837532484, + "learning_rate": 5e-05, + "loss": 0.2251, + "loss/crossentropy": 2.7484123706817627, + "loss/hidden": 0.0, + "loss/logits": 0.2250528782606125, + "loss/reg": 0.27825888991355896, + "step": 3570 + }, + { + "epoch": 0.03571, + "grad_norm": 0.507922351360321, + "grad_norm_var": 0.0021671566441709535, + "learning_rate": 5e-05, + "loss": 0.2274, + "loss/crossentropy": 2.7786290049552917, + "loss/hidden": 0.0, + "loss/logits": 0.22743286564946175, + "loss/reg": 0.27814286947250366, + "step": 3571 + }, + { + "epoch": 0.03572, + "grad_norm": 0.4388575255870819, + "grad_norm_var": 0.002025199363078504, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.7407360672950745, + "loss/hidden": 0.0, + "loss/logits": 0.20818087831139565, + "loss/reg": 0.27783942222595215, + "step": 3572 + }, + { + "epoch": 0.03573, + "grad_norm": 0.4527556598186493, + "grad_norm_var": 0.0018557676248280419, + "learning_rate": 5e-05, + "loss": 0.2158, + "loss/crossentropy": 2.7286829948425293, + "loss/hidden": 0.0, + "loss/logits": 0.2158493436872959, + "loss/reg": 0.27765488624572754, + "step": 3573 + }, + { + "epoch": 0.03574, + "grad_norm": 0.45720791816711426, + "grad_norm_var": 0.0018692187639963386, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.8338009119033813, + "loss/hidden": 0.0, + "loss/logits": 0.21996081620454788, + "loss/reg": 0.2775925397872925, + "step": 3574 + }, + { + "epoch": 0.03575, + "grad_norm": 0.489196240901947, + "grad_norm_var": 0.001853544448809193, + "learning_rate": 5e-05, + "loss": 0.2042, + "loss/crossentropy": 2.8343674540519714, + "loss/hidden": 0.0, + "loss/logits": 0.20421718433499336, + "loss/reg": 0.2772521376609802, + "step": 3575 + }, + { + "epoch": 0.03576, + "grad_norm": 0.4659740626811981, + "grad_norm_var": 0.0017719181768824032, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.738154172897339, + "loss/hidden": 0.0, + "loss/logits": 0.20236926525831223, + "loss/reg": 0.2767479717731476, + "step": 3576 + }, + { + "epoch": 0.03577, + "grad_norm": 0.46750786900520325, + "grad_norm_var": 0.0006841209598426231, + "learning_rate": 5e-05, + "loss": 0.2104, + "loss/crossentropy": 2.842708706855774, + "loss/hidden": 0.0, + "loss/logits": 0.21043826267123222, + "loss/reg": 0.27641284465789795, + "step": 3577 + }, + { + "epoch": 0.03578, + "grad_norm": 0.4459070861339569, + "grad_norm_var": 0.000680820329855066, + "learning_rate": 5e-05, + "loss": 0.2054, + "loss/crossentropy": 2.8173733949661255, + "loss/hidden": 0.0, + "loss/logits": 0.20539779588580132, + "loss/reg": 0.27636200189590454, + "step": 3578 + }, + { + "epoch": 0.03579, + "grad_norm": 0.6176108717918396, + "grad_norm_var": 0.0019732951071801913, + "learning_rate": 5e-05, + "loss": 0.267, + "loss/crossentropy": 2.872570812702179, + "loss/hidden": 0.0, + "loss/logits": 0.26699213311076164, + "loss/reg": 0.27600064873695374, + "step": 3579 + }, + { + "epoch": 0.0358, + "grad_norm": 0.7906016707420349, + "grad_norm_var": 0.008016691696521491, + "learning_rate": 5e-05, + "loss": 0.2357, + "loss/crossentropy": 2.7850855588912964, + "loss/hidden": 0.0, + "loss/logits": 0.23571234941482544, + "loss/reg": 0.27590641379356384, + "step": 3580 + }, + { + "epoch": 0.03581, + "grad_norm": 0.4778536260128021, + "grad_norm_var": 0.008007875541764723, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.6468613743782043, + "loss/hidden": 0.0, + "loss/logits": 0.21786613762378693, + "loss/reg": 0.275579035282135, + "step": 3581 + }, + { + "epoch": 0.03582, + "grad_norm": 0.49757063388824463, + "grad_norm_var": 0.008007144547350364, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.931581139564514, + "loss/hidden": 0.0, + "loss/logits": 0.19852975383400917, + "loss/reg": 0.275715708732605, + "step": 3582 + }, + { + "epoch": 0.03583, + "grad_norm": 0.5591050982475281, + "grad_norm_var": 0.008245695338067908, + "learning_rate": 5e-05, + "loss": 0.2076, + "loss/crossentropy": 2.9250137209892273, + "loss/hidden": 0.0, + "loss/logits": 0.20764250308275223, + "loss/reg": 0.2754662036895752, + "step": 3583 + }, + { + "epoch": 0.03584, + "grad_norm": 0.5235317945480347, + "grad_norm_var": 0.007939661898737722, + "learning_rate": 5e-05, + "loss": 0.2324, + "loss/crossentropy": 2.7105743288993835, + "loss/hidden": 0.0, + "loss/logits": 0.23244012892246246, + "loss/reg": 0.2752685844898224, + "step": 3584 + }, + { + "epoch": 0.03585, + "grad_norm": 0.5367690324783325, + "grad_norm_var": 0.00771489598589287, + "learning_rate": 5e-05, + "loss": 0.2431, + "loss/crossentropy": 2.874784290790558, + "loss/hidden": 0.0, + "loss/logits": 0.2431362345814705, + "loss/reg": 0.2747848331928253, + "step": 3585 + }, + { + "epoch": 0.03586, + "grad_norm": 0.45094841718673706, + "grad_norm_var": 0.007846272466366008, + "learning_rate": 5e-05, + "loss": 0.197, + "loss/crossentropy": 2.90638929605484, + "loss/hidden": 0.0, + "loss/logits": 0.19700265303254128, + "loss/reg": 0.27447885274887085, + "step": 3586 + }, + { + "epoch": 0.03587, + "grad_norm": 0.5341423153877258, + "grad_norm_var": 0.007877755540484858, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.761461615562439, + "loss/hidden": 0.0, + "loss/logits": 0.2092147246003151, + "loss/reg": 0.2738022804260254, + "step": 3587 + }, + { + "epoch": 0.03588, + "grad_norm": 0.5030940771102905, + "grad_norm_var": 0.007501947477215802, + "learning_rate": 5e-05, + "loss": 0.2059, + "loss/crossentropy": 2.8130834698677063, + "loss/hidden": 0.0, + "loss/logits": 0.20593668892979622, + "loss/reg": 0.27328455448150635, + "step": 3588 + }, + { + "epoch": 0.03589, + "grad_norm": 0.46660342812538147, + "grad_norm_var": 0.007395570357246758, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.9003056287765503, + "loss/hidden": 0.0, + "loss/logits": 0.20817217230796814, + "loss/reg": 0.27304553985595703, + "step": 3589 + }, + { + "epoch": 0.0359, + "grad_norm": 0.43521207571029663, + "grad_norm_var": 0.0076032965767474205, + "learning_rate": 5e-05, + "loss": 0.2054, + "loss/crossentropy": 2.620424211025238, + "loss/hidden": 0.0, + "loss/logits": 0.20535606890916824, + "loss/reg": 0.2724834382534027, + "step": 3590 + }, + { + "epoch": 0.03591, + "grad_norm": 0.9958042502403259, + "grad_norm_var": 0.021809731884896134, + "learning_rate": 5e-05, + "loss": 0.2593, + "loss/crossentropy": 3.0804898738861084, + "loss/hidden": 0.0, + "loss/logits": 0.2592828869819641, + "loss/reg": 0.27241891622543335, + "step": 3591 + }, + { + "epoch": 0.03592, + "grad_norm": 0.4977077841758728, + "grad_norm_var": 0.021525543610918105, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.8079196214675903, + "loss/hidden": 0.0, + "loss/logits": 0.21021957322955132, + "loss/reg": 0.27169322967529297, + "step": 3592 + }, + { + "epoch": 0.03593, + "grad_norm": 0.49555152654647827, + "grad_norm_var": 0.021266252725627408, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.7514816522598267, + "loss/hidden": 0.0, + "loss/logits": 0.20924659445881844, + "loss/reg": 0.271274596452713, + "step": 3593 + }, + { + "epoch": 0.03594, + "grad_norm": 0.5144512057304382, + "grad_norm_var": 0.020592567181349015, + "learning_rate": 5e-05, + "loss": 0.2215, + "loss/crossentropy": 2.861645460128784, + "loss/hidden": 0.0, + "loss/logits": 0.2215314693748951, + "loss/reg": 0.27069756388664246, + "step": 3594 + }, + { + "epoch": 0.03595, + "grad_norm": 0.49260416626930237, + "grad_norm_var": 0.02054291259142875, + "learning_rate": 5e-05, + "loss": 0.2112, + "loss/crossentropy": 2.8425373435020447, + "loss/hidden": 0.0, + "loss/logits": 0.21120919287204742, + "loss/reg": 0.2701127231121063, + "step": 3595 + }, + { + "epoch": 0.03596, + "grad_norm": 0.5230684280395508, + "grad_norm_var": 0.016370338600041077, + "learning_rate": 5e-05, + "loss": 0.2141, + "loss/crossentropy": 2.868765652179718, + "loss/hidden": 0.0, + "loss/logits": 0.21407174319028854, + "loss/reg": 0.2699401080608368, + "step": 3596 + }, + { + "epoch": 0.03597, + "grad_norm": 0.46828901767730713, + "grad_norm_var": 0.01644447183927865, + "learning_rate": 5e-05, + "loss": 0.2124, + "loss/crossentropy": 2.9233362078666687, + "loss/hidden": 0.0, + "loss/logits": 0.212408896535635, + "loss/reg": 0.2696254551410675, + "step": 3597 + }, + { + "epoch": 0.03598, + "grad_norm": 1.4987528324127197, + "grad_norm_var": 0.07464272064656717, + "learning_rate": 5e-05, + "loss": 0.2362, + "loss/crossentropy": 2.8309401273727417, + "loss/hidden": 0.0, + "loss/logits": 0.23618243262171745, + "loss/reg": 0.26932501792907715, + "step": 3598 + }, + { + "epoch": 0.03599, + "grad_norm": 0.5652022957801819, + "grad_norm_var": 0.07461710098994699, + "learning_rate": 5e-05, + "loss": 0.2219, + "loss/crossentropy": 2.763696014881134, + "loss/hidden": 0.0, + "loss/logits": 0.2218712754547596, + "loss/reg": 0.2689019441604614, + "step": 3599 + }, + { + "epoch": 0.036, + "grad_norm": 0.5266576409339905, + "grad_norm_var": 0.07458840102909446, + "learning_rate": 5e-05, + "loss": 0.2203, + "loss/crossentropy": 2.6964364647865295, + "loss/hidden": 0.0, + "loss/logits": 0.22032522037625313, + "loss/reg": 0.26860567927360535, + "step": 3600 + }, + { + "epoch": 0.03601, + "grad_norm": 0.5227334499359131, + "grad_norm_var": 0.07470791645233259, + "learning_rate": 5e-05, + "loss": 0.2094, + "loss/crossentropy": 2.7273462414741516, + "loss/hidden": 0.0, + "loss/logits": 0.209357600659132, + "loss/reg": 0.2686079144477844, + "step": 3601 + }, + { + "epoch": 0.03602, + "grad_norm": 0.48851364850997925, + "grad_norm_var": 0.0740837360117224, + "learning_rate": 5e-05, + "loss": 0.2065, + "loss/crossentropy": 2.7596755027770996, + "loss/hidden": 0.0, + "loss/logits": 0.2065003179013729, + "loss/reg": 0.2686166763305664, + "step": 3602 + }, + { + "epoch": 0.03603, + "grad_norm": 0.5078998804092407, + "grad_norm_var": 0.07434155248705955, + "learning_rate": 5e-05, + "loss": 0.2256, + "loss/crossentropy": 2.9185792207717896, + "loss/hidden": 0.0, + "loss/logits": 0.22563817352056503, + "loss/reg": 0.2682807147502899, + "step": 3603 + }, + { + "epoch": 0.03604, + "grad_norm": 0.5011216402053833, + "grad_norm_var": 0.07436567265776994, + "learning_rate": 5e-05, + "loss": 0.2275, + "loss/crossentropy": 2.886626422405243, + "loss/hidden": 0.0, + "loss/logits": 0.22745725139975548, + "loss/reg": 0.2680383622646332, + "step": 3604 + }, + { + "epoch": 0.03605, + "grad_norm": 0.47318345308303833, + "grad_norm_var": 0.07425681885342271, + "learning_rate": 5e-05, + "loss": 0.2103, + "loss/crossentropy": 2.7409799695014954, + "loss/hidden": 0.0, + "loss/logits": 0.210341926664114, + "loss/reg": 0.26800036430358887, + "step": 3605 + }, + { + "epoch": 0.03606, + "grad_norm": 0.536225438117981, + "grad_norm_var": 0.07275360544891137, + "learning_rate": 5e-05, + "loss": 0.2351, + "loss/crossentropy": 2.8300971388816833, + "loss/hidden": 0.0, + "loss/logits": 0.23511292785406113, + "loss/reg": 0.26779475808143616, + "step": 3606 + }, + { + "epoch": 0.03607, + "grad_norm": 0.535716712474823, + "grad_norm_var": 0.06173280348022286, + "learning_rate": 5e-05, + "loss": 0.2388, + "loss/crossentropy": 2.791872978210449, + "loss/hidden": 0.0, + "loss/logits": 0.23884716257452965, + "loss/reg": 0.26779958605766296, + "step": 3607 + }, + { + "epoch": 0.03608, + "grad_norm": 0.4528808295726776, + "grad_norm_var": 0.06230081954390886, + "learning_rate": 5e-05, + "loss": 0.2145, + "loss/crossentropy": 2.7410359382629395, + "loss/hidden": 0.0, + "loss/logits": 0.21450118720531464, + "loss/reg": 0.26740995049476624, + "step": 3608 + }, + { + "epoch": 0.03609, + "grad_norm": 0.5375247597694397, + "grad_norm_var": 0.062000281228263635, + "learning_rate": 5e-05, + "loss": 0.2372, + "loss/crossentropy": 2.716187059879303, + "loss/hidden": 0.0, + "loss/logits": 0.23724940046668053, + "loss/reg": 0.26710063219070435, + "step": 3609 + }, + { + "epoch": 0.0361, + "grad_norm": 0.5766829252243042, + "grad_norm_var": 0.061768536418600264, + "learning_rate": 5e-05, + "loss": 0.233, + "loss/crossentropy": 3.24883896112442, + "loss/hidden": 0.0, + "loss/logits": 0.23301170021295547, + "loss/reg": 0.26693665981292725, + "step": 3610 + }, + { + "epoch": 0.03611, + "grad_norm": 4.283941745758057, + "grad_norm_var": 0.9182835544270209, + "learning_rate": 5e-05, + "loss": 0.3879, + "loss/crossentropy": 2.898510992527008, + "loss/hidden": 0.0, + "loss/logits": 0.38785193488001823, + "loss/reg": 0.2666880786418915, + "step": 3611 + }, + { + "epoch": 0.03612, + "grad_norm": 0.5324205756187439, + "grad_norm_var": 0.917928238382053, + "learning_rate": 5e-05, + "loss": 0.221, + "loss/crossentropy": 2.80214786529541, + "loss/hidden": 0.0, + "loss/logits": 0.22104964405298233, + "loss/reg": 0.2664594352245331, + "step": 3612 + }, + { + "epoch": 0.03613, + "grad_norm": 0.5152071118354797, + "grad_norm_var": 0.9159094947737833, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.7658802270889282, + "loss/hidden": 0.0, + "loss/logits": 0.21844440326094627, + "loss/reg": 0.26627403497695923, + "step": 3613 + }, + { + "epoch": 0.03614, + "grad_norm": 0.6171427369117737, + "grad_norm_var": 0.8842207203103536, + "learning_rate": 5e-05, + "loss": 0.2633, + "loss/crossentropy": 2.6127710938453674, + "loss/hidden": 0.0, + "loss/logits": 0.26326583325862885, + "loss/reg": 0.2662105858325958, + "step": 3614 + }, + { + "epoch": 0.03615, + "grad_norm": 0.5389800667762756, + "grad_norm_var": 0.884947619035095, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.815691828727722, + "loss/hidden": 0.0, + "loss/logits": 0.20832183212041855, + "loss/reg": 0.2661559283733368, + "step": 3615 + }, + { + "epoch": 0.03616, + "grad_norm": 0.5178030133247375, + "grad_norm_var": 0.8852270356688958, + "learning_rate": 5e-05, + "loss": 0.2192, + "loss/crossentropy": 2.832362174987793, + "loss/hidden": 0.0, + "loss/logits": 0.21919643506407738, + "loss/reg": 0.2661411166191101, + "step": 3616 + }, + { + "epoch": 0.03617, + "grad_norm": 0.45517534017562866, + "grad_norm_var": 0.8876371310441368, + "learning_rate": 5e-05, + "loss": 0.219, + "loss/crossentropy": 2.76655513048172, + "loss/hidden": 0.0, + "loss/logits": 0.218952938914299, + "loss/reg": 0.2660287618637085, + "step": 3617 + }, + { + "epoch": 0.03618, + "grad_norm": 0.5153691172599792, + "grad_norm_var": 0.8867301355943928, + "learning_rate": 5e-05, + "loss": 0.2306, + "loss/crossentropy": 2.9136536717414856, + "loss/hidden": 0.0, + "loss/logits": 0.23057959228754044, + "loss/reg": 0.26623740792274475, + "step": 3618 + }, + { + "epoch": 0.03619, + "grad_norm": 0.5503095984458923, + "grad_norm_var": 0.8854391822312774, + "learning_rate": 5e-05, + "loss": 0.2263, + "loss/crossentropy": 2.947294294834137, + "loss/hidden": 0.0, + "loss/logits": 0.22625672072172165, + "loss/reg": 0.26644253730773926, + "step": 3619 + }, + { + "epoch": 0.0362, + "grad_norm": 0.45200008153915405, + "grad_norm_var": 0.8872772088292715, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.9176940321922302, + "loss/hidden": 0.0, + "loss/logits": 0.19414426013827324, + "loss/reg": 0.26657766103744507, + "step": 3620 + }, + { + "epoch": 0.03621, + "grad_norm": 0.5229759812355042, + "grad_norm_var": 0.8855568005104815, + "learning_rate": 5e-05, + "loss": 0.2268, + "loss/crossentropy": 2.9266289472579956, + "loss/hidden": 0.0, + "loss/logits": 0.22684241831302643, + "loss/reg": 0.26679763197898865, + "step": 3621 + }, + { + "epoch": 0.03622, + "grad_norm": 0.49282607436180115, + "grad_norm_var": 0.8869623049100813, + "learning_rate": 5e-05, + "loss": 0.2235, + "loss/crossentropy": 2.845338761806488, + "loss/hidden": 0.0, + "loss/logits": 0.22354109957814217, + "loss/reg": 0.26704666018486023, + "step": 3622 + }, + { + "epoch": 0.03623, + "grad_norm": 0.4759957194328308, + "grad_norm_var": 0.8889397648915708, + "learning_rate": 5e-05, + "loss": 0.2152, + "loss/crossentropy": 2.799579679965973, + "loss/hidden": 0.0, + "loss/logits": 0.21524207293987274, + "loss/reg": 0.2671261429786682, + "step": 3623 + }, + { + "epoch": 0.03624, + "grad_norm": 1.3226732015609741, + "grad_norm_var": 0.9014959454048186, + "learning_rate": 5e-05, + "loss": 0.2381, + "loss/crossentropy": 2.692450165748596, + "loss/hidden": 0.0, + "loss/logits": 0.23813338205218315, + "loss/reg": 0.2671808898448944, + "step": 3624 + }, + { + "epoch": 0.03625, + "grad_norm": 0.5129947662353516, + "grad_norm_var": 0.902413900045301, + "learning_rate": 5e-05, + "loss": 0.2382, + "loss/crossentropy": 2.661430060863495, + "loss/hidden": 0.0, + "loss/logits": 0.23817892372608185, + "loss/reg": 0.26714998483657837, + "step": 3625 + }, + { + "epoch": 0.03626, + "grad_norm": 0.5445718169212341, + "grad_norm_var": 0.903456548953296, + "learning_rate": 5e-05, + "loss": 0.2233, + "loss/crossentropy": 2.763332486152649, + "loss/hidden": 0.0, + "loss/logits": 0.2233215905725956, + "loss/reg": 0.26720747351646423, + "step": 3626 + }, + { + "epoch": 0.03627, + "grad_norm": 0.41878366470336914, + "grad_norm_var": 0.04333018622676722, + "learning_rate": 5e-05, + "loss": 0.1939, + "loss/crossentropy": 2.8701395988464355, + "loss/hidden": 0.0, + "loss/logits": 0.19392110779881477, + "loss/reg": 0.2674783170223236, + "step": 3627 + }, + { + "epoch": 0.03628, + "grad_norm": 0.4712296426296234, + "grad_norm_var": 0.043802086446077124, + "learning_rate": 5e-05, + "loss": 0.2318, + "loss/crossentropy": 2.790402054786682, + "loss/hidden": 0.0, + "loss/logits": 0.2317562848329544, + "loss/reg": 0.267649382352829, + "step": 3628 + }, + { + "epoch": 0.03629, + "grad_norm": 0.4205145537853241, + "grad_norm_var": 0.04489966656812519, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.765511929988861, + "loss/hidden": 0.0, + "loss/logits": 0.19868389144539833, + "loss/reg": 0.26746445894241333, + "step": 3629 + }, + { + "epoch": 0.0363, + "grad_norm": 0.4566093981266022, + "grad_norm_var": 0.04511245568337638, + "learning_rate": 5e-05, + "loss": 0.2133, + "loss/crossentropy": 2.894012928009033, + "loss/hidden": 0.0, + "loss/logits": 0.21326545253396034, + "loss/reg": 0.26772376894950867, + "step": 3630 + }, + { + "epoch": 0.03631, + "grad_norm": 0.4339826703071594, + "grad_norm_var": 0.04584097263037374, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.8516780138015747, + "loss/hidden": 0.0, + "loss/logits": 0.21084148064255714, + "loss/reg": 0.267825722694397, + "step": 3631 + }, + { + "epoch": 0.03632, + "grad_norm": 0.4698779881000519, + "grad_norm_var": 0.04609593540841121, + "learning_rate": 5e-05, + "loss": 0.2287, + "loss/crossentropy": 2.7218549251556396, + "loss/hidden": 0.0, + "loss/logits": 0.22872519120573997, + "loss/reg": 0.26805245876312256, + "step": 3632 + }, + { + "epoch": 0.03633, + "grad_norm": 0.45610156655311584, + "grad_norm_var": 0.04608647140115279, + "learning_rate": 5e-05, + "loss": 0.2105, + "loss/crossentropy": 2.7358906269073486, + "loss/hidden": 0.0, + "loss/logits": 0.21053969115018845, + "loss/reg": 0.26831546425819397, + "step": 3633 + }, + { + "epoch": 0.03634, + "grad_norm": 0.5173577070236206, + "grad_norm_var": 0.04608222915020293, + "learning_rate": 5e-05, + "loss": 0.2447, + "loss/crossentropy": 2.7156218886375427, + "loss/hidden": 0.0, + "loss/logits": 0.24466003105044365, + "loss/reg": 0.2682298421859741, + "step": 3634 + }, + { + "epoch": 0.03635, + "grad_norm": 0.48467662930488586, + "grad_norm_var": 0.046194952748851034, + "learning_rate": 5e-05, + "loss": 0.2383, + "loss/crossentropy": 2.704411804676056, + "loss/hidden": 0.0, + "loss/logits": 0.23834974318742752, + "loss/reg": 0.2683819532394409, + "step": 3635 + }, + { + "epoch": 0.03636, + "grad_norm": 0.4171604514122009, + "grad_norm_var": 0.0466253578763851, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.8142791390419006, + "loss/hidden": 0.0, + "loss/logits": 0.19576114416122437, + "loss/reg": 0.2686312198638916, + "step": 3636 + }, + { + "epoch": 0.03637, + "grad_norm": 0.43346184492111206, + "grad_norm_var": 0.04716398842011312, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.693488359451294, + "loss/hidden": 0.0, + "loss/logits": 0.21349593624472618, + "loss/reg": 0.26879361271858215, + "step": 3637 + }, + { + "epoch": 0.03638, + "grad_norm": 0.4339055120944977, + "grad_norm_var": 0.04759877538799365, + "learning_rate": 5e-05, + "loss": 0.2055, + "loss/crossentropy": 2.8132728338241577, + "loss/hidden": 0.0, + "loss/logits": 0.20545856282114983, + "loss/reg": 0.2686738669872284, + "step": 3638 + }, + { + "epoch": 0.03639, + "grad_norm": 0.44247153401374817, + "grad_norm_var": 0.04785171453246022, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.767302632331848, + "loss/hidden": 0.0, + "loss/logits": 0.2082630842924118, + "loss/reg": 0.2688526511192322, + "step": 3639 + }, + { + "epoch": 0.0364, + "grad_norm": 0.4333265423774719, + "grad_norm_var": 0.0014848976852074093, + "learning_rate": 5e-05, + "loss": 0.2121, + "loss/crossentropy": 2.847707152366638, + "loss/hidden": 0.0, + "loss/logits": 0.2121058739721775, + "loss/reg": 0.26881274580955505, + "step": 3640 + }, + { + "epoch": 0.03641, + "grad_norm": 0.5963634848594666, + "grad_norm_var": 0.002517388252854798, + "learning_rate": 5e-05, + "loss": 0.2264, + "loss/crossentropy": 2.8798015117645264, + "loss/hidden": 0.0, + "loss/logits": 0.22638476639986038, + "loss/reg": 0.26869288086891174, + "step": 3641 + }, + { + "epoch": 0.03642, + "grad_norm": 0.40759971737861633, + "grad_norm_var": 0.0022257936954328946, + "learning_rate": 5e-05, + "loss": 0.2061, + "loss/crossentropy": 2.7957159280776978, + "loss/hidden": 0.0, + "loss/logits": 0.20614811033010483, + "loss/reg": 0.26847901940345764, + "step": 3642 + }, + { + "epoch": 0.03643, + "grad_norm": 0.44818761944770813, + "grad_norm_var": 0.0021345545441589548, + "learning_rate": 5e-05, + "loss": 0.204, + "loss/crossentropy": 2.7853572368621826, + "loss/hidden": 0.0, + "loss/logits": 0.20398345589637756, + "loss/reg": 0.2685984969139099, + "step": 3643 + }, + { + "epoch": 0.03644, + "grad_norm": 0.5432966351509094, + "grad_norm_var": 0.0025893872688218483, + "learning_rate": 5e-05, + "loss": 0.2418, + "loss/crossentropy": 2.6607653498649597, + "loss/hidden": 0.0, + "loss/logits": 0.24180743843317032, + "loss/reg": 0.2685321569442749, + "step": 3644 + }, + { + "epoch": 0.03645, + "grad_norm": 0.45063552260398865, + "grad_norm_var": 0.002478754524635548, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.7874990105628967, + "loss/hidden": 0.0, + "loss/logits": 0.19825860857963562, + "loss/reg": 0.268402099609375, + "step": 3645 + }, + { + "epoch": 0.03646, + "grad_norm": 0.46034568548202515, + "grad_norm_var": 0.00247591362925134, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 2.9528937339782715, + "loss/hidden": 0.0, + "loss/logits": 0.20791326090693474, + "loss/reg": 0.2686212360858917, + "step": 3646 + }, + { + "epoch": 0.03647, + "grad_norm": 0.46994683146476746, + "grad_norm_var": 0.002411388776034562, + "learning_rate": 5e-05, + "loss": 0.2206, + "loss/crossentropy": 2.7905288338661194, + "loss/hidden": 0.0, + "loss/logits": 0.22063898295164108, + "loss/reg": 0.2684536576271057, + "step": 3647 + }, + { + "epoch": 0.03648, + "grad_norm": 0.550639271736145, + "grad_norm_var": 0.0028549312026190303, + "learning_rate": 5e-05, + "loss": 0.244, + "loss/crossentropy": 2.7532055377960205, + "loss/hidden": 0.0, + "loss/logits": 0.24404236674308777, + "loss/reg": 0.2683795690536499, + "step": 3648 + }, + { + "epoch": 0.03649, + "grad_norm": 0.4343002140522003, + "grad_norm_var": 0.0029296665359867784, + "learning_rate": 5e-05, + "loss": 0.2053, + "loss/crossentropy": 2.778397500514984, + "loss/hidden": 0.0, + "loss/logits": 0.20533019304275513, + "loss/reg": 0.26831522583961487, + "step": 3649 + }, + { + "epoch": 0.0365, + "grad_norm": 0.4407162070274353, + "grad_norm_var": 0.002815191688197283, + "learning_rate": 5e-05, + "loss": 0.2168, + "loss/crossentropy": 2.8641749024391174, + "loss/hidden": 0.0, + "loss/logits": 0.21677367389202118, + "loss/reg": 0.2683478593826294, + "step": 3650 + }, + { + "epoch": 0.03651, + "grad_norm": 0.471329927444458, + "grad_norm_var": 0.002792091650207355, + "learning_rate": 5e-05, + "loss": 0.2188, + "loss/crossentropy": 2.9030200242996216, + "loss/hidden": 0.0, + "loss/logits": 0.21878204494714737, + "loss/reg": 0.2681308090686798, + "step": 3651 + }, + { + "epoch": 0.03652, + "grad_norm": 0.46046459674835205, + "grad_norm_var": 0.0026353527693149287, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.8360047936439514, + "loss/hidden": 0.0, + "loss/logits": 0.21794674918055534, + "loss/reg": 0.26792600750923157, + "step": 3652 + }, + { + "epoch": 0.03653, + "grad_norm": 0.496724009513855, + "grad_norm_var": 0.0025999600166386138, + "learning_rate": 5e-05, + "loss": 0.2393, + "loss/crossentropy": 2.7677061557769775, + "loss/hidden": 0.0, + "loss/logits": 0.23932693526148796, + "loss/reg": 0.26791101694107056, + "step": 3653 + }, + { + "epoch": 0.03654, + "grad_norm": 0.4580288529396057, + "grad_norm_var": 0.0025161635592839903, + "learning_rate": 5e-05, + "loss": 0.2166, + "loss/crossentropy": 2.6982569098472595, + "loss/hidden": 0.0, + "loss/logits": 0.21663112938404083, + "loss/reg": 0.26748159527778625, + "step": 3654 + }, + { + "epoch": 0.03655, + "grad_norm": 0.4460406005382538, + "grad_norm_var": 0.002502539715302158, + "learning_rate": 5e-05, + "loss": 0.2181, + "loss/crossentropy": 2.6475043892860413, + "loss/hidden": 0.0, + "loss/logits": 0.21805047243833542, + "loss/reg": 0.26722604036331177, + "step": 3655 + }, + { + "epoch": 0.03656, + "grad_norm": 0.41221126914024353, + "grad_norm_var": 0.002642091539747678, + "learning_rate": 5e-05, + "loss": 0.1934, + "loss/crossentropy": 2.848959743976593, + "loss/hidden": 0.0, + "loss/logits": 0.19336726516485214, + "loss/reg": 0.26693791151046753, + "step": 3656 + }, + { + "epoch": 0.03657, + "grad_norm": 0.49134907126426697, + "grad_norm_var": 0.0015854914587176546, + "learning_rate": 5e-05, + "loss": 0.2288, + "loss/crossentropy": 2.8573663234710693, + "loss/hidden": 0.0, + "loss/logits": 0.2287500910460949, + "loss/reg": 0.26660287380218506, + "step": 3657 + }, + { + "epoch": 0.03658, + "grad_norm": 0.5253614187240601, + "grad_norm_var": 0.0015491739523921392, + "learning_rate": 5e-05, + "loss": 0.2316, + "loss/crossentropy": 2.7873921990394592, + "loss/hidden": 0.0, + "loss/logits": 0.23164081946015358, + "loss/reg": 0.2665122449398041, + "step": 3658 + }, + { + "epoch": 0.03659, + "grad_norm": 0.47914209961891174, + "grad_norm_var": 0.0015088255043867608, + "learning_rate": 5e-05, + "loss": 0.2139, + "loss/crossentropy": 2.98260498046875, + "loss/hidden": 0.0, + "loss/logits": 0.2138897106051445, + "loss/reg": 0.2664526104927063, + "step": 3659 + }, + { + "epoch": 0.0366, + "grad_norm": 0.46370911598205566, + "grad_norm_var": 0.0011736907149239213, + "learning_rate": 5e-05, + "loss": 0.2143, + "loss/crossentropy": 2.8249927759170532, + "loss/hidden": 0.0, + "loss/logits": 0.2143365778028965, + "loss/reg": 0.2661987841129303, + "step": 3660 + }, + { + "epoch": 0.03661, + "grad_norm": 0.4675880968570709, + "grad_norm_var": 0.0011491614665180116, + "learning_rate": 5e-05, + "loss": 0.2235, + "loss/crossentropy": 2.7350924015045166, + "loss/hidden": 0.0, + "loss/logits": 0.22353879734873772, + "loss/reg": 0.26607581973075867, + "step": 3661 + }, + { + "epoch": 0.03662, + "grad_norm": 0.45968735218048096, + "grad_norm_var": 0.0011500793137130003, + "learning_rate": 5e-05, + "loss": 0.223, + "loss/crossentropy": 2.7700870633125305, + "loss/hidden": 0.0, + "loss/logits": 0.22302913293242455, + "loss/reg": 0.2660394012928009, + "step": 3662 + }, + { + "epoch": 0.03663, + "grad_norm": 0.4577554762363434, + "grad_norm_var": 0.0011601904982311913, + "learning_rate": 5e-05, + "loss": 0.2221, + "loss/crossentropy": 2.764958918094635, + "loss/hidden": 0.0, + "loss/logits": 0.22211208194494247, + "loss/reg": 0.2660435736179352, + "step": 3663 + }, + { + "epoch": 0.03664, + "grad_norm": 0.45585089921951294, + "grad_norm_var": 0.0006986754119015577, + "learning_rate": 5e-05, + "loss": 0.2247, + "loss/crossentropy": 2.774477183818817, + "loss/hidden": 0.0, + "loss/logits": 0.2247392050921917, + "loss/reg": 0.26596400141716003, + "step": 3664 + }, + { + "epoch": 0.03665, + "grad_norm": 0.5037508606910706, + "grad_norm_var": 0.0007272798570879739, + "learning_rate": 5e-05, + "loss": 0.2202, + "loss/crossentropy": 2.856147885322571, + "loss/hidden": 0.0, + "loss/logits": 0.22020234540104866, + "loss/reg": 0.26561811566352844, + "step": 3665 + }, + { + "epoch": 0.03666, + "grad_norm": 0.5131802558898926, + "grad_norm_var": 0.0007908246834608778, + "learning_rate": 5e-05, + "loss": 0.226, + "loss/crossentropy": 2.7833815217018127, + "loss/hidden": 0.0, + "loss/logits": 0.22597884759306908, + "loss/reg": 0.26534491777420044, + "step": 3666 + }, + { + "epoch": 0.03667, + "grad_norm": 0.5268526077270508, + "grad_norm_var": 0.0009738297688116256, + "learning_rate": 5e-05, + "loss": 0.2375, + "loss/crossentropy": 2.872670352458954, + "loss/hidden": 0.0, + "loss/logits": 0.23751704394817352, + "loss/reg": 0.2652420699596405, + "step": 3667 + }, + { + "epoch": 0.03668, + "grad_norm": 0.5515263080596924, + "grad_norm_var": 0.0013021829707280683, + "learning_rate": 5e-05, + "loss": 0.2353, + "loss/crossentropy": 3.017123758792877, + "loss/hidden": 0.0, + "loss/logits": 0.23534763231873512, + "loss/reg": 0.2649061381816864, + "step": 3668 + }, + { + "epoch": 0.03669, + "grad_norm": 0.4641459584236145, + "grad_norm_var": 0.0013036787174317628, + "learning_rate": 5e-05, + "loss": 0.2268, + "loss/crossentropy": 2.910761773586273, + "loss/hidden": 0.0, + "loss/logits": 0.22682759165763855, + "loss/reg": 0.2648044228553772, + "step": 3669 + }, + { + "epoch": 0.0367, + "grad_norm": 0.44602489471435547, + "grad_norm_var": 0.001347467984965774, + "learning_rate": 5e-05, + "loss": 0.2141, + "loss/crossentropy": 2.8910064697265625, + "loss/hidden": 0.0, + "loss/logits": 0.21414534747600555, + "loss/reg": 0.2645316421985626, + "step": 3670 + }, + { + "epoch": 0.03671, + "grad_norm": 0.4321902394294739, + "grad_norm_var": 0.0014203444744450308, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.802459418773651, + "loss/hidden": 0.0, + "loss/logits": 0.21006691828370094, + "loss/reg": 0.2642155587673187, + "step": 3671 + }, + { + "epoch": 0.03672, + "grad_norm": 0.47454068064689636, + "grad_norm_var": 0.0011152030328169323, + "learning_rate": 5e-05, + "loss": 0.2165, + "loss/crossentropy": 2.849172055721283, + "loss/hidden": 0.0, + "loss/logits": 0.21647870540618896, + "loss/reg": 0.26403406262397766, + "step": 3672 + }, + { + "epoch": 0.03673, + "grad_norm": 0.4421045184135437, + "grad_norm_var": 0.0012056506433096932, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.781054735183716, + "loss/hidden": 0.0, + "loss/logits": 0.2087702490389347, + "loss/reg": 0.2638676166534424, + "step": 3673 + }, + { + "epoch": 0.03674, + "grad_norm": 0.5616590976715088, + "grad_norm_var": 0.0015125488826583933, + "learning_rate": 5e-05, + "loss": 0.2625, + "loss/crossentropy": 2.8683714866638184, + "loss/hidden": 0.0, + "loss/logits": 0.2624559998512268, + "loss/reg": 0.26331305503845215, + "step": 3674 + }, + { + "epoch": 0.03675, + "grad_norm": 0.5115512609481812, + "grad_norm_var": 0.0015691660244003776, + "learning_rate": 5e-05, + "loss": 0.2271, + "loss/crossentropy": 2.8795764446258545, + "loss/hidden": 0.0, + "loss/logits": 0.2270837128162384, + "loss/reg": 0.26312321424484253, + "step": 3675 + }, + { + "epoch": 0.03676, + "grad_norm": 0.4702123999595642, + "grad_norm_var": 0.0015548589547777565, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.8324323296546936, + "loss/hidden": 0.0, + "loss/logits": 0.2091764323413372, + "loss/reg": 0.2628766894340515, + "step": 3676 + }, + { + "epoch": 0.03677, + "grad_norm": 0.473122239112854, + "grad_norm_var": 0.001544911090089407, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.8605101704597473, + "loss/hidden": 0.0, + "loss/logits": 0.2092110812664032, + "loss/reg": 0.2625032067298889, + "step": 3677 + }, + { + "epoch": 0.03678, + "grad_norm": 0.8172870874404907, + "grad_norm_var": 0.008377571019742014, + "learning_rate": 5e-05, + "loss": 0.2671, + "loss/crossentropy": 2.8208828568458557, + "loss/hidden": 0.0, + "loss/logits": 0.2670702412724495, + "loss/reg": 0.2621743977069855, + "step": 3678 + }, + { + "epoch": 0.03679, + "grad_norm": 0.45002228021621704, + "grad_norm_var": 0.00843142410331682, + "learning_rate": 5e-05, + "loss": 0.2159, + "loss/crossentropy": 2.8929147720336914, + "loss/hidden": 0.0, + "loss/logits": 0.21585796028375626, + "loss/reg": 0.2621217370033264, + "step": 3679 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4515419006347656, + "grad_norm_var": 0.008461325849254974, + "learning_rate": 5e-05, + "loss": 0.2158, + "loss/crossentropy": 2.7629511952400208, + "loss/hidden": 0.0, + "loss/logits": 0.2157776616513729, + "loss/reg": 0.2618427276611328, + "step": 3680 + }, + { + "epoch": 0.03681, + "grad_norm": 0.4298454523086548, + "grad_norm_var": 0.008820992297189944, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.908952295780182, + "loss/hidden": 0.0, + "loss/logits": 0.20324388146400452, + "loss/reg": 0.2616802155971527, + "step": 3681 + }, + { + "epoch": 0.03682, + "grad_norm": 0.44032371044158936, + "grad_norm_var": 0.00903430847767472, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.926515519618988, + "loss/hidden": 0.0, + "loss/logits": 0.1982731856405735, + "loss/reg": 0.26142629981040955, + "step": 3682 + }, + { + "epoch": 0.03683, + "grad_norm": 0.4640195369720459, + "grad_norm_var": 0.00902622235532506, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.702130436897278, + "loss/hidden": 0.0, + "loss/logits": 0.20882532000541687, + "loss/reg": 0.2610430419445038, + "step": 3683 + }, + { + "epoch": 0.03684, + "grad_norm": 0.47784459590911865, + "grad_norm_var": 0.008785718785942006, + "learning_rate": 5e-05, + "loss": 0.2213, + "loss/crossentropy": 2.718394100666046, + "loss/hidden": 0.0, + "loss/logits": 0.22125036641955376, + "loss/reg": 0.2607634663581848, + "step": 3684 + }, + { + "epoch": 0.03685, + "grad_norm": 0.5629860758781433, + "grad_norm_var": 0.00908322783801347, + "learning_rate": 5e-05, + "loss": 0.2286, + "loss/crossentropy": 2.875462770462036, + "loss/hidden": 0.0, + "loss/logits": 0.22856371849775314, + "loss/reg": 0.2605856955051422, + "step": 3685 + }, + { + "epoch": 0.03686, + "grad_norm": 0.4923284649848938, + "grad_norm_var": 0.00892054762224976, + "learning_rate": 5e-05, + "loss": 0.224, + "loss/crossentropy": 2.7778496146202087, + "loss/hidden": 0.0, + "loss/logits": 0.2240319550037384, + "loss/reg": 0.2603089511394501, + "step": 3686 + }, + { + "epoch": 0.03687, + "grad_norm": 0.4407700300216675, + "grad_norm_var": 0.008851037928212983, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.7389103770256042, + "loss/hidden": 0.0, + "loss/logits": 0.19928928837180138, + "loss/reg": 0.26005059480667114, + "step": 3687 + }, + { + "epoch": 0.03688, + "grad_norm": 0.4743127226829529, + "grad_norm_var": 0.008851739313314984, + "learning_rate": 5e-05, + "loss": 0.2327, + "loss/crossentropy": 2.8765321373939514, + "loss/hidden": 0.0, + "loss/logits": 0.2327064350247383, + "loss/reg": 0.2598707675933838, + "step": 3688 + }, + { + "epoch": 0.03689, + "grad_norm": 0.41637763381004333, + "grad_norm_var": 0.009083112059058514, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.8455315828323364, + "loss/hidden": 0.0, + "loss/logits": 0.193967055529356, + "loss/reg": 0.25980645418167114, + "step": 3689 + }, + { + "epoch": 0.0369, + "grad_norm": 0.42543286085128784, + "grad_norm_var": 0.00904832408120836, + "learning_rate": 5e-05, + "loss": 0.2085, + "loss/crossentropy": 2.678619146347046, + "loss/hidden": 0.0, + "loss/logits": 0.20849591121077538, + "loss/reg": 0.259416788816452, + "step": 3690 + }, + { + "epoch": 0.03691, + "grad_norm": 0.4200587868690491, + "grad_norm_var": 0.009276560926922939, + "learning_rate": 5e-05, + "loss": 0.1952, + "loss/crossentropy": 2.654641628265381, + "loss/hidden": 0.0, + "loss/logits": 0.19517119601368904, + "loss/reg": 0.2591559886932373, + "step": 3691 + }, + { + "epoch": 0.03692, + "grad_norm": 0.4682547152042389, + "grad_norm_var": 0.009279787354775118, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.9318991899490356, + "loss/hidden": 0.0, + "loss/logits": 0.20796247944235802, + "loss/reg": 0.25884363055229187, + "step": 3692 + }, + { + "epoch": 0.03693, + "grad_norm": 0.5327221751213074, + "grad_norm_var": 0.009434959390259682, + "learning_rate": 5e-05, + "loss": 0.2317, + "loss/crossentropy": 2.684523344039917, + "loss/hidden": 0.0, + "loss/logits": 0.23167557269334793, + "loss/reg": 0.25859346985816956, + "step": 3693 + }, + { + "epoch": 0.03694, + "grad_norm": 0.49869754910469055, + "grad_norm_var": 0.0016745328483392385, + "learning_rate": 5e-05, + "loss": 0.222, + "loss/crossentropy": 2.852847456932068, + "loss/hidden": 0.0, + "loss/logits": 0.22203634306788445, + "loss/reg": 0.2583184838294983, + "step": 3694 + }, + { + "epoch": 0.03695, + "grad_norm": 0.44983941316604614, + "grad_norm_var": 0.0016749085692822482, + "learning_rate": 5e-05, + "loss": 0.2124, + "loss/crossentropy": 2.807231903076172, + "loss/hidden": 0.0, + "loss/logits": 0.2124084047973156, + "loss/reg": 0.25804126262664795, + "step": 3695 + }, + { + "epoch": 0.03696, + "grad_norm": 0.4497048258781433, + "grad_norm_var": 0.0016784979573064166, + "learning_rate": 5e-05, + "loss": 0.2121, + "loss/crossentropy": 2.6767860054969788, + "loss/hidden": 0.0, + "loss/logits": 0.21208174526691437, + "loss/reg": 0.2579083740711212, + "step": 3696 + }, + { + "epoch": 0.03697, + "grad_norm": 0.42181602120399475, + "grad_norm_var": 0.0017203990112785394, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 2.8963430523872375, + "loss/hidden": 0.0, + "loss/logits": 0.19253386929631233, + "loss/reg": 0.25781750679016113, + "step": 3697 + }, + { + "epoch": 0.03698, + "grad_norm": 0.4319908022880554, + "grad_norm_var": 0.0017518423070619349, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.89365816116333, + "loss/hidden": 0.0, + "loss/logits": 0.20511019602417946, + "loss/reg": 0.2577417492866516, + "step": 3698 + }, + { + "epoch": 0.03699, + "grad_norm": 0.42873644828796387, + "grad_norm_var": 0.001830484425754501, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.7497430443763733, + "loss/hidden": 0.0, + "loss/logits": 0.20660752803087234, + "loss/reg": 0.2575954794883728, + "step": 3699 + }, + { + "epoch": 0.037, + "grad_norm": 0.4495640695095062, + "grad_norm_var": 0.0018206954832286552, + "learning_rate": 5e-05, + "loss": 0.2032, + "loss/crossentropy": 2.8328912258148193, + "loss/hidden": 0.0, + "loss/logits": 0.20321227982640266, + "loss/reg": 0.2576431632041931, + "step": 3700 + }, + { + "epoch": 0.03701, + "grad_norm": 0.488770455121994, + "grad_norm_var": 0.001148074696402664, + "learning_rate": 5e-05, + "loss": 0.2289, + "loss/crossentropy": 2.8800224661827087, + "loss/hidden": 0.0, + "loss/logits": 0.2289186455309391, + "loss/reg": 0.2574650049209595, + "step": 3701 + }, + { + "epoch": 0.03702, + "grad_norm": 0.4438477158546448, + "grad_norm_var": 0.0010574670660639404, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.7820246815681458, + "loss/hidden": 0.0, + "loss/logits": 0.20623688772320747, + "loss/reg": 0.2571657598018646, + "step": 3702 + }, + { + "epoch": 0.03703, + "grad_norm": 0.46059173345565796, + "grad_norm_var": 0.0010508742074623564, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.796632945537567, + "loss/hidden": 0.0, + "loss/logits": 0.2112884297966957, + "loss/reg": 0.25697728991508484, + "step": 3703 + }, + { + "epoch": 0.03704, + "grad_norm": 0.47417518496513367, + "grad_norm_var": 0.0010504991259433086, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 3.0615599155426025, + "loss/hidden": 0.0, + "loss/logits": 0.20785971358418465, + "loss/reg": 0.2567383646965027, + "step": 3704 + }, + { + "epoch": 0.03705, + "grad_norm": 0.4458358585834503, + "grad_norm_var": 0.0009578035280678502, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.806802213191986, + "loss/hidden": 0.0, + "loss/logits": 0.21017561480402946, + "loss/reg": 0.2564956545829773, + "step": 3705 + }, + { + "epoch": 0.03706, + "grad_norm": 0.5962980389595032, + "grad_norm_var": 0.002094592331455388, + "learning_rate": 5e-05, + "loss": 0.2631, + "loss/crossentropy": 3.0710458159446716, + "loss/hidden": 0.0, + "loss/logits": 0.2631177678704262, + "loss/reg": 0.25620976090431213, + "step": 3706 + }, + { + "epoch": 0.03707, + "grad_norm": 0.4416109323501587, + "grad_norm_var": 0.0019907249789150407, + "learning_rate": 5e-05, + "loss": 0.2064, + "loss/crossentropy": 2.7474347352981567, + "loss/hidden": 0.0, + "loss/logits": 0.20641466230154037, + "loss/reg": 0.2561103105545044, + "step": 3707 + }, + { + "epoch": 0.03708, + "grad_norm": 0.4542236328125, + "grad_norm_var": 0.0020019046663479014, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.8576775789260864, + "loss/hidden": 0.0, + "loss/logits": 0.21350213885307312, + "loss/reg": 0.25580233335494995, + "step": 3708 + }, + { + "epoch": 0.03709, + "grad_norm": 0.4328541159629822, + "grad_norm_var": 0.0017471416622280688, + "learning_rate": 5e-05, + "loss": 0.2056, + "loss/crossentropy": 2.788160741329193, + "loss/hidden": 0.0, + "loss/logits": 0.20560774952173233, + "loss/reg": 0.255576491355896, + "step": 3709 + }, + { + "epoch": 0.0371, + "grad_norm": 0.4923756420612335, + "grad_norm_var": 0.0017174713885732106, + "learning_rate": 5e-05, + "loss": 0.2252, + "loss/crossentropy": 2.876648426055908, + "loss/hidden": 0.0, + "loss/logits": 0.2251887023448944, + "loss/reg": 0.25538861751556396, + "step": 3710 + }, + { + "epoch": 0.03711, + "grad_norm": 0.4119583070278168, + "grad_norm_var": 0.001859182263008352, + "learning_rate": 5e-05, + "loss": 0.1975, + "loss/crossentropy": 2.746454179286957, + "loss/hidden": 0.0, + "loss/logits": 0.19747582077980042, + "loss/reg": 0.2551484704017639, + "step": 3711 + }, + { + "epoch": 0.03712, + "grad_norm": 0.439299613237381, + "grad_norm_var": 0.0018771412841447788, + "learning_rate": 5e-05, + "loss": 0.2124, + "loss/crossentropy": 2.6385587453842163, + "loss/hidden": 0.0, + "loss/logits": 0.21239455416798592, + "loss/reg": 0.25512099266052246, + "step": 3712 + }, + { + "epoch": 0.03713, + "grad_norm": 0.6211816072463989, + "grad_norm_var": 0.0034228054988814944, + "learning_rate": 5e-05, + "loss": 0.2416, + "loss/crossentropy": 2.933294117450714, + "loss/hidden": 0.0, + "loss/logits": 0.2415720857679844, + "loss/reg": 0.2550555169582367, + "step": 3713 + }, + { + "epoch": 0.03714, + "grad_norm": 0.5660223960876465, + "grad_norm_var": 0.003873794595094178, + "learning_rate": 5e-05, + "loss": 0.2283, + "loss/crossentropy": 2.8421451449394226, + "loss/hidden": 0.0, + "loss/logits": 0.228276327252388, + "loss/reg": 0.2549034357070923, + "step": 3714 + }, + { + "epoch": 0.03715, + "grad_norm": 0.4500696659088135, + "grad_norm_var": 0.0037622283652556337, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.8352553844451904, + "loss/hidden": 0.0, + "loss/logits": 0.20145705342292786, + "loss/reg": 0.2548215687274933, + "step": 3715 + }, + { + "epoch": 0.03716, + "grad_norm": 0.4699699282646179, + "grad_norm_var": 0.0037073689287088355, + "learning_rate": 5e-05, + "loss": 0.2119, + "loss/crossentropy": 2.8189589977264404, + "loss/hidden": 0.0, + "loss/logits": 0.21185483783483505, + "loss/reg": 0.25480562448501587, + "step": 3716 + }, + { + "epoch": 0.03717, + "grad_norm": 0.47919321060180664, + "grad_norm_var": 0.0037026271455472587, + "learning_rate": 5e-05, + "loss": 0.2319, + "loss/crossentropy": 2.702464520931244, + "loss/hidden": 0.0, + "loss/logits": 0.231886588037014, + "loss/reg": 0.2548554539680481, + "step": 3717 + }, + { + "epoch": 0.03718, + "grad_norm": 0.4404509663581848, + "grad_norm_var": 0.003719707693339939, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 2.7906147837638855, + "loss/hidden": 0.0, + "loss/logits": 0.21701078489422798, + "loss/reg": 0.25475841760635376, + "step": 3718 + }, + { + "epoch": 0.03719, + "grad_norm": 0.49743545055389404, + "grad_norm_var": 0.003710399981651883, + "learning_rate": 5e-05, + "loss": 0.2274, + "loss/crossentropy": 2.8841626048088074, + "loss/hidden": 0.0, + "loss/logits": 0.22738995403051376, + "loss/reg": 0.25482499599456787, + "step": 3719 + }, + { + "epoch": 0.0372, + "grad_norm": 0.46126675605773926, + "grad_norm_var": 0.003734384359796635, + "learning_rate": 5e-05, + "loss": 0.2251, + "loss/crossentropy": 2.7380823493003845, + "loss/hidden": 0.0, + "loss/logits": 0.2250608466565609, + "loss/reg": 0.25472524762153625, + "step": 3720 + }, + { + "epoch": 0.03721, + "grad_norm": 0.4460058808326721, + "grad_norm_var": 0.0037335832755742517, + "learning_rate": 5e-05, + "loss": 0.2059, + "loss/crossentropy": 2.975006401538849, + "loss/hidden": 0.0, + "loss/logits": 0.20589367672801018, + "loss/reg": 0.25484487414360046, + "step": 3721 + }, + { + "epoch": 0.03722, + "grad_norm": 0.4220404624938965, + "grad_norm_var": 0.002958687924025875, + "learning_rate": 5e-05, + "loss": 0.2102, + "loss/crossentropy": 2.725655198097229, + "loss/hidden": 0.0, + "loss/logits": 0.21015916392207146, + "loss/reg": 0.2548152506351471, + "step": 3722 + }, + { + "epoch": 0.03723, + "grad_norm": 0.42459216713905334, + "grad_norm_var": 0.0030420549679218184, + "learning_rate": 5e-05, + "loss": 0.2161, + "loss/crossentropy": 2.752350330352783, + "loss/hidden": 0.0, + "loss/logits": 0.21611294150352478, + "loss/reg": 0.25451046228408813, + "step": 3723 + }, + { + "epoch": 0.03724, + "grad_norm": 0.4409867525100708, + "grad_norm_var": 0.003079629869068971, + "learning_rate": 5e-05, + "loss": 0.2052, + "loss/crossentropy": 2.705482542514801, + "loss/hidden": 0.0, + "loss/logits": 0.20517781004309654, + "loss/reg": 0.25462427735328674, + "step": 3724 + }, + { + "epoch": 0.03725, + "grad_norm": 0.4429112374782562, + "grad_norm_var": 0.0030381770411390266, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.877839505672455, + "loss/hidden": 0.0, + "loss/logits": 0.20487242564558983, + "loss/reg": 0.25443145632743835, + "step": 3725 + }, + { + "epoch": 0.03726, + "grad_norm": 0.4843325614929199, + "grad_norm_var": 0.0030172699161667388, + "learning_rate": 5e-05, + "loss": 0.2314, + "loss/crossentropy": 2.884976387023926, + "loss/hidden": 0.0, + "loss/logits": 0.2314334250986576, + "loss/reg": 0.2543991506099701, + "step": 3726 + }, + { + "epoch": 0.03727, + "grad_norm": 0.44399428367614746, + "grad_norm_var": 0.002839439751260638, + "learning_rate": 5e-05, + "loss": 0.2234, + "loss/crossentropy": 2.63736754655838, + "loss/hidden": 0.0, + "loss/logits": 0.22344433143734932, + "loss/reg": 0.254031777381897, + "step": 3727 + }, + { + "epoch": 0.03728, + "grad_norm": 0.43622657656669617, + "grad_norm_var": 0.0028528588548367892, + "learning_rate": 5e-05, + "loss": 0.2225, + "loss/crossentropy": 2.8716397881507874, + "loss/hidden": 0.0, + "loss/logits": 0.22247420251369476, + "loss/reg": 0.253974974155426, + "step": 3728 + }, + { + "epoch": 0.03729, + "grad_norm": 0.44113075733184814, + "grad_norm_var": 0.0012596422415405114, + "learning_rate": 5e-05, + "loss": 0.2002, + "loss/crossentropy": 2.813425838947296, + "loss/hidden": 0.0, + "loss/logits": 0.20015795156359673, + "loss/reg": 0.253641277551651, + "step": 3729 + }, + { + "epoch": 0.0373, + "grad_norm": 0.4588140845298767, + "grad_norm_var": 0.00045051703936834034, + "learning_rate": 5e-05, + "loss": 0.2191, + "loss/crossentropy": 2.688777506351471, + "loss/hidden": 0.0, + "loss/logits": 0.2191067785024643, + "loss/reg": 0.2538132071495056, + "step": 3730 + }, + { + "epoch": 0.03731, + "grad_norm": 0.4334384799003601, + "grad_norm_var": 0.00047311327497467546, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.814151644706726, + "loss/hidden": 0.0, + "loss/logits": 0.21097059547901154, + "loss/reg": 0.25364983081817627, + "step": 3731 + }, + { + "epoch": 0.03732, + "grad_norm": 0.4663276672363281, + "grad_norm_var": 0.0004649360244587582, + "learning_rate": 5e-05, + "loss": 0.2226, + "loss/crossentropy": 2.7224910855293274, + "loss/hidden": 0.0, + "loss/logits": 0.22259847074747086, + "loss/reg": 0.2535631060600281, + "step": 3732 + }, + { + "epoch": 0.03733, + "grad_norm": 0.44616183638572693, + "grad_norm_var": 0.0004098262682474374, + "learning_rate": 5e-05, + "loss": 0.215, + "loss/crossentropy": 2.8783986568450928, + "loss/hidden": 0.0, + "loss/logits": 0.21497951075434685, + "loss/reg": 0.2539403736591339, + "step": 3733 + }, + { + "epoch": 0.03734, + "grad_norm": 0.4556655287742615, + "grad_norm_var": 0.0004066830359234164, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.9251001477241516, + "loss/hidden": 0.0, + "loss/logits": 0.208707295358181, + "loss/reg": 0.2542058229446411, + "step": 3734 + }, + { + "epoch": 0.03735, + "grad_norm": 0.4750877320766449, + "grad_norm_var": 0.0002968013830071059, + "learning_rate": 5e-05, + "loss": 0.2193, + "loss/crossentropy": 2.817636728286743, + "loss/hidden": 0.0, + "loss/logits": 0.21927836164832115, + "loss/reg": 0.2543424069881439, + "step": 3735 + }, + { + "epoch": 0.03736, + "grad_norm": 0.4622752368450165, + "grad_norm_var": 0.00029855655085514834, + "learning_rate": 5e-05, + "loss": 0.2024, + "loss/crossentropy": 2.7588168382644653, + "loss/hidden": 0.0, + "loss/logits": 0.2024042308330536, + "loss/reg": 0.25439441204071045, + "step": 3736 + }, + { + "epoch": 0.03737, + "grad_norm": 0.47738221287727356, + "grad_norm_var": 0.0003486084256801155, + "learning_rate": 5e-05, + "loss": 0.2254, + "loss/crossentropy": 2.8511962294578552, + "loss/hidden": 0.0, + "loss/logits": 0.2254251353442669, + "loss/reg": 0.2541787028312683, + "step": 3737 + }, + { + "epoch": 0.03738, + "grad_norm": 0.5131192207336426, + "grad_norm_var": 0.0005189033205176311, + "learning_rate": 5e-05, + "loss": 0.2136, + "loss/crossentropy": 2.7038198709487915, + "loss/hidden": 0.0, + "loss/logits": 0.2136346884071827, + "loss/reg": 0.2540472149848938, + "step": 3738 + }, + { + "epoch": 0.03739, + "grad_norm": 0.5254777669906616, + "grad_norm_var": 0.0007271230752673663, + "learning_rate": 5e-05, + "loss": 0.2358, + "loss/crossentropy": 2.853603959083557, + "loss/hidden": 0.0, + "loss/logits": 0.23582389950752258, + "loss/reg": 0.2538125813007355, + "step": 3739 + }, + { + "epoch": 0.0374, + "grad_norm": 0.4528425633907318, + "grad_norm_var": 0.0007015713018718395, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.769948959350586, + "loss/hidden": 0.0, + "loss/logits": 0.20828789845108986, + "loss/reg": 0.2536051869392395, + "step": 3740 + }, + { + "epoch": 0.03741, + "grad_norm": 0.44504672288894653, + "grad_norm_var": 0.0006960085081940903, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.8212124705314636, + "loss/hidden": 0.0, + "loss/logits": 0.20927158743143082, + "loss/reg": 0.2536773085594177, + "step": 3741 + }, + { + "epoch": 0.03742, + "grad_norm": 0.4591151177883148, + "grad_norm_var": 0.0006659856863321268, + "learning_rate": 5e-05, + "loss": 0.2128, + "loss/crossentropy": 2.849063813686371, + "loss/hidden": 0.0, + "loss/logits": 0.2127867043018341, + "loss/reg": 0.2533324956893921, + "step": 3742 + }, + { + "epoch": 0.03743, + "grad_norm": 0.4688289761543274, + "grad_norm_var": 0.0006448892244350153, + "learning_rate": 5e-05, + "loss": 0.2236, + "loss/crossentropy": 2.7431007027626038, + "loss/hidden": 0.0, + "loss/logits": 0.22355732321739197, + "loss/reg": 0.25298261642456055, + "step": 3743 + }, + { + "epoch": 0.03744, + "grad_norm": 0.4559742510318756, + "grad_norm_var": 0.0005972960623392217, + "learning_rate": 5e-05, + "loss": 0.217, + "loss/crossentropy": 3.0659831762313843, + "loss/hidden": 0.0, + "loss/logits": 0.21695949882268906, + "loss/reg": 0.25304290652275085, + "step": 3744 + }, + { + "epoch": 0.03745, + "grad_norm": 0.4603862166404724, + "grad_norm_var": 0.000559719018605694, + "learning_rate": 5e-05, + "loss": 0.2279, + "loss/crossentropy": 2.731625974178314, + "loss/hidden": 0.0, + "loss/logits": 0.22787205129861832, + "loss/reg": 0.2530505359172821, + "step": 3745 + }, + { + "epoch": 0.03746, + "grad_norm": 0.48302218317985535, + "grad_norm_var": 0.0005731630826417551, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.921384572982788, + "loss/hidden": 0.0, + "loss/logits": 0.20945870131254196, + "loss/reg": 0.25330305099487305, + "step": 3746 + }, + { + "epoch": 0.03747, + "grad_norm": 0.48163607716560364, + "grad_norm_var": 0.0004993990478168582, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.6829290986061096, + "loss/hidden": 0.0, + "loss/logits": 0.20871155336499214, + "loss/reg": 0.25332391262054443, + "step": 3747 + }, + { + "epoch": 0.03748, + "grad_norm": 0.46560269594192505, + "grad_norm_var": 0.0004998373166412014, + "learning_rate": 5e-05, + "loss": 0.2242, + "loss/crossentropy": 2.7269518971443176, + "loss/hidden": 0.0, + "loss/logits": 0.2241867482662201, + "loss/reg": 0.25338754057884216, + "step": 3748 + }, + { + "epoch": 0.03749, + "grad_norm": 0.4579211175441742, + "grad_norm_var": 0.0004703567646745399, + "learning_rate": 5e-05, + "loss": 0.2025, + "loss/crossentropy": 2.7872859239578247, + "loss/hidden": 0.0, + "loss/logits": 0.20252982899546623, + "loss/reg": 0.25387802720069885, + "step": 3749 + }, + { + "epoch": 0.0375, + "grad_norm": 0.4422069787979126, + "grad_norm_var": 0.0005095743382814081, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.8137415647506714, + "loss/hidden": 0.0, + "loss/logits": 0.2014608457684517, + "loss/reg": 0.25407373905181885, + "step": 3750 + }, + { + "epoch": 0.03751, + "grad_norm": 0.4747011363506317, + "grad_norm_var": 0.0005093405149568834, + "learning_rate": 5e-05, + "loss": 0.2163, + "loss/crossentropy": 2.8428375720977783, + "loss/hidden": 0.0, + "loss/logits": 0.21631651744246483, + "loss/reg": 0.2540430724620819, + "step": 3751 + }, + { + "epoch": 0.03752, + "grad_norm": 0.43153664469718933, + "grad_norm_var": 0.0006014728171441146, + "learning_rate": 5e-05, + "loss": 0.2097, + "loss/crossentropy": 2.982938528060913, + "loss/hidden": 0.0, + "loss/logits": 0.20974989235401154, + "loss/reg": 0.2542761564254761, + "step": 3752 + }, + { + "epoch": 0.03753, + "grad_norm": 0.4872390627861023, + "grad_norm_var": 0.0006193171570129478, + "learning_rate": 5e-05, + "loss": 0.2174, + "loss/crossentropy": 2.825524687767029, + "loss/hidden": 0.0, + "loss/logits": 0.21735894307494164, + "loss/reg": 0.2545720636844635, + "step": 3753 + }, + { + "epoch": 0.03754, + "grad_norm": 0.4437912106513977, + "grad_norm_var": 0.0005122685138102175, + "learning_rate": 5e-05, + "loss": 0.2093, + "loss/crossentropy": 2.7609708309173584, + "loss/hidden": 0.0, + "loss/logits": 0.20932863652706146, + "loss/reg": 0.25490623712539673, + "step": 3754 + }, + { + "epoch": 0.03755, + "grad_norm": 0.46327343583106995, + "grad_norm_var": 0.0002500860359608207, + "learning_rate": 5e-05, + "loss": 0.2109, + "loss/crossentropy": 2.920207381248474, + "loss/hidden": 0.0, + "loss/logits": 0.2108902633190155, + "loss/reg": 0.25514519214630127, + "step": 3755 + }, + { + "epoch": 0.03756, + "grad_norm": 0.46275192499160767, + "grad_norm_var": 0.00024568271496458276, + "learning_rate": 5e-05, + "loss": 0.2097, + "loss/crossentropy": 2.8299626111984253, + "loss/hidden": 0.0, + "loss/logits": 0.20974653586745262, + "loss/reg": 0.25523173809051514, + "step": 3756 + }, + { + "epoch": 0.03757, + "grad_norm": 0.4368641972541809, + "grad_norm_var": 0.0002677520181779395, + "learning_rate": 5e-05, + "loss": 0.2113, + "loss/crossentropy": 2.8057809472084045, + "loss/hidden": 0.0, + "loss/logits": 0.21131840348243713, + "loss/reg": 0.2551957368850708, + "step": 3757 + }, + { + "epoch": 0.03758, + "grad_norm": 0.4738249182701111, + "grad_norm_var": 0.00027771964393486317, + "learning_rate": 5e-05, + "loss": 0.2272, + "loss/crossentropy": 2.9929850697517395, + "loss/hidden": 0.0, + "loss/logits": 0.22724370285868645, + "loss/reg": 0.2550574541091919, + "step": 3758 + }, + { + "epoch": 0.03759, + "grad_norm": 0.48034486174583435, + "grad_norm_var": 0.00029672773908059706, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.7764168977737427, + "loss/hidden": 0.0, + "loss/logits": 0.20819029957056046, + "loss/reg": 0.2551427185535431, + "step": 3759 + }, + { + "epoch": 0.0376, + "grad_norm": 0.7950881719589233, + "grad_norm_var": 0.007186012172302218, + "learning_rate": 5e-05, + "loss": 0.2363, + "loss/crossentropy": 2.903538942337036, + "loss/hidden": 0.0, + "loss/logits": 0.23634663596749306, + "loss/reg": 0.2552259564399719, + "step": 3760 + }, + { + "epoch": 0.03761, + "grad_norm": 0.48777440190315247, + "grad_norm_var": 0.007147531777008931, + "learning_rate": 5e-05, + "loss": 0.2175, + "loss/crossentropy": 2.9171142578125, + "loss/hidden": 0.0, + "loss/logits": 0.21751128137111664, + "loss/reg": 0.25528132915496826, + "step": 3761 + }, + { + "epoch": 0.03762, + "grad_norm": 0.49855509400367737, + "grad_norm_var": 0.007157534032224966, + "learning_rate": 5e-05, + "loss": 0.248, + "loss/crossentropy": 2.843334972858429, + "loss/hidden": 0.0, + "loss/logits": 0.2479972317814827, + "loss/reg": 0.25530219078063965, + "step": 3762 + }, + { + "epoch": 0.03763, + "grad_norm": 0.4663482904434204, + "grad_norm_var": 0.007181942652090767, + "learning_rate": 5e-05, + "loss": 0.2184, + "loss/crossentropy": 2.806527256965637, + "loss/hidden": 0.0, + "loss/logits": 0.21838588640093803, + "loss/reg": 0.25537627935409546, + "step": 3763 + }, + { + "epoch": 0.03764, + "grad_norm": 0.4813302159309387, + "grad_norm_var": 0.007155700681028358, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.9456242322921753, + "loss/hidden": 0.0, + "loss/logits": 0.21639467030763626, + "loss/reg": 0.25550857186317444, + "step": 3764 + }, + { + "epoch": 0.03765, + "grad_norm": 0.5061232447624207, + "grad_norm_var": 0.007117421030660495, + "learning_rate": 5e-05, + "loss": 0.2398, + "loss/crossentropy": 2.811173439025879, + "loss/hidden": 0.0, + "loss/logits": 0.23982955887913704, + "loss/reg": 0.2552703320980072, + "step": 3765 + }, + { + "epoch": 0.03766, + "grad_norm": 0.4746570885181427, + "grad_norm_var": 0.006978678881106323, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.8836421370506287, + "loss/hidden": 0.0, + "loss/logits": 0.20921091735363007, + "loss/reg": 0.25541752576828003, + "step": 3766 + }, + { + "epoch": 0.03767, + "grad_norm": 0.5717188119888306, + "grad_norm_var": 0.007349485974515879, + "learning_rate": 5e-05, + "loss": 0.252, + "loss/crossentropy": 3.055665910243988, + "loss/hidden": 0.0, + "loss/logits": 0.2519707717001438, + "loss/reg": 0.2555775046348572, + "step": 3767 + }, + { + "epoch": 0.03768, + "grad_norm": 0.5360286235809326, + "grad_norm_var": 0.0071118142499013385, + "learning_rate": 5e-05, + "loss": 0.2104, + "loss/crossentropy": 2.7472686171531677, + "loss/hidden": 0.0, + "loss/logits": 0.21039827167987823, + "loss/reg": 0.2554837465286255, + "step": 3768 + }, + { + "epoch": 0.03769, + "grad_norm": 0.5434842705726624, + "grad_norm_var": 0.007183034982532688, + "learning_rate": 5e-05, + "loss": 0.238, + "loss/crossentropy": 2.640752911567688, + "loss/hidden": 0.0, + "loss/logits": 0.23801813274621964, + "loss/reg": 0.25557440519332886, + "step": 3769 + }, + { + "epoch": 0.0377, + "grad_norm": 0.5136911273002625, + "grad_norm_var": 0.006893503692808548, + "learning_rate": 5e-05, + "loss": 0.2361, + "loss/crossentropy": 2.741530120372772, + "loss/hidden": 0.0, + "loss/logits": 0.23609915748238564, + "loss/reg": 0.2554108798503876, + "step": 3770 + }, + { + "epoch": 0.03771, + "grad_norm": 0.47689148783683777, + "grad_norm_var": 0.006816635654563241, + "learning_rate": 5e-05, + "loss": 0.2157, + "loss/crossentropy": 2.8431026339530945, + "loss/hidden": 0.0, + "loss/logits": 0.21574878692626953, + "loss/reg": 0.25572720170021057, + "step": 3771 + }, + { + "epoch": 0.03772, + "grad_norm": 0.5376308560371399, + "grad_norm_var": 0.006666968927397809, + "learning_rate": 5e-05, + "loss": 0.2302, + "loss/crossentropy": 2.9191681146621704, + "loss/hidden": 0.0, + "loss/logits": 0.23019658401608467, + "loss/reg": 0.2557709813117981, + "step": 3772 + }, + { + "epoch": 0.03773, + "grad_norm": 0.5739825367927551, + "grad_norm_var": 0.006367432818298957, + "learning_rate": 5e-05, + "loss": 0.2278, + "loss/crossentropy": 2.7959272861480713, + "loss/hidden": 0.0, + "loss/logits": 0.22782022133469582, + "loss/reg": 0.2556956112384796, + "step": 3773 + }, + { + "epoch": 0.03774, + "grad_norm": 0.6689357757568359, + "grad_norm_var": 0.007386978298480662, + "learning_rate": 5e-05, + "loss": 0.2615, + "loss/crossentropy": 2.8235539197921753, + "loss/hidden": 0.0, + "loss/logits": 0.26148612797260284, + "loss/reg": 0.2559046745300293, + "step": 3774 + }, + { + "epoch": 0.03775, + "grad_norm": 0.46428194642066956, + "grad_norm_var": 0.007527199375405665, + "learning_rate": 5e-05, + "loss": 0.2163, + "loss/crossentropy": 2.7708794474601746, + "loss/hidden": 0.0, + "loss/logits": 0.21629633754491806, + "loss/reg": 0.256122350692749, + "step": 3775 + }, + { + "epoch": 0.03776, + "grad_norm": 0.4610794484615326, + "grad_norm_var": 0.0030185732097321337, + "learning_rate": 5e-05, + "loss": 0.2107, + "loss/crossentropy": 2.777141273021698, + "loss/hidden": 0.0, + "loss/logits": 0.21071505546569824, + "loss/reg": 0.25618964433670044, + "step": 3776 + }, + { + "epoch": 0.03777, + "grad_norm": 0.4443977177143097, + "grad_norm_var": 0.003301767985228787, + "learning_rate": 5e-05, + "loss": 0.2029, + "loss/crossentropy": 2.812096118927002, + "loss/hidden": 0.0, + "loss/logits": 0.20294029265642166, + "loss/reg": 0.25662654638290405, + "step": 3777 + }, + { + "epoch": 0.03778, + "grad_norm": 0.5494642853736877, + "grad_norm_var": 0.0033609770307037306, + "learning_rate": 5e-05, + "loss": 0.2271, + "loss/crossentropy": 3.047999858856201, + "loss/hidden": 0.0, + "loss/logits": 0.22713027149438858, + "loss/reg": 0.2566857635974884, + "step": 3778 + }, + { + "epoch": 0.03779, + "grad_norm": 0.5248891711235046, + "grad_norm_var": 0.00318076064902405, + "learning_rate": 5e-05, + "loss": 0.2763, + "loss/crossentropy": 2.894843816757202, + "loss/hidden": 0.0, + "loss/logits": 0.2762502208352089, + "loss/reg": 0.2567864656448364, + "step": 3779 + }, + { + "epoch": 0.0378, + "grad_norm": 0.4835475981235504, + "grad_norm_var": 0.0031694765243321043, + "learning_rate": 5e-05, + "loss": 0.2286, + "loss/crossentropy": 2.828524649143219, + "loss/hidden": 0.0, + "loss/logits": 0.22864408418536186, + "loss/reg": 0.25724467635154724, + "step": 3780 + }, + { + "epoch": 0.03781, + "grad_norm": 0.4422677755355835, + "grad_norm_var": 0.003548218261609041, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.745059013366699, + "loss/hidden": 0.0, + "loss/logits": 0.20878632739186287, + "loss/reg": 0.2572964131832123, + "step": 3781 + }, + { + "epoch": 0.03782, + "grad_norm": 0.5908615589141846, + "grad_norm_var": 0.00374101931840685, + "learning_rate": 5e-05, + "loss": 0.2328, + "loss/crossentropy": 2.8742793202400208, + "loss/hidden": 0.0, + "loss/logits": 0.2328467257320881, + "loss/reg": 0.2575393617153168, + "step": 3782 + }, + { + "epoch": 0.03783, + "grad_norm": 0.5759640336036682, + "grad_norm_var": 0.0037691859096674646, + "learning_rate": 5e-05, + "loss": 0.2417, + "loss/crossentropy": 2.7561416029930115, + "loss/hidden": 0.0, + "loss/logits": 0.24169990047812462, + "loss/reg": 0.2572792172431946, + "step": 3783 + }, + { + "epoch": 0.03784, + "grad_norm": 0.4410279393196106, + "grad_norm_var": 0.004183583143964672, + "learning_rate": 5e-05, + "loss": 0.1895, + "loss/crossentropy": 2.892998516559601, + "loss/hidden": 0.0, + "loss/logits": 0.18946704640984535, + "loss/reg": 0.2573431134223938, + "step": 3784 + }, + { + "epoch": 0.03785, + "grad_norm": 0.46788281202316284, + "grad_norm_var": 0.004286691020657415, + "learning_rate": 5e-05, + "loss": 0.2363, + "loss/crossentropy": 2.8443214893341064, + "loss/hidden": 0.0, + "loss/logits": 0.23633148148655891, + "loss/reg": 0.25732988119125366, + "step": 3785 + }, + { + "epoch": 0.03786, + "grad_norm": 0.50051349401474, + "grad_norm_var": 0.004297295752140355, + "learning_rate": 5e-05, + "loss": 0.2151, + "loss/crossentropy": 2.8573893904685974, + "loss/hidden": 0.0, + "loss/logits": 0.2150929532945156, + "loss/reg": 0.2573558986186981, + "step": 3786 + }, + { + "epoch": 0.03787, + "grad_norm": 0.4325128495693207, + "grad_norm_var": 0.004632426371358215, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.896679699420929, + "loss/hidden": 0.0, + "loss/logits": 0.20951822772622108, + "loss/reg": 0.25735408067703247, + "step": 3787 + }, + { + "epoch": 0.03788, + "grad_norm": 0.9549630880355835, + "grad_norm_var": 0.017057956743093808, + "learning_rate": 5e-05, + "loss": 0.2415, + "loss/crossentropy": 2.9087589979171753, + "loss/hidden": 0.0, + "loss/logits": 0.2415159046649933, + "loss/reg": 0.2577936053276062, + "step": 3788 + }, + { + "epoch": 0.03789, + "grad_norm": 0.5315117835998535, + "grad_norm_var": 0.01695580824327495, + "learning_rate": 5e-05, + "loss": 0.2349, + "loss/crossentropy": 2.822271227836609, + "loss/hidden": 0.0, + "loss/logits": 0.23490264266729355, + "loss/reg": 0.25780847668647766, + "step": 3789 + }, + { + "epoch": 0.0379, + "grad_norm": 0.4867192506790161, + "grad_norm_var": 0.015737619106708466, + "learning_rate": 5e-05, + "loss": 0.2187, + "loss/crossentropy": 2.7458943724632263, + "loss/hidden": 0.0, + "loss/logits": 0.21872223913669586, + "loss/reg": 0.25779151916503906, + "step": 3790 + }, + { + "epoch": 0.03791, + "grad_norm": 0.4815129339694977, + "grad_norm_var": 0.01562358713017723, + "learning_rate": 5e-05, + "loss": 0.2165, + "loss/crossentropy": 2.827815890312195, + "loss/hidden": 0.0, + "loss/logits": 0.2165384292602539, + "loss/reg": 0.2582205832004547, + "step": 3791 + }, + { + "epoch": 0.03792, + "grad_norm": 0.47639232873916626, + "grad_norm_var": 0.015511675756802805, + "learning_rate": 5e-05, + "loss": 0.2117, + "loss/crossentropy": 3.0117987394332886, + "loss/hidden": 0.0, + "loss/logits": 0.2116827517747879, + "loss/reg": 0.2582623064517975, + "step": 3792 + }, + { + "epoch": 0.03793, + "grad_norm": 0.5095559358596802, + "grad_norm_var": 0.015085226892777288, + "learning_rate": 5e-05, + "loss": 0.206, + "loss/crossentropy": 2.7606024146080017, + "loss/hidden": 0.0, + "loss/logits": 0.2060089334845543, + "loss/reg": 0.25828999280929565, + "step": 3793 + }, + { + "epoch": 0.03794, + "grad_norm": 0.4798007309436798, + "grad_norm_var": 0.015190090820517794, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.8686147928237915, + "loss/hidden": 0.0, + "loss/logits": 0.21008682996034622, + "loss/reg": 0.2581632435321808, + "step": 3794 + }, + { + "epoch": 0.03795, + "grad_norm": 0.4851318299770355, + "grad_norm_var": 0.015282817067609703, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.7739294171333313, + "loss/hidden": 0.0, + "loss/logits": 0.2077014148235321, + "loss/reg": 0.25801289081573486, + "step": 3795 + }, + { + "epoch": 0.03796, + "grad_norm": 0.4855894148349762, + "grad_norm_var": 0.015272810621640754, + "learning_rate": 5e-05, + "loss": 0.2115, + "loss/crossentropy": 2.875756800174713, + "loss/hidden": 0.0, + "loss/logits": 0.21149547025561333, + "loss/reg": 0.2581404149532318, + "step": 3796 + }, + { + "epoch": 0.03797, + "grad_norm": 0.4923122823238373, + "grad_norm_var": 0.014901401331745712, + "learning_rate": 5e-05, + "loss": 0.2043, + "loss/crossentropy": 2.752384126186371, + "loss/hidden": 0.0, + "loss/logits": 0.20430465042591095, + "loss/reg": 0.2583819627761841, + "step": 3797 + }, + { + "epoch": 0.03798, + "grad_norm": 0.4924747347831726, + "grad_norm_var": 0.014636059040035931, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.721092462539673, + "loss/hidden": 0.0, + "loss/logits": 0.21006473153829575, + "loss/reg": 0.258209228515625, + "step": 3798 + }, + { + "epoch": 0.03799, + "grad_norm": 0.5496509671211243, + "grad_norm_var": 0.01447725725056479, + "learning_rate": 5e-05, + "loss": 0.2041, + "loss/crossentropy": 2.801886260509491, + "loss/hidden": 0.0, + "loss/logits": 0.20410484820604324, + "loss/reg": 0.2579821050167084, + "step": 3799 + }, + { + "epoch": 0.038, + "grad_norm": 0.4637112617492676, + "grad_norm_var": 0.01428048312805097, + "learning_rate": 5e-05, + "loss": 0.2078, + "loss/crossentropy": 2.9305243492126465, + "loss/hidden": 0.0, + "loss/logits": 0.20783938840031624, + "loss/reg": 0.25779542326927185, + "step": 3800 + }, + { + "epoch": 0.03801, + "grad_norm": 0.4821458160877228, + "grad_norm_var": 0.014197622422810651, + "learning_rate": 5e-05, + "loss": 0.2348, + "loss/crossentropy": 2.840073883533478, + "loss/hidden": 0.0, + "loss/logits": 0.2348450906574726, + "loss/reg": 0.2575339376926422, + "step": 3801 + }, + { + "epoch": 0.03802, + "grad_norm": 0.4477142095565796, + "grad_norm_var": 0.01450222036471983, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.7766947746276855, + "loss/hidden": 0.0, + "loss/logits": 0.21347364410758018, + "loss/reg": 0.2574549913406372, + "step": 3802 + }, + { + "epoch": 0.03803, + "grad_norm": 0.4773425757884979, + "grad_norm_var": 0.014130406023235731, + "learning_rate": 5e-05, + "loss": 0.2205, + "loss/crossentropy": 2.875381052494049, + "loss/hidden": 0.0, + "loss/logits": 0.22051294147968292, + "loss/reg": 0.2572784423828125, + "step": 3803 + }, + { + "epoch": 0.03804, + "grad_norm": 0.4418962001800537, + "grad_norm_var": 0.0007270526067172711, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.76113224029541, + "loss/hidden": 0.0, + "loss/logits": 0.21000144630670547, + "loss/reg": 0.257037490606308, + "step": 3804 + }, + { + "epoch": 0.03805, + "grad_norm": 0.4536399841308594, + "grad_norm_var": 0.0006383515749379343, + "learning_rate": 5e-05, + "loss": 0.2091, + "loss/crossentropy": 2.674719989299774, + "loss/hidden": 0.0, + "loss/logits": 0.2090994156897068, + "loss/reg": 0.25666505098342896, + "step": 3805 + }, + { + "epoch": 0.03806, + "grad_norm": 0.43385013937950134, + "grad_norm_var": 0.0007769571056444935, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.8785808086395264, + "loss/hidden": 0.0, + "loss/logits": 0.2100531868636608, + "loss/reg": 0.25674352049827576, + "step": 3806 + }, + { + "epoch": 0.03807, + "grad_norm": 0.4417382478713989, + "grad_norm_var": 0.0008587685131780062, + "learning_rate": 5e-05, + "loss": 0.2116, + "loss/crossentropy": 2.803421974182129, + "loss/hidden": 0.0, + "loss/logits": 0.21164898946881294, + "loss/reg": 0.2567269206047058, + "step": 3807 + }, + { + "epoch": 0.03808, + "grad_norm": 0.658728301525116, + "grad_norm_var": 0.002950846483239285, + "learning_rate": 5e-05, + "loss": 0.2304, + "loss/crossentropy": 2.858955144882202, + "loss/hidden": 0.0, + "loss/logits": 0.23040905222296715, + "loss/reg": 0.2564926743507385, + "step": 3808 + }, + { + "epoch": 0.03809, + "grad_norm": 0.45412248373031616, + "grad_norm_var": 0.0029777033206645855, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.883843719959259, + "loss/hidden": 0.0, + "loss/logits": 0.19605745747685432, + "loss/reg": 0.25639957189559937, + "step": 3809 + }, + { + "epoch": 0.0381, + "grad_norm": 0.46778276562690735, + "grad_norm_var": 0.0029930434747034123, + "learning_rate": 5e-05, + "loss": 0.2226, + "loss/crossentropy": 2.8281607627868652, + "loss/hidden": 0.0, + "loss/logits": 0.22262327373027802, + "loss/reg": 0.2563241720199585, + "step": 3810 + }, + { + "epoch": 0.03811, + "grad_norm": 0.5328463315963745, + "grad_norm_var": 0.0031489652519717617, + "learning_rate": 5e-05, + "loss": 0.2279, + "loss/crossentropy": 2.793286681175232, + "loss/hidden": 0.0, + "loss/logits": 0.22792084142565727, + "loss/reg": 0.2556760311126709, + "step": 3811 + }, + { + "epoch": 0.03812, + "grad_norm": 0.4541209936141968, + "grad_norm_var": 0.0032124601968501567, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.937571406364441, + "loss/hidden": 0.0, + "loss/logits": 0.20982392504811287, + "loss/reg": 0.25559690594673157, + "step": 3812 + }, + { + "epoch": 0.03813, + "grad_norm": 0.47385188937187195, + "grad_norm_var": 0.003213311486336971, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.639613628387451, + "loss/hidden": 0.0, + "loss/logits": 0.21416950598359108, + "loss/reg": 0.25537216663360596, + "step": 3813 + }, + { + "epoch": 0.03814, + "grad_norm": 0.45707860589027405, + "grad_norm_var": 0.0032461980628100276, + "learning_rate": 5e-05, + "loss": 0.2186, + "loss/crossentropy": 2.786936342716217, + "loss/hidden": 0.0, + "loss/logits": 0.2185552939772606, + "loss/reg": 0.25475040078163147, + "step": 3814 + }, + { + "epoch": 0.03815, + "grad_norm": 0.5764497518539429, + "grad_norm_var": 0.0035376762924877645, + "learning_rate": 5e-05, + "loss": 0.2436, + "loss/crossentropy": 2.7970526814460754, + "loss/hidden": 0.0, + "loss/logits": 0.24355681240558624, + "loss/reg": 0.25465065240859985, + "step": 3815 + }, + { + "epoch": 0.03816, + "grad_norm": 0.46506041288375854, + "grad_norm_var": 0.0035344437180896047, + "learning_rate": 5e-05, + "loss": 0.2175, + "loss/crossentropy": 2.795070707798004, + "loss/hidden": 0.0, + "loss/logits": 0.21746884286403656, + "loss/reg": 0.2542681396007538, + "step": 3816 + }, + { + "epoch": 0.03817, + "grad_norm": 0.4720771610736847, + "grad_norm_var": 0.003541118444656262, + "learning_rate": 5e-05, + "loss": 0.2258, + "loss/crossentropy": 2.7562968730926514, + "loss/hidden": 0.0, + "loss/logits": 0.22581356763839722, + "loss/reg": 0.253933310508728, + "step": 3817 + }, + { + "epoch": 0.03818, + "grad_norm": 0.4409226179122925, + "grad_norm_var": 0.0035748392426764603, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.7795788049697876, + "loss/hidden": 0.0, + "loss/logits": 0.19491446390748024, + "loss/reg": 0.25382328033447266, + "step": 3818 + }, + { + "epoch": 0.03819, + "grad_norm": 0.48809221386909485, + "grad_norm_var": 0.0035763258312488944, + "learning_rate": 5e-05, + "loss": 0.2307, + "loss/crossentropy": 2.6211600303649902, + "loss/hidden": 0.0, + "loss/logits": 0.23074542731046677, + "loss/reg": 0.2533603012561798, + "step": 3819 + }, + { + "epoch": 0.0382, + "grad_norm": 0.42368847131729126, + "grad_norm_var": 0.0036944449618110387, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.8866859674453735, + "loss/hidden": 0.0, + "loss/logits": 0.21000991389155388, + "loss/reg": 0.25314196944236755, + "step": 3820 + }, + { + "epoch": 0.03821, + "grad_norm": 0.45670294761657715, + "grad_norm_var": 0.003683907387286907, + "learning_rate": 5e-05, + "loss": 0.214, + "loss/crossentropy": 2.8535776138305664, + "loss/hidden": 0.0, + "loss/logits": 0.21400442719459534, + "loss/reg": 0.2532041370868683, + "step": 3821 + }, + { + "epoch": 0.03822, + "grad_norm": 1.0551466941833496, + "grad_norm_var": 0.02389785839473107, + "learning_rate": 5e-05, + "loss": 0.2476, + "loss/crossentropy": 2.7279049158096313, + "loss/hidden": 0.0, + "loss/logits": 0.24761522933840752, + "loss/reg": 0.25282353162765503, + "step": 3822 + }, + { + "epoch": 0.03823, + "grad_norm": 0.47388118505477905, + "grad_norm_var": 0.0236274489994661, + "learning_rate": 5e-05, + "loss": 0.2207, + "loss/crossentropy": 2.826627731323242, + "loss/hidden": 0.0, + "loss/logits": 0.2206873558461666, + "loss/reg": 0.2526404559612274, + "step": 3823 + }, + { + "epoch": 0.03824, + "grad_norm": 0.5150571465492249, + "grad_norm_var": 0.02229661550632918, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.8266612887382507, + "loss/hidden": 0.0, + "loss/logits": 0.21261855587363243, + "loss/reg": 0.25252002477645874, + "step": 3824 + }, + { + "epoch": 0.03825, + "grad_norm": 0.5419411063194275, + "grad_norm_var": 0.022090035176982867, + "learning_rate": 5e-05, + "loss": 0.2189, + "loss/crossentropy": 2.656123697757721, + "loss/hidden": 0.0, + "loss/logits": 0.21891948580741882, + "loss/reg": 0.2526087760925293, + "step": 3825 + }, + { + "epoch": 0.03826, + "grad_norm": 0.4845564365386963, + "grad_norm_var": 0.0219943730429368, + "learning_rate": 5e-05, + "loss": 0.2118, + "loss/crossentropy": 2.836141288280487, + "loss/hidden": 0.0, + "loss/logits": 0.21182826906442642, + "loss/reg": 0.2528057396411896, + "step": 3826 + }, + { + "epoch": 0.03827, + "grad_norm": 0.5258753895759583, + "grad_norm_var": 0.02198497474485349, + "learning_rate": 5e-05, + "loss": 0.1974, + "loss/crossentropy": 2.9693310260772705, + "loss/hidden": 0.0, + "loss/logits": 0.19743412733078003, + "loss/reg": 0.2527637779712677, + "step": 3827 + }, + { + "epoch": 0.03828, + "grad_norm": 0.5640915632247925, + "grad_norm_var": 0.021789054977777042, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.830752968788147, + "loss/hidden": 0.0, + "loss/logits": 0.20876751095056534, + "loss/reg": 0.2529897391796112, + "step": 3828 + }, + { + "epoch": 0.03829, + "grad_norm": 0.5758481621742249, + "grad_norm_var": 0.02173136646618152, + "learning_rate": 5e-05, + "loss": 0.2273, + "loss/crossentropy": 2.9311159253120422, + "loss/hidden": 0.0, + "loss/logits": 0.22730537876486778, + "loss/reg": 0.25301939249038696, + "step": 3829 + }, + { + "epoch": 0.0383, + "grad_norm": 0.4483696222305298, + "grad_norm_var": 0.021823429825483544, + "learning_rate": 5e-05, + "loss": 0.2193, + "loss/crossentropy": 2.8757064938545227, + "loss/hidden": 0.0, + "loss/logits": 0.2193061076104641, + "loss/reg": 0.2530243992805481, + "step": 3830 + }, + { + "epoch": 0.03831, + "grad_norm": 0.4765346646308899, + "grad_norm_var": 0.021851679156879068, + "learning_rate": 5e-05, + "loss": 0.2159, + "loss/crossentropy": 2.765141725540161, + "loss/hidden": 0.0, + "loss/logits": 0.21594956144690514, + "loss/reg": 0.2531888782978058, + "step": 3831 + }, + { + "epoch": 0.03832, + "grad_norm": 0.5312706828117371, + "grad_norm_var": 0.021592188879688463, + "learning_rate": 5e-05, + "loss": 0.2293, + "loss/crossentropy": 2.8394598960876465, + "loss/hidden": 0.0, + "loss/logits": 0.22932486981153488, + "loss/reg": 0.2532878518104553, + "step": 3832 + }, + { + "epoch": 0.03833, + "grad_norm": 0.4553411602973938, + "grad_norm_var": 0.0217381186467823, + "learning_rate": 5e-05, + "loss": 0.2137, + "loss/crossentropy": 2.8754897117614746, + "loss/hidden": 0.0, + "loss/logits": 0.21367378532886505, + "loss/reg": 0.2532351613044739, + "step": 3833 + }, + { + "epoch": 0.03834, + "grad_norm": 0.4579719305038452, + "grad_norm_var": 0.021557013981359167, + "learning_rate": 5e-05, + "loss": 0.2333, + "loss/crossentropy": 2.785922646522522, + "loss/hidden": 0.0, + "loss/logits": 0.23326639831066132, + "loss/reg": 0.25349098443984985, + "step": 3834 + }, + { + "epoch": 0.03835, + "grad_norm": 5.656593322753906, + "grad_norm_var": 1.6625072031514254, + "learning_rate": 5e-05, + "loss": 0.427, + "loss/crossentropy": 3.025012791156769, + "loss/hidden": 0.0, + "loss/logits": 0.4269937202334404, + "loss/reg": 0.2534947693347931, + "step": 3835 + }, + { + "epoch": 0.03836, + "grad_norm": 0.5433942079544067, + "grad_norm_var": 1.6565557711768624, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.7992900609970093, + "loss/hidden": 0.0, + "loss/logits": 0.20664867013692856, + "loss/reg": 0.2533682584762573, + "step": 3836 + }, + { + "epoch": 0.03837, + "grad_norm": 0.5409733057022095, + "grad_norm_var": 1.6524663404093256, + "learning_rate": 5e-05, + "loss": 0.2138, + "loss/crossentropy": 2.797449052333832, + "loss/hidden": 0.0, + "loss/logits": 0.21379457786679268, + "loss/reg": 0.25352224707603455, + "step": 3837 + }, + { + "epoch": 0.03838, + "grad_norm": 0.534355640411377, + "grad_norm_var": 1.6562439520325172, + "learning_rate": 5e-05, + "loss": 0.2315, + "loss/crossentropy": 2.772941291332245, + "loss/hidden": 0.0, + "loss/logits": 0.23147650435566902, + "loss/reg": 0.2533625662326813, + "step": 3838 + }, + { + "epoch": 0.03839, + "grad_norm": 0.5117605328559875, + "grad_norm_var": 1.6545204854401063, + "learning_rate": 5e-05, + "loss": 0.2353, + "loss/crossentropy": 2.8165289163589478, + "loss/hidden": 0.0, + "loss/logits": 0.23532608151435852, + "loss/reg": 0.2530653476715088, + "step": 3839 + }, + { + "epoch": 0.0384, + "grad_norm": 0.516423761844635, + "grad_norm_var": 1.6544622588489861, + "learning_rate": 5e-05, + "loss": 0.2202, + "loss/crossentropy": 2.723934829235077, + "loss/hidden": 0.0, + "loss/logits": 0.2201736569404602, + "loss/reg": 0.25318798422813416, + "step": 3840 + }, + { + "epoch": 0.03841, + "grad_norm": 0.8466431498527527, + "grad_norm_var": 1.6483454200134364, + "learning_rate": 5e-05, + "loss": 0.2676, + "loss/crossentropy": 2.9478878378868103, + "loss/hidden": 0.0, + "loss/logits": 0.26764898374676704, + "loss/reg": 0.25297120213508606, + "step": 3841 + }, + { + "epoch": 0.03842, + "grad_norm": 0.4475407898426056, + "grad_norm_var": 1.6502562657561868, + "learning_rate": 5e-05, + "loss": 0.2066, + "loss/crossentropy": 2.8472391963005066, + "loss/hidden": 0.0, + "loss/logits": 0.20658797398209572, + "loss/reg": 0.2525861859321594, + "step": 3842 + }, + { + "epoch": 0.03843, + "grad_norm": 0.4677143394947052, + "grad_norm_var": 1.6529971971013426, + "learning_rate": 5e-05, + "loss": 0.2252, + "loss/crossentropy": 2.9182642102241516, + "loss/hidden": 0.0, + "loss/logits": 0.2251940406858921, + "loss/reg": 0.25255918502807617, + "step": 3843 + }, + { + "epoch": 0.03844, + "grad_norm": 0.4726514518260956, + "grad_norm_var": 1.6569863959085473, + "learning_rate": 5e-05, + "loss": 0.2012, + "loss/crossentropy": 2.7663196325302124, + "loss/hidden": 0.0, + "loss/logits": 0.20124061033129692, + "loss/reg": 0.25245600938796997, + "step": 3844 + }, + { + "epoch": 0.03845, + "grad_norm": 0.47108012437820435, + "grad_norm_var": 1.6614002527486222, + "learning_rate": 5e-05, + "loss": 0.2103, + "loss/crossentropy": 2.863184869289398, + "loss/hidden": 0.0, + "loss/logits": 0.21026422828435898, + "loss/reg": 0.25213998556137085, + "step": 3845 + }, + { + "epoch": 0.03846, + "grad_norm": 0.44843316078186035, + "grad_norm_var": 1.6613969676846931, + "learning_rate": 5e-05, + "loss": 0.2217, + "loss/crossentropy": 2.833566665649414, + "loss/hidden": 0.0, + "loss/logits": 0.22165338695049286, + "loss/reg": 0.2519301474094391, + "step": 3846 + }, + { + "epoch": 0.03847, + "grad_norm": 0.4591122567653656, + "grad_norm_var": 1.6622513619349781, + "learning_rate": 5e-05, + "loss": 0.2238, + "loss/crossentropy": 3.005862593650818, + "loss/hidden": 0.0, + "loss/logits": 0.22384252399206161, + "loss/reg": 0.2517870366573334, + "step": 3847 + }, + { + "epoch": 0.03848, + "grad_norm": 0.518927812576294, + "grad_norm_var": 1.662760865375757, + "learning_rate": 5e-05, + "loss": 0.2271, + "loss/crossentropy": 2.8139342665672302, + "loss/hidden": 0.0, + "loss/logits": 0.2271469607949257, + "loss/reg": 0.2514152526855469, + "step": 3848 + }, + { + "epoch": 0.03849, + "grad_norm": 0.5566403269767761, + "grad_norm_var": 1.658283683103866, + "learning_rate": 5e-05, + "loss": 0.2438, + "loss/crossentropy": 2.80019348859787, + "loss/hidden": 0.0, + "loss/logits": 0.2437540851533413, + "loss/reg": 0.2512364387512207, + "step": 3849 + }, + { + "epoch": 0.0385, + "grad_norm": 0.4780506193637848, + "grad_norm_var": 1.6572844211290483, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.837947905063629, + "loss/hidden": 0.0, + "loss/logits": 0.20230404660105705, + "loss/reg": 0.25088760256767273, + "step": 3850 + }, + { + "epoch": 0.03851, + "grad_norm": 0.47090739011764526, + "grad_norm_var": 0.008990212651813852, + "learning_rate": 5e-05, + "loss": 0.1986, + "loss/crossentropy": 2.7609986066818237, + "loss/hidden": 0.0, + "loss/logits": 0.19861363619565964, + "loss/reg": 0.250800758600235, + "step": 3851 + }, + { + "epoch": 0.03852, + "grad_norm": 0.48719602823257446, + "grad_norm_var": 0.008995733135470193, + "learning_rate": 5e-05, + "loss": 0.2254, + "loss/crossentropy": 2.5655117630958557, + "loss/hidden": 0.0, + "loss/logits": 0.2254483923316002, + "loss/reg": 0.2506735622882843, + "step": 3852 + }, + { + "epoch": 0.03853, + "grad_norm": 0.42110538482666016, + "grad_norm_var": 0.009467060998775253, + "learning_rate": 5e-05, + "loss": 0.2055, + "loss/crossentropy": 2.8442137241363525, + "loss/hidden": 0.0, + "loss/logits": 0.2055313028395176, + "loss/reg": 0.2506205439567566, + "step": 3853 + }, + { + "epoch": 0.03854, + "grad_norm": 0.43769338726997375, + "grad_norm_var": 0.009695682872341689, + "learning_rate": 5e-05, + "loss": 0.2132, + "loss/crossentropy": 2.847848355770111, + "loss/hidden": 0.0, + "loss/logits": 0.2132219411432743, + "loss/reg": 0.25040343403816223, + "step": 3854 + }, + { + "epoch": 0.03855, + "grad_norm": 0.4390872120857239, + "grad_norm_var": 0.009919009135965535, + "learning_rate": 5e-05, + "loss": 0.2117, + "loss/crossentropy": 2.8070595264434814, + "loss/hidden": 0.0, + "loss/logits": 0.2116520032286644, + "loss/reg": 0.25026118755340576, + "step": 3855 + }, + { + "epoch": 0.03856, + "grad_norm": 0.735524594783783, + "grad_norm_var": 0.013510125175995924, + "learning_rate": 5e-05, + "loss": 0.2623, + "loss/crossentropy": 2.9184842109680176, + "loss/hidden": 0.0, + "loss/logits": 0.26232312247157097, + "loss/reg": 0.25005921721458435, + "step": 3856 + }, + { + "epoch": 0.03857, + "grad_norm": 0.46430420875549316, + "grad_norm_var": 0.005479604515029838, + "learning_rate": 5e-05, + "loss": 0.2127, + "loss/crossentropy": 2.719912111759186, + "loss/hidden": 0.0, + "loss/logits": 0.21269413456320763, + "loss/reg": 0.25008636713027954, + "step": 3857 + }, + { + "epoch": 0.03858, + "grad_norm": 0.444013386964798, + "grad_norm_var": 0.005498469417542854, + "learning_rate": 5e-05, + "loss": 0.208, + "loss/crossentropy": 2.8017241954803467, + "loss/hidden": 0.0, + "loss/logits": 0.20804350078105927, + "loss/reg": 0.2502193748950958, + "step": 3858 + }, + { + "epoch": 0.03859, + "grad_norm": 0.46745169162750244, + "grad_norm_var": 0.00549910629946941, + "learning_rate": 5e-05, + "loss": 0.1866, + "loss/crossentropy": 2.7241448760032654, + "loss/hidden": 0.0, + "loss/logits": 0.1866181679069996, + "loss/reg": 0.2501846253871918, + "step": 3859 + }, + { + "epoch": 0.0386, + "grad_norm": 0.5296929478645325, + "grad_norm_var": 0.005602758023883088, + "learning_rate": 5e-05, + "loss": 0.233, + "loss/crossentropy": 2.8863091468811035, + "loss/hidden": 0.0, + "loss/logits": 0.23304373025894165, + "loss/reg": 0.2503035068511963, + "step": 3860 + }, + { + "epoch": 0.03861, + "grad_norm": 0.460347980260849, + "grad_norm_var": 0.005636066093887134, + "learning_rate": 5e-05, + "loss": 0.2099, + "loss/crossentropy": 2.7646356225013733, + "loss/hidden": 0.0, + "loss/logits": 0.20988069474697113, + "loss/reg": 0.25001388788223267, + "step": 3861 + }, + { + "epoch": 0.03862, + "grad_norm": 0.44294047355651855, + "grad_norm_var": 0.005667408876353497, + "learning_rate": 5e-05, + "loss": 0.2044, + "loss/crossentropy": 2.926476001739502, + "loss/hidden": 0.0, + "loss/logits": 0.204400934278965, + "loss/reg": 0.24992187321186066, + "step": 3862 + }, + { + "epoch": 0.03863, + "grad_norm": 0.4585827887058258, + "grad_norm_var": 0.005669487791232941, + "learning_rate": 5e-05, + "loss": 0.2059, + "loss/crossentropy": 2.8097155690193176, + "loss/hidden": 0.0, + "loss/logits": 0.20586463809013367, + "loss/reg": 0.24986816942691803, + "step": 3863 + }, + { + "epoch": 0.03864, + "grad_norm": 0.45602357387542725, + "grad_norm_var": 0.005659739218899296, + "learning_rate": 5e-05, + "loss": 0.2044, + "loss/crossentropy": 2.8682404160499573, + "loss/hidden": 0.0, + "loss/logits": 0.20435643941164017, + "loss/reg": 0.24990713596343994, + "step": 3864 + }, + { + "epoch": 0.03865, + "grad_norm": 0.8101062178611755, + "grad_norm_var": 0.012118213642139664, + "learning_rate": 5e-05, + "loss": 0.2825, + "loss/crossentropy": 2.936920166015625, + "loss/hidden": 0.0, + "loss/logits": 0.2825143374502659, + "loss/reg": 0.24998560547828674, + "step": 3865 + }, + { + "epoch": 0.03866, + "grad_norm": 0.5126198530197144, + "grad_norm_var": 0.012090861103620875, + "learning_rate": 5e-05, + "loss": 0.2311, + "loss/crossentropy": 2.6884096264839172, + "loss/hidden": 0.0, + "loss/logits": 0.23106591403484344, + "loss/reg": 0.2502279579639435, + "step": 3866 + }, + { + "epoch": 0.03867, + "grad_norm": 0.4451051950454712, + "grad_norm_var": 0.012240641844237565, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.8367812037467957, + "loss/hidden": 0.0, + "loss/logits": 0.20807603001594543, + "loss/reg": 0.250394344329834, + "step": 3867 + }, + { + "epoch": 0.03868, + "grad_norm": 1.8023252487182617, + "grad_norm_var": 0.11796399693246315, + "learning_rate": 5e-05, + "loss": 0.225, + "loss/crossentropy": 3.0031420588493347, + "loss/hidden": 0.0, + "loss/logits": 0.22499702125787735, + "loss/reg": 0.25042060017585754, + "step": 3868 + }, + { + "epoch": 0.03869, + "grad_norm": 0.5388759970664978, + "grad_norm_var": 0.11628973222599867, + "learning_rate": 5e-05, + "loss": 0.2259, + "loss/crossentropy": 2.8657209277153015, + "loss/hidden": 0.0, + "loss/logits": 0.22592902928590775, + "loss/reg": 0.25062963366508484, + "step": 3869 + }, + { + "epoch": 0.0387, + "grad_norm": 0.5202139616012573, + "grad_norm_var": 0.11503631604530465, + "learning_rate": 5e-05, + "loss": 0.2175, + "loss/crossentropy": 2.769507944583893, + "loss/hidden": 0.0, + "loss/logits": 0.217498030513525, + "loss/reg": 0.25056856870651245, + "step": 3870 + }, + { + "epoch": 0.03871, + "grad_norm": 0.5543947219848633, + "grad_norm_var": 0.11346331634928948, + "learning_rate": 5e-05, + "loss": 0.226, + "loss/crossentropy": 2.7507259249687195, + "loss/hidden": 0.0, + "loss/logits": 0.22602715715765953, + "loss/reg": 0.2504148483276367, + "step": 3871 + }, + { + "epoch": 0.03872, + "grad_norm": 0.5142320990562439, + "grad_norm_var": 0.11260363749727861, + "learning_rate": 5e-05, + "loss": 0.2157, + "loss/crossentropy": 2.79412841796875, + "loss/hidden": 0.0, + "loss/logits": 0.21565331891179085, + "loss/reg": 0.2504916787147522, + "step": 3872 + }, + { + "epoch": 0.03873, + "grad_norm": 0.5330715775489807, + "grad_norm_var": 0.11175745058882222, + "learning_rate": 5e-05, + "loss": 0.2289, + "loss/crossentropy": 2.9365938901901245, + "loss/hidden": 0.0, + "loss/logits": 0.22891785204410553, + "loss/reg": 0.2504209280014038, + "step": 3873 + }, + { + "epoch": 0.03874, + "grad_norm": 0.46284762024879456, + "grad_norm_var": 0.1114051677482566, + "learning_rate": 5e-05, + "loss": 0.2023, + "loss/crossentropy": 2.727176785469055, + "loss/hidden": 0.0, + "loss/logits": 0.20225486159324646, + "loss/reg": 0.2505394220352173, + "step": 3874 + }, + { + "epoch": 0.03875, + "grad_norm": 0.4411965608596802, + "grad_norm_var": 0.11189231383801689, + "learning_rate": 5e-05, + "loss": 0.2088, + "loss/crossentropy": 2.898755669593811, + "loss/hidden": 0.0, + "loss/logits": 0.20877858996391296, + "loss/reg": 0.2505587339401245, + "step": 3875 + }, + { + "epoch": 0.03876, + "grad_norm": 0.48787495493888855, + "grad_norm_var": 0.11235270366392616, + "learning_rate": 5e-05, + "loss": 0.2138, + "loss/crossentropy": 2.9193203449249268, + "loss/hidden": 0.0, + "loss/logits": 0.21376967430114746, + "loss/reg": 0.2505887746810913, + "step": 3876 + }, + { + "epoch": 0.03877, + "grad_norm": 0.5106173753738403, + "grad_norm_var": 0.11164132022457005, + "learning_rate": 5e-05, + "loss": 0.2233, + "loss/crossentropy": 2.8417840003967285, + "loss/hidden": 0.0, + "loss/logits": 0.2232597954571247, + "loss/reg": 0.25058606266975403, + "step": 3877 + }, + { + "epoch": 0.03878, + "grad_norm": 0.5038735866546631, + "grad_norm_var": 0.11065268945507045, + "learning_rate": 5e-05, + "loss": 0.2139, + "loss/crossentropy": 2.6507399678230286, + "loss/hidden": 0.0, + "loss/logits": 0.21393678337335587, + "loss/reg": 0.25066274404525757, + "step": 3878 + }, + { + "epoch": 0.03879, + "grad_norm": 0.4730871021747589, + "grad_norm_var": 0.11039815635682058, + "learning_rate": 5e-05, + "loss": 0.215, + "loss/crossentropy": 2.8162384033203125, + "loss/hidden": 0.0, + "loss/logits": 0.21500388160347939, + "loss/reg": 0.25080233812332153, + "step": 3879 + }, + { + "epoch": 0.0388, + "grad_norm": 0.5061549544334412, + "grad_norm_var": 0.10960687299169279, + "learning_rate": 5e-05, + "loss": 0.2425, + "loss/crossentropy": 2.912952184677124, + "loss/hidden": 0.0, + "loss/logits": 0.2425498440861702, + "loss/reg": 0.250506192445755, + "step": 3880 + }, + { + "epoch": 0.03881, + "grad_norm": 0.5022459030151367, + "grad_norm_var": 0.10694862719710173, + "learning_rate": 5e-05, + "loss": 0.2204, + "loss/crossentropy": 2.801741600036621, + "loss/hidden": 0.0, + "loss/logits": 0.22036828845739365, + "loss/reg": 0.2505602240562439, + "step": 3881 + }, + { + "epoch": 0.03882, + "grad_norm": 0.4719526469707489, + "grad_norm_var": 0.10742708470768463, + "learning_rate": 5e-05, + "loss": 0.2232, + "loss/crossentropy": 2.8985652327537537, + "loss/hidden": 0.0, + "loss/logits": 0.2231869474053383, + "loss/reg": 0.25035297870635986, + "step": 3882 + }, + { + "epoch": 0.03883, + "grad_norm": 0.4678591787815094, + "grad_norm_var": 0.1070524533528865, + "learning_rate": 5e-05, + "loss": 0.2116, + "loss/crossentropy": 2.821328580379486, + "loss/hidden": 0.0, + "loss/logits": 0.21162472292780876, + "loss/reg": 0.25027382373809814, + "step": 3883 + }, + { + "epoch": 0.03884, + "grad_norm": 0.4119280278682709, + "grad_norm_var": 0.0014007877449115469, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.737049698829651, + "loss/hidden": 0.0, + "loss/logits": 0.19353076070547104, + "loss/reg": 0.25012946128845215, + "step": 3884 + }, + { + "epoch": 0.03885, + "grad_norm": 0.4446844458580017, + "grad_norm_var": 0.0013888936672044494, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.7935622334480286, + "loss/hidden": 0.0, + "loss/logits": 0.21255939081311226, + "loss/reg": 0.2498779445886612, + "step": 3885 + }, + { + "epoch": 0.03886, + "grad_norm": 0.4404624402523041, + "grad_norm_var": 0.0014426912135676558, + "learning_rate": 5e-05, + "loss": 0.2239, + "loss/crossentropy": 2.8056779503822327, + "loss/hidden": 0.0, + "loss/logits": 0.22392763569951057, + "loss/reg": 0.2500760853290558, + "step": 3886 + }, + { + "epoch": 0.03887, + "grad_norm": 0.4477121829986572, + "grad_norm_var": 0.0011371228056460366, + "learning_rate": 5e-05, + "loss": 0.2162, + "loss/crossentropy": 2.8016260266304016, + "loss/hidden": 0.0, + "loss/logits": 0.21615919098258018, + "loss/reg": 0.25001260638237, + "step": 3887 + }, + { + "epoch": 0.03888, + "grad_norm": 0.46485382318496704, + "grad_norm_var": 0.001039363753164931, + "learning_rate": 5e-05, + "loss": 0.2195, + "loss/crossentropy": 2.927013576030731, + "loss/hidden": 0.0, + "loss/logits": 0.21947459131479263, + "loss/reg": 0.25012537837028503, + "step": 3888 + }, + { + "epoch": 0.03889, + "grad_norm": 0.4816358685493469, + "grad_norm_var": 0.0007937775130269942, + "learning_rate": 5e-05, + "loss": 0.2194, + "loss/crossentropy": 2.880588471889496, + "loss/hidden": 0.0, + "loss/logits": 0.2194320112466812, + "loss/reg": 0.2503191828727722, + "step": 3889 + }, + { + "epoch": 0.0389, + "grad_norm": 0.49554702639579773, + "grad_norm_var": 0.0008296980281087903, + "learning_rate": 5e-05, + "loss": 0.2172, + "loss/crossentropy": 2.7949899435043335, + "loss/hidden": 0.0, + "loss/logits": 0.21716166660189629, + "loss/reg": 0.24994048476219177, + "step": 3890 + }, + { + "epoch": 0.03891, + "grad_norm": 0.4577173590660095, + "grad_norm_var": 0.0007789468081390323, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.953920900821686, + "loss/hidden": 0.0, + "loss/logits": 0.20485319197177887, + "loss/reg": 0.2497967779636383, + "step": 3891 + }, + { + "epoch": 0.03892, + "grad_norm": 0.4301223158836365, + "grad_norm_var": 0.0008729644422377299, + "learning_rate": 5e-05, + "loss": 0.2062, + "loss/crossentropy": 2.8626121282577515, + "loss/hidden": 0.0, + "loss/logits": 0.20618019625544548, + "loss/reg": 0.2496677190065384, + "step": 3892 + }, + { + "epoch": 0.03893, + "grad_norm": 0.4614681601524353, + "grad_norm_var": 0.0007538576043667551, + "learning_rate": 5e-05, + "loss": 0.2104, + "loss/crossentropy": 2.75000137090683, + "loss/hidden": 0.0, + "loss/logits": 0.21035070717334747, + "loss/reg": 0.2497120499610901, + "step": 3893 + }, + { + "epoch": 0.03894, + "grad_norm": 0.5140235424041748, + "grad_norm_var": 0.0008111031037818961, + "learning_rate": 5e-05, + "loss": 0.2185, + "loss/crossentropy": 2.7834548354148865, + "loss/hidden": 0.0, + "loss/logits": 0.2184583581984043, + "loss/reg": 0.24955534934997559, + "step": 3894 + }, + { + "epoch": 0.03895, + "grad_norm": 0.42230507731437683, + "grad_norm_var": 0.0009308329542997355, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.830475687980652, + "loss/hidden": 0.0, + "loss/logits": 0.19485175982117653, + "loss/reg": 0.2495090812444687, + "step": 3895 + }, + { + "epoch": 0.03896, + "grad_norm": 0.4876771867275238, + "grad_norm_var": 0.0008478026339282925, + "learning_rate": 5e-05, + "loss": 0.2268, + "loss/crossentropy": 2.699580490589142, + "loss/hidden": 0.0, + "loss/logits": 0.22678394988179207, + "loss/reg": 0.24952757358551025, + "step": 3896 + }, + { + "epoch": 0.03897, + "grad_norm": 0.4797470271587372, + "grad_norm_var": 0.0007606199223575861, + "learning_rate": 5e-05, + "loss": 0.2226, + "loss/crossentropy": 2.681858479976654, + "loss/hidden": 0.0, + "loss/logits": 0.222591832280159, + "loss/reg": 0.24926866590976715, + "step": 3897 + }, + { + "epoch": 0.03898, + "grad_norm": 0.4446737766265869, + "grad_norm_var": 0.0007681319516706809, + "learning_rate": 5e-05, + "loss": 0.2051, + "loss/crossentropy": 2.7850680351257324, + "loss/hidden": 0.0, + "loss/logits": 0.20510638877749443, + "loss/reg": 0.24889107048511505, + "step": 3898 + }, + { + "epoch": 0.03899, + "grad_norm": 0.434939444065094, + "grad_norm_var": 0.0007992873502123743, + "learning_rate": 5e-05, + "loss": 0.2111, + "loss/crossentropy": 2.7808972597122192, + "loss/hidden": 0.0, + "loss/logits": 0.2110598012804985, + "loss/reg": 0.2487393617630005, + "step": 3899 + }, + { + "epoch": 0.039, + "grad_norm": 0.42273029685020447, + "grad_norm_var": 0.0007409882029511649, + "learning_rate": 5e-05, + "loss": 0.2126, + "loss/crossentropy": 2.74051696062088, + "loss/hidden": 0.0, + "loss/logits": 0.21262278780341148, + "loss/reg": 0.2483047991991043, + "step": 3900 + }, + { + "epoch": 0.03901, + "grad_norm": 0.45365315675735474, + "grad_norm_var": 0.0007299205515149509, + "learning_rate": 5e-05, + "loss": 0.2201, + "loss/crossentropy": 2.801327347755432, + "loss/hidden": 0.0, + "loss/logits": 0.22008633613586426, + "loss/reg": 0.24835486710071564, + "step": 3901 + }, + { + "epoch": 0.03902, + "grad_norm": 0.7717142105102539, + "grad_norm_var": 0.006782217882282367, + "learning_rate": 5e-05, + "loss": 0.2679, + "loss/crossentropy": 2.633960485458374, + "loss/hidden": 0.0, + "loss/logits": 0.26793915405869484, + "loss/reg": 0.24796071648597717, + "step": 3902 + }, + { + "epoch": 0.03903, + "grad_norm": 0.48662111163139343, + "grad_norm_var": 0.006712406003367318, + "learning_rate": 5e-05, + "loss": 0.2182, + "loss/crossentropy": 2.8236266374588013, + "loss/hidden": 0.0, + "loss/logits": 0.21817582845687866, + "loss/reg": 0.24775618314743042, + "step": 3903 + }, + { + "epoch": 0.03904, + "grad_norm": 0.484358549118042, + "grad_norm_var": 0.00669201011173682, + "learning_rate": 5e-05, + "loss": 0.2216, + "loss/crossentropy": 2.9452527165412903, + "loss/hidden": 0.0, + "loss/logits": 0.22162308543920517, + "loss/reg": 0.24779576063156128, + "step": 3904 + }, + { + "epoch": 0.03905, + "grad_norm": 0.5247238874435425, + "grad_norm_var": 0.006799873760310563, + "learning_rate": 5e-05, + "loss": 0.2414, + "loss/crossentropy": 2.943159818649292, + "loss/hidden": 0.0, + "loss/logits": 0.2414020337164402, + "loss/reg": 0.247343972325325, + "step": 3905 + }, + { + "epoch": 0.03906, + "grad_norm": 0.5147313475608826, + "grad_norm_var": 0.006847932515643888, + "learning_rate": 5e-05, + "loss": 0.2331, + "loss/crossentropy": 2.925995171070099, + "loss/hidden": 0.0, + "loss/logits": 0.23305874690413475, + "loss/reg": 0.24713972210884094, + "step": 3906 + }, + { + "epoch": 0.03907, + "grad_norm": 0.4734063446521759, + "grad_norm_var": 0.006802164958283645, + "learning_rate": 5e-05, + "loss": 0.2249, + "loss/crossentropy": 2.8619205951690674, + "loss/hidden": 0.0, + "loss/logits": 0.2248731255531311, + "loss/reg": 0.24667775630950928, + "step": 3907 + }, + { + "epoch": 0.03908, + "grad_norm": 0.46132388710975647, + "grad_norm_var": 0.006622514996505209, + "learning_rate": 5e-05, + "loss": 0.2114, + "loss/crossentropy": 2.8602414727211, + "loss/hidden": 0.0, + "loss/logits": 0.21138635650277138, + "loss/reg": 0.24634701013565063, + "step": 3908 + }, + { + "epoch": 0.03909, + "grad_norm": 0.4875330328941345, + "grad_norm_var": 0.006566232270102553, + "learning_rate": 5e-05, + "loss": 0.2366, + "loss/crossentropy": 2.789766550064087, + "loss/hidden": 0.0, + "loss/logits": 0.23664450645446777, + "loss/reg": 0.2460123747587204, + "step": 3909 + }, + { + "epoch": 0.0391, + "grad_norm": 0.4639303982257843, + "grad_norm_var": 0.006572695864343325, + "learning_rate": 5e-05, + "loss": 0.2135, + "loss/crossentropy": 2.8614270091056824, + "loss/hidden": 0.0, + "loss/logits": 0.21350545436143875, + "loss/reg": 0.24574372172355652, + "step": 3910 + }, + { + "epoch": 0.03911, + "grad_norm": 0.5007550716400146, + "grad_norm_var": 0.006266209688373314, + "learning_rate": 5e-05, + "loss": 0.2125, + "loss/crossentropy": 2.8823363184928894, + "loss/hidden": 0.0, + "loss/logits": 0.21248095482587814, + "loss/reg": 0.2455577254295349, + "step": 3911 + }, + { + "epoch": 0.03912, + "grad_norm": 0.5608285069465637, + "grad_norm_var": 0.006545983715813822, + "learning_rate": 5e-05, + "loss": 0.2544, + "loss/crossentropy": 2.9610557556152344, + "loss/hidden": 0.0, + "loss/logits": 0.2544005699455738, + "loss/reg": 0.24526464939117432, + "step": 3912 + }, + { + "epoch": 0.03913, + "grad_norm": 0.5287060737609863, + "grad_norm_var": 0.00657759299609992, + "learning_rate": 5e-05, + "loss": 0.2238, + "loss/crossentropy": 2.876520812511444, + "loss/hidden": 0.0, + "loss/logits": 0.22384852916002274, + "loss/reg": 0.24493934214115143, + "step": 3913 + }, + { + "epoch": 0.03914, + "grad_norm": 0.47683656215667725, + "grad_norm_var": 0.006401065462955287, + "learning_rate": 5e-05, + "loss": 0.2268, + "loss/crossentropy": 2.821972370147705, + "loss/hidden": 0.0, + "loss/logits": 0.22680183872580528, + "loss/reg": 0.24470391869544983, + "step": 3914 + }, + { + "epoch": 0.03915, + "grad_norm": 0.42487967014312744, + "grad_norm_var": 0.006498578966086799, + "learning_rate": 5e-05, + "loss": 0.205, + "loss/crossentropy": 2.9192967414855957, + "loss/hidden": 0.0, + "loss/logits": 0.20497868210077286, + "loss/reg": 0.24456655979156494, + "step": 3915 + }, + { + "epoch": 0.03916, + "grad_norm": 0.46598610281944275, + "grad_norm_var": 0.006156631433810278, + "learning_rate": 5e-05, + "loss": 0.2028, + "loss/crossentropy": 2.7125489115715027, + "loss/hidden": 0.0, + "loss/logits": 0.20277633890509605, + "loss/reg": 0.24429091811180115, + "step": 3916 + }, + { + "epoch": 0.03917, + "grad_norm": 0.5518943071365356, + "grad_norm_var": 0.006087265995786591, + "learning_rate": 5e-05, + "loss": 0.2412, + "loss/crossentropy": 2.7468611001968384, + "loss/hidden": 0.0, + "loss/logits": 0.24123656004667282, + "loss/reg": 0.24411815404891968, + "step": 3917 + }, + { + "epoch": 0.03918, + "grad_norm": 0.44201910495758057, + "grad_norm_var": 0.0014262426178396385, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.815846562385559, + "loss/hidden": 0.0, + "loss/logits": 0.19996864721179008, + "loss/reg": 0.24395155906677246, + "step": 3918 + }, + { + "epoch": 0.03919, + "grad_norm": 0.4329304099082947, + "grad_norm_var": 0.0016344177702672714, + "learning_rate": 5e-05, + "loss": 0.2077, + "loss/crossentropy": 2.754484534263611, + "loss/hidden": 0.0, + "loss/logits": 0.20765281841158867, + "loss/reg": 0.2436724752187729, + "step": 3919 + }, + { + "epoch": 0.0392, + "grad_norm": 0.49822279810905457, + "grad_norm_var": 0.0016412199632784507, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 2.9608566761016846, + "loss/hidden": 0.0, + "loss/logits": 0.21081344783306122, + "loss/reg": 0.24339643120765686, + "step": 3920 + }, + { + "epoch": 0.03921, + "grad_norm": 0.5080316066741943, + "grad_norm_var": 0.0015769988350875174, + "learning_rate": 5e-05, + "loss": 0.2411, + "loss/crossentropy": 2.7641957998275757, + "loss/hidden": 0.0, + "loss/logits": 0.24108855798840523, + "loss/reg": 0.24273645877838135, + "step": 3921 + }, + { + "epoch": 0.03922, + "grad_norm": 0.545331597328186, + "grad_norm_var": 0.0017486632296886406, + "learning_rate": 5e-05, + "loss": 0.2517, + "loss/crossentropy": 2.9042473435401917, + "loss/hidden": 0.0, + "loss/logits": 0.2517416961491108, + "loss/reg": 0.24216045439243317, + "step": 3922 + }, + { + "epoch": 0.03923, + "grad_norm": 0.4350402355194092, + "grad_norm_var": 0.0019199870192110632, + "learning_rate": 5e-05, + "loss": 0.2111, + "loss/crossentropy": 3.0123788714408875, + "loss/hidden": 0.0, + "loss/logits": 0.21111518889665604, + "loss/reg": 0.24211488664150238, + "step": 3923 + }, + { + "epoch": 0.03924, + "grad_norm": 0.585286021232605, + "grad_norm_var": 0.002464024631203184, + "learning_rate": 5e-05, + "loss": 0.2319, + "loss/crossentropy": 2.6299514174461365, + "loss/hidden": 0.0, + "loss/logits": 0.2318543642759323, + "loss/reg": 0.2417222559452057, + "step": 3924 + }, + { + "epoch": 0.03925, + "grad_norm": 0.47289931774139404, + "grad_norm_var": 0.0024905404153666155, + "learning_rate": 5e-05, + "loss": 0.2342, + "loss/crossentropy": 2.7037989497184753, + "loss/hidden": 0.0, + "loss/logits": 0.23416489362716675, + "loss/reg": 0.24122025072574615, + "step": 3925 + }, + { + "epoch": 0.03926, + "grad_norm": 0.4458968937397003, + "grad_norm_var": 0.002581601001471863, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.7740334272384644, + "loss/hidden": 0.0, + "loss/logits": 0.21551137045025826, + "loss/reg": 0.24097836017608643, + "step": 3926 + }, + { + "epoch": 0.03927, + "grad_norm": 0.4783949851989746, + "grad_norm_var": 0.002587407875277666, + "learning_rate": 5e-05, + "loss": 0.2243, + "loss/crossentropy": 2.87155818939209, + "loss/hidden": 0.0, + "loss/logits": 0.2243029661476612, + "loss/reg": 0.24070221185684204, + "step": 3927 + }, + { + "epoch": 0.03928, + "grad_norm": 0.5064007043838501, + "grad_norm_var": 0.002264531537710938, + "learning_rate": 5e-05, + "loss": 0.2202, + "loss/crossentropy": 2.9479112029075623, + "loss/hidden": 0.0, + "loss/logits": 0.22016747295856476, + "loss/reg": 0.2407466620206833, + "step": 3928 + }, + { + "epoch": 0.03929, + "grad_norm": 0.4224546551704407, + "grad_norm_var": 0.0023852551521838, + "learning_rate": 5e-05, + "loss": 0.2009, + "loss/crossentropy": 2.782082676887512, + "loss/hidden": 0.0, + "loss/logits": 0.20087717846035957, + "loss/reg": 0.24073515832424164, + "step": 3929 + }, + { + "epoch": 0.0393, + "grad_norm": 0.4628050923347473, + "grad_norm_var": 0.0024049408367893305, + "learning_rate": 5e-05, + "loss": 0.225, + "loss/crossentropy": 2.798001289367676, + "loss/hidden": 0.0, + "loss/logits": 0.22498973831534386, + "loss/reg": 0.2404928207397461, + "step": 3930 + }, + { + "epoch": 0.03931, + "grad_norm": 0.44205453991889954, + "grad_norm_var": 0.0022973707262852936, + "learning_rate": 5e-05, + "loss": 0.2101, + "loss/crossentropy": 2.885380208492279, + "loss/hidden": 0.0, + "loss/logits": 0.2101321667432785, + "loss/reg": 0.2404123842716217, + "step": 3931 + }, + { + "epoch": 0.03932, + "grad_norm": 0.4698442220687866, + "grad_norm_var": 0.0022905889617989045, + "learning_rate": 5e-05, + "loss": 0.2341, + "loss/crossentropy": 2.985645890235901, + "loss/hidden": 0.0, + "loss/logits": 0.23405024036765099, + "loss/reg": 0.24023555219173431, + "step": 3932 + }, + { + "epoch": 0.03933, + "grad_norm": 0.5205316543579102, + "grad_norm_var": 0.002056523596578356, + "learning_rate": 5e-05, + "loss": 0.227, + "loss/crossentropy": 2.734183371067047, + "loss/hidden": 0.0, + "loss/logits": 0.22699563205242157, + "loss/reg": 0.24000902473926544, + "step": 3933 + }, + { + "epoch": 0.03934, + "grad_norm": 0.44747623801231384, + "grad_norm_var": 0.0020312884644476596, + "learning_rate": 5e-05, + "loss": 0.2079, + "loss/crossentropy": 2.7227450013160706, + "loss/hidden": 0.0, + "loss/logits": 0.20787812769412994, + "loss/reg": 0.2398722767829895, + "step": 3934 + }, + { + "epoch": 0.03935, + "grad_norm": 0.4331180155277252, + "grad_norm_var": 0.0020301232656892727, + "learning_rate": 5e-05, + "loss": 0.2146, + "loss/crossentropy": 2.8540369272232056, + "loss/hidden": 0.0, + "loss/logits": 0.2146170400083065, + "loss/reg": 0.23989921808242798, + "step": 3935 + }, + { + "epoch": 0.03936, + "grad_norm": 0.46711790561676025, + "grad_norm_var": 0.0020134071079807915, + "learning_rate": 5e-05, + "loss": 0.2015, + "loss/crossentropy": 2.783471941947937, + "loss/hidden": 0.0, + "loss/logits": 0.2014571949839592, + "loss/reg": 0.23989592492580414, + "step": 3936 + }, + { + "epoch": 0.03937, + "grad_norm": 0.5145784616470337, + "grad_norm_var": 0.0020425909925279335, + "learning_rate": 5e-05, + "loss": 0.2172, + "loss/crossentropy": 2.79709929227829, + "loss/hidden": 0.0, + "loss/logits": 0.21715455129742622, + "loss/reg": 0.2399318665266037, + "step": 3937 + }, + { + "epoch": 0.03938, + "grad_norm": 0.5184196829795837, + "grad_norm_var": 0.0018465296992108786, + "learning_rate": 5e-05, + "loss": 0.225, + "loss/crossentropy": 2.7347378730773926, + "loss/hidden": 0.0, + "loss/logits": 0.22500104829669, + "loss/reg": 0.2397473007440567, + "step": 3938 + }, + { + "epoch": 0.03939, + "grad_norm": 0.5781364440917969, + "grad_norm_var": 0.0023372861445783025, + "learning_rate": 5e-05, + "loss": 0.2295, + "loss/crossentropy": 2.979839861392975, + "loss/hidden": 0.0, + "loss/logits": 0.2295360006392002, + "loss/reg": 0.23973165452480316, + "step": 3939 + }, + { + "epoch": 0.0394, + "grad_norm": 0.5130437612533569, + "grad_norm_var": 0.0017007448088140957, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.79296875, + "loss/hidden": 0.0, + "loss/logits": 0.20814047753810883, + "loss/reg": 0.23938225209712982, + "step": 3940 + }, + { + "epoch": 0.03941, + "grad_norm": 0.4615393579006195, + "grad_norm_var": 0.001720812479762505, + "learning_rate": 5e-05, + "loss": 0.216, + "loss/crossentropy": 2.744757831096649, + "loss/hidden": 0.0, + "loss/logits": 0.2159627340734005, + "loss/reg": 0.239255890250206, + "step": 3941 + }, + { + "epoch": 0.03942, + "grad_norm": 0.45822733640670776, + "grad_norm_var": 0.0016740611959548345, + "learning_rate": 5e-05, + "loss": 0.2105, + "loss/crossentropy": 2.7283080220222473, + "loss/hidden": 0.0, + "loss/logits": 0.21052411198616028, + "loss/reg": 0.23890970647335052, + "step": 3942 + }, + { + "epoch": 0.03943, + "grad_norm": 0.4556250274181366, + "grad_norm_var": 0.0017140220880046742, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.885561943054199, + "loss/hidden": 0.0, + "loss/logits": 0.2095492146909237, + "loss/reg": 0.23868787288665771, + "step": 3943 + }, + { + "epoch": 0.03944, + "grad_norm": 0.45943742990493774, + "grad_norm_var": 0.001683177543715634, + "learning_rate": 5e-05, + "loss": 0.2197, + "loss/crossentropy": 2.869614839553833, + "loss/hidden": 0.0, + "loss/logits": 0.21967365965247154, + "loss/reg": 0.2385634481906891, + "step": 3944 + }, + { + "epoch": 0.03945, + "grad_norm": 0.4661356806755066, + "grad_norm_var": 0.0014875128852547528, + "learning_rate": 5e-05, + "loss": 0.2173, + "loss/crossentropy": 2.9110459089279175, + "loss/hidden": 0.0, + "loss/logits": 0.21729177981615067, + "loss/reg": 0.23843981325626373, + "step": 3945 + }, + { + "epoch": 0.03946, + "grad_norm": 0.47789040207862854, + "grad_norm_var": 0.0014686475049210538, + "learning_rate": 5e-05, + "loss": 0.2315, + "loss/crossentropy": 3.0412408113479614, + "loss/hidden": 0.0, + "loss/logits": 0.23153096064925194, + "loss/reg": 0.23823124170303345, + "step": 3946 + }, + { + "epoch": 0.03947, + "grad_norm": 0.4420715570449829, + "grad_norm_var": 0.0014685609762535598, + "learning_rate": 5e-05, + "loss": 0.2089, + "loss/crossentropy": 2.7671589851379395, + "loss/hidden": 0.0, + "loss/logits": 0.2088688164949417, + "loss/reg": 0.2380591630935669, + "step": 3947 + }, + { + "epoch": 0.03948, + "grad_norm": 0.4474121332168579, + "grad_norm_var": 0.0015309831780320083, + "learning_rate": 5e-05, + "loss": 0.2198, + "loss/crossentropy": 2.756869077682495, + "loss/hidden": 0.0, + "loss/logits": 0.2198498174548149, + "loss/reg": 0.2378970831632614, + "step": 3948 + }, + { + "epoch": 0.03949, + "grad_norm": 0.4706645905971527, + "grad_norm_var": 0.0014089159183737691, + "learning_rate": 5e-05, + "loss": 0.2213, + "loss/crossentropy": 2.700082838535309, + "loss/hidden": 0.0, + "loss/logits": 0.22129274532198906, + "loss/reg": 0.23777876794338226, + "step": 3949 + }, + { + "epoch": 0.0395, + "grad_norm": 0.4734157919883728, + "grad_norm_var": 0.0013534209314162641, + "learning_rate": 5e-05, + "loss": 0.2247, + "loss/crossentropy": 2.759484827518463, + "loss/hidden": 0.0, + "loss/logits": 0.22474313899874687, + "loss/reg": 0.23756346106529236, + "step": 3950 + }, + { + "epoch": 0.03951, + "grad_norm": 0.4716768264770508, + "grad_norm_var": 0.0012191867058862253, + "learning_rate": 5e-05, + "loss": 0.2179, + "loss/crossentropy": 2.794048309326172, + "loss/hidden": 0.0, + "loss/logits": 0.21789296343922615, + "loss/reg": 0.23732411861419678, + "step": 3951 + }, + { + "epoch": 0.03952, + "grad_norm": 0.5169031620025635, + "grad_norm_var": 0.0012904972499208967, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.7615034580230713, + "loss/hidden": 0.0, + "loss/logits": 0.21637966111302376, + "loss/reg": 0.23709163069725037, + "step": 3952 + }, + { + "epoch": 0.03953, + "grad_norm": 0.4258163571357727, + "grad_norm_var": 0.0014070996204407363, + "learning_rate": 5e-05, + "loss": 0.1976, + "loss/crossentropy": 3.00202739238739, + "loss/hidden": 0.0, + "loss/logits": 0.1975814625620842, + "loss/reg": 0.23693785071372986, + "step": 3953 + }, + { + "epoch": 0.03954, + "grad_norm": 0.46711137890815735, + "grad_norm_var": 0.0012901649571699185, + "learning_rate": 5e-05, + "loss": 0.2171, + "loss/crossentropy": 2.8913829922676086, + "loss/hidden": 0.0, + "loss/logits": 0.21706626564264297, + "loss/reg": 0.23670707643032074, + "step": 3954 + }, + { + "epoch": 0.03955, + "grad_norm": 0.45789042115211487, + "grad_norm_var": 0.000525369492019682, + "learning_rate": 5e-05, + "loss": 0.2095, + "loss/crossentropy": 2.9351855516433716, + "loss/hidden": 0.0, + "loss/logits": 0.20947088301181793, + "loss/reg": 0.23634666204452515, + "step": 3955 + }, + { + "epoch": 0.03956, + "grad_norm": 0.43468812108039856, + "grad_norm_var": 0.00042339506905080627, + "learning_rate": 5e-05, + "loss": 0.2033, + "loss/crossentropy": 2.836532771587372, + "loss/hidden": 0.0, + "loss/logits": 0.2032812498509884, + "loss/reg": 0.2362404316663742, + "step": 3956 + }, + { + "epoch": 0.03957, + "grad_norm": 0.5410988926887512, + "grad_norm_var": 0.0008177588628602776, + "learning_rate": 5e-05, + "loss": 0.2235, + "loss/crossentropy": 2.629551351070404, + "loss/hidden": 0.0, + "loss/logits": 0.22353583946824074, + "loss/reg": 0.2360714226961136, + "step": 3957 + }, + { + "epoch": 0.03958, + "grad_norm": 0.48771390318870544, + "grad_norm_var": 0.0008390681964139686, + "learning_rate": 5e-05, + "loss": 0.2164, + "loss/crossentropy": 2.7000937461853027, + "loss/hidden": 0.0, + "loss/logits": 0.2163931392133236, + "loss/reg": 0.23623806238174438, + "step": 3958 + }, + { + "epoch": 0.03959, + "grad_norm": 0.5827957987785339, + "grad_norm_var": 0.001632009269884079, + "learning_rate": 5e-05, + "loss": 0.2421, + "loss/crossentropy": 2.8730274438858032, + "loss/hidden": 0.0, + "loss/logits": 0.24210872128605843, + "loss/reg": 0.23590560257434845, + "step": 3959 + }, + { + "epoch": 0.0396, + "grad_norm": 0.4652520418167114, + "grad_norm_var": 0.0016209559843830472, + "learning_rate": 5e-05, + "loss": 0.2067, + "loss/crossentropy": 2.882779359817505, + "loss/hidden": 0.0, + "loss/logits": 0.20669499412178993, + "loss/reg": 0.2358488142490387, + "step": 3960 + }, + { + "epoch": 0.03961, + "grad_norm": 0.45139381289482117, + "grad_norm_var": 0.00165546794701437, + "learning_rate": 5e-05, + "loss": 0.2134, + "loss/crossentropy": 2.7751084566116333, + "loss/hidden": 0.0, + "loss/logits": 0.21342160180211067, + "loss/reg": 0.23568053543567657, + "step": 3961 + }, + { + "epoch": 0.03962, + "grad_norm": 0.4589799642562866, + "grad_norm_var": 0.0016727043448619009, + "learning_rate": 5e-05, + "loss": 0.2356, + "loss/crossentropy": 2.851916193962097, + "loss/hidden": 0.0, + "loss/logits": 0.23561665788292885, + "loss/reg": 0.23575744032859802, + "step": 3962 + }, + { + "epoch": 0.03963, + "grad_norm": 0.4858406186103821, + "grad_norm_var": 0.0016021369900875103, + "learning_rate": 5e-05, + "loss": 0.2151, + "loss/crossentropy": 2.855204224586487, + "loss/hidden": 0.0, + "loss/logits": 0.2150660641491413, + "loss/reg": 0.23563572764396667, + "step": 3963 + }, + { + "epoch": 0.03964, + "grad_norm": 0.46132054924964905, + "grad_norm_var": 0.0015585866607361275, + "learning_rate": 5e-05, + "loss": 0.1918, + "loss/crossentropy": 2.6469059586524963, + "loss/hidden": 0.0, + "loss/logits": 0.19178001582622528, + "loss/reg": 0.23559235036373138, + "step": 3964 + }, + { + "epoch": 0.03965, + "grad_norm": 0.4671718180179596, + "grad_norm_var": 0.0015628980395362313, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.7581081986427307, + "loss/hidden": 0.0, + "loss/logits": 0.2200218141078949, + "loss/reg": 0.23569577932357788, + "step": 3965 + }, + { + "epoch": 0.03966, + "grad_norm": 0.4820942282676697, + "grad_norm_var": 0.0015622233985590742, + "learning_rate": 5e-05, + "loss": 0.2142, + "loss/crossentropy": 2.7764683961868286, + "loss/hidden": 0.0, + "loss/logits": 0.21415767818689346, + "loss/reg": 0.2357397973537445, + "step": 3966 + }, + { + "epoch": 0.03967, + "grad_norm": 0.4275006353855133, + "grad_norm_var": 0.0017250274264436261, + "learning_rate": 5e-05, + "loss": 0.2103, + "loss/crossentropy": 2.7596601247787476, + "loss/hidden": 0.0, + "loss/logits": 0.21026629954576492, + "loss/reg": 0.23553304374217987, + "step": 3967 + }, + { + "epoch": 0.03968, + "grad_norm": 0.4303382933139801, + "grad_norm_var": 0.001719514417664192, + "learning_rate": 5e-05, + "loss": 0.2115, + "loss/crossentropy": 2.90465772151947, + "loss/hidden": 0.0, + "loss/logits": 0.21146779879927635, + "loss/reg": 0.2355106621980667, + "step": 3968 + }, + { + "epoch": 0.03969, + "grad_norm": 0.4820546805858612, + "grad_norm_var": 0.0015825939265573298, + "learning_rate": 5e-05, + "loss": 0.211, + "loss/crossentropy": 2.6966288685798645, + "loss/hidden": 0.0, + "loss/logits": 0.21100781485438347, + "loss/reg": 0.23554740846157074, + "step": 3969 + }, + { + "epoch": 0.0397, + "grad_norm": 0.4867878556251526, + "grad_norm_var": 0.0015888429262801318, + "learning_rate": 5e-05, + "loss": 0.2183, + "loss/crossentropy": 2.864223599433899, + "loss/hidden": 0.0, + "loss/logits": 0.21826937794685364, + "loss/reg": 0.23557902872562408, + "step": 3970 + }, + { + "epoch": 0.03971, + "grad_norm": 0.5144354104995728, + "grad_norm_var": 0.0016583049227650193, + "learning_rate": 5e-05, + "loss": 0.2252, + "loss/crossentropy": 2.6925675868988037, + "loss/hidden": 0.0, + "loss/logits": 0.2252073884010315, + "loss/reg": 0.23559461534023285, + "step": 3971 + }, + { + "epoch": 0.03972, + "grad_norm": 0.6709385514259338, + "grad_norm_var": 0.00375979490885738, + "learning_rate": 5e-05, + "loss": 0.2219, + "loss/crossentropy": 2.847380816936493, + "loss/hidden": 0.0, + "loss/logits": 0.22186212986707687, + "loss/reg": 0.23549318313598633, + "step": 3972 + }, + { + "epoch": 0.03973, + "grad_norm": 0.5426727533340454, + "grad_norm_var": 0.003769941971190719, + "learning_rate": 5e-05, + "loss": 0.22, + "loss/crossentropy": 2.7607813477516174, + "loss/hidden": 0.0, + "loss/logits": 0.21995210275053978, + "loss/reg": 0.2353045493364334, + "step": 3973 + }, + { + "epoch": 0.03974, + "grad_norm": 0.48984676599502563, + "grad_norm_var": 0.0037685578855414163, + "learning_rate": 5e-05, + "loss": 0.2276, + "loss/crossentropy": 2.777942180633545, + "loss/hidden": 0.0, + "loss/logits": 0.2275594174861908, + "loss/reg": 0.23546826839447021, + "step": 3974 + }, + { + "epoch": 0.03975, + "grad_norm": 0.4400253891944885, + "grad_norm_var": 0.003346753500871206, + "learning_rate": 5e-05, + "loss": 0.2058, + "loss/crossentropy": 2.8478062748908997, + "loss/hidden": 0.0, + "loss/logits": 0.20582742616534233, + "loss/reg": 0.23560136556625366, + "step": 3975 + }, + { + "epoch": 0.03976, + "grad_norm": 0.4352748990058899, + "grad_norm_var": 0.003481013446995836, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.829244077205658, + "loss/hidden": 0.0, + "loss/logits": 0.20008230209350586, + "loss/reg": 0.23555028438568115, + "step": 3976 + }, + { + "epoch": 0.03977, + "grad_norm": 0.4628485441207886, + "grad_norm_var": 0.003441068438090063, + "learning_rate": 5e-05, + "loss": 0.2199, + "loss/crossentropy": 2.892146050930023, + "loss/hidden": 0.0, + "loss/logits": 0.21986335888504982, + "loss/reg": 0.23565202951431274, + "step": 3977 + }, + { + "epoch": 0.03978, + "grad_norm": 0.48948347568511963, + "grad_norm_var": 0.0033989544785456758, + "learning_rate": 5e-05, + "loss": 0.2306, + "loss/crossentropy": 2.824368417263031, + "loss/hidden": 0.0, + "loss/logits": 0.23062653467059135, + "loss/reg": 0.23576240241527557, + "step": 3978 + }, + { + "epoch": 0.03979, + "grad_norm": 0.47073084115982056, + "grad_norm_var": 0.003412617230564315, + "learning_rate": 5e-05, + "loss": 0.2125, + "loss/crossentropy": 2.886205017566681, + "loss/hidden": 0.0, + "loss/logits": 0.21254407614469528, + "loss/reg": 0.23562392592430115, + "step": 3979 + }, + { + "epoch": 0.0398, + "grad_norm": 0.41535133123397827, + "grad_norm_var": 0.003687346530435661, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.8395267724990845, + "loss/hidden": 0.0, + "loss/logits": 0.195577971637249, + "loss/reg": 0.23568850755691528, + "step": 3980 + }, + { + "epoch": 0.03981, + "grad_norm": 0.4306774437427521, + "grad_norm_var": 0.003841387517810313, + "learning_rate": 5e-05, + "loss": 0.1907, + "loss/crossentropy": 2.7085479497909546, + "loss/hidden": 0.0, + "loss/logits": 0.19072964787483215, + "loss/reg": 0.23585109412670135, + "step": 3981 + }, + { + "epoch": 0.03982, + "grad_norm": 0.5069337487220764, + "grad_norm_var": 0.0038887363958275117, + "learning_rate": 5e-05, + "loss": 0.2407, + "loss/crossentropy": 2.71796053647995, + "loss/hidden": 0.0, + "loss/logits": 0.24067912995815277, + "loss/reg": 0.23601506650447845, + "step": 3982 + }, + { + "epoch": 0.03983, + "grad_norm": 0.4649655818939209, + "grad_norm_var": 0.0037092470341030135, + "learning_rate": 5e-05, + "loss": 0.2304, + "loss/crossentropy": 2.7549756169319153, + "loss/hidden": 0.0, + "loss/logits": 0.23041418194770813, + "loss/reg": 0.23596158623695374, + "step": 3983 + }, + { + "epoch": 0.03984, + "grad_norm": 0.4213128685951233, + "grad_norm_var": 0.0037781143007795817, + "learning_rate": 5e-05, + "loss": 0.2009, + "loss/crossentropy": 2.860415279865265, + "loss/hidden": 0.0, + "loss/logits": 0.20086908340454102, + "loss/reg": 0.23606030642986298, + "step": 3984 + }, + { + "epoch": 0.03985, + "grad_norm": 0.5503379702568054, + "grad_norm_var": 0.004063003236417281, + "learning_rate": 5e-05, + "loss": 0.2316, + "loss/crossentropy": 2.8638846278190613, + "loss/hidden": 0.0, + "loss/logits": 0.231573648750782, + "loss/reg": 0.23608103394508362, + "step": 3985 + }, + { + "epoch": 0.03986, + "grad_norm": 0.41302528977394104, + "grad_norm_var": 0.004405530151037951, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.8055331110954285, + "loss/hidden": 0.0, + "loss/logits": 0.19816380739212036, + "loss/reg": 0.23607899248600006, + "step": 3986 + }, + { + "epoch": 0.03987, + "grad_norm": 0.45534971356391907, + "grad_norm_var": 0.004371574105041302, + "learning_rate": 5e-05, + "loss": 0.2223, + "loss/crossentropy": 2.9571288228034973, + "loss/hidden": 0.0, + "loss/logits": 0.22229866310954094, + "loss/reg": 0.23597030341625214, + "step": 3987 + }, + { + "epoch": 0.03988, + "grad_norm": 0.4766867756843567, + "grad_norm_var": 0.0017518406925482234, + "learning_rate": 5e-05, + "loss": 0.2175, + "loss/crossentropy": 2.783082365989685, + "loss/hidden": 0.0, + "loss/logits": 0.21751204133033752, + "loss/reg": 0.2358643263578415, + "step": 3988 + }, + { + "epoch": 0.03989, + "grad_norm": 0.4712783396244049, + "grad_norm_var": 0.001346211808832587, + "learning_rate": 5e-05, + "loss": 0.2086, + "loss/crossentropy": 2.926737904548645, + "loss/hidden": 0.0, + "loss/logits": 0.2085995115339756, + "loss/reg": 0.23569098114967346, + "step": 3989 + }, + { + "epoch": 0.0399, + "grad_norm": 0.426708847284317, + "grad_norm_var": 0.0013620568549595627, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.832667112350464, + "loss/hidden": 0.0, + "loss/logits": 0.20904811471700668, + "loss/reg": 0.23570221662521362, + "step": 3990 + }, + { + "epoch": 0.03991, + "grad_norm": 0.4457605183124542, + "grad_norm_var": 0.0013502247404703145, + "learning_rate": 5e-05, + "loss": 0.2071, + "loss/crossentropy": 2.6847055554389954, + "loss/hidden": 0.0, + "loss/logits": 0.20705030485987663, + "loss/reg": 0.2356504499912262, + "step": 3991 + }, + { + "epoch": 0.03992, + "grad_norm": 0.4534139335155487, + "grad_norm_var": 0.0013145082188132447, + "learning_rate": 5e-05, + "loss": 0.1993, + "loss/crossentropy": 2.854257881641388, + "loss/hidden": 0.0, + "loss/logits": 0.19925282895565033, + "loss/reg": 0.2356453686952591, + "step": 3992 + }, + { + "epoch": 0.03993, + "grad_norm": 0.48812609910964966, + "grad_norm_var": 0.0013651250787359737, + "learning_rate": 5e-05, + "loss": 0.2049, + "loss/crossentropy": 2.758602499961853, + "loss/hidden": 0.0, + "loss/logits": 0.20487185567617416, + "loss/reg": 0.23552529513835907, + "step": 3993 + }, + { + "epoch": 0.03994, + "grad_norm": 0.46584558486938477, + "grad_norm_var": 0.001311091095749654, + "learning_rate": 5e-05, + "loss": 0.2134, + "loss/crossentropy": 2.744314968585968, + "loss/hidden": 0.0, + "loss/logits": 0.21343671903014183, + "loss/reg": 0.2356114387512207, + "step": 3994 + }, + { + "epoch": 0.03995, + "grad_norm": 0.4465269446372986, + "grad_norm_var": 0.0013123699999324759, + "learning_rate": 5e-05, + "loss": 0.2276, + "loss/crossentropy": 2.959661900997162, + "loss/hidden": 0.0, + "loss/logits": 0.22757936269044876, + "loss/reg": 0.2354321926832199, + "step": 3995 + }, + { + "epoch": 0.03996, + "grad_norm": 0.44977691769599915, + "grad_norm_var": 0.0011894454703286871, + "learning_rate": 5e-05, + "loss": 0.219, + "loss/crossentropy": 2.833622455596924, + "loss/hidden": 0.0, + "loss/logits": 0.21900470554828644, + "loss/reg": 0.23544976115226746, + "step": 3996 + }, + { + "epoch": 0.03997, + "grad_norm": 0.4313839375972748, + "grad_norm_var": 0.0011866749030247092, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.7510908246040344, + "loss/hidden": 0.0, + "loss/logits": 0.2092135027050972, + "loss/reg": 0.23542697727680206, + "step": 3997 + }, + { + "epoch": 0.03998, + "grad_norm": 0.4205797016620636, + "grad_norm_var": 0.0011176984885951867, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.789881467819214, + "loss/hidden": 0.0, + "loss/logits": 0.1983497552573681, + "loss/reg": 0.23545628786087036, + "step": 3998 + }, + { + "epoch": 0.03999, + "grad_norm": 0.43564102053642273, + "grad_norm_var": 0.0011327429474737342, + "learning_rate": 5e-05, + "loss": 0.2025, + "loss/crossentropy": 2.6554754972457886, + "loss/hidden": 0.0, + "loss/logits": 0.20248664170503616, + "loss/reg": 0.23537719249725342, + "step": 3999 + }, + { + "epoch": 0.04, + "grad_norm": 0.4207809865474701, + "grad_norm_var": 0.001135024445236547, + "learning_rate": 5e-05, + "loss": 0.2087, + "loss/crossentropy": 2.83248108625412, + "loss/hidden": 0.0, + "loss/logits": 0.20865388959646225, + "loss/reg": 0.23541438579559326, + "step": 4000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.060987570061312e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}