diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.01, + "eval_steps": 1000, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1e-05, + "grad_norm": 0.37634041905403137, + "learning_rate": 5e-06, + "loss": 0.169, + "loss/crossentropy": 2.8720462918281555, + "loss/hidden": 0.0, + "loss/logits": 0.16897856071591377, + "loss/reg": 4.4040703773498535, + "step": 1 + }, + { + "epoch": 2e-05, + "grad_norm": 0.35649582743644714, + "learning_rate": 1e-05, + "loss": 0.1696, + "loss/crossentropy": 2.715533673763275, + "loss/hidden": 0.0, + "loss/logits": 0.1695844642817974, + "loss/reg": 4.399058818817139, + "step": 2 + }, + { + "epoch": 3e-05, + "grad_norm": 0.3591013252735138, + "learning_rate": 1.5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.6291310787200928, + "loss/hidden": 0.0, + "loss/logits": 0.1782267540693283, + "loss/reg": 4.394084930419922, + "step": 3 + }, + { + "epoch": 4e-05, + "grad_norm": 0.36401960253715515, + "learning_rate": 2e-05, + "loss": 0.1843, + "loss/crossentropy": 2.7142109274864197, + "loss/hidden": 0.0, + "loss/logits": 0.1843317598104477, + "loss/reg": 4.389005661010742, + "step": 4 + }, + { + "epoch": 5e-05, + "grad_norm": 0.3119131922721863, + "learning_rate": 2.5e-05, + "loss": 0.1625, + "loss/crossentropy": 2.7586326003074646, + "loss/hidden": 0.0, + "loss/logits": 0.1625315584242344, + "loss/reg": 4.3841166496276855, + "step": 5 + }, + { + "epoch": 6e-05, + "grad_norm": 0.3388400673866272, + "learning_rate": 3e-05, + "loss": 0.1844, + "loss/crossentropy": 2.8104345202445984, + "loss/hidden": 0.0, + "loss/logits": 0.1844346523284912, + "loss/reg": 4.3792877197265625, + "step": 6 + }, + { + "epoch": 7e-05, + "grad_norm": 0.4783320426940918, + "learning_rate": 3.5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.8321655988693237, + "loss/hidden": 0.0, + "loss/logits": 0.18431555479764938, + "loss/reg": 4.37478494644165, + "step": 7 + }, + { + "epoch": 8e-05, + "grad_norm": 0.29636114835739136, + "learning_rate": 4e-05, + "loss": 0.1589, + "loss/crossentropy": 2.6809526681900024, + "loss/hidden": 0.0, + "loss/logits": 0.15894119441509247, + "loss/reg": 4.370139122009277, + "step": 8 + }, + { + "epoch": 9e-05, + "grad_norm": 0.30071625113487244, + "learning_rate": 4.5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.6759764552116394, + "loss/hidden": 0.0, + "loss/logits": 0.16574353352189064, + "loss/reg": 4.365106105804443, + "step": 9 + }, + { + "epoch": 0.0001, + "grad_norm": 0.28883349895477295, + "learning_rate": 5e-05, + "loss": 0.1572, + "loss/crossentropy": 2.808637499809265, + "loss/hidden": 0.0, + "loss/logits": 0.15719739720225334, + "loss/reg": 4.360220909118652, + "step": 10 + }, + { + "epoch": 0.00011, + "grad_norm": 0.28243017196655273, + "learning_rate": 5e-05, + "loss": 0.1426, + "loss/crossentropy": 2.72423392534256, + "loss/hidden": 0.0, + "loss/logits": 0.14257685840129852, + "loss/reg": 4.355813503265381, + "step": 11 + }, + { + "epoch": 0.00012, + "grad_norm": 0.31152331829071045, + "learning_rate": 5e-05, + "loss": 0.147, + "loss/crossentropy": 2.710044264793396, + "loss/hidden": 0.0, + "loss/logits": 0.14701137319207191, + "loss/reg": 4.351265907287598, + "step": 12 + }, + { + "epoch": 0.00013, + "grad_norm": 0.2739678919315338, + "learning_rate": 5e-05, + "loss": 0.1499, + "loss/crossentropy": 2.7644649744033813, + "loss/hidden": 0.0, + "loss/logits": 0.149860430508852, + "loss/reg": 4.346287727355957, + "step": 13 + }, + { + "epoch": 0.00014, + "grad_norm": 0.2712353467941284, + "learning_rate": 5e-05, + "loss": 0.1454, + "loss/crossentropy": 2.7370432019233704, + "loss/hidden": 0.0, + "loss/logits": 0.14539287611842155, + "loss/reg": 4.340969085693359, + "step": 14 + }, + { + "epoch": 0.00015, + "grad_norm": 0.2667863667011261, + "learning_rate": 5e-05, + "loss": 0.1403, + "loss/crossentropy": 2.5638718008995056, + "loss/hidden": 0.0, + "loss/logits": 0.14029696956276894, + "loss/reg": 4.336019515991211, + "step": 15 + }, + { + "epoch": 0.00016, + "grad_norm": 0.30467212200164795, + "grad_norm_var": 0.0029449483710212204, + "learning_rate": 5e-05, + "loss": 0.1361, + "loss/crossentropy": 2.797445595264435, + "loss/hidden": 0.0, + "loss/logits": 0.13607431203126907, + "loss/reg": 4.330692291259766, + "step": 16 + }, + { + "epoch": 0.00017, + "grad_norm": 0.2617621421813965, + "grad_norm_var": 0.0029635281595075556, + "learning_rate": 5e-05, + "loss": 0.1443, + "loss/crossentropy": 2.7542406916618347, + "loss/hidden": 0.0, + "loss/logits": 0.14427556470036507, + "loss/reg": 4.325323581695557, + "step": 17 + }, + { + "epoch": 0.00018, + "grad_norm": 0.28648674488067627, + "grad_norm_var": 0.0028982593896559215, + "learning_rate": 5e-05, + "loss": 0.1396, + "loss/crossentropy": 2.674492835998535, + "loss/hidden": 0.0, + "loss/logits": 0.13961521908640862, + "loss/reg": 4.31995153427124, + "step": 18 + }, + { + "epoch": 0.00019, + "grad_norm": 0.26269060373306274, + "grad_norm_var": 0.002877724259904054, + "learning_rate": 5e-05, + "loss": 0.141, + "loss/crossentropy": 2.8323662281036377, + "loss/hidden": 0.0, + "loss/logits": 0.14103225618600845, + "loss/reg": 4.315446853637695, + "step": 19 + }, + { + "epoch": 0.0002, + "grad_norm": 0.2718074321746826, + "grad_norm_var": 0.0026993307095730186, + "learning_rate": 5e-05, + "loss": 0.1314, + "loss/crossentropy": 2.63212913274765, + "loss/hidden": 0.0, + "loss/logits": 0.1313977725803852, + "loss/reg": 4.310704708099365, + "step": 20 + }, + { + "epoch": 0.00021, + "grad_norm": 0.2430431842803955, + "grad_norm_var": 0.0028911751903802204, + "learning_rate": 5e-05, + "loss": 0.1324, + "loss/crossentropy": 2.664808928966522, + "loss/hidden": 0.0, + "loss/logits": 0.1324238833039999, + "loss/reg": 4.305792808532715, + "step": 21 + }, + { + "epoch": 0.00022, + "grad_norm": 0.24898661673069, + "grad_norm_var": 0.00288514612507397, + "learning_rate": 5e-05, + "loss": 0.1242, + "loss/crossentropy": 2.7142711877822876, + "loss/hidden": 0.0, + "loss/logits": 0.12423932552337646, + "loss/reg": 4.300712585449219, + "step": 22 + }, + { + "epoch": 0.00023, + "grad_norm": 0.3123313784599304, + "grad_norm_var": 0.0004523056580034851, + "learning_rate": 5e-05, + "loss": 0.1321, + "loss/crossentropy": 2.7829225063323975, + "loss/hidden": 0.0, + "loss/logits": 0.13212688639760017, + "loss/reg": 4.295501232147217, + "step": 23 + }, + { + "epoch": 0.00024, + "grad_norm": 0.25187963247299194, + "grad_norm_var": 0.00048027979198491945, + "learning_rate": 5e-05, + "loss": 0.1248, + "loss/crossentropy": 2.692659854888916, + "loss/hidden": 0.0, + "loss/logits": 0.12482420355081558, + "loss/reg": 4.2908830642700195, + "step": 24 + }, + { + "epoch": 0.00025, + "grad_norm": 0.2151177078485489, + "grad_norm_var": 0.0006726495064564575, + "learning_rate": 5e-05, + "loss": 0.1232, + "loss/crossentropy": 2.738182246685028, + "loss/hidden": 0.0, + "loss/logits": 0.1231868714094162, + "loss/reg": 4.285846710205078, + "step": 25 + }, + { + "epoch": 0.00026, + "grad_norm": 0.23308518528938293, + "grad_norm_var": 0.0007424884519799501, + "learning_rate": 5e-05, + "loss": 0.1174, + "loss/crossentropy": 2.555102586746216, + "loss/hidden": 0.0, + "loss/logits": 0.11737299524247646, + "loss/reg": 4.281113147735596, + "step": 26 + }, + { + "epoch": 0.00027, + "grad_norm": 0.24523235857486725, + "grad_norm_var": 0.0007604384721796281, + "learning_rate": 5e-05, + "loss": 0.1201, + "loss/crossentropy": 2.6816893815994263, + "loss/hidden": 0.0, + "loss/logits": 0.12014555744826794, + "loss/reg": 4.2765069007873535, + "step": 27 + }, + { + "epoch": 0.00028, + "grad_norm": 0.25897473096847534, + "grad_norm_var": 0.0006160828367585275, + "learning_rate": 5e-05, + "loss": 0.1227, + "loss/crossentropy": 2.7505548000335693, + "loss/hidden": 0.0, + "loss/logits": 0.12271320074796677, + "loss/reg": 4.27158260345459, + "step": 28 + }, + { + "epoch": 0.00029, + "grad_norm": 0.23087331652641296, + "grad_norm_var": 0.0006691547004593392, + "learning_rate": 5e-05, + "loss": 0.1181, + "loss/crossentropy": 2.8483291268348694, + "loss/hidden": 0.0, + "loss/logits": 0.11810225620865822, + "loss/reg": 4.267061233520508, + "step": 29 + }, + { + "epoch": 0.0003, + "grad_norm": 1.2210192680358887, + "grad_norm_var": 0.05843327221954173, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.8535077571868896, + "loss/hidden": 0.0, + "loss/logits": 0.17234252952039242, + "loss/reg": 4.262645244598389, + "step": 30 + }, + { + "epoch": 0.00031, + "grad_norm": 0.2712586224079132, + "grad_norm_var": 0.058402986662709634, + "learning_rate": 5e-05, + "loss": 0.1156, + "loss/crossentropy": 2.6525614261627197, + "loss/hidden": 0.0, + "loss/logits": 0.11560441367328167, + "loss/reg": 4.258092403411865, + "step": 31 + }, + { + "epoch": 0.00032, + "grad_norm": 0.5226843953132629, + "grad_norm_var": 0.06092943089461011, + "learning_rate": 5e-05, + "loss": 0.1537, + "loss/crossentropy": 2.6228127479553223, + "loss/hidden": 0.0, + "loss/logits": 0.15369537472724915, + "loss/reg": 4.253781318664551, + "step": 32 + }, + { + "epoch": 0.00033, + "grad_norm": 0.35246461629867554, + "grad_norm_var": 0.06057510886832484, + "learning_rate": 5e-05, + "loss": 0.1216, + "loss/crossentropy": 2.6986429691314697, + "loss/hidden": 0.0, + "loss/logits": 0.12163393199443817, + "loss/reg": 4.249208450317383, + "step": 33 + }, + { + "epoch": 0.00034, + "grad_norm": 0.2868311405181885, + "grad_norm_var": 0.060572693607631393, + "learning_rate": 5e-05, + "loss": 0.1215, + "loss/crossentropy": 2.7423174381256104, + "loss/hidden": 0.0, + "loss/logits": 0.12151895463466644, + "loss/reg": 4.244677543640137, + "step": 34 + }, + { + "epoch": 0.00035, + "grad_norm": 0.2556142210960388, + "grad_norm_var": 0.06064807497415105, + "learning_rate": 5e-05, + "loss": 0.1137, + "loss/crossentropy": 2.7171207070350647, + "loss/hidden": 0.0, + "loss/logits": 0.1137176975607872, + "loss/reg": 4.2399797439575195, + "step": 35 + }, + { + "epoch": 0.00036, + "grad_norm": 0.2783287763595581, + "grad_norm_var": 0.060592460146055585, + "learning_rate": 5e-05, + "loss": 0.1138, + "loss/crossentropy": 2.7394094467163086, + "loss/hidden": 0.0, + "loss/logits": 0.11381806619465351, + "loss/reg": 4.235424041748047, + "step": 36 + }, + { + "epoch": 0.00037, + "grad_norm": 0.3065175712108612, + "grad_norm_var": 0.06003019540430902, + "learning_rate": 5e-05, + "loss": 0.1235, + "loss/crossentropy": 2.755502223968506, + "loss/hidden": 0.0, + "loss/logits": 0.12348765879869461, + "loss/reg": 4.2310051918029785, + "step": 37 + }, + { + "epoch": 0.00038, + "grad_norm": 0.26492562890052795, + "grad_norm_var": 0.059845851287469956, + "learning_rate": 5e-05, + "loss": 0.1119, + "loss/crossentropy": 2.8106552362442017, + "loss/hidden": 0.0, + "loss/logits": 0.11191634088754654, + "loss/reg": 4.226707935333252, + "step": 38 + }, + { + "epoch": 0.00039, + "grad_norm": 0.24673967063426971, + "grad_norm_var": 0.06039341868271975, + "learning_rate": 5e-05, + "loss": 0.1161, + "loss/crossentropy": 2.7490118741989136, + "loss/hidden": 0.0, + "loss/logits": 0.11609707958996296, + "loss/reg": 4.222842216491699, + "step": 39 + }, + { + "epoch": 0.0004, + "grad_norm": 0.2973298132419586, + "grad_norm_var": 0.05998792869591778, + "learning_rate": 5e-05, + "loss": 0.1124, + "loss/crossentropy": 2.7798808813095093, + "loss/hidden": 0.0, + "loss/logits": 0.11244922317564487, + "loss/reg": 4.218531131744385, + "step": 40 + }, + { + "epoch": 0.00041, + "grad_norm": 0.7517657279968262, + "grad_norm_var": 0.06884148715130983, + "learning_rate": 5e-05, + "loss": 0.1545, + "loss/crossentropy": 2.749855697154999, + "loss/hidden": 0.0, + "loss/logits": 0.15445118583738804, + "loss/reg": 4.214253902435303, + "step": 41 + }, + { + "epoch": 0.00042, + "grad_norm": 0.2417730987071991, + "grad_norm_var": 0.06868010027414732, + "learning_rate": 5e-05, + "loss": 0.1099, + "loss/crossentropy": 2.751042366027832, + "loss/hidden": 0.0, + "loss/logits": 0.1099155992269516, + "loss/reg": 4.2101359367370605, + "step": 42 + }, + { + "epoch": 0.00043, + "grad_norm": 0.2631951570510864, + "grad_norm_var": 0.06838462807177058, + "learning_rate": 5e-05, + "loss": 0.1165, + "loss/crossentropy": 2.7250843048095703, + "loss/hidden": 0.0, + "loss/logits": 0.11648696288466454, + "loss/reg": 4.206397533416748, + "step": 43 + }, + { + "epoch": 0.00044, + "grad_norm": 0.2518296241760254, + "grad_norm_var": 0.06850134865244813, + "learning_rate": 5e-05, + "loss": 0.1111, + "loss/crossentropy": 2.7153283953666687, + "loss/hidden": 0.0, + "loss/logits": 0.11108221486210823, + "loss/reg": 4.201878547668457, + "step": 44 + }, + { + "epoch": 0.00045, + "grad_norm": 0.24082158505916595, + "grad_norm_var": 0.06831278207672915, + "learning_rate": 5e-05, + "loss": 0.1177, + "loss/crossentropy": 2.6632660627365112, + "loss/hidden": 0.0, + "loss/logits": 0.11769118346273899, + "loss/reg": 4.19778299331665, + "step": 45 + }, + { + "epoch": 0.00046, + "grad_norm": 0.260890394449234, + "grad_norm_var": 0.018048092726357542, + "learning_rate": 5e-05, + "loss": 0.1227, + "loss/crossentropy": 2.7315176129341125, + "loss/hidden": 0.0, + "loss/logits": 0.12269957736134529, + "loss/reg": 4.193592071533203, + "step": 46 + }, + { + "epoch": 0.00047, + "grad_norm": 0.25268790125846863, + "grad_norm_var": 0.018186152495949234, + "learning_rate": 5e-05, + "loss": 0.1178, + "loss/crossentropy": 2.774504065513611, + "loss/hidden": 0.0, + "loss/logits": 0.11776839196681976, + "loss/reg": 4.189169406890869, + "step": 47 + }, + { + "epoch": 0.00048, + "grad_norm": 0.2759403884410858, + "grad_norm_var": 0.015229396543742831, + "learning_rate": 5e-05, + "loss": 0.1289, + "loss/crossentropy": 2.8515073657035828, + "loss/hidden": 0.0, + "loss/logits": 0.12885254248976707, + "loss/reg": 4.185054779052734, + "step": 48 + }, + { + "epoch": 0.00049, + "grad_norm": 0.24765782058238983, + "grad_norm_var": 0.015206926335741973, + "learning_rate": 5e-05, + "loss": 0.1256, + "loss/crossentropy": 2.7131593823432922, + "loss/hidden": 0.0, + "loss/logits": 0.1256290916353464, + "loss/reg": 4.1810526847839355, + "step": 49 + }, + { + "epoch": 0.0005, + "grad_norm": 0.3096969425678253, + "grad_norm_var": 0.015214156358291781, + "learning_rate": 5e-05, + "loss": 0.1401, + "loss/crossentropy": 2.7528311014175415, + "loss/hidden": 0.0, + "loss/logits": 0.14005928859114647, + "loss/reg": 4.176880359649658, + "step": 50 + }, + { + "epoch": 0.00051, + "grad_norm": 0.33225017786026, + "grad_norm_var": 0.015162352298149247, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.73341304063797, + "loss/hidden": 0.0, + "loss/logits": 0.1618291698396206, + "loss/reg": 4.173260688781738, + "step": 51 + }, + { + "epoch": 0.00052, + "grad_norm": 0.33166685700416565, + "grad_norm_var": 0.015176107188209845, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.824883460998535, + "loss/hidden": 0.0, + "loss/logits": 0.1703827939927578, + "loss/reg": 4.168625354766846, + "step": 52 + }, + { + "epoch": 0.00053, + "grad_norm": 0.4255874752998352, + "grad_norm_var": 0.01609058098027729, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.8565452694892883, + "loss/hidden": 0.0, + "loss/logits": 0.18561138212680817, + "loss/reg": 4.164296627044678, + "step": 53 + }, + { + "epoch": 0.00054, + "grad_norm": 0.33207008242607117, + "grad_norm_var": 0.015949373509081675, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.7211243510246277, + "loss/hidden": 0.0, + "loss/logits": 0.1762254200875759, + "loss/reg": 4.16010856628418, + "step": 54 + }, + { + "epoch": 0.00055, + "grad_norm": 0.3105420470237732, + "grad_norm_var": 0.01561146008609899, + "learning_rate": 5e-05, + "loss": 0.172, + "loss/crossentropy": 2.7821205854415894, + "loss/hidden": 0.0, + "loss/logits": 0.17203472182154655, + "loss/reg": 4.155950546264648, + "step": 55 + }, + { + "epoch": 0.00056, + "grad_norm": 0.3342844247817993, + "grad_norm_var": 0.015583353488029018, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.783965766429901, + "loss/hidden": 0.0, + "loss/logits": 0.1675088219344616, + "loss/reg": 4.151437759399414, + "step": 56 + }, + { + "epoch": 0.00057, + "grad_norm": 0.3392151892185211, + "grad_norm_var": 0.0026173613848745727, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.782883048057556, + "loss/hidden": 0.0, + "loss/logits": 0.16754426062107086, + "loss/reg": 4.1469950675964355, + "step": 57 + }, + { + "epoch": 0.00058, + "grad_norm": 0.46169230341911316, + "grad_norm_var": 0.004024211017059094, + "learning_rate": 5e-05, + "loss": 0.1828, + "loss/crossentropy": 2.6869139075279236, + "loss/hidden": 0.0, + "loss/logits": 0.18278859555721283, + "loss/reg": 4.142712116241455, + "step": 58 + }, + { + "epoch": 0.00059, + "grad_norm": 0.35874953866004944, + "grad_norm_var": 0.00399056950783742, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.683705747127533, + "loss/hidden": 0.0, + "loss/logits": 0.17896704375743866, + "loss/reg": 4.138728141784668, + "step": 59 + }, + { + "epoch": 0.0006, + "grad_norm": 0.3390788435935974, + "grad_norm_var": 0.0037128700604173097, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.6724974513053894, + "loss/hidden": 0.0, + "loss/logits": 0.18236950412392616, + "loss/reg": 4.1345534324646, + "step": 60 + }, + { + "epoch": 0.00061, + "grad_norm": 0.3341596722602844, + "grad_norm_var": 0.003246451116369023, + "learning_rate": 5e-05, + "loss": 0.1694, + "loss/crossentropy": 2.956072986125946, + "loss/hidden": 0.0, + "loss/logits": 0.16935936734080315, + "loss/reg": 4.130521774291992, + "step": 61 + }, + { + "epoch": 0.00062, + "grad_norm": 0.33658263087272644, + "grad_norm_var": 0.0029283974011622186, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.8409587144851685, + "loss/hidden": 0.0, + "loss/logits": 0.16678539663553238, + "loss/reg": 4.126163005828857, + "step": 62 + }, + { + "epoch": 0.00063, + "grad_norm": 0.33723217248916626, + "grad_norm_var": 0.0024741312804299983, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.7388935685157776, + "loss/hidden": 0.0, + "loss/logits": 0.18555545806884766, + "loss/reg": 4.121931552886963, + "step": 63 + }, + { + "epoch": 0.00064, + "grad_norm": 0.34580445289611816, + "grad_norm_var": 0.0022020224702210757, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.6729788780212402, + "loss/hidden": 0.0, + "loss/logits": 0.16578427329659462, + "loss/reg": 4.117753982543945, + "step": 64 + }, + { + "epoch": 0.00065, + "grad_norm": 0.33867374062538147, + "grad_norm_var": 0.0015716415803633144, + "learning_rate": 5e-05, + "loss": 0.1643, + "loss/crossentropy": 2.8432253003120422, + "loss/hidden": 0.0, + "loss/logits": 0.16425132378935814, + "loss/reg": 4.113894939422607, + "step": 65 + }, + { + "epoch": 0.00066, + "grad_norm": 0.42098623514175415, + "grad_norm_var": 0.001778022217079652, + "learning_rate": 5e-05, + "loss": 0.2155, + "loss/crossentropy": 2.6712504625320435, + "loss/hidden": 0.0, + "loss/logits": 0.21550852805376053, + "loss/reg": 4.10945463180542, + "step": 66 + }, + { + "epoch": 0.00067, + "grad_norm": 0.35403043031692505, + "grad_norm_var": 0.0017418631675115888, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.7415149211883545, + "loss/hidden": 0.0, + "loss/logits": 0.1797672137618065, + "loss/reg": 4.1049418449401855, + "step": 67 + }, + { + "epoch": 0.00068, + "grad_norm": 0.34834232926368713, + "grad_norm_var": 0.0017045350753313, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.6858341097831726, + "loss/hidden": 0.0, + "loss/logits": 0.17833665013313293, + "loss/reg": 4.100775718688965, + "step": 68 + }, + { + "epoch": 0.00069, + "grad_norm": 0.3541049063205719, + "grad_norm_var": 0.0013731843169029498, + "learning_rate": 5e-05, + "loss": 0.1744, + "loss/crossentropy": 2.8710713982582092, + "loss/hidden": 0.0, + "loss/logits": 0.1744227409362793, + "loss/reg": 4.096506595611572, + "step": 69 + }, + { + "epoch": 0.0007, + "grad_norm": 0.3736323118209839, + "grad_norm_var": 0.0013660110363047928, + "learning_rate": 5e-05, + "loss": 0.1994, + "loss/crossentropy": 2.858128011226654, + "loss/hidden": 0.0, + "loss/logits": 0.19940509647130966, + "loss/reg": 4.091678142547607, + "step": 70 + }, + { + "epoch": 0.00071, + "grad_norm": 0.33025625348091125, + "grad_norm_var": 0.001272272953577754, + "learning_rate": 5e-05, + "loss": 0.1646, + "loss/crossentropy": 2.692229390144348, + "loss/hidden": 0.0, + "loss/logits": 0.16458340734243393, + "loss/reg": 4.087361812591553, + "step": 71 + }, + { + "epoch": 0.00072, + "grad_norm": 0.6907688975334167, + "grad_norm_var": 0.00815051878013667, + "learning_rate": 5e-05, + "loss": 0.1757, + "loss/crossentropy": 2.886055052280426, + "loss/hidden": 0.0, + "loss/logits": 0.1757429726421833, + "loss/reg": 4.08318567276001, + "step": 72 + }, + { + "epoch": 0.00073, + "grad_norm": 0.3311053514480591, + "grad_norm_var": 0.008197602515626375, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.704796850681305, + "loss/hidden": 0.0, + "loss/logits": 0.1681583784520626, + "loss/reg": 4.079033374786377, + "step": 73 + }, + { + "epoch": 0.00074, + "grad_norm": 0.3336332142353058, + "grad_norm_var": 0.0078012237613196535, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.6181225776672363, + "loss/hidden": 0.0, + "loss/logits": 0.16892167925834656, + "loss/reg": 4.074740409851074, + "step": 74 + }, + { + "epoch": 0.00075, + "grad_norm": 0.33766406774520874, + "grad_norm_var": 0.007861895340318493, + "learning_rate": 5e-05, + "loss": 0.1712, + "loss/crossentropy": 2.756729245185852, + "loss/hidden": 0.0, + "loss/logits": 0.17122048512101173, + "loss/reg": 4.070303916931152, + "step": 75 + }, + { + "epoch": 0.00076, + "grad_norm": 0.34048837423324585, + "grad_norm_var": 0.007856372064757134, + "learning_rate": 5e-05, + "loss": 0.1763, + "loss/crossentropy": 2.62674218416214, + "loss/hidden": 0.0, + "loss/logits": 0.17628077790141106, + "loss/reg": 4.065893650054932, + "step": 76 + }, + { + "epoch": 0.00077, + "grad_norm": 0.3368911147117615, + "grad_norm_var": 0.007844070912018693, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.838981509208679, + "loss/hidden": 0.0, + "loss/logits": 0.17892110347747803, + "loss/reg": 4.061193943023682, + "step": 77 + }, + { + "epoch": 0.00078, + "grad_norm": 0.2983826696872711, + "grad_norm_var": 0.008102358070792626, + "learning_rate": 5e-05, + "loss": 0.151, + "loss/crossentropy": 2.8157095909118652, + "loss/hidden": 0.0, + "loss/logits": 0.15098581835627556, + "loss/reg": 4.05631685256958, + "step": 78 + }, + { + "epoch": 0.00079, + "grad_norm": 0.34036847949028015, + "grad_norm_var": 0.008090524798600873, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.742383122444153, + "loss/hidden": 0.0, + "loss/logits": 0.17718595638871193, + "loss/reg": 4.051788330078125, + "step": 79 + }, + { + "epoch": 0.0008, + "grad_norm": 0.3196929097175598, + "grad_norm_var": 0.008207612908988405, + "learning_rate": 5e-05, + "loss": 0.1574, + "loss/crossentropy": 2.64748877286911, + "loss/hidden": 0.0, + "loss/logits": 0.15740340948104858, + "loss/reg": 4.046438694000244, + "step": 80 + }, + { + "epoch": 0.00081, + "grad_norm": 0.3145473897457123, + "grad_norm_var": 0.008330494258097032, + "learning_rate": 5e-05, + "loss": 0.1591, + "loss/crossentropy": 2.7640033960342407, + "loss/hidden": 0.0, + "loss/logits": 0.15912048518657684, + "loss/reg": 4.041863441467285, + "step": 81 + }, + { + "epoch": 0.00082, + "grad_norm": 0.37658828496932983, + "grad_norm_var": 0.008116681055328008, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.8226330876350403, + "loss/hidden": 0.0, + "loss/logits": 0.17833809927105904, + "loss/reg": 4.0372796058654785, + "step": 82 + }, + { + "epoch": 0.00083, + "grad_norm": 0.36421865224838257, + "grad_norm_var": 0.00811331907494814, + "learning_rate": 5e-05, + "loss": 0.1636, + "loss/crossentropy": 2.762717604637146, + "loss/hidden": 0.0, + "loss/logits": 0.16359057649970055, + "loss/reg": 4.032177925109863, + "step": 83 + }, + { + "epoch": 0.00084, + "grad_norm": 0.3138120174407959, + "grad_norm_var": 0.00825034262581384, + "learning_rate": 5e-05, + "loss": 0.1606, + "loss/crossentropy": 2.625426709651947, + "loss/hidden": 0.0, + "loss/logits": 0.16061001271009445, + "loss/reg": 4.027446269989014, + "step": 84 + }, + { + "epoch": 0.00085, + "grad_norm": 0.34441590309143066, + "grad_norm_var": 0.00826351514204321, + "learning_rate": 5e-05, + "loss": 0.1667, + "loss/crossentropy": 2.8294222950935364, + "loss/hidden": 0.0, + "loss/logits": 0.16673466563224792, + "loss/reg": 4.022748947143555, + "step": 85 + }, + { + "epoch": 0.00086, + "grad_norm": 0.316683828830719, + "grad_norm_var": 0.00835627592765974, + "learning_rate": 5e-05, + "loss": 0.1564, + "loss/crossentropy": 2.8250383734703064, + "loss/hidden": 0.0, + "loss/logits": 0.1564498096704483, + "loss/reg": 4.017378330230713, + "step": 86 + }, + { + "epoch": 0.00087, + "grad_norm": 0.3178180456161499, + "grad_norm_var": 0.008407967451986308, + "learning_rate": 5e-05, + "loss": 0.1589, + "loss/crossentropy": 2.831330358982086, + "loss/hidden": 0.0, + "loss/logits": 0.15890633687376976, + "loss/reg": 4.012408256530762, + "step": 87 + }, + { + "epoch": 0.00088, + "grad_norm": 0.33865824341773987, + "grad_norm_var": 0.00038455914158520567, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.8202422857284546, + "loss/hidden": 0.0, + "loss/logits": 0.16647625714540482, + "loss/reg": 4.00655460357666, + "step": 88 + }, + { + "epoch": 0.00089, + "grad_norm": 0.33375900983810425, + "grad_norm_var": 0.00038439593085719167, + "learning_rate": 5e-05, + "loss": 0.1655, + "loss/crossentropy": 2.748092472553253, + "loss/hidden": 0.0, + "loss/logits": 0.1655096672475338, + "loss/reg": 4.000852584838867, + "step": 89 + }, + { + "epoch": 0.0009, + "grad_norm": 0.41060250997543335, + "grad_norm_var": 0.000761403690223957, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.8519994616508484, + "loss/hidden": 0.0, + "loss/logits": 0.1679377369582653, + "loss/reg": 3.9966533184051514, + "step": 90 + }, + { + "epoch": 0.00091, + "grad_norm": 0.3349744379520416, + "grad_norm_var": 0.0007618998964447029, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.8302014470100403, + "loss/hidden": 0.0, + "loss/logits": 0.16629018262028694, + "loss/reg": 3.9916272163391113, + "step": 91 + }, + { + "epoch": 0.00092, + "grad_norm": 0.40859073400497437, + "grad_norm_var": 0.0010778266384652254, + "learning_rate": 5e-05, + "loss": 0.1631, + "loss/crossentropy": 2.831357002258301, + "loss/hidden": 0.0, + "loss/logits": 0.16314184293150902, + "loss/reg": 3.9862587451934814, + "step": 92 + }, + { + "epoch": 0.00093, + "grad_norm": 0.3679395616054535, + "grad_norm_var": 0.0011174436691973562, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.653463125228882, + "loss/hidden": 0.0, + "loss/logits": 0.17491210997104645, + "loss/reg": 3.9809703826904297, + "step": 93 + }, + { + "epoch": 0.00094, + "grad_norm": 0.33192068338394165, + "grad_norm_var": 0.000984578674839117, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.9128816723823547, + "loss/hidden": 0.0, + "loss/logits": 0.16890091821551323, + "loss/reg": 3.9768238067626953, + "step": 94 + }, + { + "epoch": 0.00095, + "grad_norm": 0.33981162309646606, + "grad_norm_var": 0.000985009641976816, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.8998738527297974, + "loss/hidden": 0.0, + "loss/logits": 0.1651129573583603, + "loss/reg": 3.9723405838012695, + "step": 95 + }, + { + "epoch": 0.00096, + "grad_norm": 0.31845277547836304, + "grad_norm_var": 0.0009894353533322537, + "learning_rate": 5e-05, + "loss": 0.1566, + "loss/crossentropy": 2.738618314266205, + "loss/hidden": 0.0, + "loss/logits": 0.15662826597690582, + "loss/reg": 3.9680373668670654, + "step": 96 + }, + { + "epoch": 0.00097, + "grad_norm": 0.3521839678287506, + "grad_norm_var": 0.0009211371554959176, + "learning_rate": 5e-05, + "loss": 0.1571, + "loss/crossentropy": 2.896687388420105, + "loss/hidden": 0.0, + "loss/logits": 0.15710216015577316, + "loss/reg": 3.964097499847412, + "step": 97 + }, + { + "epoch": 0.00098, + "grad_norm": 0.41529935598373413, + "grad_norm_var": 0.0011615701056859014, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.6711183190345764, + "loss/hidden": 0.0, + "loss/logits": 0.176058791577816, + "loss/reg": 3.959585428237915, + "step": 98 + }, + { + "epoch": 0.00099, + "grad_norm": 0.3406970202922821, + "grad_norm_var": 0.0011533483453351997, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.762200713157654, + "loss/hidden": 0.0, + "loss/logits": 0.17553818225860596, + "loss/reg": 3.9551267623901367, + "step": 99 + }, + { + "epoch": 0.001, + "grad_norm": 0.3295409083366394, + "grad_norm_var": 0.0010948026927074712, + "learning_rate": 5e-05, + "loss": 0.1791, + "loss/crossentropy": 2.666721522808075, + "loss/hidden": 0.0, + "loss/logits": 0.17914289608597755, + "loss/reg": 3.9509167671203613, + "step": 100 + }, + { + "epoch": 0.00101, + "grad_norm": 0.3429720401763916, + "grad_norm_var": 0.001096024238407974, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.82060843706131, + "loss/hidden": 0.0, + "loss/logits": 0.1792576014995575, + "loss/reg": 3.9469358921051025, + "step": 101 + }, + { + "epoch": 0.00102, + "grad_norm": 0.3215195834636688, + "grad_norm_var": 0.0010760084324249537, + "learning_rate": 5e-05, + "loss": 0.1632, + "loss/crossentropy": 2.808405876159668, + "loss/hidden": 0.0, + "loss/logits": 0.16316882148385048, + "loss/reg": 3.943436622619629, + "step": 102 + }, + { + "epoch": 0.00103, + "grad_norm": 0.33158427476882935, + "grad_norm_var": 0.0010282390377130302, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.8497248888015747, + "loss/hidden": 0.0, + "loss/logits": 0.1783306896686554, + "loss/reg": 3.9394803047180176, + "step": 103 + }, + { + "epoch": 0.00104, + "grad_norm": 0.3384368121623993, + "grad_norm_var": 0.001028611107856688, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.8479551672935486, + "loss/hidden": 0.0, + "loss/logits": 0.17731818184256554, + "loss/reg": 3.935678243637085, + "step": 104 + }, + { + "epoch": 0.00105, + "grad_norm": 0.3275454342365265, + "grad_norm_var": 0.0010454262321925218, + "learning_rate": 5e-05, + "loss": 0.172, + "loss/crossentropy": 2.7240310311317444, + "loss/hidden": 0.0, + "loss/logits": 0.17204875499010086, + "loss/reg": 3.932224750518799, + "step": 105 + }, + { + "epoch": 0.00106, + "grad_norm": 0.3352244198322296, + "grad_norm_var": 0.0007990449288615142, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.657980978488922, + "loss/hidden": 0.0, + "loss/logits": 0.16869833320379257, + "loss/reg": 3.92889142036438, + "step": 106 + }, + { + "epoch": 0.00107, + "grad_norm": 0.3195781409740448, + "grad_norm_var": 0.00083658300653268, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.7351735830307007, + "loss/hidden": 0.0, + "loss/logits": 0.16421591117978096, + "loss/reg": 3.9260904788970947, + "step": 107 + }, + { + "epoch": 0.00108, + "grad_norm": 0.3216703534126282, + "grad_norm_var": 0.0005727423089818255, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.835266649723053, + "loss/hidden": 0.0, + "loss/logits": 0.1611352562904358, + "loss/reg": 3.923356533050537, + "step": 108 + }, + { + "epoch": 0.00109, + "grad_norm": 0.3534785807132721, + "grad_norm_var": 0.0005312635552543169, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.8821677565574646, + "loss/hidden": 0.0, + "loss/logits": 0.1689467802643776, + "loss/reg": 3.9208316802978516, + "step": 109 + }, + { + "epoch": 0.0011, + "grad_norm": 0.33851271867752075, + "grad_norm_var": 0.0005279815580263729, + "learning_rate": 5e-05, + "loss": 0.171, + "loss/crossentropy": 2.7201637029647827, + "loss/hidden": 0.0, + "loss/logits": 0.17095838487148285, + "loss/reg": 3.918743133544922, + "step": 110 + }, + { + "epoch": 0.00111, + "grad_norm": 0.32998839020729065, + "grad_norm_var": 0.0005331548233647158, + "learning_rate": 5e-05, + "loss": 0.166, + "loss/crossentropy": 2.6836928725242615, + "loss/hidden": 0.0, + "loss/logits": 0.16604754701256752, + "loss/reg": 3.914886951446533, + "step": 111 + }, + { + "epoch": 0.00112, + "grad_norm": 0.420744925737381, + "grad_norm_var": 0.0009131281860373264, + "learning_rate": 5e-05, + "loss": 0.1738, + "loss/crossentropy": 2.568650722503662, + "loss/hidden": 0.0, + "loss/logits": 0.1737859919667244, + "loss/reg": 3.9106812477111816, + "step": 112 + }, + { + "epoch": 0.00113, + "grad_norm": 0.3349835276603699, + "grad_norm_var": 0.000914996833659265, + "learning_rate": 5e-05, + "loss": 0.1522, + "loss/crossentropy": 2.7411792278289795, + "loss/hidden": 0.0, + "loss/logits": 0.15223057195544243, + "loss/reg": 3.9069032669067383, + "step": 113 + }, + { + "epoch": 0.00114, + "grad_norm": 0.34276068210601807, + "grad_norm_var": 0.0005529241807124034, + "learning_rate": 5e-05, + "loss": 0.1567, + "loss/crossentropy": 2.80877947807312, + "loss/hidden": 0.0, + "loss/logits": 0.1567244492471218, + "loss/reg": 3.90332293510437, + "step": 114 + }, + { + "epoch": 0.00115, + "grad_norm": 0.35375383496284485, + "grad_norm_var": 0.0005659636539689298, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.698065936565399, + "loss/hidden": 0.0, + "loss/logits": 0.16574329882860184, + "loss/reg": 3.8998756408691406, + "step": 115 + }, + { + "epoch": 0.00116, + "grad_norm": 0.33278602361679077, + "grad_norm_var": 0.0005620343134485931, + "learning_rate": 5e-05, + "loss": 0.1739, + "loss/crossentropy": 2.7814364433288574, + "loss/hidden": 0.0, + "loss/logits": 0.17385346069931984, + "loss/reg": 3.8964290618896484, + "step": 116 + }, + { + "epoch": 0.00117, + "grad_norm": 0.35139891505241394, + "grad_norm_var": 0.0005694228893132684, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.7721198201179504, + "loss/hidden": 0.0, + "loss/logits": 0.1701316274702549, + "loss/reg": 3.8925936222076416, + "step": 117 + }, + { + "epoch": 0.00118, + "grad_norm": 0.3708522915840149, + "grad_norm_var": 0.0005942298534055627, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.8753750920295715, + "loss/hidden": 0.0, + "loss/logits": 0.17226434499025345, + "loss/reg": 3.888739824295044, + "step": 118 + }, + { + "epoch": 0.00119, + "grad_norm": 0.32619452476501465, + "grad_norm_var": 0.0006049363247454272, + "learning_rate": 5e-05, + "loss": 0.1559, + "loss/crossentropy": 2.792622923851013, + "loss/hidden": 0.0, + "loss/logits": 0.15585486218333244, + "loss/reg": 3.8849120140075684, + "step": 119 + }, + { + "epoch": 0.0012, + "grad_norm": 0.3160404562950134, + "grad_norm_var": 0.0006517621123632485, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.833389937877655, + "loss/hidden": 0.0, + "loss/logits": 0.16574294120073318, + "loss/reg": 3.8814921379089355, + "step": 120 + }, + { + "epoch": 0.00121, + "grad_norm": 2.6332755088806152, + "grad_norm_var": 0.328414929277446, + "learning_rate": 5e-05, + "loss": 0.2807, + "loss/crossentropy": 2.960978329181671, + "loss/hidden": 0.0, + "loss/logits": 0.280683059245348, + "loss/reg": 3.8778162002563477, + "step": 121 + }, + { + "epoch": 0.00122, + "grad_norm": 0.39280807971954346, + "grad_norm_var": 0.32746202761424736, + "learning_rate": 5e-05, + "loss": 0.1791, + "loss/crossentropy": 2.8656354546546936, + "loss/hidden": 0.0, + "loss/logits": 0.17905254289507866, + "loss/reg": 3.8742706775665283, + "step": 122 + }, + { + "epoch": 0.00123, + "grad_norm": 0.36644095182418823, + "grad_norm_var": 0.3265348837601918, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.776346266269684, + "loss/hidden": 0.0, + "loss/logits": 0.1764557734131813, + "loss/reg": 3.8701822757720947, + "step": 123 + }, + { + "epoch": 0.00124, + "grad_norm": 0.39717525243759155, + "grad_norm_var": 0.3251678188828664, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.9204375743865967, + "loss/hidden": 0.0, + "loss/logits": 0.1796155981719494, + "loss/reg": 3.866316556930542, + "step": 124 + }, + { + "epoch": 0.00125, + "grad_norm": 0.366623193025589, + "grad_norm_var": 0.3249260727271075, + "learning_rate": 5e-05, + "loss": 0.1654, + "loss/crossentropy": 2.42034849524498, + "loss/hidden": 0.0, + "loss/logits": 0.165392205119133, + "loss/reg": 3.8625807762145996, + "step": 125 + }, + { + "epoch": 0.00126, + "grad_norm": 0.3638598918914795, + "grad_norm_var": 0.32442588175429127, + "learning_rate": 5e-05, + "loss": 0.1601, + "loss/crossentropy": 2.936553716659546, + "loss/hidden": 0.0, + "loss/logits": 0.16014225035905838, + "loss/reg": 3.8585283756256104, + "step": 126 + }, + { + "epoch": 0.00127, + "grad_norm": 0.3437521159648895, + "grad_norm_var": 0.3241257586372512, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.8428520560264587, + "loss/hidden": 0.0, + "loss/logits": 0.16030794754624367, + "loss/reg": 3.854602813720703, + "step": 127 + }, + { + "epoch": 0.00128, + "grad_norm": 0.3604683578014374, + "grad_norm_var": 0.3249965569466151, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.717309355735779, + "loss/hidden": 0.0, + "loss/logits": 0.1687549129128456, + "loss/reg": 3.85067081451416, + "step": 128 + }, + { + "epoch": 0.00129, + "grad_norm": 0.3499651849269867, + "grad_norm_var": 0.32468680185211135, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.819560468196869, + "loss/hidden": 0.0, + "loss/logits": 0.17475899681448936, + "loss/reg": 3.8467037677764893, + "step": 129 + }, + { + "epoch": 0.0013, + "grad_norm": 0.3231496512889862, + "grad_norm_var": 0.32511678466571453, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.5843223929405212, + "loss/hidden": 0.0, + "loss/logits": 0.16951489821076393, + "loss/reg": 3.843282699584961, + "step": 130 + }, + { + "epoch": 0.00131, + "grad_norm": 0.3588982820510864, + "grad_norm_var": 0.325020330590364, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.725651264190674, + "loss/hidden": 0.0, + "loss/logits": 0.16896183416247368, + "loss/reg": 3.839895725250244, + "step": 131 + }, + { + "epoch": 0.00132, + "grad_norm": 0.37743306159973145, + "grad_norm_var": 0.32416673149153025, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 3.0410608053207397, + "loss/hidden": 0.0, + "loss/logits": 0.1833292953670025, + "loss/reg": 3.8355963230133057, + "step": 132 + }, + { + "epoch": 0.00133, + "grad_norm": 0.32988330721855164, + "grad_norm_var": 0.32462166470000664, + "learning_rate": 5e-05, + "loss": 0.1654, + "loss/crossentropy": 2.7005507349967957, + "loss/hidden": 0.0, + "loss/logits": 0.1653790920972824, + "loss/reg": 3.831345558166504, + "step": 133 + }, + { + "epoch": 0.00134, + "grad_norm": 0.35988613963127136, + "grad_norm_var": 0.32481589623167567, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.7048683762550354, + "loss/hidden": 0.0, + "loss/logits": 0.17917973920702934, + "loss/reg": 3.8267781734466553, + "step": 134 + }, + { + "epoch": 0.00135, + "grad_norm": 0.32649827003479004, + "grad_norm_var": 0.324808949416691, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.791461765766144, + "loss/hidden": 0.0, + "loss/logits": 0.16420895606279373, + "loss/reg": 3.8223133087158203, + "step": 135 + }, + { + "epoch": 0.00136, + "grad_norm": 0.6779212355613708, + "grad_norm_var": 0.32421967313153754, + "learning_rate": 5e-05, + "loss": 0.2361, + "loss/crossentropy": 3.063343107700348, + "loss/hidden": 0.0, + "loss/logits": 0.2360655590891838, + "loss/reg": 3.818582057952881, + "step": 136 + }, + { + "epoch": 0.00137, + "grad_norm": 0.4217770993709564, + "grad_norm_var": 0.0069040846383882, + "learning_rate": 5e-05, + "loss": 0.1936, + "loss/crossentropy": 2.8291149735450745, + "loss/hidden": 0.0, + "loss/logits": 0.19361505657434464, + "loss/reg": 3.814713716506958, + "step": 137 + }, + { + "epoch": 0.00138, + "grad_norm": 0.3183574378490448, + "grad_norm_var": 0.0071460434004817905, + "learning_rate": 5e-05, + "loss": 0.1596, + "loss/crossentropy": 2.733646512031555, + "loss/hidden": 0.0, + "loss/logits": 0.15959006920456886, + "loss/reg": 3.8112361431121826, + "step": 138 + }, + { + "epoch": 0.00139, + "grad_norm": 0.35119444131851196, + "grad_norm_var": 0.007183318962822194, + "learning_rate": 5e-05, + "loss": 0.1706, + "loss/crossentropy": 2.777931809425354, + "loss/hidden": 0.0, + "loss/logits": 0.17056189104914665, + "loss/reg": 3.807130813598633, + "step": 139 + }, + { + "epoch": 0.0014, + "grad_norm": 0.3381962478160858, + "grad_norm_var": 0.007239536480815012, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.865752935409546, + "loss/hidden": 0.0, + "loss/logits": 0.16511252894997597, + "loss/reg": 3.8030734062194824, + "step": 140 + }, + { + "epoch": 0.00141, + "grad_norm": 0.35082533955574036, + "grad_norm_var": 0.007268548808216302, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.734546184539795, + "loss/hidden": 0.0, + "loss/logits": 0.16080284118652344, + "loss/reg": 3.7996251583099365, + "step": 141 + }, + { + "epoch": 0.00142, + "grad_norm": 0.4269000291824341, + "grad_norm_var": 0.007448472313405929, + "learning_rate": 5e-05, + "loss": 0.1806, + "loss/crossentropy": 2.9227113127708435, + "loss/hidden": 0.0, + "loss/logits": 0.1805506870150566, + "loss/reg": 3.7955057621002197, + "step": 142 + }, + { + "epoch": 0.00143, + "grad_norm": 0.3532395660877228, + "grad_norm_var": 0.0074133753520221855, + "learning_rate": 5e-05, + "loss": 0.1588, + "loss/crossentropy": 2.9407125115394592, + "loss/hidden": 0.0, + "loss/logits": 0.15880529955029488, + "loss/reg": 3.791508197784424, + "step": 143 + }, + { + "epoch": 0.00144, + "grad_norm": 0.3449239134788513, + "grad_norm_var": 0.007461781173789813, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.8305121660232544, + "loss/hidden": 0.0, + "loss/logits": 0.1652398444712162, + "loss/reg": 3.7871744632720947, + "step": 144 + }, + { + "epoch": 0.00145, + "grad_norm": 0.3272966742515564, + "grad_norm_var": 0.007571273873210712, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.876939594745636, + "loss/hidden": 0.0, + "loss/logits": 0.17143940553069115, + "loss/reg": 3.7832887172698975, + "step": 145 + }, + { + "epoch": 0.00146, + "grad_norm": 0.31960922479629517, + "grad_norm_var": 0.007596131782178968, + "learning_rate": 5e-05, + "loss": 0.1558, + "loss/crossentropy": 2.7597694993019104, + "loss/hidden": 0.0, + "loss/logits": 0.15579523891210556, + "loss/reg": 3.77976393699646, + "step": 146 + }, + { + "epoch": 0.00147, + "grad_norm": 0.3329758048057556, + "grad_norm_var": 0.007690076208493398, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.823091506958008, + "loss/hidden": 0.0, + "loss/logits": 0.16016652062535286, + "loss/reg": 3.776364326477051, + "step": 147 + }, + { + "epoch": 0.00148, + "grad_norm": 0.3245135545730591, + "grad_norm_var": 0.007828939248271782, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.622242748737335, + "loss/hidden": 0.0, + "loss/logits": 0.16081608831882477, + "loss/reg": 3.7724997997283936, + "step": 148 + }, + { + "epoch": 0.00149, + "grad_norm": 0.3239537179470062, + "grad_norm_var": 0.007862062788276463, + "learning_rate": 5e-05, + "loss": 0.1559, + "loss/crossentropy": 2.826173484325409, + "loss/hidden": 0.0, + "loss/logits": 0.15591008588671684, + "loss/reg": 3.7680001258850098, + "step": 149 + }, + { + "epoch": 0.0015, + "grad_norm": 0.3199516534805298, + "grad_norm_var": 0.00800828926831548, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.73406845331192, + "loss/hidden": 0.0, + "loss/logits": 0.17048393934965134, + "loss/reg": 3.7640268802642822, + "step": 150 + }, + { + "epoch": 0.00151, + "grad_norm": 0.3810157775878906, + "grad_norm_var": 0.00790594146931481, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.746786952018738, + "loss/hidden": 0.0, + "loss/logits": 0.17715823650360107, + "loss/reg": 3.760627508163452, + "step": 151 + }, + { + "epoch": 0.00152, + "grad_norm": 0.33840498328208923, + "grad_norm_var": 0.0011503711202599262, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.7671576738357544, + "loss/hidden": 0.0, + "loss/logits": 0.1679898537695408, + "loss/reg": 3.7571685314178467, + "step": 152 + }, + { + "epoch": 0.00153, + "grad_norm": 0.35103219747543335, + "grad_norm_var": 0.0007702874374444798, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.8394588828086853, + "loss/hidden": 0.0, + "loss/logits": 0.1682782731950283, + "loss/reg": 3.7533957958221436, + "step": 153 + }, + { + "epoch": 0.00154, + "grad_norm": 0.34948527812957764, + "grad_norm_var": 0.000724837481797543, + "learning_rate": 5e-05, + "loss": 0.1551, + "loss/crossentropy": 2.637475073337555, + "loss/hidden": 0.0, + "loss/logits": 0.1551469974219799, + "loss/reg": 3.7496984004974365, + "step": 154 + }, + { + "epoch": 0.00155, + "grad_norm": 0.32411250472068787, + "grad_norm_var": 0.000751360146424022, + "learning_rate": 5e-05, + "loss": 0.1655, + "loss/crossentropy": 2.65782767534256, + "loss/hidden": 0.0, + "loss/logits": 0.16551653295755386, + "loss/reg": 3.7462174892425537, + "step": 155 + }, + { + "epoch": 0.00156, + "grad_norm": 0.3659244775772095, + "grad_norm_var": 0.0007773935392291246, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.8054139614105225, + "loss/hidden": 0.0, + "loss/logits": 0.16182733327150345, + "loss/reg": 3.7422730922698975, + "step": 156 + }, + { + "epoch": 0.00157, + "grad_norm": 0.3639696538448334, + "grad_norm_var": 0.0007968496539047743, + "learning_rate": 5e-05, + "loss": 0.172, + "loss/crossentropy": 2.643721103668213, + "loss/hidden": 0.0, + "loss/logits": 0.17196981981396675, + "loss/reg": 3.7390189170837402, + "step": 157 + }, + { + "epoch": 0.00158, + "grad_norm": 0.372111439704895, + "grad_norm_var": 0.0003986384080602812, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.6860750317573547, + "loss/hidden": 0.0, + "loss/logits": 0.17522458359599113, + "loss/reg": 3.7351748943328857, + "step": 158 + }, + { + "epoch": 0.00159, + "grad_norm": 0.3412966728210449, + "grad_norm_var": 0.0003916975034196302, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.7506829500198364, + "loss/hidden": 0.0, + "loss/logits": 0.17320549115538597, + "loss/reg": 3.731645345687866, + "step": 159 + }, + { + "epoch": 0.0016, + "grad_norm": 0.31508323550224304, + "grad_norm_var": 0.0004378510847698321, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.672293782234192, + "loss/hidden": 0.0, + "loss/logits": 0.16758090257644653, + "loss/reg": 3.727598190307617, + "step": 160 + }, + { + "epoch": 0.00161, + "grad_norm": 0.39773106575012207, + "grad_norm_var": 0.0006223116385708161, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 2.975751519203186, + "loss/hidden": 0.0, + "loss/logits": 0.18666821345686913, + "loss/reg": 3.7237842082977295, + "step": 161 + }, + { + "epoch": 0.00162, + "grad_norm": 0.3057797849178314, + "grad_norm_var": 0.0006812186499233134, + "learning_rate": 5e-05, + "loss": 0.1511, + "loss/crossentropy": 2.768982172012329, + "loss/hidden": 0.0, + "loss/logits": 0.15112394466996193, + "loss/reg": 3.7201473712921143, + "step": 162 + }, + { + "epoch": 0.00163, + "grad_norm": 0.39109617471694946, + "grad_norm_var": 0.0008052929738533592, + "learning_rate": 5e-05, + "loss": 0.1692, + "loss/crossentropy": 2.7556854486465454, + "loss/hidden": 0.0, + "loss/logits": 0.1692204400897026, + "loss/reg": 3.715847969055176, + "step": 163 + }, + { + "epoch": 0.00164, + "grad_norm": 0.3230038285255432, + "grad_norm_var": 0.0008101312463145642, + "learning_rate": 5e-05, + "loss": 0.158, + "loss/crossentropy": 2.663906216621399, + "loss/hidden": 0.0, + "loss/logits": 0.1579984687268734, + "loss/reg": 3.712200403213501, + "step": 164 + }, + { + "epoch": 0.00165, + "grad_norm": 0.32820436358451843, + "grad_norm_var": 0.0007977755717131292, + "learning_rate": 5e-05, + "loss": 0.1535, + "loss/crossentropy": 2.7556238174438477, + "loss/hidden": 0.0, + "loss/logits": 0.15348907560110092, + "loss/reg": 3.7093729972839355, + "step": 165 + }, + { + "epoch": 0.00166, + "grad_norm": 0.37247714400291443, + "grad_norm_var": 0.0007736858685811421, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.623964309692383, + "loss/hidden": 0.0, + "loss/logits": 0.16797634214162827, + "loss/reg": 3.7055835723876953, + "step": 166 + }, + { + "epoch": 0.00167, + "grad_norm": 0.31921809911727905, + "grad_norm_var": 0.0007674848471050747, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.6233983039855957, + "loss/hidden": 0.0, + "loss/logits": 0.16180693730711937, + "loss/reg": 3.7018704414367676, + "step": 167 + }, + { + "epoch": 0.00168, + "grad_norm": 0.41518375277519226, + "grad_norm_var": 0.0010434978692974088, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.794585347175598, + "loss/hidden": 0.0, + "loss/logits": 0.18423354998230934, + "loss/reg": 3.6984987258911133, + "step": 168 + }, + { + "epoch": 0.00169, + "grad_norm": 0.3530808985233307, + "grad_norm_var": 0.0010434324942960296, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.725895941257477, + "loss/hidden": 0.0, + "loss/logits": 0.18175217881798744, + "loss/reg": 3.6949737071990967, + "step": 169 + }, + { + "epoch": 0.0017, + "grad_norm": 0.35729339718818665, + "grad_norm_var": 0.001044250197534243, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.8144423365592957, + "loss/hidden": 0.0, + "loss/logits": 0.1758369542658329, + "loss/reg": 3.6909079551696777, + "step": 170 + }, + { + "epoch": 0.00171, + "grad_norm": 0.3258056044578552, + "grad_norm_var": 0.0010379424391956011, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.6860609650611877, + "loss/hidden": 0.0, + "loss/logits": 0.1615053378045559, + "loss/reg": 3.6872336864471436, + "step": 171 + }, + { + "epoch": 0.00172, + "grad_norm": 0.3320024907588959, + "grad_norm_var": 0.0010511954351829684, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.7618680596351624, + "loss/hidden": 0.0, + "loss/logits": 0.16686224937438965, + "loss/reg": 3.684033155441284, + "step": 172 + }, + { + "epoch": 0.00173, + "grad_norm": 0.32370057702064514, + "grad_norm_var": 0.001082015111668518, + "learning_rate": 5e-05, + "loss": 0.1568, + "loss/crossentropy": 2.8911356329917908, + "loss/hidden": 0.0, + "loss/logits": 0.15675026923418045, + "loss/reg": 3.6797101497650146, + "step": 173 + }, + { + "epoch": 0.00174, + "grad_norm": 0.3590388298034668, + "grad_norm_var": 0.0010512214971074684, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.894763946533203, + "loss/hidden": 0.0, + "loss/logits": 0.16077794507145882, + "loss/reg": 3.676694393157959, + "step": 174 + }, + { + "epoch": 0.00175, + "grad_norm": 0.362693190574646, + "grad_norm_var": 0.0010621381304175893, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.9355967044830322, + "loss/hidden": 0.0, + "loss/logits": 0.18095535412430763, + "loss/reg": 3.6728715896606445, + "step": 175 + }, + { + "epoch": 0.00176, + "grad_norm": 0.3421201705932617, + "grad_norm_var": 0.0009861454944628978, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.771928548812866, + "loss/hidden": 0.0, + "loss/logits": 0.17522436380386353, + "loss/reg": 3.6698157787323, + "step": 176 + }, + { + "epoch": 0.00177, + "grad_norm": 0.3921768069267273, + "grad_norm_var": 0.0009531156716223066, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.9020140171051025, + "loss/hidden": 0.0, + "loss/logits": 0.16816864535212517, + "loss/reg": 3.6669130325317383, + "step": 177 + }, + { + "epoch": 0.00178, + "grad_norm": 0.414460688829422, + "grad_norm_var": 0.0010479472090343092, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.871070384979248, + "loss/hidden": 0.0, + "loss/logits": 0.1651761755347252, + "loss/reg": 3.6637353897094727, + "step": 178 + }, + { + "epoch": 0.00179, + "grad_norm": 0.37821123003959656, + "grad_norm_var": 0.0009996989224075473, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.8318552374839783, + "loss/hidden": 0.0, + "loss/logits": 0.16177014261484146, + "loss/reg": 3.6601526737213135, + "step": 179 + }, + { + "epoch": 0.0018, + "grad_norm": 0.33756861090660095, + "grad_norm_var": 0.0009485554235717804, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.7179840803146362, + "loss/hidden": 0.0, + "loss/logits": 0.16402991488575935, + "loss/reg": 3.655977725982666, + "step": 180 + }, + { + "epoch": 0.00181, + "grad_norm": 0.3508152663707733, + "grad_norm_var": 0.0008934631549546879, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.655538856983185, + "loss/hidden": 0.0, + "loss/logits": 0.18240001425147057, + "loss/reg": 3.6522390842437744, + "step": 181 + }, + { + "epoch": 0.00182, + "grad_norm": 0.4800889194011688, + "grad_norm_var": 0.0018179163356779901, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.9170504808425903, + "loss/hidden": 0.0, + "loss/logits": 0.17730093747377396, + "loss/reg": 3.648420810699463, + "step": 182 + }, + { + "epoch": 0.00183, + "grad_norm": 0.32715606689453125, + "grad_norm_var": 0.0017731703957083382, + "learning_rate": 5e-05, + "loss": 0.1599, + "loss/crossentropy": 2.6978230476379395, + "loss/hidden": 0.0, + "loss/logits": 0.15988203510642052, + "loss/reg": 3.644439458847046, + "step": 183 + }, + { + "epoch": 0.00184, + "grad_norm": 0.3219493329524994, + "grad_norm_var": 0.0017014689354580615, + "learning_rate": 5e-05, + "loss": 0.1588, + "loss/crossentropy": 2.772395610809326, + "loss/hidden": 0.0, + "loss/logits": 0.158803328871727, + "loss/reg": 3.6410605907440186, + "step": 184 + }, + { + "epoch": 0.00185, + "grad_norm": 0.3204100728034973, + "grad_norm_var": 0.0017978203455529696, + "learning_rate": 5e-05, + "loss": 0.1595, + "loss/crossentropy": 2.7290788292884827, + "loss/hidden": 0.0, + "loss/logits": 0.15948805212974548, + "loss/reg": 3.637272596359253, + "step": 185 + }, + { + "epoch": 0.00186, + "grad_norm": 0.34646865725517273, + "grad_norm_var": 0.0018059373173852718, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.68435937166214, + "loss/hidden": 0.0, + "loss/logits": 0.17226089164614677, + "loss/reg": 3.6334545612335205, + "step": 186 + }, + { + "epoch": 0.00187, + "grad_norm": 0.35515356063842773, + "grad_norm_var": 0.001737051018651666, + "learning_rate": 5e-05, + "loss": 0.1656, + "loss/crossentropy": 2.8159299492836, + "loss/hidden": 0.0, + "loss/logits": 0.16557194665074348, + "loss/reg": 3.629215717315674, + "step": 187 + }, + { + "epoch": 0.00188, + "grad_norm": 0.31605055928230286, + "grad_norm_var": 0.0018103786054489293, + "learning_rate": 5e-05, + "loss": 0.1587, + "loss/crossentropy": 2.737620174884796, + "loss/hidden": 0.0, + "loss/logits": 0.15867746248841286, + "loss/reg": 3.6252663135528564, + "step": 188 + }, + { + "epoch": 0.00189, + "grad_norm": 0.3383916914463043, + "grad_norm_var": 0.0017566740185558556, + "learning_rate": 5e-05, + "loss": 0.1621, + "loss/crossentropy": 2.7829577326774597, + "loss/hidden": 0.0, + "loss/logits": 0.162098228931427, + "loss/reg": 3.621067523956299, + "step": 189 + }, + { + "epoch": 0.0019, + "grad_norm": 0.4556836783885956, + "grad_norm_var": 0.0023419423247556044, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.9624626636505127, + "loss/hidden": 0.0, + "loss/logits": 0.1686898171901703, + "loss/reg": 3.6163265705108643, + "step": 190 + }, + { + "epoch": 0.00191, + "grad_norm": 0.3975931406021118, + "grad_norm_var": 0.0024075083289669527, + "learning_rate": 5e-05, + "loss": 0.155, + "loss/crossentropy": 2.731001079082489, + "loss/hidden": 0.0, + "loss/logits": 0.15501929074525833, + "loss/reg": 3.6121585369110107, + "step": 191 + }, + { + "epoch": 0.00192, + "grad_norm": 0.37328633666038513, + "grad_norm_var": 0.002364231645687964, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.754942238330841, + "loss/hidden": 0.0, + "loss/logits": 0.16829831898212433, + "loss/reg": 3.6073873043060303, + "step": 192 + }, + { + "epoch": 0.00193, + "grad_norm": 0.3342723250389099, + "grad_norm_var": 0.0023955576435807737, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.7424720525741577, + "loss/hidden": 0.0, + "loss/logits": 0.16633369401097298, + "loss/reg": 3.6036906242370605, + "step": 193 + }, + { + "epoch": 0.00194, + "grad_norm": 0.38286155462265015, + "grad_norm_var": 0.002251566346172081, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.9778133630752563, + "loss/hidden": 0.0, + "loss/logits": 0.16522743180394173, + "loss/reg": 3.600316286087036, + "step": 194 + }, + { + "epoch": 0.00195, + "grad_norm": 0.36051952838897705, + "grad_norm_var": 0.0022364206403587715, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.6842609643936157, + "loss/hidden": 0.0, + "loss/logits": 0.1771794743835926, + "loss/reg": 3.596491813659668, + "step": 195 + }, + { + "epoch": 0.00196, + "grad_norm": 0.3526027202606201, + "grad_norm_var": 0.0022007878333510996, + "learning_rate": 5e-05, + "loss": 0.1561, + "loss/crossentropy": 2.7837477922439575, + "loss/hidden": 0.0, + "loss/logits": 0.15605639293789864, + "loss/reg": 3.5926032066345215, + "step": 196 + }, + { + "epoch": 0.00197, + "grad_norm": 0.35895583033561707, + "grad_norm_var": 0.002191344445433652, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.85478812456131, + "loss/hidden": 0.0, + "loss/logits": 0.1800978109240532, + "loss/reg": 3.589280843734741, + "step": 197 + }, + { + "epoch": 0.00198, + "grad_norm": 0.3372839689254761, + "grad_norm_var": 0.0012524713862786308, + "learning_rate": 5e-05, + "loss": 0.1571, + "loss/crossentropy": 2.805725872516632, + "loss/hidden": 0.0, + "loss/logits": 0.15710647776722908, + "loss/reg": 3.5851662158966064, + "step": 198 + }, + { + "epoch": 0.00199, + "grad_norm": 0.33652499318122864, + "grad_norm_var": 0.0012232813247675149, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.657254457473755, + "loss/hidden": 0.0, + "loss/logits": 0.1651643067598343, + "loss/reg": 3.581798553466797, + "step": 199 + }, + { + "epoch": 0.002, + "grad_norm": 0.36757001280784607, + "grad_norm_var": 0.001149275638629573, + "learning_rate": 5e-05, + "loss": 0.1756, + "loss/crossentropy": 2.7496553659439087, + "loss/hidden": 0.0, + "loss/logits": 0.17555997148156166, + "loss/reg": 3.577878475189209, + "step": 200 + }, + { + "epoch": 0.00201, + "grad_norm": 0.4317435324192047, + "grad_norm_var": 0.0013607474972908151, + "learning_rate": 5e-05, + "loss": 0.1643, + "loss/crossentropy": 3.168861448764801, + "loss/hidden": 0.0, + "loss/logits": 0.164311021566391, + "loss/reg": 3.5741024017333984, + "step": 201 + }, + { + "epoch": 0.00202, + "grad_norm": 0.3569833040237427, + "grad_norm_var": 0.0013412425012825579, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.7941558957099915, + "loss/hidden": 0.0, + "loss/logits": 0.17778108268976212, + "loss/reg": 3.5706787109375, + "step": 202 + }, + { + "epoch": 0.00203, + "grad_norm": 0.31648150086402893, + "grad_norm_var": 0.0014904716039333447, + "learning_rate": 5e-05, + "loss": 0.156, + "loss/crossentropy": 2.872058689594269, + "loss/hidden": 0.0, + "loss/logits": 0.1559964008629322, + "loss/reg": 3.5671305656433105, + "step": 203 + }, + { + "epoch": 0.00204, + "grad_norm": 0.32686129212379456, + "grad_norm_var": 0.0014293085106024154, + "learning_rate": 5e-05, + "loss": 0.1593, + "loss/crossentropy": 2.7316592931747437, + "loss/hidden": 0.0, + "loss/logits": 0.15925980731844902, + "loss/reg": 3.5632758140563965, + "step": 204 + }, + { + "epoch": 0.00205, + "grad_norm": 0.3191937506198883, + "grad_norm_var": 0.001518472211395964, + "learning_rate": 5e-05, + "loss": 0.1527, + "loss/crossentropy": 2.7802085876464844, + "loss/hidden": 0.0, + "loss/logits": 0.15268265083432198, + "loss/reg": 3.559633493423462, + "step": 205 + }, + { + "epoch": 0.00206, + "grad_norm": 0.34924882650375366, + "grad_norm_var": 0.0009115629505157467, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.792604923248291, + "loss/hidden": 0.0, + "loss/logits": 0.17729893326759338, + "loss/reg": 3.555882453918457, + "step": 206 + }, + { + "epoch": 0.00207, + "grad_norm": 0.38204553723335266, + "grad_norm_var": 0.0008412229229646054, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.729912281036377, + "loss/hidden": 0.0, + "loss/logits": 0.17347190529108047, + "loss/reg": 3.551867723464966, + "step": 207 + }, + { + "epoch": 0.00208, + "grad_norm": 0.316631555557251, + "grad_norm_var": 0.0009067368526577339, + "learning_rate": 5e-05, + "loss": 0.1521, + "loss/crossentropy": 2.6910020112991333, + "loss/hidden": 0.0, + "loss/logits": 0.15211007744073868, + "loss/reg": 3.547140598297119, + "step": 208 + }, + { + "epoch": 0.00209, + "grad_norm": 0.3024788200855255, + "grad_norm_var": 0.0010444754089082963, + "learning_rate": 5e-05, + "loss": 0.1534, + "loss/crossentropy": 2.6174367666244507, + "loss/hidden": 0.0, + "loss/logits": 0.15340904891490936, + "loss/reg": 3.5430798530578613, + "step": 209 + }, + { + "epoch": 0.0021, + "grad_norm": 0.31879743933677673, + "grad_norm_var": 0.0010192142441715734, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.6434658765792847, + "loss/hidden": 0.0, + "loss/logits": 0.164449330419302, + "loss/reg": 3.539293050765991, + "step": 210 + }, + { + "epoch": 0.00211, + "grad_norm": 0.37038934230804443, + "grad_norm_var": 0.0010445807718520773, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.7187950015068054, + "loss/hidden": 0.0, + "loss/logits": 0.16179471090435982, + "loss/reg": 3.5359723567962646, + "step": 211 + }, + { + "epoch": 0.00212, + "grad_norm": 0.3256055414676666, + "grad_norm_var": 0.0010681195543044476, + "learning_rate": 5e-05, + "loss": 0.1634, + "loss/crossentropy": 2.6802476048469543, + "loss/hidden": 0.0, + "loss/logits": 0.16339639574289322, + "loss/reg": 3.5320651531219482, + "step": 212 + }, + { + "epoch": 0.00213, + "grad_norm": 0.363210529088974, + "grad_norm_var": 0.0010772816324646883, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.925456941127777, + "loss/hidden": 0.0, + "loss/logits": 0.16816257312893867, + "loss/reg": 3.527592420578003, + "step": 213 + }, + { + "epoch": 0.00214, + "grad_norm": 0.3341169059276581, + "grad_norm_var": 0.0010811945233913268, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.8775156140327454, + "loss/hidden": 0.0, + "loss/logits": 0.1689641959965229, + "loss/reg": 3.5241305828094482, + "step": 214 + }, + { + "epoch": 0.00215, + "grad_norm": 0.7971848249435425, + "grad_norm_var": 0.013831743286372744, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.769020676612854, + "loss/hidden": 0.0, + "loss/logits": 0.18977811932563782, + "loss/reg": 3.5197558403015137, + "step": 215 + }, + { + "epoch": 0.00216, + "grad_norm": 0.3044687807559967, + "grad_norm_var": 0.014131832632900828, + "learning_rate": 5e-05, + "loss": 0.1467, + "loss/crossentropy": 2.792181670665741, + "loss/hidden": 0.0, + "loss/logits": 0.1466773971915245, + "loss/reg": 3.516072988510132, + "step": 216 + }, + { + "epoch": 0.00217, + "grad_norm": 0.3434732258319855, + "grad_norm_var": 0.013888774653188173, + "learning_rate": 5e-05, + "loss": 0.1698, + "loss/crossentropy": 2.577077627182007, + "loss/hidden": 0.0, + "loss/logits": 0.16977669671177864, + "loss/reg": 3.512517213821411, + "step": 217 + }, + { + "epoch": 0.00218, + "grad_norm": 0.37019920349121094, + "grad_norm_var": 0.013886977393692842, + "learning_rate": 5e-05, + "loss": 0.1943, + "loss/crossentropy": 2.722847878932953, + "loss/hidden": 0.0, + "loss/logits": 0.19428952783346176, + "loss/reg": 3.5091969966888428, + "step": 218 + }, + { + "epoch": 0.00219, + "grad_norm": 0.31637635827064514, + "grad_norm_var": 0.013887658605223226, + "learning_rate": 5e-05, + "loss": 0.1547, + "loss/crossentropy": 2.787532150745392, + "loss/hidden": 0.0, + "loss/logits": 0.1546883024275303, + "loss/reg": 3.5059781074523926, + "step": 219 + }, + { + "epoch": 0.0022, + "grad_norm": 0.368344783782959, + "grad_norm_var": 0.013784165910995568, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.7095659971237183, + "loss/hidden": 0.0, + "loss/logits": 0.1773015893995762, + "loss/reg": 3.502683162689209, + "step": 220 + }, + { + "epoch": 0.00221, + "grad_norm": 0.3447912037372589, + "grad_norm_var": 0.013659872247631084, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.7072474360466003, + "loss/hidden": 0.0, + "loss/logits": 0.1687602400779724, + "loss/reg": 3.4986109733581543, + "step": 221 + }, + { + "epoch": 0.00222, + "grad_norm": 0.3812227249145508, + "grad_norm_var": 0.013638668912457892, + "learning_rate": 5e-05, + "loss": 0.1811, + "loss/crossentropy": 2.8128660917282104, + "loss/hidden": 0.0, + "loss/logits": 0.18113631010055542, + "loss/reg": 3.4947147369384766, + "step": 222 + }, + { + "epoch": 0.00223, + "grad_norm": 0.339374303817749, + "grad_norm_var": 0.013690814024359154, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.885101020336151, + "loss/hidden": 0.0, + "loss/logits": 0.17156245186924934, + "loss/reg": 3.4905753135681152, + "step": 223 + }, + { + "epoch": 0.00224, + "grad_norm": 0.3169143497943878, + "grad_norm_var": 0.013688861707923295, + "learning_rate": 5e-05, + "loss": 0.1589, + "loss/crossentropy": 2.6434147357940674, + "loss/hidden": 0.0, + "loss/logits": 0.15887855738401413, + "loss/reg": 3.486919641494751, + "step": 224 + }, + { + "epoch": 0.00225, + "grad_norm": 0.4436502456665039, + "grad_norm_var": 0.013690624557478688, + "learning_rate": 5e-05, + "loss": 0.2037, + "loss/crossentropy": 2.9042821526527405, + "loss/hidden": 0.0, + "loss/logits": 0.20374128222465515, + "loss/reg": 3.483499765396118, + "step": 225 + }, + { + "epoch": 0.00226, + "grad_norm": 0.44937804341316223, + "grad_norm_var": 0.01373632101878638, + "learning_rate": 5e-05, + "loss": 0.1588, + "loss/crossentropy": 2.79194039106369, + "loss/hidden": 0.0, + "loss/logits": 0.1587841510772705, + "loss/reg": 3.480142593383789, + "step": 226 + }, + { + "epoch": 0.00227, + "grad_norm": 0.3453376889228821, + "grad_norm_var": 0.013826164241530992, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.7488330006599426, + "loss/hidden": 0.0, + "loss/logits": 0.16589100658893585, + "loss/reg": 3.476062297821045, + "step": 227 + }, + { + "epoch": 0.00228, + "grad_norm": 0.3845584988594055, + "grad_norm_var": 0.013584549048495138, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.6935607194900513, + "loss/hidden": 0.0, + "loss/logits": 0.18416164070367813, + "loss/reg": 3.4726946353912354, + "step": 228 + }, + { + "epoch": 0.00229, + "grad_norm": 0.3347846567630768, + "grad_norm_var": 0.013727727146734722, + "learning_rate": 5e-05, + "loss": 0.1767, + "loss/crossentropy": 2.6182947754859924, + "loss/hidden": 0.0, + "loss/logits": 0.1767422929406166, + "loss/reg": 3.469238519668579, + "step": 229 + }, + { + "epoch": 0.0023, + "grad_norm": 0.35126739740371704, + "grad_norm_var": 0.01362772883112919, + "learning_rate": 5e-05, + "loss": 0.1694, + "loss/crossentropy": 2.8005401492118835, + "loss/hidden": 0.0, + "loss/logits": 0.1694028675556183, + "loss/reg": 3.4662599563598633, + "step": 230 + }, + { + "epoch": 0.00231, + "grad_norm": 0.37644773721694946, + "grad_norm_var": 0.0016784352808341082, + "learning_rate": 5e-05, + "loss": 0.1677, + "loss/crossentropy": 2.7537949085235596, + "loss/hidden": 0.0, + "loss/logits": 0.16772692278027534, + "loss/reg": 3.4623701572418213, + "step": 231 + }, + { + "epoch": 0.00232, + "grad_norm": 0.33086928725242615, + "grad_norm_var": 0.0015241936410912834, + "learning_rate": 5e-05, + "loss": 0.1624, + "loss/crossentropy": 2.7844293117523193, + "loss/hidden": 0.0, + "loss/logits": 0.1624348722398281, + "loss/reg": 3.459073066711426, + "step": 232 + }, + { + "epoch": 0.00233, + "grad_norm": 0.3152429461479187, + "grad_norm_var": 0.0016449122438399724, + "learning_rate": 5e-05, + "loss": 0.1607, + "loss/crossentropy": 2.5863555669784546, + "loss/hidden": 0.0, + "loss/logits": 0.16065017879009247, + "loss/reg": 3.456038475036621, + "step": 233 + }, + { + "epoch": 0.00234, + "grad_norm": 0.34679386019706726, + "grad_norm_var": 0.001649030072333372, + "learning_rate": 5e-05, + "loss": 0.1656, + "loss/crossentropy": 2.9068891406059265, + "loss/hidden": 0.0, + "loss/logits": 0.16555847227573395, + "loss/reg": 3.452618360519409, + "step": 234 + }, + { + "epoch": 0.00235, + "grad_norm": 0.36684513092041016, + "grad_norm_var": 0.001520832425550959, + "learning_rate": 5e-05, + "loss": 0.1878, + "loss/crossentropy": 2.6781840920448303, + "loss/hidden": 0.0, + "loss/logits": 0.18775511160492897, + "loss/reg": 3.4493637084960938, + "step": 235 + }, + { + "epoch": 0.00236, + "grad_norm": 0.39043235778808594, + "grad_norm_var": 0.0015693055369300879, + "learning_rate": 5e-05, + "loss": 0.1559, + "loss/crossentropy": 2.9237093925476074, + "loss/hidden": 0.0, + "loss/logits": 0.15592358261346817, + "loss/reg": 3.446392059326172, + "step": 236 + }, + { + "epoch": 0.00237, + "grad_norm": 0.3486286997795105, + "grad_norm_var": 0.0015605921838873513, + "learning_rate": 5e-05, + "loss": 0.1524, + "loss/crossentropy": 2.8276549577713013, + "loss/hidden": 0.0, + "loss/logits": 0.1523873247206211, + "loss/reg": 3.443490505218506, + "step": 237 + }, + { + "epoch": 0.00238, + "grad_norm": 0.4030380845069885, + "grad_norm_var": 0.0016408419596595262, + "learning_rate": 5e-05, + "loss": 0.1839, + "loss/crossentropy": 2.7374503016471863, + "loss/hidden": 0.0, + "loss/logits": 0.1839219257235527, + "loss/reg": 3.440230131149292, + "step": 238 + }, + { + "epoch": 0.00239, + "grad_norm": 0.3677695095539093, + "grad_norm_var": 0.0015933721835237928, + "learning_rate": 5e-05, + "loss": 0.1725, + "loss/crossentropy": 2.637487053871155, + "loss/hidden": 0.0, + "loss/logits": 0.1725292131304741, + "loss/reg": 3.4369444847106934, + "step": 239 + }, + { + "epoch": 0.0024, + "grad_norm": 0.3092736303806305, + "grad_norm_var": 0.001648043714460871, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.785566747188568, + "loss/hidden": 0.0, + "loss/logits": 0.16075557842850685, + "loss/reg": 3.4329705238342285, + "step": 240 + }, + { + "epoch": 0.00241, + "grad_norm": 0.3242727518081665, + "grad_norm_var": 0.001311046071157899, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.7823829650878906, + "loss/hidden": 0.0, + "loss/logits": 0.16410458087921143, + "loss/reg": 3.429222345352173, + "step": 241 + }, + { + "epoch": 0.00242, + "grad_norm": 0.3544396758079529, + "grad_norm_var": 0.0007310749754719385, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.7899482250213623, + "loss/hidden": 0.0, + "loss/logits": 0.1741911694407463, + "loss/reg": 3.4251747131347656, + "step": 242 + }, + { + "epoch": 0.00243, + "grad_norm": 0.3156209886074066, + "grad_norm_var": 0.0008171231835736463, + "learning_rate": 5e-05, + "loss": 0.159, + "loss/crossentropy": 2.7414376735687256, + "loss/hidden": 0.0, + "loss/logits": 0.15898872911930084, + "loss/reg": 3.421576976776123, + "step": 243 + }, + { + "epoch": 0.00244, + "grad_norm": 0.3353999853134155, + "grad_norm_var": 0.000749955482525048, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.707472503185272, + "loss/hidden": 0.0, + "loss/logits": 0.16693224385380745, + "loss/reg": 3.417820930480957, + "step": 244 + }, + { + "epoch": 0.00245, + "grad_norm": 0.32766133546829224, + "grad_norm_var": 0.0007658640613261528, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.6950490474700928, + "loss/hidden": 0.0, + "loss/logits": 0.17608999833464622, + "loss/reg": 3.414095640182495, + "step": 245 + }, + { + "epoch": 0.00246, + "grad_norm": 0.31360548734664917, + "grad_norm_var": 0.0008368534177580581, + "learning_rate": 5e-05, + "loss": 0.1578, + "loss/crossentropy": 2.6977627873420715, + "loss/hidden": 0.0, + "loss/logits": 0.15783175826072693, + "loss/reg": 3.409533739089966, + "step": 246 + }, + { + "epoch": 0.00247, + "grad_norm": 0.35324403643608093, + "grad_norm_var": 0.0007744365123312817, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.8509859442710876, + "loss/hidden": 0.0, + "loss/logits": 0.16875524446368217, + "loss/reg": 3.4052798748016357, + "step": 247 + }, + { + "epoch": 0.00248, + "grad_norm": 0.41796907782554626, + "grad_norm_var": 0.0010967197155327421, + "learning_rate": 5e-05, + "loss": 0.18, + "loss/crossentropy": 2.701251804828644, + "loss/hidden": 0.0, + "loss/logits": 0.1800428181886673, + "loss/reg": 3.4006803035736084, + "step": 248 + }, + { + "epoch": 0.00249, + "grad_norm": 0.33844876289367676, + "grad_norm_var": 0.0010247223552569313, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.7646324038505554, + "loss/hidden": 0.0, + "loss/logits": 0.17367269843816757, + "loss/reg": 3.3968873023986816, + "step": 249 + }, + { + "epoch": 0.0025, + "grad_norm": 0.31011876463890076, + "grad_norm_var": 0.0011285754764581786, + "learning_rate": 5e-05, + "loss": 0.1591, + "loss/crossentropy": 2.7303661704063416, + "loss/hidden": 0.0, + "loss/logits": 0.15912269055843353, + "loss/reg": 3.3925790786743164, + "step": 250 + }, + { + "epoch": 0.00251, + "grad_norm": 0.4837491512298584, + "grad_norm_var": 0.0022679356659945546, + "learning_rate": 5e-05, + "loss": 0.1845, + "loss/crossentropy": 2.718783438205719, + "loss/hidden": 0.0, + "loss/logits": 0.18454211205244064, + "loss/reg": 3.389193296432495, + "step": 251 + }, + { + "epoch": 0.00252, + "grad_norm": 0.30302709341049194, + "grad_norm_var": 0.002342444325573334, + "learning_rate": 5e-05, + "loss": 0.1527, + "loss/crossentropy": 2.7513213753700256, + "loss/hidden": 0.0, + "loss/logits": 0.15272299572825432, + "loss/reg": 3.384976863861084, + "step": 252 + }, + { + "epoch": 0.00253, + "grad_norm": 0.3376671075820923, + "grad_norm_var": 0.002352530797232196, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.7082377672195435, + "loss/hidden": 0.0, + "loss/logits": 0.1717442087829113, + "loss/reg": 3.381958246231079, + "step": 253 + }, + { + "epoch": 0.00254, + "grad_norm": 0.3470434546470642, + "grad_norm_var": 0.00215032290339258, + "learning_rate": 5e-05, + "loss": 0.1751, + "loss/crossentropy": 2.7747623324394226, + "loss/hidden": 0.0, + "loss/logits": 0.17506984621286392, + "loss/reg": 3.3780975341796875, + "step": 254 + }, + { + "epoch": 0.00255, + "grad_norm": 0.35893791913986206, + "grad_norm_var": 0.002129806794166807, + "learning_rate": 5e-05, + "loss": 0.176, + "loss/crossentropy": 2.6670790910720825, + "loss/hidden": 0.0, + "loss/logits": 0.17602670192718506, + "loss/reg": 3.374431848526001, + "step": 255 + }, + { + "epoch": 0.00256, + "grad_norm": 0.33274415135383606, + "grad_norm_var": 0.002050384071076557, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.930284321308136, + "loss/hidden": 0.0, + "loss/logits": 0.16833152994513512, + "loss/reg": 3.3709969520568848, + "step": 256 + }, + { + "epoch": 0.00257, + "grad_norm": 0.3107251822948456, + "grad_norm_var": 0.0021031284267367073, + "learning_rate": 5e-05, + "loss": 0.154, + "loss/crossentropy": 2.7738651037216187, + "loss/hidden": 0.0, + "loss/logits": 0.15401111543178558, + "loss/reg": 3.36681866645813, + "step": 257 + }, + { + "epoch": 0.00258, + "grad_norm": 0.3238702118396759, + "grad_norm_var": 0.00212825610345269, + "learning_rate": 5e-05, + "loss": 0.1485, + "loss/crossentropy": 2.7926384806632996, + "loss/hidden": 0.0, + "loss/logits": 0.14850713685154915, + "loss/reg": 3.363298177719116, + "step": 258 + }, + { + "epoch": 0.00259, + "grad_norm": 0.3937188982963562, + "grad_norm_var": 0.0022101531057158843, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.732594311237335, + "loss/hidden": 0.0, + "loss/logits": 0.17957409471273422, + "loss/reg": 3.3592464923858643, + "step": 259 + }, + { + "epoch": 0.0026, + "grad_norm": 0.35869738459587097, + "grad_norm_var": 0.002201067050087302, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.6412158608436584, + "loss/hidden": 0.0, + "loss/logits": 0.1607726439833641, + "loss/reg": 3.355531692504883, + "step": 260 + }, + { + "epoch": 0.00261, + "grad_norm": 0.342753529548645, + "grad_norm_var": 0.002168938888458849, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.8253002762794495, + "loss/hidden": 0.0, + "loss/logits": 0.16785955801606178, + "loss/reg": 3.3510327339172363, + "step": 261 + }, + { + "epoch": 0.00262, + "grad_norm": 0.3396557867527008, + "grad_norm_var": 0.0020792270475482005, + "learning_rate": 5e-05, + "loss": 0.1719, + "loss/crossentropy": 2.5446697473526, + "loss/hidden": 0.0, + "loss/logits": 0.1719457022845745, + "loss/reg": 3.3462460041046143, + "step": 262 + }, + { + "epoch": 0.00263, + "grad_norm": 0.326615571975708, + "grad_norm_var": 0.002123647634079288, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.7185133695602417, + "loss/hidden": 0.0, + "loss/logits": 0.16621045768260956, + "loss/reg": 3.342698097229004, + "step": 263 + }, + { + "epoch": 0.00264, + "grad_norm": 0.372024804353714, + "grad_norm_var": 0.0018490612448516057, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.90339195728302, + "loss/hidden": 0.0, + "loss/logits": 0.17848360165953636, + "loss/reg": 3.3390297889709473, + "step": 264 + }, + { + "epoch": 0.00265, + "grad_norm": 0.336412638425827, + "grad_norm_var": 0.0018521135396843155, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.6998194456100464, + "loss/hidden": 0.0, + "loss/logits": 0.1684669330716133, + "loss/reg": 3.3360044956207275, + "step": 265 + }, + { + "epoch": 0.00266, + "grad_norm": 0.3179170787334442, + "grad_norm_var": 0.0018158920564407192, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.713620126247406, + "loss/hidden": 0.0, + "loss/logits": 0.16402245312929153, + "loss/reg": 3.33297061920166, + "step": 266 + }, + { + "epoch": 0.00267, + "grad_norm": 0.32180216908454895, + "grad_norm_var": 0.0005475447645484169, + "learning_rate": 5e-05, + "loss": 0.1561, + "loss/crossentropy": 2.8285736441612244, + "loss/hidden": 0.0, + "loss/logits": 0.15614933148026466, + "loss/reg": 3.330070972442627, + "step": 267 + }, + { + "epoch": 0.00268, + "grad_norm": 0.34155359864234924, + "grad_norm_var": 0.00045564919377512797, + "learning_rate": 5e-05, + "loss": 0.1666, + "loss/crossentropy": 2.698326587677002, + "loss/hidden": 0.0, + "loss/logits": 0.16663997247815132, + "loss/reg": 3.326782464981079, + "step": 268 + }, + { + "epoch": 0.00269, + "grad_norm": 0.3281239867210388, + "grad_norm_var": 0.0004660702159405468, + "learning_rate": 5e-05, + "loss": 0.1547, + "loss/crossentropy": 2.7132135033607483, + "loss/hidden": 0.0, + "loss/logits": 0.15473050251603127, + "loss/reg": 3.3232901096343994, + "step": 269 + }, + { + "epoch": 0.0027, + "grad_norm": 0.3694444000720978, + "grad_norm_var": 0.0005161187813034911, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.943029820919037, + "loss/hidden": 0.0, + "loss/logits": 0.16579603031277657, + "loss/reg": 3.319425344467163, + "step": 270 + }, + { + "epoch": 0.00271, + "grad_norm": 0.3521305024623871, + "grad_norm_var": 0.0005038113254072218, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.813421130180359, + "loss/hidden": 0.0, + "loss/logits": 0.17624986171722412, + "loss/reg": 3.31587553024292, + "step": 271 + }, + { + "epoch": 0.00272, + "grad_norm": 0.3419167995452881, + "grad_norm_var": 0.0004980410714001496, + "learning_rate": 5e-05, + "loss": 0.1579, + "loss/crossentropy": 2.6725985407829285, + "loss/hidden": 0.0, + "loss/logits": 0.1579086296260357, + "loss/reg": 3.313774347305298, + "step": 272 + }, + { + "epoch": 0.00273, + "grad_norm": 0.45973771810531616, + "grad_norm_var": 0.001257799356739812, + "learning_rate": 5e-05, + "loss": 0.1806, + "loss/crossentropy": 2.7593576908111572, + "loss/hidden": 0.0, + "loss/logits": 0.1806396320462227, + "loss/reg": 3.311671257019043, + "step": 273 + }, + { + "epoch": 0.00274, + "grad_norm": 0.327812522649765, + "grad_norm_var": 0.00124416933097297, + "learning_rate": 5e-05, + "loss": 0.1544, + "loss/crossentropy": 2.7368595004081726, + "loss/hidden": 0.0, + "loss/logits": 0.15438436716794968, + "loss/reg": 3.308312177658081, + "step": 274 + }, + { + "epoch": 0.00275, + "grad_norm": 0.43593037128448486, + "grad_norm_var": 0.001590926391083336, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.8178694248199463, + "loss/hidden": 0.0, + "loss/logits": 0.17214355245232582, + "loss/reg": 3.30526065826416, + "step": 275 + }, + { + "epoch": 0.00276, + "grad_norm": 0.361247181892395, + "grad_norm_var": 0.0015927484925991053, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.734869599342346, + "loss/hidden": 0.0, + "loss/logits": 0.17345493659377098, + "loss/reg": 3.3015174865722656, + "step": 276 + }, + { + "epoch": 0.00277, + "grad_norm": 0.3708873689174652, + "grad_norm_var": 0.0015974331537993436, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.67022705078125, + "loss/hidden": 0.0, + "loss/logits": 0.16277828440070152, + "loss/reg": 3.2979342937469482, + "step": 277 + }, + { + "epoch": 0.00278, + "grad_norm": 0.3481086790561676, + "grad_norm_var": 0.0015829700282981919, + "learning_rate": 5e-05, + "loss": 0.1577, + "loss/crossentropy": 2.700168251991272, + "loss/hidden": 0.0, + "loss/logits": 0.15773406997323036, + "loss/reg": 3.294177532196045, + "step": 278 + }, + { + "epoch": 0.00279, + "grad_norm": 0.5134589076042175, + "grad_norm_var": 0.003008442642246208, + "learning_rate": 5e-05, + "loss": 0.1766, + "loss/crossentropy": 2.9033528566360474, + "loss/hidden": 0.0, + "loss/logits": 0.17655130848288536, + "loss/reg": 3.2909915447235107, + "step": 279 + }, + { + "epoch": 0.0028, + "grad_norm": 0.38205716013908386, + "grad_norm_var": 0.0030192383608610503, + "learning_rate": 5e-05, + "loss": 0.1934, + "loss/crossentropy": 2.7160211205482483, + "loss/hidden": 0.0, + "loss/logits": 0.19341961666941643, + "loss/reg": 3.2878634929656982, + "step": 280 + }, + { + "epoch": 0.00281, + "grad_norm": 0.3628558814525604, + "grad_norm_var": 0.002947045102075485, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.6912715435028076, + "loss/hidden": 0.0, + "loss/logits": 0.17094064503908157, + "loss/reg": 3.284353256225586, + "step": 281 + }, + { + "epoch": 0.00282, + "grad_norm": 0.32757696509361267, + "grad_norm_var": 0.002884588952342071, + "learning_rate": 5e-05, + "loss": 0.1702, + "loss/crossentropy": 2.7818912267684937, + "loss/hidden": 0.0, + "loss/logits": 0.17016061395406723, + "loss/reg": 3.281097412109375, + "step": 282 + }, + { + "epoch": 0.00283, + "grad_norm": 0.5035110712051392, + "grad_norm_var": 0.003743174506031214, + "learning_rate": 5e-05, + "loss": 0.1891, + "loss/crossentropy": 2.8356027603149414, + "loss/hidden": 0.0, + "loss/logits": 0.1890563629567623, + "loss/reg": 3.277557611465454, + "step": 283 + }, + { + "epoch": 0.00284, + "grad_norm": 0.4021988809108734, + "grad_norm_var": 0.003638735284583853, + "learning_rate": 5e-05, + "loss": 0.1574, + "loss/crossentropy": 2.768595337867737, + "loss/hidden": 0.0, + "loss/logits": 0.15735788643360138, + "loss/reg": 3.274083375930786, + "step": 284 + }, + { + "epoch": 0.00285, + "grad_norm": 0.3557356297969818, + "grad_norm_var": 0.0034707811870306284, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.7620763182640076, + "loss/hidden": 0.0, + "loss/logits": 0.16183337569236755, + "loss/reg": 3.2709619998931885, + "step": 285 + }, + { + "epoch": 0.00286, + "grad_norm": 0.35383257269859314, + "grad_norm_var": 0.0035254991255895857, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.7777530550956726, + "loss/hidden": 0.0, + "loss/logits": 0.16404272243380547, + "loss/reg": 3.2678396701812744, + "step": 286 + }, + { + "epoch": 0.00287, + "grad_norm": 0.3291935324668884, + "grad_norm_var": 0.0036663583934404683, + "learning_rate": 5e-05, + "loss": 0.1545, + "loss/crossentropy": 2.7828534841537476, + "loss/hidden": 0.0, + "loss/logits": 0.15450828149914742, + "loss/reg": 3.2639055252075195, + "step": 287 + }, + { + "epoch": 0.00288, + "grad_norm": 0.3174595534801483, + "grad_norm_var": 0.003847509504795695, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.7973127365112305, + "loss/hidden": 0.0, + "loss/logits": 0.16296324506402016, + "loss/reg": 3.2608911991119385, + "step": 288 + }, + { + "epoch": 0.00289, + "grad_norm": 0.3723791539669037, + "grad_norm_var": 0.0034478366033269445, + "learning_rate": 5e-05, + "loss": 0.1757, + "loss/crossentropy": 2.630415976047516, + "loss/hidden": 0.0, + "loss/logits": 0.17573364078998566, + "loss/reg": 3.257523775100708, + "step": 289 + }, + { + "epoch": 0.0029, + "grad_norm": 0.38034215569496155, + "grad_norm_var": 0.003261674725795945, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.704579532146454, + "loss/hidden": 0.0, + "loss/logits": 0.16892588511109352, + "loss/reg": 3.254065752029419, + "step": 290 + }, + { + "epoch": 0.00291, + "grad_norm": 1.2464065551757812, + "grad_norm_var": 0.05011180607704591, + "learning_rate": 5e-05, + "loss": 0.1921, + "loss/crossentropy": 2.8595897555351257, + "loss/hidden": 0.0, + "loss/logits": 0.19211571291089058, + "loss/reg": 3.2510294914245605, + "step": 291 + }, + { + "epoch": 0.00292, + "grad_norm": 0.3307066559791565, + "grad_norm_var": 0.05046209325623486, + "learning_rate": 5e-05, + "loss": 0.1681, + "loss/crossentropy": 2.8195151686668396, + "loss/hidden": 0.0, + "loss/logits": 0.16810721158981323, + "loss/reg": 3.2478625774383545, + "step": 292 + }, + { + "epoch": 0.00293, + "grad_norm": 0.33664193749427795, + "grad_norm_var": 0.05081007066127065, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.912789523601532, + "loss/hidden": 0.0, + "loss/logits": 0.16015203669667244, + "loss/reg": 3.2451751232147217, + "step": 293 + }, + { + "epoch": 0.00294, + "grad_norm": 0.42365437746047974, + "grad_norm_var": 0.05035293502217161, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.7459517121315002, + "loss/hidden": 0.0, + "loss/logits": 0.18333038315176964, + "loss/reg": 3.242023229598999, + "step": 294 + }, + { + "epoch": 0.00295, + "grad_norm": 0.40840578079223633, + "grad_norm_var": 0.049924464393714924, + "learning_rate": 5e-05, + "loss": 0.176, + "loss/crossentropy": 2.778249144554138, + "loss/hidden": 0.0, + "loss/logits": 0.17595936357975006, + "loss/reg": 3.2388389110565186, + "step": 295 + }, + { + "epoch": 0.00296, + "grad_norm": 0.3591618835926056, + "grad_norm_var": 0.05009460642018338, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7121748328208923, + "loss/hidden": 0.0, + "loss/logits": 0.17217102646827698, + "loss/reg": 3.2364017963409424, + "step": 296 + }, + { + "epoch": 0.00297, + "grad_norm": 0.31615006923675537, + "grad_norm_var": 0.05062186135785553, + "learning_rate": 5e-05, + "loss": 0.1488, + "loss/crossentropy": 2.7933038473129272, + "loss/hidden": 0.0, + "loss/logits": 0.14879318699240685, + "loss/reg": 3.2344272136688232, + "step": 297 + }, + { + "epoch": 0.00298, + "grad_norm": 0.3586377799510956, + "grad_norm_var": 0.050288172636787816, + "learning_rate": 5e-05, + "loss": 0.1492, + "loss/crossentropy": 2.869333803653717, + "loss/hidden": 0.0, + "loss/logits": 0.14924299344420433, + "loss/reg": 3.232086181640625, + "step": 298 + }, + { + "epoch": 0.00299, + "grad_norm": 0.37798771262168884, + "grad_norm_var": 0.04995309393064352, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.913083255290985, + "loss/hidden": 0.0, + "loss/logits": 0.1651969812810421, + "loss/reg": 3.2296016216278076, + "step": 299 + }, + { + "epoch": 0.003, + "grad_norm": 0.5914519429206848, + "grad_norm_var": 0.05182304954391634, + "learning_rate": 5e-05, + "loss": 0.1893, + "loss/crossentropy": 2.8007007241249084, + "loss/hidden": 0.0, + "loss/logits": 0.18929021432995796, + "loss/reg": 3.2266576290130615, + "step": 300 + }, + { + "epoch": 0.00301, + "grad_norm": 0.3292617201805115, + "grad_norm_var": 0.05212417516215169, + "learning_rate": 5e-05, + "loss": 0.16, + "loss/crossentropy": 2.7326099276542664, + "loss/hidden": 0.0, + "loss/logits": 0.15998771041631699, + "loss/reg": 3.2236239910125732, + "step": 301 + }, + { + "epoch": 0.00302, + "grad_norm": 0.3807355761528015, + "grad_norm_var": 0.05190702763823275, + "learning_rate": 5e-05, + "loss": 0.1831, + "loss/crossentropy": 2.7199636101722717, + "loss/hidden": 0.0, + "loss/logits": 0.1831417679786682, + "loss/reg": 3.220425844192505, + "step": 302 + }, + { + "epoch": 0.00303, + "grad_norm": 0.4008902907371521, + "grad_norm_var": 0.05127743798183474, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.7570589184761047, + "loss/hidden": 0.0, + "loss/logits": 0.177694384008646, + "loss/reg": 3.2171859741210938, + "step": 303 + }, + { + "epoch": 0.00304, + "grad_norm": 0.35962697863578796, + "grad_norm_var": 0.05073816419262332, + "learning_rate": 5e-05, + "loss": 0.1574, + "loss/crossentropy": 2.809377670288086, + "loss/hidden": 0.0, + "loss/logits": 0.15741629898548126, + "loss/reg": 3.2137656211853027, + "step": 304 + }, + { + "epoch": 0.00305, + "grad_norm": 0.404453843832016, + "grad_norm_var": 0.05053133217663517, + "learning_rate": 5e-05, + "loss": 0.196, + "loss/crossentropy": 2.6374824047088623, + "loss/hidden": 0.0, + "loss/logits": 0.19603004679083824, + "loss/reg": 3.2106897830963135, + "step": 305 + }, + { + "epoch": 0.00306, + "grad_norm": 0.3411775231361389, + "grad_norm_var": 0.05092714807133293, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.7098072171211243, + "loss/hidden": 0.0, + "loss/logits": 0.17005891352891922, + "loss/reg": 3.2072975635528564, + "step": 306 + }, + { + "epoch": 0.00307, + "grad_norm": 0.48913073539733887, + "grad_norm_var": 0.004874772049479148, + "learning_rate": 5e-05, + "loss": 0.2396, + "loss/crossentropy": 2.8529589772224426, + "loss/hidden": 0.0, + "loss/logits": 0.23958228901028633, + "loss/reg": 3.2032482624053955, + "step": 307 + }, + { + "epoch": 0.00308, + "grad_norm": 0.3359135389328003, + "grad_norm_var": 0.004836687315538634, + "learning_rate": 5e-05, + "loss": 0.154, + "loss/crossentropy": 2.825865149497986, + "loss/hidden": 0.0, + "loss/logits": 0.15401727706193924, + "loss/reg": 3.2001187801361084, + "step": 308 + }, + { + "epoch": 0.00309, + "grad_norm": 0.3673790693283081, + "grad_norm_var": 0.004683902714770716, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.757752239704132, + "loss/hidden": 0.0, + "loss/logits": 0.1700810343027115, + "loss/reg": 3.197371244430542, + "step": 309 + }, + { + "epoch": 0.0031, + "grad_norm": 0.3675878643989563, + "grad_norm_var": 0.004630661781797581, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.9055893421173096, + "loss/hidden": 0.0, + "loss/logits": 0.17424792051315308, + "loss/reg": 3.193922758102417, + "step": 310 + }, + { + "epoch": 0.00311, + "grad_norm": 0.3216918110847473, + "grad_norm_var": 0.004850203191397428, + "learning_rate": 5e-05, + "loss": 0.1529, + "loss/crossentropy": 2.7421942353248596, + "loss/hidden": 0.0, + "loss/logits": 0.15289029106497765, + "loss/reg": 3.1913256645202637, + "step": 311 + }, + { + "epoch": 0.00312, + "grad_norm": 0.30283358693122864, + "grad_norm_var": 0.005214980747276743, + "learning_rate": 5e-05, + "loss": 0.1501, + "loss/crossentropy": 2.68456107378006, + "loss/hidden": 0.0, + "loss/logits": 0.15012749284505844, + "loss/reg": 3.188302755355835, + "step": 312 + }, + { + "epoch": 0.00313, + "grad_norm": 0.3840731978416443, + "grad_norm_var": 0.004944937932171186, + "learning_rate": 5e-05, + "loss": 0.1565, + "loss/crossentropy": 2.7386457920074463, + "loss/hidden": 0.0, + "loss/logits": 0.15645165741443634, + "loss/reg": 3.1856327056884766, + "step": 313 + }, + { + "epoch": 0.00314, + "grad_norm": 0.3471456468105316, + "grad_norm_var": 0.004989069609234416, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.8941837549209595, + "loss/hidden": 0.0, + "loss/logits": 0.16613885760307312, + "loss/reg": 3.182285785675049, + "step": 314 + }, + { + "epoch": 0.00315, + "grad_norm": 0.40083590149879456, + "grad_norm_var": 0.005011503442318803, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.830922782421112, + "loss/hidden": 0.0, + "loss/logits": 0.18162691593170166, + "loss/reg": 3.1795294284820557, + "step": 315 + }, + { + "epoch": 0.00316, + "grad_norm": 0.40112313628196716, + "grad_norm_var": 0.001979603921073251, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.5529216527938843, + "loss/hidden": 0.0, + "loss/logits": 0.1629548817873001, + "loss/reg": 3.1763863563537598, + "step": 316 + }, + { + "epoch": 0.00317, + "grad_norm": 0.3936459720134735, + "grad_norm_var": 0.001881530067811854, + "learning_rate": 5e-05, + "loss": 0.1851, + "loss/crossentropy": 2.780943751335144, + "loss/hidden": 0.0, + "loss/logits": 0.18506472185254097, + "loss/reg": 3.173980474472046, + "step": 317 + }, + { + "epoch": 0.00318, + "grad_norm": 0.3727342486381531, + "grad_norm_var": 0.0018792953911145364, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.76874041557312, + "loss/hidden": 0.0, + "loss/logits": 0.1826922371983528, + "loss/reg": 3.1704628467559814, + "step": 318 + }, + { + "epoch": 0.00319, + "grad_norm": 0.3470066785812378, + "grad_norm_var": 0.0018703712702832478, + "learning_rate": 5e-05, + "loss": 0.1598, + "loss/crossentropy": 2.740228831768036, + "loss/hidden": 0.0, + "loss/logits": 0.1597808077931404, + "loss/reg": 3.1674654483795166, + "step": 319 + }, + { + "epoch": 0.0032, + "grad_norm": 0.3653993010520935, + "grad_norm_var": 0.0018636832816178708, + "learning_rate": 5e-05, + "loss": 0.1549, + "loss/crossentropy": 2.883521616458893, + "loss/hidden": 0.0, + "loss/logits": 0.15490083023905754, + "loss/reg": 3.1641883850097656, + "step": 320 + }, + { + "epoch": 0.00321, + "grad_norm": 0.3510638475418091, + "grad_norm_var": 0.0018064205203171017, + "learning_rate": 5e-05, + "loss": 0.1577, + "loss/crossentropy": 2.9007150530815125, + "loss/hidden": 0.0, + "loss/logits": 0.15773681923747063, + "loss/reg": 3.1610915660858154, + "step": 321 + }, + { + "epoch": 0.00322, + "grad_norm": 0.5068875551223755, + "grad_norm_var": 0.0029290004167608335, + "learning_rate": 5e-05, + "loss": 0.2013, + "loss/crossentropy": 2.716179847717285, + "loss/hidden": 0.0, + "loss/logits": 0.2013130635023117, + "loss/reg": 3.1571173667907715, + "step": 322 + }, + { + "epoch": 0.00323, + "grad_norm": 0.40178200602531433, + "grad_norm_var": 0.0021162756618779235, + "learning_rate": 5e-05, + "loss": 0.1809, + "loss/crossentropy": 2.9381837844848633, + "loss/hidden": 0.0, + "loss/logits": 0.1809130534529686, + "loss/reg": 3.1540093421936035, + "step": 323 + }, + { + "epoch": 0.00324, + "grad_norm": 0.35252845287323, + "grad_norm_var": 0.0020514948206895294, + "learning_rate": 5e-05, + "loss": 0.1617, + "loss/crossentropy": 2.743869721889496, + "loss/hidden": 0.0, + "loss/logits": 0.16171807795763016, + "loss/reg": 3.1503779888153076, + "step": 324 + }, + { + "epoch": 0.00325, + "grad_norm": 0.36802011728286743, + "grad_norm_var": 0.0020509560983741053, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.934039294719696, + "loss/hidden": 0.0, + "loss/logits": 0.17217914387583733, + "loss/reg": 3.1467440128326416, + "step": 325 + }, + { + "epoch": 0.00326, + "grad_norm": 0.4267319142818451, + "grad_norm_var": 0.0022188398751517274, + "learning_rate": 5e-05, + "loss": 0.1924, + "loss/crossentropy": 2.802468180656433, + "loss/hidden": 0.0, + "loss/logits": 0.19241869449615479, + "loss/reg": 3.1427693367004395, + "step": 326 + }, + { + "epoch": 0.00327, + "grad_norm": 0.34044548869132996, + "grad_norm_var": 0.0021007258044081806, + "learning_rate": 5e-05, + "loss": 0.1522, + "loss/crossentropy": 2.8443135619163513, + "loss/hidden": 0.0, + "loss/logits": 0.1522187888622284, + "loss/reg": 3.1388471126556396, + "step": 327 + }, + { + "epoch": 0.00328, + "grad_norm": 0.4276120066642761, + "grad_norm_var": 0.001808451579785623, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.8915366530418396, + "loss/hidden": 0.0, + "loss/logits": 0.17084889113903046, + "loss/reg": 3.134800434112549, + "step": 328 + }, + { + "epoch": 0.00329, + "grad_norm": 0.3486219346523285, + "grad_norm_var": 0.0018993689379833108, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.575106978416443, + "loss/hidden": 0.0, + "loss/logits": 0.16874929517507553, + "loss/reg": 3.1303272247314453, + "step": 329 + }, + { + "epoch": 0.0033, + "grad_norm": 0.365105539560318, + "grad_norm_var": 0.0018301403367672127, + "learning_rate": 5e-05, + "loss": 0.1842, + "loss/crossentropy": 2.6813217401504517, + "loss/hidden": 0.0, + "loss/logits": 0.18416454643011093, + "loss/reg": 3.1269419193267822, + "step": 330 + }, + { + "epoch": 0.00331, + "grad_norm": 0.5757820010185242, + "grad_norm_var": 0.004098500311938921, + "learning_rate": 5e-05, + "loss": 0.1935, + "loss/crossentropy": 2.9679067730903625, + "loss/hidden": 0.0, + "loss/logits": 0.19354857876896858, + "loss/reg": 3.1228113174438477, + "step": 331 + }, + { + "epoch": 0.00332, + "grad_norm": 0.405617356300354, + "grad_norm_var": 0.004102514647771457, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.843691408634186, + "loss/hidden": 0.0, + "loss/logits": 0.1716487891972065, + "loss/reg": 3.120131015777588, + "step": 332 + }, + { + "epoch": 0.00333, + "grad_norm": 0.3825243413448334, + "grad_norm_var": 0.004114939464605513, + "learning_rate": 5e-05, + "loss": 0.1692, + "loss/crossentropy": 2.722069561481476, + "loss/hidden": 0.0, + "loss/logits": 0.16915880143642426, + "loss/reg": 3.117812395095825, + "step": 333 + }, + { + "epoch": 0.00334, + "grad_norm": 0.38414397835731506, + "grad_norm_var": 0.004087504594686679, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.727014124393463, + "loss/hidden": 0.0, + "loss/logits": 0.1677936352789402, + "loss/reg": 3.115847587585449, + "step": 334 + }, + { + "epoch": 0.00335, + "grad_norm": 0.4531976580619812, + "grad_norm_var": 0.0040868556651997325, + "learning_rate": 5e-05, + "loss": 0.1726, + "loss/crossentropy": 2.5817691683769226, + "loss/hidden": 0.0, + "loss/logits": 0.17255331575870514, + "loss/reg": 3.1140594482421875, + "step": 335 + }, + { + "epoch": 0.00336, + "grad_norm": 0.33963721990585327, + "grad_norm_var": 0.004259094561609115, + "learning_rate": 5e-05, + "loss": 0.1605, + "loss/crossentropy": 2.7009602189064026, + "loss/hidden": 0.0, + "loss/logits": 0.16050074249505997, + "loss/reg": 3.1128299236297607, + "step": 336 + }, + { + "epoch": 0.00337, + "grad_norm": 0.36085280776023865, + "grad_norm_var": 0.00419878945557195, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.7012510299682617, + "loss/hidden": 0.0, + "loss/logits": 0.16410352289676666, + "loss/reg": 3.109898805618286, + "step": 337 + }, + { + "epoch": 0.00338, + "grad_norm": 0.3331363797187805, + "grad_norm_var": 0.003666565441549352, + "learning_rate": 5e-05, + "loss": 0.1619, + "loss/crossentropy": 2.802642047405243, + "loss/hidden": 0.0, + "loss/logits": 0.1618719846010208, + "loss/reg": 3.1067681312561035, + "step": 338 + }, + { + "epoch": 0.00339, + "grad_norm": 0.41531723737716675, + "grad_norm_var": 0.003696375336840474, + "learning_rate": 5e-05, + "loss": 0.1877, + "loss/crossentropy": 2.622368335723877, + "loss/hidden": 0.0, + "loss/logits": 0.18770882859826088, + "loss/reg": 3.103231191635132, + "step": 339 + }, + { + "epoch": 0.0034, + "grad_norm": 0.3483443260192871, + "grad_norm_var": 0.0037197436901762657, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.7523834109306335, + "loss/hidden": 0.0, + "loss/logits": 0.16034872457385063, + "loss/reg": 3.1008808612823486, + "step": 340 + }, + { + "epoch": 0.00341, + "grad_norm": 0.46117284893989563, + "grad_norm_var": 0.003961845355148208, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.820544958114624, + "loss/hidden": 0.0, + "loss/logits": 0.18007208034396172, + "loss/reg": 3.098954439163208, + "step": 341 + }, + { + "epoch": 0.00342, + "grad_norm": 0.38020917773246765, + "grad_norm_var": 0.003918987421685794, + "learning_rate": 5e-05, + "loss": 0.1711, + "loss/crossentropy": 2.864526093006134, + "loss/hidden": 0.0, + "loss/logits": 0.1710633859038353, + "loss/reg": 3.0957016944885254, + "step": 342 + }, + { + "epoch": 0.00343, + "grad_norm": 0.3978392779827118, + "grad_norm_var": 0.0037065638898653073, + "learning_rate": 5e-05, + "loss": 0.1782, + "loss/crossentropy": 2.859494388103485, + "loss/hidden": 0.0, + "loss/logits": 0.1782137230038643, + "loss/reg": 3.09333872795105, + "step": 343 + }, + { + "epoch": 0.00344, + "grad_norm": 0.36975786089897156, + "grad_norm_var": 0.0036926924317912187, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.7827839255332947, + "loss/hidden": 0.0, + "loss/logits": 0.1657763496041298, + "loss/reg": 3.090590715408325, + "step": 344 + }, + { + "epoch": 0.00345, + "grad_norm": 0.333897203207016, + "grad_norm_var": 0.0037974520830182084, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.7804853320121765, + "loss/hidden": 0.0, + "loss/logits": 0.1611798331141472, + "loss/reg": 3.0877137184143066, + "step": 345 + }, + { + "epoch": 0.00346, + "grad_norm": 0.43794891238212585, + "grad_norm_var": 0.0038469119530984567, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.7229984402656555, + "loss/hidden": 0.0, + "loss/logits": 0.1677897423505783, + "loss/reg": 3.085155725479126, + "step": 346 + }, + { + "epoch": 0.00347, + "grad_norm": 0.33257824182510376, + "grad_norm_var": 0.0018017603976316725, + "learning_rate": 5e-05, + "loss": 0.1745, + "loss/crossentropy": 2.7936434745788574, + "loss/hidden": 0.0, + "loss/logits": 0.17448442056775093, + "loss/reg": 3.0829925537109375, + "step": 347 + }, + { + "epoch": 0.00348, + "grad_norm": 0.393646240234375, + "grad_norm_var": 0.001775431972661142, + "learning_rate": 5e-05, + "loss": 0.1647, + "loss/crossentropy": 2.8590177297592163, + "loss/hidden": 0.0, + "loss/logits": 0.16474304348230362, + "loss/reg": 3.080383062362671, + "step": 348 + }, + { + "epoch": 0.00349, + "grad_norm": 0.34549105167388916, + "grad_norm_var": 0.0018623256252658482, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.7182729840278625, + "loss/hidden": 0.0, + "loss/logits": 0.16776488721370697, + "loss/reg": 3.0792622566223145, + "step": 349 + }, + { + "epoch": 0.0035, + "grad_norm": 0.9833559393882751, + "grad_norm_var": 0.024598539346201563, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.720784068107605, + "loss/hidden": 0.0, + "loss/logits": 0.19106518849730492, + "loss/reg": 3.0780370235443115, + "step": 350 + }, + { + "epoch": 0.00351, + "grad_norm": 0.3550430238246918, + "grad_norm_var": 0.02473872020472854, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.7388776540756226, + "loss/hidden": 0.0, + "loss/logits": 0.16827991232275963, + "loss/reg": 3.07700777053833, + "step": 351 + }, + { + "epoch": 0.00352, + "grad_norm": 0.32236865162849426, + "grad_norm_var": 0.024923428623413246, + "learning_rate": 5e-05, + "loss": 0.1555, + "loss/crossentropy": 2.787019371986389, + "loss/hidden": 0.0, + "loss/logits": 0.1554781049489975, + "loss/reg": 3.075169324874878, + "step": 352 + }, + { + "epoch": 0.00353, + "grad_norm": 0.34089383482933044, + "grad_norm_var": 0.025080939274787682, + "learning_rate": 5e-05, + "loss": 0.1625, + "loss/crossentropy": 2.763973832130432, + "loss/hidden": 0.0, + "loss/logits": 0.1625489443540573, + "loss/reg": 3.0732734203338623, + "step": 353 + }, + { + "epoch": 0.00354, + "grad_norm": 0.35467466711997986, + "grad_norm_var": 0.02489081345717287, + "learning_rate": 5e-05, + "loss": 0.161, + "loss/crossentropy": 2.6696255207061768, + "loss/hidden": 0.0, + "loss/logits": 0.1610397771000862, + "loss/reg": 3.071441411972046, + "step": 354 + }, + { + "epoch": 0.00355, + "grad_norm": 0.3465348184108734, + "grad_norm_var": 0.02514492485323772, + "learning_rate": 5e-05, + "loss": 0.1692, + "loss/crossentropy": 2.8481903076171875, + "loss/hidden": 0.0, + "loss/logits": 0.16923030093312263, + "loss/reg": 3.068796157836914, + "step": 355 + }, + { + "epoch": 0.00356, + "grad_norm": 0.3337138891220093, + "grad_norm_var": 0.025271718941650815, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.734935760498047, + "loss/hidden": 0.0, + "loss/logits": 0.1674855425953865, + "loss/reg": 3.066350221633911, + "step": 356 + }, + { + "epoch": 0.00357, + "grad_norm": 0.3486330509185791, + "grad_norm_var": 0.02522896182872459, + "learning_rate": 5e-05, + "loss": 0.1751, + "loss/crossentropy": 2.8282878398895264, + "loss/hidden": 0.0, + "loss/logits": 0.17508375644683838, + "loss/reg": 3.0633251667022705, + "step": 357 + }, + { + "epoch": 0.00358, + "grad_norm": 0.3714129626750946, + "grad_norm_var": 0.025255292610223575, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.723433256149292, + "loss/hidden": 0.0, + "loss/logits": 0.17982058972120285, + "loss/reg": 3.060797929763794, + "step": 358 + }, + { + "epoch": 0.00359, + "grad_norm": 0.38819992542266846, + "grad_norm_var": 0.025261289598676597, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.5971017479896545, + "loss/hidden": 0.0, + "loss/logits": 0.17904112860560417, + "loss/reg": 3.057884693145752, + "step": 359 + }, + { + "epoch": 0.0036, + "grad_norm": 0.3948271870613098, + "grad_norm_var": 0.02520822524046924, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.631825864315033, + "loss/hidden": 0.0, + "loss/logits": 0.1826026625931263, + "loss/reg": 3.0550246238708496, + "step": 360 + }, + { + "epoch": 0.00361, + "grad_norm": 0.46747469902038574, + "grad_norm_var": 0.025164775676019657, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.628718376159668, + "loss/hidden": 0.0, + "loss/logits": 0.1848563477396965, + "loss/reg": 3.052072525024414, + "step": 361 + }, + { + "epoch": 0.00362, + "grad_norm": 0.9672635197639465, + "grad_norm_var": 0.044838716189940266, + "learning_rate": 5e-05, + "loss": 0.2236, + "loss/crossentropy": 2.7798518538475037, + "loss/hidden": 0.0, + "loss/logits": 0.22359847277402878, + "loss/reg": 3.048891305923462, + "step": 362 + }, + { + "epoch": 0.00363, + "grad_norm": 0.3722783029079437, + "grad_norm_var": 0.04436658011174813, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.7808294892311096, + "loss/hidden": 0.0, + "loss/logits": 0.17498808726668358, + "loss/reg": 3.0457394123077393, + "step": 363 + }, + { + "epoch": 0.00364, + "grad_norm": 0.3547132611274719, + "grad_norm_var": 0.04471680473078544, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.7656018137931824, + "loss/hidden": 0.0, + "loss/logits": 0.16020696610212326, + "loss/reg": 3.043097734451294, + "step": 364 + }, + { + "epoch": 0.00365, + "grad_norm": 0.4774816632270813, + "grad_norm_var": 0.04413484451680517, + "learning_rate": 5e-05, + "loss": 0.1831, + "loss/crossentropy": 2.9051772356033325, + "loss/hidden": 0.0, + "loss/logits": 0.18311960250139236, + "loss/reg": 3.0403990745544434, + "step": 365 + }, + { + "epoch": 0.00366, + "grad_norm": 0.41332709789276123, + "grad_norm_var": 0.0238056716485936, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.8529672026634216, + "loss/hidden": 0.0, + "loss/logits": 0.17049206793308258, + "loss/reg": 3.037370204925537, + "step": 366 + }, + { + "epoch": 0.00367, + "grad_norm": 0.39109355211257935, + "grad_norm_var": 0.023608062717162412, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.796768307685852, + "loss/hidden": 0.0, + "loss/logits": 0.17983367666602135, + "loss/reg": 3.0343563556671143, + "step": 367 + }, + { + "epoch": 0.00368, + "grad_norm": 0.36531057953834534, + "grad_norm_var": 0.023191193861390014, + "learning_rate": 5e-05, + "loss": 0.1715, + "loss/crossentropy": 2.8276050686836243, + "loss/hidden": 0.0, + "loss/logits": 0.1715383380651474, + "loss/reg": 3.031611919403076, + "step": 368 + }, + { + "epoch": 0.00369, + "grad_norm": 0.33283501863479614, + "grad_norm_var": 0.023278092934366657, + "learning_rate": 5e-05, + "loss": 0.1499, + "loss/crossentropy": 2.6641258597373962, + "loss/hidden": 0.0, + "loss/logits": 0.14990831911563873, + "loss/reg": 3.0287797451019287, + "step": 369 + }, + { + "epoch": 0.0037, + "grad_norm": 0.4542810618877411, + "grad_norm_var": 0.023063995994232415, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7453941702842712, + "loss/hidden": 0.0, + "loss/logits": 0.1721569411456585, + "loss/reg": 3.0258800983428955, + "step": 370 + }, + { + "epoch": 0.00371, + "grad_norm": 0.3705763816833496, + "grad_norm_var": 0.022852728398777448, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.635721504688263, + "loss/hidden": 0.0, + "loss/logits": 0.18491211906075478, + "loss/reg": 3.0235960483551025, + "step": 371 + }, + { + "epoch": 0.00372, + "grad_norm": 0.4085729420185089, + "grad_norm_var": 0.022289690361487074, + "learning_rate": 5e-05, + "loss": 0.1871, + "loss/crossentropy": 2.732766628265381, + "loss/hidden": 0.0, + "loss/logits": 0.18710973486304283, + "loss/reg": 3.0203185081481934, + "step": 372 + }, + { + "epoch": 0.00373, + "grad_norm": 0.3334612250328064, + "grad_norm_var": 0.022468457594482887, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.783429443836212, + "loss/hidden": 0.0, + "loss/logits": 0.1662071831524372, + "loss/reg": 3.0171730518341064, + "step": 373 + }, + { + "epoch": 0.00374, + "grad_norm": 0.35536903142929077, + "grad_norm_var": 0.022607616164545874, + "learning_rate": 5e-05, + "loss": 0.1654, + "loss/crossentropy": 2.818749785423279, + "loss/hidden": 0.0, + "loss/logits": 0.16541225090622902, + "loss/reg": 3.0140058994293213, + "step": 374 + }, + { + "epoch": 0.00375, + "grad_norm": 0.348376989364624, + "grad_norm_var": 0.022917750109528078, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.9099320769309998, + "loss/hidden": 0.0, + "loss/logits": 0.1627991460263729, + "loss/reg": 3.010875701904297, + "step": 375 + }, + { + "epoch": 0.00376, + "grad_norm": 0.3394787311553955, + "grad_norm_var": 0.023335225496050292, + "learning_rate": 5e-05, + "loss": 0.1605, + "loss/crossentropy": 2.811407744884491, + "loss/hidden": 0.0, + "loss/logits": 0.16051743179559708, + "loss/reg": 3.0073623657226562, + "step": 376 + }, + { + "epoch": 0.00377, + "grad_norm": 0.42454567551612854, + "grad_norm_var": 0.02319007765550817, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.773725748062134, + "loss/hidden": 0.0, + "loss/logits": 0.16450630128383636, + "loss/reg": 3.004185676574707, + "step": 377 + }, + { + "epoch": 0.00378, + "grad_norm": 0.3412385582923889, + "grad_norm_var": 0.001946629707866813, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.785709500312805, + "loss/hidden": 0.0, + "loss/logits": 0.17420916631817818, + "loss/reg": 3.0007028579711914, + "step": 378 + }, + { + "epoch": 0.00379, + "grad_norm": 0.3544103503227234, + "grad_norm_var": 0.001985417588834304, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.670408546924591, + "loss/hidden": 0.0, + "loss/logits": 0.17155754566192627, + "loss/reg": 2.9972825050354004, + "step": 379 + }, + { + "epoch": 0.0038, + "grad_norm": 0.36286091804504395, + "grad_norm_var": 0.001963109812450625, + "learning_rate": 5e-05, + "loss": 0.1786, + "loss/crossentropy": 2.822770357131958, + "loss/hidden": 0.0, + "loss/logits": 0.17864727228879929, + "loss/reg": 2.993536949157715, + "step": 380 + }, + { + "epoch": 0.00381, + "grad_norm": 0.5003440976142883, + "grad_norm_var": 0.002294225514870618, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.790800392627716, + "loss/hidden": 0.0, + "loss/logits": 0.18961890786886215, + "loss/reg": 2.990283489227295, + "step": 381 + }, + { + "epoch": 0.00382, + "grad_norm": 0.3698294758796692, + "grad_norm_var": 0.0022250210916228584, + "learning_rate": 5e-05, + "loss": 0.1648, + "loss/crossentropy": 2.8308547139167786, + "loss/hidden": 0.0, + "loss/logits": 0.16477400809526443, + "loss/reg": 2.986691474914551, + "step": 382 + }, + { + "epoch": 0.00383, + "grad_norm": 0.36506953835487366, + "grad_norm_var": 0.0022229105132923347, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.744426727294922, + "loss/hidden": 0.0, + "loss/logits": 0.16819821670651436, + "loss/reg": 2.983008861541748, + "step": 383 + }, + { + "epoch": 0.00384, + "grad_norm": 0.3243113160133362, + "grad_norm_var": 0.002390011819316588, + "learning_rate": 5e-05, + "loss": 0.1596, + "loss/crossentropy": 2.89188152551651, + "loss/hidden": 0.0, + "loss/logits": 0.1595698669552803, + "loss/reg": 2.979191541671753, + "step": 384 + }, + { + "epoch": 0.00385, + "grad_norm": 0.36836785078048706, + "grad_norm_var": 0.002273433106164295, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.902570128440857, + "loss/hidden": 0.0, + "loss/logits": 0.1746898777782917, + "loss/reg": 2.975698947906494, + "step": 385 + }, + { + "epoch": 0.00386, + "grad_norm": 0.3365325927734375, + "grad_norm_var": 0.001915978849994604, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.766001045703888, + "loss/hidden": 0.0, + "loss/logits": 0.16081618145108223, + "loss/reg": 2.972667694091797, + "step": 386 + }, + { + "epoch": 0.00387, + "grad_norm": 0.35417604446411133, + "grad_norm_var": 0.0019292530227877358, + "learning_rate": 5e-05, + "loss": 0.1605, + "loss/crossentropy": 2.7814798951148987, + "loss/hidden": 0.0, + "loss/logits": 0.16046970710158348, + "loss/reg": 2.969967842102051, + "step": 387 + }, + { + "epoch": 0.00388, + "grad_norm": 1.9537514448165894, + "grad_norm_var": 0.15952536292831518, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.782427728176117, + "loss/hidden": 0.0, + "loss/logits": 0.1925731934607029, + "loss/reg": 2.9671437740325928, + "step": 388 + }, + { + "epoch": 0.00389, + "grad_norm": 0.3620118498802185, + "grad_norm_var": 0.15907744774636304, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.716952919960022, + "loss/hidden": 0.0, + "loss/logits": 0.17136194929480553, + "loss/reg": 2.9639716148376465, + "step": 389 + }, + { + "epoch": 0.0039, + "grad_norm": 0.3765539526939392, + "grad_norm_var": 0.1587921781193889, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.8395472168922424, + "loss/hidden": 0.0, + "loss/logits": 0.16893662884831429, + "loss/reg": 2.9617481231689453, + "step": 390 + }, + { + "epoch": 0.00391, + "grad_norm": 0.39779096841812134, + "grad_norm_var": 0.15815917569478716, + "learning_rate": 5e-05, + "loss": 0.1677, + "loss/crossentropy": 2.813215434551239, + "loss/hidden": 0.0, + "loss/logits": 0.1676802597939968, + "loss/reg": 2.958872079849243, + "step": 391 + }, + { + "epoch": 0.00392, + "grad_norm": 2.267273187637329, + "grad_norm_var": 0.35670344578828533, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.710484206676483, + "loss/hidden": 0.0, + "loss/logits": 0.18981827050447464, + "loss/reg": 2.9562907218933105, + "step": 392 + }, + { + "epoch": 0.00393, + "grad_norm": 0.4732659161090851, + "grad_norm_var": 0.35576926148027355, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.7463297247886658, + "loss/hidden": 0.0, + "loss/logits": 0.17890166491270065, + "loss/reg": 2.953505277633667, + "step": 393 + }, + { + "epoch": 0.00394, + "grad_norm": 0.46487849950790405, + "grad_norm_var": 0.3525539310677323, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.781617820262909, + "loss/hidden": 0.0, + "loss/logits": 0.17365656793117523, + "loss/reg": 2.951185464859009, + "step": 394 + }, + { + "epoch": 0.00395, + "grad_norm": 0.36613309383392334, + "grad_norm_var": 0.352175585204308, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.9521047472953796, + "loss/hidden": 0.0, + "loss/logits": 0.1749158501625061, + "loss/reg": 2.949521780014038, + "step": 395 + }, + { + "epoch": 0.00396, + "grad_norm": 0.33889761567115784, + "grad_norm_var": 0.35297777688562104, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.709399461746216, + "loss/hidden": 0.0, + "loss/logits": 0.161102045327425, + "loss/reg": 2.94758677482605, + "step": 396 + }, + { + "epoch": 0.00397, + "grad_norm": 0.37460586428642273, + "grad_norm_var": 0.3556567542520952, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.8318939208984375, + "loss/hidden": 0.0, + "loss/logits": 0.1709057316184044, + "loss/reg": 2.9449052810668945, + "step": 397 + }, + { + "epoch": 0.00398, + "grad_norm": 0.36912715435028076, + "grad_norm_var": 0.3556777153015602, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.7699413895606995, + "loss/hidden": 0.0, + "loss/logits": 0.16991987824440002, + "loss/reg": 2.9416375160217285, + "step": 398 + }, + { + "epoch": 0.00399, + "grad_norm": 0.4335935711860657, + "grad_norm_var": 0.35388598085202433, + "learning_rate": 5e-05, + "loss": 0.1621, + "loss/crossentropy": 2.6929262280464172, + "loss/hidden": 0.0, + "loss/logits": 0.16208457946777344, + "loss/reg": 2.9385571479797363, + "step": 399 + }, + { + "epoch": 0.004, + "grad_norm": 0.36466526985168457, + "grad_norm_var": 0.35251743192284957, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.9028329849243164, + "loss/hidden": 0.0, + "loss/logits": 0.16026458516716957, + "loss/reg": 2.935270071029663, + "step": 400 + }, + { + "epoch": 0.00401, + "grad_norm": 0.31859657168388367, + "grad_norm_var": 0.3542100800677372, + "learning_rate": 5e-05, + "loss": 0.155, + "loss/crossentropy": 2.7707905769348145, + "loss/hidden": 0.0, + "loss/logits": 0.15500668808817863, + "loss/reg": 2.9319961071014404, + "step": 401 + }, + { + "epoch": 0.00402, + "grad_norm": 0.39714375138282776, + "grad_norm_var": 0.35233479687143326, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.5947351455688477, + "loss/hidden": 0.0, + "loss/logits": 0.17224042490124702, + "loss/reg": 2.929413318634033, + "step": 402 + }, + { + "epoch": 0.00403, + "grad_norm": 2.267681121826172, + "grad_norm_var": 0.518261838886723, + "learning_rate": 5e-05, + "loss": 0.2313, + "loss/crossentropy": 2.703626811504364, + "loss/hidden": 0.0, + "loss/logits": 0.23128759488463402, + "loss/reg": 2.925968647003174, + "step": 403 + }, + { + "epoch": 0.00404, + "grad_norm": 0.38179340958595276, + "grad_norm_var": 0.414193396025083, + "learning_rate": 5e-05, + "loss": 0.1852, + "loss/crossentropy": 2.820598065853119, + "loss/hidden": 0.0, + "loss/logits": 0.18524771928787231, + "loss/reg": 2.9223124980926514, + "step": 404 + }, + { + "epoch": 0.00405, + "grad_norm": 0.35225149989128113, + "grad_norm_var": 0.4145378570625937, + "learning_rate": 5e-05, + "loss": 0.1672, + "loss/crossentropy": 2.761472165584564, + "loss/hidden": 0.0, + "loss/logits": 0.1672290712594986, + "loss/reg": 2.9187073707580566, + "step": 405 + }, + { + "epoch": 0.00406, + "grad_norm": 0.35987603664398193, + "grad_norm_var": 0.4150999685132215, + "learning_rate": 5e-05, + "loss": 0.1731, + "loss/crossentropy": 2.7906153202056885, + "loss/hidden": 0.0, + "loss/logits": 0.17313192784786224, + "loss/reg": 2.915867805480957, + "step": 406 + }, + { + "epoch": 0.00407, + "grad_norm": 0.36714085936546326, + "grad_norm_var": 0.416068714723823, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.7815486192703247, + "loss/hidden": 0.0, + "loss/logits": 0.17289156094193459, + "loss/reg": 2.9124350547790527, + "step": 407 + }, + { + "epoch": 0.00408, + "grad_norm": 0.43249061703681946, + "grad_norm_var": 0.22313248530400895, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.7781424522399902, + "loss/hidden": 0.0, + "loss/logits": 0.18864024803042412, + "loss/reg": 2.9094510078430176, + "step": 408 + }, + { + "epoch": 0.00409, + "grad_norm": 0.34418484568595886, + "grad_norm_var": 0.22470081409508588, + "learning_rate": 5e-05, + "loss": 0.1713, + "loss/crossentropy": 2.7818892002105713, + "loss/hidden": 0.0, + "loss/logits": 0.17131192237138748, + "loss/reg": 2.9064505100250244, + "step": 409 + }, + { + "epoch": 0.0041, + "grad_norm": 0.5792982578277588, + "grad_norm_var": 0.2250470715469535, + "learning_rate": 5e-05, + "loss": 0.1925, + "loss/crossentropy": 2.8059155344963074, + "loss/hidden": 0.0, + "loss/logits": 0.1925133354961872, + "loss/reg": 2.9032788276672363, + "step": 410 + }, + { + "epoch": 0.00411, + "grad_norm": 0.35917767882347107, + "grad_norm_var": 0.22517699381034958, + "learning_rate": 5e-05, + "loss": 0.1597, + "loss/crossentropy": 2.9948400259017944, + "loss/hidden": 0.0, + "loss/logits": 0.1597156822681427, + "loss/reg": 2.900125741958618, + "step": 411 + }, + { + "epoch": 0.00412, + "grad_norm": 0.32936394214630127, + "grad_norm_var": 0.2253906803631866, + "learning_rate": 5e-05, + "loss": 0.16, + "loss/crossentropy": 2.8464353680610657, + "loss/hidden": 0.0, + "loss/logits": 0.15996254980564117, + "loss/reg": 2.897120952606201, + "step": 412 + }, + { + "epoch": 0.00413, + "grad_norm": 0.3636591136455536, + "grad_norm_var": 0.22558401797348096, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.5940242409706116, + "loss/hidden": 0.0, + "loss/logits": 0.18961919099092484, + "loss/reg": 2.8938040733337402, + "step": 413 + }, + { + "epoch": 0.00414, + "grad_norm": 0.3614409565925598, + "grad_norm_var": 0.2257231161008428, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.9938586950302124, + "loss/hidden": 0.0, + "loss/logits": 0.16688520461320877, + "loss/reg": 2.891237497329712, + "step": 414 + }, + { + "epoch": 0.00415, + "grad_norm": 0.33793795108795166, + "grad_norm_var": 0.2271517945765009, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.7566803693771362, + "loss/hidden": 0.0, + "loss/logits": 0.16617370769381523, + "loss/reg": 2.888444185256958, + "step": 415 + }, + { + "epoch": 0.00416, + "grad_norm": 0.33697640895843506, + "grad_norm_var": 0.22768012665927795, + "learning_rate": 5e-05, + "loss": 0.1609, + "loss/crossentropy": 2.7538956999778748, + "loss/hidden": 0.0, + "loss/logits": 0.1609276346862316, + "loss/reg": 2.8854165077209473, + "step": 416 + }, + { + "epoch": 0.00417, + "grad_norm": 0.33163169026374817, + "grad_norm_var": 0.22738752034767185, + "learning_rate": 5e-05, + "loss": 0.1686, + "loss/crossentropy": 2.716135025024414, + "loss/hidden": 0.0, + "loss/logits": 0.16858552396297455, + "loss/reg": 2.883502244949341, + "step": 417 + }, + { + "epoch": 0.00418, + "grad_norm": 0.3197973072528839, + "grad_norm_var": 0.22875903165210631, + "learning_rate": 5e-05, + "loss": 0.1622, + "loss/crossentropy": 2.8385114669799805, + "loss/hidden": 0.0, + "loss/logits": 0.16216163337230682, + "loss/reg": 2.8822600841522217, + "step": 418 + }, + { + "epoch": 0.00419, + "grad_norm": 0.3929068446159363, + "grad_norm_var": 0.0038269076397950408, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.7871418595314026, + "loss/hidden": 0.0, + "loss/logits": 0.17471741139888763, + "loss/reg": 2.8793692588806152, + "step": 419 + }, + { + "epoch": 0.0042, + "grad_norm": 0.3870161473751068, + "grad_norm_var": 0.0038355224442556198, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.7993595600128174, + "loss/hidden": 0.0, + "loss/logits": 0.1703585907816887, + "loss/reg": 2.8776159286499023, + "step": 420 + }, + { + "epoch": 0.00421, + "grad_norm": 0.35682201385498047, + "grad_norm_var": 0.0038246732894099337, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.655856966972351, + "loss/hidden": 0.0, + "loss/logits": 0.18859218060970306, + "loss/reg": 2.87612247467041, + "step": 421 + }, + { + "epoch": 0.00422, + "grad_norm": 0.33115604519844055, + "grad_norm_var": 0.003924500155300174, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.8695462942123413, + "loss/hidden": 0.0, + "loss/logits": 0.1611364483833313, + "loss/reg": 2.873897075653076, + "step": 422 + }, + { + "epoch": 0.00423, + "grad_norm": 0.4912989139556885, + "grad_norm_var": 0.004829238325957341, + "learning_rate": 5e-05, + "loss": 0.1784, + "loss/crossentropy": 2.7167177200317383, + "loss/hidden": 0.0, + "loss/logits": 0.17839327454566956, + "loss/reg": 2.8718714714050293, + "step": 423 + }, + { + "epoch": 0.00424, + "grad_norm": 0.3349898159503937, + "grad_norm_var": 0.004720821391959795, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.7473002076148987, + "loss/hidden": 0.0, + "loss/logits": 0.1615452691912651, + "loss/reg": 2.8705270290374756, + "step": 424 + }, + { + "epoch": 0.00425, + "grad_norm": 0.3930635154247284, + "grad_norm_var": 0.004686561363231038, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.785146713256836, + "loss/hidden": 0.0, + "loss/logits": 0.17748162522912025, + "loss/reg": 2.868584394454956, + "step": 425 + }, + { + "epoch": 0.00426, + "grad_norm": 0.3448260426521301, + "grad_norm_var": 0.0017484410160554464, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.726165235042572, + "loss/hidden": 0.0, + "loss/logits": 0.17576807364821434, + "loss/reg": 2.8673741817474365, + "step": 426 + }, + { + "epoch": 0.00427, + "grad_norm": 0.3846610188484192, + "grad_norm_var": 0.0017836724819081718, + "learning_rate": 5e-05, + "loss": 0.1762, + "loss/crossentropy": 2.7086002230644226, + "loss/hidden": 0.0, + "loss/logits": 0.1762431263923645, + "loss/reg": 2.8651485443115234, + "step": 427 + }, + { + "epoch": 0.00428, + "grad_norm": 0.3494791090488434, + "grad_norm_var": 0.0017205006490997802, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.7305288314819336, + "loss/hidden": 0.0, + "loss/logits": 0.18181117624044418, + "loss/reg": 2.8629541397094727, + "step": 428 + }, + { + "epoch": 0.00429, + "grad_norm": 0.3337409794330597, + "grad_norm_var": 0.0017762239427149495, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.840167284011841, + "loss/hidden": 0.0, + "loss/logits": 0.1660769209265709, + "loss/reg": 2.8615217208862305, + "step": 429 + }, + { + "epoch": 0.0043, + "grad_norm": 0.4685284495353699, + "grad_norm_var": 0.0024887722894077887, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.7593295574188232, + "loss/hidden": 0.0, + "loss/logits": 0.1740834154188633, + "loss/reg": 2.860365629196167, + "step": 430 + }, + { + "epoch": 0.00431, + "grad_norm": 0.35651838779449463, + "grad_norm_var": 0.002434815976952542, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.777701735496521, + "loss/hidden": 0.0, + "loss/logits": 0.16725125908851624, + "loss/reg": 2.858222007751465, + "step": 431 + }, + { + "epoch": 0.00432, + "grad_norm": 0.34670454263687134, + "grad_norm_var": 0.0023984303943363817, + "learning_rate": 5e-05, + "loss": 0.165, + "loss/crossentropy": 2.749099850654602, + "loss/hidden": 0.0, + "loss/logits": 0.16504037007689476, + "loss/reg": 2.855973958969116, + "step": 432 + }, + { + "epoch": 0.00433, + "grad_norm": 0.3284713923931122, + "grad_norm_var": 0.0024153046998328874, + "learning_rate": 5e-05, + "loss": 0.1521, + "loss/crossentropy": 2.7869237661361694, + "loss/hidden": 0.0, + "loss/logits": 0.15208067372441292, + "loss/reg": 2.853942632675171, + "step": 433 + }, + { + "epoch": 0.00434, + "grad_norm": 0.48883649706840515, + "grad_norm_var": 0.0030697262784900037, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.9403671622276306, + "loss/hidden": 0.0, + "loss/logits": 0.17589127644896507, + "loss/reg": 2.851728916168213, + "step": 434 + }, + { + "epoch": 0.00435, + "grad_norm": 0.36951273679733276, + "grad_norm_var": 0.0030654307324534447, + "learning_rate": 5e-05, + "loss": 0.185, + "loss/crossentropy": 2.7797312140464783, + "loss/hidden": 0.0, + "loss/logits": 0.1850355602800846, + "loss/reg": 2.8488640785217285, + "step": 435 + }, + { + "epoch": 0.00436, + "grad_norm": 0.4184967577457428, + "grad_norm_var": 0.0031605906698184564, + "learning_rate": 5e-05, + "loss": 0.1855, + "loss/crossentropy": 2.812410533428192, + "loss/hidden": 0.0, + "loss/logits": 0.18553681299090385, + "loss/reg": 2.8465514183044434, + "step": 436 + }, + { + "epoch": 0.00437, + "grad_norm": 0.4329560101032257, + "grad_norm_var": 0.0032767273553133062, + "learning_rate": 5e-05, + "loss": 0.1784, + "loss/crossentropy": 2.840768814086914, + "loss/hidden": 0.0, + "loss/logits": 0.17843929678201675, + "loss/reg": 2.844315767288208, + "step": 437 + }, + { + "epoch": 0.00438, + "grad_norm": 0.6038658022880554, + "grad_norm_var": 0.005936964872234999, + "learning_rate": 5e-05, + "loss": 0.1949, + "loss/crossentropy": 2.7183879017829895, + "loss/hidden": 0.0, + "loss/logits": 0.19491342082619667, + "loss/reg": 2.842548131942749, + "step": 438 + }, + { + "epoch": 0.00439, + "grad_norm": 0.4069391191005707, + "grad_norm_var": 0.005387125873613382, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.7780433297157288, + "loss/hidden": 0.0, + "loss/logits": 0.18749799579381943, + "loss/reg": 2.840106964111328, + "step": 439 + }, + { + "epoch": 0.0044, + "grad_norm": 0.35941290855407715, + "grad_norm_var": 0.005220523762257064, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.7595601081848145, + "loss/hidden": 0.0, + "loss/logits": 0.1638675332069397, + "loss/reg": 2.8378958702087402, + "step": 440 + }, + { + "epoch": 0.00441, + "grad_norm": 0.3669149875640869, + "grad_norm_var": 0.005284393934492052, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.673116147518158, + "loss/hidden": 0.0, + "loss/logits": 0.1713937260210514, + "loss/reg": 2.8351917266845703, + "step": 441 + }, + { + "epoch": 0.00442, + "grad_norm": 0.3643859922885895, + "grad_norm_var": 0.0051709546313713755, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.8026832342147827, + "loss/hidden": 0.0, + "loss/logits": 0.17951133847236633, + "loss/reg": 2.8323209285736084, + "step": 442 + }, + { + "epoch": 0.00443, + "grad_norm": 0.34250232577323914, + "grad_norm_var": 0.005361033629373261, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.7829501032829285, + "loss/hidden": 0.0, + "loss/logits": 0.17286691814661026, + "loss/reg": 2.8302509784698486, + "step": 443 + }, + { + "epoch": 0.00444, + "grad_norm": 0.3323063552379608, + "grad_norm_var": 0.005486165176397177, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.7025471329689026, + "loss/hidden": 0.0, + "loss/logits": 0.1684984639286995, + "loss/reg": 2.827455997467041, + "step": 444 + }, + { + "epoch": 0.00445, + "grad_norm": 0.35889074206352234, + "grad_norm_var": 0.005320257567319386, + "learning_rate": 5e-05, + "loss": 0.1872, + "loss/crossentropy": 2.6426368355751038, + "loss/hidden": 0.0, + "loss/logits": 0.18716050684452057, + "loss/reg": 2.8253841400146484, + "step": 445 + }, + { + "epoch": 0.00446, + "grad_norm": 0.39696604013442993, + "grad_norm_var": 0.004953801905317123, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.6918662786483765, + "loss/hidden": 0.0, + "loss/logits": 0.18442435935139656, + "loss/reg": 2.8228838443756104, + "step": 446 + }, + { + "epoch": 0.00447, + "grad_norm": 0.3320043981075287, + "grad_norm_var": 0.005107676487313561, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.7309769988059998, + "loss/hidden": 0.0, + "loss/logits": 0.16887113079428673, + "loss/reg": 2.8211417198181152, + "step": 447 + }, + { + "epoch": 0.00448, + "grad_norm": 0.3350951373577118, + "grad_norm_var": 0.0051840048892141, + "learning_rate": 5e-05, + "loss": 0.1736, + "loss/crossentropy": 2.7696202397346497, + "loss/hidden": 0.0, + "loss/logits": 0.17358100041747093, + "loss/reg": 2.818211793899536, + "step": 448 + }, + { + "epoch": 0.00449, + "grad_norm": 0.35995370149612427, + "grad_norm_var": 0.004988316730949372, + "learning_rate": 5e-05, + "loss": 0.185, + "loss/crossentropy": 2.7628800868988037, + "loss/hidden": 0.0, + "loss/logits": 0.1849852055311203, + "loss/reg": 2.8163435459136963, + "step": 449 + }, + { + "epoch": 0.0045, + "grad_norm": 0.3433259427547455, + "grad_norm_var": 0.004429295151525636, + "learning_rate": 5e-05, + "loss": 0.1776, + "loss/crossentropy": 2.92121821641922, + "loss/hidden": 0.0, + "loss/logits": 0.1775917150080204, + "loss/reg": 2.8140530586242676, + "step": 450 + }, + { + "epoch": 0.00451, + "grad_norm": 0.3525676131248474, + "grad_norm_var": 0.004477082320186199, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.8223352432250977, + "loss/hidden": 0.0, + "loss/logits": 0.1765166036784649, + "loss/reg": 2.8114964962005615, + "step": 451 + }, + { + "epoch": 0.00452, + "grad_norm": 0.3349536955356598, + "grad_norm_var": 0.004502986709870172, + "learning_rate": 5e-05, + "loss": 0.1617, + "loss/crossentropy": 2.58266818523407, + "loss/hidden": 0.0, + "loss/logits": 0.16168920323252678, + "loss/reg": 2.8082778453826904, + "step": 452 + }, + { + "epoch": 0.00453, + "grad_norm": 0.3272739350795746, + "grad_norm_var": 0.004404667304666754, + "learning_rate": 5e-05, + "loss": 0.1606, + "loss/crossentropy": 2.791221022605896, + "loss/hidden": 0.0, + "loss/logits": 0.16057174652814865, + "loss/reg": 2.8055834770202637, + "step": 453 + }, + { + "epoch": 0.00454, + "grad_norm": 0.35802412033081055, + "grad_norm_var": 0.0005107777789474354, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.856186628341675, + "loss/hidden": 0.0, + "loss/logits": 0.17580854520201683, + "loss/reg": 2.8030307292938232, + "step": 454 + }, + { + "epoch": 0.00455, + "grad_norm": 0.34605538845062256, + "grad_norm_var": 0.0003165176266941239, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.734806716442108, + "loss/hidden": 0.0, + "loss/logits": 0.16513444855809212, + "loss/reg": 2.7998156547546387, + "step": 455 + }, + { + "epoch": 0.00456, + "grad_norm": 0.35396888852119446, + "grad_norm_var": 0.0003120198180476634, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.8904529213905334, + "loss/hidden": 0.0, + "loss/logits": 0.17014532163739204, + "loss/reg": 2.796231508255005, + "step": 456 + }, + { + "epoch": 0.00457, + "grad_norm": 0.3613145649433136, + "grad_norm_var": 0.00030159148728288546, + "learning_rate": 5e-05, + "loss": 0.1835, + "loss/crossentropy": 2.813112735748291, + "loss/hidden": 0.0, + "loss/logits": 0.1834680140018463, + "loss/reg": 2.7929091453552246, + "step": 457 + }, + { + "epoch": 0.00458, + "grad_norm": 0.36372610926628113, + "grad_norm_var": 0.00030035069871777733, + "learning_rate": 5e-05, + "loss": 0.167, + "loss/crossentropy": 2.795239508152008, + "loss/hidden": 0.0, + "loss/logits": 0.1669648103415966, + "loss/reg": 2.789707899093628, + "step": 458 + }, + { + "epoch": 0.00459, + "grad_norm": 0.3581913113594055, + "grad_norm_var": 0.00030019062479412403, + "learning_rate": 5e-05, + "loss": 0.165, + "loss/crossentropy": 2.797567903995514, + "loss/hidden": 0.0, + "loss/logits": 0.16498373076319695, + "loss/reg": 2.786154270172119, + "step": 459 + }, + { + "epoch": 0.0046, + "grad_norm": 0.3571149408817291, + "grad_norm_var": 0.00027710791712463786, + "learning_rate": 5e-05, + "loss": 0.161, + "loss/crossentropy": 2.743883192539215, + "loss/hidden": 0.0, + "loss/logits": 0.16101711615920067, + "loss/reg": 2.7824792861938477, + "step": 460 + }, + { + "epoch": 0.00461, + "grad_norm": 0.356715589761734, + "grad_norm_var": 0.0002755397827386948, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.724743604660034, + "loss/hidden": 0.0, + "loss/logits": 0.1731928214430809, + "loss/reg": 2.778890371322632, + "step": 461 + }, + { + "epoch": 0.00462, + "grad_norm": 0.3243059813976288, + "grad_norm_var": 0.00017305590364662023, + "learning_rate": 5e-05, + "loss": 0.1592, + "loss/crossentropy": 2.7731017470359802, + "loss/hidden": 0.0, + "loss/logits": 0.15923070535063744, + "loss/reg": 2.775613784790039, + "step": 462 + }, + { + "epoch": 0.00463, + "grad_norm": 0.3843972980976105, + "grad_norm_var": 0.00023436686166613171, + "learning_rate": 5e-05, + "loss": 0.1776, + "loss/crossentropy": 2.6426811814308167, + "loss/hidden": 0.0, + "loss/logits": 0.17761223763227463, + "loss/reg": 2.772561550140381, + "step": 463 + }, + { + "epoch": 0.00464, + "grad_norm": 0.3468632102012634, + "grad_norm_var": 0.00021796986892265539, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.882816791534424, + "loss/hidden": 0.0, + "loss/logits": 0.17419364303350449, + "loss/reg": 2.769129991531372, + "step": 464 + }, + { + "epoch": 0.00465, + "grad_norm": 0.3967267870903015, + "grad_norm_var": 0.0003424789629975648, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.9460648894309998, + "loss/hidden": 0.0, + "loss/logits": 0.1784828118979931, + "loss/reg": 2.767021656036377, + "step": 465 + }, + { + "epoch": 0.00466, + "grad_norm": 0.36927327513694763, + "grad_norm_var": 0.0003472996962895862, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.7243736386299133, + "loss/hidden": 0.0, + "loss/logits": 0.17225344851613045, + "loss/reg": 2.7637596130371094, + "step": 466 + }, + { + "epoch": 0.00467, + "grad_norm": 0.6855224967002869, + "grad_norm_var": 0.007136168552583877, + "learning_rate": 5e-05, + "loss": 0.1725, + "loss/crossentropy": 2.9400131702423096, + "loss/hidden": 0.0, + "loss/logits": 0.17254997044801712, + "loss/reg": 2.7604963779449463, + "step": 467 + }, + { + "epoch": 0.00468, + "grad_norm": 0.41388872265815735, + "grad_norm_var": 0.007088047286249225, + "learning_rate": 5e-05, + "loss": 0.1608, + "loss/crossentropy": 2.7375746369361877, + "loss/hidden": 0.0, + "loss/logits": 0.16082635894417763, + "loss/reg": 2.7572667598724365, + "step": 468 + }, + { + "epoch": 0.00469, + "grad_norm": 0.3997427225112915, + "grad_norm_var": 0.006892705403346755, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.8761582374572754, + "loss/hidden": 0.0, + "loss/logits": 0.17043552175164223, + "loss/reg": 2.754149913787842, + "step": 469 + }, + { + "epoch": 0.0047, + "grad_norm": 0.33424052596092224, + "grad_norm_var": 0.007016741295476203, + "learning_rate": 5e-05, + "loss": 0.1652, + "loss/crossentropy": 2.8255309462547302, + "loss/hidden": 0.0, + "loss/logits": 0.16519855335354805, + "loss/reg": 2.751276731491089, + "step": 470 + }, + { + "epoch": 0.00471, + "grad_norm": 0.41415977478027344, + "grad_norm_var": 0.006957502567752493, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.676911950111389, + "loss/hidden": 0.0, + "loss/logits": 0.18318749964237213, + "loss/reg": 2.7486090660095215, + "step": 471 + }, + { + "epoch": 0.00472, + "grad_norm": 0.3688299357891083, + "grad_norm_var": 0.006902369057221236, + "learning_rate": 5e-05, + "loss": 0.162, + "loss/crossentropy": 2.6736281514167786, + "loss/hidden": 0.0, + "loss/logits": 0.16196409612894058, + "loss/reg": 2.745225667953491, + "step": 472 + }, + { + "epoch": 0.00473, + "grad_norm": 0.42543119192123413, + "grad_norm_var": 0.006916738926360373, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.7871673703193665, + "loss/hidden": 0.0, + "loss/logits": 0.16850638762116432, + "loss/reg": 2.7421023845672607, + "step": 473 + }, + { + "epoch": 0.00474, + "grad_norm": 0.3542870581150055, + "grad_norm_var": 0.006960025235757868, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.727401077747345, + "loss/hidden": 0.0, + "loss/logits": 0.1695132479071617, + "loss/reg": 2.738083839416504, + "step": 474 + }, + { + "epoch": 0.00475, + "grad_norm": 0.42508408427238464, + "grad_norm_var": 0.006928287935252916, + "learning_rate": 5e-05, + "loss": 0.1967, + "loss/crossentropy": 2.756391167640686, + "loss/hidden": 0.0, + "loss/logits": 0.19667796790599823, + "loss/reg": 2.734565496444702, + "step": 475 + }, + { + "epoch": 0.00476, + "grad_norm": 0.3327108919620514, + "grad_norm_var": 0.007096223362361916, + "learning_rate": 5e-05, + "loss": 0.1698, + "loss/crossentropy": 2.662286937236786, + "loss/hidden": 0.0, + "loss/logits": 0.16976173967123032, + "loss/reg": 2.730898141860962, + "step": 476 + }, + { + "epoch": 0.00477, + "grad_norm": 0.3538263142108917, + "grad_norm_var": 0.007111786918880665, + "learning_rate": 5e-05, + "loss": 0.1728, + "loss/crossentropy": 2.6600981950759888, + "loss/hidden": 0.0, + "loss/logits": 0.17283405736088753, + "loss/reg": 2.7276227474212646, + "step": 477 + }, + { + "epoch": 0.00478, + "grad_norm": 0.6719810962677002, + "grad_norm_var": 0.011362604241119423, + "learning_rate": 5e-05, + "loss": 0.1956, + "loss/crossentropy": 2.9192944169044495, + "loss/hidden": 0.0, + "loss/logits": 0.1955549158155918, + "loss/reg": 2.7246785163879395, + "step": 478 + }, + { + "epoch": 0.00479, + "grad_norm": 0.40175002813339233, + "grad_norm_var": 0.011305273259017534, + "learning_rate": 5e-05, + "loss": 0.1707, + "loss/crossentropy": 2.8099315762519836, + "loss/hidden": 0.0, + "loss/logits": 0.1706707924604416, + "loss/reg": 2.721214771270752, + "step": 479 + }, + { + "epoch": 0.0048, + "grad_norm": 0.5700014233589172, + "grad_norm_var": 0.012288996380569357, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.764845371246338, + "loss/hidden": 0.0, + "loss/logits": 0.17503220587968826, + "loss/reg": 2.71852970123291, + "step": 480 + }, + { + "epoch": 0.00481, + "grad_norm": 0.3602856993675232, + "grad_norm_var": 0.012545036289332723, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.817295730113983, + "loss/hidden": 0.0, + "loss/logits": 0.16745564714074135, + "loss/reg": 2.7155778408050537, + "step": 481 + }, + { + "epoch": 0.00482, + "grad_norm": 0.37470605969429016, + "grad_norm_var": 0.012502846327791594, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7710434794425964, + "loss/hidden": 0.0, + "loss/logits": 0.172159094363451, + "loss/reg": 2.7133727073669434, + "step": 482 + }, + { + "epoch": 0.00483, + "grad_norm": 0.319488525390625, + "grad_norm_var": 0.008425663660975724, + "learning_rate": 5e-05, + "loss": 0.1524, + "loss/crossentropy": 2.783412456512451, + "loss/hidden": 0.0, + "loss/logits": 0.15241163223981857, + "loss/reg": 2.710848331451416, + "step": 483 + }, + { + "epoch": 0.00484, + "grad_norm": 0.3474343419075012, + "grad_norm_var": 0.008645296689366684, + "learning_rate": 5e-05, + "loss": 0.1582, + "loss/crossentropy": 2.8712441325187683, + "loss/hidden": 0.0, + "loss/logits": 0.1582440249621868, + "loss/reg": 2.708759307861328, + "step": 484 + }, + { + "epoch": 0.00485, + "grad_norm": 0.3881974518299103, + "grad_norm_var": 0.008659215056144554, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.7801290154457092, + "loss/hidden": 0.0, + "loss/logits": 0.16804108768701553, + "loss/reg": 2.707090139389038, + "step": 485 + }, + { + "epoch": 0.00486, + "grad_norm": 0.3865320086479187, + "grad_norm_var": 0.008353144350497502, + "learning_rate": 5e-05, + "loss": 0.1622, + "loss/crossentropy": 2.762259840965271, + "loss/hidden": 0.0, + "loss/logits": 0.1622123382985592, + "loss/reg": 2.704341173171997, + "step": 486 + }, + { + "epoch": 0.00487, + "grad_norm": 0.3601287007331848, + "grad_norm_var": 0.008476237288047564, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.7640222311019897, + "loss/hidden": 0.0, + "loss/logits": 0.1716899275779724, + "loss/reg": 2.702449321746826, + "step": 487 + }, + { + "epoch": 0.00488, + "grad_norm": 0.3476349711418152, + "grad_norm_var": 0.008599584577097493, + "learning_rate": 5e-05, + "loss": 0.1771, + "loss/crossentropy": 2.8057321906089783, + "loss/hidden": 0.0, + "loss/logits": 0.1770944595336914, + "loss/reg": 2.6996164321899414, + "step": 488 + }, + { + "epoch": 0.00489, + "grad_norm": 0.35880380868911743, + "grad_norm_var": 0.008661929013581293, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.7546836137771606, + "loss/hidden": 0.0, + "loss/logits": 0.17589908093214035, + "loss/reg": 2.69681978225708, + "step": 489 + }, + { + "epoch": 0.0049, + "grad_norm": 0.3216891586780548, + "grad_norm_var": 0.008914221483014847, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.755846858024597, + "loss/hidden": 0.0, + "loss/logits": 0.16277150437235832, + "loss/reg": 2.6948180198669434, + "step": 490 + }, + { + "epoch": 0.00491, + "grad_norm": 0.3739294409751892, + "grad_norm_var": 0.008872687766587003, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.758453607559204, + "loss/hidden": 0.0, + "loss/logits": 0.173715490847826, + "loss/reg": 2.692713737487793, + "step": 491 + }, + { + "epoch": 0.00492, + "grad_norm": 0.3633546531200409, + "grad_norm_var": 0.008689872848313067, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.756626844406128, + "loss/hidden": 0.0, + "loss/logits": 0.17549088224768639, + "loss/reg": 2.6911702156066895, + "step": 492 + }, + { + "epoch": 0.00493, + "grad_norm": 0.4165309965610504, + "grad_norm_var": 0.00860196217059566, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.69485205411911, + "loss/hidden": 0.0, + "loss/logits": 0.18822569772601128, + "loss/reg": 2.6890034675598145, + "step": 493 + }, + { + "epoch": 0.00494, + "grad_norm": 0.34585461020469666, + "grad_norm_var": 0.0033206140596304815, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.73829984664917, + "loss/hidden": 0.0, + "loss/logits": 0.17416464537382126, + "loss/reg": 2.687713623046875, + "step": 494 + }, + { + "epoch": 0.00495, + "grad_norm": 0.3443280756473541, + "grad_norm_var": 0.003339269529387142, + "learning_rate": 5e-05, + "loss": 0.1613, + "loss/crossentropy": 2.745963931083679, + "loss/hidden": 0.0, + "loss/logits": 0.161319550126791, + "loss/reg": 2.685638189315796, + "step": 495 + }, + { + "epoch": 0.00496, + "grad_norm": 0.3689098656177521, + "grad_norm_var": 0.000602855553897171, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.6744813919067383, + "loss/hidden": 0.0, + "loss/logits": 0.17346932739019394, + "loss/reg": 2.6839358806610107, + "step": 496 + }, + { + "epoch": 0.00497, + "grad_norm": 0.36457374691963196, + "grad_norm_var": 0.0006035317496342747, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.799642562866211, + "loss/hidden": 0.0, + "loss/logits": 0.17352834343910217, + "loss/reg": 2.683485507965088, + "step": 497 + }, + { + "epoch": 0.00498, + "grad_norm": 0.35222312808036804, + "grad_norm_var": 0.0005951796117876367, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.739416480064392, + "loss/hidden": 0.0, + "loss/logits": 0.1742345169186592, + "loss/reg": 2.6830482482910156, + "step": 498 + }, + { + "epoch": 0.00499, + "grad_norm": 0.3427422344684601, + "grad_norm_var": 0.0005034448418147264, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.772252082824707, + "loss/hidden": 0.0, + "loss/logits": 0.16728588938713074, + "loss/reg": 2.682189464569092, + "step": 499 + }, + { + "epoch": 0.005, + "grad_norm": 0.39299577474594116, + "grad_norm_var": 0.0005481683329227494, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.7702752351760864, + "loss/hidden": 0.0, + "loss/logits": 0.1926349699497223, + "loss/reg": 2.6808254718780518, + "step": 500 + }, + { + "epoch": 0.00501, + "grad_norm": 0.3431949019432068, + "grad_norm_var": 0.0005312130675710792, + "learning_rate": 5e-05, + "loss": 0.1634, + "loss/crossentropy": 2.7881234288215637, + "loss/hidden": 0.0, + "loss/logits": 0.1633942425251007, + "loss/reg": 2.6798629760742188, + "step": 501 + }, + { + "epoch": 0.00502, + "grad_norm": 0.36641839146614075, + "grad_norm_var": 0.0004892704880637311, + "learning_rate": 5e-05, + "loss": 0.1787, + "loss/crossentropy": 2.848407030105591, + "loss/hidden": 0.0, + "loss/logits": 0.17872987315058708, + "loss/reg": 2.677311658859253, + "step": 502 + }, + { + "epoch": 0.00503, + "grad_norm": 0.3278079330921173, + "grad_norm_var": 0.000554897538649816, + "learning_rate": 5e-05, + "loss": 0.1587, + "loss/crossentropy": 2.75662362575531, + "loss/hidden": 0.0, + "loss/logits": 0.15868044644594193, + "loss/reg": 2.675185203552246, + "step": 503 + }, + { + "epoch": 0.00504, + "grad_norm": 0.3251039683818817, + "grad_norm_var": 0.0006183250665441046, + "learning_rate": 5e-05, + "loss": 0.1551, + "loss/crossentropy": 2.731416165828705, + "loss/hidden": 0.0, + "loss/logits": 0.15506618097424507, + "loss/reg": 2.673948287963867, + "step": 504 + }, + { + "epoch": 0.00505, + "grad_norm": 0.35344070196151733, + "grad_norm_var": 0.0006186746986458047, + "learning_rate": 5e-05, + "loss": 0.167, + "loss/crossentropy": 2.740668296813965, + "loss/hidden": 0.0, + "loss/logits": 0.16695522889494896, + "loss/reg": 2.6712498664855957, + "step": 505 + }, + { + "epoch": 0.00506, + "grad_norm": 0.36658284068107605, + "grad_norm_var": 0.0005366058949143918, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.802608013153076, + "loss/hidden": 0.0, + "loss/logits": 0.16682763025164604, + "loss/reg": 2.669286012649536, + "step": 506 + }, + { + "epoch": 0.00507, + "grad_norm": 0.4423954486846924, + "grad_norm_var": 0.000963591213409624, + "learning_rate": 5e-05, + "loss": 0.1963, + "loss/crossentropy": 2.811932861804962, + "loss/hidden": 0.0, + "loss/logits": 0.19632378965616226, + "loss/reg": 2.666898012161255, + "step": 507 + }, + { + "epoch": 0.00508, + "grad_norm": 0.3770610988140106, + "grad_norm_var": 0.000975015024308116, + "learning_rate": 5e-05, + "loss": 0.1753, + "loss/crossentropy": 2.7279282808303833, + "loss/hidden": 0.0, + "loss/logits": 0.17527905479073524, + "loss/reg": 2.664764881134033, + "step": 508 + }, + { + "epoch": 0.00509, + "grad_norm": 0.35589146614074707, + "grad_norm_var": 0.0007832244440521922, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.8977367281913757, + "loss/hidden": 0.0, + "loss/logits": 0.16798892244696617, + "loss/reg": 2.6622438430786133, + "step": 509 + }, + { + "epoch": 0.0051, + "grad_norm": 0.3419097661972046, + "grad_norm_var": 0.0007919503322765919, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.6906025409698486, + "loss/hidden": 0.0, + "loss/logits": 0.172856405377388, + "loss/reg": 2.6595211029052734, + "step": 510 + }, + { + "epoch": 0.00511, + "grad_norm": 0.3972381353378296, + "grad_norm_var": 0.0008538971282195384, + "learning_rate": 5e-05, + "loss": 0.1825, + "loss/crossentropy": 2.762513279914856, + "loss/hidden": 0.0, + "loss/logits": 0.18245646730065346, + "loss/reg": 2.6572320461273193, + "step": 511 + }, + { + "epoch": 0.00512, + "grad_norm": 0.3489353358745575, + "grad_norm_var": 0.0008648399289393501, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.7472071647644043, + "loss/hidden": 0.0, + "loss/logits": 0.1789936050772667, + "loss/reg": 2.6544342041015625, + "step": 512 + }, + { + "epoch": 0.00513, + "grad_norm": 0.3673308491706848, + "grad_norm_var": 0.0008661114894439326, + "learning_rate": 5e-05, + "loss": 0.1745, + "loss/crossentropy": 2.69700163602829, + "loss/hidden": 0.0, + "loss/logits": 0.1745261810719967, + "loss/reg": 2.6520497798919678, + "step": 513 + }, + { + "epoch": 0.00514, + "grad_norm": 0.33870744705200195, + "grad_norm_var": 0.0008961917113334199, + "learning_rate": 5e-05, + "loss": 0.1649, + "loss/crossentropy": 2.762860357761383, + "loss/hidden": 0.0, + "loss/logits": 0.16494135558605194, + "loss/reg": 2.6500847339630127, + "step": 514 + }, + { + "epoch": 0.00515, + "grad_norm": 0.40411266684532166, + "grad_norm_var": 0.0009761766654230563, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 3.01085501909256, + "loss/hidden": 0.0, + "loss/logits": 0.16284478455781937, + "loss/reg": 2.648311138153076, + "step": 515 + }, + { + "epoch": 0.00516, + "grad_norm": 0.37194308638572693, + "grad_norm_var": 0.0009268939874421604, + "learning_rate": 5e-05, + "loss": 0.182, + "loss/crossentropy": 2.721080005168915, + "loss/hidden": 0.0, + "loss/logits": 0.18202906847000122, + "loss/reg": 2.6469640731811523, + "step": 516 + }, + { + "epoch": 0.00517, + "grad_norm": 0.3380615711212158, + "grad_norm_var": 0.0009429551352979477, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.6788495779037476, + "loss/hidden": 0.0, + "loss/logits": 0.16385124996304512, + "loss/reg": 2.6445441246032715, + "step": 517 + }, + { + "epoch": 0.00518, + "grad_norm": 0.37696361541748047, + "grad_norm_var": 0.0009533986625055632, + "learning_rate": 5e-05, + "loss": 0.1587, + "loss/crossentropy": 2.6845511198043823, + "loss/hidden": 0.0, + "loss/logits": 0.1586880125105381, + "loss/reg": 2.6424736976623535, + "step": 518 + }, + { + "epoch": 0.00519, + "grad_norm": 0.32983675599098206, + "grad_norm_var": 0.0009437052369864992, + "learning_rate": 5e-05, + "loss": 0.1585, + "loss/crossentropy": 2.5984672904014587, + "loss/hidden": 0.0, + "loss/logits": 0.15849433466792107, + "loss/reg": 2.639796257019043, + "step": 519 + }, + { + "epoch": 0.0052, + "grad_norm": 0.3439983129501343, + "grad_norm_var": 0.000866215802107521, + "learning_rate": 5e-05, + "loss": 0.1578, + "loss/crossentropy": 2.7057528495788574, + "loss/hidden": 0.0, + "loss/logits": 0.15775253251194954, + "loss/reg": 2.636976480484009, + "step": 520 + }, + { + "epoch": 0.00521, + "grad_norm": 0.4739494323730469, + "grad_norm_var": 0.0015736599047053415, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.6239394545555115, + "loss/hidden": 0.0, + "loss/logits": 0.17801255360245705, + "loss/reg": 2.6342852115631104, + "step": 521 + }, + { + "epoch": 0.00522, + "grad_norm": 0.5270029306411743, + "grad_norm_var": 0.003035565907296726, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.890467643737793, + "loss/hidden": 0.0, + "loss/logits": 0.18007512018084526, + "loss/reg": 2.631289005279541, + "step": 522 + }, + { + "epoch": 0.00523, + "grad_norm": 0.42719003558158875, + "grad_norm_var": 0.002930528350278516, + "learning_rate": 5e-05, + "loss": 0.1781, + "loss/crossentropy": 2.9749565720558167, + "loss/hidden": 0.0, + "loss/logits": 0.17812742665410042, + "loss/reg": 2.6284420490264893, + "step": 523 + }, + { + "epoch": 0.00524, + "grad_norm": 0.37133005261421204, + "grad_norm_var": 0.0029367435634455913, + "learning_rate": 5e-05, + "loss": 0.1597, + "loss/crossentropy": 2.692670702934265, + "loss/hidden": 0.0, + "loss/logits": 0.1597190946340561, + "loss/reg": 2.6251702308654785, + "step": 524 + }, + { + "epoch": 0.00525, + "grad_norm": 0.3646347224712372, + "grad_norm_var": 0.0029109098946428253, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.8696910738945007, + "loss/hidden": 0.0, + "loss/logits": 0.16764900088310242, + "loss/reg": 2.621973991394043, + "step": 525 + }, + { + "epoch": 0.00526, + "grad_norm": 0.3347557485103607, + "grad_norm_var": 0.002953013887398237, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.844240427017212, + "loss/hidden": 0.0, + "loss/logits": 0.16587505862116814, + "loss/reg": 2.6180617809295654, + "step": 526 + }, + { + "epoch": 0.00527, + "grad_norm": 0.3301764726638794, + "grad_norm_var": 0.003100070614909223, + "learning_rate": 5e-05, + "loss": 0.1554, + "loss/crossentropy": 2.822225272655487, + "loss/hidden": 0.0, + "loss/logits": 0.15541274286806583, + "loss/reg": 2.6156363487243652, + "step": 527 + }, + { + "epoch": 0.00528, + "grad_norm": 0.3668423593044281, + "grad_norm_var": 0.003050578439524883, + "learning_rate": 5e-05, + "loss": 0.1725, + "loss/crossentropy": 2.873881459236145, + "loss/hidden": 0.0, + "loss/logits": 0.17249644920229912, + "loss/reg": 2.6129000186920166, + "step": 528 + }, + { + "epoch": 0.00529, + "grad_norm": 0.33062636852264404, + "grad_norm_var": 0.0031927551041592986, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.7202290296554565, + "loss/hidden": 0.0, + "loss/logits": 0.16901781037449837, + "loss/reg": 2.6098320484161377, + "step": 529 + }, + { + "epoch": 0.0053, + "grad_norm": 0.33170488476753235, + "grad_norm_var": 0.003231463613689256, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.7543463706970215, + "loss/hidden": 0.0, + "loss/logits": 0.17077547311782837, + "loss/reg": 2.606674909591675, + "step": 530 + }, + { + "epoch": 0.00531, + "grad_norm": 0.3436318337917328, + "grad_norm_var": 0.0032369737172315838, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.6231788992881775, + "loss/hidden": 0.0, + "loss/logits": 0.1821577101945877, + "loss/reg": 2.603997230529785, + "step": 531 + }, + { + "epoch": 0.00532, + "grad_norm": 0.33105242252349854, + "grad_norm_var": 0.0033454153420392264, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.819184124469757, + "loss/hidden": 0.0, + "loss/logits": 0.16607840731739998, + "loss/reg": 2.6012203693389893, + "step": 532 + }, + { + "epoch": 0.00533, + "grad_norm": 0.3485148847103119, + "grad_norm_var": 0.0033075767398377588, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.8594303727149963, + "loss/hidden": 0.0, + "loss/logits": 0.1676221825182438, + "loss/reg": 2.5986573696136475, + "step": 533 + }, + { + "epoch": 0.00534, + "grad_norm": 0.3541623651981354, + "grad_norm_var": 0.003321219936842216, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.7382256984710693, + "loss/hidden": 0.0, + "loss/logits": 0.17420669272542, + "loss/reg": 2.5956368446350098, + "step": 534 + }, + { + "epoch": 0.00535, + "grad_norm": 0.362183541059494, + "grad_norm_var": 0.003216249066768325, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.820302128791809, + "loss/hidden": 0.0, + "loss/logits": 0.1731831431388855, + "loss/reg": 2.591860294342041, + "step": 535 + }, + { + "epoch": 0.00536, + "grad_norm": 0.340348482131958, + "grad_norm_var": 0.0032303969391706505, + "learning_rate": 5e-05, + "loss": 0.1677, + "loss/crossentropy": 2.978896915912628, + "loss/hidden": 0.0, + "loss/logits": 0.16773569583892822, + "loss/reg": 2.5879762172698975, + "step": 536 + }, + { + "epoch": 0.00537, + "grad_norm": 0.359326034784317, + "grad_norm_var": 0.002480178301492671, + "learning_rate": 5e-05, + "loss": 0.1767, + "loss/crossentropy": 2.7645240426063538, + "loss/hidden": 0.0, + "loss/logits": 0.17668773606419563, + "loss/reg": 2.5847809314727783, + "step": 537 + }, + { + "epoch": 0.00538, + "grad_norm": 0.3420480489730835, + "grad_norm_var": 0.0005976425682412671, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.7278724908828735, + "loss/hidden": 0.0, + "loss/logits": 0.17576001212000847, + "loss/reg": 2.581143379211426, + "step": 538 + }, + { + "epoch": 0.00539, + "grad_norm": 0.33362701535224915, + "grad_norm_var": 0.00021185911019383125, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.6828721165657043, + "loss/hidden": 0.0, + "loss/logits": 0.1707863062620163, + "loss/reg": 2.5769548416137695, + "step": 539 + }, + { + "epoch": 0.0054, + "grad_norm": 0.6795082092285156, + "grad_norm_var": 0.007165518560383773, + "learning_rate": 5e-05, + "loss": 0.2001, + "loss/crossentropy": 2.7977577447891235, + "loss/hidden": 0.0, + "loss/logits": 0.20013980567455292, + "loss/reg": 2.5742313861846924, + "step": 540 + }, + { + "epoch": 0.00541, + "grad_norm": 0.35366278886795044, + "grad_norm_var": 0.007174778628811747, + "learning_rate": 5e-05, + "loss": 0.1703, + "loss/crossentropy": 2.859143853187561, + "loss/hidden": 0.0, + "loss/logits": 0.17025134339928627, + "loss/reg": 2.57037353515625, + "step": 541 + }, + { + "epoch": 0.00542, + "grad_norm": 0.3655552566051483, + "grad_norm_var": 0.007109308326582827, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.8502614498138428, + "loss/hidden": 0.0, + "loss/logits": 0.1775321438908577, + "loss/reg": 2.566716432571411, + "step": 542 + }, + { + "epoch": 0.00543, + "grad_norm": 0.3574732542037964, + "grad_norm_var": 0.007021635262450318, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.892129361629486, + "loss/hidden": 0.0, + "loss/logits": 0.17345865443348885, + "loss/reg": 2.563842296600342, + "step": 543 + }, + { + "epoch": 0.00544, + "grad_norm": 0.36598220467567444, + "grad_norm_var": 0.007021902205424502, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.7138225436210632, + "loss/hidden": 0.0, + "loss/logits": 0.17211398482322693, + "loss/reg": 2.5608513355255127, + "step": 544 + }, + { + "epoch": 0.00545, + "grad_norm": 0.35922348499298096, + "grad_norm_var": 0.0069277921155704155, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.8138818740844727, + "loss/hidden": 0.0, + "loss/logits": 0.16949571669101715, + "loss/reg": 2.557931423187256, + "step": 545 + }, + { + "epoch": 0.00546, + "grad_norm": 0.3538724184036255, + "grad_norm_var": 0.006843838113958241, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.7698569893836975, + "loss/hidden": 0.0, + "loss/logits": 0.17218982055783272, + "loss/reg": 2.5551669597625732, + "step": 546 + }, + { + "epoch": 0.00547, + "grad_norm": 0.38070008158683777, + "grad_norm_var": 0.006790073386325786, + "learning_rate": 5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.631078600883484, + "loss/hidden": 0.0, + "loss/logits": 0.1842627413570881, + "loss/reg": 2.5517876148223877, + "step": 547 + }, + { + "epoch": 0.00548, + "grad_norm": 0.35319533944129944, + "grad_norm_var": 0.006693321782661003, + "learning_rate": 5e-05, + "loss": 0.16, + "loss/crossentropy": 2.850399076938629, + "loss/hidden": 0.0, + "loss/logits": 0.15995023399591446, + "loss/reg": 2.548754930496216, + "step": 548 + }, + { + "epoch": 0.00549, + "grad_norm": 0.4596186578273773, + "grad_norm_var": 0.0070637908733671, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.868459641933441, + "loss/hidden": 0.0, + "loss/logits": 0.16395244374871254, + "loss/reg": 2.5462560653686523, + "step": 549 + }, + { + "epoch": 0.0055, + "grad_norm": 0.3474785387516022, + "grad_norm_var": 0.007091863949161529, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.6114882230758667, + "loss/hidden": 0.0, + "loss/logits": 0.1641043722629547, + "loss/reg": 2.5431270599365234, + "step": 550 + }, + { + "epoch": 0.00551, + "grad_norm": 0.3570033013820648, + "grad_norm_var": 0.007107306178779664, + "learning_rate": 5e-05, + "loss": 0.1579, + "loss/crossentropy": 2.8220438957214355, + "loss/hidden": 0.0, + "loss/logits": 0.15791887789964676, + "loss/reg": 2.539910078048706, + "step": 551 + }, + { + "epoch": 0.00552, + "grad_norm": 0.32915255427360535, + "grad_norm_var": 0.0071770024029156184, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.878856658935547, + "loss/hidden": 0.0, + "loss/logits": 0.16122740507125854, + "loss/reg": 2.5368034839630127, + "step": 552 + }, + { + "epoch": 0.00553, + "grad_norm": 0.3565903604030609, + "grad_norm_var": 0.007185408405122592, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.705552637577057, + "loss/hidden": 0.0, + "loss/logits": 0.16892266646027565, + "loss/reg": 2.5333409309387207, + "step": 553 + }, + { + "epoch": 0.00554, + "grad_norm": 0.31767770648002625, + "grad_norm_var": 0.0073488319211029345, + "learning_rate": 5e-05, + "loss": 0.1579, + "loss/crossentropy": 2.685009717941284, + "loss/hidden": 0.0, + "loss/logits": 0.15788856148719788, + "loss/reg": 2.5296521186828613, + "step": 554 + }, + { + "epoch": 0.00555, + "grad_norm": 0.35047340393066406, + "grad_norm_var": 0.0072637659398345844, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.5745012760162354, + "loss/hidden": 0.0, + "loss/logits": 0.17458590865135193, + "loss/reg": 2.5270395278930664, + "step": 555 + }, + { + "epoch": 0.00556, + "grad_norm": 0.3832140266895294, + "grad_norm_var": 0.0009360149891549837, + "learning_rate": 5e-05, + "loss": 0.1676, + "loss/crossentropy": 2.8551809787750244, + "loss/hidden": 0.0, + "loss/logits": 0.16756092011928558, + "loss/reg": 2.523982048034668, + "step": 556 + }, + { + "epoch": 0.00557, + "grad_norm": 0.4020329713821411, + "grad_norm_var": 0.0010289291164416311, + "learning_rate": 5e-05, + "loss": 0.1755, + "loss/crossentropy": 2.787672698497772, + "loss/hidden": 0.0, + "loss/logits": 0.17547398060560226, + "loss/reg": 2.520615816116333, + "step": 557 + }, + { + "epoch": 0.00558, + "grad_norm": 0.38412049412727356, + "grad_norm_var": 0.0010519623608851428, + "learning_rate": 5e-05, + "loss": 0.1815, + "loss/crossentropy": 2.846573293209076, + "loss/hidden": 0.0, + "loss/logits": 0.1815263032913208, + "loss/reg": 2.518004894256592, + "step": 558 + }, + { + "epoch": 0.00559, + "grad_norm": 0.3456071615219116, + "grad_norm_var": 0.0010744320361522322, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.922893524169922, + "loss/hidden": 0.0, + "loss/logits": 0.17459525167942047, + "loss/reg": 2.5153868198394775, + "step": 559 + }, + { + "epoch": 0.0056, + "grad_norm": 0.36563733220100403, + "grad_norm_var": 0.0010744113839659304, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.7154372334480286, + "loss/hidden": 0.0, + "loss/logits": 0.16903281211853027, + "loss/reg": 2.5126099586486816, + "step": 560 + }, + { + "epoch": 0.00561, + "grad_norm": 0.3387238085269928, + "grad_norm_var": 0.00111742135319511, + "learning_rate": 5e-05, + "loss": 0.1666, + "loss/crossentropy": 2.5947054624557495, + "loss/hidden": 0.0, + "loss/logits": 0.16662058234214783, + "loss/reg": 2.509439706802368, + "step": 561 + }, + { + "epoch": 0.00562, + "grad_norm": 0.45574790239334106, + "grad_norm_var": 0.0016275854789366514, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.961915969848633, + "loss/hidden": 0.0, + "loss/logits": 0.1813669353723526, + "loss/reg": 2.5074241161346436, + "step": 562 + }, + { + "epoch": 0.00563, + "grad_norm": 0.39113175868988037, + "grad_norm_var": 0.0016486631382784092, + "learning_rate": 5e-05, + "loss": 0.1643, + "loss/crossentropy": 2.7337673902511597, + "loss/hidden": 0.0, + "loss/logits": 0.164301548153162, + "loss/reg": 2.5046684741973877, + "step": 563 + }, + { + "epoch": 0.00564, + "grad_norm": 0.36300358176231384, + "grad_norm_var": 0.0016312765518430934, + "learning_rate": 5e-05, + "loss": 0.1602, + "loss/crossentropy": 2.713749051094055, + "loss/hidden": 0.0, + "loss/logits": 0.16021040827035904, + "loss/reg": 2.5020864009857178, + "step": 564 + }, + { + "epoch": 0.00565, + "grad_norm": 0.3250221312046051, + "grad_norm_var": 0.001185749693630452, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.739534556865692, + "loss/hidden": 0.0, + "loss/logits": 0.166114691644907, + "loss/reg": 2.500089645385742, + "step": 565 + }, + { + "epoch": 0.00566, + "grad_norm": 0.3059675395488739, + "grad_norm_var": 0.0013809527139825861, + "learning_rate": 5e-05, + "loss": 0.1528, + "loss/crossentropy": 2.7676697373390198, + "loss/hidden": 0.0, + "loss/logits": 0.15275665000081062, + "loss/reg": 2.497802257537842, + "step": 566 + }, + { + "epoch": 0.00567, + "grad_norm": 0.41637444496154785, + "grad_norm_var": 0.0015720438674995396, + "learning_rate": 5e-05, + "loss": 0.1899, + "loss/crossentropy": 2.7852693796157837, + "loss/hidden": 0.0, + "loss/logits": 0.18987080082297325, + "loss/reg": 2.4960269927978516, + "step": 567 + }, + { + "epoch": 0.00568, + "grad_norm": 0.48216167092323303, + "grad_norm_var": 0.002316091582714641, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.919625759124756, + "loss/hidden": 0.0, + "loss/logits": 0.17901213094592094, + "loss/reg": 2.4943020343780518, + "step": 568 + }, + { + "epoch": 0.00569, + "grad_norm": 0.34773337841033936, + "grad_norm_var": 0.0023415161321106623, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.829575002193451, + "loss/hidden": 0.0, + "loss/logits": 0.1688704527914524, + "loss/reg": 2.4922549724578857, + "step": 569 + }, + { + "epoch": 0.0057, + "grad_norm": 0.42466020584106445, + "grad_norm_var": 0.0022617987789910494, + "learning_rate": 5e-05, + "loss": 0.2065, + "loss/crossentropy": 2.847673773765564, + "loss/hidden": 0.0, + "loss/logits": 0.20647098124027252, + "loss/reg": 2.4896240234375, + "step": 570 + }, + { + "epoch": 0.00571, + "grad_norm": 0.39025840163230896, + "grad_norm_var": 0.0022035635328787567, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.9154597520828247, + "loss/hidden": 0.0, + "loss/logits": 0.1810290329158306, + "loss/reg": 2.487513542175293, + "step": 571 + }, + { + "epoch": 0.00572, + "grad_norm": 0.3611275851726532, + "grad_norm_var": 0.002232206094215346, + "learning_rate": 5e-05, + "loss": 0.1687, + "loss/crossentropy": 2.813008964061737, + "loss/hidden": 0.0, + "loss/logits": 0.16871189698576927, + "loss/reg": 2.4849367141723633, + "step": 572 + }, + { + "epoch": 0.00573, + "grad_norm": 0.37163245677948, + "grad_norm_var": 0.002205551603401897, + "learning_rate": 5e-05, + "loss": 0.1736, + "loss/crossentropy": 2.829798102378845, + "loss/hidden": 0.0, + "loss/logits": 0.173641849309206, + "loss/reg": 2.481811046600342, + "step": 573 + }, + { + "epoch": 0.00574, + "grad_norm": 0.37662971019744873, + "grad_norm_var": 0.002204250880404842, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.786403477191925, + "loss/hidden": 0.0, + "loss/logits": 0.16413037478923798, + "loss/reg": 2.479344606399536, + "step": 574 + }, + { + "epoch": 0.00575, + "grad_norm": 0.4090428948402405, + "grad_norm_var": 0.002174681113915019, + "learning_rate": 5e-05, + "loss": 0.1684, + "loss/crossentropy": 2.685749888420105, + "loss/hidden": 0.0, + "loss/logits": 0.16840650886297226, + "loss/reg": 2.476419448852539, + "step": 575 + }, + { + "epoch": 0.00576, + "grad_norm": 0.35688483715057373, + "grad_norm_var": 0.0021995018187083346, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.809792697429657, + "loss/hidden": 0.0, + "loss/logits": 0.1611488163471222, + "loss/reg": 2.4737355709075928, + "step": 576 + }, + { + "epoch": 0.00577, + "grad_norm": 0.38194504380226135, + "grad_norm_var": 0.002065385566742454, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.850769340991974, + "loss/hidden": 0.0, + "loss/logits": 0.1615295149385929, + "loss/reg": 2.4713802337646484, + "step": 577 + }, + { + "epoch": 0.00578, + "grad_norm": 0.3567502200603485, + "grad_norm_var": 0.001743510873329986, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.7103776335716248, + "loss/hidden": 0.0, + "loss/logits": 0.16894375160336494, + "loss/reg": 2.4685709476470947, + "step": 578 + }, + { + "epoch": 0.00579, + "grad_norm": 0.3396901786327362, + "grad_norm_var": 0.001824115359163836, + "learning_rate": 5e-05, + "loss": 0.166, + "loss/crossentropy": 2.7079854607582092, + "loss/hidden": 0.0, + "loss/logits": 0.1659584417939186, + "loss/reg": 2.465658664703369, + "step": 579 + }, + { + "epoch": 0.0058, + "grad_norm": 0.358395516872406, + "grad_norm_var": 0.0018331543648902808, + "learning_rate": 5e-05, + "loss": 0.1813, + "loss/crossentropy": 2.8853692412376404, + "loss/hidden": 0.0, + "loss/logits": 0.18133477121591568, + "loss/reg": 2.4633235931396484, + "step": 580 + }, + { + "epoch": 0.00581, + "grad_norm": 0.3434228301048279, + "grad_norm_var": 0.0017310432323107805, + "learning_rate": 5e-05, + "loss": 0.1739, + "loss/crossentropy": 2.666011691093445, + "loss/hidden": 0.0, + "loss/logits": 0.1739240102469921, + "loss/reg": 2.460447072982788, + "step": 581 + }, + { + "epoch": 0.00582, + "grad_norm": 0.3482820689678192, + "grad_norm_var": 0.0014454775261250163, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.6244596242904663, + "loss/hidden": 0.0, + "loss/logits": 0.1845875158905983, + "loss/reg": 2.457307815551758, + "step": 582 + }, + { + "epoch": 0.00583, + "grad_norm": 0.36450985074043274, + "grad_norm_var": 0.0013555723186838029, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.6909091472625732, + "loss/hidden": 0.0, + "loss/logits": 0.174148079007864, + "loss/reg": 2.4545810222625732, + "step": 583 + }, + { + "epoch": 0.00584, + "grad_norm": 0.34841907024383545, + "grad_norm_var": 0.0005772011049318792, + "learning_rate": 5e-05, + "loss": 0.1621, + "loss/crossentropy": 2.7992460131645203, + "loss/hidden": 0.0, + "loss/logits": 0.162076648324728, + "loss/reg": 2.4516048431396484, + "step": 584 + }, + { + "epoch": 0.00585, + "grad_norm": 0.36560627818107605, + "grad_norm_var": 0.0005501529366056079, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.7556354999542236, + "loss/hidden": 0.0, + "loss/logits": 0.16119826585054398, + "loss/reg": 2.4482715129852295, + "step": 585 + }, + { + "epoch": 0.00586, + "grad_norm": 0.37393423914909363, + "grad_norm_var": 0.00033166715444868193, + "learning_rate": 5e-05, + "loss": 0.1779, + "loss/crossentropy": 2.6222774982452393, + "loss/hidden": 0.0, + "loss/logits": 0.1779084950685501, + "loss/reg": 2.4452362060546875, + "step": 586 + }, + { + "epoch": 0.00587, + "grad_norm": 0.3511587679386139, + "grad_norm_var": 0.0002976648126369145, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.7342361211776733, + "loss/hidden": 0.0, + "loss/logits": 0.17614838480949402, + "loss/reg": 2.441678524017334, + "step": 587 + }, + { + "epoch": 0.00588, + "grad_norm": 0.33847615122795105, + "grad_norm_var": 0.0003352805276915209, + "learning_rate": 5e-05, + "loss": 0.173, + "loss/crossentropy": 2.7935328483581543, + "loss/hidden": 0.0, + "loss/logits": 0.17295999452471733, + "loss/reg": 2.437959671020508, + "step": 588 + }, + { + "epoch": 0.00589, + "grad_norm": 0.351034015417099, + "grad_norm_var": 0.00033410454836428903, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.7590489387512207, + "loss/hidden": 0.0, + "loss/logits": 0.1775294505059719, + "loss/reg": 2.4346938133239746, + "step": 589 + }, + { + "epoch": 0.0059, + "grad_norm": 0.37800535559654236, + "grad_norm_var": 0.0003372250971240794, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.75826096534729, + "loss/hidden": 0.0, + "loss/logits": 0.16445999220013618, + "loss/reg": 2.4319543838500977, + "step": 590 + }, + { + "epoch": 0.00591, + "grad_norm": 0.3323316276073456, + "grad_norm_var": 0.0002069473145354402, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.963920295238495, + "loss/hidden": 0.0, + "loss/logits": 0.164412472397089, + "loss/reg": 2.4295387268066406, + "step": 591 + }, + { + "epoch": 0.00592, + "grad_norm": 0.8281128406524658, + "grad_norm_var": 0.014169124282143371, + "learning_rate": 5e-05, + "loss": 0.2256, + "loss/crossentropy": 2.9319988489151, + "loss/hidden": 0.0, + "loss/logits": 0.22560855001211166, + "loss/reg": 2.427125930786133, + "step": 592 + }, + { + "epoch": 0.00593, + "grad_norm": 0.37988972663879395, + "grad_norm_var": 0.014170226758262046, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.92197585105896, + "loss/hidden": 0.0, + "loss/logits": 0.1794501654803753, + "loss/reg": 2.4247610569000244, + "step": 593 + }, + { + "epoch": 0.00594, + "grad_norm": 0.37449878454208374, + "grad_norm_var": 0.014123355612102569, + "learning_rate": 5e-05, + "loss": 0.1756, + "loss/crossentropy": 2.734030842781067, + "loss/hidden": 0.0, + "loss/logits": 0.17557094618678093, + "loss/reg": 2.4224398136138916, + "step": 594 + }, + { + "epoch": 0.00595, + "grad_norm": 0.3890518248081207, + "grad_norm_var": 0.013970946553029018, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.6910988688468933, + "loss/hidden": 0.0, + "loss/logits": 0.17206770926713943, + "loss/reg": 2.4206151962280273, + "step": 595 + }, + { + "epoch": 0.00596, + "grad_norm": 0.45764538645744324, + "grad_norm_var": 0.014180672563351104, + "learning_rate": 5e-05, + "loss": 0.1886, + "loss/crossentropy": 2.706140458583832, + "loss/hidden": 0.0, + "loss/logits": 0.1886041909456253, + "loss/reg": 2.418341636657715, + "step": 596 + }, + { + "epoch": 0.00597, + "grad_norm": 0.3294787108898163, + "grad_norm_var": 0.014289226884282809, + "learning_rate": 5e-05, + "loss": 0.1693, + "loss/crossentropy": 2.772903263568878, + "loss/hidden": 0.0, + "loss/logits": 0.16925981268286705, + "loss/reg": 2.415613889694214, + "step": 597 + }, + { + "epoch": 0.00598, + "grad_norm": 0.3425086438655853, + "grad_norm_var": 0.014326812953815705, + "learning_rate": 5e-05, + "loss": 0.17, + "loss/crossentropy": 2.7024609446525574, + "loss/hidden": 0.0, + "loss/logits": 0.16995511576533318, + "loss/reg": 2.4122676849365234, + "step": 598 + }, + { + "epoch": 0.00599, + "grad_norm": 0.37222734093666077, + "grad_norm_var": 0.014300147579082, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.8698896765708923, + "loss/hidden": 0.0, + "loss/logits": 0.17886632308363914, + "loss/reg": 2.4099037647247314, + "step": 599 + }, + { + "epoch": 0.006, + "grad_norm": 0.39135316014289856, + "grad_norm_var": 0.014151428197242365, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.700629711151123, + "loss/hidden": 0.0, + "loss/logits": 0.17468373104929924, + "loss/reg": 2.40794038772583, + "step": 600 + }, + { + "epoch": 0.00601, + "grad_norm": 0.3728218376636505, + "grad_norm_var": 0.014124279912823712, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.794102430343628, + "loss/hidden": 0.0, + "loss/logits": 0.16897983103990555, + "loss/reg": 2.405545711517334, + "step": 601 + }, + { + "epoch": 0.00602, + "grad_norm": 0.37317147850990295, + "grad_norm_var": 0.014126729018321404, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.6252577900886536, + "loss/hidden": 0.0, + "loss/logits": 0.17474820092320442, + "loss/reg": 2.402970790863037, + "step": 602 + }, + { + "epoch": 0.00603, + "grad_norm": 0.35492607951164246, + "grad_norm_var": 0.01410428304541661, + "learning_rate": 5e-05, + "loss": 0.1809, + "loss/crossentropy": 2.6527358889579773, + "loss/hidden": 0.0, + "loss/logits": 0.18093448877334595, + "loss/reg": 2.4007880687713623, + "step": 603 + }, + { + "epoch": 0.00604, + "grad_norm": 0.408010870218277, + "grad_norm_var": 0.013856041692491945, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.874286651611328, + "loss/hidden": 0.0, + "loss/logits": 0.20177744701504707, + "loss/reg": 2.3988449573516846, + "step": 604 + }, + { + "epoch": 0.00605, + "grad_norm": 0.3291812837123871, + "grad_norm_var": 0.014034946168994126, + "learning_rate": 5e-05, + "loss": 0.1614, + "loss/crossentropy": 2.7926167249679565, + "loss/hidden": 0.0, + "loss/logits": 0.16140995919704437, + "loss/reg": 2.3966500759124756, + "step": 605 + }, + { + "epoch": 0.00606, + "grad_norm": 0.34659212827682495, + "grad_norm_var": 0.014192203001449558, + "learning_rate": 5e-05, + "loss": 0.1709, + "loss/crossentropy": 2.8195464611053467, + "loss/hidden": 0.0, + "loss/logits": 0.1709096021950245, + "loss/reg": 2.394033670425415, + "step": 606 + }, + { + "epoch": 0.00607, + "grad_norm": 0.32253992557525635, + "grad_norm_var": 0.014285055545239086, + "learning_rate": 5e-05, + "loss": 0.1649, + "loss/crossentropy": 2.7236337065696716, + "loss/hidden": 0.0, + "loss/logits": 0.16493552178144455, + "loss/reg": 2.3918449878692627, + "step": 607 + }, + { + "epoch": 0.00608, + "grad_norm": 0.350931316614151, + "grad_norm_var": 0.0011668026056699994, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.800759196281433, + "loss/hidden": 0.0, + "loss/logits": 0.17175282910466194, + "loss/reg": 2.38920521736145, + "step": 608 + }, + { + "epoch": 0.00609, + "grad_norm": 0.40333986282348633, + "grad_norm_var": 0.001237012928824995, + "learning_rate": 5e-05, + "loss": 0.2046, + "loss/crossentropy": 2.7574119567871094, + "loss/hidden": 0.0, + "loss/logits": 0.2046247273683548, + "loss/reg": 2.3865227699279785, + "step": 609 + }, + { + "epoch": 0.0061, + "grad_norm": 0.3773089349269867, + "grad_norm_var": 0.0012392324335123346, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.6313101649284363, + "loss/hidden": 0.0, + "loss/logits": 0.16410250216722488, + "loss/reg": 2.3836612701416016, + "step": 610 + }, + { + "epoch": 0.00611, + "grad_norm": 0.438357949256897, + "grad_norm_var": 0.0015159779907225465, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.780138611793518, + "loss/hidden": 0.0, + "loss/logits": 0.20979087427258492, + "loss/reg": 2.380405902862549, + "step": 611 + }, + { + "epoch": 0.00612, + "grad_norm": 0.34121251106262207, + "grad_norm_var": 0.0010515226822608785, + "learning_rate": 5e-05, + "loss": 0.161, + "loss/crossentropy": 2.669090151786804, + "loss/hidden": 0.0, + "loss/logits": 0.1609921157360077, + "loss/reg": 2.3776426315307617, + "step": 612 + }, + { + "epoch": 0.00613, + "grad_norm": 0.36169829964637756, + "grad_norm_var": 0.0009600577824135296, + "learning_rate": 5e-05, + "loss": 0.1734, + "loss/crossentropy": 2.7925440073013306, + "loss/hidden": 0.0, + "loss/logits": 0.17342102900147438, + "loss/reg": 2.3744184970855713, + "step": 613 + }, + { + "epoch": 0.00614, + "grad_norm": 0.522160530090332, + "grad_norm_var": 0.002369345725690275, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.698939800262451, + "loss/hidden": 0.0, + "loss/logits": 0.16627153754234314, + "loss/reg": 2.370917320251465, + "step": 614 + }, + { + "epoch": 0.00615, + "grad_norm": 0.4562234580516815, + "grad_norm_var": 0.002733171284208069, + "learning_rate": 5e-05, + "loss": 0.1686, + "loss/crossentropy": 2.8971627950668335, + "loss/hidden": 0.0, + "loss/logits": 0.16856613755226135, + "loss/reg": 2.368067979812622, + "step": 615 + }, + { + "epoch": 0.00616, + "grad_norm": 0.5767056345939636, + "grad_norm_var": 0.0050531115809510415, + "learning_rate": 5e-05, + "loss": 0.171, + "loss/crossentropy": 2.822002112865448, + "loss/hidden": 0.0, + "loss/logits": 0.17102698609232903, + "loss/reg": 2.3653640747070312, + "step": 616 + }, + { + "epoch": 0.00617, + "grad_norm": 0.3703908324241638, + "grad_norm_var": 0.005060977204500819, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.761395037174225, + "loss/hidden": 0.0, + "loss/logits": 0.17990661412477493, + "loss/reg": 2.362797975540161, + "step": 617 + }, + { + "epoch": 0.00618, + "grad_norm": 0.44375622272491455, + "grad_norm_var": 0.005159430065949637, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.8019450306892395, + "loss/hidden": 0.0, + "loss/logits": 0.1661831997334957, + "loss/reg": 2.3597350120544434, + "step": 618 + }, + { + "epoch": 0.00619, + "grad_norm": 0.41226035356521606, + "grad_norm_var": 0.005018716701479123, + "learning_rate": 5e-05, + "loss": 0.1737, + "loss/crossentropy": 2.837542712688446, + "loss/hidden": 0.0, + "loss/logits": 0.1737065464258194, + "loss/reg": 2.356935977935791, + "step": 619 + }, + { + "epoch": 0.0062, + "grad_norm": 0.36850520968437195, + "grad_norm_var": 0.005094037089036248, + "learning_rate": 5e-05, + "loss": 0.1691, + "loss/crossentropy": 2.872538685798645, + "loss/hidden": 0.0, + "loss/logits": 0.16909406706690788, + "loss/reg": 2.354841709136963, + "step": 620 + }, + { + "epoch": 0.00621, + "grad_norm": 0.3547448217868805, + "grad_norm_var": 0.004888988248098869, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.727312922477722, + "loss/hidden": 0.0, + "loss/logits": 0.17773358151316643, + "loss/reg": 2.3518083095550537, + "step": 621 + }, + { + "epoch": 0.00622, + "grad_norm": 0.3340252637863159, + "grad_norm_var": 0.0049932414292845895, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.7399535179138184, + "loss/hidden": 0.0, + "loss/logits": 0.16725115478038788, + "loss/reg": 2.349299907684326, + "step": 622 + }, + { + "epoch": 0.00623, + "grad_norm": 0.328477680683136, + "grad_norm_var": 0.004932429457390519, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.7973376512527466, + "loss/hidden": 0.0, + "loss/logits": 0.1658070906996727, + "loss/reg": 2.346407175064087, + "step": 623 + }, + { + "epoch": 0.00624, + "grad_norm": 0.3988572061061859, + "grad_norm_var": 0.004746415643168555, + "learning_rate": 5e-05, + "loss": 0.1808, + "loss/crossentropy": 2.886197090148926, + "loss/hidden": 0.0, + "loss/logits": 0.18076446652412415, + "loss/reg": 2.343637228012085, + "step": 624 + }, + { + "epoch": 0.00625, + "grad_norm": 0.3653312921524048, + "grad_norm_var": 0.0048476613679717525, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.6095593571662903, + "loss/hidden": 0.0, + "loss/logits": 0.17522242665290833, + "loss/reg": 2.34155011177063, + "step": 625 + }, + { + "epoch": 0.00626, + "grad_norm": 0.3519672751426697, + "grad_norm_var": 0.004975031863489395, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.724495232105255, + "loss/hidden": 0.0, + "loss/logits": 0.16851425543427467, + "loss/reg": 2.339081048965454, + "step": 626 + }, + { + "epoch": 0.00627, + "grad_norm": 0.3507337272167206, + "grad_norm_var": 0.005024779798457324, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.775688886642456, + "loss/hidden": 0.0, + "loss/logits": 0.1661130003631115, + "loss/reg": 2.3362221717834473, + "step": 627 + }, + { + "epoch": 0.00628, + "grad_norm": 0.35331088304519653, + "grad_norm_var": 0.004945443478871292, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.6876689195632935, + "loss/hidden": 0.0, + "loss/logits": 0.18032584339380264, + "loss/reg": 2.3339033126831055, + "step": 628 + }, + { + "epoch": 0.00629, + "grad_norm": 0.3569658696651459, + "grad_norm_var": 0.004969005818722216, + "learning_rate": 5e-05, + "loss": 0.1646, + "loss/crossentropy": 2.87895804643631, + "loss/hidden": 0.0, + "loss/logits": 0.16462786123156548, + "loss/reg": 2.3317511081695557, + "step": 629 + }, + { + "epoch": 0.0063, + "grad_norm": 0.37102508544921875, + "grad_norm_var": 0.0038649155689368443, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.8995742201805115, + "loss/hidden": 0.0, + "loss/logits": 0.1807471290230751, + "loss/reg": 2.3286077976226807, + "step": 630 + }, + { + "epoch": 0.00631, + "grad_norm": 0.37091144919395447, + "grad_norm_var": 0.0035332975201383715, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.755174398422241, + "loss/hidden": 0.0, + "loss/logits": 0.16785116121172905, + "loss/reg": 2.3263347148895264, + "step": 631 + }, + { + "epoch": 0.00632, + "grad_norm": 0.3764369487762451, + "grad_norm_var": 0.000834165955391919, + "learning_rate": 5e-05, + "loss": 0.1597, + "loss/crossentropy": 2.7826634645462036, + "loss/hidden": 0.0, + "loss/logits": 0.1596829891204834, + "loss/reg": 2.3239433765411377, + "step": 632 + }, + { + "epoch": 0.00633, + "grad_norm": 0.34151408076286316, + "grad_norm_var": 0.0008818179956038841, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.805456221103668, + "loss/hidden": 0.0, + "loss/logits": 0.16277796775102615, + "loss/reg": 2.3215837478637695, + "step": 633 + }, + { + "epoch": 0.00634, + "grad_norm": 0.7558053731918335, + "grad_norm_var": 0.010143553337954326, + "learning_rate": 5e-05, + "loss": 0.1858, + "loss/crossentropy": 2.7678999304771423, + "loss/hidden": 0.0, + "loss/logits": 0.18575545772910118, + "loss/reg": 2.3192989826202393, + "step": 634 + }, + { + "epoch": 0.00635, + "grad_norm": 0.3809748589992523, + "grad_norm_var": 0.010099062255010161, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.86500483751297, + "loss/hidden": 0.0, + "loss/logits": 0.1791505441069603, + "loss/reg": 2.317030906677246, + "step": 635 + }, + { + "epoch": 0.00636, + "grad_norm": 0.40578746795654297, + "grad_norm_var": 0.010104068412990375, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.8707818388938904, + "loss/hidden": 0.0, + "loss/logits": 0.188164584338665, + "loss/reg": 2.3146989345550537, + "step": 636 + }, + { + "epoch": 0.00637, + "grad_norm": 0.415227472782135, + "grad_norm_var": 0.010070131470069877, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.831197440624237, + "loss/hidden": 0.0, + "loss/logits": 0.17476488277316093, + "loss/reg": 2.311936616897583, + "step": 637 + }, + { + "epoch": 0.00638, + "grad_norm": 0.4119730293750763, + "grad_norm_var": 0.00985685373482662, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.6559138894081116, + "loss/hidden": 0.0, + "loss/logits": 0.16989587992429733, + "loss/reg": 2.3090522289276123, + "step": 638 + }, + { + "epoch": 0.00639, + "grad_norm": 0.3662709593772888, + "grad_norm_var": 0.009606093056996168, + "learning_rate": 5e-05, + "loss": 0.1775, + "loss/crossentropy": 2.767539858818054, + "loss/hidden": 0.0, + "loss/logits": 0.17754964902997017, + "loss/reg": 2.3056743144989014, + "step": 639 + }, + { + "epoch": 0.0064, + "grad_norm": 0.38491374254226685, + "grad_norm_var": 0.009617242443139995, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.6669586896896362, + "loss/hidden": 0.0, + "loss/logits": 0.18266603723168373, + "loss/reg": 2.303258180618286, + "step": 640 + }, + { + "epoch": 0.00641, + "grad_norm": 0.4197373390197754, + "grad_norm_var": 0.009569272862985524, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.7964502573013306, + "loss/hidden": 0.0, + "loss/logits": 0.17782465368509293, + "loss/reg": 2.300361156463623, + "step": 641 + }, + { + "epoch": 0.00642, + "grad_norm": 0.4097757339477539, + "grad_norm_var": 0.00940137989136159, + "learning_rate": 5e-05, + "loss": 0.1856, + "loss/crossentropy": 2.735614001750946, + "loss/hidden": 0.0, + "loss/logits": 0.1855894774198532, + "loss/reg": 2.2972419261932373, + "step": 642 + }, + { + "epoch": 0.00643, + "grad_norm": 0.35904356837272644, + "grad_norm_var": 0.00934616788177974, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.774403393268585, + "loss/hidden": 0.0, + "loss/logits": 0.1832551322877407, + "loss/reg": 2.293954610824585, + "step": 643 + }, + { + "epoch": 0.00644, + "grad_norm": 0.34157049655914307, + "grad_norm_var": 0.009435664127140328, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.8616234064102173, + "loss/hidden": 0.0, + "loss/logits": 0.16181085631251335, + "loss/reg": 2.2906179428100586, + "step": 644 + }, + { + "epoch": 0.00645, + "grad_norm": 0.4255986213684082, + "grad_norm_var": 0.009297406924193945, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.654071033000946, + "loss/hidden": 0.0, + "loss/logits": 0.18530349805951118, + "loss/reg": 2.287349224090576, + "step": 645 + }, + { + "epoch": 0.00646, + "grad_norm": 0.3393001854419708, + "grad_norm_var": 0.009518979339113423, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.73319810628891, + "loss/hidden": 0.0, + "loss/logits": 0.16647282242774963, + "loss/reg": 2.2834725379943848, + "step": 646 + }, + { + "epoch": 0.00647, + "grad_norm": 0.34969252347946167, + "grad_norm_var": 0.00964795505733251, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.7993005514144897, + "loss/hidden": 0.0, + "loss/logits": 0.1833389550447464, + "loss/reg": 2.2799694538116455, + "step": 647 + }, + { + "epoch": 0.00648, + "grad_norm": 0.35388484597206116, + "grad_norm_var": 0.009766310746661707, + "learning_rate": 5e-05, + "loss": 0.1749, + "loss/crossentropy": 2.766145169734955, + "loss/hidden": 0.0, + "loss/logits": 0.17488964274525642, + "loss/reg": 2.277585029602051, + "step": 648 + }, + { + "epoch": 0.00649, + "grad_norm": 0.5462765097618103, + "grad_norm_var": 0.010685818975949597, + "learning_rate": 5e-05, + "loss": 0.1882, + "loss/crossentropy": 2.7475533485412598, + "loss/hidden": 0.0, + "loss/logits": 0.18824508786201477, + "loss/reg": 2.2750473022460938, + "step": 649 + }, + { + "epoch": 0.0065, + "grad_norm": 0.3537692725658417, + "grad_norm_var": 0.002605622083243005, + "learning_rate": 5e-05, + "loss": 0.1761, + "loss/crossentropy": 2.756273865699768, + "loss/hidden": 0.0, + "loss/logits": 0.17612234875559807, + "loss/reg": 2.2722957134246826, + "step": 650 + }, + { + "epoch": 0.00651, + "grad_norm": 0.3770252466201782, + "grad_norm_var": 0.0026121330513858157, + "learning_rate": 5e-05, + "loss": 0.1897, + "loss/crossentropy": 2.7889973521232605, + "loss/hidden": 0.0, + "loss/logits": 0.1897362545132637, + "loss/reg": 2.2701008319854736, + "step": 651 + }, + { + "epoch": 0.00652, + "grad_norm": 0.4475138187408447, + "grad_norm_var": 0.0028018836674080227, + "learning_rate": 5e-05, + "loss": 0.1951, + "loss/crossentropy": 2.531024992465973, + "loss/hidden": 0.0, + "loss/logits": 0.1951226033270359, + "loss/reg": 2.267695665359497, + "step": 652 + }, + { + "epoch": 0.00653, + "grad_norm": 0.3947466313838959, + "grad_norm_var": 0.002769718525090366, + "learning_rate": 5e-05, + "loss": 0.1958, + "loss/crossentropy": 2.886034905910492, + "loss/hidden": 0.0, + "loss/logits": 0.19575949385762215, + "loss/reg": 2.2648732662200928, + "step": 653 + }, + { + "epoch": 0.00654, + "grad_norm": 0.3775857090950012, + "grad_norm_var": 0.0027546537142078996, + "learning_rate": 5e-05, + "loss": 0.1746, + "loss/crossentropy": 2.7190786600112915, + "loss/hidden": 0.0, + "loss/logits": 0.17461128905415535, + "loss/reg": 2.2620925903320312, + "step": 654 + }, + { + "epoch": 0.00655, + "grad_norm": 0.34534481167793274, + "grad_norm_var": 0.002849399631435645, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.8847506046295166, + "loss/hidden": 0.0, + "loss/logits": 0.1791832633316517, + "loss/reg": 2.2593743801116943, + "step": 655 + }, + { + "epoch": 0.00656, + "grad_norm": 0.3607633411884308, + "grad_norm_var": 0.0028993682580486144, + "learning_rate": 5e-05, + "loss": 0.1792, + "loss/crossentropy": 2.815674066543579, + "loss/hidden": 0.0, + "loss/logits": 0.179163109511137, + "loss/reg": 2.2569730281829834, + "step": 656 + }, + { + "epoch": 0.00657, + "grad_norm": 0.38781270384788513, + "grad_norm_var": 0.002826278400635814, + "learning_rate": 5e-05, + "loss": 0.1618, + "loss/crossentropy": 2.6106160283088684, + "loss/hidden": 0.0, + "loss/logits": 0.1617795117199421, + "loss/reg": 2.254523277282715, + "step": 657 + }, + { + "epoch": 0.00658, + "grad_norm": 0.40386784076690674, + "grad_norm_var": 0.0028094212847462165, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.8471227884292603, + "loss/hidden": 0.0, + "loss/logits": 0.17474086582660675, + "loss/reg": 2.252164125442505, + "step": 658 + }, + { + "epoch": 0.00659, + "grad_norm": 0.36319243907928467, + "grad_norm_var": 0.002796007207749466, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.7625906467437744, + "loss/hidden": 0.0, + "loss/logits": 0.16296877712011337, + "loss/reg": 2.249460458755493, + "step": 659 + }, + { + "epoch": 0.0066, + "grad_norm": 0.3657222092151642, + "grad_norm_var": 0.00269101182172804, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.781547486782074, + "loss/hidden": 0.0, + "loss/logits": 0.1777149885892868, + "loss/reg": 2.246467113494873, + "step": 660 + }, + { + "epoch": 0.00661, + "grad_norm": 0.38363057374954224, + "grad_norm_var": 0.0025851401210759276, + "learning_rate": 5e-05, + "loss": 0.1844, + "loss/crossentropy": 2.82689893245697, + "loss/hidden": 0.0, + "loss/logits": 0.1844283789396286, + "loss/reg": 2.2436530590057373, + "step": 661 + }, + { + "epoch": 0.00662, + "grad_norm": 0.4096749424934387, + "grad_norm_var": 0.0024716520181473594, + "learning_rate": 5e-05, + "loss": 0.1745, + "loss/crossentropy": 2.8063756823539734, + "loss/hidden": 0.0, + "loss/logits": 0.17451731115579605, + "loss/reg": 2.241178035736084, + "step": 662 + }, + { + "epoch": 0.00663, + "grad_norm": 0.42931249737739563, + "grad_norm_var": 0.0024528927297352344, + "learning_rate": 5e-05, + "loss": 0.186, + "loss/crossentropy": 2.8724401593208313, + "loss/hidden": 0.0, + "loss/logits": 0.1859952136874199, + "loss/reg": 2.2383124828338623, + "step": 663 + }, + { + "epoch": 0.00664, + "grad_norm": 0.3530314862728119, + "grad_norm_var": 0.0024574750299312478, + "learning_rate": 5e-05, + "loss": 0.1696, + "loss/crossentropy": 2.9292226433753967, + "loss/hidden": 0.0, + "loss/logits": 0.16955319419503212, + "loss/reg": 2.2356791496276855, + "step": 664 + }, + { + "epoch": 0.00665, + "grad_norm": 0.4304611384868622, + "grad_norm_var": 0.0009397736187397402, + "learning_rate": 5e-05, + "loss": 0.1902, + "loss/crossentropy": 2.7114855647087097, + "loss/hidden": 0.0, + "loss/logits": 0.19023016840219498, + "loss/reg": 2.2330329418182373, + "step": 665 + }, + { + "epoch": 0.00666, + "grad_norm": 0.32996541261672974, + "grad_norm_var": 0.0010789617804694747, + "learning_rate": 5e-05, + "loss": 0.158, + "loss/crossentropy": 2.8920618891716003, + "loss/hidden": 0.0, + "loss/logits": 0.1580132134258747, + "loss/reg": 2.2296440601348877, + "step": 666 + }, + { + "epoch": 0.00667, + "grad_norm": 0.3874596953392029, + "grad_norm_var": 0.0010747020479673205, + "learning_rate": 5e-05, + "loss": 0.1819, + "loss/crossentropy": 2.7297377586364746, + "loss/hidden": 0.0, + "loss/logits": 0.18189727514982224, + "loss/reg": 2.226966619491577, + "step": 667 + }, + { + "epoch": 0.00668, + "grad_norm": 0.36097773909568787, + "grad_norm_var": 0.000828712243429038, + "learning_rate": 5e-05, + "loss": 0.1726, + "loss/crossentropy": 2.6433697938919067, + "loss/hidden": 0.0, + "loss/logits": 0.17256683483719826, + "loss/reg": 2.2244138717651367, + "step": 668 + }, + { + "epoch": 0.00669, + "grad_norm": 0.3509676158428192, + "grad_norm_var": 0.0008637156407869958, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.8315157890319824, + "loss/hidden": 0.0, + "loss/logits": 0.17234884947538376, + "loss/reg": 2.220712661743164, + "step": 669 + }, + { + "epoch": 0.0067, + "grad_norm": 0.3578469157218933, + "grad_norm_var": 0.0008878035089742793, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.788190722465515, + "loss/hidden": 0.0, + "loss/logits": 0.16572094336152077, + "loss/reg": 2.217878818511963, + "step": 670 + }, + { + "epoch": 0.00671, + "grad_norm": 0.4930081069469452, + "grad_norm_var": 0.0016420680378558003, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 3.013857901096344, + "loss/hidden": 0.0, + "loss/logits": 0.18175816163420677, + "loss/reg": 2.2148284912109375, + "step": 671 + }, + { + "epoch": 0.00672, + "grad_norm": 0.36925604939460754, + "grad_norm_var": 0.0016185866984450236, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.8940696716308594, + "loss/hidden": 0.0, + "loss/logits": 0.1642276532948017, + "loss/reg": 2.2119295597076416, + "step": 672 + }, + { + "epoch": 0.00673, + "grad_norm": 0.4327005445957184, + "grad_norm_var": 0.0017552981165500747, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.87309467792511, + "loss/hidden": 0.0, + "loss/logits": 0.17423933744430542, + "loss/reg": 2.209021806716919, + "step": 673 + }, + { + "epoch": 0.00674, + "grad_norm": 0.738524854183197, + "grad_norm_var": 0.009426579051544037, + "learning_rate": 5e-05, + "loss": 0.1868, + "loss/crossentropy": 2.8040258288383484, + "loss/hidden": 0.0, + "loss/logits": 0.1867678351700306, + "loss/reg": 2.2066619396209717, + "step": 674 + }, + { + "epoch": 0.00675, + "grad_norm": 0.4364205002784729, + "grad_norm_var": 0.009307313279513674, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.718536138534546, + "loss/hidden": 0.0, + "loss/logits": 0.17956989258527756, + "loss/reg": 2.203990936279297, + "step": 675 + }, + { + "epoch": 0.00676, + "grad_norm": 0.41067376732826233, + "grad_norm_var": 0.009142390414932911, + "learning_rate": 5e-05, + "loss": 0.1642, + "loss/crossentropy": 2.72940456867218, + "loss/hidden": 0.0, + "loss/logits": 0.16421709582209587, + "loss/reg": 2.2013046741485596, + "step": 676 + }, + { + "epoch": 0.00677, + "grad_norm": 0.4327182173728943, + "grad_norm_var": 0.009073804614162174, + "learning_rate": 5e-05, + "loss": 0.1905, + "loss/crossentropy": 2.7371246814727783, + "loss/hidden": 0.0, + "loss/logits": 0.19054419547319412, + "loss/reg": 2.1991653442382812, + "step": 677 + }, + { + "epoch": 0.00678, + "grad_norm": 0.3779783844947815, + "grad_norm_var": 0.009181024716334075, + "learning_rate": 5e-05, + "loss": 0.1589, + "loss/crossentropy": 2.7500953674316406, + "loss/hidden": 0.0, + "loss/logits": 0.15891055390238762, + "loss/reg": 2.197261333465576, + "step": 678 + }, + { + "epoch": 0.00679, + "grad_norm": 0.3585035502910614, + "grad_norm_var": 0.009389539404841711, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.869826376438141, + "loss/hidden": 0.0, + "loss/logits": 0.18359991908073425, + "loss/reg": 2.195239782333374, + "step": 679 + }, + { + "epoch": 0.0068, + "grad_norm": 0.3534944951534271, + "grad_norm_var": 0.009385802469305704, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.8476794362068176, + "loss/hidden": 0.0, + "loss/logits": 0.1689487136900425, + "loss/reg": 2.1929402351379395, + "step": 680 + }, + { + "epoch": 0.00681, + "grad_norm": 0.3718988001346588, + "grad_norm_var": 0.009470130435250168, + "learning_rate": 5e-05, + "loss": 0.1704, + "loss/crossentropy": 2.7930009365081787, + "loss/hidden": 0.0, + "loss/logits": 0.17038631066679955, + "loss/reg": 2.19075608253479, + "step": 681 + }, + { + "epoch": 0.00682, + "grad_norm": 0.4854961037635803, + "grad_norm_var": 0.009319177707927173, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.8028470277786255, + "loss/hidden": 0.0, + "loss/logits": 0.17054682224988937, + "loss/reg": 2.1886627674102783, + "step": 682 + }, + { + "epoch": 0.00683, + "grad_norm": 0.3880312144756317, + "grad_norm_var": 0.00931672834921676, + "learning_rate": 5e-05, + "loss": 0.1766, + "loss/crossentropy": 2.963544547557831, + "loss/hidden": 0.0, + "loss/logits": 0.17662956938147545, + "loss/reg": 2.1860859394073486, + "step": 683 + }, + { + "epoch": 0.00684, + "grad_norm": 0.3488878309726715, + "grad_norm_var": 0.009420855437860176, + "learning_rate": 5e-05, + "loss": 0.1667, + "loss/crossentropy": 2.959736704826355, + "loss/hidden": 0.0, + "loss/logits": 0.16670886427164078, + "loss/reg": 2.183668375015259, + "step": 684 + }, + { + "epoch": 0.00685, + "grad_norm": 0.8154363632202148, + "grad_norm_var": 0.018681551405985854, + "learning_rate": 5e-05, + "loss": 0.2213, + "loss/crossentropy": 2.911233067512512, + "loss/hidden": 0.0, + "loss/logits": 0.2212524674832821, + "loss/reg": 2.1813154220581055, + "step": 685 + }, + { + "epoch": 0.00686, + "grad_norm": 0.4155946969985962, + "grad_norm_var": 0.018194440840509217, + "learning_rate": 5e-05, + "loss": 0.1964, + "loss/crossentropy": 2.7186298966407776, + "loss/hidden": 0.0, + "loss/logits": 0.19640850275754929, + "loss/reg": 2.1795501708984375, + "step": 686 + }, + { + "epoch": 0.00687, + "grad_norm": 0.38160914182662964, + "grad_norm_var": 0.01835781299917098, + "learning_rate": 5e-05, + "loss": 0.1708, + "loss/crossentropy": 2.7301290035247803, + "loss/hidden": 0.0, + "loss/logits": 0.17081937566399574, + "loss/reg": 2.177623748779297, + "step": 687 + }, + { + "epoch": 0.00688, + "grad_norm": 0.41628003120422363, + "grad_norm_var": 0.01802219976069038, + "learning_rate": 5e-05, + "loss": 0.1771, + "loss/crossentropy": 2.7876546382904053, + "loss/hidden": 0.0, + "loss/logits": 0.17706667259335518, + "loss/reg": 2.175504446029663, + "step": 688 + }, + { + "epoch": 0.00689, + "grad_norm": 0.4177417755126953, + "grad_norm_var": 0.018066232212721724, + "learning_rate": 5e-05, + "loss": 0.1786, + "loss/crossentropy": 2.763257145881653, + "loss/hidden": 0.0, + "loss/logits": 0.1785966381430626, + "loss/reg": 2.173213481903076, + "step": 689 + }, + { + "epoch": 0.0069, + "grad_norm": 0.3603265583515167, + "grad_norm_var": 0.012296751904473697, + "learning_rate": 5e-05, + "loss": 0.1678, + "loss/crossentropy": 2.774847447872162, + "loss/hidden": 0.0, + "loss/logits": 0.16783085092902184, + "loss/reg": 2.1712427139282227, + "step": 690 + }, + { + "epoch": 0.00691, + "grad_norm": 0.4307333827018738, + "grad_norm_var": 0.01228874334383105, + "learning_rate": 5e-05, + "loss": 0.2034, + "loss/crossentropy": 2.6488924622535706, + "loss/hidden": 0.0, + "loss/logits": 0.20339511707425117, + "loss/reg": 2.170015573501587, + "step": 691 + }, + { + "epoch": 0.00692, + "grad_norm": 0.3678703010082245, + "grad_norm_var": 0.012472673417673882, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.8285900950431824, + "loss/hidden": 0.0, + "loss/logits": 0.179282795637846, + "loss/reg": 2.1680784225463867, + "step": 692 + }, + { + "epoch": 0.00693, + "grad_norm": 0.3516632914543152, + "grad_norm_var": 0.012747599104723136, + "learning_rate": 5e-05, + "loss": 0.1638, + "loss/crossentropy": 2.72187340259552, + "loss/hidden": 0.0, + "loss/logits": 0.16377655416727066, + "loss/reg": 2.166708469390869, + "step": 693 + }, + { + "epoch": 0.00694, + "grad_norm": 0.37773895263671875, + "grad_norm_var": 0.012748787659448176, + "learning_rate": 5e-05, + "loss": 0.2, + "loss/crossentropy": 2.5079989433288574, + "loss/hidden": 0.0, + "loss/logits": 0.19995050877332687, + "loss/reg": 2.1644845008850098, + "step": 694 + }, + { + "epoch": 0.00695, + "grad_norm": 0.33557403087615967, + "grad_norm_var": 0.012954622340141124, + "learning_rate": 5e-05, + "loss": 0.173, + "loss/crossentropy": 2.733457326889038, + "loss/hidden": 0.0, + "loss/logits": 0.1730196811258793, + "loss/reg": 2.162649631500244, + "step": 695 + }, + { + "epoch": 0.00696, + "grad_norm": 0.3414340615272522, + "grad_norm_var": 0.01306044443406886, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.770694136619568, + "loss/hidden": 0.0, + "loss/logits": 0.1698729656636715, + "loss/reg": 2.1605873107910156, + "step": 696 + }, + { + "epoch": 0.00697, + "grad_norm": 0.39742037653923035, + "grad_norm_var": 0.012961649579914787, + "learning_rate": 5e-05, + "loss": 0.1753, + "loss/crossentropy": 2.747798502445221, + "loss/hidden": 0.0, + "loss/logits": 0.17528066039085388, + "loss/reg": 2.158661127090454, + "step": 697 + }, + { + "epoch": 0.00698, + "grad_norm": 0.4672209620475769, + "grad_norm_var": 0.012809503544980934, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.764335811138153, + "loss/hidden": 0.0, + "loss/logits": 0.1961456499993801, + "loss/reg": 2.157139539718628, + "step": 698 + }, + { + "epoch": 0.00699, + "grad_norm": 0.40900057554244995, + "grad_norm_var": 0.012766202979620484, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.9526583552360535, + "loss/hidden": 0.0, + "loss/logits": 0.1826096773147583, + "loss/reg": 2.1556172370910645, + "step": 699 + }, + { + "epoch": 0.007, + "grad_norm": 0.45763787627220154, + "grad_norm_var": 0.01255169197725956, + "learning_rate": 5e-05, + "loss": 0.1845, + "loss/crossentropy": 2.9059385657310486, + "loss/hidden": 0.0, + "loss/logits": 0.18454358726739883, + "loss/reg": 2.1542842388153076, + "step": 700 + }, + { + "epoch": 0.00701, + "grad_norm": 0.568651020526886, + "grad_norm_var": 0.0033942912710514268, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.771495759487152, + "loss/hidden": 0.0, + "loss/logits": 0.18007055297493935, + "loss/reg": 2.1522164344787598, + "step": 701 + }, + { + "epoch": 0.00702, + "grad_norm": 0.3590672016143799, + "grad_norm_var": 0.00352192003862181, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.750881016254425, + "loss/hidden": 0.0, + "loss/logits": 0.1650897115468979, + "loss/reg": 2.1509296894073486, + "step": 702 + }, + { + "epoch": 0.00703, + "grad_norm": 0.36948493123054504, + "grad_norm_var": 0.0035648755964216056, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.8027891516685486, + "loss/hidden": 0.0, + "loss/logits": 0.17848948016762733, + "loss/reg": 2.149231195449829, + "step": 703 + }, + { + "epoch": 0.00704, + "grad_norm": 0.3613908588886261, + "grad_norm_var": 0.0036467673242235915, + "learning_rate": 5e-05, + "loss": 0.1682, + "loss/crossentropy": 2.763719141483307, + "loss/hidden": 0.0, + "loss/logits": 0.16823140904307365, + "loss/reg": 2.1478073596954346, + "step": 704 + }, + { + "epoch": 0.00705, + "grad_norm": 0.38240060210227966, + "grad_norm_var": 0.003633263034560896, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.705716133117676, + "loss/hidden": 0.0, + "loss/logits": 0.1780022643506527, + "loss/reg": 2.1462342739105225, + "step": 705 + }, + { + "epoch": 0.00706, + "grad_norm": 0.3587467074394226, + "grad_norm_var": 0.0036409547879681387, + "learning_rate": 5e-05, + "loss": 0.1658, + "loss/crossentropy": 2.7772558331489563, + "loss/hidden": 0.0, + "loss/logits": 0.16575098782777786, + "loss/reg": 2.144062042236328, + "step": 706 + }, + { + "epoch": 0.00707, + "grad_norm": 0.36025822162628174, + "grad_norm_var": 0.0036250184261099458, + "learning_rate": 5e-05, + "loss": 0.1724, + "loss/crossentropy": 2.634014904499054, + "loss/hidden": 0.0, + "loss/logits": 0.17235567048192024, + "loss/reg": 2.1429154872894287, + "step": 707 + }, + { + "epoch": 0.00708, + "grad_norm": 0.35575759410858154, + "grad_norm_var": 0.0036725083584184842, + "learning_rate": 5e-05, + "loss": 0.1794, + "loss/crossentropy": 2.6474004983901978, + "loss/hidden": 0.0, + "loss/logits": 0.17943605780601501, + "loss/reg": 2.1421751976013184, + "step": 708 + }, + { + "epoch": 0.00709, + "grad_norm": 0.3865105211734772, + "grad_norm_var": 0.003566375202589933, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.9082140922546387, + "loss/hidden": 0.0, + "loss/logits": 0.18172414600849152, + "loss/reg": 2.1403071880340576, + "step": 709 + }, + { + "epoch": 0.0071, + "grad_norm": 0.368362694978714, + "grad_norm_var": 0.003590971719305887, + "learning_rate": 5e-05, + "loss": 0.1711, + "loss/crossentropy": 2.6766469478607178, + "loss/hidden": 0.0, + "loss/logits": 0.17112310975790024, + "loss/reg": 2.138167142868042, + "step": 710 + }, + { + "epoch": 0.00711, + "grad_norm": 0.34797176718711853, + "grad_norm_var": 0.003506589552138199, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.8545928597450256, + "loss/hidden": 0.0, + "loss/logits": 0.16299721226096153, + "loss/reg": 2.1356112957000732, + "step": 711 + }, + { + "epoch": 0.00712, + "grad_norm": 0.3511999547481537, + "grad_norm_var": 0.0034451354888741254, + "learning_rate": 5e-05, + "loss": 0.1706, + "loss/crossentropy": 2.7516467571258545, + "loss/hidden": 0.0, + "loss/logits": 0.1705768182873726, + "loss/reg": 2.13396954536438, + "step": 712 + }, + { + "epoch": 0.00713, + "grad_norm": 0.4692562520503998, + "grad_norm_var": 0.0038021677070381584, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.7622230648994446, + "loss/hidden": 0.0, + "loss/logits": 0.1771526113152504, + "loss/reg": 2.1316707134246826, + "step": 713 + }, + { + "epoch": 0.00714, + "grad_norm": 0.3500974774360657, + "grad_norm_var": 0.003583350276630167, + "learning_rate": 5e-05, + "loss": 0.1651, + "loss/crossentropy": 2.7385149598121643, + "loss/hidden": 0.0, + "loss/logits": 0.16512250155210495, + "loss/reg": 2.1301488876342773, + "step": 714 + }, + { + "epoch": 0.00715, + "grad_norm": 0.33279696106910706, + "grad_norm_var": 0.0037632620297312364, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.6760587096214294, + "loss/hidden": 0.0, + "loss/logits": 0.16387901455163956, + "loss/reg": 2.128563165664673, + "step": 715 + }, + { + "epoch": 0.00716, + "grad_norm": 0.36436334252357483, + "grad_norm_var": 0.003418879723208453, + "learning_rate": 5e-05, + "loss": 0.1675, + "loss/crossentropy": 2.7055559158325195, + "loss/hidden": 0.0, + "loss/logits": 0.16751762479543686, + "loss/reg": 2.1272294521331787, + "step": 716 + }, + { + "epoch": 0.00717, + "grad_norm": 0.35308849811553955, + "grad_norm_var": 0.0009122804473129371, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.827264368534088, + "loss/hidden": 0.0, + "loss/logits": 0.16845671087503433, + "loss/reg": 2.125559091567993, + "step": 717 + }, + { + "epoch": 0.00718, + "grad_norm": 0.36609259247779846, + "grad_norm_var": 0.0009080073745675876, + "learning_rate": 5e-05, + "loss": 0.1878, + "loss/crossentropy": 2.7995529770851135, + "loss/hidden": 0.0, + "loss/logits": 0.18778567016124725, + "loss/reg": 2.12422513961792, + "step": 718 + }, + { + "epoch": 0.00719, + "grad_norm": 0.3564467430114746, + "grad_norm_var": 0.0009149400496893722, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.848701000213623, + "loss/hidden": 0.0, + "loss/logits": 0.16652807220816612, + "loss/reg": 2.1227705478668213, + "step": 719 + }, + { + "epoch": 0.0072, + "grad_norm": 0.3523035943508148, + "grad_norm_var": 0.0009263477116920882, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.7714666724205017, + "loss/hidden": 0.0, + "loss/logits": 0.16896242648363113, + "loss/reg": 2.1214191913604736, + "step": 720 + }, + { + "epoch": 0.00721, + "grad_norm": 0.39885270595550537, + "grad_norm_var": 0.0009792887842439849, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.835131287574768, + "loss/hidden": 0.0, + "loss/logits": 0.1700747236609459, + "loss/reg": 2.1200404167175293, + "step": 721 + }, + { + "epoch": 0.00722, + "grad_norm": 0.40293964743614197, + "grad_norm_var": 0.0010526817455953927, + "learning_rate": 5e-05, + "loss": 0.1819, + "loss/crossentropy": 2.744925618171692, + "loss/hidden": 0.0, + "loss/logits": 0.1818903423845768, + "loss/reg": 2.1181347370147705, + "step": 722 + }, + { + "epoch": 0.00723, + "grad_norm": 0.5598530769348145, + "grad_norm_var": 0.0032894654306610577, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.983691990375519, + "loss/hidden": 0.0, + "loss/logits": 0.16615596786141396, + "loss/reg": 2.1172993183135986, + "step": 723 + }, + { + "epoch": 0.00724, + "grad_norm": 0.39669546484947205, + "grad_norm_var": 0.003249640426166478, + "learning_rate": 5e-05, + "loss": 0.1735, + "loss/crossentropy": 2.709549069404602, + "loss/hidden": 0.0, + "loss/logits": 0.17345992848277092, + "loss/reg": 2.1152801513671875, + "step": 724 + }, + { + "epoch": 0.00725, + "grad_norm": 0.35726040601730347, + "grad_norm_var": 0.0032964500726321067, + "learning_rate": 5e-05, + "loss": 0.1624, + "loss/crossentropy": 2.7148231267929077, + "loss/hidden": 0.0, + "loss/logits": 0.16241873800754547, + "loss/reg": 2.1130924224853516, + "step": 725 + }, + { + "epoch": 0.00726, + "grad_norm": 0.3927571177482605, + "grad_norm_var": 0.0032861190572127997, + "learning_rate": 5e-05, + "loss": 0.1795, + "loss/crossentropy": 2.7939482927322388, + "loss/hidden": 0.0, + "loss/logits": 0.1794990859925747, + "loss/reg": 2.111480712890625, + "step": 726 + }, + { + "epoch": 0.00727, + "grad_norm": 0.3941044807434082, + "grad_norm_var": 0.0031944564404073005, + "learning_rate": 5e-05, + "loss": 0.1712, + "loss/crossentropy": 2.834249794483185, + "loss/hidden": 0.0, + "loss/logits": 0.1711888276040554, + "loss/reg": 2.109204053878784, + "step": 727 + }, + { + "epoch": 0.00728, + "grad_norm": 0.4828793704509735, + "grad_norm_var": 0.0036429198556795937, + "learning_rate": 5e-05, + "loss": 0.2082, + "loss/crossentropy": 2.899094045162201, + "loss/hidden": 0.0, + "loss/logits": 0.20823358744382858, + "loss/reg": 2.1061551570892334, + "step": 728 + }, + { + "epoch": 0.00729, + "grad_norm": 0.3574215769767761, + "grad_norm_var": 0.003326472236741973, + "learning_rate": 5e-05, + "loss": 0.1596, + "loss/crossentropy": 2.7636680603027344, + "loss/hidden": 0.0, + "loss/logits": 0.15961402654647827, + "loss/reg": 2.104001760482788, + "step": 729 + }, + { + "epoch": 0.0073, + "grad_norm": 0.40163764357566833, + "grad_norm_var": 0.003227754706050412, + "learning_rate": 5e-05, + "loss": 0.1797, + "loss/crossentropy": 2.8588566184043884, + "loss/hidden": 0.0, + "loss/logits": 0.17974677309393883, + "loss/reg": 2.102442741394043, + "step": 730 + }, + { + "epoch": 0.00731, + "grad_norm": 0.37189754843711853, + "grad_norm_var": 0.003015475193035148, + "learning_rate": 5e-05, + "loss": 0.1673, + "loss/crossentropy": 2.826458215713501, + "loss/hidden": 0.0, + "loss/logits": 0.16728204488754272, + "loss/reg": 2.1002049446105957, + "step": 731 + }, + { + "epoch": 0.00732, + "grad_norm": 0.3587784171104431, + "grad_norm_var": 0.003039707591927121, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.731923222541809, + "loss/hidden": 0.0, + "loss/logits": 0.16448039561510086, + "loss/reg": 2.098315954208374, + "step": 732 + }, + { + "epoch": 0.00733, + "grad_norm": 0.37631648778915405, + "grad_norm_var": 0.002946915065401934, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.880859136581421, + "loss/hidden": 0.0, + "loss/logits": 0.17137856781482697, + "loss/reg": 2.0962650775909424, + "step": 733 + }, + { + "epoch": 0.00734, + "grad_norm": 0.3563605844974518, + "grad_norm_var": 0.002990850657754888, + "learning_rate": 5e-05, + "loss": 0.1628, + "loss/crossentropy": 2.6355279088020325, + "loss/hidden": 0.0, + "loss/logits": 0.16278789564967155, + "loss/reg": 2.094142436981201, + "step": 734 + }, + { + "epoch": 0.00735, + "grad_norm": 0.37199047207832336, + "grad_norm_var": 0.0029265023383142925, + "learning_rate": 5e-05, + "loss": 0.1802, + "loss/crossentropy": 2.769617021083832, + "loss/hidden": 0.0, + "loss/logits": 0.18019907549023628, + "loss/reg": 2.0921125411987305, + "step": 735 + }, + { + "epoch": 0.00736, + "grad_norm": 0.489103764295578, + "grad_norm_var": 0.0033036264225515164, + "learning_rate": 5e-05, + "loss": 0.1832, + "loss/crossentropy": 2.7491883039474487, + "loss/hidden": 0.0, + "loss/logits": 0.1831774264574051, + "loss/reg": 2.0904338359832764, + "step": 736 + }, + { + "epoch": 0.00737, + "grad_norm": 0.5059826970100403, + "grad_norm_var": 0.003943075932518525, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.8231146931648254, + "loss/hidden": 0.0, + "loss/logits": 0.18174266442656517, + "loss/reg": 2.0879714488983154, + "step": 737 + }, + { + "epoch": 0.00738, + "grad_norm": 0.6662333011627197, + "grad_norm_var": 0.007992879009924207, + "learning_rate": 5e-05, + "loss": 0.1861, + "loss/crossentropy": 2.7952335476875305, + "loss/hidden": 0.0, + "loss/logits": 0.18612126260995865, + "loss/reg": 2.0855941772460938, + "step": 738 + }, + { + "epoch": 0.00739, + "grad_norm": 0.43555790185928345, + "grad_norm_var": 0.006764259520141217, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.7390406727790833, + "loss/hidden": 0.0, + "loss/logits": 0.1822943352162838, + "loss/reg": 2.083002805709839, + "step": 739 + }, + { + "epoch": 0.0074, + "grad_norm": 0.36206063628196716, + "grad_norm_var": 0.0069454028516603905, + "learning_rate": 5e-05, + "loss": 0.1747, + "loss/crossentropy": 2.6245489716529846, + "loss/hidden": 0.0, + "loss/logits": 0.17474905773997307, + "loss/reg": 2.0807597637176514, + "step": 740 + }, + { + "epoch": 0.00741, + "grad_norm": 0.4077146649360657, + "grad_norm_var": 0.006699115025232619, + "learning_rate": 5e-05, + "loss": 0.21, + "loss/crossentropy": 2.734727144241333, + "loss/hidden": 0.0, + "loss/logits": 0.20996900647878647, + "loss/reg": 2.0777878761291504, + "step": 741 + }, + { + "epoch": 0.00742, + "grad_norm": 0.4748740792274475, + "grad_norm_var": 0.0068148961293998615, + "learning_rate": 5e-05, + "loss": 0.1926, + "loss/crossentropy": 2.757317006587982, + "loss/hidden": 0.0, + "loss/logits": 0.19259492680430412, + "loss/reg": 2.0748398303985596, + "step": 742 + }, + { + "epoch": 0.00743, + "grad_norm": 0.3738694190979004, + "grad_norm_var": 0.006926021168675212, + "learning_rate": 5e-05, + "loss": 0.1671, + "loss/crossentropy": 2.7804144620895386, + "loss/hidden": 0.0, + "loss/logits": 0.1671152375638485, + "loss/reg": 2.072981119155884, + "step": 743 + }, + { + "epoch": 0.00744, + "grad_norm": 0.4373812675476074, + "grad_norm_var": 0.006701504868688938, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.8251866698265076, + "loss/hidden": 0.0, + "loss/logits": 0.183602724224329, + "loss/reg": 2.069178581237793, + "step": 744 + }, + { + "epoch": 0.00745, + "grad_norm": 0.41339626908302307, + "grad_norm_var": 0.006417608208757027, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.7784698605537415, + "loss/hidden": 0.0, + "loss/logits": 0.16648468375205994, + "loss/reg": 2.066897392272949, + "step": 745 + }, + { + "epoch": 0.00746, + "grad_norm": 0.36906108260154724, + "grad_norm_var": 0.0065862671267569286, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.7134994864463806, + "loss/hidden": 0.0, + "loss/logits": 0.17522963881492615, + "loss/reg": 2.063711404800415, + "step": 746 + }, + { + "epoch": 0.00747, + "grad_norm": 0.3699776232242584, + "grad_norm_var": 0.006599620482715507, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.6448380947113037, + "loss/hidden": 0.0, + "loss/logits": 0.1815556287765503, + "loss/reg": 2.0618152618408203, + "step": 747 + }, + { + "epoch": 0.00748, + "grad_norm": 0.35848432779312134, + "grad_norm_var": 0.006602145753337363, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.577029287815094, + "loss/hidden": 0.0, + "loss/logits": 0.1758727729320526, + "loss/reg": 2.0592784881591797, + "step": 748 + }, + { + "epoch": 0.00749, + "grad_norm": 0.40015411376953125, + "grad_norm_var": 0.006489211309593653, + "learning_rate": 5e-05, + "loss": 0.2007, + "loss/crossentropy": 2.7719894647598267, + "loss/hidden": 0.0, + "loss/logits": 0.20066174119710922, + "loss/reg": 2.056396484375, + "step": 749 + }, + { + "epoch": 0.0075, + "grad_norm": 0.34235846996307373, + "grad_norm_var": 0.006628701391081989, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.8526532649993896, + "loss/hidden": 0.0, + "loss/logits": 0.1660567931830883, + "loss/reg": 2.053225040435791, + "step": 750 + }, + { + "epoch": 0.00751, + "grad_norm": 0.37578198313713074, + "grad_norm_var": 0.006603490490161393, + "learning_rate": 5e-05, + "loss": 0.192, + "loss/crossentropy": 2.803673267364502, + "loss/hidden": 0.0, + "loss/logits": 0.19199685007333755, + "loss/reg": 2.0500073432922363, + "step": 751 + }, + { + "epoch": 0.00752, + "grad_norm": 0.3724234700202942, + "grad_norm_var": 0.006439587327084632, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.8497246503829956, + "loss/hidden": 0.0, + "loss/logits": 0.16111965849995613, + "loss/reg": 2.0481040477752686, + "step": 752 + }, + { + "epoch": 0.00753, + "grad_norm": 0.37283533811569214, + "grad_norm_var": 0.005960471364599432, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.580562174320221, + "loss/hidden": 0.0, + "loss/logits": 0.18162427470088005, + "loss/reg": 2.045363187789917, + "step": 753 + }, + { + "epoch": 0.00754, + "grad_norm": 0.42849722504615784, + "grad_norm_var": 0.0013156070888824681, + "learning_rate": 5e-05, + "loss": 0.1885, + "loss/crossentropy": 2.7384997606277466, + "loss/hidden": 0.0, + "loss/logits": 0.1885378062725067, + "loss/reg": 2.0420894622802734, + "step": 754 + }, + { + "epoch": 0.00755, + "grad_norm": 0.3246319890022278, + "grad_norm_var": 0.0014611472372319412, + "learning_rate": 5e-05, + "loss": 0.152, + "loss/crossentropy": 2.827781558036804, + "loss/hidden": 0.0, + "loss/logits": 0.15203238278627396, + "loss/reg": 2.0399134159088135, + "step": 755 + }, + { + "epoch": 0.00756, + "grad_norm": 0.3523566722869873, + "grad_norm_var": 0.0014986135555234647, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.7049853801727295, + "loss/hidden": 0.0, + "loss/logits": 0.17991740256547928, + "loss/reg": 2.036095142364502, + "step": 756 + }, + { + "epoch": 0.00757, + "grad_norm": 0.3352646827697754, + "grad_norm_var": 0.0016155829783374783, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.715296685695648, + "loss/hidden": 0.0, + "loss/logits": 0.16122159361839294, + "loss/reg": 2.0335283279418945, + "step": 757 + }, + { + "epoch": 0.00758, + "grad_norm": 0.36173179745674133, + "grad_norm_var": 0.001004548523534495, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.8797001242637634, + "loss/hidden": 0.0, + "loss/logits": 0.17676853761076927, + "loss/reg": 2.0315983295440674, + "step": 758 + }, + { + "epoch": 0.00759, + "grad_norm": 0.43379032611846924, + "grad_norm_var": 0.0012258123535982288, + "learning_rate": 5e-05, + "loss": 0.2008, + "loss/crossentropy": 2.7367305159568787, + "loss/hidden": 0.0, + "loss/logits": 0.20076703280210495, + "loss/reg": 2.028825521469116, + "step": 759 + }, + { + "epoch": 0.0076, + "grad_norm": 0.7135851979255676, + "grad_norm_var": 0.008180404333396396, + "learning_rate": 5e-05, + "loss": 0.1988, + "loss/crossentropy": 2.7777557373046875, + "loss/hidden": 0.0, + "loss/logits": 0.19884883239865303, + "loss/reg": 2.0258262157440186, + "step": 760 + }, + { + "epoch": 0.00761, + "grad_norm": 0.36141064763069153, + "grad_norm_var": 0.008223674749041798, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.755949318408966, + "loss/hidden": 0.0, + "loss/logits": 0.17225057631731033, + "loss/reg": 2.022468090057373, + "step": 761 + }, + { + "epoch": 0.00762, + "grad_norm": 0.47610145807266235, + "grad_norm_var": 0.008612084301677063, + "learning_rate": 5e-05, + "loss": 0.1786, + "loss/crossentropy": 2.701655924320221, + "loss/hidden": 0.0, + "loss/logits": 0.17863870784640312, + "loss/reg": 2.0194478034973145, + "step": 762 + }, + { + "epoch": 0.00763, + "grad_norm": 0.35960420966148376, + "grad_norm_var": 0.0086585523494028, + "learning_rate": 5e-05, + "loss": 0.1719, + "loss/crossentropy": 2.6496411561965942, + "loss/hidden": 0.0, + "loss/logits": 0.17191722244024277, + "loss/reg": 2.0173516273498535, + "step": 763 + }, + { + "epoch": 0.00764, + "grad_norm": 0.3759422302246094, + "grad_norm_var": 0.008585472348376118, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.7261382937431335, + "loss/hidden": 0.0, + "loss/logits": 0.1664697714149952, + "loss/reg": 2.014254331588745, + "step": 764 + }, + { + "epoch": 0.00765, + "grad_norm": 0.3791477680206299, + "grad_norm_var": 0.008610251361000461, + "learning_rate": 5e-05, + "loss": 0.1716, + "loss/crossentropy": 2.779210090637207, + "loss/hidden": 0.0, + "loss/logits": 0.1715676300227642, + "loss/reg": 2.0109994411468506, + "step": 765 + }, + { + "epoch": 0.00766, + "grad_norm": 0.37698858976364136, + "grad_norm_var": 0.00842901980982322, + "learning_rate": 5e-05, + "loss": 0.178, + "loss/crossentropy": 2.5693264603614807, + "loss/hidden": 0.0, + "loss/logits": 0.17796850576996803, + "loss/reg": 2.007894277572632, + "step": 766 + }, + { + "epoch": 0.00767, + "grad_norm": 0.324692040681839, + "grad_norm_var": 0.008757168987509056, + "learning_rate": 5e-05, + "loss": 0.163, + "loss/crossentropy": 2.790699005126953, + "loss/hidden": 0.0, + "loss/logits": 0.16297711431980133, + "loss/reg": 2.0054080486297607, + "step": 767 + }, + { + "epoch": 0.00768, + "grad_norm": 0.37725430727005005, + "grad_norm_var": 0.008742918144709544, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.90560781955719, + "loss/hidden": 0.0, + "loss/logits": 0.17007537558674812, + "loss/reg": 2.0026705265045166, + "step": 768 + }, + { + "epoch": 0.00769, + "grad_norm": 0.3565872013568878, + "grad_norm_var": 0.008812017421293783, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.8573551774024963, + "loss/hidden": 0.0, + "loss/logits": 0.1747995764017105, + "loss/reg": 1.9999767541885376, + "step": 769 + }, + { + "epoch": 0.0077, + "grad_norm": 0.32768821716308594, + "grad_norm_var": 0.009011701837686615, + "learning_rate": 5e-05, + "loss": 0.1647, + "loss/crossentropy": 2.7850446105003357, + "loss/hidden": 0.0, + "loss/logits": 0.16465429961681366, + "loss/reg": 1.9974277019500732, + "step": 770 + }, + { + "epoch": 0.00771, + "grad_norm": 0.34194430708885193, + "grad_norm_var": 0.008880009468442519, + "learning_rate": 5e-05, + "loss": 0.1681, + "loss/crossentropy": 2.89225697517395, + "loss/hidden": 0.0, + "loss/logits": 0.16806093603372574, + "loss/reg": 1.9941447973251343, + "step": 771 + }, + { + "epoch": 0.00772, + "grad_norm": 0.36788639426231384, + "grad_norm_var": 0.008815313943155234, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 2.8710330724716187, + "loss/hidden": 0.0, + "loss/logits": 0.1823921650648117, + "loss/reg": 1.991845965385437, + "step": 772 + }, + { + "epoch": 0.00773, + "grad_norm": 0.33500465750694275, + "grad_norm_var": 0.008817280025891942, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.7821491956710815, + "loss/hidden": 0.0, + "loss/logits": 0.16685106977820396, + "loss/reg": 1.9901355504989624, + "step": 773 + }, + { + "epoch": 0.00774, + "grad_norm": 0.33815550804138184, + "grad_norm_var": 0.008946649562538052, + "learning_rate": 5e-05, + "loss": 0.162, + "loss/crossentropy": 2.7051143050193787, + "loss/hidden": 0.0, + "loss/logits": 0.16199326515197754, + "loss/reg": 1.9880555868148804, + "step": 774 + }, + { + "epoch": 0.00775, + "grad_norm": 0.32524728775024414, + "grad_norm_var": 0.009054478596347363, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.72264701128006, + "loss/hidden": 0.0, + "loss/logits": 0.16902651265263557, + "loss/reg": 1.986093521118164, + "step": 775 + }, + { + "epoch": 0.00776, + "grad_norm": 0.34697458148002625, + "grad_norm_var": 0.0013234442614042051, + "learning_rate": 5e-05, + "loss": 0.1691, + "loss/crossentropy": 2.780848979949951, + "loss/hidden": 0.0, + "loss/logits": 0.1690516211092472, + "loss/reg": 1.9844779968261719, + "step": 776 + }, + { + "epoch": 0.00777, + "grad_norm": 0.33995282649993896, + "grad_norm_var": 0.0013500864177136548, + "learning_rate": 5e-05, + "loss": 0.1612, + "loss/crossentropy": 2.772739827632904, + "loss/hidden": 0.0, + "loss/logits": 0.16119593381881714, + "loss/reg": 1.9825077056884766, + "step": 777 + }, + { + "epoch": 0.00778, + "grad_norm": 0.35139432549476624, + "grad_norm_var": 0.0003803343966673219, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.7008825540542603, + "loss/hidden": 0.0, + "loss/logits": 0.16681700944900513, + "loss/reg": 1.9806305170059204, + "step": 778 + }, + { + "epoch": 0.00779, + "grad_norm": 0.4588527977466583, + "grad_norm_var": 0.00110283708340256, + "learning_rate": 5e-05, + "loss": 0.1907, + "loss/crossentropy": 2.632855713367462, + "loss/hidden": 0.0, + "loss/logits": 0.19068260118365288, + "loss/reg": 1.9793086051940918, + "step": 779 + }, + { + "epoch": 0.0078, + "grad_norm": 0.3829444646835327, + "grad_norm_var": 0.0011229031183707624, + "learning_rate": 5e-05, + "loss": 0.1875, + "loss/crossentropy": 2.9350045323371887, + "loss/hidden": 0.0, + "loss/logits": 0.18754199519753456, + "loss/reg": 1.9774302244186401, + "step": 780 + }, + { + "epoch": 0.00781, + "grad_norm": 0.46253493428230286, + "grad_norm_var": 0.0017907320044085833, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.7478776574134827, + "loss/hidden": 0.0, + "loss/logits": 0.19565920531749725, + "loss/reg": 1.9757329225540161, + "step": 781 + }, + { + "epoch": 0.00782, + "grad_norm": 0.35229969024658203, + "grad_norm_var": 0.0017840355007145352, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.805756628513336, + "loss/hidden": 0.0, + "loss/logits": 0.1610955037176609, + "loss/reg": 1.973933458328247, + "step": 782 + }, + { + "epoch": 0.00783, + "grad_norm": 0.3324076533317566, + "grad_norm_var": 0.0017495419673394963, + "learning_rate": 5e-05, + "loss": 0.1706, + "loss/crossentropy": 2.8343148827552795, + "loss/hidden": 0.0, + "loss/logits": 0.1706329919397831, + "loss/reg": 1.9718844890594482, + "step": 783 + }, + { + "epoch": 0.00784, + "grad_norm": 0.3563413619995117, + "grad_norm_var": 0.0017352353042652258, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.825278103351593, + "loss/hidden": 0.0, + "loss/logits": 0.17681827396154404, + "loss/reg": 1.969612956047058, + "step": 784 + }, + { + "epoch": 0.00785, + "grad_norm": 0.33560603857040405, + "grad_norm_var": 0.0017751309342711038, + "learning_rate": 5e-05, + "loss": 0.1558, + "loss/crossentropy": 2.8132280111312866, + "loss/hidden": 0.0, + "loss/logits": 0.1558120921254158, + "loss/reg": 1.9675753116607666, + "step": 785 + }, + { + "epoch": 0.00786, + "grad_norm": 0.39733415842056274, + "grad_norm_var": 0.0017810049820061401, + "learning_rate": 5e-05, + "loss": 0.1849, + "loss/crossentropy": 2.8960456252098083, + "loss/hidden": 0.0, + "loss/logits": 0.18493180349469185, + "loss/reg": 1.965217113494873, + "step": 786 + }, + { + "epoch": 0.00787, + "grad_norm": 0.561698317527771, + "grad_norm_var": 0.004151387117344507, + "learning_rate": 5e-05, + "loss": 0.1987, + "loss/crossentropy": 2.66669100522995, + "loss/hidden": 0.0, + "loss/logits": 0.19869648292660713, + "loss/reg": 1.9629205465316772, + "step": 787 + }, + { + "epoch": 0.00788, + "grad_norm": 0.35911333560943604, + "grad_norm_var": 0.004167781816727311, + "learning_rate": 5e-05, + "loss": 0.1742, + "loss/crossentropy": 2.723667323589325, + "loss/hidden": 0.0, + "loss/logits": 0.17424843832850456, + "loss/reg": 1.9608547687530518, + "step": 788 + }, + { + "epoch": 0.00789, + "grad_norm": 0.3422897160053253, + "grad_norm_var": 0.004130072564222831, + "learning_rate": 5e-05, + "loss": 0.1683, + "loss/crossentropy": 2.853653848171234, + "loss/hidden": 0.0, + "loss/logits": 0.16829833760857582, + "loss/reg": 1.959040880203247, + "step": 789 + }, + { + "epoch": 0.0079, + "grad_norm": 0.373519629240036, + "grad_norm_var": 0.0040217911733014585, + "learning_rate": 5e-05, + "loss": 0.1697, + "loss/crossentropy": 2.7469093799591064, + "loss/hidden": 0.0, + "loss/logits": 0.1697460599243641, + "loss/reg": 1.9578640460968018, + "step": 790 + }, + { + "epoch": 0.00791, + "grad_norm": 0.42586550116539, + "grad_norm_var": 0.003921241787547673, + "learning_rate": 5e-05, + "loss": 0.1896, + "loss/crossentropy": 2.9876235127449036, + "loss/hidden": 0.0, + "loss/logits": 0.18961479887366295, + "loss/reg": 1.9558684825897217, + "step": 791 + }, + { + "epoch": 0.00792, + "grad_norm": 0.34371063113212585, + "grad_norm_var": 0.00393897634361432, + "learning_rate": 5e-05, + "loss": 0.1624, + "loss/crossentropy": 2.922863006591797, + "loss/hidden": 0.0, + "loss/logits": 0.16239817067980766, + "loss/reg": 1.9541829824447632, + "step": 792 + }, + { + "epoch": 0.00793, + "grad_norm": 0.3611912727355957, + "grad_norm_var": 0.0038367960883469387, + "learning_rate": 5e-05, + "loss": 0.1767, + "loss/crossentropy": 2.751186192035675, + "loss/hidden": 0.0, + "loss/logits": 0.17671825364232063, + "loss/reg": 1.9524297714233398, + "step": 793 + }, + { + "epoch": 0.00794, + "grad_norm": 0.3787733018398285, + "grad_norm_var": 0.0037525025406884736, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.6539193391799927, + "loss/hidden": 0.0, + "loss/logits": 0.16952653229236603, + "loss/reg": 1.951439619064331, + "step": 794 + }, + { + "epoch": 0.00795, + "grad_norm": 0.37621310353279114, + "grad_norm_var": 0.003409985625982037, + "learning_rate": 5e-05, + "loss": 0.1827, + "loss/crossentropy": 2.672878086566925, + "loss/hidden": 0.0, + "loss/logits": 0.1826501600444317, + "loss/reg": 1.9504698514938354, + "step": 795 + }, + { + "epoch": 0.00796, + "grad_norm": 0.3580264747142792, + "grad_norm_var": 0.0034518512961513536, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.8564891815185547, + "loss/hidden": 0.0, + "loss/logits": 0.17409207299351692, + "loss/reg": 1.9492477178573608, + "step": 796 + }, + { + "epoch": 0.00797, + "grad_norm": 0.3552623689174652, + "grad_norm_var": 0.0030235748866805395, + "learning_rate": 5e-05, + "loss": 0.1679, + "loss/crossentropy": 2.9642611145973206, + "loss/hidden": 0.0, + "loss/logits": 0.16786304488778114, + "loss/reg": 1.9484763145446777, + "step": 797 + }, + { + "epoch": 0.00798, + "grad_norm": 0.37029561400413513, + "grad_norm_var": 0.0029878997549970957, + "learning_rate": 5e-05, + "loss": 0.1837, + "loss/crossentropy": 2.7581509947776794, + "loss/hidden": 0.0, + "loss/logits": 0.18365685641765594, + "loss/reg": 1.946696400642395, + "step": 798 + }, + { + "epoch": 0.00799, + "grad_norm": 0.37257152795791626, + "grad_norm_var": 0.00285137706672662, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.766197443008423, + "loss/hidden": 0.0, + "loss/logits": 0.17676663026213646, + "loss/reg": 1.9451103210449219, + "step": 799 + }, + { + "epoch": 0.008, + "grad_norm": 0.3937225043773651, + "grad_norm_var": 0.0028245897421089812, + "learning_rate": 5e-05, + "loss": 0.1593, + "loss/crossentropy": 2.968823492527008, + "loss/hidden": 0.0, + "loss/logits": 0.15929469466209412, + "loss/reg": 1.943403959274292, + "step": 800 + }, + { + "epoch": 0.00801, + "grad_norm": 0.5229995846748352, + "grad_norm_var": 0.003870799660257873, + "learning_rate": 5e-05, + "loss": 0.1904, + "loss/crossentropy": 2.5733524560928345, + "loss/hidden": 0.0, + "loss/logits": 0.19040565192699432, + "loss/reg": 1.9423651695251465, + "step": 801 + }, + { + "epoch": 0.00802, + "grad_norm": 0.4087795913219452, + "grad_norm_var": 0.003885163701405114, + "learning_rate": 5e-05, + "loss": 0.2011, + "loss/crossentropy": 2.643693685531616, + "loss/hidden": 0.0, + "loss/logits": 0.20110392943024635, + "loss/reg": 1.941137433052063, + "step": 802 + }, + { + "epoch": 0.00803, + "grad_norm": 0.369555801153183, + "grad_norm_var": 0.0018963737991296507, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.702915072441101, + "loss/hidden": 0.0, + "loss/logits": 0.17682579904794693, + "loss/reg": 1.9396127462387085, + "step": 803 + }, + { + "epoch": 0.00804, + "grad_norm": 0.3822772204875946, + "grad_norm_var": 0.001859244513831604, + "learning_rate": 5e-05, + "loss": 0.1674, + "loss/crossentropy": 2.8051819801330566, + "loss/hidden": 0.0, + "loss/logits": 0.1674252152442932, + "loss/reg": 1.9382424354553223, + "step": 804 + }, + { + "epoch": 0.00805, + "grad_norm": 0.42195388674736023, + "grad_norm_var": 0.0018187903132861672, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.7962412238121033, + "loss/hidden": 0.0, + "loss/logits": 0.19819854572415352, + "loss/reg": 1.9367636442184448, + "step": 805 + }, + { + "epoch": 0.00806, + "grad_norm": 0.39215224981307983, + "grad_norm_var": 0.001803471303692028, + "learning_rate": 5e-05, + "loss": 0.1862, + "loss/crossentropy": 2.838093400001526, + "loss/hidden": 0.0, + "loss/logits": 0.1862441450357437, + "loss/reg": 1.9356895685195923, + "step": 806 + }, + { + "epoch": 0.00807, + "grad_norm": 0.36561474204063416, + "grad_norm_var": 0.0017388941933010808, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.8486820459365845, + "loss/hidden": 0.0, + "loss/logits": 0.16586757823824883, + "loss/reg": 1.9336965084075928, + "step": 807 + }, + { + "epoch": 0.00808, + "grad_norm": 0.3940856456756592, + "grad_norm_var": 0.0016146705961780842, + "learning_rate": 5e-05, + "loss": 0.1695, + "loss/crossentropy": 2.7590547800064087, + "loss/hidden": 0.0, + "loss/logits": 0.16953302919864655, + "loss/reg": 1.9318287372589111, + "step": 808 + }, + { + "epoch": 0.00809, + "grad_norm": 0.37031009793281555, + "grad_norm_var": 0.0015860965038246484, + "learning_rate": 5e-05, + "loss": 0.1663, + "loss/crossentropy": 2.803991198539734, + "loss/hidden": 0.0, + "loss/logits": 0.16627426072955132, + "loss/reg": 1.9298937320709229, + "step": 809 + }, + { + "epoch": 0.0081, + "grad_norm": 0.36467787623405457, + "grad_norm_var": 0.001618743456786816, + "learning_rate": 5e-05, + "loss": 0.1798, + "loss/crossentropy": 2.7522680163383484, + "loss/hidden": 0.0, + "loss/logits": 0.17983945459127426, + "loss/reg": 1.928220272064209, + "step": 810 + }, + { + "epoch": 0.00811, + "grad_norm": 0.644191324710846, + "grad_norm_var": 0.005662418748027209, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.9207261204719543, + "loss/hidden": 0.0, + "loss/logits": 0.18232716247439384, + "loss/reg": 1.9264674186706543, + "step": 811 + }, + { + "epoch": 0.00812, + "grad_norm": 0.4135313034057617, + "grad_norm_var": 0.00550433789682554, + "learning_rate": 5e-05, + "loss": 0.1754, + "loss/crossentropy": 2.78128319978714, + "loss/hidden": 0.0, + "loss/logits": 0.17538663744926453, + "loss/reg": 1.9246680736541748, + "step": 812 + }, + { + "epoch": 0.00813, + "grad_norm": 0.44594907760620117, + "grad_norm_var": 0.005370096537218135, + "learning_rate": 5e-05, + "loss": 0.1898, + "loss/crossentropy": 2.7069836258888245, + "loss/hidden": 0.0, + "loss/logits": 0.18982965499162674, + "loss/reg": 1.9229440689086914, + "step": 813 + }, + { + "epoch": 0.00814, + "grad_norm": 0.41460415720939636, + "grad_norm_var": 0.005231401879877403, + "learning_rate": 5e-05, + "loss": 0.1965, + "loss/crossentropy": 2.658607244491577, + "loss/hidden": 0.0, + "loss/logits": 0.1964995227754116, + "loss/reg": 1.9211541414260864, + "step": 814 + }, + { + "epoch": 0.00815, + "grad_norm": 0.40847840905189514, + "grad_norm_var": 0.0050977892227572616, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.789508819580078, + "loss/hidden": 0.0, + "loss/logits": 0.18652214854955673, + "loss/reg": 1.91935396194458, + "step": 815 + }, + { + "epoch": 0.00816, + "grad_norm": 0.39475539326667786, + "grad_norm_var": 0.005094298258556392, + "learning_rate": 5e-05, + "loss": 0.1876, + "loss/crossentropy": 2.771743655204773, + "loss/hidden": 0.0, + "loss/logits": 0.18758049979805946, + "loss/reg": 1.9176177978515625, + "step": 816 + }, + { + "epoch": 0.00817, + "grad_norm": 0.3788897395133972, + "grad_norm_var": 0.004405869730473085, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.807315766811371, + "loss/hidden": 0.0, + "loss/logits": 0.17645375058054924, + "loss/reg": 1.9157112836837769, + "step": 817 + }, + { + "epoch": 0.00818, + "grad_norm": 0.3527612090110779, + "grad_norm_var": 0.004615691680188668, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.8354954719543457, + "loss/hidden": 0.0, + "loss/logits": 0.16440149024128914, + "loss/reg": 1.914186716079712, + "step": 818 + }, + { + "epoch": 0.00819, + "grad_norm": 0.45434367656707764, + "grad_norm_var": 0.004640431192599914, + "learning_rate": 5e-05, + "loss": 0.2004, + "loss/crossentropy": 2.7823927998542786, + "loss/hidden": 0.0, + "loss/logits": 0.20038331300020218, + "loss/reg": 1.9128350019454956, + "step": 819 + }, + { + "epoch": 0.0082, + "grad_norm": 0.4440554082393646, + "grad_norm_var": 0.004630750512825665, + "learning_rate": 5e-05, + "loss": 0.198, + "loss/crossentropy": 2.7826399207115173, + "loss/hidden": 0.0, + "loss/logits": 0.19803617522120476, + "loss/reg": 1.9112329483032227, + "step": 820 + }, + { + "epoch": 0.00821, + "grad_norm": 0.8357638120651245, + "grad_norm_var": 0.015646654980531947, + "learning_rate": 5e-05, + "loss": 0.2236, + "loss/crossentropy": 2.855618476867676, + "loss/hidden": 0.0, + "loss/logits": 0.22355607897043228, + "loss/reg": 1.9094618558883667, + "step": 821 + }, + { + "epoch": 0.00822, + "grad_norm": 0.369484543800354, + "grad_norm_var": 0.01582983572015086, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.7591440081596375, + "loss/hidden": 0.0, + "loss/logits": 0.16877064853906631, + "loss/reg": 1.9078381061553955, + "step": 822 + }, + { + "epoch": 0.00823, + "grad_norm": 0.3682078421115875, + "grad_norm_var": 0.015804289096973827, + "learning_rate": 5e-05, + "loss": 0.1661, + "loss/crossentropy": 2.6714991331100464, + "loss/hidden": 0.0, + "loss/logits": 0.16609660163521767, + "loss/reg": 1.9066871404647827, + "step": 823 + }, + { + "epoch": 0.00824, + "grad_norm": 0.4925972521305084, + "grad_norm_var": 0.015796176553567597, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.690047025680542, + "loss/hidden": 0.0, + "loss/logits": 0.19832593947649002, + "loss/reg": 1.9051308631896973, + "step": 824 + }, + { + "epoch": 0.00825, + "grad_norm": 0.4004671275615692, + "grad_norm_var": 0.01554450060197241, + "learning_rate": 5e-05, + "loss": 0.1867, + "loss/crossentropy": 3.0430662631988525, + "loss/hidden": 0.0, + "loss/logits": 0.18668686598539352, + "loss/reg": 1.9037506580352783, + "step": 825 + }, + { + "epoch": 0.00826, + "grad_norm": 0.40644222497940063, + "grad_norm_var": 0.01518439463368978, + "learning_rate": 5e-05, + "loss": 0.1992, + "loss/crossentropy": 2.856709599494934, + "loss/hidden": 0.0, + "loss/logits": 0.19924000278115273, + "loss/reg": 1.9035279750823975, + "step": 826 + }, + { + "epoch": 0.00827, + "grad_norm": 0.387662410736084, + "grad_norm_var": 0.012707668169475368, + "learning_rate": 5e-05, + "loss": 0.171, + "loss/crossentropy": 2.8205041885375977, + "loss/hidden": 0.0, + "loss/logits": 0.17101648449897766, + "loss/reg": 1.9019997119903564, + "step": 827 + }, + { + "epoch": 0.00828, + "grad_norm": 0.4229760766029358, + "grad_norm_var": 0.012685578660426963, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.8298428058624268, + "loss/hidden": 0.0, + "loss/logits": 0.1704978421330452, + "loss/reg": 1.9013198614120483, + "step": 828 + }, + { + "epoch": 0.00829, + "grad_norm": 0.4192207455635071, + "grad_norm_var": 0.012695092968616347, + "learning_rate": 5e-05, + "loss": 0.1699, + "loss/crossentropy": 2.8572763800621033, + "loss/hidden": 0.0, + "loss/logits": 0.16986168175935745, + "loss/reg": 1.9007604122161865, + "step": 829 + }, + { + "epoch": 0.0083, + "grad_norm": 0.3887600898742676, + "grad_norm_var": 0.012805118489640084, + "learning_rate": 5e-05, + "loss": 0.202, + "loss/crossentropy": 2.786255419254303, + "loss/hidden": 0.0, + "loss/logits": 0.20195355266332626, + "loss/reg": 1.9000667333602905, + "step": 830 + }, + { + "epoch": 0.00831, + "grad_norm": 0.48604434728622437, + "grad_norm_var": 0.012929568590754843, + "learning_rate": 5e-05, + "loss": 0.1961, + "loss/crossentropy": 2.845152735710144, + "loss/hidden": 0.0, + "loss/logits": 0.19609695672988892, + "loss/reg": 1.899495005607605, + "step": 831 + }, + { + "epoch": 0.00832, + "grad_norm": 0.38712021708488464, + "grad_norm_var": 0.012976881832390848, + "learning_rate": 5e-05, + "loss": 0.1889, + "loss/crossentropy": 2.7007412910461426, + "loss/hidden": 0.0, + "loss/logits": 0.1888689175248146, + "loss/reg": 1.8983198404312134, + "step": 832 + }, + { + "epoch": 0.00833, + "grad_norm": 0.3749590814113617, + "grad_norm_var": 0.013008393945473115, + "learning_rate": 5e-05, + "loss": 0.1785, + "loss/crossentropy": 2.607687532901764, + "loss/hidden": 0.0, + "loss/logits": 0.1784559190273285, + "loss/reg": 1.897759199142456, + "step": 833 + }, + { + "epoch": 0.00834, + "grad_norm": 0.35202544927597046, + "grad_norm_var": 0.013016684761580717, + "learning_rate": 5e-05, + "loss": 0.1707, + "loss/crossentropy": 2.7777926325798035, + "loss/hidden": 0.0, + "loss/logits": 0.17071311548352242, + "loss/reg": 1.8961632251739502, + "step": 834 + }, + { + "epoch": 0.00835, + "grad_norm": 0.3441760540008545, + "grad_norm_var": 0.013518763280912912, + "learning_rate": 5e-05, + "loss": 0.1615, + "loss/crossentropy": 2.835566759109497, + "loss/hidden": 0.0, + "loss/logits": 0.1615000143647194, + "loss/reg": 1.8950566053390503, + "step": 835 + }, + { + "epoch": 0.00836, + "grad_norm": 0.362005352973938, + "grad_norm_var": 0.013785734718565416, + "learning_rate": 5e-05, + "loss": 0.1631, + "loss/crossentropy": 2.818268299102783, + "loss/hidden": 0.0, + "loss/logits": 0.16310900822281837, + "loss/reg": 1.8935774564743042, + "step": 836 + }, + { + "epoch": 0.00837, + "grad_norm": 0.3725143074989319, + "grad_norm_var": 0.0018186987426010584, + "learning_rate": 5e-05, + "loss": 0.1929, + "loss/crossentropy": 2.763745427131653, + "loss/hidden": 0.0, + "loss/logits": 0.192863829433918, + "loss/reg": 1.8919721841812134, + "step": 837 + }, + { + "epoch": 0.00838, + "grad_norm": 0.39604651927948, + "grad_norm_var": 0.0017691837659245461, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.729005455970764, + "loss/hidden": 0.0, + "loss/logits": 0.18469301983714104, + "loss/reg": 1.8908017873764038, + "step": 838 + }, + { + "epoch": 0.00839, + "grad_norm": 0.39175912737846375, + "grad_norm_var": 0.001711627371570279, + "learning_rate": 5e-05, + "loss": 0.1841, + "loss/crossentropy": 2.8563897609710693, + "loss/hidden": 0.0, + "loss/logits": 0.18407713249325752, + "loss/reg": 1.8892946243286133, + "step": 839 + }, + { + "epoch": 0.0084, + "grad_norm": 0.3497207462787628, + "grad_norm_var": 0.0012053613127933737, + "learning_rate": 5e-05, + "loss": 0.1657, + "loss/crossentropy": 2.9635773301124573, + "loss/hidden": 0.0, + "loss/logits": 0.1656595915555954, + "loss/reg": 1.8886394500732422, + "step": 840 + }, + { + "epoch": 0.00841, + "grad_norm": 0.36070436239242554, + "grad_norm_var": 0.0012493146014174172, + "learning_rate": 5e-05, + "loss": 0.1659, + "loss/crossentropy": 2.905772030353546, + "loss/hidden": 0.0, + "loss/logits": 0.16585366800427437, + "loss/reg": 1.8883072137832642, + "step": 841 + }, + { + "epoch": 0.00842, + "grad_norm": 0.5194427371025085, + "grad_norm_var": 0.002330769361460483, + "learning_rate": 5e-05, + "loss": 0.1903, + "loss/crossentropy": 2.914414703845978, + "loss/hidden": 0.0, + "loss/logits": 0.1903173327445984, + "loss/reg": 1.8872804641723633, + "step": 842 + }, + { + "epoch": 0.00843, + "grad_norm": 0.3658028841018677, + "grad_norm_var": 0.0023811347132161485, + "learning_rate": 5e-05, + "loss": 0.1722, + "loss/crossentropy": 2.8683270812034607, + "loss/hidden": 0.0, + "loss/logits": 0.17216329649090767, + "loss/reg": 1.8860183954238892, + "step": 843 + }, + { + "epoch": 0.00844, + "grad_norm": 0.3355120122432709, + "grad_norm_var": 0.0025135271396979795, + "learning_rate": 5e-05, + "loss": 0.166, + "loss/crossentropy": 2.7672330141067505, + "loss/hidden": 0.0, + "loss/logits": 0.16596197709441185, + "loss/reg": 1.8848904371261597, + "step": 844 + }, + { + "epoch": 0.00845, + "grad_norm": 0.45907968282699585, + "grad_norm_var": 0.002779472587279837, + "learning_rate": 5e-05, + "loss": 0.1881, + "loss/crossentropy": 2.79125440120697, + "loss/hidden": 0.0, + "loss/logits": 0.1881270818412304, + "loss/reg": 1.8831804990768433, + "step": 845 + }, + { + "epoch": 0.00846, + "grad_norm": 0.3753393888473511, + "grad_norm_var": 0.0027935829770950843, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.753562033176422, + "loss/hidden": 0.0, + "loss/logits": 0.18013736233115196, + "loss/reg": 1.8817567825317383, + "step": 846 + }, + { + "epoch": 0.00847, + "grad_norm": 0.41996800899505615, + "grad_norm_var": 0.002216029114339835, + "learning_rate": 5e-05, + "loss": 0.1871, + "loss/crossentropy": 2.8630106449127197, + "loss/hidden": 0.0, + "loss/logits": 0.18711163103580475, + "loss/reg": 1.8805315494537354, + "step": 847 + }, + { + "epoch": 0.00848, + "grad_norm": 0.40139615535736084, + "grad_norm_var": 0.0022320677834542836, + "learning_rate": 5e-05, + "loss": 0.182, + "loss/crossentropy": 2.8663435578346252, + "loss/hidden": 0.0, + "loss/logits": 0.18201814219355583, + "loss/reg": 1.8795160055160522, + "step": 848 + }, + { + "epoch": 0.00849, + "grad_norm": 0.44251078367233276, + "grad_norm_var": 0.0024153193390402117, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.8493316173553467, + "loss/hidden": 0.0, + "loss/logits": 0.1721041165292263, + "loss/reg": 1.8777539730072021, + "step": 849 + }, + { + "epoch": 0.0085, + "grad_norm": 0.39363330602645874, + "grad_norm_var": 0.00231007314672006, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.8225064277648926, + "loss/hidden": 0.0, + "loss/logits": 0.19327203929424286, + "loss/reg": 1.8762940168380737, + "step": 850 + }, + { + "epoch": 0.00851, + "grad_norm": 0.3834942579269409, + "grad_norm_var": 0.0021502092497398652, + "learning_rate": 5e-05, + "loss": 0.1919, + "loss/crossentropy": 2.7274433970451355, + "loss/hidden": 0.0, + "loss/logits": 0.19194044917821884, + "loss/reg": 1.8752055168151855, + "step": 851 + }, + { + "epoch": 0.00852, + "grad_norm": 0.36249086260795593, + "grad_norm_var": 0.0021480519578248518, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.725217640399933, + "loss/hidden": 0.0, + "loss/logits": 0.17594841867685318, + "loss/reg": 1.8736830949783325, + "step": 852 + }, + { + "epoch": 0.00853, + "grad_norm": 0.3869001269340515, + "grad_norm_var": 0.002116727725912885, + "learning_rate": 5e-05, + "loss": 0.1823, + "loss/crossentropy": 2.738001227378845, + "loss/hidden": 0.0, + "loss/logits": 0.1822870336472988, + "loss/reg": 1.8728076219558716, + "step": 853 + }, + { + "epoch": 0.00854, + "grad_norm": 0.3708244860172272, + "grad_norm_var": 0.002157970353941248, + "learning_rate": 5e-05, + "loss": 0.1668, + "loss/crossentropy": 2.788993239402771, + "loss/hidden": 0.0, + "loss/logits": 0.16682880371809006, + "loss/reg": 1.8720353841781616, + "step": 854 + }, + { + "epoch": 0.00855, + "grad_norm": 0.372335821390152, + "grad_norm_var": 0.002189712517136307, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.813368082046509, + "loss/hidden": 0.0, + "loss/logits": 0.17184938862919807, + "loss/reg": 1.8711519241333008, + "step": 855 + }, + { + "epoch": 0.00856, + "grad_norm": 0.35767483711242676, + "grad_norm_var": 0.0021470276840197164, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.779674708843231, + "loss/hidden": 0.0, + "loss/logits": 0.17495984584093094, + "loss/reg": 1.869391679763794, + "step": 856 + }, + { + "epoch": 0.00857, + "grad_norm": 0.3517981767654419, + "grad_norm_var": 0.002191754274186038, + "learning_rate": 5e-05, + "loss": 0.1688, + "loss/crossentropy": 2.7776423692703247, + "loss/hidden": 0.0, + "loss/logits": 0.16876182705163956, + "loss/reg": 1.868484616279602, + "step": 857 + }, + { + "epoch": 0.00858, + "grad_norm": 0.8127824664115906, + "grad_norm_var": 0.012490247842596114, + "learning_rate": 5e-05, + "loss": 0.2224, + "loss/crossentropy": 2.9876713156700134, + "loss/hidden": 0.0, + "loss/logits": 0.2223958522081375, + "loss/reg": 1.8681334257125854, + "step": 858 + }, + { + "epoch": 0.00859, + "grad_norm": 0.4339921474456787, + "grad_norm_var": 0.012361098720851383, + "learning_rate": 5e-05, + "loss": 0.1752, + "loss/crossentropy": 2.8306267857551575, + "loss/hidden": 0.0, + "loss/logits": 0.17517483979463577, + "loss/reg": 1.8669747114181519, + "step": 859 + }, + { + "epoch": 0.0086, + "grad_norm": 0.5807726383209229, + "grad_norm_var": 0.013480947234538975, + "learning_rate": 5e-05, + "loss": 0.1787, + "loss/crossentropy": 2.731403112411499, + "loss/hidden": 0.0, + "loss/logits": 0.17869474738836288, + "loss/reg": 1.865167498588562, + "step": 860 + }, + { + "epoch": 0.00861, + "grad_norm": 0.377247154712677, + "grad_norm_var": 0.013599237642109623, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.8269473910331726, + "loss/hidden": 0.0, + "loss/logits": 0.1713988333940506, + "loss/reg": 1.8640793561935425, + "step": 861 + }, + { + "epoch": 0.00862, + "grad_norm": 0.37849879264831543, + "grad_norm_var": 0.013578332002263205, + "learning_rate": 5e-05, + "loss": 0.169, + "loss/crossentropy": 2.8433790802955627, + "loss/hidden": 0.0, + "loss/logits": 0.16895778477191925, + "loss/reg": 1.8629158735275269, + "step": 862 + }, + { + "epoch": 0.00863, + "grad_norm": 0.4124751091003418, + "grad_norm_var": 0.013588511645486826, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.7584823966026306, + "loss/hidden": 0.0, + "loss/logits": 0.18026690557599068, + "loss/reg": 1.8609492778778076, + "step": 863 + }, + { + "epoch": 0.00864, + "grad_norm": 0.37336814403533936, + "grad_norm_var": 0.013730216300815038, + "learning_rate": 5e-05, + "loss": 0.1707, + "loss/crossentropy": 2.7589592933654785, + "loss/hidden": 0.0, + "loss/logits": 0.17070752009749413, + "loss/reg": 1.8597759008407593, + "step": 864 + }, + { + "epoch": 0.00865, + "grad_norm": 0.8337989449501038, + "grad_norm_var": 0.02424293784323857, + "learning_rate": 5e-05, + "loss": 0.1781, + "loss/crossentropy": 2.8036633133888245, + "loss/hidden": 0.0, + "loss/logits": 0.17813356593251228, + "loss/reg": 1.8581993579864502, + "step": 865 + }, + { + "epoch": 0.00866, + "grad_norm": 0.35601744055747986, + "grad_norm_var": 0.02460846166740538, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.715932846069336, + "loss/hidden": 0.0, + "loss/logits": 0.17593519389629364, + "loss/reg": 1.8565593957901, + "step": 866 + }, + { + "epoch": 0.00867, + "grad_norm": 0.425502747297287, + "grad_norm_var": 0.02436568774250706, + "learning_rate": 5e-05, + "loss": 0.1821, + "loss/crossentropy": 2.7275202870368958, + "loss/hidden": 0.0, + "loss/logits": 0.18210354447364807, + "loss/reg": 1.8554532527923584, + "step": 867 + }, + { + "epoch": 0.00868, + "grad_norm": 0.3844553232192993, + "grad_norm_var": 0.024142035969486622, + "learning_rate": 5e-05, + "loss": 0.1796, + "loss/crossentropy": 2.8464134335517883, + "loss/hidden": 0.0, + "loss/logits": 0.1795903705060482, + "loss/reg": 1.855208396911621, + "step": 868 + }, + { + "epoch": 0.00869, + "grad_norm": 0.35618311166763306, + "grad_norm_var": 0.02446160042257303, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.87707781791687, + "loss/hidden": 0.0, + "loss/logits": 0.17583919316530228, + "loss/reg": 1.8549201488494873, + "step": 869 + }, + { + "epoch": 0.0087, + "grad_norm": 0.41672077775001526, + "grad_norm_var": 0.024117258377413187, + "learning_rate": 5e-05, + "loss": 0.1759, + "loss/crossentropy": 2.71548855304718, + "loss/hidden": 0.0, + "loss/logits": 0.1758672632277012, + "loss/reg": 1.8541263341903687, + "step": 870 + }, + { + "epoch": 0.00871, + "grad_norm": 0.3646162450313568, + "grad_norm_var": 0.024202440513241764, + "learning_rate": 5e-05, + "loss": 0.186, + "loss/crossentropy": 2.786548674106598, + "loss/hidden": 0.0, + "loss/logits": 0.18596061319112778, + "loss/reg": 1.8524823188781738, + "step": 871 + }, + { + "epoch": 0.00872, + "grad_norm": 0.37939974665641785, + "grad_norm_var": 0.023961625350842352, + "learning_rate": 5e-05, + "loss": 0.1748, + "loss/crossentropy": 2.835165321826935, + "loss/hidden": 0.0, + "loss/logits": 0.17476912215352058, + "loss/reg": 1.8506335020065308, + "step": 872 + }, + { + "epoch": 0.00873, + "grad_norm": 0.3887036442756653, + "grad_norm_var": 0.023551954015331347, + "learning_rate": 5e-05, + "loss": 0.1789, + "loss/crossentropy": 2.732525408267975, + "loss/hidden": 0.0, + "loss/logits": 0.17892278358340263, + "loss/reg": 1.849176287651062, + "step": 873 + }, + { + "epoch": 0.00874, + "grad_norm": 0.387320876121521, + "grad_norm_var": 0.014549813961372993, + "learning_rate": 5e-05, + "loss": 0.1941, + "loss/crossentropy": 2.792769968509674, + "loss/hidden": 0.0, + "loss/logits": 0.19409611076116562, + "loss/reg": 1.848083734512329, + "step": 874 + }, + { + "epoch": 0.00875, + "grad_norm": 0.3818178176879883, + "grad_norm_var": 0.014678730624869341, + "learning_rate": 5e-05, + "loss": 0.1865, + "loss/crossentropy": 2.762765884399414, + "loss/hidden": 0.0, + "loss/logits": 0.18646146729588509, + "loss/reg": 1.8465189933776855, + "step": 875 + }, + { + "epoch": 0.00876, + "grad_norm": 0.36014363169670105, + "grad_norm_var": 0.013132955726787365, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.8113619089126587, + "loss/hidden": 0.0, + "loss/logits": 0.1847200095653534, + "loss/reg": 1.8454153537750244, + "step": 876 + }, + { + "epoch": 0.00877, + "grad_norm": 0.3916279077529907, + "grad_norm_var": 0.013081129963419124, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.767706513404846, + "loss/hidden": 0.0, + "loss/logits": 0.18097594752907753, + "loss/reg": 1.8445773124694824, + "step": 877 + }, + { + "epoch": 0.00878, + "grad_norm": 0.38396528363227844, + "grad_norm_var": 0.013058641234249413, + "learning_rate": 5e-05, + "loss": 0.1799, + "loss/crossentropy": 2.750400483608246, + "loss/hidden": 0.0, + "loss/logits": 0.179927259683609, + "loss/reg": 1.8441030979156494, + "step": 878 + }, + { + "epoch": 0.00879, + "grad_norm": 0.3784838020801544, + "grad_norm_var": 0.013129867131250705, + "learning_rate": 5e-05, + "loss": 0.1783, + "loss/crossentropy": 2.6576608419418335, + "loss/hidden": 0.0, + "loss/logits": 0.1783200539648533, + "loss/reg": 1.842759370803833, + "step": 879 + }, + { + "epoch": 0.0088, + "grad_norm": 0.3373940885066986, + "grad_norm_var": 0.013387093786405535, + "learning_rate": 5e-05, + "loss": 0.1665, + "loss/crossentropy": 2.7494055032730103, + "loss/hidden": 0.0, + "loss/logits": 0.16646360978484154, + "loss/reg": 1.8418775796890259, + "step": 880 + }, + { + "epoch": 0.00881, + "grad_norm": 0.3743676543235779, + "grad_norm_var": 0.000488954453456858, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.7373632192611694, + "loss/hidden": 0.0, + "loss/logits": 0.17177791520953178, + "loss/reg": 1.8405792713165283, + "step": 881 + }, + { + "epoch": 0.00882, + "grad_norm": 0.36398422718048096, + "grad_norm_var": 0.0004683277690547882, + "learning_rate": 5e-05, + "loss": 0.1713, + "loss/crossentropy": 2.689941644668579, + "loss/hidden": 0.0, + "loss/logits": 0.1712586209177971, + "loss/reg": 1.8396137952804565, + "step": 882 + }, + { + "epoch": 0.00883, + "grad_norm": 0.36932700872421265, + "grad_norm_var": 0.0003222525763987627, + "learning_rate": 5e-05, + "loss": 0.1738, + "loss/crossentropy": 3.0094715356826782, + "loss/hidden": 0.0, + "loss/logits": 0.17375321686267853, + "loss/reg": 1.8387012481689453, + "step": 883 + }, + { + "epoch": 0.00884, + "grad_norm": 0.37647050619125366, + "grad_norm_var": 0.0003174026053568609, + "learning_rate": 5e-05, + "loss": 0.1734, + "loss/crossentropy": 2.675420820713043, + "loss/hidden": 0.0, + "loss/logits": 0.17337032034993172, + "loss/reg": 1.8372994661331177, + "step": 884 + }, + { + "epoch": 0.00885, + "grad_norm": 0.3657122850418091, + "grad_norm_var": 0.0002983341146215642, + "learning_rate": 5e-05, + "loss": 0.175, + "loss/crossentropy": 2.785289704799652, + "loss/hidden": 0.0, + "loss/logits": 0.1750231385231018, + "loss/reg": 1.836666464805603, + "step": 885 + }, + { + "epoch": 0.00886, + "grad_norm": 0.3565351963043213, + "grad_norm_var": 0.00019998832643003023, + "learning_rate": 5e-05, + "loss": 0.1611, + "loss/crossentropy": 2.67407763004303, + "loss/hidden": 0.0, + "loss/logits": 0.16111686453223228, + "loss/reg": 1.8361395597457886, + "step": 886 + }, + { + "epoch": 0.00887, + "grad_norm": 0.38317063450813293, + "grad_norm_var": 0.00020202125672466782, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.7103012204170227, + "loss/hidden": 0.0, + "loss/logits": 0.17928585410118103, + "loss/reg": 1.8344452381134033, + "step": 887 + }, + { + "epoch": 0.00888, + "grad_norm": 0.39307963848114014, + "grad_norm_var": 0.00022420215532017082, + "learning_rate": 5e-05, + "loss": 0.1899, + "loss/crossentropy": 2.7159000635147095, + "loss/hidden": 0.0, + "loss/logits": 0.18990719318389893, + "loss/reg": 1.8328790664672852, + "step": 888 + }, + { + "epoch": 0.00889, + "grad_norm": 0.35862287878990173, + "grad_norm_var": 0.00022381402201028245, + "learning_rate": 5e-05, + "loss": 0.1834, + "loss/crossentropy": 2.847196877002716, + "loss/hidden": 0.0, + "loss/logits": 0.18342823907732964, + "loss/reg": 1.8318103551864624, + "step": 889 + }, + { + "epoch": 0.0089, + "grad_norm": 0.3539126515388489, + "grad_norm_var": 0.0002281156381275314, + "learning_rate": 5e-05, + "loss": 0.1777, + "loss/crossentropy": 2.853213608264923, + "loss/hidden": 0.0, + "loss/logits": 0.1777319796383381, + "loss/reg": 1.829829454421997, + "step": 890 + }, + { + "epoch": 0.00891, + "grad_norm": 0.41561535000801086, + "grad_norm_var": 0.000350336348254295, + "learning_rate": 5e-05, + "loss": 0.1895, + "loss/crossentropy": 2.6837574243545532, + "loss/hidden": 0.0, + "loss/logits": 0.18954132869839668, + "loss/reg": 1.8281813859939575, + "step": 891 + }, + { + "epoch": 0.00892, + "grad_norm": 0.3593007028102875, + "grad_norm_var": 0.00035178644306217054, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.7398064136505127, + "loss/hidden": 0.0, + "loss/logits": 0.163859985768795, + "loss/reg": 1.8272664546966553, + "step": 892 + }, + { + "epoch": 0.00893, + "grad_norm": 0.3928586542606354, + "grad_norm_var": 0.00035500389449958367, + "learning_rate": 5e-05, + "loss": 0.187, + "loss/crossentropy": 2.7214816212654114, + "loss/hidden": 0.0, + "loss/logits": 0.18696707114577293, + "loss/reg": 1.8260765075683594, + "step": 893 + }, + { + "epoch": 0.00894, + "grad_norm": 0.38060957193374634, + "grad_norm_var": 0.00035065611870696014, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.8307188153266907, + "loss/hidden": 0.0, + "loss/logits": 0.18159236386418343, + "loss/reg": 1.8246755599975586, + "step": 894 + }, + { + "epoch": 0.00895, + "grad_norm": 0.34957438707351685, + "grad_norm_var": 0.0003796919232549693, + "learning_rate": 5e-05, + "loss": 0.1801, + "loss/crossentropy": 2.7387137413024902, + "loss/hidden": 0.0, + "loss/logits": 0.18012140691280365, + "loss/reg": 1.8235573768615723, + "step": 895 + }, + { + "epoch": 0.00896, + "grad_norm": 0.3666534721851349, + "grad_norm_var": 0.00030342620785123544, + "learning_rate": 5e-05, + "loss": 0.2036, + "loss/crossentropy": 2.758805215358734, + "loss/hidden": 0.0, + "loss/logits": 0.20357270538806915, + "loss/reg": 1.8219776153564453, + "step": 896 + }, + { + "epoch": 0.00897, + "grad_norm": 0.5404136180877686, + "grad_norm_var": 0.0020682628614343557, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.970340132713318, + "loss/hidden": 0.0, + "loss/logits": 0.19332898035645485, + "loss/reg": 1.8206608295440674, + "step": 897 + }, + { + "epoch": 0.00898, + "grad_norm": 0.3982648551464081, + "grad_norm_var": 0.0020554109287465, + "learning_rate": 5e-05, + "loss": 0.1768, + "loss/crossentropy": 2.6868785619735718, + "loss/hidden": 0.0, + "loss/logits": 0.1768476814031601, + "loss/reg": 1.8192402124404907, + "step": 898 + }, + { + "epoch": 0.00899, + "grad_norm": 0.38741791248321533, + "grad_norm_var": 0.002038042531465663, + "learning_rate": 5e-05, + "loss": 0.1808, + "loss/crossentropy": 2.816374719142914, + "loss/hidden": 0.0, + "loss/logits": 0.18077386170625687, + "loss/reg": 1.8179749250411987, + "step": 899 + }, + { + "epoch": 0.009, + "grad_norm": 0.3847026526927948, + "grad_norm_var": 0.0020316665201686695, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.8102923035621643, + "loss/hidden": 0.0, + "loss/logits": 0.19821672514081, + "loss/reg": 1.8163453340530396, + "step": 900 + }, + { + "epoch": 0.00901, + "grad_norm": 0.3916863203048706, + "grad_norm_var": 0.0020013109603508254, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.883694589138031, + "loss/hidden": 0.0, + "loss/logits": 0.17775952070951462, + "loss/reg": 1.8147519826889038, + "step": 901 + }, + { + "epoch": 0.00902, + "grad_norm": 0.36217865347862244, + "grad_norm_var": 0.001979417665481912, + "learning_rate": 5e-05, + "loss": 0.1669, + "loss/crossentropy": 2.8100743293762207, + "loss/hidden": 0.0, + "loss/logits": 0.1668529324233532, + "loss/reg": 1.8133151531219482, + "step": 902 + }, + { + "epoch": 0.00903, + "grad_norm": 0.42102572321891785, + "grad_norm_var": 0.00204143104015622, + "learning_rate": 5e-05, + "loss": 0.181, + "loss/crossentropy": 2.8006924986839294, + "loss/hidden": 0.0, + "loss/logits": 0.18102795258164406, + "loss/reg": 1.8111281394958496, + "step": 903 + }, + { + "epoch": 0.00904, + "grad_norm": 0.388874351978302, + "grad_norm_var": 0.002041367346731493, + "learning_rate": 5e-05, + "loss": 0.1818, + "loss/crossentropy": 2.8004772067070007, + "loss/hidden": 0.0, + "loss/logits": 0.18177441507577896, + "loss/reg": 1.8097094297409058, + "step": 904 + }, + { + "epoch": 0.00905, + "grad_norm": 0.3800460696220398, + "grad_norm_var": 0.0019783346485648203, + "learning_rate": 5e-05, + "loss": 0.176, + "loss/crossentropy": 2.693452537059784, + "loss/hidden": 0.0, + "loss/logits": 0.1760004386305809, + "loss/reg": 1.8086622953414917, + "step": 905 + }, + { + "epoch": 0.00906, + "grad_norm": 0.32960033416748047, + "grad_norm_var": 0.0021389732007735863, + "learning_rate": 5e-05, + "loss": 0.1641, + "loss/crossentropy": 2.7406028509140015, + "loss/hidden": 0.0, + "loss/logits": 0.1641414910554886, + "loss/reg": 1.8070775270462036, + "step": 906 + }, + { + "epoch": 0.00907, + "grad_norm": 0.41262900829315186, + "grad_norm_var": 0.002129550660359725, + "learning_rate": 5e-05, + "loss": 0.1639, + "loss/crossentropy": 2.815558433532715, + "loss/hidden": 0.0, + "loss/logits": 0.1639426201581955, + "loss/reg": 1.80557119846344, + "step": 907 + }, + { + "epoch": 0.00908, + "grad_norm": 0.34168651700019836, + "grad_norm_var": 0.0022218976438497353, + "learning_rate": 5e-05, + "loss": 0.1689, + "loss/crossentropy": 2.7727773189544678, + "loss/hidden": 0.0, + "loss/logits": 0.16886158660054207, + "loss/reg": 1.8040578365325928, + "step": 908 + }, + { + "epoch": 0.00909, + "grad_norm": 0.3481311500072479, + "grad_norm_var": 0.0023254939668475396, + "learning_rate": 5e-05, + "loss": 0.1723, + "loss/crossentropy": 2.7662405967712402, + "loss/hidden": 0.0, + "loss/logits": 0.17232706770300865, + "loss/reg": 1.802317500114441, + "step": 909 + }, + { + "epoch": 0.0091, + "grad_norm": 0.34673434495925903, + "grad_norm_var": 0.0024236772610501315, + "learning_rate": 5e-05, + "loss": 0.1721, + "loss/crossentropy": 2.7547109723091125, + "loss/hidden": 0.0, + "loss/logits": 0.17211218550801277, + "loss/reg": 1.8002270460128784, + "step": 910 + }, + { + "epoch": 0.00911, + "grad_norm": 0.358116090297699, + "grad_norm_var": 0.002388630196925971, + "learning_rate": 5e-05, + "loss": 0.1778, + "loss/crossentropy": 2.8174885511398315, + "loss/hidden": 0.0, + "loss/logits": 0.17777465283870697, + "loss/reg": 1.7986141443252563, + "step": 911 + }, + { + "epoch": 0.00912, + "grad_norm": 0.37328216433525085, + "grad_norm_var": 0.0023752628687049343, + "learning_rate": 5e-05, + "loss": 0.1846, + "loss/crossentropy": 2.7423484921455383, + "loss/hidden": 0.0, + "loss/logits": 0.18462468683719635, + "loss/reg": 1.7969509363174438, + "step": 912 + }, + { + "epoch": 0.00913, + "grad_norm": 0.4073435366153717, + "grad_norm_var": 0.000729848525378903, + "learning_rate": 5e-05, + "loss": 0.1656, + "loss/crossentropy": 2.6962223649024963, + "loss/hidden": 0.0, + "loss/logits": 0.16563431546092033, + "loss/reg": 1.7949342727661133, + "step": 913 + }, + { + "epoch": 0.00914, + "grad_norm": 0.37720003724098206, + "grad_norm_var": 0.0006978068548327905, + "learning_rate": 5e-05, + "loss": 0.1853, + "loss/crossentropy": 2.710608184337616, + "loss/hidden": 0.0, + "loss/logits": 0.18529681861400604, + "loss/reg": 1.7920844554901123, + "step": 914 + }, + { + "epoch": 0.00915, + "grad_norm": 0.36010050773620605, + "grad_norm_var": 0.0007016424011595597, + "learning_rate": 5e-05, + "loss": 0.1833, + "loss/crossentropy": 2.8862733840942383, + "loss/hidden": 0.0, + "loss/logits": 0.1833462007343769, + "loss/reg": 1.7895822525024414, + "step": 915 + }, + { + "epoch": 0.00916, + "grad_norm": 0.35757148265838623, + "grad_norm_var": 0.0007087821480995478, + "learning_rate": 5e-05, + "loss": 0.1741, + "loss/crossentropy": 2.736583173274994, + "loss/hidden": 0.0, + "loss/logits": 0.1740923710167408, + "loss/reg": 1.7870407104492188, + "step": 916 + }, + { + "epoch": 0.00917, + "grad_norm": 0.38085147738456726, + "grad_norm_var": 0.0006880592910958772, + "learning_rate": 5e-05, + "loss": 0.1758, + "loss/crossentropy": 2.722678780555725, + "loss/hidden": 0.0, + "loss/logits": 0.17577889189124107, + "loss/reg": 1.78484308719635, + "step": 917 + }, + { + "epoch": 0.00918, + "grad_norm": 0.3618144989013672, + "grad_norm_var": 0.000688524329092799, + "learning_rate": 5e-05, + "loss": 0.1729, + "loss/crossentropy": 2.7107303738594055, + "loss/hidden": 0.0, + "loss/logits": 0.17288268730044365, + "loss/reg": 1.781936764717102, + "step": 918 + }, + { + "epoch": 0.00919, + "grad_norm": 0.4562152326107025, + "grad_norm_var": 0.000997994245971834, + "learning_rate": 5e-05, + "loss": 0.209, + "loss/crossentropy": 2.6911511421203613, + "loss/hidden": 0.0, + "loss/logits": 0.20904593169689178, + "loss/reg": 1.7793291807174683, + "step": 919 + }, + { + "epoch": 0.0092, + "grad_norm": 0.7892553806304932, + "grad_norm_var": 0.011823798595284762, + "learning_rate": 5e-05, + "loss": 0.2098, + "loss/crossentropy": 2.8024474382400513, + "loss/hidden": 0.0, + "loss/logits": 0.20984026044607162, + "loss/reg": 1.7767542600631714, + "step": 920 + }, + { + "epoch": 0.00921, + "grad_norm": 0.4952187240123749, + "grad_norm_var": 0.012365066103201613, + "learning_rate": 5e-05, + "loss": 0.2215, + "loss/crossentropy": 2.836692988872528, + "loss/hidden": 0.0, + "loss/logits": 0.22146976366639137, + "loss/reg": 1.77475905418396, + "step": 921 + }, + { + "epoch": 0.00922, + "grad_norm": 0.4500190317630768, + "grad_norm_var": 0.01204494814009713, + "learning_rate": 5e-05, + "loss": 0.1957, + "loss/crossentropy": 2.7740437984466553, + "loss/hidden": 0.0, + "loss/logits": 0.1957194283604622, + "loss/reg": 1.7725263833999634, + "step": 922 + }, + { + "epoch": 0.00923, + "grad_norm": 0.4018624424934387, + "grad_norm_var": 0.012053458598524087, + "learning_rate": 5e-05, + "loss": 0.1985, + "loss/crossentropy": 2.783832609653473, + "loss/hidden": 0.0, + "loss/logits": 0.19849245250225067, + "loss/reg": 1.7706068754196167, + "step": 923 + }, + { + "epoch": 0.00924, + "grad_norm": 0.4053579866886139, + "grad_norm_var": 0.01170279735803306, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.8584959506988525, + "loss/hidden": 0.0, + "loss/logits": 0.1846720688045025, + "loss/reg": 1.7687995433807373, + "step": 924 + }, + { + "epoch": 0.00925, + "grad_norm": 0.4355542063713074, + "grad_norm_var": 0.011379840002585932, + "learning_rate": 5e-05, + "loss": 0.1911, + "loss/crossentropy": 2.8072018027305603, + "loss/hidden": 0.0, + "loss/logits": 0.19105902686715126, + "loss/reg": 1.766356348991394, + "step": 925 + }, + { + "epoch": 0.00926, + "grad_norm": 0.41985246539115906, + "grad_norm_var": 0.010977469936536095, + "learning_rate": 5e-05, + "loss": 0.1772, + "loss/crossentropy": 2.825522303581238, + "loss/hidden": 0.0, + "loss/logits": 0.17719140276312828, + "loss/reg": 1.764008641242981, + "step": 926 + }, + { + "epoch": 0.00927, + "grad_norm": 0.4020366370677948, + "grad_norm_var": 0.010695516965112247, + "learning_rate": 5e-05, + "loss": 0.168, + "loss/crossentropy": 2.946666181087494, + "loss/hidden": 0.0, + "loss/logits": 0.16798604279756546, + "loss/reg": 1.7619134187698364, + "step": 927 + }, + { + "epoch": 0.00928, + "grad_norm": 0.4333237111568451, + "grad_norm_var": 0.01047000612816995, + "learning_rate": 5e-05, + "loss": 0.1732, + "loss/crossentropy": 2.904057264328003, + "loss/hidden": 0.0, + "loss/logits": 0.1731964722275734, + "loss/reg": 1.7594822645187378, + "step": 928 + }, + { + "epoch": 0.00929, + "grad_norm": 0.44914835691452026, + "grad_norm_var": 0.010434282299518182, + "learning_rate": 5e-05, + "loss": 0.1822, + "loss/crossentropy": 2.8881112933158875, + "loss/hidden": 0.0, + "loss/logits": 0.1821521669626236, + "loss/reg": 1.757930874824524, + "step": 929 + }, + { + "epoch": 0.0093, + "grad_norm": 0.8063428401947021, + "grad_norm_var": 0.018582235883409348, + "learning_rate": 5e-05, + "loss": 0.2108, + "loss/crossentropy": 3.000428855419159, + "loss/hidden": 0.0, + "loss/logits": 0.21083774790167809, + "loss/reg": 1.756495714187622, + "step": 930 + }, + { + "epoch": 0.00931, + "grad_norm": 0.37262633442878723, + "grad_norm_var": 0.018420551139004416, + "learning_rate": 5e-05, + "loss": 0.1727, + "loss/crossentropy": 2.899652659893036, + "loss/hidden": 0.0, + "loss/logits": 0.1727372407913208, + "loss/reg": 1.755653977394104, + "step": 931 + }, + { + "epoch": 0.00932, + "grad_norm": 0.44574448466300964, + "grad_norm_var": 0.017660345874116586, + "learning_rate": 5e-05, + "loss": 0.1771, + "loss/crossentropy": 2.7930703163146973, + "loss/hidden": 0.0, + "loss/logits": 0.17705539613962173, + "loss/reg": 1.7541528940200806, + "step": 932 + }, + { + "epoch": 0.00933, + "grad_norm": 0.4381559193134308, + "grad_norm_var": 0.017191491981390843, + "learning_rate": 5e-05, + "loss": 0.1928, + "loss/crossentropy": 3.009516716003418, + "loss/hidden": 0.0, + "loss/logits": 0.19276633486151695, + "loss/reg": 1.7523828744888306, + "step": 933 + }, + { + "epoch": 0.00934, + "grad_norm": 0.37119948863983154, + "grad_norm_var": 0.01705829482260827, + "learning_rate": 5e-05, + "loss": 0.1784, + "loss/crossentropy": 2.7011741995811462, + "loss/hidden": 0.0, + "loss/logits": 0.17841476574540138, + "loss/reg": 1.750117540359497, + "step": 934 + }, + { + "epoch": 0.00935, + "grad_norm": 0.38776126503944397, + "grad_norm_var": 0.017506596591750172, + "learning_rate": 5e-05, + "loss": 0.1765, + "loss/crossentropy": 2.755903959274292, + "loss/hidden": 0.0, + "loss/logits": 0.17649077624082565, + "loss/reg": 1.7485483884811401, + "step": 935 + }, + { + "epoch": 0.00936, + "grad_norm": 0.3909890949726105, + "grad_norm_var": 0.010412048008983836, + "learning_rate": 5e-05, + "loss": 0.1718, + "loss/crossentropy": 2.7981409430503845, + "loss/hidden": 0.0, + "loss/logits": 0.1718210205435753, + "loss/reg": 1.7470717430114746, + "step": 936 + }, + { + "epoch": 0.00937, + "grad_norm": 0.4047463536262512, + "grad_norm_var": 0.010306674977021387, + "learning_rate": 5e-05, + "loss": 0.1944, + "loss/crossentropy": 2.653374135494232, + "loss/hidden": 0.0, + "loss/logits": 0.1944383941590786, + "loss/reg": 1.7457630634307861, + "step": 937 + }, + { + "epoch": 0.00938, + "grad_norm": 0.4641042947769165, + "grad_norm_var": 0.010340857957444612, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 3.054188370704651, + "loss/hidden": 0.0, + "loss/logits": 0.19333792477846146, + "loss/reg": 1.7441859245300293, + "step": 938 + }, + { + "epoch": 0.00939, + "grad_norm": 0.463642418384552, + "grad_norm_var": 0.01027101724812616, + "learning_rate": 5e-05, + "loss": 0.1906, + "loss/crossentropy": 2.7980846166610718, + "loss/hidden": 0.0, + "loss/logits": 0.19064636901021004, + "loss/reg": 1.7424683570861816, + "step": 939 + }, + { + "epoch": 0.0094, + "grad_norm": 0.3763858675956726, + "grad_norm_var": 0.010469512228889604, + "learning_rate": 5e-05, + "loss": 0.1817, + "loss/crossentropy": 2.8554503321647644, + "loss/hidden": 0.0, + "loss/logits": 0.1817203275859356, + "loss/reg": 1.7405589818954468, + "step": 940 + }, + { + "epoch": 0.00941, + "grad_norm": 0.41792160272598267, + "grad_norm_var": 0.010502572032980106, + "learning_rate": 5e-05, + "loss": 0.2081, + "loss/crossentropy": 2.7464479207992554, + "loss/hidden": 0.0, + "loss/logits": 0.20806986466050148, + "loss/reg": 1.7390387058258057, + "step": 941 + }, + { + "epoch": 0.00942, + "grad_norm": 0.405609130859375, + "grad_norm_var": 0.010553986517832181, + "learning_rate": 5e-05, + "loss": 0.1923, + "loss/crossentropy": 2.9190812706947327, + "loss/hidden": 0.0, + "loss/logits": 0.1922554075717926, + "loss/reg": 1.737121343612671, + "step": 942 + }, + { + "epoch": 0.00943, + "grad_norm": 0.5186859369277954, + "grad_norm_var": 0.010823950074372254, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.846211016178131, + "loss/hidden": 0.0, + "loss/logits": 0.1892014741897583, + "loss/reg": 1.7357187271118164, + "step": 943 + }, + { + "epoch": 0.00944, + "grad_norm": 0.3441806137561798, + "grad_norm_var": 0.011478989118617007, + "learning_rate": 5e-05, + "loss": 0.1701, + "loss/crossentropy": 2.8220054507255554, + "loss/hidden": 0.0, + "loss/logits": 0.1700589321553707, + "loss/reg": 1.733402967453003, + "step": 944 + }, + { + "epoch": 0.00945, + "grad_norm": 0.37400367856025696, + "grad_norm_var": 0.011751047533430632, + "learning_rate": 5e-05, + "loss": 0.179, + "loss/crossentropy": 2.7692030668258667, + "loss/hidden": 0.0, + "loss/logits": 0.17900892347097397, + "loss/reg": 1.7316250801086426, + "step": 945 + }, + { + "epoch": 0.00946, + "grad_norm": 0.4089336395263672, + "grad_norm_var": 0.0020184395330867097, + "learning_rate": 5e-05, + "loss": 0.1982, + "loss/crossentropy": 2.891884744167328, + "loss/hidden": 0.0, + "loss/logits": 0.19816706702113152, + "loss/reg": 1.7298880815505981, + "step": 946 + }, + { + "epoch": 0.00947, + "grad_norm": 0.36752966046333313, + "grad_norm_var": 0.002046509202798789, + "learning_rate": 5e-05, + "loss": 0.1843, + "loss/crossentropy": 2.858055591583252, + "loss/hidden": 0.0, + "loss/logits": 0.18428384885191917, + "loss/reg": 1.7284300327301025, + "step": 947 + }, + { + "epoch": 0.00948, + "grad_norm": 0.36644455790519714, + "grad_norm_var": 0.002074549092487384, + "learning_rate": 5e-05, + "loss": 0.1714, + "loss/crossentropy": 2.848255932331085, + "loss/hidden": 0.0, + "loss/logits": 0.17144014686346054, + "loss/reg": 1.7270002365112305, + "step": 948 + }, + { + "epoch": 0.00949, + "grad_norm": 0.5910805463790894, + "grad_norm_var": 0.004186356490923812, + "learning_rate": 5e-05, + "loss": 0.23, + "loss/crossentropy": 2.7590108513832092, + "loss/hidden": 0.0, + "loss/logits": 0.22996008396148682, + "loss/reg": 1.7256983518600464, + "step": 949 + }, + { + "epoch": 0.0095, + "grad_norm": 0.35803112387657166, + "grad_norm_var": 0.00427554901524122, + "learning_rate": 5e-05, + "loss": 0.1684, + "loss/crossentropy": 2.7760064005851746, + "loss/hidden": 0.0, + "loss/logits": 0.16840620338916779, + "loss/reg": 1.723679542541504, + "step": 950 + }, + { + "epoch": 0.00951, + "grad_norm": 0.412681519985199, + "grad_norm_var": 0.004223846207916952, + "learning_rate": 5e-05, + "loss": 0.2018, + "loss/crossentropy": 2.9045740365982056, + "loss/hidden": 0.0, + "loss/logits": 0.20182525366544724, + "loss/reg": 1.72231125831604, + "step": 951 + }, + { + "epoch": 0.00952, + "grad_norm": 0.4021626114845276, + "grad_norm_var": 0.004193552649382352, + "learning_rate": 5e-05, + "loss": 0.1848, + "loss/crossentropy": 2.6521793007850647, + "loss/hidden": 0.0, + "loss/logits": 0.1848319098353386, + "loss/reg": 1.720641851425171, + "step": 952 + }, + { + "epoch": 0.00953, + "grad_norm": 0.3750251233577728, + "grad_norm_var": 0.00429834750938114, + "learning_rate": 5e-05, + "loss": 0.1791, + "loss/crossentropy": 2.7560397386550903, + "loss/hidden": 0.0, + "loss/logits": 0.17908834293484688, + "loss/reg": 1.7182477712631226, + "step": 953 + }, + { + "epoch": 0.00954, + "grad_norm": 0.5893900990486145, + "grad_norm_var": 0.006092951728716223, + "learning_rate": 5e-05, + "loss": 0.2129, + "loss/crossentropy": 2.835801601409912, + "loss/hidden": 0.0, + "loss/logits": 0.21293479949235916, + "loss/reg": 1.716722011566162, + "step": 954 + }, + { + "epoch": 0.00955, + "grad_norm": 0.40877264738082886, + "grad_norm_var": 0.005985476808116985, + "learning_rate": 5e-05, + "loss": 0.1938, + "loss/crossentropy": 2.689119517803192, + "loss/hidden": 0.0, + "loss/logits": 0.19383220747113228, + "loss/reg": 1.714568853378296, + "step": 955 + }, + { + "epoch": 0.00956, + "grad_norm": 0.38810843229293823, + "grad_norm_var": 0.005926205055061354, + "learning_rate": 5e-05, + "loss": 0.1705, + "loss/crossentropy": 2.948507070541382, + "loss/hidden": 0.0, + "loss/logits": 0.1705201156437397, + "loss/reg": 1.7119203805923462, + "step": 956 + }, + { + "epoch": 0.00957, + "grad_norm": 0.4206679165363312, + "grad_norm_var": 0.005925719462670757, + "learning_rate": 5e-05, + "loss": 0.1807, + "loss/crossentropy": 2.78257417678833, + "loss/hidden": 0.0, + "loss/logits": 0.18074193224310875, + "loss/reg": 1.7095727920532227, + "step": 957 + }, + { + "epoch": 0.00958, + "grad_norm": 0.3933105766773224, + "grad_norm_var": 0.005959929971731507, + "learning_rate": 5e-05, + "loss": 0.2045, + "loss/crossentropy": 2.7225964665412903, + "loss/hidden": 0.0, + "loss/logits": 0.2044883407652378, + "loss/reg": 1.707101583480835, + "step": 958 + }, + { + "epoch": 0.00959, + "grad_norm": 0.3582659661769867, + "grad_norm_var": 0.005456189170996354, + "learning_rate": 5e-05, + "loss": 0.1603, + "loss/crossentropy": 2.7268422842025757, + "loss/hidden": 0.0, + "loss/logits": 0.1602596789598465, + "loss/reg": 1.7055177688598633, + "step": 959 + }, + { + "epoch": 0.0096, + "grad_norm": 0.397594153881073, + "grad_norm_var": 0.0051663773874797295, + "learning_rate": 5e-05, + "loss": 0.1733, + "loss/crossentropy": 2.877332389354706, + "loss/hidden": 0.0, + "loss/logits": 0.1733493208885193, + "loss/reg": 1.7030569314956665, + "step": 960 + }, + { + "epoch": 0.00961, + "grad_norm": 0.4981625974178314, + "grad_norm_var": 0.005480135764708397, + "learning_rate": 5e-05, + "loss": 0.1826, + "loss/crossentropy": 2.656112492084503, + "loss/hidden": 0.0, + "loss/logits": 0.18259770050644875, + "loss/reg": 1.7019816637039185, + "step": 961 + }, + { + "epoch": 0.00962, + "grad_norm": 0.937751054763794, + "grad_norm_var": 0.022106629800203694, + "learning_rate": 5e-05, + "loss": 0.2083, + "loss/crossentropy": 2.907736301422119, + "loss/hidden": 0.0, + "loss/logits": 0.20831404626369476, + "loss/reg": 1.700728416442871, + "step": 962 + }, + { + "epoch": 0.00963, + "grad_norm": 0.3895174264907837, + "grad_norm_var": 0.021883161578962466, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.845858633518219, + "loss/hidden": 0.0, + "loss/logits": 0.18031802773475647, + "loss/reg": 1.6990742683410645, + "step": 963 + }, + { + "epoch": 0.00964, + "grad_norm": 0.34548690915107727, + "grad_norm_var": 0.02215928485241057, + "learning_rate": 5e-05, + "loss": 0.1645, + "loss/crossentropy": 2.8244311213493347, + "loss/hidden": 0.0, + "loss/logits": 0.1645219847559929, + "loss/reg": 1.697798252105713, + "step": 964 + }, + { + "epoch": 0.00965, + "grad_norm": 0.36824318766593933, + "grad_norm_var": 0.021193656582446117, + "learning_rate": 5e-05, + "loss": 0.1717, + "loss/crossentropy": 2.884181797504425, + "loss/hidden": 0.0, + "loss/logits": 0.17166699841618538, + "loss/reg": 1.6957753896713257, + "step": 965 + }, + { + "epoch": 0.00966, + "grad_norm": 0.37774839997291565, + "grad_norm_var": 0.021001939954339834, + "learning_rate": 5e-05, + "loss": 0.1824, + "loss/crossentropy": 3.025804340839386, + "loss/hidden": 0.0, + "loss/logits": 0.18241329863667488, + "loss/reg": 1.6944239139556885, + "step": 966 + }, + { + "epoch": 0.00967, + "grad_norm": 0.36408743262290955, + "grad_norm_var": 0.02133579751543382, + "learning_rate": 5e-05, + "loss": 0.1816, + "loss/crossentropy": 2.6597015261650085, + "loss/hidden": 0.0, + "loss/logits": 0.1815933845937252, + "loss/reg": 1.692252516746521, + "step": 967 + }, + { + "epoch": 0.00968, + "grad_norm": 0.34311729669570923, + "grad_norm_var": 0.02183892884845392, + "learning_rate": 5e-05, + "loss": 0.1662, + "loss/crossentropy": 2.845684826374054, + "loss/hidden": 0.0, + "loss/logits": 0.1661607250571251, + "loss/reg": 1.6907639503479004, + "step": 968 + }, + { + "epoch": 0.00969, + "grad_norm": 0.38303908705711365, + "grad_norm_var": 0.021779175231247044, + "learning_rate": 5e-05, + "loss": 0.1693, + "loss/crossentropy": 2.558404505252838, + "loss/hidden": 0.0, + "loss/logits": 0.16925161331892014, + "loss/reg": 1.6893569231033325, + "step": 969 + }, + { + "epoch": 0.0097, + "grad_norm": 0.3850949704647064, + "grad_norm_var": 0.02018777587365078, + "learning_rate": 5e-05, + "loss": 0.1685, + "loss/crossentropy": 2.890751600265503, + "loss/hidden": 0.0, + "loss/logits": 0.1684512346982956, + "loss/reg": 1.688266634941101, + "step": 970 + }, + { + "epoch": 0.00971, + "grad_norm": 0.4068422317504883, + "grad_norm_var": 0.020191525445637973, + "learning_rate": 5e-05, + "loss": 0.1857, + "loss/crossentropy": 2.707846701145172, + "loss/hidden": 0.0, + "loss/logits": 0.18569114059209824, + "loss/reg": 1.6867531538009644, + "step": 971 + }, + { + "epoch": 0.00972, + "grad_norm": 0.3924512565135956, + "grad_norm_var": 0.020172897207266394, + "learning_rate": 5e-05, + "loss": 0.1829, + "loss/crossentropy": 2.848098576068878, + "loss/hidden": 0.0, + "loss/logits": 0.1828712299466133, + "loss/reg": 1.6852294206619263, + "step": 972 + }, + { + "epoch": 0.00973, + "grad_norm": 0.3714575469493866, + "grad_norm_var": 0.020336838096992275, + "learning_rate": 5e-05, + "loss": 0.1773, + "loss/crossentropy": 2.8703532814979553, + "loss/hidden": 0.0, + "loss/logits": 0.17731666564941406, + "loss/reg": 1.6838881969451904, + "step": 973 + }, + { + "epoch": 0.00974, + "grad_norm": 0.35195186734199524, + "grad_norm_var": 0.020588227081264298, + "learning_rate": 5e-05, + "loss": 0.1862, + "loss/crossentropy": 2.863659620285034, + "loss/hidden": 0.0, + "loss/logits": 0.18621815741062164, + "loss/reg": 1.6821165084838867, + "step": 974 + }, + { + "epoch": 0.00975, + "grad_norm": 0.441755086183548, + "grad_norm_var": 0.02037088575085001, + "learning_rate": 5e-05, + "loss": 0.1933, + "loss/crossentropy": 2.810901939868927, + "loss/hidden": 0.0, + "loss/logits": 0.19334488362073898, + "loss/reg": 1.6802574396133423, + "step": 975 + }, + { + "epoch": 0.00976, + "grad_norm": 0.40233367681503296, + "grad_norm_var": 0.02035677589008348, + "learning_rate": 5e-05, + "loss": 0.1983, + "loss/crossentropy": 2.681654691696167, + "loss/hidden": 0.0, + "loss/logits": 0.19833911955356598, + "loss/reg": 1.6781818866729736, + "step": 976 + }, + { + "epoch": 0.00977, + "grad_norm": 0.6531580686569214, + "grad_norm_var": 0.023423138566671976, + "learning_rate": 5e-05, + "loss": 0.2212, + "loss/crossentropy": 2.82851505279541, + "loss/hidden": 0.0, + "loss/logits": 0.2212192267179489, + "loss/reg": 1.6763266324996948, + "step": 977 + }, + { + "epoch": 0.00978, + "grad_norm": 0.3646674156188965, + "grad_norm_var": 0.005314392422501734, + "learning_rate": 5e-05, + "loss": 0.1715, + "loss/crossentropy": 2.7788134813308716, + "loss/hidden": 0.0, + "loss/logits": 0.17147252708673477, + "loss/reg": 1.674770712852478, + "step": 978 + }, + { + "epoch": 0.00979, + "grad_norm": 0.40374529361724854, + "grad_norm_var": 0.005314159555871933, + "learning_rate": 5e-05, + "loss": 0.2092, + "loss/crossentropy": 2.7746172547340393, + "loss/hidden": 0.0, + "loss/logits": 0.2092289738357067, + "loss/reg": 1.6723932027816772, + "step": 979 + }, + { + "epoch": 0.0098, + "grad_norm": 0.3737621009349823, + "grad_norm_var": 0.005169172562247167, + "learning_rate": 5e-05, + "loss": 0.1847, + "loss/crossentropy": 2.871635138988495, + "loss/hidden": 0.0, + "loss/logits": 0.18473126366734505, + "loss/reg": 1.6707124710083008, + "step": 980 + }, + { + "epoch": 0.00981, + "grad_norm": 0.37383797764778137, + "grad_norm_var": 0.005148210609649088, + "learning_rate": 5e-05, + "loss": 0.1733, + "loss/crossentropy": 2.760922133922577, + "loss/hidden": 0.0, + "loss/logits": 0.1733292043209076, + "loss/reg": 1.6693061590194702, + "step": 981 + }, + { + "epoch": 0.00982, + "grad_norm": 0.38922467827796936, + "grad_norm_var": 0.00512344066750369, + "learning_rate": 5e-05, + "loss": 0.164, + "loss/crossentropy": 2.8191832304000854, + "loss/hidden": 0.0, + "loss/logits": 0.16395087912678719, + "loss/reg": 1.6671026945114136, + "step": 982 + }, + { + "epoch": 0.00983, + "grad_norm": 0.40670332312583923, + "grad_norm_var": 0.0050327015332547465, + "learning_rate": 5e-05, + "loss": 0.1892, + "loss/crossentropy": 2.847275197505951, + "loss/hidden": 0.0, + "loss/logits": 0.18915896490216255, + "loss/reg": 1.6655491590499878, + "step": 983 + }, + { + "epoch": 0.00984, + "grad_norm": 0.3739645183086395, + "grad_norm_var": 0.004847126969690196, + "learning_rate": 5e-05, + "loss": 0.194, + "loss/crossentropy": 2.817361056804657, + "loss/hidden": 0.0, + "loss/logits": 0.1940479725599289, + "loss/reg": 1.664434552192688, + "step": 984 + }, + { + "epoch": 0.00985, + "grad_norm": 0.36827903985977173, + "grad_norm_var": 0.00490322302848593, + "learning_rate": 5e-05, + "loss": 0.1793, + "loss/crossentropy": 2.9278652667999268, + "loss/hidden": 0.0, + "loss/logits": 0.17926159501075745, + "loss/reg": 1.6631444692611694, + "step": 985 + }, + { + "epoch": 0.00986, + "grad_norm": 0.36838048696517944, + "grad_norm_var": 0.0049621510753778035, + "learning_rate": 5e-05, + "loss": 0.182, + "loss/crossentropy": 2.625900149345398, + "loss/hidden": 0.0, + "loss/logits": 0.1820085123181343, + "loss/reg": 1.6615486145019531, + "step": 986 + }, + { + "epoch": 0.00987, + "grad_norm": 0.406107097864151, + "grad_norm_var": 0.00496177464005331, + "learning_rate": 5e-05, + "loss": 0.165, + "loss/crossentropy": 2.6364856362342834, + "loss/hidden": 0.0, + "loss/logits": 0.16496483236551285, + "loss/reg": 1.6604608297348022, + "step": 987 + }, + { + "epoch": 0.00988, + "grad_norm": 0.3886563777923584, + "grad_norm_var": 0.004967815483619401, + "learning_rate": 5e-05, + "loss": 0.1803, + "loss/crossentropy": 2.796413004398346, + "loss/hidden": 0.0, + "loss/logits": 0.1803182028234005, + "loss/reg": 1.6593427658081055, + "step": 988 + }, + { + "epoch": 0.00989, + "grad_norm": 0.35161423683166504, + "grad_norm_var": 0.005074223354079936, + "learning_rate": 5e-05, + "loss": 0.1644, + "loss/crossentropy": 2.8155667185783386, + "loss/hidden": 0.0, + "loss/logits": 0.16441339999437332, + "loss/reg": 1.6583104133605957, + "step": 989 + }, + { + "epoch": 0.0099, + "grad_norm": 0.39407941699028015, + "grad_norm_var": 0.0049088886087087355, + "learning_rate": 5e-05, + "loss": 0.1928, + "loss/crossentropy": 2.8455575704574585, + "loss/hidden": 0.0, + "loss/logits": 0.19284628704190254, + "loss/reg": 1.658367395401001, + "step": 990 + }, + { + "epoch": 0.00991, + "grad_norm": 0.3695094883441925, + "grad_norm_var": 0.004869171230278472, + "learning_rate": 5e-05, + "loss": 0.1814, + "loss/crossentropy": 2.80289888381958, + "loss/hidden": 0.0, + "loss/logits": 0.18142832443118095, + "loss/reg": 1.6575336456298828, + "step": 991 + }, + { + "epoch": 0.00992, + "grad_norm": 0.3505973815917969, + "grad_norm_var": 0.005015199761620141, + "learning_rate": 5e-05, + "loss": 0.1769, + "loss/crossentropy": 2.731611430644989, + "loss/hidden": 0.0, + "loss/logits": 0.17688723653554916, + "loss/reg": 1.6567095518112183, + "step": 992 + }, + { + "epoch": 0.00993, + "grad_norm": 0.38008397817611694, + "grad_norm_var": 0.0003133497280889556, + "learning_rate": 5e-05, + "loss": 0.1836, + "loss/crossentropy": 2.7869952917099, + "loss/hidden": 0.0, + "loss/logits": 0.18361864984035492, + "loss/reg": 1.6552574634552002, + "step": 993 + }, + { + "epoch": 0.00994, + "grad_norm": 0.38647469878196716, + "grad_norm_var": 0.00030154116815576856, + "learning_rate": 5e-05, + "loss": 0.1794, + "loss/crossentropy": 2.744201898574829, + "loss/hidden": 0.0, + "loss/logits": 0.17939525097608566, + "loss/reg": 1.6545374393463135, + "step": 994 + }, + { + "epoch": 0.00995, + "grad_norm": 0.3995093107223511, + "grad_norm_var": 0.0002894285610608412, + "learning_rate": 5e-05, + "loss": 0.1916, + "loss/crossentropy": 2.694726526737213, + "loss/hidden": 0.0, + "loss/logits": 0.1916041001677513, + "loss/reg": 1.6533586978912354, + "step": 995 + }, + { + "epoch": 0.00996, + "grad_norm": 0.34584900736808777, + "grad_norm_var": 0.0003615231269390488, + "learning_rate": 5e-05, + "loss": 0.1739, + "loss/crossentropy": 2.8016315698623657, + "loss/hidden": 0.0, + "loss/logits": 0.1739257462322712, + "loss/reg": 1.651719570159912, + "step": 996 + }, + { + "epoch": 0.00997, + "grad_norm": 0.3925288915634155, + "grad_norm_var": 0.0003722265532580001, + "learning_rate": 5e-05, + "loss": 0.177, + "loss/crossentropy": 2.897447168827057, + "loss/hidden": 0.0, + "loss/logits": 0.17699377238750458, + "loss/reg": 1.6504392623901367, + "step": 997 + }, + { + "epoch": 0.00998, + "grad_norm": 0.39880403876304626, + "grad_norm_var": 0.0003904176090236522, + "learning_rate": 5e-05, + "loss": 0.1919, + "loss/crossentropy": 2.8275578022003174, + "loss/hidden": 0.0, + "loss/logits": 0.19189641624689102, + "loss/reg": 1.6491367816925049, + "step": 998 + }, + { + "epoch": 0.00999, + "grad_norm": 0.4310808479785919, + "grad_norm_var": 0.0005141220319849537, + "learning_rate": 5e-05, + "loss": 0.2298, + "loss/crossentropy": 2.7063609957695007, + "loss/hidden": 0.0, + "loss/logits": 0.22976921498775482, + "loss/reg": 1.6471576690673828, + "step": 999 + }, + { + "epoch": 0.01, + "grad_norm": 0.3714313507080078, + "grad_norm_var": 0.0005171003041950173, + "learning_rate": 5e-05, + "loss": 0.1901, + "loss/crossentropy": 2.7168938517570496, + "loss/hidden": 0.0, + "loss/logits": 0.19012651592493057, + "loss/reg": 1.6457953453063965, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.15246892515328e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}